"""Sequential sample selection"""
import warnings
import numpy as np
from scipy.interpolate import LinearNDInterpolator, interp1d
from scipy.interpolate._interpnd import _ndim_coords_from_arrays
from scipy.spatial import ConvexHull
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
from .._selection import _CUR, _FPS, _PCovCUR, _PCovFPS
def _linear_interpolator(points, values):
"""
Returns linear interpolator for unstructured D-D data. Tessellate the input point
set to N-D simplices, and interpolate linearly on each simplex. See
``LinearNDInterpolator`` for more details.
Parameters
----------
points : 1- or 2-D numpy.ndarray of shape (n,) or (n, D)
Data point coordinates.
values : numpy.ndarray of float or complex, shape (n, )
Data values.
Reference:
---------
The code is an adapted excerpt from
https://github.com/scipy/scipy/blob/dde50595862a4f9cede24b5d1c86935c30f1f88a/scipy/interpolate/_ndgriddata.py#L119-L273
""" # NoQa: E501
points = _ndim_coords_from_arrays(points)
if points.ndim < 2:
ndim = points.ndim
else:
ndim = points.shape[-1]
if ndim == 1:
points = points.ravel()
# Sort points/values together, necessary as input for interp1d
idx = np.argsort(points)
points = points[idx]
values = values[idx]
return interp1d(
points, values, kind="linear", axis=0, bounds_error=False, fill_value=np.nan
)
else:
return LinearNDInterpolator(points, values, fill_value=np.nan, rescale=False)
[docs]
class FPS(_FPS):
"""Transformer performing Greedy Sample Selection using Farthest Point Sampling.
Parameters
----------
initialize: int, list of int, numpy.ndarray of int, or 'random', default=0
Index of the first selection(s). If 'random', picks a random
value when fit starts. Stored in :py:attr:`self.initialize`.
n_to_select : int or float, default=None
The number of selections to make. If `None`, half of the samples are selected.
If integer, the parameter is the absolute number of selections to make. If float
between 0 and 1, it is the fraction of the total dataset to select. Stored in
:py:attr:`self.n_to_select`.
score_threshold : float, default=None
Threshold for the score. If `None` selection will continue until the n_to_select
is chosen. Otherwise will stop when the score falls below the threshold. Stored
in :py:attr:`self.score_threshold`.
score_threshold_type : str, default="absolute"
How to interpret the ``score_threshold``. When "absolute", the score used by the
selector is compared to the threshold directly. When "relative", at each
iteration, the score used by the selector is compared proportionally to the
score of the first selection, i.e. the selector quits when ``current_score /
first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`.
progress_bar: bool, default=False
option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
selections. Stored in :py:attr:`self.report_progress`.
full : bool, default=False
In the case that all non-redundant selections are exhausted, choose randomly
from the remaining samples. Stored in :py:attr:`self.full`.
random_state : int or :class:`numpy.random.RandomState` instance, default=0
Attributes
----------
n_selected_ : int
Counter tracking the number of selections that have been made
X_selected_ :numpy.ndarray,
Matrix containing the selected samples, for use in fitting
y_selected_ : numpy.ndarray,
In sample selection, the matrix containing the selected targets, for
use in fitting.
selected_idx_ : numpy.ndarray
indices of selected samples
Examples
--------
>>> from skmatter.sample_selection import FPS
>>> import numpy as np
>>> selector = FPS(
... n_to_select=2,
... # int or 'random', default=0
... # Index of the first selection.
... # If "random", picks a random value when fit starts.
... initialize=0,
... )
>>> X = np.array(
... [
... [0.12, 0.21, 0.02], # 3 samples, 3 features
... [-0.09, 0.32, -0.10],
... [-0.03, -0.53, 0.08],
... ]
... )
>>> selector.fit(X)
FPS(n_to_select=2)
>>> selector.selected_idx_
array([0, 2])
"""
def __init__(
self,
initialize=0,
n_to_select=None,
score_threshold=None,
score_threshold_type="absolute",
progress_bar=False,
full=False,
random_state=0,
):
super().__init__(
selection_type="sample",
initialize=initialize,
n_to_select=n_to_select,
score_threshold=score_threshold,
score_threshold_type=score_threshold_type,
progress_bar=progress_bar,
full=full,
random_state=random_state,
)
[docs]
class PCovFPS(_PCovFPS):
r"""Transformer performing Greedy Sample Selection using PCovR-weighted Farthest
Point Sampling.
Parameters
----------
mixing: float, default=0.5
The PCovR mixing parameter, as described in PCovR as
:math:`{\alpha}`
initialize: int or 'random', default=0
Index of the first selection. If 'random', picks a random value when fit starts.
n_to_select : int or float, default=None
The number of selections to make. If `None`, half of the samples are selected.
If integer, the parameter is the absolute number of selections to make. If float
between 0 and 1, it is the fraction of the total dataset to select. Stored in
:py:attr:`self.n_to_select`.
score_threshold : float, default=None
Threshold for the score. If `None` selection will continue until the n_to_select
is chosen. Otherwise will stop when the score falls below the threshold. Stored
in :py:attr:`self.score_threshold`.
score_threshold_type : str, default="absolute"
How to interpret the ``score_threshold``. When "absolute", the score used by the
selector is compared to the threshold directly. When "relative", at each
iteration, the score used by the selector is compared proportionally to the
score of the first selection, i.e. the selector quits when ``current_score /
first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`.
progress_bar: bool, default=False
option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
selections. Stored in :py:attr:`self.report_progress`.
full : bool, default=False
In the case that all non-redundant selections are exhausted, choose
randomly from the remaining samples. Stored in :py:attr:`self.full`.
random_state : int or :class:`numpy.random.RandomState` instance, default=0
Attributes
----------
n_selected_ : int
Counter tracking the number of selections that have been made
X_selected_ : numpy.ndarray,
Matrix containing the selected samples, for use in fitting
y_selected_ : numpy.ndarray,
In sample selection, the matrix containing the selected targets, for use in
fitting
selected_idx_ : numpy.ndarray
indices of selected samples
Examples
--------
>>> from skmatter.sample_selection import PCovFPS
>>> import numpy as np
>>> selector = PCovFPS(
... n_to_select=2,
... # int or 'random', default=0
... # Index of the first selection.
... # If "random", picks a random value when fit starts.
... initialize=0,
... )
>>> X = np.array(
... [
... [0.12, 0.21, 0.02], # 3 samples, 3 features
... [-0.09, 0.32, -0.10],
... [-0.03, -0.53, 0.08],
... ]
... )
>>> y = np.array([0.0, 0.0, 1.0]) # classes of each sample
>>> selector.fit(X, y)
PCovFPS(n_to_select=2)
>>> selector.selected_idx_
array([0, 2])
"""
def __init__(
self,
mixing=0.5,
initialize=0,
n_to_select=None,
score_threshold=None,
score_threshold_type="absolute",
progress_bar=False,
full=False,
random_state=0,
):
super().__init__(
selection_type="sample",
mixing=mixing,
initialize=initialize,
n_to_select=n_to_select,
score_threshold=score_threshold,
score_threshold_type=score_threshold_type,
progress_bar=progress_bar,
full=full,
random_state=random_state,
)
[docs]
class CUR(_CUR):
"""Transformer that performs Greedy Sample Selection by choosing samples
which maximize the magnitude of the left singular vectors, consistent with classic
CUR matrix decomposition.
Parameters
----------
recompute_every : int
number of steps after which to recompute the pi score defaults to 1, if 0 no
re-computation is done
k : int
number of eigenvectors to compute the importance score with, defaults to 1
tolerance: float
threshold below which scores will be considered 0, defaults to 1E-12
n_to_select : int or float, default=None
The number of selections to make. If `None`, half of the samples are selected.
If integer, the parameter is the absolute number of selections to make. If float
between 0 and 1, it is the fraction of the total dataset to select. Stored in
:py:attr:`self.n_to_select`.
score_threshold : float, default=None
Threshold for the score. If `None` selection will continue until the n_to_select
is chosen. Otherwise will stop when the score falls below the threshold. Stored
in :py:attr:`self.score_threshold`.
score_threshold_type : str, default="absolute"
How to interpret the ``score_threshold``. When "absolute", the score used by the
selector is compared to the threshold directly. When "relative", at each
iteration, the score used by the selector is compared proportionally to the
score of the first selection, i.e. the selector quits when ``current_score /
first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`.
progress_bar: bool, default=False
option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
selections. Stored in :py:attr:`self.report_progress`.
full : bool, default=False
In the case that all non-redundant selections are exhausted, choose
randomly from the remaining samples. Stored in :py:attr:`self.full`.
random_state : int or :class:`numpy.random.RandomState` instance, default=0
Attributes
----------
X_current_ : numpy.ndarray (n_samples, n_features)
The original matrix orthogonalized by previous selections
n_selected_ : int
Counter tracking the number of selections that have been made
X_selected_ : numpy.ndarray,
Matrix containing the selected samples, for use in fitting
y_selected_ : numpy.ndarray,
In sample selection, the matrix containing the selected targets, for use in
fitting
pi_ : numpy.ndarray (n_features),
the importance score see :func:`_compute_pi`
selected_idx_ : ndarray
indices of selected features
Examples
--------
>>> from skmatter.sample_selection import CUR
>>> import numpy as np
>>> selector = CUR(n_to_select=2, random_state=0)
>>> X = np.array(
... [
... [0.12, 0.21, -0.11], # 4 samples, 3 features
... [-0.09, 0.32, 0.51],
... [-0.03, 0.53, 0.14],
... [-0.83, -0.13, 0.82],
... ]
... )
>>> selector.fit(X)
CUR(n_to_select=2)
>>> np.round(selector.pi_, 2) # importance score
array([0.01, 0.99, 0. , 0. ])
>>> selector.selected_idx_ # selected idx
array([3, 2])
>>> # selector.transform(X) cannot be used as sklearn API
>>> # restricts the change of sample size using transformers
>>> # So one has to do
>>> X[selector.selected_idx_]
array([[-0.83, -0.13, 0.82],
[-0.03, 0.53, 0.14]])
"""
def __init__(
self,
recompute_every=1,
k=1,
tolerance=1e-12,
n_to_select=None,
score_threshold=None,
score_threshold_type="absolute",
progress_bar=False,
full=False,
random_state=0,
):
super().__init__(
selection_type="sample",
recompute_every=recompute_every,
k=k,
tolerance=tolerance,
n_to_select=n_to_select,
score_threshold=score_threshold,
score_threshold_type=score_threshold_type,
progress_bar=progress_bar,
full=full,
random_state=random_state,
)
[docs]
class PCovCUR(_PCovCUR):
r"""Transformer that performs Greedy Sample Selection by choosing samples
which maximize the importance score :math:`\pi`, which is the sum over
the squares of the first :math:`k` components of the PCovR-modified
left singular vectors.
Parameters
----------
mixing: float, default=0.5
The PCovR mixing parameter, as described in PCovR as :math:`{\alpha}`. Stored
in :py:attr:`self.mixing`.
recompute_every : int
number of steps after which to recompute the pi score defaults to 1, if 0 no
re-computation is done
k : int
number of eigenvectors to compute the importance score with, defaults to 1
tolerance: float
threshold below which scores will be considered 0, defaults to 1E-12
n_to_select : int or float, default=None
The number of selections to make. If `None`, half of the samples are
selected. If integer, the parameter is the absolute number of selections
to make. If float between 0 and 1, it is the fraction of the total dataset to
select. Stored in :py:attr:`self.n_to_select`.
score_threshold : float, default=None
Threshold for the score. If `None` selection will continue until the
n_to_select is chosen. Otherwise will stop when the score falls below the
threshold. Stored in :py:attr:`self.score_threshold`.
score_threshold_type : str, default="absolute"
How to interpret the ``score_threshold``. When "absolute", the score used by
the selector is compared to the threshold directly. When "relative", at each
iteration, the score used by the selector is compared proportionally to the
score of the first selection, i.e. the selector quits when
``current_score / first_score < threshold``. Stored in
:py:attr:`self.score_threshold_type`.
progress_bar: bool, default=False
option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
selections. Stored in :py:attr:`self.report_progress`.
full : bool, default=False
In the case that all non-redundant selections are exhausted, choose
randomly from the remaining samples. Stored in :py:attr:`self.full`.
random_state : int or :class`numpy.random.RandomState` instance, default=0
Attributes
----------
X_current_ : numpy.ndarray (n_samples, n_features)
The original matrix orthogonalized by previous selections
y_current_ : numpy.ndarray (n_samples, n_properties)
The targets orthogonalized by a regression on the previous selections.
n_selected_ : int
Counter tracking the number of selections that have been made
X_selected_ : numpy.ndarray
Matrix containing the selected samples, for use in fitting
y_selected_ : numpy.ndarray,
In sample selection, the matrix containing the selected targets, for use in
fitting
pi_ : numpy.ndarray (n_features),
the importance score see :func:`_compute_pi`
selected_idx_ : numpy.ndarray
indices of selected features
Examples
--------
>>> from skmatter.sample_selection import PCovCUR
>>> import numpy as np
>>> selector = PCovCUR(n_to_select=2, random_state=0)
>>> X = np.array(
... [
... [0.12, 0.21, 0.02], # 3 samples, 3 features
... [-0.09, 0.32, -0.10],
... [-0.03, -0.53, 0.08],
... ]
... )
>>> y = np.array([0.0, 0.0, 1.0]) # classes of each sample
>>> selector.fit(X, y)
PCovCUR(n_to_select=2)
>>> np.round(selector.pi_, 2) # importance scole
array([1., 0., 0.])
>>> selector.selected_idx_ # importance scole
array([2, 1])
>>> # selector.transform(X) cannot be used as sklearn API
>>> # restricts the change of sample size using transformers
>>> # So one has to do
>>> X[selector.selected_idx_]
array([[-0.03, -0.53, 0.08],
[-0.09, 0.32, -0.1 ]])
"""
def __init__(
self,
mixing=0.5,
recompute_every=1,
k=1,
tolerance=1e-12,
n_to_select=None,
score_threshold=None,
score_threshold_type="absolute",
progress_bar=False,
full=False,
random_state=0,
):
super().__init__(
selection_type="sample",
mixing=mixing,
recompute_every=recompute_every,
k=k,
tolerance=tolerance,
n_to_select=n_to_select,
score_threshold=score_threshold,
score_threshold_type=score_threshold_type,
progress_bar=progress_bar,
full=full,
random_state=random_state,
)
def _directional_distance(equations, points):
"""
Computes the distance of the points to the planes defined by the equations with
respect to the direction of the first dimension.
equations : numpy.ndarray of shape (n_facets, n_dim)
each row contains the coefficienst for the plane equation of the form
.. math::
equations[i, 0]*x_1 + ... + equations[i, -2]*x_{n_dim}
= equations[i, -1] -equations[i, -1] is the offset
points : numpy.ndarray of shape (n_samples, n_dim)
points to compute the directional distance from
Returns
-------
directional_distance : numpy.ndarray of shape (nsamples, nequations)
closest distance wrt. the first dimension of the point to the planes defined by
the equations
"""
orthogonal_distances = -(points @ equations[:, :-1].T) - equations[:, -1:].T
return -orthogonal_distances / equations[:, :1].T
[docs]
class DirectionalConvexHull(BaseEstimator):
"""
Performs Sample Selection by constructing a Directional Convex Hull and determining
the distance to the hull as outlined in the reference [dch]_.
Parameters
----------
low_dim_idx : list of ints, default None
Indices of columns of X containing features to be used for the
directional convex hull construction (also known as the low-
dimensional (LD) hull). By default [0] is used.
tolerance : float, default=1.0E-12
Tolerance for the negative distances to the directional
convex hull to consider a point below the convex hull. Depending
if a point is below or above the convex hull the distance is
differently computed. A very low value can result in a completely
wrong distances. Distances cannot be distinguished from zero up
to tolerance. It is recommended to leave the default setting.
Attributes
----------
high_dim_idx_ : list of ints
Indices of columns in data containing high-dimensional
features (i.e. those not used for the convex hull
construction)
selected_idx_ : numpy.ndarray
Indices of datapoints that form the vertices of the
convex hull
interpolator_high_dim_ : scipy.interpolate._interpnd.LinearNDInterpolator
Interpolator for the features in the high-
dimensional space
Examples
--------
>>> from skmatter.sample_selection import DirectionalConvexHull
>>> selector = DirectionalConvexHull(
... # Indices of columns of X to use for fitting
... # the convex hull
... low_dim_idx=[0, 1],
... )
>>> X = np.array(
... [
... [0.12, 0.21, 0.02], # 3 samples, 3 features
... [-0.09, 0.32, -0.10],
... [-0.03, -0.53, 0.08],
... [-0.41, 0.25, 0.34],
... ]
... )
>>> y = np.array([0.1, 1.0, 0.2, 0.4]) # classes of each sample
>>> dch = selector.fit(X, y)
>>> # Get the distance to the convex hull for samples used to fit the
>>> # convex hull. This can also be called using other samples (X_new)
>>> # and corresponding properties (y_new) that were not used to fit
>>> # the hull. In this case they are alle one the conex hull so we
>>> # zeros
>>> np.allclose(dch.score_samples(X, y), [0.0, 0.0, 0.0, 0.0])
True
References
----------
.. [dch] A. Anelli, E. A. Engel, C. J. Pickard and M. Ceriotti,
Physical Review Materials, 2018.
"""
def __init__(self, low_dim_idx=None, tolerance=1e-12):
self.low_dim_idx = low_dim_idx
if low_dim_idx is None:
self.low_dim_idx = [0]
else:
self.low_dim_idx = low_dim_idx
self.tolerance = tolerance
[docs]
def fit(self, X, y):
"""Learn the samples that form the convex hull.
Parameters
----------
X : numpy.ndarray of shape (n_samples, n_features)
Feature matrix of samples to use for constructing the convex
hull.
y : numpy.ndarray of shape (n_samples,)
Target values (property on which the convex hull should be
constructed, e.g. Gibbs free energy)
Returns
-------
self : object
Fitted scorer.
"""
X, y = self._check_X_y(X, y)
self.n_features_in_ = X.shape[1]
if len(y.shape) == 1:
y = y.reshape((len(y), 1))
if (max(np.abs(self.low_dim_idx)) > X.shape[1]) and (
min(self.low_dim_idx) >= 0
):
raise ValueError(
"One or more columns indexed with low_dim_idx is"
" out of bounds with the dimensions of X."
)
self.high_dim_idx_ = np.setdiff1d(np.arange(X.shape[1]), self.low_dim_idx)
# get number of dimensions for the convex (lower dimensional) hull construction
n_low_dim_idx = len(self.low_dim_idx)
# append features and target property to the same data matrix (for the
# convex hull construction)
convex_hull_data = np.zeros((X.shape[0], n_low_dim_idx + 1))
convex_hull_data[:, :1] = y
convex_hull_data[:, 1:] = X[:, self.low_dim_idx].copy()
# create high-dimensional feature matrix
high_dim_feats = X[:, self.high_dim_idx_]
# get scipy convex hull
self.convex_hull_ = ConvexHull(convex_hull_data, incremental=True)
# get normal equations to the hull simplices
y_normal = self.convex_hull_.equations[:, 0]
# get vertices_idx of the convex hull
directional_facets_idx = np.where(y_normal < 0)[0]
self.directional_simplices_ = self.convex_hull_.simplices[
directional_facets_idx
]
self._directional_equations_ = self.convex_hull_.equations[
directional_facets_idx
]
self.selected_idx_ = np.unique(self.directional_simplices_.flatten())
self._directional_points_ = convex_hull_data[self.selected_idx_]
# required for the score_feature_matrix function
self.interpolator_high_dim_ = _linear_interpolator(
points=convex_hull_data[self.selected_idx_, 1:],
values=high_dim_feats[self.selected_idx_],
)
return self
@property
def directional_vertices_(self):
return self.selected_idx_
def _check_X_y(self, X, y):
return check_X_y(X, y, ensure_min_features=1, multi_output=False)
def _check_is_fitted(self, X):
check_is_fitted(
self,
[
"high_dim_idx_",
"interpolator_high_dim_",
"selected_idx_",
],
)
n_features = X.shape[1]
if n_features != self.n_features_in_:
raise ValueError(
f"X has {n_features} features, but {self.__class__.__name__} "
f"is expecting {self.n_features_in_} features as input."
)
return True
[docs]
def score_samples(self, X, y):
"""Calculate the distance of the samples to the convex hull in the target
direction y. Samples with a distance > 0 lie above the convex surface. Samples
with a distance of zero lie on the convex surface. Samples with a distance value
< 0 lie below the convex surface.
Parameters
----------
X : numpy.ndarray of shape (n_samples, n_features)
Feature matrix of samples to use for determining distance to the convex
hull. Please note that samples provided should have the same dimensions
(features) as used during fitting of the convex hull. The same column
indices will be used for the low- and high-dimensional features.
y : numpy.ndarray of shape (n_samples,)
Target values (property on which the convex hull should be constructed, e.g.
Gibbs free energy)
Returns
-------
dch_distance : numpy.ndarray of shape (n_samples, len(high_dim_idx_))
The distance (residuals) of samples to the convex hull in the
higher-dimensional space.
"""
X, y = self._check_X_y(X, y)
self._check_is_fitted(X)
# features used for the convex hull construction
low_dim_feats = X[:, self.low_dim_idx]
hull_space_data = np.hstack((y.reshape(-1, 1), low_dim_feats))
# the X points projected on the convex surface
return self._directional_convex_hull_distance(hull_space_data).reshape(y.shape)
def _directional_convex_hull_distance(self, points):
"""Distance to the fitted directional convex hull in the target dimension y.
If a point lies above the convex hull, it will have a positive distance. If a
point lies below the convex hull, it will have a negative distance - this should
only occur if you pass a point that the convex hull has not been fitted on in
the `fit` function. If a point lies on the convex hull, it will have a distance
of zero.
Parameters
----------
points : The points for which you would like to calculate the distance to the
fitted directional convex hull.
"""
# directional distances to all plane equations
all_directional_distances = _directional_distance(
self._directional_equations_, points
)
# we get negative distances for each plane to check if any distance is below the
# threshold
below_directional_convex_hull = np.any(
all_directional_distances < -self.tolerance, axis=1
)
# directional distances to corresponding plane equation
directional_distances = np.zeros(len(points))
directional_distances[~below_directional_convex_hull] = np.min(
all_directional_distances[~below_directional_convex_hull], axis=1
)
# some distances can be positive, so we take the max of all negative distances
negative_directional_distances = all_directional_distances.copy()
negative_directional_distances[all_directional_distances > 0] = -np.inf
directional_distances[below_directional_convex_hull] = np.max(
negative_directional_distances[below_directional_convex_hull], axis=1
)
return directional_distances
[docs]
def score_feature_matrix(self, X):
"""Calculate the distance (or more specifically, the residuals) of the
samples to the convex hull in the high-dimensional space. Samples with a
distance value of zero in all the higher dimensions lie on the convex hull.
Parameters
----------
X : numpy.ndarray of shape (n_samples, n_features)
Feature matrix of samples to use for determining distance to the convex
hull. Please note that samples provided should have the same dimensions
(features) as used during fitting of the convex hull. The same column
indices will be used for the low- and high-dimensional features.
Returns
-------
dch_distance : numpy.ndarray of shape (n_samples, len(high_dim_idx_))
The distance (residuals) of samples to the convex hull in
the higher-dimensional space.
"""
X = check_array(X)
self._check_is_fitted(X)
# features used for the convex hull construction
low_dim_feats = X[:, self.low_dim_idx]
# HD features not used for the convex hull
high_dim_feats = X[:, self.high_dim_idx_]
if len(self.low_dim_idx) == 1:
low_dim_feats = low_dim_feats.reshape(
-1,
)
# interpolate features
interpolated_high_dim_feats = self.interpolator_high_dim_(low_dim_feats)
if np.any(np.isnan(interpolated_high_dim_feats)):
warnings.warn(
"There are samples in X with a low-dimensional part that is outside "
"of the range of the convex surface. Distance will contain nans.",
UserWarning,
stacklevel=1,
)
# determine the distance between the original high-dimensional data and
# interpolated high-dimensional data
dch_distance = high_dim_feats - interpolated_high_dim_feats
return dch_distance