# Source code for skmatter.sample_selection._base

"""Sequential sample selection"""

import warnings

import numpy as np
from scipy.interpolate import LinearNDInterpolator, interp1d
from scipy.interpolate._interpnd import _ndim_coords_from_arrays
from scipy.spatial import ConvexHull
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

from .._selection import _CUR, _FPS, _PCovCUR, _PCovFPS


def _linear_interpolator(points, values):
    """
    Returns linear interpolator for unstructured D-D data. Tessellate the input point
    set to N-D simplices, and interpolate linearly on each simplex. See
    ``LinearNDInterpolator`` for more details.

    Parameters
    ----------
    points : 1- or 2-D numpy.ndarray of shape (n,) or (n, D)
        Data point coordinates.
    values : numpy.ndarray of float or complex, shape (n, )
        Data values.

    Reference:
    ---------
    The code is an adapted excerpt from
    https://github.com/scipy/scipy/blob/dde50595862a4f9cede24b5d1c86935c30f1f88a/scipy/interpolate/_ndgriddata.py#L119-L273
    """  # NoQa: E501
    points = _ndim_coords_from_arrays(points)

    if points.ndim < 2:
        ndim = points.ndim
    else:
        ndim = points.shape[-1]

    if ndim == 1:
        points = points.ravel()
        # Sort points/values together, necessary as input for interp1d
        idx = np.argsort(points)
        points = points[idx]
        values = values[idx]
        return interp1d(
            points, values, kind="linear", axis=0, bounds_error=False, fill_value=np.nan
        )
    else:
        return LinearNDInterpolator(points, values, fill_value=np.nan, rescale=False)


class FPS(_FPS):
    """Transformer performing Greedy Sample Selection using Farthest Point
    Sampling.

    Parameters
    ----------
    initialize : int, list of int, numpy.ndarray of int, or 'random', default=0
        Index of the first selection(s). If 'random', picks a random value
        when fit starts. Stored in :py:attr:`self.initialize`.
    n_to_select : int or float, default=None
        The number of selections to make. If ``None``, half of the samples
        are selected. If integer, the parameter is the absolute number of
        selections to make. If float between 0 and 1, it is the fraction of
        the total dataset to select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If ``None``, selection will continue until
        ``n_to_select`` samples are chosen. Otherwise it will stop when the
        score falls below the threshold. Stored in
        :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score
        used by the selector is compared to the threshold directly. When
        "relative", at each iteration, the score used by the selector is
        compared proportionally to the score of the first selection, i.e. the
        selector quits when ``current_score / first_score < threshold``.
        Stored in :py:attr:`self.score_threshold_type`.
    progress_bar : bool, default=False
        Option to use `tqdm <https://tqdm.github.io/>`_ progress bar to
        monitor selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose
        randomly from the remaining samples. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0

    Attributes
    ----------
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray
        Matrix containing the selected samples, for use in fitting
    y_selected_ : numpy.ndarray
        In sample selection, the matrix containing the selected targets, for
        use in fitting
    selected_idx_ : numpy.ndarray
        Indices of the selected samples

    Examples
    --------
    >>> from skmatter.sample_selection import FPS
    >>> import numpy as np
    >>> selector = FPS(
    ...     n_to_select=2,
    ...     # int or 'random', default=0
    ...     # Index of the first selection.
    ...     # If "random", picks a random value when fit starts.
    ...     initialize=0,
    ... )
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> selector.fit(X)
    FPS(n_to_select=2)
    >>> selector.selected_idx_
    array([0, 2])
    """

    def __init__(
        self,
        initialize=0,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Delegate to the generic greedy selector, fixing the selection
        # axis to samples (rows) rather than features (columns).
        super().__init__(
            selection_type="sample",
            initialize=initialize,
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )
class PCovFPS(_PCovFPS):
    r"""Transformer performing Greedy Sample Selection using PCovR-weighted
    Farthest Point Sampling.

    Parameters
    ----------
    mixing : float, default=0.5
        The PCovR mixing parameter, as described in PCovR as :math:`{\alpha}`
    initialize : int or 'random', default=0
        Index of the first selection. If 'random', picks a random value when
        fit starts.
    n_to_select : int or float, default=None
        The number of selections to make. If ``None``, half of the samples
        are selected. If integer, the parameter is the absolute number of
        selections to make. If float between 0 and 1, it is the fraction of
        the total dataset to select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If ``None``, selection will continue until
        ``n_to_select`` samples are chosen. Otherwise it will stop when the
        score falls below the threshold. Stored in
        :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score
        used by the selector is compared to the threshold directly. When
        "relative", at each iteration, the score used by the selector is
        compared proportionally to the score of the first selection, i.e. the
        selector quits when ``current_score / first_score < threshold``.
        Stored in :py:attr:`self.score_threshold_type`.
    progress_bar : bool, default=False
        Option to use `tqdm <https://tqdm.github.io/>`_ progress bar to
        monitor selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose
        randomly from the remaining samples. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0

    Attributes
    ----------
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray
        Matrix containing the selected samples, for use in fitting
    y_selected_ : numpy.ndarray
        In sample selection, the matrix containing the selected targets, for
        use in fitting
    selected_idx_ : numpy.ndarray
        Indices of the selected samples

    Examples
    --------
    >>> from skmatter.sample_selection import PCovFPS
    >>> import numpy as np
    >>> selector = PCovFPS(
    ...     n_to_select=2,
    ...     # int or 'random', default=0
    ...     # Index of the first selection.
    ...     # If "random", picks a random value when fit starts.
    ...     initialize=0,
    ... )
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> y = np.array([0.0, 0.0, 1.0])  # classes of each sample
    >>> selector.fit(X, y)
    PCovFPS(n_to_select=2)
    >>> selector.selected_idx_
    array([0, 2])
    """

    def __init__(
        self,
        mixing=0.5,
        initialize=0,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Delegate to the generic PCovR-weighted FPS selector, fixing the
        # selection axis to samples (rows).
        super().__init__(
            selection_type="sample",
            mixing=mixing,
            initialize=initialize,
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )
class CUR(_CUR):
    """Transformer that performs Greedy Sample Selection by choosing samples
    which maximize the magnitude of the left singular vectors, consistent with
    classic CUR matrix decomposition.

    Parameters
    ----------
    recompute_every : int
        Number of steps after which to recompute the pi score. Defaults to 1;
        if 0, no re-computation is done.
    k : int
        Number of eigenvectors to compute the importance score with,
        defaults to 1.
    tolerance : float
        Threshold below which scores will be considered 0, defaults to 1E-12.
    n_to_select : int or float, default=None
        The number of selections to make. If ``None``, half of the samples
        are selected. If integer, the parameter is the absolute number of
        selections to make. If float between 0 and 1, it is the fraction of
        the total dataset to select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If ``None``, selection will continue until
        ``n_to_select`` samples are chosen. Otherwise it will stop when the
        score falls below the threshold. Stored in
        :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score
        used by the selector is compared to the threshold directly. When
        "relative", at each iteration, the score used by the selector is
        compared proportionally to the score of the first selection, i.e. the
        selector quits when ``current_score / first_score < threshold``.
        Stored in :py:attr:`self.score_threshold_type`.
    progress_bar : bool, default=False
        Option to use `tqdm <https://tqdm.github.io/>`_ progress bar to
        monitor selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose
        randomly from the remaining samples. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0

    Attributes
    ----------
    X_current_ : numpy.ndarray (n_samples, n_features)
        The original matrix orthogonalized by previous selections
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray
        Matrix containing the selected samples, for use in fitting
    y_selected_ : numpy.ndarray
        In sample selection, the matrix containing the selected targets, for
        use in fitting
    pi_ : numpy.ndarray (n_features)
        The importance score, see :func:`_compute_pi`
    selected_idx_ : numpy.ndarray
        Indices of the selected samples

    Examples
    --------
    >>> from skmatter.sample_selection import CUR
    >>> import numpy as np
    >>> selector = CUR(n_to_select=2, random_state=0)
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, -0.11],  # 4 samples, 3 features
    ...         [-0.09, 0.32, 0.51],
    ...         [-0.03, 0.53, 0.14],
    ...         [-0.83, -0.13, 0.82],
    ...     ]
    ... )
    >>> selector.fit(X)
    CUR(n_to_select=2)
    >>> np.round(selector.pi_, 2)  # importance score
    array([0.01, 0.99, 0.  , 0.  ])
    >>> selector.selected_idx_  # selected idx
    array([3, 2])
    >>> # selector.transform(X) cannot be used as the sklearn API
    >>> # restricts the change of sample size using transformers.
    >>> # So one has to do
    >>> X[selector.selected_idx_]
    array([[-0.83, -0.13,  0.82],
           [-0.03,  0.53,  0.14]])
    """

    def __init__(
        self,
        recompute_every=1,
        k=1,
        tolerance=1e-12,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Delegate to the generic CUR selector, fixing the selection axis
        # to samples (rows).
        super().__init__(
            selection_type="sample",
            recompute_every=recompute_every,
            k=k,
            tolerance=tolerance,
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )
class PCovCUR(_PCovCUR):
    r"""Transformer that performs Greedy Sample Selection by choosing samples
    which maximize the importance score :math:`\pi`, which is the sum over the
    squares of the first :math:`k` components of the PCovR-modified left
    singular vectors.

    Parameters
    ----------
    mixing : float, default=0.5
        The PCovR mixing parameter, as described in PCovR as
        :math:`{\alpha}`. Stored in :py:attr:`self.mixing`.
    recompute_every : int
        Number of steps after which to recompute the pi score. Defaults to 1;
        if 0, no re-computation is done.
    k : int
        Number of eigenvectors to compute the importance score with,
        defaults to 1.
    tolerance : float
        Threshold below which scores will be considered 0, defaults to 1E-12.
    n_to_select : int or float, default=None
        The number of selections to make. If ``None``, half of the samples
        are selected. If integer, the parameter is the absolute number of
        selections to make. If float between 0 and 1, it is the fraction of
        the total dataset to select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If ``None``, selection will continue until
        ``n_to_select`` samples are chosen. Otherwise it will stop when the
        score falls below the threshold. Stored in
        :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score
        used by the selector is compared to the threshold directly. When
        "relative", at each iteration, the score used by the selector is
        compared proportionally to the score of the first selection, i.e. the
        selector quits when ``current_score / first_score < threshold``.
        Stored in :py:attr:`self.score_threshold_type`.
    progress_bar : bool, default=False
        Option to use `tqdm <https://tqdm.github.io/>`_ progress bar to
        monitor selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose
        randomly from the remaining samples. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0

    Attributes
    ----------
    X_current_ : numpy.ndarray (n_samples, n_features)
        The original matrix orthogonalized by previous selections
    y_current_ : numpy.ndarray (n_samples, n_properties)
        The targets orthogonalized by a regression on the previous selections.
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray
        Matrix containing the selected samples, for use in fitting
    y_selected_ : numpy.ndarray
        In sample selection, the matrix containing the selected targets, for
        use in fitting
    pi_ : numpy.ndarray (n_features)
        The importance score, see :func:`_compute_pi`
    selected_idx_ : numpy.ndarray
        Indices of the selected samples

    Examples
    --------
    >>> from skmatter.sample_selection import PCovCUR
    >>> import numpy as np
    >>> selector = PCovCUR(n_to_select=2, random_state=0)
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> y = np.array([0.0, 0.0, 1.0])  # classes of each sample
    >>> selector.fit(X, y)
    PCovCUR(n_to_select=2)
    >>> np.round(selector.pi_, 2)  # importance score
    array([1., 0., 0.])
    >>> selector.selected_idx_  # selected idx
    array([2, 1])
    >>> # selector.transform(X) cannot be used as the sklearn API
    >>> # restricts the change of sample size using transformers.
    >>> # So one has to do
    >>> X[selector.selected_idx_]
    array([[-0.03, -0.53,  0.08],
           [-0.09,  0.32, -0.1 ]])
    """

    def __init__(
        self,
        mixing=0.5,
        recompute_every=1,
        k=1,
        tolerance=1e-12,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Delegate to the generic PCovR-weighted CUR selector, fixing the
        # selection axis to samples (rows).
        super().__init__(
            selection_type="sample",
            mixing=mixing,
            recompute_every=recompute_every,
            k=k,
            tolerance=tolerance,
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )
def _directional_distance(equations, points): """ Computes the distance of the points to the planes defined by the equations with respect to the direction of the first dimension. equations : numpy.ndarray of shape (n_facets, n_dim) each row contains the coefficienst for the plane equation of the form .. math:: equations[i, 0]*x_1 + ... + equations[i, -2]*x_{n_dim} = equations[i, -1] -equations[i, -1] is the offset points : numpy.ndarray of shape (n_samples, n_dim) points to compute the directional distance from Returns ------- directional_distance : numpy.ndarray of shape (nsamples, nequations) closest distance wrt. the first dimension of the point to the planes defined by the equations """ orthogonal_distances = -(points @ equations[:, :-1].T) - equations[:, -1:].T return -orthogonal_distances / equations[:, :1].T
class DirectionalConvexHull(BaseEstimator):
    """Performs Sample Selection by constructing a Directional Convex Hull and
    determining the distance to the hull as outlined in the reference [dch]_.

    Parameters
    ----------
    low_dim_idx : list of int, default=None
        Indices of columns of X containing features to be used for the
        directional convex hull construction (also known as the
        low-dimensional (LD) hull). By default ``[0]`` is used.
    tolerance : float, default=1.0E-12
        Tolerance for the negative distances to the directional convex hull
        to consider a point below the convex hull. Depending on whether a
        point is below or above the convex hull the distance is computed
        differently. A very low value can result in completely wrong
        distances. Distances cannot be distinguished from zero up to
        tolerance. It is recommended to leave the default setting.

    Attributes
    ----------
    high_dim_idx_ : numpy.ndarray of int
        Indices of columns in the data containing high-dimensional features
        (i.e. those not used for the convex hull construction)
    selected_idx_ : numpy.ndarray
        Indices of datapoints that form the vertices of the convex hull
    interpolator_high_dim_ : scipy.interpolate._interpnd.LinearNDInterpolator
        Interpolator for the features in the high-dimensional space

    Examples
    --------
    >>> from skmatter.sample_selection import DirectionalConvexHull
    >>> selector = DirectionalConvexHull(
    ...     # Indices of columns of X to use for fitting
    ...     # the convex hull
    ...     low_dim_idx=[0, 1],
    ... )
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 4 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...         [-0.41, 0.25, 0.34],
    ...     ]
    ... )
    >>> y = np.array([0.1, 1.0, 0.2, 0.4])  # classes of each sample
    >>> dch = selector.fit(X, y)
    >>> # Get the distance to the convex hull for samples used to fit the
    >>> # convex hull. This can also be called using other samples (X_new)
    >>> # and corresponding properties (y_new) that were not used to fit
    >>> # the hull. In this case all samples lie on the convex hull, so we
    >>> # get zeros.
    >>> np.allclose(dch.score_samples(X, y), [0.0, 0.0, 0.0, 0.0])
    True

    References
    ----------
    .. [dch] A. Anelli, E. A. Engel, C. J. Pickard and M. Ceriotti,
             Physical Review Materials, 2018.
    """

    def __init__(self, low_dim_idx=None, tolerance=1e-12):
        # NOTE(review): resolving the default here (rather than in ``fit``)
        # deviates from the sklearn convention of leaving __init__ params
        # untouched, but preserves the original behavior.
        self.low_dim_idx = [0] if low_dim_idx is None else low_dim_idx
        self.tolerance = tolerance

    def fit(self, X, y):
        """Learn the samples that form the convex hull.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            Feature matrix of samples to use for constructing the convex
            hull.
        y : numpy.ndarray of shape (n_samples,)
            Target values (property on which the convex hull should be
            constructed, e.g. Gibbs free energy)

        Returns
        -------
        self : object
            Fitted scorer.

        Raises
        ------
        ValueError
            If any index in ``low_dim_idx`` is out of bounds for X.
        """
        X, y = self._check_X_y(X, y)
        self.n_features_in_ = X.shape[1]

        if len(y.shape) == 1:
            y = y.reshape((len(y), 1))

        # Bounds check on the hull-feature indices. (The previous check used
        # ``> X.shape[1]`` and was skipped entirely when any index was
        # negative; an index equal to n_features or below -n_features is also
        # out of bounds, so reject those explicitly here.)
        low_dim_idx = np.asarray(self.low_dim_idx)
        if np.any(low_dim_idx >= X.shape[1]) or np.any(low_dim_idx < -X.shape[1]):
            raise ValueError(
                "One or more columns indexed with low_dim_idx is"
                " out of bounds with the dimensions of X."
            )

        # Columns not used for the hull are treated as high-dimensional
        # features to be interpolated later.
        self.high_dim_idx_ = np.setdiff1d(np.arange(X.shape[1]), self.low_dim_idx)

        # get number of dimensions for the convex (lower dimensional) hull
        # construction
        n_low_dim_idx = len(self.low_dim_idx)

        # append features and target property to the same data matrix (for
        # the convex hull construction); the target y occupies column 0 so
        # "directional" always refers to the first dimension
        convex_hull_data = np.zeros((X.shape[0], n_low_dim_idx + 1))
        convex_hull_data[:, :1] = y
        convex_hull_data[:, 1:] = X[:, self.low_dim_idx].copy()

        # create high-dimensional feature matrix
        high_dim_feats = X[:, self.high_dim_idx_]

        # get scipy convex hull
        self.convex_hull_ = ConvexHull(convex_hull_data, incremental=True)

        # get normal equations to the hull simplices; column 0 is the
        # y-component of each facet normal
        y_normal = self.convex_hull_.equations[:, 0]

        # keep only the "directional" facets, i.e. the lower hull with
        # respect to the target dimension (negative y-normal)
        directional_facets_idx = np.where(y_normal < 0)[0]
        self.directional_simplices_ = self.convex_hull_.simplices[
            directional_facets_idx
        ]
        self._directional_equations_ = self.convex_hull_.equations[
            directional_facets_idx
        ]
        self.selected_idx_ = np.unique(self.directional_simplices_.flatten())
        self._directional_points_ = convex_hull_data[self.selected_idx_]

        # required for the score_feature_matrix function
        self.interpolator_high_dim_ = _linear_interpolator(
            points=convex_hull_data[self.selected_idx_, 1:],
            values=high_dim_feats[self.selected_idx_],
        )
        return self

    @property
    def directional_vertices_(self):
        # Alias for the hull-vertex indices found during fit.
        return self.selected_idx_

    def _check_X_y(self, X, y):
        # Standard sklearn input validation: 2-D X, 1-D y of matching length.
        return check_X_y(X, y, ensure_min_features=1, multi_output=False)

    def _check_is_fitted(self, X):
        # Verify fit() has run and X has the same number of features as the
        # data the hull was fitted on.
        check_is_fitted(
            self,
            [
                "high_dim_idx_",
                "interpolator_high_dim_",
                "selected_idx_",
            ],
        )
        n_features = X.shape[1]
        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input."
            )
        return True

    def score_samples(self, X, y):
        """Calculate the distance of the samples to the convex hull in the
        target direction y.

        Samples with a distance > 0 lie above the convex surface. Samples
        with a distance of zero lie on the convex surface. Samples with a
        distance value < 0 lie below the convex surface.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            Feature matrix of samples to use for determining distance to the
            convex hull. Please note that samples provided should have the
            same dimensions (features) as used during fitting of the convex
            hull. The same column indices will be used for the low- and
            high-dimensional features.
        y : numpy.ndarray of shape (n_samples,)
            Target values (property on which the convex hull should be
            constructed, e.g. Gibbs free energy)

        Returns
        -------
        dch_distance : numpy.ndarray of shape (n_samples,)
            The distance of each sample to the convex hull along the target
            (y) direction.
        """
        X, y = self._check_X_y(X, y)
        self._check_is_fitted(X)

        # features used for the convex hull construction
        low_dim_feats = X[:, self.low_dim_idx]
        # Same column layout as in fit: target first, hull features after.
        hull_space_data = np.hstack((y.reshape(-1, 1), low_dim_feats))

        # the X points projected on the convex surface
        return self._directional_convex_hull_distance(hull_space_data).reshape(y.shape)

    def _directional_convex_hull_distance(self, points):
        """Distance to the fitted directional convex hull in the target
        dimension y.

        If a point lies above the convex hull, it will have a positive
        distance. If a point lies below the convex hull, it will have a
        negative distance - this should only occur if you pass a point that
        the convex hull has not been fitted on in the `fit` function. If a
        point lies on the convex hull, it will have a distance of zero.

        Parameters
        ----------
        points :
            The points for which you would like to calculate the distance to
            the fitted directional convex hull.
        """
        # directional distances to all plane equations
        all_directional_distances = _directional_distance(
            self._directional_equations_, points
        )
        # we get negative distances for each plane to check if any distance
        # is below the threshold
        below_directional_convex_hull = np.any(
            all_directional_distances < -self.tolerance, axis=1
        )
        # directional distances to corresponding plane equation
        directional_distances = np.zeros(len(points))
        # points on/above the hull: the governing facet is the closest one
        directional_distances[~below_directional_convex_hull] = np.min(
            all_directional_distances[~below_directional_convex_hull], axis=1
        )
        # some distances can be positive, so we take the max of all negative
        # distances
        negative_directional_distances = all_directional_distances.copy()
        negative_directional_distances[all_directional_distances > 0] = -np.inf
        directional_distances[below_directional_convex_hull] = np.max(
            negative_directional_distances[below_directional_convex_hull], axis=1
        )
        return directional_distances

    def score_feature_matrix(self, X):
        """Calculate the distance (or more specifically, the residuals) of
        the samples to the convex hull in the high-dimensional space.

        Samples with a distance value of zero in all the higher dimensions
        lie on the convex hull.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            Feature matrix of samples to use for determining distance to the
            convex hull. Please note that samples provided should have the
            same dimensions (features) as used during fitting of the convex
            hull. The same column indices will be used for the low- and
            high-dimensional features.

        Returns
        -------
        dch_distance : numpy.ndarray of shape (n_samples, len(high_dim_idx_))
            The distance (residuals) of samples to the convex hull in the
            higher-dimensional space.
        """
        X = check_array(X)
        self._check_is_fitted(X)

        # features used for the convex hull construction
        low_dim_feats = X[:, self.low_dim_idx]
        # HD features not used for the convex hull
        high_dim_feats = X[:, self.high_dim_idx_]

        if len(self.low_dim_idx) == 1:
            # a 1-D interpolator (interp1d) expects a flat coordinate array
            low_dim_feats = low_dim_feats.reshape(
                -1,
            )

        # interpolate features
        interpolated_high_dim_feats = self.interpolator_high_dim_(low_dim_feats)
        if np.any(np.isnan(interpolated_high_dim_feats)):
            warnings.warn(
                "There are samples in X with a low-dimensional part that is outside "
                "of the range of the convex surface. Distance will contain nans.",
                UserWarning,
                stacklevel=1,
            )

        # determine the distance between the original high-dimensional data
        # and interpolated high-dimensional data
        dch_distance = high_dim_feats - interpolated_high_dim_feats
        return dch_distance