Source code for skmatter.feature_selection._base

"""Sequential feature selection."""

from .._selection import _CUR, _FPS, _PCovCUR, _PCovFPS


class FPS(_FPS):
    """Transformer performing Greedy Feature Selection using Farthest Point Sampling.

    This class only fixes ``selection_type="feature"``; every constructor argument
    is forwarded unchanged to the base selector.

    Parameters
    ----------
    initialize: int, list of int, numpy.ndarray of int, or 'random', default=0
        Index of the first selection(s). If 'random', picks a random value when fit
        starts. Stored in :py:attr:`self.initialize`.
    n_to_select : int or float, default=None
        The number of selections to make. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of selections to
        make. If float between 0 and 1, it is the fraction of the total dataset to
        select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If `None` selection will continue until the
        n_to_select is chosen. Otherwise will stop when the score falls below the
        threshold. Stored in :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score used by
        the selector is compared to the threshold directly. When "relative", at each
        iteration, the score used by the selector is compared proportionally to the
        score of the first selection, i.e. the selector quits when
        ``current_score / first_score < threshold``. Stored in
        :py:attr:`self.score_threshold_type`.
    progress_bar: bool, default=False
        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
        selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose randomly
        from the remaining features. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0
        Forwarded unchanged to the base selector.

    Attributes
    ----------
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : ndarray,
        Matrix containing the selected features, for use in fitting
    selected_idx_ : ndarray
        indices of selected features

    Examples
    --------
    >>> from skmatter.feature_selection import FPS
    >>> import numpy as np
    >>> selector = FPS(
    ...     n_to_select=2,
    ...     # int or 'random', default=0
    ...     # Index of the first selection.
    ...     # If "random", picks a random value when fit starts.
    ...     initialize=0,
    ... )
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> selector.fit(X)
    FPS(n_to_select=2)
    >>> Xr = selector.transform(X)
    >>> selector.selected_idx_
    array([0, 1])
    """

    def __init__(
        self,
        initialize=0,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Arguments are listed explicitly (no **kwargs) so the signature stays
        # introspectable; the only value added here is the selection axis.
        super().__init__(
            selection_type="feature",
            initialize=initialize,
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )


class PCovFPS(_PCovFPS):
    r"""Transformer that performs Greedy Feature Selection using PCovR-weighted
    Farthest Point Sampling.

    This class only fixes ``selection_type="feature"``; every constructor argument
    is forwarded unchanged to the base selector.

    Parameters
    ----------
    mixing: float, default=0.5
        The PCovR mixing parameter, as described in PCovR as :math:`{\alpha}`
    initialize: int or 'random', default=0
        Index of the first selection. If 'random', picks a random value when fit
        starts.
    n_to_select : int or float, default=None
        The number of selections to make. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of selections to
        make. If float between 0 and 1, it is the fraction of the total dataset to
        select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If `None` selection will continue until the
        n_to_select is chosen. Otherwise will stop when the score falls below the
        threshold. Stored in :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score used by
        the selector is compared to the threshold directly. When "relative", at each
        iteration, the score used by the selector is compared proportionally to the
        score of the first selection, i.e. the selector quits when
        ``current_score / first_score < threshold``. Stored in
        :py:attr:`self.score_threshold_type`.
    progress_bar: bool, default=False
        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
        selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose randomly
        from the remaining features. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0
        Forwarded unchanged to the base selector.

    Attributes
    ----------
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray,
        Matrix containing the selected features, for use in fitting
    selected_idx_ : numpy.ndarray
        indices of selected features

    Examples
    --------
    >>> from skmatter.feature_selection import PCovFPS
    >>> import numpy as np
    >>> selector = PCovFPS(
    ...     n_to_select=2,
    ...     # int or 'random', default=0
    ...     # Index of the first selection.
    ...     # If "random", picks a random value when fit starts.
    ...     initialize=0,
    ... )
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> y = np.array([0.0, 0.0, 1.0])  # one target value per sample
    >>> selector.fit(X, y)
    PCovFPS(n_to_select=2)
    >>> Xr = selector.transform(X)
    >>> selector.selected_idx_
    array([0, 1])
    """

    def __init__(
        self,
        mixing=0.5,
        initialize=0,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Arguments are listed explicitly (no **kwargs) so the signature stays
        # introspectable; the only value added here is the selection axis.
        super().__init__(
            selection_type="feature",
            mixing=mixing,
            initialize=initialize,
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )


class CUR(_CUR):
    """Transformer that performs Greedy Feature Selection by choosing features
    which maximize the magnitude of the right singular vectors, consistent with
    classic CUR matrix decomposition.

    This class only fixes ``selection_type="feature"``; every constructor argument
    is forwarded unchanged to the base selector.

    Parameters
    ----------
    recompute_every : int, default=1
        number of steps after which to recompute the pi score; if 0 no
        re-computation is done
    k : int, default=1
        number of eigenvectors to compute the importance score with
    tolerance: float, default=1e-12
        threshold below which scores will be considered 0
    n_to_select : int or float, default=None
        The number of selections to make. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of selections to
        make. If float between 0 and 1, it is the fraction of the total dataset to
        select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If `None` selection will continue until the
        n_to_select is chosen. Otherwise will stop when the score falls below the
        threshold. Stored in :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score used by
        the selector is compared to the threshold directly. When "relative", at each
        iteration, the score used by the selector is compared proportionally to the
        score of the first selection, i.e. the selector quits when
        ``current_score / first_score < threshold``. Stored in
        :py:attr:`self.score_threshold_type`.
    progress_bar: bool, default=False
        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
        selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose randomly
        from the remaining features. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0
        Forwarded unchanged to the base selector.

    Attributes
    ----------
    X_current_ : numpy.ndarray (n_samples, n_features)
        The original matrix orthogonalized by previous selections
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray
        Matrix containing the selected features, for use in fitting
    pi_ : numpy.ndarray (n_features),
        the importance score see :func:`_compute_pi`
    selected_idx_ : numpy.ndarray
        indices of selected features

    Examples
    --------
    >>> from skmatter.feature_selection import CUR
    >>> import numpy as np
    >>> selector = CUR(n_to_select=2, random_state=0)
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> selector.fit(X)
    CUR(n_to_select=2)
    >>> Xr = selector.transform(X)
    >>> print(Xr.shape)
    (3, 2)
    >>> np.round(selector.pi_)  # importance score
    array([0., 0., 0.])
    >>> selector.selected_idx_
    array([1, 0])
    """

    def __init__(
        self,
        recompute_every=1,
        k=1,
        tolerance=1e-12,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Arguments are listed explicitly (no **kwargs) so the signature stays
        # introspectable; the only value added here is the selection axis.
        super().__init__(
            selection_type="feature",
            recompute_every=recompute_every,
            k=k,
            tolerance=tolerance,
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )


class PCovCUR(_PCovCUR):
    r"""Transformer that performs Greedy Feature Selection by choosing features
    which maximize the importance score :math:`\pi`, which is the sum over
    the squares of the first :math:`k` components of the PCovR-modified
    right singular vectors.

    This class only fixes ``selection_type="feature"``; every constructor argument
    is forwarded unchanged to the base selector.

    Parameters
    ----------
    recompute_every : int, default=1
        number of steps after which to recompute the pi score; if 0 no
        re-computation is done
    k : int, default=1
        number of eigenvectors to compute the importance score with
    tolerance: float, default=1e-12
        threshold below which scores will be considered 0
    mixing: float, default=0.5
        The PCovR mixing parameter, as described in PCovR as :math:`{\alpha}`.
        Stored in :py:attr:`self.mixing`.
    n_to_select : int or float, default=None
        The number of selections to make. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of selections to
        make. If float between 0 and 1, it is the fraction of the total dataset to
        select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If `None` selection will continue until the
        n_to_select is chosen. Otherwise will stop when the score falls below the
        threshold. Stored in :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score used by
        the selector is compared to the threshold directly. When "relative", at each
        iteration, the score used by the selector is compared proportionally to the
        score of the first selection, i.e. the selector quits when
        ``current_score / first_score < threshold``. Stored in
        :py:attr:`self.score_threshold_type`.
    progress_bar: bool, default=False
        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
        selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose randomly
        from the remaining features. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0
        Forwarded unchanged to the base selector.

    Attributes
    ----------
    X_current_ : numpy.ndarray (n_samples, n_features)
        The original matrix orthogonalized by previous selections
    y_current_ : numpy.ndarray (n_samples, n_properties)
        The targets orthogonalized by a regression on the previous selections.
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray,
        Matrix containing the selected features, for use in fitting
    pi_ : numpy.ndarray (n_features),
        the importance score see :func:`_compute_pi`
    selected_idx_ : numpy.ndarray
        indices of selected features

    Examples
    --------
    >>> from skmatter.feature_selection import PCovCUR
    >>> import numpy as np
    >>> selector = PCovCUR(n_to_select=2, mixing=0.5, random_state=0)
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> y = np.array([0.0, 0.0, 1.0])  # one target value per sample
    >>> selector.fit(X, y)
    PCovCUR(n_to_select=2)
    >>> Xr = selector.transform(X)
    >>> print(Xr.shape)
    (3, 2)
    >>> np.round(selector.pi_)  # importance score
    array([0., 0., 0.])
    >>> selector.selected_idx_
    array([1, 0])
    """

    def __init__(
        self,
        mixing=0.5,
        recompute_every=1,
        k=1,
        tolerance=1e-12,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Arguments are listed explicitly (no **kwargs) so the signature stays
        # introspectable; the only value added here is the selection axis.
        super().__init__(
            selection_type="feature",
            mixing=mixing,
            recompute_every=recompute_every,
            k=k,
            tolerance=tolerance,
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )