# Source code for skmatter.feature_selection._base

"""Sequential feature selection."""

from .._selection import _CUR, _FPS, _PCovCUR, _PCovFPS



# [docs]
class FPS(_FPS):
    """Greedy feature selector based on Farthest Point Sampling (FPS).

    The constructor fixes ``selection_type="feature"`` and hands every other
    argument, unchanged, to the ``_FPS`` base class.

    Parameters
    ----------
    initialize : int, list of int, numpy.ndarray of int, or 'random', default=0
        Index of the first selection(s). If 'random', a random value is picked
        when fit starts. Stored in :py:attr:`self.initialize`.
    n_to_select : int or float, default=None
        The number of selections to make. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of selections
        to make. If float between 0 and 1, it is the fraction of the total
        dataset to select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If `None`, selection continues until
        ``n_to_select`` selections are made. Otherwise selection stops when the
        score falls below the threshold. Stored in
        :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score
        used by the selector is compared to the threshold directly. When
        "relative", at each iteration, the score used by the selector is
        compared proportionally to the score of the first selection, i.e. the
        selector quits when ``current_score / first_score < threshold``. Stored
        in :py:attr:`self.score_threshold_type`.
    progress_bar : bool, default=False
        Option to use a `tqdm <https://tqdm.github.io/>`_ progress bar to
        monitor selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose
        randomly from the remaining features. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0

    Attributes
    ----------
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : ndarray
        Matrix containing the selected features, for use in fitting
    selected_idx_ : ndarray
        Indices of the selected features

    Examples
    --------
    >>> from skmatter.feature_selection import FPS
    >>> import numpy as np
    >>> selector = FPS(
    ...     n_to_select=2,
    ...     # int or 'random', default=0
    ...     # Index of the first selection.
    ...     # If "random", picks a random value when fit starts.
    ...     initialize=0,
    ... )
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> selector.fit(X)
    FPS(n_to_select=2)
    >>> Xr = selector.transform(X)
    >>> selector.selected_idx_
    array([0, 1])
    """

    def __init__(
        self,
        initialize=0,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Caller-supplied settings, passed through to the base class as-is.
        forwarded = {
            "initialize": initialize,
            "n_to_select": n_to_select,
            "score_threshold": score_threshold,
            "score_threshold_type": score_threshold_type,
            "progress_bar": progress_bar,
            "full": full,
            "random_state": random_state,
        }
        # The selection axis is not configurable here: this class always
        # selects features.
        super().__init__(selection_type="feature", **forwarded)




# [docs]
class PCovFPS(_PCovFPS):
    r"""Greedy feature selector using PCovR-weighted Farthest Point Sampling.

    The constructor fixes ``selection_type="feature"`` and hands every other
    argument, unchanged, to the ``_PCovFPS`` base class.

    Parameters
    ----------
    mixing : float, default=0.5
        The PCovR mixing parameter, as described in PCovR as :math:`{\alpha}`
    initialize : int or 'random', default=0
        Index of the first selection. If 'random', a random value is picked
        when fit starts.
    n_to_select : int or float, default=None
        The number of selections to make. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of selections
        to make. If float between 0 and 1, it is the fraction of the total
        dataset to select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If `None`, selection continues until
        ``n_to_select`` selections are made. Otherwise selection stops when the
        score falls below the threshold. Stored in
        :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score
        used by the selector is compared to the threshold directly. When
        "relative", at each iteration, the score used by the selector is
        compared proportionally to the score of the first selection, i.e. the
        selector quits when ``current_score / first_score < threshold``. Stored
        in :py:attr:`self.score_threshold_type`.
    progress_bar : bool, default=False
        Option to use a `tqdm <https://tqdm.github.io/>`_ progress bar to
        monitor selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose
        randomly from the remaining features. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0

    Attributes
    ----------
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray
        Matrix containing the selected features, for use in fitting
    selected_idx_ : numpy.ndarray
        Indices of the selected features

    Examples
    --------
    >>> from skmatter.feature_selection import PCovFPS
    >>> import numpy as np
    >>> selector = PCovFPS(
    ...     n_to_select=2,
    ...     # int or 'random', default=0
    ...     # Index of the first selection.
    ...     # If 'random', picks a random value when fit starts.
    ...     initialize=0,
    ... )
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> y = np.array([0.0, 0.0, 1.0])  # classes of each sample
    >>> selector.fit(X, y)
    PCovFPS(n_to_select=2)
    >>> Xr = selector.transform(X)
    >>> selector.selected_idx_
    array([0, 1])
    """

    def __init__(
        self,
        mixing=0.5,
        initialize=0,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Gather the caller's settings; none of them is altered on the way
        # to the base class.
        base_kwargs = dict(
            mixing=mixing,
            initialize=initialize,
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )
        # This class always selects features, so the axis is hard-wired.
        super().__init__(selection_type="feature", **base_kwargs)




# [docs]
class CUR(_CUR):
    """Greedy feature selector in the style of CUR matrix decomposition.

    Features are chosen to maximize the magnitude of the right singular
    vectors, consistent with classic CUR matrix decomposition. The constructor
    fixes ``selection_type="feature"`` and hands every other argument,
    unchanged, to the ``_CUR`` base class.

    Parameters
    ----------
    recompute_every : int
        Number of steps after which to recompute the pi score.
        Defaults to 1; if 0, no re-computation is done.
    k : int
        Number of eigenvectors to compute the importance score with,
        defaults to ``1``
    tolerance : float
        Threshold below which scores will be considered 0,
        defaults to ``1e-12``
    n_to_select : int or float, default=None
        The number of selections to make. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of selections
        to make. If float between 0 and 1, it is the fraction of the total
        dataset to select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If `None`, selection continues until
        ``n_to_select`` selections are made. Otherwise selection stops when the
        score falls below the threshold. Stored in
        :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score
        used by the selector is compared to the threshold directly. When
        "relative", at each iteration, the score used by the selector is
        compared proportionally to the score of the first selection, i.e. the
        selector quits when ``current_score / first_score < threshold``. Stored
        in :py:attr:`self.score_threshold_type`.
    progress_bar : bool, default=False
        Option to use a `tqdm <https://tqdm.github.io/>`_ progress bar to
        monitor selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose
        randomly from the remaining features. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0

    Attributes
    ----------
    X_current_ : numpy.ndarray (n_samples, n_features)
        The original matrix orthogonalized by previous selections
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray
        Matrix containing the selected features, for use in fitting
    pi_ : numpy.ndarray (n_features)
        The importance score, see :func:`_compute_pi`
    selected_idx_ : numpy.ndarray
        Indices of the selected features

    Examples
    --------
    >>> from skmatter.feature_selection import CUR
    >>> import numpy as np
    >>> selector = CUR(n_to_select=2, random_state=0)
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> selector.fit(X)
    CUR(n_to_select=2)
    >>> Xr = selector.transform(X)
    >>> print(Xr.shape)
    (3, 2)
    >>> np.round(selector.pi_)  # importance score
    array([0., 0., 0.])
    >>> selector.selected_idx_
    array([1, 0])
    """

    def __init__(
        self,
        recompute_every=1,
        k=1,
        tolerance=1e-12,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Settings that appear only on the CUR-type selectors in this module.
        cur_settings = {
            "recompute_every": recompute_every,
            "k": k,
            "tolerance": tolerance,
        }
        # Settings that every selector in this module accepts.
        common_settings = {
            "n_to_select": n_to_select,
            "score_threshold": score_threshold,
            "score_threshold_type": score_threshold_type,
            "progress_bar": progress_bar,
            "full": full,
            "random_state": random_state,
        }
        # This class always selects features, so the axis is hard-wired.
        super().__init__(
            selection_type="feature", **cur_settings, **common_settings
        )




# [docs]
class PCovCUR(_PCovCUR):
    r"""Greedy feature selector using a PCovR-modified CUR importance score.

    Features are chosen to maximize the importance score :math:`\pi`, which is
    the sum over the squares of the first :math:`k` components of the
    PCovR-modified right singular vectors. The constructor fixes
    ``selection_type="feature"`` and hands every other argument, unchanged, to
    the ``_PCovCUR`` base class.

    Parameters
    ----------
    recompute_every : int
        Number of steps after which to recompute the pi score.
        Defaults to 1; if 0, no re-computation is done.
    k : int
        Number of eigenvectors to compute the importance score with,
        defaults to 1
    tolerance : float
        Threshold below which scores will be considered 0,
        defaults to ``1e-12``
    mixing : float, default=0.5
        The PCovR mixing parameter, as described in PCovR as
        :math:`{\alpha}`. Stored in :py:attr:`self.mixing`.
    n_to_select : int or float, default=None
        The number of selections to make. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of selections
        to make. If float between 0 and 1, it is the fraction of the total
        dataset to select. Stored in :py:attr:`self.n_to_select`.
    score_threshold : float, default=None
        Threshold for the score. If `None`, selection continues until
        ``n_to_select`` selections are made. Otherwise selection stops when the
        score falls below the threshold. Stored in
        :py:attr:`self.score_threshold`.
    score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. When "absolute", the score
        used by the selector is compared to the threshold directly. When
        "relative", at each iteration, the score used by the selector is
        compared proportionally to the score of the first selection, i.e. the
        selector quits when ``current_score / first_score < threshold``. Stored
        in :py:attr:`self.score_threshold_type`.
    progress_bar : bool, default=False
        Option to use a `tqdm <https://tqdm.github.io/>`_ progress bar to
        monitor selections. Stored in :py:attr:`self.report_progress`.
    full : bool, default=False
        In the case that all non-redundant selections are exhausted, choose
        randomly from the remaining features. Stored in :py:attr:`self.full`.
    random_state : int or :class:`numpy.random.RandomState` instance, default=0

    Attributes
    ----------
    X_current_ : numpy.ndarray (n_samples, n_features)
        The original matrix orthogonalized by previous selections
    y_current_ : numpy.ndarray (n_samples, n_properties)
        The targets orthogonalized by a regression on the previous selections.
    n_selected_ : int
        Counter tracking the number of selections that have been made
    X_selected_ : numpy.ndarray
        Matrix containing the selected features, for use in fitting
    pi_ : numpy.ndarray (n_features)
        The importance score, see :func:`_compute_pi`
    selected_idx_ : numpy.ndarray
        Indices of the selected features

    Examples
    --------
    >>> from skmatter.feature_selection import PCovCUR
    >>> import numpy as np
    >>> selector = PCovCUR(n_to_select=2, mixing=0.5, random_state=0)
    >>> X = np.array(
    ...     [
    ...         [0.12, 0.21, 0.02],  # 3 samples, 3 features
    ...         [-0.09, 0.32, -0.10],
    ...         [-0.03, -0.53, 0.08],
    ...     ]
    ... )
    >>> y = np.array([0.0, 0.0, 1.0])  # classes of each sample
    >>> selector.fit(X, y)
    PCovCUR(n_to_select=2)
    >>> Xr = selector.transform(X)
    >>> print(Xr.shape)
    (3, 2)
    >>> np.round(selector.pi_)  # importance score
    array([0., 0., 0.])
    >>> selector.selected_idx_
    array([1, 0])
    """

    def __init__(
        self,
        mixing=0.5,
        recompute_every=1,
        k=1,
        tolerance=1e-12,
        n_to_select=None,
        score_threshold=None,
        score_threshold_type="absolute",
        progress_bar=False,
        full=False,
        random_state=0,
    ):
        # Arguments that configure how the importance score is computed.
        scoring_kwargs = dict(
            mixing=mixing,
            recompute_every=recompute_every,
            k=k,
            tolerance=tolerance,
        )
        # Arguments that configure how many selections are made and how the
        # run is reported.
        run_kwargs = dict(
            n_to_select=n_to_select,
            score_threshold=score_threshold,
            score_threshold_type=score_threshold_type,
            progress_bar=progress_bar,
            full=full,
            random_state=random_state,
        )
        # This class always selects features, so the axis is hard-wired.
        super().__init__(
            selection_type="feature", **scoring_kwargs, **run_kwargs
        )