Source code for skmatter.preprocessing._data

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing._data import KernelCenterer
from sklearn.utils.validation import (
    FLOAT_DTYPES,
    _check_sample_weight,
    check_is_fitted,
    validate_data,
)


class StandardFlexibleScaler(TransformerMixin, BaseEstimator):
    """Standardize features by removing the mean and scaling to unit variance.

    The mean of each column is reduced to zero. With `column_wise=True`, the
    variance of each column is scaled to one; with `column_wise=False`, the
    total variance of the matrix is scaled to one, so each column carries, on
    average, a variance of one divided by the number of columns.

    The standard score of a sample `x` is calculated as:

        z = (x - u) / s

    where `u` is the mean of the samples if `with_mean`, otherwise zero, and
    `s` is the standard deviation of the samples if `with_std`, otherwise one.

    Centering and scaling can occur independently for each feature by
    calculating the appropriate statistics for the input or for the whole
    matrix (`column_wise=False`). The mean and standard deviation are then
    stored for use on later data using :py:meth:`transform`.

    Standardization of a dataset is a common requirement for many machine
    learning estimators: an improperly scaled / centered dataset may result
    in anomalous behavior. At the same time, depending on the conditions of
    the task, it may be necessary to preserve the ratio in the scale between
    the features (for example, in the case where the feature matrix is
    something like a covariance matrix), so the standardization should be
    carried out for the whole matrix, as opposed to the individual columns,
    as is done in `sklearn.preprocessing.StandardScaler`.

    Parameters
    ----------
    with_mean : bool, default=True
        If True, center the data before scaling. If False, keep the mean intact.
    with_std : bool, default=True
        If True, scale the data to unit variance. If False, keep the variance intact.
    column_wise : bool, default=False
        If True, normalize each column separately. If False, normalize the
        whole matrix with respect to its total variance.
    rtol : float, default=0
        The relative tolerance for the optimization: variance is considered
        zero when it is less than abs(mean) * rtol + atol.
    atol : float, default=1.0E-12
        The absolute tolerance for the optimization: variance is considered
        zero when it is less than abs(mean) * rtol + atol.
    copy : bool, default=False
        Copy the input X or not.

    Attributes
    ----------
    n_samples_in_ : int
        Number of samples in the reference ndarray.
    n_features_in_ : int
        Number of features in the reference ndarray.
    mean_ : numpy.ndarray of shape (n_features,)
        The mean value for each feature in the training set. Equal to a
        :class:`numpy.ndarray` of zeros of shape (n_features,) when
        ``with_mean=False``.
    scale_ : numpy.ndarray of shape (n_features,) or float
        The scaling factor, a :class:`numpy.ndarray` of shape (n_features,)
        when `column_wise=True` or a float when `column_wise=False`.

    Examples
    --------
    >>> import numpy as np
    >>> from skmatter.preprocessing import StandardFlexibleScaler
    >>> X = np.array([[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]])
    >>> transformer = StandardFlexibleScaler().fit(X)
    >>> transformer
    StandardFlexibleScaler()
    >>> transformer.transform(X)
    array([[ 0.        , -0.56195149,  0.28097574],
           [-0.84292723,  0.28097574,  0.56195149],
           [ 0.84292723,  0.28097574, -0.84292723]])
    >>> transformer.scale_ * transformer.transform(X)
    array([[ 0., -2.,  1.],
           [-3.,  1.,  2.],
           [ 3.,  1., -3.]])
    >>> transformer.scale_ * transformer.transform(X) + transformer.mean_
    array([[ 1., -2.,  2.],
           [-2.,  1.,  3.],
           [ 4.,  1., -2.]])
    """

    def __init__(
        self,
        with_mean=True,
        with_std=True,
        column_wise=False,
        rtol=0,
        atol=1e-12,
        copy=False,
    ):
        """Initialize StandardFlexibleScaler."""
        self.with_mean = with_mean
        self.with_std = with_std
        self.column_wise = column_wise
        self.rtol = rtol
        self.atol = atol
        self.copy = copy
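    # Editor's sketch (not part of the original source): a doctest-style
    # illustration of the two scaling modes described in the docstring;
    # `X` is an arbitrary example matrix.
    #
    #   >>> import numpy as np
    #   >>> from skmatter.preprocessing import StandardFlexibleScaler
    #   >>> X = np.array([[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]])
    #   >>> X_global = StandardFlexibleScaler(column_wise=False).fit_transform(X)
    #   >>> np.allclose(X_global.var(axis=0).sum(), 1.0)  # total variance is one
    #   True
    #   >>> X_column = StandardFlexibleScaler(column_wise=True).fit_transform(X)
    #   >>> np.allclose(X_column.var(axis=0), 1.0)  # each column has variance one
    #   True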
    def fit(self, X, y=None, sample_weight=None):
        """Compute mean and scaling to be applied for subsequent normalization.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            The data used to compute the mean and standard deviation used for
            later scaling along the features axis.
        y : None
            Ignored.
        sample_weight : numpy.ndarray of shape (n_samples,)
            Weights for each sample. Sample weighting can be used to center
            (and scale) data using a weighted mean. Weights are internally
            normalized before preprocessing.

        Returns
        -------
        self : object
            Fitted scaler.
        """
        X = validate_data(
            self,
            X,
            copy=self.copy,
            estimator=self,
            dtype=FLOAT_DTYPES,
            ensure_min_samples=2,
        )
        self.n_samples_in_, self.n_features_in_ = X.shape

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
            sample_weight = sample_weight / np.sum(sample_weight)

        if self.with_mean:
            self.mean_ = np.average(X, weights=sample_weight, axis=0)
        else:
            self.mean_ = np.zeros(self.n_features_in_)

        self.scale_ = 1.0
        if self.with_std:
            X_mean = np.average(X, weights=sample_weight, axis=0)
            var = np.average((X - X_mean) ** 2, weights=sample_weight, axis=0)

            if self.column_wise:
                if np.any(var < self.atol + abs(X_mean) * self.rtol):
                    raise ValueError("Cannot normalize a feature with zero variance")
                self.scale_ = np.sqrt(var)
            else:
                var_sum = var.sum()
                if var_sum < abs(np.average(X_mean)) * self.rtol + self.atol:
                    raise ValueError("Cannot normalize a matrix with zero variance")
                self.scale_ = np.sqrt(var_sum)

        return self
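    # Editor's sketch (not part of the original source): sample weights act as
    # a weighted mean, so fitting with an integer weight on a sample should
    # match fitting with that sample repeated; the arrays are illustrative.
    #
    #   >>> import numpy as np
    #   >>> from skmatter.preprocessing import StandardFlexibleScaler
    #   >>> X = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])
    #   >>> weighted = StandardFlexibleScaler().fit(X, sample_weight=[2.0, 1.0, 1.0])
    #   >>> repeated = StandardFlexibleScaler().fit(np.vstack([X[:1], X]))
    #   >>> np.allclose(weighted.mean_, repeated.mean_)
    #   True
    #   >>> np.allclose(weighted.scale_, repeated.scale_)
    #   True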
    def transform(self, X, y=None, copy=None):
        """Normalize a vector based on previously computed mean and scaling.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to scale along the features axis.
        y : None
            Ignored.
        copy : bool, default=None
            Copy the input X or not.

        Returns
        -------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        copy = copy if copy is not None else self.copy
        X = validate_data(
            self,
            X,
            reset=False,
            copy=copy,
            estimator=self,
            dtype=FLOAT_DTYPES,
        )
        check_is_fitted(
            self, attributes=["n_samples_in_", "n_features_in_", "scale_", "mean_"]
        )

        if self.n_features_in_ != X.shape[1]:
            raise ValueError("X shape does not match training shape")
        return (X - self.mean_) / self.scale_
    def inverse_transform(self, X_tr):
        """Scale back the data to the original representation.

        Parameters
        ----------
        X_tr : numpy.ndarray of shape (n_samples, n_features)
            Transformed matrix.

        Returns
        -------
        X : numpy.ndarray of shape (n_samples, n_features)
            The original matrix.
        """
        check_is_fitted(
            self, attributes=["n_samples_in_", "n_features_in_", "scale_", "mean_"]
        )

        if self.n_features_in_ != X_tr.shape[1]:
            raise ValueError("X shape does not match training shape")
        return X_tr * self.scale_ + self.mean_
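# Editor's sketch (not part of the original source): transform followed by
# inverse_transform recovers the original matrix.
#
#   >>> import numpy as np
#   >>> from skmatter.preprocessing import StandardFlexibleScaler
#   >>> X = np.array([[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]])
#   >>> scaler = StandardFlexibleScaler().fit(X)
#   >>> np.allclose(scaler.inverse_transform(scaler.transform(X)), X)
#   True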
class KernelNormalizer(KernelCenterer):
    r"""Kernel centering method, similar to
    :class:`sklearn.preprocessing.KernelCenterer`, but with additional scaling
    and the ability to pass a set of sample weights.

    Let :math:`K(x, z)` be a kernel defined by :math:`\phi(x)^T \phi(z)`,
    where :math:`\phi` is a function mapping x to a Hilbert space.
    KernelNormalizer centers (i.e., normalizes to have zero mean) the data
    without explicitly computing :math:`\phi(x)`. It is equivalent to
    centering and scaling :math:`\phi(x)` with
    sklearn.preprocessing.StandardScaler(with_std=False).

    Parameters
    ----------
    with_center : bool, default=True
        If True, center the kernel matrix before scaling. If False, do not
        center the kernel.
    with_trace : bool, default=True
        If True, scale the kernel so that the trace is equal to the number of
        samples. If False, do not scale the kernel.

    Attributes
    ----------
    K_fit_rows_ : numpy.ndarray of shape (n_samples,)
        Average of each column of the kernel matrix.
    K_fit_all_ : float
        Average of the kernel matrix.
    sample_weight_ : numpy.ndarray of shape (n_samples,) or None
        Normalized sample weights (if provided during the fit).
    scale_ : float
        Scaling parameter used when ``with_trace=True``.
        Calculated as np.trace(K) / K.shape[0].

    Examples
    --------
    >>> from skmatter.preprocessing import KernelNormalizer
    >>> from sklearn.metrics.pairwise import pairwise_kernels
    >>> X = [[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]]
    >>> K = pairwise_kernels(X, metric="linear")
    >>> K
    array([[  9.,   2.,  -2.],
           [  2.,  14., -13.],
           [ -2., -13.,  21.]])
    >>> transformer = KernelNormalizer().fit(K)
    >>> transformer
    KernelNormalizer()
    >>> transformer.transform(K)
    array([[ 0.39473684,  0.        , -0.39473684],
           [ 0.        ,  1.10526316, -1.10526316],
           [-0.39473684, -1.10526316,  1.5       ]])
    >>> transformer.scale_ * transformer.transform(K)
    array([[  5.,   0.,  -5.],
           [  0.,  14., -14.],
           [ -5., -14.,  19.]])
    """

    def __init__(self, with_center=True, with_trace=True):
        """Initialize KernelNormalizer."""
        self.with_center = with_center
        self.with_trace = with_trace
        super().__init__()
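    # Editor's sketch (not part of the original source): for a linear kernel,
    # phi(x) = x, so centering the kernel (trace scaling disabled) should
    # match the kernel built from mean-centered features; `X` is illustrative.
    #
    #   >>> import numpy as np
    #   >>> from sklearn.metrics.pairwise import pairwise_kernels
    #   >>> from skmatter.preprocessing import KernelNormalizer
    #   >>> X = np.array([[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]])
    #   >>> K = pairwise_kernels(X, metric="linear")
    #   >>> K_centered = KernelNormalizer(with_trace=False).fit_transform(K)
    #   >>> X_centered = X - X.mean(axis=0)
    #   >>> np.allclose(K_centered, X_centered @ X_centered.T)
    #   True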
    def fit(self, K, y=None, sample_weight=None):
        """Fit KernelNormalizer.

        Parameters
        ----------
        K : numpy.ndarray of shape (n_samples, n_samples)
            Kernel matrix.
        y : None
            Ignored.
        sample_weight : numpy.ndarray of shape (n_samples,), default=None
            Weights for each sample. Sample weighting can be used to center
            (and scale) data using a weighted mean. Weights are internally
            normalized before preprocessing.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        K = validate_data(self, K, copy=True, dtype=FLOAT_DTYPES, reset=False)

        if sample_weight is not None:
            self.sample_weight_ = _check_sample_weight(sample_weight, K, dtype=K.dtype)
            self.sample_weight_ = self.sample_weight_ / np.sum(self.sample_weight_)
        else:
            self.sample_weight_ = sample_weight

        if self.with_center:
            if self.sample_weight_ is not None:
                self.K_fit_rows_ = np.average(K, weights=self.sample_weight_, axis=0)
                self.K_fit_all_ = np.average(
                    self.K_fit_rows_, weights=self.sample_weight_
                )
            else:
                super().fit(K, y)

            K_pred_cols = np.average(K, weights=self.sample_weight_, axis=1)[
                :, np.newaxis
            ]
        else:
            self.K_fit_rows_ = np.zeros(K.shape[1])
            self.K_fit_all_ = 0.0
            K_pred_cols = np.zeros((K.shape[0], 1))

        if self.with_trace:
            K -= self.K_fit_rows_
            K -= K_pred_cols
            K += self.K_fit_all_
            self.scale_ = np.trace(K) / K.shape[0]
        else:
            self.scale_ = 1.0

        return self
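    # Editor's sketch (not part of the original source): with the default
    # with_trace=True, the transformed training kernel has trace equal to the
    # number of samples; the data are illustrative.
    #
    #   >>> import numpy as np
    #   >>> from sklearn.metrics.pairwise import pairwise_kernels
    #   >>> from skmatter.preprocessing import KernelNormalizer
    #   >>> X = np.array([[1.0, -2.0], [-2.0, 1.0], [4.0, 1.0]])
    #   >>> K = pairwise_kernels(X, metric="linear")
    #   >>> Kt = KernelNormalizer().fit_transform(K)
    #   >>> np.allclose(np.trace(Kt), K.shape[0])
    #   True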
    def transform(self, K, copy=True):
        """Center kernel matrix.

        Parameters
        ----------
        K : numpy.ndarray of shape (n_samples1, n_samples2)
            Kernel matrix.
        copy : bool, default=True
            Set to False to perform inplace computation.

        Returns
        -------
        K_new : numpy.ndarray of shape (n_samples1, n_samples2)
            Transformed array.
        """
        check_is_fitted(self)
        K = validate_data(self, K, copy=copy, dtype=FLOAT_DTYPES, reset=False)

        if self.with_center:
            K_pred_cols = np.average(K, weights=self.sample_weight_, axis=1)[
                :, np.newaxis
            ]
        else:
            K_pred_cols = np.zeros((K.shape[0], 1))

        K -= self.K_fit_rows_
        K -= K_pred_cols
        K += self.K_fit_all_

        return K / self.scale_
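    # Editor's sketch (not part of the original source): transform also
    # accepts a rectangular kernel between new samples and the training
    # samples; the data are illustrative.
    #
    #   >>> import numpy as np
    #   >>> from sklearn.metrics.pairwise import pairwise_kernels
    #   >>> from skmatter.preprocessing import KernelNormalizer
    #   >>> X_train = np.array([[1.0, -2.0], [-2.0, 1.0], [4.0, 1.0]])
    #   >>> X_new = np.array([[0.0, 0.5]])
    #   >>> normalizer = KernelNormalizer().fit(pairwise_kernels(X_train))
    #   >>> normalizer.transform(pairwise_kernels(X_new, X_train)).shape
    #   (1, 3)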
    def fit_transform(self, K, y=None, sample_weight=None, copy=True, **fit_params):
        r"""Fit to data, then transform it.

        Parameters
        ----------
        K : numpy.ndarray of shape (n_samples, n_samples)
            Kernel matrix.
        y : None
            Ignored.
        sample_weight : numpy.ndarray of shape (n_samples,), default=None
            Weights for each sample. Sample weighting can be used to center
            (and scale) data using a weighted mean. Weights are internally
            normalized before preprocessing.
        \**fit_params :
            Necessary for compatibility with the functions of the
            TransformerMixin class.

        Returns
        -------
        K_new : numpy.ndarray of shape (n_samples, n_samples)
            Transformed array.
        """
        self.fit(K, y, sample_weight=sample_weight)
        return self.transform(K, copy)
class SparseKernelCenterer(TransformerMixin, BaseEstimator):
    r"""Kernel centering method for sparse kernels, similar to
    :class:`KernelNormalizer`.

    The main disadvantage of kernel methods, which are widely used in machine
    learning, is that their time and space complexity grow quickly with the
    number of samples: with a large dataset, a huge amount of information must
    not only be stored but also used constantly in calculations. To avoid
    this, so-called sparse kernel methods are used, formulated from the
    low-rank (Nystrom) approximation:

    .. math::
        \mathbf{K} \approx \hat{\mathbf{K}}_{NN} =
        \mathbf{K}_{NM} \mathbf{K}_{MM}^{-1} \mathbf{K}_{NM}^T

    where the subscripts for :math:`\mathbf{K}` denote the size of the sets
    of samples compared in each kernel, with :math:`N` being the size of the
    full data set and :math:`M` referring to a small active set containing
    :math:`M` samples. With this method only the matrix
    :math:`\mathbf{K}_{NM}` needs to be stored and used, i.e. it is possible
    to obtain an :math:`N/M`-fold improvement in the asymptotic memory cost.

    Parameters
    ----------
    with_center : bool, default=True
        If True, center the kernel matrix before scaling. If False, do not
        center the kernel.
    with_trace : bool, default=True
        If True, scale the kernel so that the trace is equal to the number of
        samples. If False, do not scale the kernel.
    rcond : float, default=1E-12
        Conditioning parameter to use when computing the Nystrom-approximated
        kernel for scaling.

    Attributes
    ----------
    K_fit_rows_ : numpy.ndarray of shape (n_active,)
        Average of each column of the kernel matrix
        :math:`\mathbf{K}_{NM}`.
    scale_ : float
        Scaling parameter used when ``with_trace=True``. Calculated as
        np.sqrt(np.trace(Khat) / Knm.shape[0]), where Khat is the
        Nystrom-approximated kernel built from the centered Knm.
    n_active_ : int
        Size of the active set.
    """

    def __init__(self, with_center=True, with_trace=True, rcond=1e-12):
        self.with_center = with_center
        self.with_trace = with_trace
        self.rcond = rcond
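    # Editor's sketch (not part of the original source): the Nystrom
    # approximation from the docstring written out with numpy; the data and
    # the choice of active set are illustrative.
    #
    #   >>> import numpy as np
    #   >>> rng = np.random.default_rng(0)
    #   >>> X = rng.normal(size=(10, 3))
    #   >>> X_active = X[:4]  # active set of M = 4 samples
    #   >>> Knm = X @ X_active.T  # linear kernel, shape (N, M)
    #   >>> Kmm = X_active @ X_active.T  # shape (M, M)
    #   >>> Khat = Knm @ np.linalg.pinv(Kmm) @ Knm.T  # approximates the full kernel
    #   >>> Khat.shape
    #   (10, 10)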
    def fit(self, Knm, Kmm, y=None, sample_weight=None):
        """Fit ``SparseKernelCenterer``.

        Parameters
        ----------
        Knm : numpy.ndarray of shape (n_samples, n_active)
            Kernel matrix between the reference data set and the active set.
        Kmm : numpy.ndarray of shape (n_active, n_active)
            Kernel matrix between the active set and itself.
        y : None
            Ignored.
        sample_weight : numpy.ndarray of shape (n_samples,), default=None
            Weights for each sample. Sample weighting can be used to center
            (and scale) data using a weighted mean. Weights are internally
            normalized before preprocessing.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        if Knm.shape[1] != Kmm.shape[0]:
            raise ValueError(
                "The reference kernel is not commensurate shape with the active kernel."
            )

        if Kmm.shape[0] != Kmm.shape[1]:
            raise ValueError("The active kernel is not square.")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, Knm, dtype=Knm.dtype)
            sample_weight = sample_weight / np.sum(sample_weight)

        self.n_active_ = Kmm.shape[0]

        if self.with_center:
            self.K_fit_rows_ = np.average(Knm, weights=sample_weight, axis=0)
        else:
            self.K_fit_rows_ = np.zeros(Knm.shape[1])

        if self.with_trace:
            Knm_centered = Knm - self.K_fit_rows_
            Khat = Knm_centered @ np.linalg.pinv(Kmm, self.rcond) @ Knm_centered.T
            self.scale_ = np.sqrt(np.trace(Khat) / Knm.shape[0])
        else:
            self.scale_ = 1.0

        return self
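    # Editor's sketch (not part of the original source): after fitting with
    # the defaults, the Nystrom-approximated kernel rebuilt from the
    # transformed Knm has trace equal to n_samples; the data are illustrative.
    #
    #   >>> import numpy as np
    #   >>> from skmatter.preprocessing import SparseKernelCenterer
    #   >>> rng = np.random.default_rng(0)
    #   >>> X = rng.normal(size=(10, 3))
    #   >>> Knm = X @ X[:4].T
    #   >>> Kmm = X[:4] @ X[:4].T
    #   >>> centerer = SparseKernelCenterer().fit(Knm, Kmm)
    #   >>> Kc = centerer.transform(Knm)
    #   >>> np.allclose(np.trace(Kc @ np.linalg.pinv(Kmm) @ Kc.T), 10.0)
    #   True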
    def transform(self, Knm, y=None):
        """Center a kernel matrix using the statistics computed during fit.

        Parameters
        ----------
        Knm : numpy.ndarray of shape (n_samples, n_active)
            Kernel matrix between the reference data set and the active set.
        y : None
            Ignored.

        Returns
        -------
        K_new : numpy.ndarray of shape (n_samples, n_active)
            Transformed array.
        """
        check_is_fitted(self, attributes=["scale_", "K_fit_rows_", "n_active_"])

        if Knm.shape[1] != self.n_active_:
            raise ValueError(
                "The reference kernel and received kernel have different shape"
            )

        Kc = (Knm - self.K_fit_rows_) / self.scale_

        return Kc
    def fit_transform(self, Knm, Kmm, y=None, sample_weight=None, **fit_params):
        r"""Fit to data, then transform it.

        Parameters
        ----------
        Knm : numpy.ndarray of shape (n_samples, n_active)
            Kernel matrix between the reference data set and the active set.
        Kmm : numpy.ndarray of shape (n_active, n_active)
            Kernel matrix between the active set and itself.
        y : None
            Ignored.
        sample_weight : numpy.ndarray of shape (n_samples,), default=None
            Weights for each sample. Sample weighting can be used to center
            (and scale) data using a weighted mean. Weights are internally
            normalized before preprocessing.
        \**fit_params :
            Necessary for compatibility with the functions of the
            TransformerMixin class.

        Returns
        -------
        K_new : numpy.ndarray of shape (n_samples, n_active)
            Transformed array.
        """
        self.fit(Knm=Knm, Kmm=Kmm, sample_weight=sample_weight)
        return self.transform(Knm)
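# Editor's sketch (not part of the original source): end-to-end centering of a
# sparse kernel in one call; the data and active-set choice are illustrative.
#
#   >>> import numpy as np
#   >>> from sklearn.metrics.pairwise import pairwise_kernels
#   >>> from skmatter.preprocessing import SparseKernelCenterer
#   >>> rng = np.random.default_rng(0)
#   >>> X = rng.normal(size=(10, 3))
#   >>> X_active = X[:4]
#   >>> Knm = pairwise_kernels(X, X_active, metric="rbf")
#   >>> Kmm = pairwise_kernels(X_active, metric="rbf")
#   >>> Kc = SparseKernelCenterer().fit_transform(Knm, Kmm)
#   >>> np.allclose(Kc.mean(axis=0), 0.0)  # each column is mean-centered
#   True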