import numpy as np
from joblib import Parallel, delayed
from ..linear_model import OrthogonalRegression, Ridge2FoldCV
from ..model_selection import train_test_split
from ..preprocessing import StandardFlexibleScaler
[docs]
def pointwise_global_reconstruction_error(
X,
Y,
train_idx=None,
test_idx=None,
scaler=None,
estimator=None,
):
r"""Computes the pointwise global reconstruction error using the source X
to reconstruct the features or samples of target Y based on a minimization
by linear regression:
.. math::
GRE^{(i)}(X,Y) = \min_W ||y_i - x_iW||
If used with X and Y of shape (n_samples, n_features) it computes the pointwise
global reconstruction error of the features as defined in Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The error is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction error of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The error is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y. For feature reconstruction of Y using X
use input shape (samples, features). For sample reconstruction of Y using X use
input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X. For feature reconstruction of Y using
X use input shape (samples, features). For sample reconstruction of Y using X
use input shape (features, samples).
train_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
also None, 2-fold split is taken.
test_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
also None, 2-fold split is taken.
scaler : object implementing fit/transfom
Scales the X and Y before computing the reconstruction measure.
The default value scales the features such that the reconstruction
measure on the training set is upper bounded to 1.
estimator : object implementing fit/predict, default=None
Sklearn estimator used to reconstruct features/samples.
Returns
-------
pointwise_global_reconstruction_error : numpy.ndarray
The global reconstruction error for each sample/point
"""
(
train_idx,
test_idx,
scaler,
estimator,
) = check_global_reconstruction_measures_input(
X, Y, train_idx, test_idx, scaler, estimator
)
X_train, X_test, Y_train, Y_test = (
X[train_idx],
X[test_idx],
Y[train_idx],
Y[test_idx],
)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
scaler.fit(Y_train)
Y_train = scaler.transform(Y_train)
Y_test = scaler.transform(Y_test)
estimator.fit(X_train, Y_train)
return np.linalg.norm(Y_test - estimator.predict(X_test), axis=1)
[docs]
def global_reconstruction_error(
X,
Y,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
):
r"""Computes the global reconstruction error using the source X
to reconstruct the features or samples of target Y based on a minimization
by linear regression:
.. math::
GRE(X,Y) = \min_W ||Y - XW||
If used with X and Y of shape (n_samples, n_features) it computes the
global reconstruction error of the features as defined in Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The error is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction error of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The error is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
train_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
also None, 2-fold split is taken.
test_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
also None, 2-fold split is taken.
scaler : object implementing fit/transfom
Scales the X and Y before computing the reconstruction measure.
The default value scales the features such that the reconstruction
measure on the training set is upper bounded to 1.
estimator : object implementing fit/predict, default=None
Sklearn estimator used to reconstruct features/samples.
Returns
-------
global_reconstruction_error : ndarray
The global reconstruction error
"""
pointwise_global_reconstruction_error_values = (
pointwise_global_reconstruction_error(
X,
Y,
train_idx=train_idx,
test_idx=test_idx,
scaler=scaler,
estimator=estimator,
)
)
return np.linalg.norm(pointwise_global_reconstruction_error_values) / np.sqrt(
len(pointwise_global_reconstruction_error_values)
)
[docs]
def pointwise_global_reconstruction_distortion(
X,
Y,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
):
r"""Computes the pointwise global reconstruction distortion using the source X
to reconstruct the features or samples of target Y based on a minimization
by orthogonal regression:
.. math::
GRD^{(i)}(X,Y) = \min_Q ||y_i - x_iQ\|| \quad\mathrm{subject\ to}\quad Q^TQ=I
If used with X and Y of shape (n_samples, n_features) it computes the pointwise
global reconstruction distortion of the features as defined in
Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The distortion is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction distortion of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The distortion is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
train_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
also None, 2-fold split is taken.
test_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
also None, 2-fold split is taken.
scaler : object implementing fit/transfom
Scales the X and Y before computing the reconstruction measure.
The default value scales the features such that the reconstruction
measure on the training set is upper bounded to 1.
estimator : object implementing fit/predict, default=None
Sklearn estimator used to reconstruct features/samples.
Returns
-------
pointwise_global_reconstruction_distortion : ndarray
The global reconstruction distortion for each sample/point
"""
(
train_idx,
test_idx,
scaler,
estimator,
) = check_global_reconstruction_measures_input(
X, Y, train_idx, test_idx, scaler, estimator
)
X_train, X_test, Y_train, Y_test = (
X[train_idx],
X[test_idx],
Y[train_idx],
Y[test_idx],
)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
scaler.fit(Y_train)
Y_train = scaler.transform(Y_train)
Y_test = scaler.transform(Y_test)
predictions_Y_test = estimator.fit(X_train, Y_train).predict(X_test)
orthogonal_predictions_Y_test = (
OrthogonalRegression(use_orthogonal_projector=False)
.fit(X_train, estimator.predict(X_train))
.predict(X_test)
)
return np.linalg.norm(predictions_Y_test - orthogonal_predictions_Y_test, axis=1)
[docs]
def global_reconstruction_distortion(
X,
Y,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
):
r"""Computes the global reconstruction distortion using the source X
to reconstruct the features or samples of target Y based on a minimization
by orthogonal regression:
.. math::
GRD(X,Y) = \min_Q ||y - XQ\|| \quad\mathrm{subject\ to}\quad Q^TQ=I
If used with X and Y of shape (n_samples, n_features) it computes the
global reconstruction distortion of the features as defined in
Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The distortion is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction distortion of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The distortion is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
train_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
also None, 2-fold split is taken.
test_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
also None, 2-fold split is taken.
scaler : object implementing fit/transfom
Scales the X and Y before computing the reconstruction measure.
The default value scales the features such that the reconstruction
measure on the training set is upper bounded to 1.
estimator : object implementing fit/predict, default=None
Sklearn estimator used to reconstruct features/samples.
Returns
-------
global_reconstruction_distortion : numpy.ndarray
The global reconstruction distortion
"""
pointwise_global_reconstruction_distortion_values = (
pointwise_global_reconstruction_distortion(
X,
Y,
train_idx=train_idx,
test_idx=test_idx,
scaler=scaler,
estimator=estimator,
)
)
return np.linalg.norm(pointwise_global_reconstruction_distortion_values) / np.sqrt(
len(pointwise_global_reconstruction_distortion_values)
)
[docs]
def pointwise_local_reconstruction_error(
X,
Y,
n_local_points,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
n_jobs=None,
):
r"""Computes the pointwise local reconstruction error using the source X to
reconstruct the features or samples of target Y based on a minimization by linear
regression:
.. math::
\tilde{\mathbf{x}}'_i = \bar{\mathbf{x}} + (\mathbf{x}_i
- \bar{\mathbf{x}})\mathbf{P}^{(i)}
.. math::
LRE^{(i)}(X,Y) = \|\mathbf{x}'_i - \tilde{\mathbf{x}}'_i\|^2
If used with X and Y of shape (n_samples, n_features) it computes the pointwise
local reconstruction error of the features as defined in Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The error is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction error of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The error is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
n_local_points : int,
Number of neighbour points used to compute the local reconstruction weight for
each sample/point.
train_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
also None, 2-fold split is taken.
test_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
also None, 2-fold split is taken.
scaler : object implementing fit/transfom
Scales the X and Y before computing the reconstruction measure.
The default value scales the features such that the reconstruction
measure on the training set is upper bounded to 1.
estimator : object implementing fit/predict, default=None
Sklearn estimator used to reconstruct features/samples.
Returns
-------
pointwise_local_reconstruction_error : numpy.ndarray
The local reconstruction error for each sample/point
"""
(
train_idx,
test_idx,
scaler,
estimator,
) = check_local_reconstruction_measures_input(
X, Y, n_local_points, train_idx, test_idx, scaler, estimator
)
X_train, X_test, Y_train, Y_test = (
X[train_idx],
X[test_idx],
Y[train_idx],
Y[test_idx],
)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test).astype(X_train.dtype)
scaler.fit(Y_train)
Y_train = scaler.transform(Y_train)
Y_test = scaler.transform(Y_test)
squared_dist = (
np.sum(X_train**2, axis=1)
+ np.sum(X_test**2, axis=1)[:, np.newaxis]
- 2 * X_test @ X_train.T
)
n_test = X_test.shape[0]
def local_reconstruction_error_i(i):
# comments correspond notation in [gfrm]_
local_env_idx = np.argsort(squared_dist[i])[:n_local_points]
# D_{k-neigh}^{(i)}
local_X_train = X_train[local_env_idx]
# \bar{x}_F
local_X_train_mean = np.mean(X_train[local_env_idx], axis=0)
# D_{k-neigh}^{(i)}
local_Y_train = Y_train[local_env_idx]
# \bar{x}_F'
local_Y_train_mean = np.mean(Y_train[local_env_idx], axis=0)
# P_{FF'}
estimator.fit(
local_X_train - local_X_train_mean,
local_Y_train - local_Y_train_mean,
)
# \tilde{x}_i' = \bar{x}_{F'} + (x_i - \bar{x}_F)P_{FF'}
tilde_x_i_dash_test = local_Y_train_mean + estimator.predict(
X_test[i, :][np.newaxis, :] - local_X_train_mean
)
# \|x_i' - \tilde{x}_i'\|
return np.linalg.norm(Y_test[i, :][np.newaxis, :] - tilde_x_i_dash_test)
pointwise_local_reconstruction_error_values = np.array(
Parallel(n_jobs=n_jobs)(
delayed(local_reconstruction_error_i)(i) for i in range(n_test)
)
)
return pointwise_local_reconstruction_error_values
[docs]
def local_reconstruction_error(
X,
Y,
n_local_points,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
n_jobs=None,
):
r"""Computes the local reconstruction error using the source X to reconstruct the
features or samples of target Y based on a minimization by linear regression:
.. math::
LRE(X,Y) = \sqrt{\sum_i LRE^{(i)}(X,Y)}/\sqrt{n_\text{test}}
If used with X and Y of shape (n_samples, n_features) it computes the
local reconstruction error of the features as defined in Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The error is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction error of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The error is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
n_local_points : int,
Number of neighbour points used to compute the local reconstruction weight for
each sample/point.
train_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
also None, 2-fold split is taken.
test_idx : numpy.ndarray, dtype=int, default=None
array of indices used for training, if None,
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
also None, 2-fold split is taken.
scaler : object implementing fit/transfom
Scales the X and Y before computing the reconstruction measure.
The default value scales the features such that the reconstruction
measure on the training set is upper bounded to 1.
estimator : object implementing fit/predict, default=None
Sklearn estimator used to reconstruct features/samples.
Returns
-------
local_reconstruction_error : numpy.ndarray
The local reconstruction error
"""
pointwise_local_reconstruction_error_values = pointwise_local_reconstruction_error(
X,
Y,
n_local_points,
train_idx=train_idx,
test_idx=test_idx,
scaler=scaler,
estimator=estimator,
n_jobs=n_jobs,
)
return np.linalg.norm(pointwise_local_reconstruction_error_values) / np.sqrt(
len(pointwise_local_reconstruction_error_values)
)
def check_global_reconstruction_measures_input(
X, Y, train_idx, test_idx, scaler, estimator
):
"""Returns default reconstruction measure inputs for all None parameters"""
assert len(X) == len(Y)
if (train_idx is None) and (test_idx is None):
train_idx, test_idx = train_test_split(
np.arange(len(X)),
test_size=0.5,
train_size=0.5,
random_state=0x5F3759DF,
shuffle=True,
train_test_overlap=False,
)
elif train_idx is None:
train_idx = np.setdiff1d(np.arange(len(X)), test_idx)
elif test_idx is None:
test_idx = np.setdiff1d(np.arange(len(X)), train_idx)
if scaler is None:
scaler = StandardFlexibleScaler()
if estimator is None:
estimator = Ridge2FoldCV(
alphas=np.geomspace(1e-9, 0.9, 20),
alpha_type="relative",
regularization_method="cutoff",
random_state=0x5F3759DF,
shuffle=True,
scoring="neg_root_mean_squared_error",
n_jobs=1,
)
return train_idx, test_idx, scaler, estimator
def check_local_reconstruction_measures_input(
X, Y, n_local_points, train_idx, test_idx, scaler, estimator
):
"""Returns default reconstruction measure inputs for all None parameters"""
# only needs to check one extra parameter
assert len(X) >= n_local_points
return check_global_reconstruction_measures_input(
X, Y, train_idx, test_idx, scaler, estimator
)