# Source code for skmatter.utils._orthogonalizers

# -*- coding: utf-8 -*-
"""Necessary orthogonalizers for the CUR decomposition subselection method.

Authors: Rose K. Cersonsky
         Michele Ceriotti
"""

import warnings

import numpy as np



[docs]
def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False):
    """Orthogonalizes a feature matrix by the given columns.

    Can be used to orthogonalize by samples by calling `X = X_orthogonalizer(X.T,
    row_index).T`. After orthogonalization, each column of X will contain only what is
    orthogonal to X[:, c] or x2.

    Parameters
    ----------
    x1: numpy.ndarray of shape (n x m)
        feature matrix to orthogonalize
    c: int, less than m, default=None
       index of the column to orthogonalize by; only used when ``x2`` is None
    x2: numpy.ndarray of shape (n x a), default=x1[:, c]
        a separate set of columns to orthogonalize with respect to
        Note: the orthogonalizer will work column-by-column in column-index order
    tol: float, default=1e-12
        columns whose norm is below this value are not normalized and trigger a
        warning
    copy: bool, default=False
        whether to orthogonalize a copy of x1 or edit x1 in-place

    Returns
    -------
    numpy.ndarray of shape (n x m)
        the orthogonalized matrix; this is ``x1`` itself when ``copy`` is False

    Raises
    ------
    ValueError
        if neither ``c`` nor ``x2`` is given, or if ``x2`` does not have the same
        number of rows as ``x1``
    """
    if x2 is None:
        if c is None:
            # Without this guard the code would fail on `None.shape` with an
            # unhelpful AttributeError.
            raise ValueError(
                "You must provide either a column index `c` or a matrix `x2` to "
                "orthogonalize by."
            )
        # Fancy indexing with a list keeps the column 2D and yields a copy, so
        # the in-place update of x1 below does not alter `cols`.
        cols = x1[:, [c]]
    elif x2.shape[0] == x1.shape[0]:
        # Accept a 1D vector as well as a 2D matrix of columns.
        cols = np.reshape(x2, (x1.shape[0], -1))
    else:
        raise ValueError(
            "You can only orthogonalize a matrix using a vector with the same number "
            f"of rows. Matrix X has {x1.shape[0]} rows, whereas the orthogonalizing "
            f"matrix has {x2.shape[0]} rows."
        )

    if copy:
        xnew = x1.copy()
    else:
        xnew = x1

    for i in range(cols.shape[-1]):
        col = cols[:, [i]]

        if np.linalg.norm(col) < tol:
            warnings.warn("Column vector contains only zeros.", stacklevel=1)
        else:
            col = np.divide(col, np.linalg.norm(col, axis=0))

        # Remove the projection of every column of xnew onto `col`; the cast
        # keeps the in-place subtraction compatible with xnew's dtype.
        xnew -= (col @ (col.T @ xnew)).astype(xnew.dtype)

    return xnew




[docs]
def Y_feature_orthogonalizer(y, X, tol=1e-12, copy=True):
    r"""Remove from a property matrix the part that lies in the span of the
    features :math:`\mathbf{X}`.

    .. math::
        \mathbf{Y} \leftarrow \mathbf{Y} -
        \mathbf{X} \left(\mathbf{X}^T\mathbf{X}\right)^{-1}\mathbf{X}^T \mathbf{Y}

    Parameters
    ----------
    y : numpy.ndarray of shape (n_samples x n_properties)
       property matrix
    X : numpy.ndarray of shape (n_samples x n_features)
       feature matrix
    tol: float
        cutoff passed as ``rcond`` to np.linalg.pinv when pseudo-inverting
        :math:`\mathbf{X}^T\mathbf{X}`
    copy: bool
        whether to return a new array (True) or update ``y`` in-place and return
        it (False), default=True

    Returns
    -------
    numpy.ndarray of shape (n_samples x n_properties)
        the orthogonalized property matrix
    """
    # Projector onto the column space of X: X (X^T X)^+ X^T.
    gram_pinv = np.linalg.pinv(X.T @ X, rcond=tol)
    projector = (X @ gram_pinv) @ X.T
    projected = projector @ y

    if not copy:
        y -= projected
        return y

    return y.copy() - projected




[docs]
def Y_sample_orthogonalizer(y, X, y_ref, X_ref, tol=1e-12, copy=True):
    r"""Subtract from the targets :math:`{\mathbf{Y}}` the prediction of a linear
    least-squares fit made on a reference feature matrix :math:`{\mathbf{X}_r}`
    and reference target matrix :math:`{\mathbf{Y}_r}`:

    .. math::
        \mathbf{Y} \leftarrow \mathbf{Y} -
        \mathbf{X} \left(\mathbf{X}_{\mathbf{r}}^T
        \mathbf{X}_{\mathbf{r}}\right)^{-1}\mathbf{X}_{\mathbf{r}}^T
        \mathbf{Y}_{\mathbf{r}}

    Parameters
    ----------
    y : numpy.ndarray of shape (n_samples x n_properties)
       property matrix
    X : numpy.ndarray of shape (n_samples x n_features)
       feature matrix
    y_ref : numpy.ndarray of shape (n_ref x n_properties)
        reference property matrix
    X_ref : numpy.ndarray of shape (n_ref x n_features)
        reference feature matrix
    tol: float
        cutoff passed as ``rcond`` to np.linalg.lstsq when fitting the reference
        data
    copy: bool
        whether to return a new array (True) or update ``y`` in-place and return
        it (False), default=True

    Returns
    -------
    numpy.ndarray of shape (n_samples x n_properties)
        the orthogonalized property matrix
    """
    # Least-squares weights mapping the reference features to the reference targets.
    weights = np.linalg.lstsq(X_ref, y_ref, rcond=tol)[0]

    # Prediction for the samples in X, shaped like y so the subtraction lines up.
    prediction = X @ weights
    prediction = prediction.reshape(y.shape)

    if not copy:
        y -= prediction
        return y

    return y.copy() - prediction