Source code for skmatter.metrics._pairwise

from typing import Union

import numpy as np
from sklearn.metrics.pairwise import _euclidean_distances, check_pairwise_arrays


def periodic_pairwise_euclidean_distances(
    X,
    Y=None,
    *,
    squared=False,
    cell_length=None,
):
    r"""
    Compute the pairwise distance matrix between each pair from a vector array X and Y.

    .. math::
        d_{i, j} = \sqrt{\sum_{k=1}^n (x_{i, k} - y_{j, k})^2}

    When `cell_length` is `None` the computation is delegated to scikit-learn's
    euclidean distance routine. For efficiency reasons, that routine computes the
    euclidean distance between a pair of row vector x and y as::

        dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))

    This formulation has two advantages over other ways of computing distances.
    First, it is computationally efficient when dealing with sparse data. Second,
    if one argument varies but the other remains unchanged, then `dot(x, x)`
    and/or `dot(y, y)` can be pre-computed.

    However, this is not the most precise way of doing this computation, because
    this equation potentially suffers from "catastrophic cancellation". Also, the
    distance matrix returned by this function may not be exactly symmetric as
    required by, e.g., ``scipy.spatial.distance`` functions.

    When `cell_length` is given, the periodic helper
    ``_periodic_euclidean_distances`` is used instead.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_components)
        An array where each row is a sample and each column is a component.
    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_components), default=None
        An array where each row is a sample and each column is a component.
        If `None`, method uses `Y=X`.
    squared : bool, default=False
        Whether to return squared distances instead of distances.
    cell_length : array-like of shape (n_components,), default=None
        The side length of rectangular cell used for periodic boundary conditions.
        `None` for non-periodic boundary conditions.

        .. note::
            Only side lengths of rectangular cells are supported.
            Cell format: `[side_length_1, ..., side_length_n]`

    Returns
    -------
    distances : ndarray of shape (n_samples_X, n_samples_Y)
        Returns the distances between the row vectors of `X`
        and the row vectors of `Y`.

    Raises
    ------
    ValueError
        If the number of entries in `cell_length` does not match the number of
        components of `X`.

    Examples
    --------
    >>> import numpy as np
    >>> from skmatter.metrics import periodic_pairwise_euclidean_distances
    >>> X = np.array([[0, 1], [1, 1]])
    >>> origin = np.array([[0, 0]])
    >>> # distance between rows of X
    >>> periodic_pairwise_euclidean_distances(X, X)
    array([[0., 1.],
           [1., 0.]])
    >>> # get distance to origin
    >>> periodic_pairwise_euclidean_distances(X, origin, cell_length=[0.5, 0.7])
    array([[0.3],
           [0.3]])
    """
    # Validate/convert first: the dimension check needs a real array, and the
    # docstring promises support for array-like inputs such as nested lists.
    X, Y = check_pairwise_arrays(X, Y)
    _check_dimension(X, cell_length)

    if cell_length is None:
        return _euclidean_distances(X, Y, squared=squared)
    return _periodic_euclidean_distances(X, Y, squared=squared, cell=cell_length)
def _periodic_euclidean_distances(X, Y=None, *, squared=False, cell=None):
    """Compute pairwise euclidean distances under rectangular periodic boundaries.

    Each component of every difference vector ``x - y`` is shifted by the
    nearest integer multiple of the matching cell side length before the norm
    is taken.

    Parameters
    ----------
    X : array-like of shape (n_samples_X, n_components)
        First set of row vectors.
    Y : array-like of shape (n_samples_Y, n_components), default=None
        Second set of row vectors. If `None`, `X` is used.
    squared : bool, default=False
        Whether to return squared distances.
    cell : array-like of shape (n_components,), default=None
        Side lengths of the rectangular cell. If `None`, no periodic wrapping
        is applied.

    Returns
    -------
    distance : ndarray of shape (n_samples_X, n_samples_Y)
        The (optionally squared) distances between rows of `X` and rows of `Y`.
    """
    X = np.asarray(X, dtype=float)
    Y = X if Y is None else np.asarray(Y, dtype=float)

    # Broadcasting (n_x, 1, d) - (1, n_y, d) yields every pairwise difference
    # at once and, unlike concatenating a list of rows, also works for empty X.
    delta = X[:, np.newaxis, :] - Y[np.newaxis, :, :]

    if cell is not None:
        cell = np.asarray(cell, dtype=float)
        # Shift each component by the nearest whole number of cell lengths.
        delta -= np.round(delta / cell) * cell

    # Sum the squares directly so the squared variant does not go through a
    # sqrt followed by a square.
    sq_distance = np.sum(delta * delta, axis=-1)
    return sq_distance if squared else np.sqrt(sq_distance)
def pairwise_mahalanobis_distances(
    X: np.ndarray,
    Y: np.ndarray,
    cov_inv: np.ndarray,
    cell_length: Union[np.ndarray, None] = None,
    squared: bool = False,
):
    r"""
    Calculate the pairwise Mahalanobis distance between two arrays.

    This metric is used for calculating the distances between observations from
    Gaussian distributions. It is defined as:

    .. math::
        d_{\Sigma}(x, y)^2 = (x - y)^T \Sigma^{-1} (x - y)

    where :math:`\Sigma` is the covariance matrix, :math:`x` and :math:`y` are
    observations from the same distribution.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples_X, n_components)
        An array where each row is a sample and each column is a component.
    Y : numpy.ndarray of shape (n_samples_Y, n_components)
        An array where each row is a sample and each column is a component.
    cov_inv : numpy.ndarray of shape (n_components, n_components) or \
            (n_cov, n_components, n_components)
        The inverse covariance matrix, or a stack of `n_cov` inverse covariance
        matrices. A single 2D matrix is treated as a stack with `n_cov = 1`.
    cell_length : numpy.ndarray of shape (n_components,), optional, default=None
        The cell size for periodic boundary conditions.
        None for non-periodic boundary conditions.

        .. note::
            Only side lengths of rectangular cells are supported.
            Cell format: `[side_length_1, ..., side_length_n]`
    squared : bool, default=False
        Whether to return the squared distance.

    Returns
    -------
    numpy.ndarray of shape (n_cov, n_samples_X, n_samples_Y)
        The pairwise Mahalanobis distance between the two input arrays, one
        distance matrix per inverse covariance matrix.

    Raises
    ------
    ValueError
        If the number of entries in `cell_length` does not match the number of
        components of `X`.

    Examples
    --------
    >>> import numpy as np
    >>> from skmatter.metrics import pairwise_mahalanobis_distances
    >>> iv = np.array([[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]])
    >>> X = np.array([[1, 0, 0], [0, 2, 0], [2, 0, 0]])
    >>> Y = np.array([[0, 1, 0]])
    >>> pairwise_mahalanobis_distances(X, Y, iv)
    array([[[1.        ],
            [1.        ],
            [1.73205081]]])
    """
    # Validate/convert first so the dimension check (and everything below)
    # operates on real arrays rather than arbitrary array-likes.
    X, Y = check_pairwise_arrays(X, Y)
    _check_dimension(X, cell_length)

    cov_inv = np.asarray(cov_inv)
    if cov_inv.ndim == 2:
        # Promote a single matrix to a stack of one.
        cov_inv = cov_inv[np.newaxis, :, :]

    n_x, n_y = X.shape[0], Y.shape[0]

    # All pairwise differences, flattened to (n_x * n_y, n_components) with the
    # X index varying slowest.
    diff = (X[:, np.newaxis, :] - Y[np.newaxis, :, :]).reshape(
        n_x * n_y, X.shape[1]
    )
    if cell_length is not None:
        # Shift each component by the nearest whole number of cell lengths.
        diff -= np.round(diff / cell_length) * cell_length

    # (n_cov, d, d) @ (d, N) -> (n_cov, d, N); transpose to (n_cov, N, d) so it
    # lines up with `diff` for the row-wise dot product diff^T Sigma^-1 diff.
    projected = np.transpose(cov_inv @ diff.T, (0, 2, 1))
    dists = np.sum(diff * projected, axis=-1).reshape(cov_inv.shape[0], n_x, n_y)

    if not squared:
        dists **= 0.5
    return dists
def _check_dimension(X, cell_length):
    """Check that `cell_length` has one entry per component (column) of `X`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_components)
        The data whose second dimension is compared against the cell.
    cell_length : sequence of length n_components or None
        Cell side lengths. `None` disables the check.

    Raises
    ------
    ValueError
        If `cell_length` is given and its length differs from the number of
        columns of `X`.
    """
    if cell_length is None:
        return
    # np.shape works for plain nested sequences too, not only for objects that
    # expose a ``.shape`` attribute.
    if np.shape(X)[1] != len(cell_length):
        raise ValueError("Cell dimension does not match the data dimension.")