Source code for allel.stats.distance

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import itertools


import numpy as np
import scipy.linalg


from allel.model import SortedIndex
from allel.util import asarray_ndim, ensure_square
from allel.stats.diversity import sequence_divergence


[docs]def pairwise_distance(x, metric): """Compute pairwise distance between individuals (e.g., samples or haplotypes). Parameters ---------- x : array_like, shape (n, m, ...) Array of m observations (e.g., samples or haplotypes) in a space with n dimensions (e.g., variants). Note that the order of the first two dimensions is **swapped** compared to what is expected by scipy.spatial.distance.pdist. metric : string or function Distance metric. See documentation for the function :func:`scipy.spatial.distance.pdist` for a list of built-in distance metrics. Returns ------- dist : ndarray, shape (n_individuals * (n_individuals - 1) / 2,) Distance matrix in condensed form. See Also -------- allel.plot.pairwise_distance Notes ----- If `x` is a bcolz carray, a chunk-wise implementation will be used to avoid loading the entire input array into memory. This means that a distance matrix will be calculated for each chunk in the input array, and the results will be summed to produce the final output. For some distance metrics this will return a different result from the standard implementation, although the relative distances may be equivalent. Examples -------- >>> import allel >>> g = allel.model.GenotypeArray([[[0, 0], [0, 1], [1, 1]], ... [[0, 1], [1, 1], [1, 2]], ... [[0, 2], [2, 2], [-1, -1]]]) >>> d = allel.stats.pairwise_distance(g.to_n_alt(), metric='cityblock') >>> d array([ 3., 4., 3.]) >>> import scipy.spatial >>> scipy.spatial.distance.squareform(d) array([[ 0., 3., 4.], [ 3., 0., 3.], [ 4., 3., 0.]]) """ import scipy.spatial # check inputs if not hasattr(x, 'ndim'): x = np.asarray(x) if x.ndim < 2: raise ValueError('array with at least 2 dimensions expected') if x.ndim == 2: # use scipy to calculate distance, it's most efficient def f(b): # transpose as pdist expects (m, n) for m observations in an # n-dimensional space t = b.T # compute the distance matrix return scipy.spatial.distance.pdist(t, metric=metric) else: # use our own implementation, it handles multidimensional observations def f(b): return pdist(b, metric=metric) if hasattr(x, 'chunklen'): # use chunk-wise implementation blen = x.chunklen dist = None for i in range(0, x.shape[0], blen): block = x[i:i+blen] if dist is None: dist = f(block) else: dist += f(block) else: # standard implementation dist = f(x) return dist
def pdist(x, metric): """Alternative implementation of :func:`scipy.spatial.distance.pdist` which is slower but more flexible in that arrays with >2 dimensions can be passed, allowing for multidimensional observations, e.g., diploid genotype calls or allele counts. Parameters ---------- x : array_like, shape (n, m, ...) Array of m observations (e.g., samples or haplotypes) in a space with n dimensions (e.g., variants). Note that the order of the first two dimensions is **swapped** compared to what is expected by scipy.spatial.distance.pdist. metric : string or function Distance metric. See documentation for the function :func:`scipy.spatial.distance.pdist` for a list of built-in distance metrics. Returns ------- dist : ndarray Distance matrix in condensed form. """ if isinstance(metric, str): import scipy.spatial if hasattr(scipy.spatial.distance, metric): metric = getattr(scipy.spatial.distance, metric) else: raise ValueError('metric name not found') m = x.shape[1] dist = list() for i, j in itertools.combinations(range(m), 2): a = x[:, i, ...] b = x[:, j, ...] d = metric(a, b) dist.append(d) return np.array(dist)
[docs]def pairwise_dxy(pos, gac, start=None, stop=None, is_accessible=None): """Convenience function to calculate a pairwise distance matrix using nucleotide divergence (a.k.a. Dxy) as the distance metric. Parameters ---------- pos : array_like, int, shape (n_variants,) Variant positions. gac : array_like, int, shape (n_variants, n_samples, n_alleles) Per-genotype allele counts. start : int, optional Start position of region to use. stop : int, optional Stop position of region to use. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- dist : ndarray Distance matrix in condensed form. See Also -------- allel.model.GenotypeArray.to_allele_counts """ if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) gac = asarray_ndim(gac, 3) # compute this once here, to avoid repeated evaluation within the loop gan = np.sum(gac, axis=2) m = gac.shape[1] dist = list() for i, j in itertools.combinations(range(m), 2): ac1 = gac[:, i, ...] an1 = gan[:, i] ac2 = gac[:, j, ...] an2 = gan[:, j] d = sequence_divergence(pos, ac1, ac2, an1=an1, an2=an2, start=start, stop=stop, is_accessible=is_accessible) dist.append(d) return np.array(dist)
[docs]def pcoa(dist): """Perform principal coordinate analysis of a distance matrix, a.k.a. classical multi-dimensional scaling. Parameters ---------- dist : array_like Distance matrix in condensed form. Returns ------- coords : ndarray, shape (n_samples, n_dimensions) Transformed coordinates for the samples. explained_ratio : ndarray, shape (n_dimensions) Variance explained by each dimension. """ # This implementation is based on the skbio.math.stats.ordination.PCoA # implementation, with some minor adjustments. # check inputs dist = ensure_square(dist) # perform scaling e_matrix = (dist ** 2) / -2 row_means = e_matrix.mean(axis=1, keepdims=True) col_means = e_matrix.mean(axis=0, keepdims=True) matrix_mean = e_matrix.mean() f_matrix = e_matrix - row_means - col_means + matrix_mean eigvals, eigvecs = scipy.linalg.eigh(f_matrix) # deal with eigvals close to zero close_to_zero = np.isclose(eigvals, 0) eigvals[close_to_zero] = 0 # sort descending idxs = eigvals.argsort()[::-1] eigvals = eigvals[idxs] eigvecs = eigvecs[:, idxs] # keep only positive eigenvalues keep = eigvals >= 0 eigvecs = eigvecs[:, keep] eigvals = eigvals[keep] # compute coordinates coords = eigvecs * np.sqrt(eigvals) # compute ratio explained explained_ratio = eigvals / eigvals.sum() return coords, explained_ratio