Source code for allel.stats.distance

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import itertools


import numpy as np


from allel.model.ndarray import SortedIndex
from allel.util import asarray_ndim, ensure_square
from allel.stats.diversity import sequence_divergence
from allel.chunked import get_blen_array


[docs]def pairwise_distance(x, metric, chunked=False, blen=None): """Compute pairwise distance between individuals (e.g., samples or haplotypes). Parameters ---------- x : array_like, shape (n, m, ...) Array of m observations (e.g., samples or haplotypes) in a space with n dimensions (e.g., variants). Note that the order of the first two dimensions is **swapped** compared to what is expected by scipy.spatial.distance.pdist. metric : string or function Distance metric. See documentation for the function :func:`scipy.spatial.distance.pdist` for a list of built-in distance metrics. chunked : bool, optional If True, use a block-wise implementation to avoid loading the entire input array into memory. This means that a distance matrix will be calculated for each block of the input array, and the results will be summed to produce the final output. For some distance metrics this will return a different result from the standard implementation. blen : int, optional Block length to use for chunked implementation. Returns ------- dist : ndarray, shape (m * (m - 1) / 2,) Distance matrix in condensed form. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 1], [1, 1]], ... [[0, 1], [1, 1], [1, 2]], ... [[0, 2], [2, 2], [-1, -1]]]) >>> d = allel.stats.pairwise_distance(g.to_n_alt(), metric='cityblock') >>> d array([ 3., 4., 3.]) >>> import scipy.spatial >>> scipy.spatial.distance.squareform(d) array([[ 0., 3., 4.], [ 3., 0., 3.], [ 4., 3., 0.]]) """ import scipy.spatial # check inputs if not hasattr(x, 'ndim'): x = np.asarray(x) if x.ndim < 2: raise ValueError('array with at least 2 dimensions expected') if x.ndim == 2: # use scipy to calculate distance, it's most efficient def f(b): # transpose as pdist expects (m, n) for m observations in an # n-dimensional space t = b.T # compute the distance matrix return scipy.spatial.distance.pdist(t, metric=metric) else: # use our own implementation, it handles multidimensional observations def f(b): return pdist(b, metric=metric) if chunked: # use block-wise implementation blen = get_blen_array(x, blen) dist = None for i in range(0, x.shape[0], blen): j = min(x.shape[0], i+blen) block = x[i:j] if dist is None: dist = f(block) else: dist += f(block) else: # standard implementation dist = f(x) return dist
def pdist(x, metric): """Alternative implementation of :func:`scipy.spatial.distance.pdist` which is slower but more flexible in that arrays with >2 dimensions can be passed, allowing for multidimensional observations, e.g., diploid genotype calls or allele counts. Parameters ---------- x : array_like, shape (n, m, ...) Array of m observations (e.g., samples or haplotypes) in a space with n dimensions (e.g., variants). Note that the order of the first two dimensions is **swapped** compared to what is expected by scipy.spatial.distance.pdist. metric : string or function Distance metric. See documentation for the function :func:`scipy.spatial.distance.pdist` for a list of built-in distance metrics. Returns ------- dist : ndarray Distance matrix in condensed form. """ if isinstance(metric, str): import scipy.spatial if hasattr(scipy.spatial.distance, metric): metric = getattr(scipy.spatial.distance, metric) else: raise ValueError('metric name not found') m = x.shape[1] dist = list() for i, j in itertools.combinations(range(m), 2): a = x[:, i, ...] b = x[:, j, ...] d = metric(a, b) dist.append(d) return np.array(dist)
[docs]def pairwise_dxy(pos, gac, start=None, stop=None, is_accessible=None): """Convenience function to calculate a pairwise distance matrix using nucleotide divergence (a.k.a. Dxy) as the distance metric. Parameters ---------- pos : array_like, int, shape (n_variants,) Variant positions. gac : array_like, int, shape (n_variants, n_samples, n_alleles) Per-genotype allele counts. start : int, optional Start position of region to use. stop : int, optional Stop position of region to use. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- dist : ndarray Distance matrix in condensed form. See Also -------- allel.model.ndarray.GenotypeArray.to_allele_counts """ if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) gac = asarray_ndim(gac, 3) # compute this once here, to avoid repeated evaluation within the loop gan = np.sum(gac, axis=2) m = gac.shape[1] dist = list() for i, j in itertools.combinations(range(m), 2): ac1 = gac[:, i, ...] an1 = gan[:, i] ac2 = gac[:, j, ...] an2 = gan[:, j] d = sequence_divergence(pos, ac1, ac2, an1=an1, an2=an2, start=start, stop=stop, is_accessible=is_accessible) dist.append(d) return np.array(dist)
[docs]def pcoa(dist): """Perform principal coordinate analysis of a distance matrix, a.k.a. classical multi-dimensional scaling. Parameters ---------- dist : array_like Distance matrix in condensed form. Returns ------- coords : ndarray, shape (n_samples, n_dimensions) Transformed coordinates for the samples. explained_ratio : ndarray, shape (n_dimensions) Variance explained by each dimension. """ import scipy.linalg # This implementation is based on the skbio.math.stats.ordination.PCoA # implementation, with some minor adjustments. # check inputs dist = ensure_square(dist) # perform scaling e_matrix = (dist ** 2) / -2 row_means = e_matrix.mean(axis=1, keepdims=True) col_means = e_matrix.mean(axis=0, keepdims=True) matrix_mean = e_matrix.mean() f_matrix = e_matrix - row_means - col_means + matrix_mean eigvals, eigvecs = scipy.linalg.eigh(f_matrix) # deal with eigvals close to zero close_to_zero = np.isclose(eigvals, 0) eigvals[close_to_zero] = 0 # sort descending idxs = eigvals.argsort()[::-1] eigvals = eigvals[idxs] eigvecs = eigvecs[:, idxs] # keep only positive eigenvalues keep = eigvals >= 0 eigvecs = eigvecs[:, keep] eigvals = eigvals[keep] # compute coordinates coords = eigvecs * np.sqrt(eigvals) # compute ratio explained explained_ratio = eigvals / eigvals.sum() return coords, explained_ratio
[docs]def condensed_coords(i, j, n): """Transform square distance matrix coordinates to the corresponding index into a condensed, 1D form of the matrix. Parameters ---------- i : int Row index. j : int Column index. n : int Size of the square matrix (length of first or second dimension). Returns ------- ix : int """ # guard conditions if i == j or i >= n or j >= n or i < 0 or j < 0: raise ValueError('invalid coordinates: %s, %s' % (i, j)) # normalise order i, j = sorted([i, j]) # calculate number of items in rows before this one (sum of arithmetic # progression) x = i * ((2 * n) - i - 1) / 2 # add on previous items in current row ix = x + j - i - 1 return int(ix)
[docs]def condensed_coords_within(pop, n): """Return indices into a condensed distance matrix for all pairwise comparisons within the given population. Parameters ---------- pop : array_like, int Indices of samples or haplotypes within the population. n : int Size of the square matrix (length of first or second dimension). Returns ------- indices : ndarray, int """ return [condensed_coords(i, j, n) for i, j in itertools.combinations(sorted(pop), 2)]
[docs]def condensed_coords_between(pop1, pop2, n): """Return indices into a condensed distance matrix for all pairwise comparisons between two populations. Parameters ---------- pop1 : array_like, int Indices of samples or haplotypes within the first population. pop2 : array_like, int Indices of samples or haplotypes within the second population. n : int Size of the square matrix (length of first or second dimension). Returns ------- indices : ndarray, int """ return [condensed_coords(i, j, n) for i, j in itertools.product(sorted(pop1), sorted(pop2))]
[docs]def plot_pairwise_distance(dist, labels=None, colorbar=True, ax=None, imshow_kwargs=None): """Plot a pairwise distance matrix. Parameters ---------- dist : array_like The distance matrix in condensed form. labels : sequence of strings, optional Sample labels for the axes. colorbar : bool, optional If True, add a colorbar to the current figure. ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. imshow_kwargs : dict-like, optional Additional keyword arguments passed through to :func:`matplotlib.pyplot.imshow`. Returns ------- ax : axes The axes on which the plot was drawn """ import matplotlib.pyplot as plt # check inputs dist_square = ensure_square(dist) # set up axes if ax is None: # make a square figure x = plt.rcParams['figure.figsize'][0] fig, ax = plt.subplots(figsize=(x, x)) fig.tight_layout() # setup imshow arguments if imshow_kwargs is None: imshow_kwargs = dict() imshow_kwargs.setdefault('interpolation', 'none') imshow_kwargs.setdefault('cmap', 'jet') imshow_kwargs.setdefault('vmin', np.min(dist)) imshow_kwargs.setdefault('vmax', np.max(dist)) # plot as image im = ax.imshow(dist_square, **imshow_kwargs) # tidy up if labels: ax.set_xticks(range(len(labels))) ax.set_yticks(range(len(labels))) ax.set_xticklabels(labels, rotation=90) ax.set_yticklabels(labels, rotation=0) else: ax.set_xticks([]) ax.set_yticks([]) if colorbar: plt.gcf().colorbar(im, shrink=.5) return ax