Source code for allel.stats

# -*- coding: utf-8 -*-
"""
This module defines statistical functions for use with variant call data.

"""
from __future__ import absolute_import, print_function, division


import numpy as np
import scipy.stats


from allel.model import GenotypeArray, PositionIndex
from allel.util import ignore_invalid, check_arrays_aligned, asarray_ndim


def make_window_edges(pos, window, start=None, stop=None):
    region_start = pos.min() if start is None else start
    region_stop = pos.max() if stop is None else stop
    edges = np.arange(region_start, region_stop, window)
    if stop is None and edges[-1] < region_stop:
        # add one more window to ensure stop is included
        edges = np.append(edges, edges[-1] + window)
    elif stop is not None and edges[-1] < region_stop:
        # add one more window to ensure explicit stop is final edge
        edges = np.append(edges, stop)
    return edges


[docs]def windowed_statistic(pos, values, window, statistic, start=None, stop=None): """Compute a statistic in non-overlapping windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_variants,) Positions array (1-based). values : array_like, shape (n_variants,) Values to be summarised within each window. window : int Window size. statistic : string or function Statistic to compute. start : int, optional Start position of first window (1-based). stop : int, optional Stop position of last window (1-based). Returns ------- s : ndarray, shape (n_windows,) Computed statistic. edges : ndarray, shape (n_windows + 1,) Window edge positions. """ # check inputs pos = np.asarray(pos) values = np.asarray(values) check_arrays_aligned(pos, values) # determine bin edges edges = make_window_edges(pos, window, start, stop) # compute statistic s, _, _ = scipy.stats.binned_statistic(pos, values, statistic=statistic, bins=edges) return s, edges
[docs]def windowed_nnz(pos, b, window, start=None, stop=None): """Count nonzero (i.e., True) elements in non-overlapping windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_variants,) Positions array. b : array_like, bool, shape (n_variants,) or (n_variants, n_samples) Boolean array. window : int Window size. start : int, optional Start position of first window (1-based). stop : int, optional Stop position of last window (1-based). Returns ------- counts : ndarray, int, shape (n_windows,) or (n_windows, n_samples) Counts array. edges : ndarray, shape (n_windows + 1,) Window edge positions. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 1], [0, 1]], ... [[1, 1], [1, 2]], ... [[2, 2], [-1, -1]]], dtype='i1') >>> pos = allel.PositionIndex([2, 14, 15, 27]) >>> b = g.is_variant() >>> counts, edges = allel.windowed_nnz(pos, b, window=10) >>> edges array([ 2, 12, 22, 32]) >>> counts array([0, 2, 1]) >>> counts, edges = allel.windowed_nnz(pos, b, window=10, start=1, ... stop=27) >>> edges array([ 1, 11, 21, 27]) >>> counts array([0, 2, 1]) >>> b = g.is_het() >>> counts, edges = allel.windowed_nnz(pos, b, window=10) >>> edges array([ 2, 12, 22, 32]) >>> counts array([[0, 0], [1, 2], [0, 0]]) """ # check arguments pos = PositionIndex(pos) b = asarray_ndim(b, 1, 2) check_arrays_aligned(pos, b) # determine bin edges edges = make_window_edges(pos, window, start, stop) if b.ndim == 1: # 1D case pos_true = np.compress(b, pos) counts, _ = np.histogram(pos_true, bins=edges) else: # 2D case n_bins = len(edges) - 1 n_samples = b.shape[1] counts = np.empty((n_bins, n_samples), dtype=int) for i in range(n_samples): pos_true = np.compress(b[:, i], pos) h, _ = np.histogram(pos_true, bins=edges) counts[:, i] = h return counts, edges
def per_base(edges, values, is_accessible=None, fill=np.nan): """Compute the per-base value of a windowed statistic, optionally taking into account genome accessibility. Parameters ---------- edges : array_like, int, shape (n_windows + 1,) Window edge positions (1-based). values : array_like, shape (n_windows,) Values for each window. is_accessible : array_like, bool, shape (len(contig),), optional Accessibility mask. If provided, the size of each window will be calculated as the number of accessible positions, rather than the absolute window size. fill : optional Fill value to use if window size is zero. Returns ------- m : array_like, shape (n_windows,) Values divided by corresponding window sizes. widths : array_like, int, shape (n_windows,) Number of accessible positions in each window. """ # check inputs values = asarray_ndim(values, 1, 2) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # determine window sizes if is_accessible is None: # assume all genome positions are accessible widths = np.diff(edges) # final bin includes right edge widths[-1] += 1 else: # check accessibility array spans all windows if is_accessible.size < edges[-1]: raise ValueError('accessibility array does not span region') pos_accessible, = np.nonzero(is_accessible) # convert to 1-based coordinates pos_accessible += 1 widths, _ = np.histogram(pos_accessible, bins=edges) if values.ndim == 2: # insert singleton dimension to enable broadcasting widths = widths[:, None] # calculate value per base with ignore_invalid(): m = np.where(widths > 0, values / widths, fill) return m, widths[:]
[docs]def windowed_mean_per_base(pos, values, window, start=None, stop=None, is_accessible=None, fill=np.nan): """Calculate the mean per genome position of the given values in non-overlapping windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_variants,) Positions array. values : array_like, shape (n_variants,) Values to be summarised within each window. window : int Window size. start : int, optional Start position of first window (1-based). stop : int, optional Stop position of last window (1-based). is_accessible : array_like, bool, shape (len(contig),), optional Accessibility mask. If provided, the size of each window will be calculated as the number of accessible positions, rather than the absolute window size. fill : optional Fill value to use if window size is zero. Returns ------- m : ndarray, shape (n_windows,) Mean per base of values in each window. edges : ndarray, shape (n_windows + 1,) Window edge positions. widths : array_like, int, shape (n_windows,) Number of accessible positions in each window. """ s, edges = windowed_statistic(pos, values, window, statistic='sum', start=start, stop=stop) m, widths = per_base(edges, s, is_accessible=is_accessible, fill=fill) return m, edges, widths
[docs]def windowed_nnz_per_base(pos, b, window, start=None, stop=None, is_accessible=None, fill=np.nan): """Calculate the per-base-pair density of nonzero (i.e., True) elements in non-overlapping windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_variants,) Positions array. b : array_like, bool, shape (n_variants,) or (n_variants, n_samples) Boolean array. window : int Window size. start : int, optional Start position of first window (1-based). stop : int, optional Stop position of last window (1-based). is_accessible : array_like, bool, shape (len(contig),), optional Accessibility mask. If provided, the size of each window will be calculated as the number of accessible positions, rather than the absolute window size. fill : optional Fill value to use if window size is zero. Returns ------- densities : ndarray, shape (n_windows,) or (n_windows, n_samples) Per base density of True values in each window. edges : ndarray, shape (n_windows + 1,) Window edge positions. counts : ndarray, int, shape (n_windows,) or (n_windows, n_samples) Counts array. widths : array_like, int, shape (n_windows,) Number of accessible positions in each window. Examples -------- Assuming all positions are accessible:: >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 1], [0, 1]], ... [[1, 1], [1, 2]], ... [[2, 2], [-1, -1]]], dtype='i1') >>> pos = allel.PositionIndex([2, 14, 15, 27]) >>> b = g.is_variant() >>> densities, edges, counts, widths = allel.windowed_nnz_per_base( ... pos, b, window=10 ... ) >>> edges array([ 2, 12, 22, 32]) >>> widths array([10, 10, 11]) >>> counts array([0, 2, 1]) >>> densities array([ 0. , 0.2 , 0.09090909]) Density calculations can take into account the number of accessible positions within each window, e.g.:: >>> is_accessible = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, ... 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], ... dtype=bool) >>> densities, edges, counts, widths = allel.windowed_nnz_per_base( ... pos, b, start=1, stop=31, window=10, ... is_accessible=is_accessible, fill=np.nan ... ) >>> edges array([ 1, 11, 21, 31]) >>> widths array([ 0, 5, 11]) >>> counts array([0, 2, 1]) >>> densities array([ nan, 0.4 , 0.09090909]) """ counts, edges = windowed_nnz(pos, b, window, start=start, stop=stop) densities, widths = per_base(edges, counts, is_accessible=is_accessible, fill=fill) return densities, edges, counts, widths
[docs]def windowed_nucleotide_diversity(g, pos, window, start=None, stop=None, is_accessible=None, fill=np.nan): """Calculate nucleotide diversity in non-overlapping windows over a single chromosome/contig. Parameters ---------- g : array_like, int, shape (n_variants, n_samples, ploidy) Genotype array. pos : array_like, int, shape (n_variants,) Positions array. window : int Window size. start : int, optional Start position of first window (1-based). stop : int, optional Stop position of last window (1-based). is_accessible : array_like, bool, shape (len(contig),), optional Accessibility mask. If provided, the size of each window will be calculated as the number of accessible positions, rather than the absolute window size. fill : optional Fill value to use if window size is zero. Returns ------- pi : ndarray, shape (n_windows,) Nucleotide diversity in each window. edges : ndarray, shape (n_windows + 1,) Window edge positions. widths : array_like, int, shape (n_windows,) Number of accessible positions in each window. """ # check inputs g = GenotypeArray(g, copy=False) pos = PositionIndex(pos, copy=False) check_arrays_aligned(g, pos) # compute pairwise differences mpd = g.mean_pairwise_difference(fill=0) # mean per base pi, edges, widths = windowed_mean_per_base( pos, mpd, window, start=start, stop=stop, is_accessible=is_accessible, fill=fill ) return pi, edges, widths