Source code for allel.stats

# -*- coding: utf-8 -*-
"""
This module provides statistical functions for use with variant call data.

"""
from __future__ import absolute_import, print_function, division


import itertools


import numpy as np


from allel.model import GenotypeArray, SortedIndex
from allel.util import asarray_ndim, ignore_invalid, check_arrays_aligned


[docs]def moving_statistic(values, statistic, size=None, start=0, stop=None,
                     step=None):
    """Calculate a statistic in a moving window over `values`.

    Parameters
    ----------

    values : array_like
        The data to summarise.
    statistic : function
        The statistic to compute within each window.
    size : int
        The window size (number of values).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------

    out : ndarray, shape (n_windows,)

    Examples
    --------

    >>> import allel
    >>> values = [2, 5, 8, 16]
    >>> allel.stats.moving_statistic(values, np.sum, size=2)
    array([ 7, 24])
    >>> allel.stats.moving_statistic(values, np.sum, size=2, step=1)
    array([ 7, 13, 24])

    """

    windows = index_windows(values, size, start, stop, step)

    # setup output
    out = np.array([statistic(values[i:j]) for i, j in windows])

    return out


def index_windows(values, size, start, stop, step):
    """Convenience function to construct windows for the
    :func:`moving_statistic` function.

    """

    # determine step
    if stop is None:
        stop = len(values)
    if step is None:
        # non-overlapping
        step = size

    # iterate over windows
    last = False
    for window_start in range(start, stop, step):

        window_stop = window_start + size
        if window_stop >= stop:
            # last window
            window_stop = stop
            last = True

        yield (window_start, window_stop)

        if last:
            raise StopIteration


def position_windows(pos, size, start, stop, step):
    """Convenience function to construct windows for the
    :func:`windowed_statistic` and :func:`windowed_count` functions.

    """
    last = False

    # determine start and stop positions
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]
    if step is None:
        # non-overlapping
        step = size

    windows = []
    for window_start in range(start, stop, step):

        # determine window stop
        window_stop = window_start + size
        if window_stop >= stop:
            # last window
            window_stop = stop
            last = True
        else:
            window_stop -= 1

        windows.append([window_start, window_stop])

        if last:
            break

    return np.asarray(windows)


def window_locations(pos, windows):
    """Locate indices in `pos` corresponding to the start and stop positions
    of `windows`.

    """
    start_locs = np.searchsorted(pos, windows[:, 0])
    stop_locs = np.searchsorted(pos, windows[:, 1], side='right')
    locs = np.column_stack((start_locs, stop_locs))
    return locs


[docs]def windowed_count(pos, size=None, start=None, stop=None, step=None,
                   windows=None):
    """Count the number of items in windows over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        The item positions in ascending order, using 1-based coordinates..
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.

    Returns
    -------

    counts : ndarray, int, shape (n_windows,)
        The number of items in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.

    Notes
    -----

    The window stop positions are included within a window.

    The final window will be truncated to the specified stop position,
    and so may be smaller than the other windows.

    Examples
    --------

    Non-overlapping windows::

        >>> import allel
        >>> pos = [1, 7, 12, 15, 28]
        >>> counts, windows = allel.stats.windowed_count(pos, size=10)
        >>> counts
        array([2, 2, 1])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 28]])

    Half-overlapping windows::

        >>> counts, windows = allel.stats.windowed_count(pos, size=10, step=5)
        >>> counts
        array([2, 3, 2, 0, 1])
        >>> windows
        array([[ 1, 10],
               [ 6, 15],
               [11, 20],
               [16, 25],
               [21, 28]])

    """

    # assume sorted positions
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)

    # setup windows
    if windows is None:
        windows = position_windows(pos, size, start, stop, step)
    else:
        windows = asarray_ndim(windows, 2)

    # find window locations
    locs = window_locations(pos, windows)

    # count number of items in each window
    counts = np.diff(locs, axis=1).reshape(-1)

    return counts, windows


[docs]def windowed_statistic(pos, values, statistic, size, start=None, stop=None,
                       step=None, windows=None, fill=np.nan):
    """Calculate a statistic from items in windows over a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        The item positions in ascending order, using 1-based coordinates..
    values : array_like, int, shape (n_items,)
        The values to summarise.
    statistic : function
        The statistic to compute.
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where a window is empty, i.e., contains no items.

    Returns
    -------

    out : ndarray, shape (n_windows,)
        The value of the statistic for each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        The number of items in each window.

    Notes
    -----

    The window stop positions are included within a window.

    The final window will be truncated to the specified stop position,
    and so may be smaller than the other windows.

    Examples
    --------

    Count non-zero (i.e., True) items in non-overlapping windows::

        >>> import allel
        >>> pos = [1, 7, 12, 15, 28]
        >>> values = [True, False, True, False, False]
        >>> nnz, windows, counts = allel.stats.windowed_statistic(
        ...     pos, values, statistic=np.count_nonzero, size=10
        ... )
        >>> nnz
        array([1, 1, 0])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 28]])
        >>> counts
        array([2, 2, 1])

    Compute a sum over items in half-overlapping windows::

        >>> values = [3, 4, 2, 6, 9]
        >>> x, windows, counts = allel.stats.windowed_statistic(
        ...     pos, values, statistic=np.sum, size=10, step=5, fill=0
        ... )
        >>> x
        array([  7.,  12.,   8.,   0.,   9.])
        >>> windows
        array([[ 1, 10],
               [ 6, 15],
               [11, 20],
               [16, 25],
               [21, 28]])
        >>> counts
        array([2, 3, 2, 0, 1])

    """

    # assume sorted positions
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)

    # check lengths are equal
    if len(pos) != len(values):
        raise ValueError('arrays must be of equal length')

    # setup windows
    if windows is None:
        windows = position_windows(pos, size, start, stop, step)
    else:
        windows = asarray_ndim(windows, 2)

    # find window locations
    locs = window_locations(pos, windows)

    # setup outputs
    out = []
    counts = []

    # iterate over windows
    for start_idx, stop_idx in locs:

        # calculate number of values in window
        n = stop_idx - start_idx

        # extract values for window
        window_values = values[start_idx:stop_idx]

        # compute statistic
        s = statistic(window_values)

        # store outputs
        out.append(s)
        counts.append(n)

    # convert to arrays for output
    return np.asarray(out), windows, np.asarray(counts)


[docs]def per_base(x, windows, is_accessible=None, fill=np.nan):
    """Calculate the per-base value of a windowed statistic.

    Parameters
    ----------

    x : array_like, shape (n_windows,)
        The statistic to average per-base.
    windows : array_like, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop)
        positions using 1-based coordinates.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        Use this value where there are no accessible bases in a window.

    Returns
    -------

    y : ndarray, float, shape (n_windows,)
        The input array divided by the number of (accessible) bases in each
        window.
    n_bases : ndarray, int, shape (n_windows,)
        The number of (accessible) bases in each window

    """

    # calculate window sizes
    if is_accessible is None:
        # N.B., window stops are included
        n_bases = np.diff(windows, axis=1).reshape(-1) + 1
    else:
        n_bases = np.array([np.count_nonzero(is_accessible[i-1:j])
                            for i, j in windows])

    # deal with multidimensional x
    if x.ndim == 1:
        pass
    elif x.ndim == 2:
        n_bases = n_bases[:, None]
    else:
        raise NotImplementedError('only arrays of 1 or 2 dimensions supported')

    # calculate density per-base
    with ignore_invalid():
        y = np.where(n_bases > 0, x / n_bases, fill)

    # restore to 1-dimensional
    if n_bases.ndim > 1:
        n_bases = n_bases.reshape(-1)

    return y, n_bases


[docs]def mean_pairwise_diversity(ac, fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between haplotypes within a single population.

    Parameters
    ----------

    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide diversity, a.k.a. *pi*.

    Examples
    --------

    >>> import allel
    >>> h = allel.model.HaplotypeArray([[0, 0, 0, 0],
    ...                                 [0, 0, 0, 1],
    ...                                 [0, 0, 1, 1],
    ...                                 [0, 1, 1, 1],
    ...                                 [1, 1, 1, 1],
    ...                                 [0, 0, 1, 2],
    ...                                 [0, 1, 1, 2],
    ...                                 [0, 1, -1, -1]])
    >>> ac = h.count_alleles()
    >>> allel.stats.mean_pairwise_diversity(ac)
    array([ 0.        ,  0.5       ,  0.66666667,  0.5       ,  0.        ,
            0.83333333,  0.83333333,  1.        ])

    See Also
    --------

    sequence_diversity, windowed_diversity

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes within a single population, generalising to any number
    # of alleles.

    # check inputs
    ac = asarray_ndim(ac, 2)

    # total number of haplotypes
    an = np.sum(ac, axis=1)

    # total number of pairwise comparisons for each variant:
    # (an choose 2)
    n_pairs = an * (an - 1) / 2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac choose 2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac * (ac - 1) / 2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd


def _resize_dim2(a, l):
    newshape = a.shape[0], l
    b = np.zeros(newshape, dtype=a.dtype)
    b[:, :a.shape[1]] = a
    return b


[docs]def mean_pairwise_divergence(ac1, ac2, an1=None, an2=None, fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between haplotypes from two different populations.

    Parameters
    ----------

    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide divergence between two populations, a.k.a. *Dxy*.

    Examples
    --------

    >>> import allel
    >>> h = allel.model.HaplotypeArray([[0, 0, 0, 0],
    ...                                 [0, 0, 0, 1],
    ...                                 [0, 0, 1, 1],
    ...                                 [0, 1, 1, 1],
    ...                                 [1, 1, 1, 1],
    ...                                 [0, 0, 1, 2],
    ...                                 [0, 1, 1, 2],
    ...                                 [0, 1, -1, -1]])
    >>> ac1 = h.take([0, 1], axis=1).count_alleles()
    >>> ac2 = h.take([2, 3], axis=1).count_alleles()
    >>> allel.stats.mean_pairwise_divergence(ac1, ac2)
    array([ 0.  ,  0.5 ,  1.  ,  0.5 ,  0.  ,  1.  ,  0.75,   nan])

    See Also
    --------

    sequence_divergence, windowed_divergence

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes from two different populations, generalising to any
    # number of alleles.

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    # check lengths match
    check_arrays_aligned(ac1, ac2)
    # ensure same number of alleles in both pops
    if ac1.shape[1] < ac2.shape[1]:
        ac1 = _resize_dim2(ac1, ac2.shape[1])
    elif ac2.shape[1] < ac1.shape[1]:
        ac2 = _resize_dim2(ac2, ac1.shape[1])

    # total number of haplotypes sampled from each population
    if an1 is None:
        an1 = np.sum(ac1, axis=1)
    if an2 is None:
        an2 = np.sum(ac2, axis=1)

    # total number of pairwise comparisons for each variant
    n_pairs = an1 * an2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac1 * ac2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac1 * ac2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd


[docs]def sequence_diversity(pos, ac, start=None, stop=None,
                       is_accessible=None):
    """Calculate nucleotide diversity within a given region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    pi : ndarray, float, shape (n_windows,)
        Nucleotide diversity.

    Examples
    --------

    >>> import allel
    >>> g = allel.model.GenotypeArray([[[0, 0], [0, 0]],
    ...                                [[0, 0], [0, 1]],
    ...                                [[0, 0], [1, 1]],
    ...                                [[0, 1], [1, 1]],
    ...                                [[1, 1], [1, 1]],
    ...                                [[0, 0], [1, 2]],
    ...                                [[0, 1], [1, 2]],
    ...                                [[0, 1], [-1, -1]],
    ...                                [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi = allel.stats.sequence_diversity(pos, ac, start=1, stop=31)
    >>> pi
    0.13978494623655915

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    if start is not None or stop is not None:
        loc = pos.locate_range(start, stop)
        pos = pos[loc]
        ac = ac[loc]
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise diversity
    mpd = mean_pairwise_diversity(ac, fill=0)

    # sum diversity
    mpd_sum = np.sum(mpd)

    # calculate value per base
    if is_accessible is None:
        n_bases = stop - start + 1
    else:
        n_bases = np.count_nonzero(is_accessible[start-1:stop])

    pi = mpd_sum / n_bases
    return pi


[docs]def sequence_divergence(pos, ac1, ac2, an1=None, an2=None, start=None,
                        stop=None, is_accessible=None):
    """Calculate nucleotide divergence between two populations within a
    given region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.model.HaplotypeArray([[0, 0, 0, 0],
        ...                                 [0, 0, 0, 1],
        ...                                 [0, 0, 1, 1],
        ...                                 [0, 1, 1, 1],
        ...                                 [1, 1, 1, 1],
        ...                                 [0, 0, 1, 2],
        ...                                 [0, 1, 1, 2],
        ...                                 [0, 1, -1, -1],
        ...                                 [-1, -1, -1, -1]])
        >>> h1 = h.subset(haplotypes=[0, 1])
        >>> h2 = h.subset(haplotypes=[2, 3])
        >>> ac1 = h1.count_alleles()
        >>> ac2 = h2.count_alleles()
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31)
        >>> dxy
        0.12096774193548387

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    if start is not None or stop is not None:
        loc = pos.locate_range(start, stop)
        pos = pos[loc]
        ac1 = ac1[loc]
        ac2 = ac2[loc]
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise diversity
    mpd = mean_pairwise_divergence(ac1, ac2, an1=an1, an2=an2, fill=0)

    # sum divergence
    mpd_sum = np.sum(mpd)

    # calculate value per base
    if is_accessible is None:
        n_bases = stop - start + 1
    else:
        n_bases = np.count_nonzero(is_accessible[start-1:stop])

    dxy = mpd_sum / n_bases

    return dxy


[docs]def windowed_diversity(pos, ac, size, start=None, stop=None, step=None,
                       windows=None, is_accessible=None, fill=np.nan):
    """Calculate nucleotide diversity in windows over a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    pi : ndarray, float, shape (n_windows,)
        Nucleotide diversity in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.model.GenotypeArray([[[0, 0], [0, 0]],
    ...                                [[0, 0], [0, 1]],
    ...                                [[0, 0], [1, 1]],
    ...                                [[0, 1], [1, 1]],
    ...                                [[1, 1], [1, 1]],
    ...                                [[0, 0], [1, 2]],
    ...                                [[0, 1], [1, 2]],
    ...                                [[0, 1], [-1, -1]],
    ...                                [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi, windows, n_bases, counts = allel.stats.windowed_diversity(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> pi
    array([ 0.11666667,  0.21666667,  0.09090909])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise diversity
    mpd = mean_pairwise_diversity(ac, fill=0)

    # sum in windows
    mpd_sum, windows, counts = windowed_statistic(
        pos, values=mpd, statistic=np.sum, size=size, start=start, stop=stop,
        step=step, windows=windows, fill=0
    )

    # calculate value per base
    pi, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible,
                           fill=fill)

    return pi, windows, n_bases, counts


[docs]def windowed_divergence(pos, ac1, ac2, size, start=None, stop=None, step=None,
                        is_accessible=None, fill=np.nan):
    """Calculate nucleotide divergence between two populations in windows
    over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.model.HaplotypeArray([[0, 0, 0, 0],
        ...                                 [0, 0, 0, 1],
        ...                                 [0, 0, 1, 1],
        ...                                 [0, 1, 1, 1],
        ...                                 [1, 1, 1, 1],
        ...                                 [0, 0, 1, 2],
        ...                                 [0, 1, 1, 2],
        ...                                 [0, 1, -1, -1],
        ...                                 [-1, -1, -1, -1]])
        >>> h1 = h.subset(haplotypes=[0, 1])
        >>> h2 = h.subset(haplotypes=[2, 3])
        >>> ac1 = h1.count_alleles()
        >>> ac2 = h2.count_alleles()
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy, windows, n_bases, counts = windowed_divergence(
        ...     pos, ac1, ac2, size=10, start=1, stop=31
        ... )
        >>> dxy
        array([ 0.15 ,  0.225,  0.   ])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 31]])
        >>> n_bases
        array([10, 10, 11])
        >>> counts
        array([3, 4, 2])

    """

    # check inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise divergence
    mpd = mean_pairwise_divergence(ac1, ac2, fill=0)

    # sum in windows
    mpd_sum, windows, counts = windowed_statistic(
        pos, values=mpd, statistic=np.sum, size=size, start=start,
        stop=stop, step=step, fill=0
    )

    # calculate value per base
    dxy, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible,
                            fill=fill)

    return dxy, windows, n_bases, counts


[docs]def heterozygosity_observed(g, fill=np.nan):
    """Calculate the rate of observed heterozygosity for each variant.

    Parameters
    ----------

    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Use this value for variants where all calls are missing.

    Returns
    -------

    ho : ndarray, float, shape (n_variants,)
        Observed heterozygosity

    Examples
    --------

    >>> import allel
    >>> g = allel.model.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                                [[0, 0], [0, 1], [1, 1]],
    ...                                [[0, 0], [1, 1], [2, 2]],
    ...                                [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.heterozygosity_observed(g)
    array([ 0.        ,  0.33333333,  0.        ,  0.5       ])

    """

    # check inputs
    if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
        g = GenotypeArray(g, copy=False)

    # count hets
    n_het = np.asarray(g.count_het(axis=1))
    n_called = np.asarray(g.count_called(axis=1))

    # calculate rate of observed heterozygosity, accounting for variants
    # where all calls are missing
    with ignore_invalid():
        ho = np.where(n_called > 0, n_het / n_called, fill)

    return ho


[docs]def heterozygosity_expected(af, ploidy, fill=np.nan):
    """Calculate the expected rate of heterozygosity for each variant
    under Hardy-Weinberg equilibrium.

    Parameters
    ----------

    af : array_like, float, shape (n_variants, n_alleles)
        Allele frequencies array.
    fill : float, optional
        Use this value for variants where allele frequencies do not sum to 1.

    Returns
    -------

    he : ndarray, float, shape (n_variants,)
        Expected heterozygosity

    Examples
    --------

    >>> import allel
    >>> g = allel.model.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                                [[0, 0], [0, 1], [1, 1]],
    ...                                [[0, 0], [1, 1], [2, 2]],
    ...                                [[1, 1], [1, 2], [-1, -1]]])
    >>> af = g.count_alleles().to_frequencies()
    >>> allel.stats.heterozygosity_expected(af, ploidy=2)
    array([ 0.        ,  0.5       ,  0.66666667,  0.375     ])

    """

    # check inputs
    af = asarray_ndim(af, 2)

    # calculate expected heterozygosity
    out = 1 - np.sum(np.power(af, ploidy), axis=1)

    # fill values where allele frequencies could not be calculated
    af_sum = np.sum(af, axis=1)
    with ignore_invalid():
        out[(af_sum < 1) | np.isnan(af_sum)] = fill

    return out


[docs]def inbreeding_coefficient(g, fill=np.nan):
    """Calculate the inbreeding coefficient for each variant.

    Parameters
    ----------

    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Use this value for variants where the expected heterozygosity is
        zero.

    Returns
    -------

    f : ndarray, float, shape (n_variants,)
        Inbreeding coefficient.

    Notes
    -----

    The inbreeding coefficient is calculated as *1 - (Ho/He)* where *Ho* is
    the observed heterozygosity and *He* is the expected heterozygosity.

    Examples
    --------

    >>> import allel
    >>> g = allel.model.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                                [[0, 0], [0, 1], [1, 1]],
    ...                                [[0, 0], [1, 1], [2, 2]],
    ...                                [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.inbreeding_coefficient(g)
    array([        nan,  0.33333333,  1.        , -0.33333333])

    """

    # check inputs
    if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
        g = GenotypeArray(g, copy=False)

    # calculate observed and expected heterozygosity
    ho = heterozygosity_observed(g)
    af = g.count_alleles().to_frequencies()
    he = heterozygosity_expected(af, ploidy=g.shape[-1], fill=0)

    # calculate inbreeding coefficient, accounting for variants with no
    # expected heterozygosity
    with ignore_invalid():
        f = np.where(he > 0, 1 - (ho / he), fill)

    return f


[docs]def pairwise_distance(x, metric):
    """Compute pairwise distance between individuals (e.g., samples or
    haplotypes).

    Parameters
    ----------

    x : array_like, shape (n, m, ...)
        Array of m observations (e.g., samples or haplotypes) in a space
        with n dimensions (e.g., variants). Note that the order of the first
        two dimensions is **swapped** compared to what is expected by
        scipy.spatial.distance.pdist.
    metric : string or function
        Distance metric. See documentation for the function
        :func:`scipy.spatial.distance.pdist` for a list of built-in
        distance metrics.

    Returns
    -------

    dist : ndarray, shape (n_individuals * (n_individuals - 1) / 2,)
        Distance matrix in condensed form.

    See Also
    --------

    allel.plot.pairwise_distance

    Notes
    -----

    If `x` is a bcolz carray, a chunk-wise implementation will be used to
    avoid loading the entire input array into memory. This means that
    a distance matrix will be calculated for each chunk in the input array,
    and the results will be summed to produce the final output. For some
    distance metrics this will return a different result from the standard
    implementation, although the relative distances may be equivalent.

    Examples
    --------

    >>> import allel
    >>> g = allel.model.GenotypeArray([[[0, 0], [0, 1], [1, 1]],
    ...                                [[0, 1], [1, 1], [1, 2]],
    ...                                [[0, 2], [2, 2], [-1, -1]]])
    >>> d = allel.stats.pairwise_distance(g.to_n_alt(), metric='cityblock')
    >>> d
    array([ 3.,  4.,  3.])
    >>> import scipy.spatial
    >>> scipy.spatial.distance.squareform(d)
    array([[ 0.,  3.,  4.],
           [ 3.,  0.,  3.],
           [ 4.,  3.,  0.]])

    """

    import scipy.spatial

    # check inputs
    if not hasattr(x, 'ndim'):
        x = np.asarray(x)
    if x.ndim < 2:
        raise ValueError('array with at least 2 dimensions expected')

    if x.ndim == 2:
        # use scipy to calculate distance, it's most efficient

        def f(b):

            # transpose as pdist expects (m, n) for m observations in an
            # n-dimensional space
            t = b.T

            # compute the distance matrix
            return scipy.spatial.distance.pdist(t, metric=metric)

    else:
        # use our own implementation, it handles multidimensional observations

        def f(b):
            return pdist(b, metric=metric)

    if hasattr(x, 'chunklen'):
        # use block-wise implementation
        blen = x.chunklen
        dist = None
        for i in range(0, x.shape[0], blen):
            block = x[i:i+blen]
            if dist is None:
                dist = f(block)
            else:
                dist += f(block)

    else:
        # standard implementation
        dist = f(x)

    return dist


def pdist(x, metric):
    """Alternative implementation of :func:`scipy.spatial.distance.pdist`
    which is slower but more flexible in that arrays with >2 dimensions can be
    passed, allowing for multidimensional observations, e.g., diploid
    genotype calls or allele counts.

    Parameters
    ----------

    x : array_like, shape (n, m, ...)
        Array of m observations (e.g., samples or haplotypes) in a space
        with n dimensions (e.g., variants). Note that the order of the first
        two dimensions is **swapped** compared to what is expected by
        scipy.spatial.distance.pdist.
    metric : string or function
        Distance metric. See documentation for the function
        :func:`scipy.spatial.distance.pdist` for a list of built-in
        distance metrics.

    Returns
    -------

    dist : ndarray
        Distance matrix in condensed form.

    """

    if isinstance(metric, str):
        import scipy.spatial
        if hasattr(scipy.spatial.distance, metric):
            metric = getattr(scipy.spatial.distance, metric)
        else:
            raise ValueError('metric name not found')

    m = x.shape[1]
    dist = list()
    for i, j in itertools.combinations(range(m), 2):
        a = x[:, i, ...]
        b = x[:, j, ...]
        d = metric(a, b)
        dist.append(d)
    return np.array(dist)


[docs]def pairwise_dxy(pos, gac, start=None, stop=None, is_accessible=None):
    """Convenience function to calculate a pairwise distance matrix using
    nucleotide divergence (a.k.a. Dxy) as the distance metric.

    Parameters
    ----------

    pos : array_like, int, shape (n_variants,)
        Variant positions.
    gac : array_like, int, shape (n_variants, n_samples, n_alleles)
        Per-genotype allele counts.
    start : int, optional
        Start position of region to use.
    stop : int, optional
        Stop position of region to use.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    dist : ndarray
        Distance matrix in condensed form.

    See Also
    --------

    allel.model.GenotypeArray.to_allele_counts
    """

    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    gac = asarray_ndim(gac, 3)
    # compute this once here, to avoid repeated evaluation within the loop
    gan = np.sum(gac, axis=2)
    m = gac.shape[1]
    dist = list()
    for i, j in itertools.combinations(range(m), 2):
        ac1 = gac[:, i, ...]
        an1 = gan[:, i]
        ac2 = gac[:, j, ...]
        an2 = gan[:, j]
        d = sequence_divergence(pos, ac1, ac2, an1=an1, an2=an2,
                                start=start, stop=stop,
                                is_accessible=is_accessible)
        dist.append(d)
    return np.array(dist)