Source code for allel.stats.diversity

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import logging


import numpy as np


from allel.model import SortedIndex, GenotypeArray, locate_fixed_differences
from allel.util import asarray_ndim, ignore_invalid, check_dim0_aligned, \
    ensure_dim1_aligned
from allel.stats.window import windowed_statistic, per_base


logger = logging.getLogger(__name__)
debug = logger.debug


[docs]def mean_pairwise_difference(ac, an=None, fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from within a single population.

    Parameters
    ----------

    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    an : array_like, int, shape (n_variants,), optional
        Allele numbers. If not provided, will be calculated from `ac`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide diversity, a.k.a. *pi*.

    Examples
    --------

    >>> import allel
    >>> h = allel.model.HaplotypeArray([[0, 0, 0, 0],
    ...                                 [0, 0, 0, 1],
    ...                                 [0, 0, 1, 1],
    ...                                 [0, 1, 1, 1],
    ...                                 [1, 1, 1, 1],
    ...                                 [0, 0, 1, 2],
    ...                                 [0, 1, 1, 2],
    ...                                 [0, 1, -1, -1]])
    >>> ac = h.count_alleles()
    >>> allel.stats.mean_pairwise_difference(ac)
    array([ 0.        ,  0.5       ,  0.66666667,  0.5       ,  0.        ,
            0.83333333,  0.83333333,  1.        ])

    See Also
    --------

    sequence_diversity, windowed_diversity

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes within a single population, generalising to any number
    # of alleles.

    # check inputs
    ac = asarray_ndim(ac, 2)

    # total number of haplotypes
    if an is None:
        an = np.sum(ac, axis=1)
    else:
        an = asarray_ndim(an, 1)
        check_dim0_aligned(ac, an)

    # total number of pairwise comparisons for each variant:
    # (an choose 2)
    n_pairs = an * (an - 1) / 2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac choose 2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac * (ac - 1) / 2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd


[docs]def mean_pairwise_difference_between(ac1, ac2, an1=None, an2=None,
                                     fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from two different populations.

    Parameters
    ----------

    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide divergence between two populations, a.k.a. *Dxy*.

    Examples
    --------

    >>> import allel
    >>> h = allel.model.HaplotypeArray([[0, 0, 0, 0],
    ...                                 [0, 0, 0, 1],
    ...                                 [0, 0, 1, 1],
    ...                                 [0, 1, 1, 1],
    ...                                 [1, 1, 1, 1],
    ...                                 [0, 0, 1, 2],
    ...                                 [0, 1, 1, 2],
    ...                                 [0, 1, -1, -1]])
    >>> ac1 = h.count_alleles(subpop=[0, 1])
    >>> ac2 = h.count_alleles(subpop=[2, 3])
    >>> allel.stats.mean_pairwise_difference_between(ac1, ac2)
    array([ 0.  ,  0.5 ,  1.  ,  0.5 ,  0.  ,  1.  ,  0.75,   nan])

    See Also
    --------

    sequence_divergence, windowed_divergence

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes from two different populations, generalising to any
    # number of alleles.

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # total number of haplotypes sampled from each population
    if an1 is None:
        an1 = np.sum(ac1, axis=1)
    else:
        an1 = asarray_ndim(an1, 1)
        check_dim0_aligned(ac1, an1)
    if an2 is None:
        an2 = np.sum(ac2, axis=1)
    else:
        an2 = asarray_ndim(an2, 1)
        check_dim0_aligned(ac2, an2)

    # total number of pairwise comparisons for each variant
    n_pairs = an1 * an2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac1 * ac2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac1 * ac2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd


[docs]def sequence_diversity(pos, ac, start=None, stop=None,
                       is_accessible=None):
    """Estimate nucleotide diversity within a given region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    pi : ndarray, float, shape (n_windows,)
        Nucleotide diversity.

    Examples
    --------

    >>> import allel
    >>> g = allel.model.GenotypeArray([[[0, 0], [0, 0]],
    ...                                [[0, 0], [0, 1]],
    ...                                [[0, 0], [1, 1]],
    ...                                [[0, 1], [1, 1]],
    ...                                [[1, 1], [1, 1]],
    ...                                [[0, 0], [1, 2]],
    ...                                [[0, 1], [1, 2]],
    ...                                [[0, 1], [-1, -1]],
    ...                                [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi = allel.stats.sequence_diversity(pos, ac, start=1, stop=31)
    >>> pi
    0.13978494623655915

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    if start is not None or stop is not None:
        loc = pos.locate_range(start, stop)
        pos = pos[loc]
        ac = ac[loc]
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise difference
    mpd = mean_pairwise_difference(ac, fill=0)

    # sum differences over variants
    mpd_sum = np.sum(mpd)

    # calculate value per base
    if is_accessible is None:
        n_bases = stop - start + 1
    else:
        n_bases = np.count_nonzero(is_accessible[start-1:stop])

    pi = mpd_sum / n_bases
    return pi


[docs]def sequence_divergence(pos, ac1, ac2, an1=None, an2=None, start=None,
                        stop=None, is_accessible=None):
    """Estimate nucleotide divergence between two populations within a
    given region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.model.HaplotypeArray([[0, 0, 0, 0],
        ...                                 [0, 0, 0, 1],
        ...                                 [0, 0, 1, 1],
        ...                                 [0, 1, 1, 1],
        ...                                 [1, 1, 1, 1],
        ...                                 [0, 0, 1, 2],
        ...                                 [0, 1, 1, 2],
        ...                                 [0, 1, -1, -1],
        ...                                 [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31)
        >>> dxy
        0.12096774193548387

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    if start is not None or stop is not None:
        loc = pos.locate_range(start, stop)
        pos = pos[loc]
        ac1 = ac1[loc]
        ac2 = ac2[loc]
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise difference between the two populations
    mpd = mean_pairwise_difference_between(ac1, ac2, an1=an1, an2=an2, fill=0)

    # sum differences over variants
    mpd_sum = np.sum(mpd)

    # calculate value per base
    if is_accessible is None:
        n_bases = stop - start + 1
    else:
        n_bases = np.count_nonzero(is_accessible[start-1:stop])

    dxy = mpd_sum / n_bases

    return dxy


[docs]def windowed_diversity(pos, ac, size, start=None, stop=None, step=None,
                       windows=None, is_accessible=None, fill=np.nan):
    """Estimate nucleotide diversity in windows over a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    pi : ndarray, float, shape (n_windows,)
        Nucleotide diversity in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.model.GenotypeArray([[[0, 0], [0, 0]],
    ...                                [[0, 0], [0, 1]],
    ...                                [[0, 0], [1, 1]],
    ...                                [[0, 1], [1, 1]],
    ...                                [[1, 1], [1, 1]],
    ...                                [[0, 0], [1, 2]],
    ...                                [[0, 1], [1, 2]],
    ...                                [[0, 1], [-1, -1]],
    ...                                [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi, windows, n_bases, counts = allel.stats.windowed_diversity(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> pi
    array([ 0.11666667,  0.21666667,  0.09090909])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise difference
    mpd = mean_pairwise_difference(ac, fill=0)

    # sum differences in windows
    mpd_sum, windows, counts = windowed_statistic(
        pos, values=mpd, statistic=np.sum, size=size, start=start, stop=stop,
        step=step, windows=windows, fill=0
    )

    # calculate value per base
    pi, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible,
                           fill=fill)

    return pi, windows, n_bases, counts


[docs]def windowed_divergence(pos, ac1, ac2, size, start=None, stop=None, step=None,
                        windows=None, is_accessible=None, fill=np.nan):
    """Estimate nucleotide divergence between two populations in windows
    over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.model.HaplotypeArray([[0, 0, 0, 0],
        ...                                 [0, 0, 0, 1],
        ...                                 [0, 0, 1, 1],
        ...                                 [0, 1, 1, 1],
        ...                                 [1, 1, 1, 1],
        ...                                 [0, 0, 1, 2],
        ...                                 [0, 1, 1, 2],
        ...                                 [0, 1, -1, -1],
        ...                                 [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy, windows, n_bases, counts = windowed_divergence(
        ...     pos, ac1, ac2, size=10, start=1, stop=31
        ... )
        >>> dxy
        array([ 0.15 ,  0.225,  0.   ])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 31]])
        >>> n_bases
        array([10, 10, 11])
        >>> counts
        array([3, 4, 2])

    """

    # check inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise divergence
    mpd = mean_pairwise_difference_between(ac1, ac2, fill=0)

    # sum in windows
    mpd_sum, windows, counts = windowed_statistic(
        pos, values=mpd, statistic=np.sum, size=size, start=start,
        stop=stop, step=step, windows=windows, fill=0
    )

    # calculate value per base
    dxy, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible,
                            fill=fill)

    return dxy, windows, n_bases, counts


[docs]def weir_cockerham_fst(g, subpops, max_allele=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------

    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------

    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Examples
    --------

    Calculate variance components from some genotype data::

        >>> import allel
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.stats.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[ 0.        ,  0.        ,  0.        ],
               [ 0.5       ,  0.5       ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.125     ,  0.25      ,  0.125     ],
               [ 0.16666667,  0.16666667,  0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.3680905886891398e-17

    Note that estimated Fst values may be negative.

    """

    # check inputs
    if not hasattr(g, 'shape') or not hasattr(g, 'ndim'):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    if hasattr(g, 'chunklen'):
        # use a chunk-wise implementation
        blen = g.chunklen
        n_variants = g.shape[0]
        shape = (n_variants, max_allele + 1)
        a = np.zeros(shape, dtype='f8')
        b = np.zeros(shape, dtype='f8')
        c = np.zeros(shape, dtype='f8')
        for i in range(0, n_variants, blen):
            gb = g[i:i+blen]
            ab, bb, cb = _weir_cockerham_fst(gb, subpops, max_allele)
            a[i:i+blen] = ab
            b[i:i+blen] = bb
            c[i:i+blen] = cb

    else:
        a, b, c = _weir_cockerham_fst(g, subpops, max_allele)

    return a, b, c


# noinspection PyPep8Naming
def _weir_cockerham_fst(g, subpops, max_allele):

    # check inputs
    g = GenotypeArray(g, copy=False)
    n_variants, n_samples, ploidy = g.shape
    n_alleles = max_allele + 1

    # number of populations sampled
    r = len(subpops)
    n_populations = r
    debug('r: %r', r)

    # count alleles within each subpopulation
    ac = [g.count_alleles(subpop=s, max_allele=max_allele) for s in subpops]

    # stack allele counts from each sub-population into a single array
    ac = np.dstack(ac)
    assert ac.shape == (n_variants, n_alleles, n_populations)
    debug('ac: %s, %r', ac.shape, ac)

    # count number of alleles called within each population by summing
    # allele counts along the alleles dimension
    an = np.sum(ac, axis=1)
    assert an.shape == (n_variants, n_populations)
    debug('an: %s, %r', an.shape, an)

    # compute number of individuals sampled from each population
    n = an // 2
    assert n.shape == (n_variants, n_populations)
    debug('n: %s, %r', n.shape, n)

    # compute the total number of individuals sampled across all populations
    n_total = np.sum(n, axis=1)
    assert n_total.shape == (n_variants,)
    debug('n_total: %s, %r', n_total.shape, n_total)

    # compute the average sample size across populations
    n_bar = np.mean(n, axis=1)
    assert n_bar.shape == (n_variants,)
    debug('n_bar: %s, %r', n_bar.shape, n_bar)

    # compute the term n sub C incorporating the coefficient of variation in
    # sample sizes
    n_C = (n_total - (np.sum(n**2, axis=1) / n_total)) / (r - 1)
    assert n_C.shape == (n_variants,)
    debug('n_C: %s, %r', n_C.shape, n_C)

    # compute allele frequencies within each population
    p = ac / an[:, np.newaxis, :]
    assert p.shape == (n_variants, n_alleles, n_populations)
    debug('p: %s, %r', p.shape, p)

    # compute the average sample frequency of each allele
    ac_total = np.sum(ac, axis=2)
    an_total = np.sum(an, axis=1)
    p_bar = ac_total / an_total[:, np.newaxis]
    assert p_bar.shape == (n_variants, n_alleles)
    debug('p_bar: %s, %r', p_bar.shape, p_bar)

    # add in some extra dimensions to enable broadcasting
    n_bar = n_bar[:, np.newaxis]
    n_C = n_C[:, np.newaxis]

    # compute the sample variance of allele frequencies over populations
    s_squared = (
        np.sum(n[:, np.newaxis, :] * ((p - p_bar[:, :, np.newaxis]) ** 2),
               axis=2) /
        (n_bar * (r - 1))
    )
    assert s_squared.shape == (n_variants, n_alleles)
    debug('s_squared: %s, %r', s_squared.shape, s_squared)

    # compute the average heterozygosity over all populations
    h_bar = [g.count_het(allele=allele, axis=1) / n_total
             for allele in range(n_alleles)]
    h_bar = np.column_stack(h_bar)
    assert h_bar.shape == (n_variants, n_alleles)
    debug('h_bar: %s, %r', h_bar.shape, h_bar)

    # now comes the tricky bit...

    # component of variance between populations
    a = ((n_bar / n_C) *
         (s_squared -
          ((1 / (n_bar - 1)) *
           ((p_bar * (1 - p_bar)) -
            ((r - 1) * s_squared / r) -
            (h_bar / 4)))))
    assert a.shape == (n_variants, n_alleles)

    # component of variance between individuals within populations
    b = ((n_bar / (n_bar - 1)) *
         ((p_bar * (1 - p_bar)) -
          ((r - 1) * s_squared / r) -
           (((2 * n_bar) - 1) * h_bar / (4 * n_bar))))
    assert b.shape == (n_variants, n_alleles)

    # component of variance between gametes within individuals
    c = h_bar / 2
    assert c.shape == (n_variants, n_alleles)

    return a, b, c


[docs]def hudson_fst(ac1, ac2, fill=np.nan):
    """Calculate the numerator and denominator for Fst estimation using the
    method of Hudson (1992) elaborated by Bhatia et al. (2013).

    Parameters
    ----------

    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    num : ndarray, float, shape (n_variants,)
        Heterozygosity between the two populations minus average
        of heterozygosity within each population.
    den : ndarray, float, shape (n_variants,)
        Heterozygosity between the two populations.

    Examples
    --------

    Calculate numerator and denominator for Fst estimation::

        >>> import allel
        >>> g = allel.model.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...                                [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...                                [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...                                [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...                                [[0, 0], [1, 1], [0, 1], [-1, -1]]])
        >>> subpops = [[0, 1], [2, 3]]
        >>> ac1 = g.count_alleles(subpop=subpops[0])
        >>> ac2 = g.count_alleles(subpop=subpops[1])
        >>> num, den = allel.stats.hudson_fst(ac1, ac2)
        >>> num
        array([ 1.        , -0.16666667,  0.        , -0.125     , -0.33333333])
        >>> den
        array([ 1.   ,  0.5  ,  0.   ,  0.625,  0.5  ])

    Estimate Fst for each variant individually::

        >>> fst = num / den
        >>> fst
        array([ 1.        , -0.33333333,         nan, -0.2       , -0.66666667])

    Estimate Fst averaging over variants::

        >>> fst = np.sum(num) / np.sum(den)
        >>> fst
        0.1428571428571429

    """  # flake8: noqa

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # calculate these once only
    an1 = np.sum(ac1, axis=1)
    an2 = np.sum(ac2, axis=1)

    # calculate average diversity (a.k.a. heterozygosity) within each
    # population
    within = (mean_pairwise_difference(ac1, an1, fill=fill) +
              mean_pairwise_difference(ac2, an2, fill=fill)) / 2

    # calculate divergence (a.k.a. heterozygosity) between each population
    between = mean_pairwise_difference_between(ac1, ac2, an1, an2, fill=fill)

    # define numerator and denominator for Fst calculations
    num = between - within
    den = between

    return num, den


[docs]def windowed_weir_cockerham_fst(pos, g, subpops, size, start=None, stop=None,
                                step=None, windows=None, fill=np.nan,
                                max_allele=None):
    """Estimate average Fst in windows over a single chromosome/contig,
    following the method of Weir and Cockerham (1984).

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where there are no variants within a window.
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------

    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    """

    # check inputs
    if not hasattr(g, 'shape') or not hasattr(g, 'ndim'):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    # define the statistic to compute within each window
    def average_fst(wg):
        a, b, c = _weir_cockerham_fst(wg, subpops=subpops,
                                      max_allele=max_allele)
        return np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))

    # calculate average Fst in windows
    fst, windows, counts = windowed_statistic(pos, values=g,
                                              statistic=average_fst,
                                              size=size, start=start,
                                              stop=stop, step=step,
                                              windows=windows, fill=fill)

    return fst, windows, counts


[docs]def windowed_hudson_fst(pos, ac1, ac2, size, start=None, stop=None,
                        step=None, windows=None, fill=np.nan):
    """Estimate average Fst in windows over a single chromosome/contig,
    following the method of Hudson (1992) elaborated by Bhatia et al. (2013).

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where there are no variants within a window.

    Returns
    -------

    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    """

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # define the statistic to compute within each window
    def average_fst(wac1, wac2):
        num, den = hudson_fst(wac1, wac2, fill=fill)
        return np.sum(num) / np.sum(den)

    # calculate average Fst in windows
    fst, windows, counts = windowed_statistic(pos, values=(ac1, ac2),
                                              statistic=average_fst,
                                              size=size, start=start,
                                              stop=stop, step=step,
                                              windows=windows, fill=fill)

    return fst, windows, counts


[docs]def windowed_df(pos, ac1, ac2, size, start=None, stop=None,
                               step=None, windows=None, is_accessible=None,
                               fill=np.nan):
    """Calculate the density of fixed differences between two populations in
    windows over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    df : ndarray, float, shape (n_windows,)
        Per-base density of fixed differences in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    See Also
    --------

    allel.model.locate_fixed_differences

    """

    # check inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # locate fixed differences
    loc_df = locate_fixed_differences(ac1, ac2)

    # count number of fixed differences in windows
    n_df, windows, counts = windowed_statistic(
        pos, values=loc_df, statistic=np.count_nonzero, size=size, start=start,
        stop=stop, step=step, windows=windows, fill=0
    )

    # calculate value per base
    df, n_bases = per_base(n_df, windows, is_accessible=is_accessible,
                           fill=fill)

    return df, windows, n_bases, counts