Source code for allel.stats.admixture

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


from allel.model.ndarray import AlleleCountsArray
from allel.util import asarray_ndim, check_dim0_aligned
from allel.stats.window import moving_statistic
from allel.stats.misc import jackknife


import numpy as np


def h_hat(ac):
    """Unbiased estimator for h, where 2*h is the heterozygosity
    of the population.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, 2)
        Allele counts array for a single population.

    Returns
    -------
    h_hat : ndarray, float, shape (n_variants,)

    Notes
    -----
    Used in Patterson (2012) for calculation of various statistics.

    """

    # check inputs
    ac = asarray_ndim(ac, 2)
    assert ac.shape[1] == 2, 'only biallelic variants supported'

    # compute allele number
    an = ac.sum(axis=1)

    # compute estimator
    x = (ac[:, 0] * ac[:, 1]) / (an * (an - 1))

    return x


[docs]def patterson_f2(aca, acb):
    """Unbiased estimator for F2(A, B), the branch length between populations
    A and B.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.

    Returns
    -------
    f2 : ndarray, float, shape (n_variants,)

    Notes
    -----
    See Patterson (2012), Appendix A.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb)

    # compute allele numbers
    sa = aca.sum(axis=1)
    sb = acb.sum(axis=1)

    # compute heterozygosities
    ha = h_hat(aca)
    hb = h_hat(acb)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]

    # compute estimator
    x = ((a - b) ** 2) - (ha / sa) - (hb / sb)

    return x


# noinspection PyPep8Naming
[docs]def patterson_f3(acc, aca, acb):
    """Unbiased estimator for F3(C; A, B), the three-population test for
    admixture in population C.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).

    Returns
    -------
    T : ndarray, float, shape (n_variants,)
        Un-normalized f3 estimates per variant.
    B : ndarray, float, shape (n_variants,)
        Estimates for heterozygosity in population C.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    For un-normalized f3 statistics, ignore the `B` return value.

    To compute the f3* statistic, which is normalized by heterozygosity in
    population C to remove numerical dependence on the allele frequency
    spectrum, compute ``np.sum(T) / np.sum(B)``.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    acc = AlleleCountsArray(acc, copy=False)
    assert acc.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb, acc)

    # compute allele number and heterozygosity in test population
    sc = acc.sum(axis=1)
    hc = h_hat(acc)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]
    c = acc.to_frequencies()[:, 1]

    # compute estimator
    T = ((c - a) * (c - b)) - (hc / sc)
    B = 2 * hc

    return T, B


[docs]def patterson_d(aca, acb, acc, acd):
    """Unbiased estimator for D(A, B; C, D), the normalised four-population
    test for admixture between (A or B) and (C or D), also known as the
    "ABBA BABA" test.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Numerator (un-normalised f4 estimates).
    den : ndarray, float, shape (n_variants,)
        Denominator.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    For un-normalized f4 statistics, ignore the `den` return value.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    acc = AlleleCountsArray(acc, copy=False)
    assert acc.shape[1] == 2, 'only biallelic variants supported'
    acd = AlleleCountsArray(acd, copy=False)
    assert acd.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb, acc, acd)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]
    c = acc.to_frequencies()[:, 1]
    d = acd.to_frequencies()[:, 1]

    # compute estimator
    num = (a - b) * (c - d)
    den = (a + b - (2 * a * b)) * (c + d - (2 * c * d))

    return num, den


# noinspection PyPep8Naming
[docs]def moving_patterson_f3(acc, aca, acb, size, start=0, stop=None, step=None,
                        normed=True):
    """Estimate F3(C; A, B) in moving windows.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    normed : bool, optional
        If False, use un-normalised f3 values.

    Returns
    -------
    f3 : ndarray, float, shape (n_windows,)
        Estimated value of the statistic in each window.

    """

    # calculate per-variant values
    T, B = patterson_f3(acc, aca, acb)

    # calculate value of statistic within each block
    if normed:
        T_bsum = moving_statistic(T, statistic=np.nansum, size=size,
                                  start=start, stop=stop, step=step)
        B_bsum = moving_statistic(B, statistic=np.nansum, size=size,
                                  start=start, stop=stop, step=step)
        f3 = T_bsum / B_bsum

    else:
        f3 = moving_statistic(T, statistic=np.nanmean, size=size,
                              start=start, stop=stop, step=step)

    return f3


[docs]def moving_patterson_d(aca, acb, acc, acd, size, start=0, stop=None,
                       step=None):
    """Estimate D(A, B; C, D) in moving windows.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    d : ndarray, float, shape (n_windows,)
        Estimated value of the statistic in each window.

    """

    # calculate per-variant values
    num, den = patterson_d(aca, acb, acc, acd)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # compute the numerator and denominator within each window
    num_sum = moving_statistic(num, statistic=np.nansum, size=size,
                               start=start, stop=stop, step=step)
    den_sum = moving_statistic(den, statistic=np.nansum, size=size,
                               start=start, stop=stop, step=step)

    # calculate the statistic values in each block
    d = num_sum / den_sum

    return d


# noinspection PyPep8Naming
[docs]def average_patterson_f3(acc, aca, acb, blen, normed=True):
    """Estimate F3(C; A, B) and standard error using the block-jackknife.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).
    blen : int
        Block size (number of variants).
    normed : bool, optional
        If False, use un-normalised f3 values.

    Returns
    -------
    f3 : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    allel.stats.admixture.patterson_f3

    """

    # calculate per-variant values
    T, B = patterson_f3(acc, aca, acb)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # calculate overall value of statistic
    if normed:
        f3 = np.nansum(T) / np.nansum(B)
    else:
        f3 = np.nanmean(T)

    # calculate value of statistic within each block
    if normed:
        T_bsum = moving_statistic(T, statistic=np.nansum, size=blen)
        B_bsum = moving_statistic(B, statistic=np.nansum, size=blen)
        vb = T_bsum / B_bsum
        _, se, vj = jackknife((T_bsum, B_bsum),
                              statistic=lambda t, b: np.sum(t) / np.sum(b))

    else:
        vb = moving_statistic(T, statistic=np.nanmean, size=blen)
        _, se, vj = jackknife(vb, statistic=np.mean)

    # compute Z score
    z = f3 / se

    return f3, se, z, vb, vj


[docs]def average_patterson_d(aca, acb, acc, acd, blen):
    """Estimate D(A, B; C, D) and standard error using the block-jackknife.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.
    blen : int
        Block size (number of variants).

    Returns
    -------
    d : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    allel.stats.admixture.patterson_d

    """

    # calculate per-variant values
    num, den = patterson_d(aca, acb, acc, acd)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # calculate overall estimate
    d_avg = np.nansum(num) / np.nansum(den)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(num, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(den, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    # compute Z score
    z = d_avg / se

    return d_avg, se, z, vb, vj


# backwards compatibility
blockwise_patterson_f3 = average_patterson_f3
blockwise_patterson_d = average_patterson_d