Source code for allel.util

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import numpy as np
from allel.compat import range


import allel.model


[docs]def windowed_count(pos, b, window, start=None, stop=None):
    """Count nonzero (i.e., True) elements in non-overlapping windows over a
    single chromosome or contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_variants,)
        Positions array.
    b : array_like, bool, shape (n_variants,) or (n_variants, n_samples)
        Boolean array.
    window : int
        Window size.
    start : int, optional
        Start position.
    stop : int, optional
        Stop position.

    Returns
    -------

    counts : ndarray, int, shape (n_bins,) or (n_bins, n_samples)
        Counts array.
    edges : ndarray, int, shape (n_bins + 1,)
        Bin edges used for counting.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 1], [0, 1]],
    ...                          [[1, 1], [1, 2]],
    ...                          [[2, 2], [-1, -1]]], dtype='i1')
    >>> pos = allel.PositionIndex([2, 14, 15, 27])
    >>> b = g.is_variant()
    >>> counts, edges = allel.windowed_count(pos, b, window=10)
    >>> edges
    array([ 2, 12, 22, 32])
    >>> counts
    array([0, 2, 1])
    >>> counts, edges = allel.windowed_count(pos, b, window=10, start=1,
    ...                                      stop=27)
    >>> edges
    array([ 1, 11, 21, 27])
    >>> counts
    array([0, 2, 1])
    >>> b = g.is_het()
    >>> counts, edges = allel.windowed_count(pos, b, window=10)
    >>> edges
    array([ 2, 12, 22, 32])
    >>> counts
    array([[0, 0],
           [1, 2],
           [0, 0]])

    """

    # check arguments
    pos = allel.model.PositionIndex(pos)
    b = np.asarray(b)
    if pos.shape[0] != b.shape[0]:
        raise ValueError(
            'arrays do not have matching length for first '
            'dimension: pos %s, b %s'
            % (pos.shape[0], b.shape[0])
        )

    # determine bin edges
    bin_start = pos.min() if start is None else start
    bin_stop = pos.max() if stop is None else stop
    edges = np.arange(bin_start, bin_stop, window)
    if stop is None and edges[-1] < bin_stop:
        # add one more window to ensure stop is included
        edges = np.append(edges, edges[-1] + window)
    elif stop is not None and edges[-1] < bin_stop:
        # add one more window to ensure explicit stop is final edge
        edges = np.append(edges, stop)

    if b.ndim == 1:
        pos_incl = np.compress(b, pos)
        counts, _ = np.histogram(pos_incl, bins=edges)

    else:
        # assume 2D
        n_bins = len(edges) - 1
        n_samples = b.shape[1]
        counts = np.empty((n_bins, n_samples), dtype=int)
        for i in range(n_samples):
            pos_incl = np.compress(b[:, i], pos)
            h, _ = np.histogram(pos_incl, bins=edges)
            counts[:, i] = h

    return counts, edges


[docs]def windowed_density(pos, b, window, start=None, stop=None,
                     is_accessible=None, fill=0):
    """Calculate the per-base-pair density of nonzero (i.e., True) elements in
    non-overlapping windows over a single chromosome or contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_variants,)
        Positions array.
    b : array_like, bool, shape (n_variants,) or (n_variants, n_samples)
        Boolean array.
    window : int
        Window size.
    start : int, optional
        Start position.
    stop : int, optional
        Stop position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Accessibility mask. If provided, the size of each window will be
        calculated as the number of accessible positions, rather than the
        absolute bin width.

    Returns
    -------

    densities : ndarray, int, shape (n_bins,) or (n_bins, n_samples)
        Densities array.
    counts : ndarray, int, shape (n_bins,) or (n_bins, n_samples)
        Counts array.
    widths : ndarray, int, shape (n_bins,)
        Size of each bin, taking accessibility into account.
    edges : ndarray, int, shape (n_bins + 1,)
        Bin edges used for counting.

    Examples
    --------

    Assuming all positions are accessible::

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
        ...                          [[0, 1], [0, 1]],
        ...                          [[1, 1], [1, 2]],
        ...                          [[2, 2], [-1, -1]]], dtype='i1')
        >>> pos = allel.PositionIndex([2, 14, 15, 27])
        >>> b = g.is_variant()
        >>> densities, counts, widths, edges = allel.windowed_density(
        ...     pos, b, window=10
        ... )
        >>> edges
        array([ 2, 12, 22, 32])
        >>> widths
        array([10, 10, 11])
        >>> counts
        array([0, 2, 1])
        >>> densities
        array([ 0.        ,  0.2       ,  0.09090909])

    Density calculations can take into account the number of accessible
    positions within each window, e.g.::

        >>> is_accessible = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ...                           0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
        ...                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        ...                           dtype=bool)
        >>> densities, counts, widths, edges = allel.windowed_density(
        ...     pos, b, start=1, stop=31, window=10,
        ...     is_accessible=is_accessible, fill=np.nan
        ... )
        >>> edges
        array([ 1, 11, 21, 31])
        >>> widths
        array([ 0,  5, 11])
        >>> counts
        array([0, 2, 1])
        >>> densities
        array([        nan,  0.4     ,  0.09090909])

    """

    counts, edges = windowed_count(pos, b, window, start=start, stop=stop)

    # determine window sizes (i.e., bin widths)
    if is_accessible is None:
        # assume all genome positions are accessible
        widths = np.diff(edges)
        # final bin includes right edge
        widths[-1] += 1
    else:
        is_accessible = np.asarray(is_accessible)
        if is_accessible.ndim != 1:
            raise ValueError('expected 1 dimension, found %s' %
                             is_accessible.ndim)
        pos_accessible, = np.nonzero(is_accessible)
        # convert to 1-based coordinates
        pos_accessible += 1
        widths, _ = np.histogram(pos_accessible, bins=edges)

    if counts.ndim > 1:
        widths = widths[:, None]

    # calculate densities
    err = np.seterr(invalid='ignore')
    densities = np.where(widths > 0, counts / widths, fill)
    np.seterr(**err)

    return densities, counts, widths, edges