Source code for allel.model

# -*- coding: utf-8 -*-
"""
This module defines array classes for variant call data.

"""
from __future__ import absolute_import, print_function, division


import logging


import numpy as np
import numexpr as ne


from allel.util import ignore_invalid


logger = logging.getLogger(__name__)
debug = logger.debug


from allel.constants import DIM_SAMPLES, DIM_PLOIDY, DIPLOID


[docs]class GenotypeArray(np.ndarray):
    """Array of discrete genotype calls.

    Parameters
    ----------

    data : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype data.
    **kwargs : keyword arguments
        All keyword arguments are passed through to :func:`numpy.array`.

    Notes
    -----

    This class represents data on discrete genotype calls as a
    3-dimensional numpy array of integers. By convention the first
    dimension corresponds to the variants genotyped, the second
    dimension corresponds to the samples genotyped, and the third
    dimension corresponds to the ploidy of the samples.

    Each integer within the array corresponds to an **allele index**,
    where 0 is the reference allele, 1 is the first alternate allele,
    2 is the second alternate allele, ... and -1 (or any other
    negative integer) is a missing allele call. A single byte integer
    dtype (int8) can represent up to 127 distinct alleles, which is
    usually sufficient.  The actual alleles (i.e., the alternate
    nucleotide sequences) and the physical positions of the variants
    within the genome of an organism are stored in separate arrays,
    discussed elsewhere.

    In many cases the number of distinct alleles for each variant is
    small, e.g., less than 10, or even 2 (all variants are
    biallelic). In these cases a genotype array is not the most
    compact way of storing genotype data in memory. This class defines
    functions for bit-packing diploid genotype calls into single
    bytes, and for transforming genotype arrays into sparse matrices,
    which can assist in cases where memory usage needs to be
    minimised. Note however that these more compact representations do
    not allow the same flexibility in terms of using numpy universal
    functions to access and manipulate data.

    Arrays of this class can store either **phased or unphased**
    genotype calls. If the genotypes are phased (i.e., haplotypes have
    been resolved) then individual haplotypes can be extracted by
    converting to a :class:`HaplotypeArray` then indexing the second
    dimension. If the genotype calls are unphased then the ordering of
    alleles along the third (ploidy) dimension is arbitrary. N.B.,
    this means that an unphased diploid heterozygous call could be
    stored as (0, 1) or equivalently as (1, 0).

    A genotype array can store genotype calls with any ploidy > 1. For
    haploid calls, use a :class:`HaplotypeArray`. Note that genotype
    arrays are not capable of storing calls for samples with differing
    or variable ploidy.

    Examples
    --------

    Instantiate a genotype array::

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[0, 2], [-1, -1]]], dtype='i1')
        >>> g.dtype
        dtype('int8')
        >>> g.ndim
        3
        >>> g.shape
        (3, 2, 2)
        >>> g.n_variants
        3
        >>> g.n_samples
        2
        >>> g.ploidy
        2

    Genotype calls for a single variant at all samples can be obtained
    by indexing the first dimension, e.g.::

        >>> g[1]
        array([[0, 1],
               [1, 1]], dtype=int8)

    Genotype calls for a single sample at all variants can be obtained
    by indexing the second dimension, e.g.::

        >>> g[:, 1]
        array([[ 0,  1],
               [ 1,  1],
               [-1, -1]], dtype=int8)

    A genotype call for a single sample at a single variant can be
    obtained by indexing the first and second dimensions, e.g.::

        >>> g[1, 0]
        array([0, 1], dtype=int8)

    A genotype array can store polyploid calls, e.g.::

        >>> g = allel.GenotypeArray([[[0, 0, 0], [0, 0, 1]],
        ...                          [[0, 1, 1], [1, 1, 1]],
        ...                          [[0, 1, 2], [-1, -1, -1]]], dtype='i1')
        >>> g.ploidy
        3

    """

    @staticmethod
    def _check_input_data(obj):

        # check dtype
        if obj.dtype.kind not in 'ui':
            raise TypeError('integer dtype required')

        # check dimensionality
        if obj.ndim != 3:
            raise TypeError('array with 3 dimensions required')

        # check length of ploidy dimension
        if obj.shape[DIM_PLOIDY] == 1:
            raise ValueError('use HaplotypeArray for haploid calls')

    def __new__(cls, data, **kwargs):
        """Constructor."""
        obj = np.array(data, **kwargs)
        cls._check_input_data(obj)
        obj = obj.view(cls)
        return obj

    def __array_finalize__(self, obj):

        # called after constructor
        if obj is None:
            return

        # called after slice (new-from-template)
        if isinstance(obj, GenotypeArray):
            return

        # called after view
        GenotypeArray._check_input_data(obj)

    # noinspection PyUnusedLocal
    def __array_wrap__(self, out_arr, context=None):
        # don't wrap results of any ufuncs
        return np.asarray(out_arr)

    def __getslice__(self, *args, **kwargs):
        s = np.ndarray.__getslice__(self, *args, **kwargs)
        if hasattr(s, 'ndim'):
            if s.ndim == 3:
                return s
            elif s.ndim > 0:
                return np.asarray(s)
        return s

    def __getitem__(self, *args, **kwargs):
        s = np.ndarray.__getitem__(self, *args, **kwargs)
        if hasattr(s, 'ndim'):
            if s.ndim == 3:
                return s
            elif s.ndim > 0:
                return np.asarray(s)
        return s

    @property
[docs]    def n_variants(self):
        """Number of variants (length of first array dimension)."""
        return self.shape[0]

    @property
[docs]    def n_samples(self):
        """Number of samples (length of second array dimension)."""
        return self.shape[1]

    @property
[docs]    def ploidy(self):
        """Sample ploidy (length of third array dimension)."""
        return self.shape[2]

    def __repr__(self):
        s = super(GenotypeArray, self).__repr__()
        return s[:-1] + ', n_variants=%s, n_samples=%s, ploidy=%s)' % \
                        (self.n_variants, self.n_samples, self.ploidy)

    # noinspection PyUnusedLocal
[docs]    def is_called(self):
        """Find non-missing genotype calls.

        Returns
        -------

        out : ndarray, bool, shape (n_variants, n_samples)
            Array where elements are True if the genotype call matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[0, 2], [-1, -1]]])
        >>> g.is_called()
        array([[ True,  True],
               [ True,  True],
               [ True, False]], dtype=bool)

        """

        # special case diploid
        if self.ploidy == DIPLOID:
            allele1 = self[..., 0]  # noqa
            allele2 = self[..., 1]  # noqa
            ex = '(allele1 >= 0) & (allele2 >= 0)'
            out = ne.evaluate(ex)

        # general ploidy case
        else:
            out = np.all(self >= 0, axis=DIM_PLOIDY)

        return out

    # noinspection PyUnusedLocal
[docs]    def is_missing(self):
        """Find missing genotype calls.

        Returns
        -------

        out : ndarray, bool, shape (n_variants, n_samples)
            Array where elements are True if the genotype call matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[0, 2], [-1, -1]]])
        >>> g.is_missing()
        array([[False, False],
               [False, False],
               [False,  True]], dtype=bool)

        """

        # special case diploid
        if self.ploidy == DIPLOID:
            allele1 = self[..., 0]  # noqa
            allele2 = self[..., 1]  # noqa
            # call is missing if either allele is missing
            ex = '(allele1 < 0) | (allele2 < 0)'
            out = ne.evaluate(ex)

        # general ploidy case
        else:
            # call is missing if any allele is missing
            out = np.any(self < 0, axis=DIM_PLOIDY)

        return out

    # noinspection PyUnusedLocal
[docs]    def is_hom(self, allele=None):
        """Find genotype calls that are homozygous.

        Parameters
        ----------

        allele : int, optional
            Allele index.

        Returns
        -------

        out : ndarray, bool, shape (n_variants, n_samples)
            Array where elements are True if the genotype call matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.is_hom()
        array([[ True, False],
               [False,  True],
               [ True, False]], dtype=bool)
        >>> g.is_hom(allele=1)
        array([[False, False],
               [False,  True],
               [False, False]], dtype=bool)

        """

        # special case diploid
        if self.ploidy == DIPLOID:
            allele1 = self[..., 0]  # noqa
            allele2 = self[..., 1]  # noqa
            if allele is None:
                ex = '(allele1 >= 0) & (allele1  == allele2)'
            else:
                ex = '(allele1 == {0}) & (allele2 == {0})'.format(allele)
            out = ne.evaluate(ex)

        # general ploidy case
        else:
            if allele is None:
                allele1 = self[..., 0, None]  # noqa
                other_alleles = self[..., 1:]  # noqa
                ex = '(allele1 >= 0) & (allele1 == other_alleles)'
                out = np.all(ne.evaluate(ex), axis=DIM_PLOIDY)
            else:
                out = np.all(self == allele, axis=DIM_PLOIDY)

        return out

[docs]    def is_hom_ref(self):
        """Find genotype calls that are homozygous for the reference allele.

        Returns
        -------

        out : ndarray, bool, shape (n_variants, n_samples)
            Array where elements are True if the genotype call matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[0, 2], [-1, -1]]])
        >>> g.is_hom_ref()
        array([[ True, False],
               [False, False],
               [False, False]], dtype=bool)

        """

        return self.is_hom(allele=0)

    # noinspection PyUnusedLocal
[docs]    def is_hom_alt(self):
        """Find genotype calls that are homozygous for any alternate (i.e.,
        non-reference) allele.

        Returns
        -------

        out : ndarray, bool, shape (n_variants, n_samples)
            Array where elements are True if the genotype call matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.is_hom_alt()
        array([[False, False],
               [False,  True],
               [ True, False]], dtype=bool)

        """

        # special case diploid
        if self.ploidy == DIPLOID:
            allele1 = self[..., 0]  # noqa
            allele2 = self[..., 1]  # noqa
            ex = '(allele1 > 0) & (allele1  == allele2)'
            out = ne.evaluate(ex)

        # general ploidy case
        else:
            allele1 = self[..., 0, None]  # noqa
            other_alleles = self[..., 1:]  # noqa
            ex = '(allele1 > 0) & (allele1 == other_alleles)'
            out = np.all(ne.evaluate(ex), axis=DIM_PLOIDY)

        return out

    # noinspection PyUnusedLocal
[docs]    def is_het(self):
        """Find genotype calls that are heterozygous.

        Returns
        -------

        out : ndarray, bool, shape (n_variants, n_samples)
            Array where elements are True if the genotype call matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[0, 2], [-1, -1]]])
        >>> g.is_het()
        array([[False,  True],
               [ True, False],
               [ True, False]], dtype=bool)

        """

        # special case diploid
        if self.ploidy == DIPLOID:
            allele1 = self[..., 0]  # noqa
            allele2 = self[..., 1]  # noqa
            ex = '(allele1 >= 0) & (allele2  >= 0) & (allele1 != allele2)'
            out = ne.evaluate(ex)

        # general ploidy case
        else:
            allele1 = self[..., 0, None]  # noqa
            other_alleles = self[..., 1:]  # noqa
            out = np.all(self >= 0, axis=DIM_PLOIDY) \
                & np.any(allele1 != other_alleles, axis=DIM_PLOIDY)

        return out

    # noinspection PyUnusedLocal
[docs]    def is_call(self, call):
        """Find genotypes with a given call.

        Parameters
        ----------

        call : array_like, int, shape (ploidy,)
            The genotype call to find.

        Returns
        -------

        out : ndarray, bool, shape (n_variants, n_samples)
            Array where elements are True if the genotype is `call`.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[0, 2], [-1, -1]]])
        >>> g.is_call((0, 2))
        array([[False, False],
               [False, False],
               [ True, False]], dtype=bool)

        """

        # special case diploid
        if self.ploidy == DIPLOID:
            if not len(call) == DIPLOID:
                raise ValueError('invalid call: %r', call)
            allele1 = self[..., 0]  # noqa
            allele2 = self[..., 1]  # noqa
            ex = '(allele1 == {0}) & (allele2  == {1})'.format(*call)
            out = ne.evaluate(ex)

        # general ploidy case
        else:
            if not len(call) == self.ploidy:
                raise ValueError('invalid call: %r', call)
            call = np.asarray(call)[None, None, :]
            out = np.all(self == call, axis=DIM_PLOIDY)

        return out

[docs]    def count_called(self, axis=None):
        b = self.is_called()
        return np.sum(b, axis=axis)

[docs]    def count_missing(self, axis=None):
        b = self.is_missing()
        return np.sum(b, axis=axis)

[docs]    def count_hom(self, allele=None, axis=None):
        b = self.is_hom(allele=allele)
        return np.sum(b, axis=axis)

[docs]    def count_hom_ref(self, axis=None):
        b = self.is_hom_ref()
        return np.sum(b, axis=axis)

[docs]    def count_hom_alt(self, axis=None):
        b = self.is_hom_alt()
        return np.sum(b, axis=axis)

[docs]    def count_het(self, axis=None):
        b = self.is_het()
        return np.sum(b, axis=axis)

[docs]    def count_call(self, call, axis=None):
        b = self.is_call(call=call)
        return np.sum(b, axis=axis)

[docs]    def view_haplotypes(self):
        """Reshape a genotype array to view it as haplotypes by
        dropping the ploidy dimension.

        Returns
        -------

        h : HaplotypeArray, shape (n_variants, n_samples * ploidy)
            Haplotype array (sharing same underlying buffer).

        Notes
        -----

        If genotype calls are unphased, the haplotypes returned by
        this function will bear no resemblance to the true haplotypes.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[0, 2], [-1, -1]]])
        >>> g.view_haplotypes()
        HaplotypeArray([[ 0,  0,  0,  1],
               [ 0,  1,  1,  1],
               [ 0,  2, -1, -1]], n_variants=3, n_haplotypes=4)

        """

        # reshape, preserving size of variants dimension
        newshape = (self.n_variants, -1)
        data = np.reshape(self, newshape)
        h = HaplotypeArray(data, copy=False)
        return h

[docs]    def to_n_alt(self, fill=0):
        """Transform each genotype call into the number of
        non-reference alleles.

        Parameters
        ----------

        fill : int, optional
            Use this value to represent missing calls.

        Returns
        -------

        out : ndarray, int, shape (n_variants, n_samples)
            Array of non-ref alleles per genotype call.

        Notes
        -----

        This function simply counts the number of non-reference
        alleles, it makes no distinction between different alternate
        alleles.

        By default this function returns 0 for missing genotype calls
        **and** for homozygous reference genotype calls. Use the
        `fill` argument to change how missing calls are represented.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.to_n_alt()
        array([[0, 1],
               [1, 2],
               [2, 0]], dtype=int8)
        >>> g.to_n_alt(fill=-1)
        array([[ 0,  1],
               [ 1,  2],
               [ 2, -1]], dtype=int8)

        """

        # count number of alternate alleles
        out = np.empty((self.n_variants, self.n_samples), dtype='i1')
        np.sum(self > 0, axis=DIM_PLOIDY, out=out)

        # fill missing calls
        if fill != 0:
            m = self.is_missing()
            out[m] = fill

        return out

[docs]    def to_allele_counts(self, alleles=None):
        """Transform genotype calls into allele counts per call.

        Parameters
        ----------

        alleles : sequence of ints, optional
            If not None, count only the given alleles. (By default, count all
            alleles.)

        Returns
        -------

        out : ndarray, uint8, shape (n_variants, n_samples, len(alleles))
            Array of allele counts per call.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                         [[0, 2], [1, 1]],
        ...                         [[2, 2], [-1, -1]]])
        >>> g.to_allele_counts()
        array([[[2, 0, 0],
                [1, 1, 0]],
               [[1, 0, 1],
                [0, 2, 0]],
               [[0, 0, 2],
                [0, 0, 0]]], dtype=uint8)
        >>> g.to_allele_counts(alleles=(0, 1))
        array([[[2, 0],
                [1, 1]],
               [[1, 0],
                [0, 2]],
               [[0, 0],
                [0, 0]]], dtype=uint8)

        """

        # determine alleles to count
        if alleles is None:
            m = self.max()
            alleles = list(range(m+1))

        # set up output array
        outshape = (self.n_variants, self.n_samples, len(alleles))
        out = np.zeros(outshape, dtype='u1')

        for i, allele in enumerate(alleles):
            # count alleles along ploidy dimension
            np.sum(self == allele, axis=DIM_PLOIDY, out=out[..., i])

        return out

[docs]    def to_packed(self, boundscheck=True):
        """Pack diploid genotypes into a single byte for each genotype,
        using the left-most 4 bits for the first allele and the right-most 4
        bits for the second allele. Allows single byte encoding of diploid
        genotypes for variants with up to 15 alleles.

        Parameters
        ----------

        boundscheck : bool, optional
            If False, do not check that minimum and maximum alleles are
            compatible with bit-packing.

        Returns
        -------

        packed : ndarray, uint8, shape (n_variants, n_samples)
            Bit-packed genotype array.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]], dtype='i1')
        >>> g.to_packed()
        array([[  0,   1],
               [  2,  17],
               [ 34, 239]], dtype=uint8)

        """

        if self.ploidy != 2:
            raise ValueError('can only pack diploid calls')

        if boundscheck:
            amx = self.max()
            if amx > 14:
                raise ValueError('max allele for packing is 14, found %s'
                                 % amx)
            amn = self.min()
            if amn < -1:
                raise ValueError('min allele for packing is -1, found %s'
                                 % amn)

        from allel.opt.gt import pack_diploid

        # ensure int8 dtype
        if self.dtype == np.int8:
            data = self
        else:
            data = self.astype(dtype=np.int8)

        # pack data
        packed = pack_diploid(data)

        return packed

    @staticmethod
[docs]    def from_packed(packed):
        """Unpack diploid genotypes that have been bit-packed into single
        bytes.

        Parameters
        ----------

        packed : ndarray, uint8, shape (n_variants, n_samples)
            Bit-packed diploid genotype array.

        Returns
        -------

        g : GenotypeArray, shape (n_variants, n_samples, 2)
            Genotype array.

        Examples
        --------

        >>> import allel
        >>> import numpy as np
        >>> packed = np.array([[0, 1],
        ...                    [2, 17],
        ...                    [34, 239]], dtype='u1')
        >>> allel.GenotypeArray.from_packed(packed)
        GenotypeArray([[[ 0,  0],
                [ 0,  1]],
               [[ 0,  2],
                [ 1,  1]],
               [[ 2,  2],
                [-1, -1]]], dtype=int8, n_variants=3, n_samples=2, ploidy=2)

        """

        # check arguments
        packed = np.asarray(packed)
        if packed.ndim != 2:
            raise ValueError('packed array must have 2 dimensions')
        if packed.dtype != np.uint8:
            packed = packed.astype(np.uint8)

        from allel.opt.gt import unpack_diploid
        data = unpack_diploid(packed)
        return GenotypeArray(data)

[docs]    def to_sparse(self, format='csr', **kwargs):
        """Convert into a sparse matrix.

        Parameters
        ----------

        format : {'coo', 'csc', 'csr', 'dia', 'dok', 'lil'}
            Sparse matrix format.
        kwargs : keyword arguments
            Passed through to sparse matrix constructor.

        Returns
        -------

        m : scipy.sparse.spmatrix
            Sparse matrix

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
        ...                          [[0, 1], [0, 1]],
        ...                          [[1, 1], [0, 0]],
        ...                          [[0, 0], [-1, -1]]], dtype='i1')
        >>> m = g.to_sparse(format='csr')
        >>> m
        <4x4 sparse matrix of type '<class 'numpy.int8'>'
            with 6 stored elements in Compressed Sparse Row format>
        >>> m.data
        array([ 1,  1,  1,  1, -1, -1], dtype=int8)
        >>> m.indices
        array([1, 3, 0, 1, 2, 3], dtype=int32)
        >>> m.indptr
        array([0, 0, 2, 4, 6], dtype=int32)

        """

        h = self.view_haplotypes()
        m = h.to_sparse(format=format, **kwargs)
        return m

    @staticmethod
[docs]    def from_sparse(m, ploidy, order=None, out=None):
        """Construct a genotype array from a sparse matrix.

        Parameters
        ----------

        m : scipy.sparse.spmatrix
            Sparse matrix
        ploidy : int
            The sample ploidy.
        order : {'C', 'F'}, optional
            Whether to store data in C (row-major) or Fortran (column-major)
            order in memory.
        out : ndarray, shape (n_variants, n_samples), optional
            Use this array as the output buffer.

        Returns
        -------

        g : GenotypeArray, shape (n_variants, n_samples, ploidy)
            Genotype array.

        Examples
        --------

        >>> import allel
        >>> import numpy as np
        >>> import scipy.sparse
        >>> data = np.array([ 1,  1,  1,  1, -1, -1], dtype=np.int8)
        >>> indices = np.array([1, 3, 0, 1, 2, 3], dtype=np.int32)
        >>> indptr = np.array([0, 0, 2, 4, 6], dtype=np.int32)
        >>> m = scipy.sparse.csr_matrix((data, indices, indptr))
        >>> g = allel.GenotypeArray.from_sparse(m, ploidy=2)
        >>> g
        GenotypeArray([[[ 0,  0],
                [ 0,  0]],
               [[ 0,  1],
                [ 0,  1]],
               [[ 1,  1],
                [ 0,  0]],
               [[ 0,  0],
                [-1, -1]]], dtype=int8, n_variants=4, n_samples=2, ploidy=2)

        """

        h = HaplotypeArray.from_sparse(m, order=order, out=out)
        g = h.view_genotypes(ploidy=ploidy)
        return g

[docs]    def allelism(self):
        """Determine the number of distinct alleles for each variant.

        Returns
        -------

        n : ndarray, int, shape (n_variants,)
            Allelism array.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.allelism()
        array([2, 3, 1])

        """

        return self.view_haplotypes().allelism()

[docs]    def allele_number(self):
        """Count the number of non-missing allele calls per variant.

        Returns
        -------

        an : ndarray, int, shape (n_variants,)
            Allele number array.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.allele_number()
        array([4, 4, 2])

        """

        return self.view_haplotypes().allele_number()

[docs]    def allele_count(self, allele=1):
        """Count the number of calls of the given allele per variant.

        Parameters
        ----------

        allele : int, optional
            Allele index.

        Returns
        -------

        ac : ndarray, int, shape (n_variants,)
            Allele count array.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.allele_count(allele=1)
        array([1, 2, 0])
        >>> g.allele_count(allele=2)
        array([0, 1, 2])

        """

        return self.view_haplotypes().allele_count(allele=allele)

[docs]    def allele_frequency(self, allele=1, fill=np.nan):
        """Calculate the frequency of the given allele per variant.

        Parameters
        ----------

        allele : int, optional
            Allele index.
        fill : int, optional
            The value to use where all genotype calls are missing for a
            variant.

        Returns
        -------

        af : ndarray, float, shape (n_variants,)
            Allele frequency array.
        ac : ndarray, int, shape (n_variants,)
            Allele count array (numerator).
        an : ndarray, int, shape (n_variants,)
            Allele number array (denominator).

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> af, ac, an = g.allele_frequency(allele=1)
        >>> af
        array([ 0.25,  0.5 ,  0.  ])
        >>> af, ac, an = g.allele_frequency(allele=2)
        >>> af
        array([ 0.  ,  0.25,  1.  ])

        """

        return self.view_haplotypes().allele_frequency(allele=allele,
                                                       fill=fill)

[docs]    def allele_counts(self, alleles=None):
        """Count the number of calls of each allele per variant.

        Parameters
        ----------

        alleles : sequence of ints, optional
            The alleles to count. If None, all alleles will be counted.

        Returns
        -------

        ac : ndarray, int, shape (n_variants, len(alleles))
            Allele counts array.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.allele_counts()
        array([[3, 1, 0],
               [1, 2, 1],
               [0, 0, 2]], dtype=int32)
        >>> g.allele_counts(alleles=(1, 2))
        array([[1, 0],
               [2, 1],
               [0, 2]], dtype=int32)

        """

        return self.view_haplotypes().allele_counts(alleles=alleles)

[docs]    def allele_frequencies(self, alleles=None, fill=np.nan):
        """Calculate the frequency of each allele per variant.

        Parameters
        ----------

        alleles : sequence of ints, optional
            The alleles to calculate frequency of. If None, all allele
            frequencies will be calculated.
        fill : int, optional
            The value to use where all genotype calls are missing for a
            variant.

        Returns
        -------

        af : ndarray, float, shape (n_variants, len(alleles))
            Allele frequencies array.
        ac : ndarray, int, shape (n_variants, len(alleles))
            Allele counts array (numerator).
        an : ndarray, int, shape (n_variants,)
            Allele number array (denominator).

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> af, ac, an = g.allele_frequencies()
        >>> af
        array([[ 0.75,  0.25,  0.  ],
               [ 0.25,  0.5 ,  0.25],
               [ 0.  ,  0.  ,  1.  ]])
        >>> af, ac, an = g.allele_frequencies(alleles=(1, 2))
        >>> af
        array([[ 0.25,  0.  ],
               [ 0.5 ,  0.25],
               [ 0.  ,  1.  ]])
        """

        return self.view_haplotypes().allele_frequencies(alleles=alleles,
                                                         fill=fill)

[docs]    def is_variant(self):
        """Find variants with at least one non-reference allele call.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
        ...                          [[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.is_variant()
        array([False,  True,  True,  True], dtype=bool)

        """

        return self.view_haplotypes().is_variant()

[docs]    def is_non_variant(self):
        """Find variants with no non-reference allele calls.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
        ...                          [[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.is_non_variant()
        array([ True, False, False, False], dtype=bool)

        """

        return self.view_haplotypes().is_non_variant()

[docs]    def is_segregating(self):
        """Find segregating variants (where more than one allele is observed).

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
        ...                          [[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.is_segregating()
        array([False,  True,  True, False], dtype=bool)

        """

        return self.view_haplotypes().is_segregating()

[docs]    def is_non_segregating(self, allele=None):
        """Find non-segregating variants (where at most one allele is
        observed).

        Parameters
        ----------

        allele : int, optional
            Allele index.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
        ...                          [[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.is_non_segregating()
        array([ True, False, False,  True], dtype=bool)
        >>> g.is_non_segregating(allele=2)
        array([False, False, False,  True], dtype=bool)

        """

        return self.view_haplotypes().is_non_segregating(allele=allele)

[docs]    def is_singleton(self, allele=1):
        """Find variants with a single call for the given allele.

        Parameters
        ----------

        allele : int, optional
            Allele index.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
        ...                          [[0, 0], [0, 1]],
        ...                          [[1, 1], [1, 2]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.is_singleton(allele=1)
        array([False,  True, False, False], dtype=bool)
        >>> g.is_singleton(allele=2)
        array([False, False,  True, False], dtype=bool)

        """

        return self.view_haplotypes().is_singleton(allele=allele)

[docs]    def is_doubleton(self, allele=1):
        """Find variants with exactly two calls for the given allele.

        Parameters
        ----------

        allele : int, optional
            Allele index.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
        ...                          [[0, 0], [1, 1]],
        ...                          [[1, 1], [1, 2]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.is_doubleton(allele=1)
        array([False,  True, False, False], dtype=bool)
        >>> g.is_doubleton(allele=2)
        array([False, False, False,  True], dtype=bool)

        """

        return self.view_haplotypes().is_doubleton(allele=allele)

[docs]    def count_variant(self):
        return np.sum(self.is_variant())

[docs]    def count_non_variant(self):
        return np.sum(self.is_non_variant())

[docs]    def count_segregating(self):
        return np.sum(self.is_segregating())

[docs]    def count_non_segregating(self, allele=None):
        return np.sum(self.is_non_segregating(allele=allele))

[docs]    def count_singleton(self, allele=1):
        return np.sum(self.is_singleton(allele=allele))

[docs]    def count_doubleton(self, allele=1):
        return np.sum(self.is_doubleton(allele=allele))

[docs]    def haploidify_samples(self):
        """Construct a pseudo-haplotype for each sample by randomly
        selecting an allele from each genotype call.

        Returns
        -------

        h : HaplotypeArray

        Examples
        --------

        >>> import allel
        >>> import numpy as np
        >>> np.random.seed(42)
        >>> g = allel.GenotypeArray([[[0, 0], [0, 1]],
        ...                          [[0, 2], [1, 1]],
        ...                          [[1, 2], [2, 1]],
        ...                          [[2, 2], [-1, -1]]])
        >>> g.haploidify_samples()
        HaplotypeArray([[ 0,  1],
               [ 0,  1],
               [ 1,  1],
               [ 2, -1]], n_variants=4, n_haplotypes=2)
        >>> g = allel.GenotypeArray([[[0, 0, 0], [0, 0, 1]],
        ...                          [[0, 1, 1], [1, 1, 1]],
        ...                          [[0, 1, 2], [-1, -1, -1]]])
        >>> g.haploidify_samples()
        HaplotypeArray([[ 0,  0],
               [ 1,  1],
               [ 2, -1]], n_variants=3, n_haplotypes=2)

        """

        # N.B., this implementation is obscure and uses more memory that
        # necessary, TODO review

        # define the range of possible indices, e.g., diploid => (0, 1)
        index_range = np.arange(0, self.ploidy, dtype='u1')

        # create a random index for each genotype call
        indices = np.random.choice(index_range,
                                   size=(self.n_variants * self.n_samples),
                                   replace=True)

        # reshape genotype data so it's suitable for passing to np.choose
        # by merging the variants and samples dimensions
        choices = self.reshape(-1, self.ploidy).T

        # now use random indices to haploidify
        data = np.choose(indices, choices)

        # reshape the haploidified data to restore the variants and samples
        # dimensions
        data = data.reshape((self.n_variants, self.n_samples))

        # view as haplotype array
        h = HaplotypeArray(data, copy=False)

        return h

[docs]    def heterozygosity_observed(self, fill=np.nan):
        """Calculate the rate of observed heterozygosity for each variant.

        Parameters
        ----------

        fill : float, optional
            Use this value for variants where all calls are missing.

        Returns
        -------

        ho : ndarray, float, shape (n_variants,)
            Observed heterozygosity

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
        ...                          [[0, 0], [0, 1], [1, 1]],
        ...                          [[0, 0], [1, 1], [2, 2]],
        ...                          [[1, 1], [1, 2], [-1, -1]]])
        >>> g.heterozygosity_observed()
        array([ 0.        ,  0.33333333,  0.        ,  0.5       ])

        """

        # count hets
        n_het = self.count_het(axis=DIM_SAMPLES)
        n_called = self.count_called(axis=DIM_SAMPLES)

        # calculate rate of observed heterozygosity, accounting for variants
        # where all calls are missing
        with ignore_invalid():
            ho = np.where(n_called > 0, n_het / n_called, fill)

        return ho

[docs]    def heterozygosity_expected(self, fill=np.nan):
        """Calculate the expected rate of heterozygosity for each variant
        under Hardy-Weinberg equilibrium.

        Parameters
        ----------

        fill : float, optional
            Use this value for variants where all calls are missing.

        Returns
        -------

        he : ndarray, float, shape (n_variants,)
            Expected heterozygosity

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
        ...                          [[0, 0], [0, 1], [1, 1]],
        ...                          [[0, 0], [1, 1], [2, 2]],
        ...                          [[1, 1], [1, 2], [-1, -1]]])
        >>> g.heterozygosity_expected()
        array([ 0.        ,  0.5       ,  0.66666667,  0.375     ])

        """

        # calculate allele frequencies
        af, _, an = self.allele_frequencies(fill=0)

        # calculate expected heterozygosity
        out = 1 - np.sum(np.power(af, self.ploidy), axis=1)

        # fill values where allele frequencies could not be calculated
        out[an == 0] = fill

        return out

[docs]    def inbreeding_coefficient(self, fill=np.nan):
        """Calculate the inbreeding coefficient for each variant.

        Parameters
        ----------

        fill : float, optional
            Use this value for variants where the expected heterozygosity is
            zero.

        Returns
        -------

        f : ndarray, float, shape (n_variants,)
            Inbreeding coefficient.

        Notes
        -----

        The inbreeding coefficient is calculated as *1 - (Ho/He)* where *Ho* is
        the observed heterozygosity and *He* is the expected heterozygosity.

        Examples
        --------

        >>> import allel
        >>> import numpy as np
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
        ...                          [[0, 0], [0, 1], [1, 1]],
        ...                          [[0, 0], [1, 1], [2, 2]],
        ...                          [[1, 1], [1, 2], [-1, -1]]])
        >>> g.heterozygosity_observed()
        array([ 0.        ,  0.33333333,  0.        ,  0.5       ])
        >>> g.heterozygosity_expected()
        array([ 0.        ,  0.5       ,  0.66666667,  0.375     ])
        >>> g.inbreeding_coefficient()
        array([        nan,  0.33333333,  1.        , -0.33333333])

        """

        # calculate observed and expected heterozygosity
        ho = self.heterozygosity_observed()
        he = self.heterozygosity_expected()

        # calculate inbreeding coefficient, accounting for variants with no
        # expected heterozygosity
        with ignore_invalid():
            f = np.where(he > 0, 1 - (ho / he), fill)

        return f

[docs]    def mean_pairwise_difference(self, fill=np.nan):
        """Calculate the mean number of pairwise differences between
        haplotypes for each variant.

        Parameters
        ----------

        fill : float
            Use this value where there are no pairs to compare (e.g.,
            all allele calls are missing).

        Returns
        -------

        m : ndarray, float, shape (n_variants,)

        Notes
        -----

        The values returned by this function can be summed over a genome
        region and divided by the number of accessible bases to estimate
        nucleotide diversity.

        Examples
        --------

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
        ...                          [[0, 0], [0, 1]],
        ...                          [[0, 0], [1, 1]],
        ...                          [[0, 1], [1, 1]],
        ...                          [[1, 1], [1, 1]],
        ...                          [[0, 0], [1, 2]],
        ...                          [[0, 1], [1, 2]],
        ...                          [[0, 1], [-1, -1]]])
        >>> g.mean_pairwise_difference()
        array([ 0.        ,  0.5       ,  0.66666667,  0.5       ,  0.        ,
                0.83333333,  0.83333333,  1.        ])

        """

        return self.view_haplotypes().mean_pairwise_difference(fill=fill)


[docs]class HaplotypeArray(np.ndarray):
    """Array of haplotypes.

    Parameters
    ----------

    data : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype data.
    **kwargs : keyword arguments
        All keyword arguments are passed through to :func:`numpy.array`.

    Notes
    -----

    This class represents haplotype data as a 2-dimensional numpy
    array of integers. By convention the first dimension corresponds
    to the variants genotyped, the second dimension corresponds to the
    haplotypes.

    Each integer within the array corresponds to an **allele index**,
    where 0 is the reference allele, 1 is the first alternate allele,
    2 is the second alternate allele, ... and -1 (or any other
    negative integer) is a missing allele call.

    If adjacent haplotypes originate from the same sample, then a
    haplotype array can also be viewed as a genotype array. However,
    this is not a requirement.

    Examples
    --------

    Instantiate a haplotype array::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 1],
        ...                           [0, 1, 1, 1],
        ...                           [0, 2, -1, -1]], dtype='i1')
        >>> h.dtype
        dtype('int8')
        >>> h.ndim
        2
        >>> h.shape
        (3, 4)
        >>> h.n_variants
        3
        >>> h.n_haplotypes
        4

    Allele calls for a single variant at all haplotypes can be obtained
    by indexing the first dimension, e.g.::

        >>> h[1]
        array([0, 1, 1, 1], dtype=int8)

    A single haplotype can be obtained by indexing the second
    dimension, e.g.::

        >>> h[:, 1]
        array([0, 1, 2], dtype=int8)

    An allele call for a single haplotype at a single variant can be
    obtained by indexing the first and second dimensions, e.g.::

        >>> h[1, 0]
        0

    View haplotypes as diploid genotypes::

        >>> h.view_genotypes(ploidy=2)
        GenotypeArray([[[ 0,  0],
                [ 0,  1]],
               [[ 0,  1],
                [ 1,  1]],
               [[ 0,  2],
                [-1, -1]]], dtype=int8, n_variants=3, n_samples=2, ploidy=2)

    """

    @staticmethod
    def _check_input_data(obj):

        # check dtype
        if obj.dtype.kind not in 'ui':
            raise TypeError('integer dtype required')

        # check dimensionality
        if obj.ndim != 2:
            raise TypeError('array with 2 dimensions required')

    def __new__(cls, data, **kwargs):
        """Constructor."""
        obj = np.array(data, **kwargs)
        cls._check_input_data(obj)
        obj = obj.view(cls)
        return obj

    def __array_finalize__(self, obj):

        # called after constructor
        if obj is None:
            return

        # called after slice (new-from-template)
        if isinstance(obj, HaplotypeArray):
            return

        # called after view
        HaplotypeArray._check_input_data(obj)

    # noinspection PyUnusedLocal
    def __array_wrap__(self, out_arr, context=None):
        # don't wrap results of any ufuncs
        return np.asarray(out_arr)

    def __getslice__(self, *args, **kwargs):
        s = np.ndarray.__getslice__(self, *args, **kwargs)
        if hasattr(s, 'ndim'):
            if s.ndim == 2:
                return s
            elif s.ndim > 0:
                return np.asarray(s)
        return s

    def __getitem__(self, *args, **kwargs):
        s = np.ndarray.__getitem__(self, *args, **kwargs)
        if hasattr(s, 'ndim'):
            if s.ndim == 2:
                return s
            elif s.ndim > 0:
                return np.asarray(s)
        return s

    @property
[docs]    def n_variants(self):
        """Number of variants (length of first dimension)."""
        return self.shape[0]

    @property
[docs]    def n_haplotypes(self):
        """Number of haplotypes (length of second dimension)."""
        return self.shape[1]

    def __repr__(self):
        s = super(HaplotypeArray, self).__repr__()
        return s[:-1] + ', n_variants=%s, n_haplotypes=%s)' % \
                        (self.n_variants, self.n_haplotypes)

[docs]    def view_genotypes(self, ploidy):
        """Reshape a haplotype array to view it as genotypes by restoring the
        ploidy dimension.

        Parameters
        ----------

        ploidy : int
            The sample ploidy.

        Returns
        -------

        g : ndarray, int, shape (n_variants, n_samples, ploidy)
            Genotype array (sharing same underlying buffer).

        Examples
        --------

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 1],
        ...                           [0, 1, 1, 1],
        ...                           [0, 2, -1, -1]], dtype='i1')
        >>> h.view_genotypes(ploidy=2)
        GenotypeArray([[[ 0,  0],
                [ 0,  1]],
               [[ 0,  1],
                [ 1,  1]],
               [[ 0,  2],
                [-1, -1]]], dtype=int8, n_variants=3, n_samples=2, ploidy=2)

        """

        # check ploidy is compatible
        if (self.n_haplotypes % ploidy) > 0:
            raise ValueError('incompatible ploidy')

        # reshape
        newshape = (self.n_variants, -1, ploidy)
        data = self.reshape(newshape)

        # wrap
        g = GenotypeArray(data, copy=False)

        return g

[docs]    def to_sparse(self, format='csr', **kwargs):
        """Convert into a sparse matrix.

        Parameters
        ----------

        format : {'coo', 'csc', 'csr', 'dia', 'dok', 'lil'}
            Sparse matrix format.
        kwargs : keyword arguments
            Passed through to sparse matrix constructor.

        Returns
        -------

        m : scipy.sparse.spmatrix
            Sparse matrix

        Examples
        --------

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 1, 0, 1],
        ...                           [1, 1, 0, 0],
        ...                           [0, 0, -1, -1]], dtype='i1')
        >>> m = h.to_sparse(format='csr')
        >>> m
        <4x4 sparse matrix of type '<class 'numpy.int8'>'
            with 6 stored elements in Compressed Sparse Row format>
        >>> m.data
        array([ 1,  1,  1,  1, -1, -1], dtype=int8)
        >>> m.indices
        array([1, 3, 0, 1, 2, 3], dtype=int32)
        >>> m.indptr
        array([0, 0, 2, 4, 6], dtype=int32)

        """

        import scipy.sparse

        # check arguments
        f = {
            'bsr': scipy.sparse.bsr_matrix,
            'coo': scipy.sparse.coo_matrix,
            'csc': scipy.sparse.csc_matrix,
            'csr': scipy.sparse.csr_matrix,
            'dia': scipy.sparse.dia_matrix,
            'dok': scipy.sparse.dok_matrix,
            'lil': scipy.sparse.lil_matrix
        }
        if format not in f:
            raise ValueError('invalid format: %r' % format)

        # create sparse matrix
        m = f[format](self, **kwargs)

        return m

    @staticmethod
[docs]    def from_sparse(m, order=None, out=None):
        """Construct a haplotype array from a sparse matrix.

        Parameters
        ----------

        m : scipy.sparse.spmatrix
            Sparse matrix
        order : {'C', 'F'}, optional
            Whether to store data in C (row-major) or Fortran (column-major)
            order in memory.
        out : ndarray, shape (n_variants, n_samples), optional
            Use this array as the output buffer.

        Returns
        -------

        h : HaplotypeArray, shape (n_variants, n_haplotypes)
            Haplotype array.

        Examples
        --------

        >>> import allel
        >>> import numpy as np
        >>> import scipy.sparse
        >>> data = np.array([ 1,  1,  1,  1, -1, -1], dtype=np.int8)
        >>> indices = np.array([1, 3, 0, 1, 2, 3], dtype=np.int32)
        >>> indptr = np.array([0, 0, 2, 4, 6], dtype=np.int32)
        >>> m = scipy.sparse.csr_matrix((data, indices, indptr))
        >>> h = allel.HaplotypeArray.from_sparse(m)
        >>> h
        HaplotypeArray([[ 0,  0,  0,  0],
               [ 0,  1,  0,  1],
               [ 1,  1,  0,  0],
               [ 0,  0, -1, -1]], dtype=int8, n_variants=4, n_haplotypes=4)

        """

        import scipy.sparse

        # check arguments
        if not scipy.sparse.isspmatrix(m):
            raise ValueError('not a sparse matrix: %r' % m)

        # convert to dense array
        data = m.toarray(order=order, out=out)

        # wrap
        h = HaplotypeArray(data)

        return h

[docs]    def allelism(self):
        """Determine the number of distinct alleles for each variant.

        Returns
        -------

        n : ndarray, int, shape (n_variants,)
            Allelism array.

        """

        # calculate allele counts
        ac = self.allele_counts()

        # count alleles present
        n = np.sum(ac > 0, axis=1)

        return n

[docs]    def allele_number(self):
        """Count the number of non-missing allele calls per variant.

        Returns
        -------

        an : ndarray, int, shape (n_variants,)
            Allele number array.

        """

        # count non-missing calls over samples
        an = np.sum(self >= 0, axis=1)

        return an

[docs]    def allele_count(self, allele=1):
        """Count the number of calls of the given allele per variant.

        Parameters
        ----------

        allele : int, optional
            Allele index.

        Returns
        -------

        ac : ndarray, int, shape (n_variants,)
            Allele count array.

        """

        # count non-missing calls over samples
        return np.sum(self == allele, axis=1)

[docs]    def allele_frequency(self, allele=1, fill=np.nan):
        """Calculate the frequency of the given allele per variant.

        Parameters
        ----------

        allele : int, optional
            Allele index.
        fill : int, optional
            The value to use where all genotype calls are missing for a
            variant.

        Returns
        -------

        af : ndarray, float, shape (n_variants,)
            Allele frequency array.
        ac : ndarray, int, shape (n_variants,)
            Allele count array (numerator).
        an : ndarray, int, shape (n_variants,)
            Allele number array (denominator).

        """

        # intermediate variables
        an = self.allele_number()
        ac = self.allele_count(allele=allele)

        # calculate allele frequency, accounting for variants with no calls
        with ignore_invalid():
            af = np.where(an > 0, ac / an, fill)

        return af, ac, an

[docs]    def allele_counts(self, alleles=None):
        """Count the number of calls of each allele per variant.

        Parameters
        ----------

        alleles : sequence of ints, optional
            The alleles to count. If None, all alleles will be counted.

        Returns
        -------

        ac : ndarray, int, shape (n_variants, len(alleles))
            Allele counts array.

        """

        # if alleles not specified, count all alleles
        if alleles is None:
            m = self.max()
            alleles = list(range(m+1))

        # set up output array
        ac = np.zeros((self.n_variants, len(alleles)), dtype='i4')

        # count alleles
        for i, allele in enumerate(alleles):
            np.sum(self == allele, axis=1, out=ac[:, i])

        return ac

[docs]    def allele_frequencies(self, alleles=None, fill=np.nan):
        """Calculate the frequency of each allele per variant.

        Parameters
        ----------

        alleles : sequence of ints, optional
            The alleles to calculate frequency of. If None, all allele
            frequencies will be calculated.
        fill : int, optional
            The value to use where all genotype calls are missing for a
            variant.

        Returns
        -------

        af : ndarray, float, shape (n_variants, len(alleles))
            Allele frequencies array.
        ac : ndarray, int, shape (n_variants, len(alleles))
            Allele counts array (numerator).
        an : ndarray, int, shape (n_variants,)
            Allele number array (denominator).

        """

        # intermediate variables
        an = self.allele_number()[:, None]
        ac = self.allele_counts(alleles=alleles)

        # calculate allele frequency, accounting for variants with no calls
        with ignore_invalid():
            af = np.where(an > 0, ac / an, fill)

        return af, ac, an[:, 0]

[docs]    def is_variant(self):
        """Find variants with at least one non-reference allele call.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        """

        # find variants with at least 1 non-reference allele
        out = np.sum(self > 0, axis=1) >= 1

        return out

[docs]    def is_non_variant(self):
        """Find variants with no non-reference allele calls.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        """

        # find variants with no non-reference alleles
        out = np.all(self <= 0, axis=1)

        return out

[docs]    def is_segregating(self):
        """Find segregating variants (where more than one allele is observed).

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        """

        # find segregating variants
        out = self.allelism() > 1

        return out

[docs]    def is_non_segregating(self, allele=None):
        """Find non-segregating variants (where at most one allele is
        observed).

        Parameters
        ----------

        allele : int, optional
            Allele index.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        """

        if allele is None:

            # find fixed variants
            out = self.allelism() <= 1

        else:

            # find fixed variants with respect to a specific allele
            ex = '(self < 0) | (self == {})'.format(allele)
            b = ne.evaluate(ex)
            out = np.all(b, axis=1)

        return out

[docs]    def is_singleton(self, allele=1):
        """Find variants with a single call for the given allele.

        Parameters
        ----------

        allele : int, optional
            Allele index.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        """

        # count allele
        ac = self.allele_count(allele=allele)

        # find singletons
        out = ac == 1

        return out

[docs]    def is_doubleton(self, allele=1):
        """Find variants with exactly two calls for the given allele.

        Parameters
        ----------

        allele : int, optional
            Allele index.

        Returns
        -------

        out : ndarray, bool, shape (n_variants,)
            Boolean array where elements are True if variant matches the
            condition.

        """

        # count allele
        ac = self.allele_count(allele=allele)

        # find doubletons
        out = ac == 2

        return out

[docs]    def count_variant(self):
        return np.sum(self.is_variant())

[docs]    def count_non_variant(self):
        return np.sum(self.is_non_variant())

[docs]    def count_segregating(self):
        return np.sum(self.is_segregating())

[docs]    def count_non_segregating(self, allele=None):
        return np.sum(self.is_non_segregating(allele=allele))

[docs]    def count_singleton(self, allele=1):
        return np.sum(self.is_singleton(allele=allele))

[docs]    def count_doubleton(self, allele=1):
        return np.sum(self.is_doubleton(allele=allele))

[docs]    def mean_pairwise_difference(self, fill=np.nan):
        """Calculate the mean number of pairwise differences between
        haplotypes for each variant.

        Parameters
        ----------

        fill : float
            Use this value where there are no pairs to compare (e.g.,
            all allele calls are missing).

        Returns
        -------

        m : ndarray, float, shape (n_variants,)

        Notes
        -----

        The values returned by this function can be summed over a genome
        region and divided by the number of accessible bases to estimate
        nucleotide diversity.

        Examples
        --------

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1]])
        >>> h.mean_pairwise_difference()
        array([ 0.        ,  0.5       ,  0.66666667,  0.5       ,  0.        ,
                0.83333333,  0.83333333,  1.        ])

        """

        # This function calculates the mean number of pairwise differences
        # between haplotypes, generalising to any number of alleles.

        # number of non-missing alleles (i.e., haplotypes) for each variant
        an = self.allele_number()

        # allele counts for each variant
        ac = self.allele_counts()

        # total number of pairwise comparisons for each variant:
        # (an choose 2)
        n_pairs = an * (an - 1) / 2

        # number of pairwise comparisons where there is no difference:
        # sum of (ac choose 2) for each allele (i.e., number of ways to
        # choose the same allele twice)
        n_same = np.sum(ac * (ac - 1) / 2, axis=1)

        # number of pairwise differences
        n_diff = n_pairs - n_same

        # mean number of pairwise differences, accounting for cases where
        # there are no pairs
        with ignore_invalid():
            m = np.where(n_pairs > 0, n_diff / n_pairs, fill)

        return m


[docs]class PositionIndex(np.ndarray):
    """Array of positions from a single chromosome or contig.

    Parameters
    ----------

    data : array_like, int
        Positions (1-based) in ascending order.
    **kwargs : keyword arguments
        All keyword arguments are passed through to :func:`numpy.array`.

    Notes
    -----

    This class represents the genomic positions of a set of variants as a
    1-dimensional numpy array of integers.

    Each integer within the array is a 1-based coordinate position within a
    single chromosome or contig. Positions must be given in ascending order,
    although duplicate positions may be present (i.e., positions must be
    monotonically increasing).

    Examples
    --------

    >>> import allel
    >>> pos = allel.PositionIndex([2, 5, 14, 15, 42, 42, 77], dtype='i4')
    >>> pos.dtype
    dtype('int32')
    >>> pos.ndim
    1
    >>> pos.shape
    (7,)
    >>> pos.is_unique
    False

    """

    @staticmethod
    def _check_input_data(obj):

        # check dtype
        if obj.dtype.kind not in 'ui':
            raise TypeError('integer dtype required')

        # check dimensionality
        if obj.ndim != 1:
            raise TypeError('array with 1 dimension required')

        # check sorted ascending
        diff = np.diff(obj)
        if np.any(diff < 0):
            raise ValueError('array is not monotonically increasing')

    def __new__(cls, data, **kwargs):
        """Constructor."""
        obj = np.array(data, **kwargs)
        cls._check_input_data(obj)
        obj = obj.view(cls)
        return obj

    def __array_finalize__(self, obj):

        # called after constructor
        if obj is None:
            return

        # called after slice (new-from-template)
        if isinstance(obj, PositionIndex):
            return

        # called after view
        PositionIndex._check_input_data(obj)

    # noinspection PyUnusedLocal
    def __array_wrap__(self, out_arr, context=None):
        # don't wrap results of any ufuncs
        return np.asarray(out_arr)

    def __getslice__(self, *args, **kwargs):
        s = np.ndarray.__getslice__(self, *args, **kwargs)
        if hasattr(s, 'ndim'):
            if s.ndim == 1:
                return s
            elif s.ndim > 0:
                return np.asarray(s)
        return s

    def __getitem__(self, *args, **kwargs):
        s = np.ndarray.__getitem__(self, *args, **kwargs)
        if hasattr(s, 'ndim'):
            if s.ndim == 1:
                return s
            elif s.ndim > 0:
                return np.asarray(s)
        return s

    @property
[docs]    def is_unique(self):
        """True if no duplicate entries."""
        if not hasattr(self, '_is_unique'):
            self._is_unique = ~np.any(np.diff(self) == 0)
        return self._is_unique

[docs]    def locate_key(self, key):
        """Get index location for the requested key.

        Parameters
        ----------

        key : int
            Position to locate.

        Returns
        -------

        loc : int or slice
            Location of `key` (will be slice if there are duplicate entries).

        Examples
        --------

        >>> import allel
        >>> pos = allel.PositionIndex([3, 6, 6, 11])
        >>> pos.locate_key(3)
        0
        >>> pos.locate_key(11)
        3
        >>> pos.locate_key(6)
        slice(1, 3, None)
        >>> try:
        ...     pos.locate_key(2)
        ... except KeyError as e:
        ...     print(e)
        ...
        2

        """

        left = np.searchsorted(self, key, side='left')
        right = np.searchsorted(self, key, side='right')
        diff = right - left
        if diff == 0:
            raise KeyError(key)
        elif diff == 1:
            return left
        else:
            return slice(left, right)

[docs]    def locate_intersection(self, other):
        """Locate the intersection with another position array.

        Parameters
        ----------

        other : array_like, int
            Array of positions to intersect.

        Returns
        -------

        loc : ndarray, bool
            Boolean array with location of intersection.
        loc_other : ndarray, bool
            Boolean array with location in `other` of intersection.

        Examples
        --------

        >>> import allel
        >>> pos1 = allel.PositionIndex([3, 6, 11, 20, 35])
        >>> pos2 = allel.PositionIndex([4, 6, 20, 39])
        >>> loc1, loc2 = pos1.locate_intersection(pos2)
        >>> loc1
        array([False,  True, False,  True, False], dtype=bool)
        >>> loc2
        array([False,  True,  True, False], dtype=bool)
        >>> pos1[loc1]
        PositionIndex([ 6, 20])
        >>> pos2[loc2]
        PositionIndex([ 6, 20])

        """

        # check inputs
        other = PositionIndex(other)

        # find intersection
        assume_unique = self.is_unique and other.is_unique
        loc = np.in1d(self, other, assume_unique=assume_unique)
        loc_other = np.in1d(other, self, assume_unique=assume_unique)

        return loc, loc_other

[docs]    def locate_keys(self, keys, strict=True):
        """Get index locations for the requested keys.

        Parameters
        ----------

        keys : array_like, int
            Array of keys to locate.
        strict : bool, optional
            If True, raise KeyError if any keys are not found in the index.

        Returns
        -------

        loc : ndarray, bool
            Boolean array with location of positions.

        Examples
        --------

        >>> import allel
        >>> pos1 = allel.PositionIndex([3, 6, 11, 20, 35])
        >>> pos2 = allel.PositionIndex([4, 6, 20, 39])
        >>> loc = pos1.locate_keys(pos2, strict=False)
        >>> loc
        array([False,  True, False,  True, False], dtype=bool)
        >>> pos1[loc]
        PositionIndex([ 6, 20])

        """

        # check inputs
        keys = PositionIndex(keys)

        # find intersection
        loc, found = self.locate_intersection(keys)

        if strict and np.any(~found):
            raise KeyError(keys[~found])

        return loc

[docs]    def intersect(self, other):
        """Intersect with `other` positions.

        Parameters
        ----------

        other : array_like, int
            Array of positions to locate.

        Returns
        -------

        out : PositionIndex
            Positions in common.

        Examples
        --------

        >>> import allel
        >>> pos1 = allel.PositionIndex([3, 6, 11, 20, 35])
        >>> pos2 = allel.PositionIndex([4, 6, 20, 39])
        >>> pos1.intersect(pos2)
        PositionIndex([ 6, 20])

        """

        loc = self.locate_keys(other, strict=False)
        return np.compress(loc, self)

[docs]    def locate_range(self, start=None, stop=None):
        """Locate slice of index containing all entries within `start` and
        `stop` positions **inclusive**.

        Parameters
        ----------

        start : int, optional
            Start position.
        stop : int, optional
            Stop position.

        Returns
        -------

        loc : slice
            Slice object.

        Examples
        --------

        >>> import allel
        >>> pos = allel.PositionIndex([3, 6, 11, 20, 35])
        >>> loc = pos.locate_range(4, 32)
        >>> loc
        slice(1, 4, None)
        >>> pos[loc]
        PositionIndex([ 6, 11, 20])

        """

        # locate start and stop indices
        if start is None:
            start_index = 0
        else:
            start_index = np.searchsorted(self, start)
        if stop is None:
            stop_index = len(self)
        else:
            stop_index = np.searchsorted(self, stop, side='right')

        if stop_index - start_index == 0:
            raise KeyError(start, stop)

        loc = slice(start_index, stop_index)
        return loc

[docs]    def intersect_range(self, start=None, stop=None):
        """Intersect with range defined by `start` and `stop` positions
        **inclusive**.

        Parameters
        ----------

        start : int, optional
            Start position.
        stop : int, optional
            Stop position.

        Returns
        -------

        pos : PositionIndex

        Examples
        --------

        >>> import allel
        >>> pos = allel.PositionIndex([3, 6, 11, 20, 35])
        >>> pos.intersect_range(4, 32)
        PositionIndex([ 6, 11, 20])

        """

        try:
            loc = self.locate_range(start=start, stop=stop)
        except KeyError:
            return self[0:0]
        else:
            return self[loc]

[docs]    def locate_intersection_ranges(self, starts, stops):
        """Locate the intersection with a set of ranges.

        Parameters
        ----------

        starts : array_like, int
            Range start positions.
        stops : array_like, int
            Range stop positions.

        Returns
        -------

        loc : ndarray, bool
            Boolean array with location of entries found.
        loc_ranges : ndarray, bool
            Boolean array with location of ranges containing one or more
            entries.

        Examples
        --------

        >>> import allel
        >>> import numpy as np
        >>> pos = allel.PositionIndex([3, 6, 11, 20, 35])
        >>> ranges = np.array([[0, 2], [6, 17], [12, 15], [31, 35],
        ...                    [100, 120]])
        >>> starts = ranges[:, 0]
        >>> stops = ranges[:, 1]
        >>> loc, loc_ranges = pos.locate_intersection_ranges(starts, stops)
        >>> loc
        array([False,  True,  True, False,  True], dtype=bool)
        >>> loc_ranges
        array([False,  True, False,  True, False], dtype=bool)
        >>> pos[loc]
        PositionIndex([ 6, 11, 35])
        >>> ranges[loc_ranges]
        array([[ 6, 17],
               [31, 35]])

        """

        # check inputs
        starts = np.asarray(starts)
        stops = np.asarray(stops)
        # TODO raise ValueError
        assert starts.ndim == stops.ndim == 1
        assert starts.shape[0] == stops.shape[0]

        # find indices of start and stop positions in pos
        start_indices = np.searchsorted(self, starts)
        stop_indices = np.searchsorted(self, stops, side='right')

        # find intervals overlapping at least one position
        loc_ranges = start_indices < stop_indices

        # find positions within at least one interval
        loc = np.zeros(self.shape, dtype=np.bool)
        for i, j in zip(start_indices[loc_ranges], stop_indices[loc_ranges]):
            loc[i:j] = True

        return loc, loc_ranges

[docs]    def locate_ranges(self, starts, stops, strict=True):
        """Locate items within the given ranges.

        Parameters
        ----------

        starts : array_like, int
            Range start positions.
        stops : array_like, int
            Range stop positions.
        strict : bool, optional
            If True, raise KeyError if any ranges contain no entries.

        Returns
        -------

        loc : ndarray, bool
            Boolean array with location of entries found.

        Examples
        --------

        >>> import allel
        >>> import numpy as np
        >>> pos = allel.PositionIndex([3, 6, 11, 20, 35])
        >>> ranges = np.array([[0, 2], [6, 17], [12, 15], [31, 35],
        ...                    [100, 120]])
        >>> starts = ranges[:, 0]
        >>> stops = ranges[:, 1]
        >>> loc = pos.locate_ranges(starts, stops, strict=False)
        >>> loc
        array([False,  True,  True, False,  True], dtype=bool)
        >>> pos[loc]
        PositionIndex([ 6, 11, 35])

        """

        loc, found = self.locate_intersection_ranges(starts, stops)

        if strict and np.any(~found):
            raise KeyError(starts[~found], stops[~found])

        return loc

[docs]    def intersect_ranges(self, starts, stops):
        """Intersect with a set of ranges.

        Parameters
        ----------

        starts : array_like, int
            Range start positions.
        stops : array_like, int
            Range stop positions.

        Returns
        -------

        pos : PositionIndex

        Examples
        --------

        >>> import allel
        >>> import numpy as np
        >>> pos = allel.PositionIndex([3, 6, 11, 20, 35])
        >>> ranges = np.array([[0, 2], [6, 17], [12, 15], [31, 35],
        ...                    [100, 120]])
        >>> starts = ranges[:, 0]
        >>> stops = ranges[:, 1]
        >>> pos.intersect_ranges(starts, stops)
        PositionIndex([ 6, 11, 35])

        """

        loc = self.locate_ranges(starts, stops, strict=False)
        return np.compress(loc, self)


[docs]class LabelIndex(np.ndarray):
    """Array of labels (e.g., variant or sample identifiers).

    Parameters
    ----------

    data : array_like
        Labels.
    **kwargs : keyword arguments
        All keyword arguments are passed through to :func:`numpy.array`.

    Notes
    -----

    This class represents an arbitrary set of unique labels, e.g., sample or
    variant identifiers.

    There is no need for labels to be sorted. However, all labels must be
    unique within the array, and must be hashable objects.

    Examples
    --------

    >>> import allel
    >>> lbl = allel.LabelIndex(['A', 'C', 'B', 'F'])
    >>> lbl.dtype
    dtype('<U1')
    >>> lbl.ndim
    1
    >>> lbl.shape
    (4,)

    """

    @staticmethod
    def _check_input_data(obj):

        # check dimensionality
        if obj.ndim != 1:
            raise TypeError('array with 1 dimension required')

        # check unique
        # noinspection PyTupleAssignmentBalance
        _, counts = np.unique(obj, return_counts=True)
        if np.any(counts > 1):
            raise ValueError('values are not unique')

    def __new__(cls, data, **kwargs):
        """Constructor."""
        obj = np.array(data, **kwargs)
        cls._check_input_data(obj)
        obj = obj.view(cls)
        return obj

    def __array_finalize__(self, obj):

        # called after constructor
        if obj is None:
            return

        # called after slice (new-from-template)
        if isinstance(obj, LabelIndex):
            return

        # called after view
        LabelIndex._check_input_data(obj)

    # noinspection PyUnusedLocal
    def __array_wrap__(self, out_arr, context=None):
        # don't wrap results of any ufuncs
        return np.asarray(out_arr)

    def __getslice__(self, *args, **kwargs):
        s = np.ndarray.__getslice__(self, *args, **kwargs)
        if hasattr(s, 'ndim'):
            if s.ndim == 1:
                return s
            elif s.ndim > 0:
                return np.asarray(s)
        return s

    def __getitem__(self, *args, **kwargs):
        s = np.ndarray.__getitem__(self, *args, **kwargs)
        if hasattr(s, 'ndim'):
            if s.ndim == 1:
                return s
            elif s.ndim > 0:
                return np.asarray(s)
        return s

[docs]    def locate_key(self, key):
        """Get index location for the requested key.

        Parameters
        ----------

        key : object
            Key to locate.

        Returns
        -------

        loc : int
            Location of `key`.

        Examples
        --------

        >>> import allel
        >>> lbl = allel.LabelIndex(['A', 'C', 'B', 'F'])
        >>> lbl.locate_key('A')
        0
        >>> lbl.locate_key('B')
        2
        >>> try:
        ...     lbl.locate_key('X')
        ... except KeyError as e:
        ...     print(e)
        ...
        'X'

        """

        # TODO review implementation for performance with larger arrays

        loc = np.nonzero(self == key)[0]
        if len(loc) == 0:
            raise KeyError(key)
        return loc[0]

[docs]    def locate_intersection(self, other):
        """Locate the intersection with another array.

        Parameters
        ----------

        other : array_like
            Array to intersect.

        Returns
        -------

        loc : ndarray, bool
            Boolean array with location of intersection.
        loc_other : ndarray, bool
            Boolean array with location in `other` of intersection.

        Examples
        --------

        >>> import allel
        >>> lbl1 = allel.LabelIndex(['A', 'C', 'B', 'F'])
        >>> lbl2 = allel.LabelIndex(['X', 'F', 'G', 'C', 'Z'])
        >>> loc1, loc2 = lbl1.locate_intersection(lbl2)
        >>> loc1
        array([False,  True, False,  True], dtype=bool)
        >>> loc2
        array([False,  True, False,  True, False], dtype=bool)
        >>> lbl1[loc1]
        LabelIndex(['C', 'F'],
              dtype='<U1')
        >>> lbl2[loc2]
        LabelIndex(['F', 'C'],
              dtype='<U1')

        """

        # TODO review implementation for performance with larger arrays

        # check inputs
        other = LabelIndex(other)

        # find intersection
        assume_unique = True
        loc = np.in1d(self, other, assume_unique=assume_unique)
        loc_other = np.in1d(other, self, assume_unique=assume_unique)

        return loc, loc_other

[docs]    def locate_keys(self, keys, strict=True):
        """Get index locations for the requested keys.

        Parameters
        ----------

        keys : array_like
            Array of keys to locate.
        strict : bool, optional
            If True, raise KeyError if any keys are not found in the index.

        Returns
        -------

        loc : ndarray, bool
            Boolean array with location of keys.

        Examples
        --------

        >>> import allel
        >>> lbl = allel.LabelIndex(['A', 'C', 'B', 'F'])
        >>> lbl.locate_keys(['F', 'C'])
        array([False,  True, False,  True], dtype=bool)
        >>> lbl.locate_keys(['X', 'F', 'G', 'C', 'Z'], strict=False)
        array([False,  True, False,  True], dtype=bool)

        """

        # check inputs
        keys = LabelIndex(keys)

        # find intersection
        loc, found = self.locate_intersection(keys)

        if strict and np.any(~found):
            raise KeyError(keys[~found])

        return loc

[docs]    def intersect(self, other):
        """Intersect with `other`.

        Parameters
        ----------

        other : array_like
            Array to intersect.

        Returns
        -------

        out : LabelIndex

        Examples
        --------

        >>> import allel
        >>> lbl1 = allel.LabelIndex(['A', 'C', 'B', 'F'])
        >>> lbl2 = allel.LabelIndex(['X', 'F', 'G', 'C', 'Z'])
        >>> lbl1.intersect(lbl2)
        LabelIndex(['C', 'F'],
              dtype='<U1')
        >>> lbl2.intersect(lbl1)
        LabelIndex(['F', 'C'],
              dtype='<U1')

        """

        loc = self.locate_keys(other, strict=False)
        return np.compress(loc, self)