Source code for allel.model.util

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


# third-party imports
import numpy as np


# internal imports
from allel.util import asarray_ndim, check_dim0_aligned, ensure_dim1_aligned


__all__ = ['create_allele_mapping', 'locate_private_alleles', 'locate_fixed_differences',
           'sample_to_haplotype_selection']


[docs]def create_allele_mapping(ref, alt, alleles, dtype='i1'): """Create an array mapping variant alleles into a different allele index system. Parameters ---------- ref : array_like, S1, shape (n_variants,) Reference alleles. alt : array_like, S1, shape (n_variants, n_alt_alleles) Alternate alleles. alleles : array_like, S1, shape (n_variants, n_alleles) Alleles defining the new allele indexing. dtype : dtype, optional Output dtype. Returns ------- mapping : ndarray, int8, shape (n_variants, n_alt_alleles + 1) Examples -------- Example with biallelic variants:: >>> import allel >>> ref = [b'A', b'C', b'T', b'G'] >>> alt = [b'T', b'G', b'C', b'A'] >>> alleles = [[b'A', b'T'], # no transformation ... [b'G', b'C'], # swap ... [b'T', b'A'], # 1 missing ... [b'A', b'C']] # 1 missing >>> mapping = allel.create_allele_mapping(ref, alt, alleles) >>> mapping array([[ 0, 1], [ 1, 0], [ 0, -1], [-1, 0]], dtype=int8) Example with multiallelic variants:: >>> ref = [b'A', b'C', b'T'] >>> alt = [[b'T', b'G'], ... [b'A', b'T'], ... [b'G', b'.']] >>> alleles = [[b'A', b'T'], ... [b'C', b'T'], ... [b'G', b'A']] >>> mapping = create_allele_mapping(ref, alt, alleles) >>> mapping array([[ 0, 1, -1], [ 0, -1, 1], [-1, 0, -1]], dtype=int8) See Also -------- GenotypeArray.map_alleles, HaplotypeArray.map_alleles, AlleleCountsArray.map_alleles """ ref = asarray_ndim(ref, 1) alt = asarray_ndim(alt, 1, 2) alleles = asarray_ndim(alleles, 1, 2) check_dim0_aligned(ref, alt, alleles) # reshape for convenience ref = ref[:, None] if alt.ndim == 1: alt = alt[:, None] if alleles.ndim == 1: alleles = alleles[:, None] source_alleles = np.append(ref, alt, axis=1) # setup output array out = np.empty(source_alleles.shape, dtype=dtype) out.fill(-1) # find matches for ai in range(source_alleles.shape[1]): match = source_alleles[:, ai, None] == alleles match_i, match_j = match.nonzero() out[match_i, ai] = match_j return out
[docs]def locate_fixed_differences(ac1, ac2): """Locate variants with no shared alleles between two populations. Parameters ---------- ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the second population. Returns ------- loc : ndarray, bool, shape (n_variants,) See Also -------- allel.stats.diversity.windowed_df Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]], ... [[0, 1], [0, 1], [0, 1], [0, 1]], ... [[0, 1], [0, 1], [1, 1], [1, 1]], ... [[0, 0], [0, 0], [1, 1], [2, 2]], ... [[0, 0], [-1, -1], [1, 1], [-1, -1]]]) >>> ac1 = g.count_alleles(subpop=[0, 1]) >>> ac2 = g.count_alleles(subpop=[2, 3]) >>> loc_df = allel.locate_fixed_differences(ac1, ac2) >>> loc_df array([ True, False, False, True, True], dtype=bool) """ # check inputs ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) check_dim0_aligned(ac1, ac2) ac1, ac2 = ensure_dim1_aligned(ac1, ac2) # stack allele counts for convenience pac = np.dstack([ac1, ac2]) # count numbers of alleles called in each population pan = np.sum(pac, axis=1) # count the numbers of populations with each allele npa = np.sum(pac > 0, axis=2) # locate variants with allele calls in both populations non_missing = np.all(pan > 0, axis=1) # locate variants where all alleles are only found in a single population no_shared_alleles = np.all(npa <= 1, axis=1) return non_missing & no_shared_alleles
[docs]def locate_private_alleles(*acs): """Locate alleles that are found only in a single population. Parameters ---------- *acs : array_like, int, shape (n_variants, n_alleles) Allele counts arrays from each population. Returns ------- loc : ndarray, bool, shape (n_variants, n_alleles) Boolean array where elements are True if allele is private to a single population. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]], ... [[0, 1], [0, 1], [0, 1], [0, 1]], ... [[0, 1], [0, 1], [1, 1], [1, 1]], ... [[0, 0], [0, 0], [1, 1], [2, 2]], ... [[0, 0], [-1, -1], [1, 1], [-1, -1]]]) >>> ac1 = g.count_alleles(subpop=[0, 1]) >>> ac2 = g.count_alleles(subpop=[2]) >>> ac3 = g.count_alleles(subpop=[3]) >>> loc_private_alleles = allel.locate_private_alleles(ac1, ac2, ac3) >>> loc_private_alleles array([[ True, False, False], [False, False, False], [ True, False, False], [ True, True, True], [ True, True, False]], dtype=bool) >>> loc_private_variants = np.any(loc_private_alleles, axis=1) >>> loc_private_variants array([ True, False, True, True, True], dtype=bool) """ # check inputs acs = [asarray_ndim(ac, 2) for ac in acs] check_dim0_aligned(*acs) acs = ensure_dim1_aligned(*acs) # stack allele counts for convenience pac = np.dstack(acs) # count the numbers of populations with each allele npa = np.sum(pac > 0, axis=2) # locate alleles found only in a single population loc_pa = npa == 1 return loc_pa
[docs]def sample_to_haplotype_selection(indices, ploidy): return [(i * ploidy) + n for i in indices for n in range(ploidy)]