# -*- coding: utf-8 -*-
# third-party imports
import numpy as np
# internal imports
from allel.util import asarray_ndim, check_dim0_aligned, ensure_dim1_aligned
__all__ = ['create_allele_mapping', 'locate_private_alleles', 'locate_fixed_differences',
'sample_to_haplotype_selection']
[docs]def create_allele_mapping(ref, alt, alleles, dtype='i1'):
"""Create an array mapping variant alleles into a different allele index
system.
Parameters
----------
ref : array_like, S1, shape (n_variants,)
Reference alleles.
alt : array_like, S1, shape (n_variants, n_alt_alleles)
Alternate alleles.
alleles : array_like, S1, shape (n_variants, n_alleles)
Alleles defining the new allele indexing.
dtype : dtype, optional
Output dtype.
Returns
-------
mapping : ndarray, int8, shape (n_variants, n_alt_alleles + 1)
Examples
--------
Example with biallelic variants::
>>> import allel
>>> ref = [b'A', b'C', b'T', b'G']
>>> alt = [b'T', b'G', b'C', b'A']
>>> alleles = [[b'A', b'T'], # no transformation
... [b'G', b'C'], # swap
... [b'T', b'A'], # 1 missing
... [b'A', b'C']] # 1 missing
>>> mapping = allel.create_allele_mapping(ref, alt, alleles)
>>> mapping
array([[ 0, 1],
[ 1, 0],
[ 0, -1],
[-1, 0]], dtype=int8)
Example with multiallelic variants::
>>> ref = [b'A', b'C', b'T']
>>> alt = [[b'T', b'G'],
... [b'A', b'T'],
... [b'G', b'.']]
>>> alleles = [[b'A', b'T'],
... [b'C', b'T'],
... [b'G', b'A']]
>>> mapping = create_allele_mapping(ref, alt, alleles)
>>> mapping
array([[ 0, 1, -1],
[ 0, -1, 1],
[-1, 0, -1]], dtype=int8)
See Also
--------
GenotypeArray.map_alleles, HaplotypeArray.map_alleles, AlleleCountsArray.map_alleles
"""
ref = asarray_ndim(ref, 1)
alt = asarray_ndim(alt, 1, 2)
alleles = asarray_ndim(alleles, 1, 2)
check_dim0_aligned(ref, alt, alleles)
# reshape for convenience
ref = ref[:, None]
if alt.ndim == 1:
alt = alt[:, None]
if alleles.ndim == 1:
alleles = alleles[:, None]
source_alleles = np.append(ref, alt, axis=1)
# setup output array
out = np.empty(source_alleles.shape, dtype=dtype)
out.fill(-1)
# find matches
for ai in range(source_alleles.shape[1]):
match = source_alleles[:, ai, None] == alleles
match_i, match_j = match.nonzero()
out[match_i, ai] = match_j
return out
[docs]def locate_fixed_differences(ac1, ac2):
"""Locate variants with no shared alleles between two populations.
Parameters
----------
ac1 : array_like, int, shape (n_variants, n_alleles)
Allele counts array from the first population.
ac2 : array_like, int, shape (n_variants, n_alleles)
Allele counts array from the second population.
Returns
-------
loc : ndarray, bool, shape (n_variants,)
See Also
--------
allel.stats.diversity.windowed_df
Examples
--------
>>> import allel
>>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
... [[0, 1], [0, 1], [0, 1], [0, 1]],
... [[0, 1], [0, 1], [1, 1], [1, 1]],
... [[0, 0], [0, 0], [1, 1], [2, 2]],
... [[0, 0], [-1, -1], [1, 1], [-1, -1]]])
>>> ac1 = g.count_alleles(subpop=[0, 1])
>>> ac2 = g.count_alleles(subpop=[2, 3])
>>> loc_df = allel.locate_fixed_differences(ac1, ac2)
>>> loc_df
array([ True, False, False, True, True])
"""
# check inputs
ac1 = asarray_ndim(ac1, 2)
ac2 = asarray_ndim(ac2, 2)
check_dim0_aligned(ac1, ac2)
ac1, ac2 = ensure_dim1_aligned(ac1, ac2)
# stack allele counts for convenience
pac = np.dstack([ac1, ac2])
# count numbers of alleles called in each population
pan = np.sum(pac, axis=1)
# count the numbers of populations with each allele
npa = np.sum(pac > 0, axis=2)
# locate variants with allele calls in both populations
non_missing = np.all(pan > 0, axis=1)
# locate variants where all alleles are only found in a single population
no_shared_alleles = np.all(npa <= 1, axis=1)
return non_missing & no_shared_alleles
[docs]def locate_private_alleles(*acs):
"""Locate alleles that are found only in a single population.
Parameters
----------
*acs : array_like, int, shape (n_variants, n_alleles)
Allele counts arrays from each population.
Returns
-------
loc : ndarray, bool, shape (n_variants, n_alleles)
Boolean array where elements are True if allele is private to a
single population.
Examples
--------
>>> import allel
>>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
... [[0, 1], [0, 1], [0, 1], [0, 1]],
... [[0, 1], [0, 1], [1, 1], [1, 1]],
... [[0, 0], [0, 0], [1, 1], [2, 2]],
... [[0, 0], [-1, -1], [1, 1], [-1, -1]]])
>>> ac1 = g.count_alleles(subpop=[0, 1])
>>> ac2 = g.count_alleles(subpop=[2])
>>> ac3 = g.count_alleles(subpop=[3])
>>> loc_private_alleles = allel.locate_private_alleles(ac1, ac2, ac3)
>>> loc_private_alleles
array([[ True, False, False],
[False, False, False],
[ True, False, False],
[ True, True, True],
[ True, True, False]])
>>> loc_private_variants = np.any(loc_private_alleles, axis=1)
>>> loc_private_variants
array([ True, False, True, True, True])
"""
# check inputs
acs = [asarray_ndim(ac, 2) for ac in acs]
check_dim0_aligned(*acs)
acs = ensure_dim1_aligned(*acs)
# stack allele counts for convenience
pac = np.dstack(acs)
# count the numbers of populations with each allele
npa = np.sum(pac > 0, axis=2)
# locate alleles found only in a single population
loc_pa = npa == 1
return loc_pa
[docs]def sample_to_haplotype_selection(indices, ploidy):
return [(i * ploidy) + n for i in indices for n in range(ploidy)]