Source code for allel.stats.mendel

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import numpy as np


from allel.model.ndarray import GenotypeArray, HaplotypeArray
from allel.util import check_ploidy, check_min_samples, check_type, check_dtype
from allel.opt.stats import phase_progeny_by_transmission as _opt_phase_progeny_by_transmission, \
    phase_parents_by_transmission as _opt_phase_parents_by_transmission


[docs]def mendel_errors(parent_genotypes, progeny_genotypes): """Locate genotype calls not consistent with Mendelian transmission of alleles. Parameters ---------- parent_genotypes : array_like, int, shape (n_variants, 2, 2) Genotype calls for the two parents. progeny_genotypes : array_like, int, shape (n_variants, n_progeny, 2) Genotype calls for the progeny. Returns ------- me : ndarray, int, shape (n_variants, n_progeny) Count of Mendel errors for each progeny genotype call. Examples -------- The following are all consistent with Mendelian transmission. Note that a value of 0 is returned for missing calls:: >>> import allel >>> import numpy as np >>> genotypes = np.array([ ... # aa x aa -> aa ... [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1], [-1, -1]], ... [[1, 1], [1, 1], [1, 1], [-1, -1], [-1, -1], [-1, -1]], ... [[2, 2], [2, 2], [2, 2], [-1, -1], [-1, -1], [-1, -1]], ... # aa x ab -> aa or ab ... [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -1], [-1, -1]], ... [[0, 0], [0, 2], [0, 0], [0, 2], [-1, -1], [-1, -1]], ... [[1, 1], [0, 1], [1, 1], [0, 1], [-1, -1], [-1, -1]], ... # aa x bb -> ab ... [[0, 0], [1, 1], [0, 1], [-1, -1], [-1, -1], [-1, -1]], ... [[0, 0], [2, 2], [0, 2], [-1, -1], [-1, -1], [-1, -1]], ... [[1, 1], [2, 2], [1, 2], [-1, -1], [-1, -1], [-1, -1]], ... # aa x bc -> ab or ac ... [[0, 0], [1, 2], [0, 1], [0, 2], [-1, -1], [-1, -1]], ... [[1, 1], [0, 2], [0, 1], [1, 2], [-1, -1], [-1, -1]], ... # ab x ab -> aa or ab or bb ... [[0, 1], [0, 1], [0, 0], [0, 1], [1, 1], [-1, -1]], ... [[1, 2], [1, 2], [1, 1], [1, 2], [2, 2], [-1, -1]], ... [[0, 2], [0, 2], [0, 0], [0, 2], [2, 2], [-1, -1]], ... # ab x bc -> ab or ac or bb or bc ... [[0, 1], [1, 2], [0, 1], [0, 2], [1, 1], [1, 2]], ... [[0, 1], [0, 2], [0, 0], [0, 1], [0, 1], [1, 2]], ... # ab x cd -> ac or ad or bc or bd ... [[0, 1], [2, 3], [0, 2], [0, 3], [1, 2], [1, 3]], ... ]) >>> me = allel.stats.mendel_errors(genotypes[:, :2], genotypes[:, 2:]) >>> me array([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]) The following are cases of 'non-parental' inheritance where one or two alleles are found in the progeny that are not present in either parent. Note that the number of errors may be 1 or 2 depending on the number of non-parental alleles:: >>> genotypes = np.array([ ... # aa x aa -> ab or ac or bb or cc ... [[0, 0], [0, 0], [0, 1], [0, 2], [1, 1], [2, 2]], ... [[1, 1], [1, 1], [0, 1], [1, 2], [0, 0], [2, 2]], ... [[2, 2], [2, 2], [0, 2], [1, 2], [0, 0], [1, 1]], ... # aa x ab -> ac or bc or cc ... [[0, 0], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]], ... [[0, 0], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]], ... [[1, 1], [0, 1], [1, 2], [0, 2], [2, 2], [2, 2]], ... # aa x bb -> ac or bc or cc ... [[0, 0], [1, 1], [0, 2], [1, 2], [2, 2], [2, 2]], ... [[0, 0], [2, 2], [0, 1], [1, 2], [1, 1], [1, 1]], ... [[1, 1], [2, 2], [0, 1], [0, 2], [0, 0], [0, 0]], ... # ab x ab -> ac or bc or cc ... [[0, 1], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]], ... [[0, 2], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]], ... [[1, 2], [1, 2], [0, 1], [0, 2], [0, 0], [0, 0]], ... # ab x bc -> ad or bd or cd or dd ... [[0, 1], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]], ... [[0, 1], [0, 2], [0, 3], [1, 3], [2, 3], [3, 3]], ... [[0, 2], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]], ... # ab x cd -> ae or be or ce or de ... [[0, 1], [2, 3], [0, 4], [1, 4], [2, 4], [3, 4]], ... ]) >>> me = allel.stats.mendel_errors(genotypes[:, :2], genotypes[:, 2:]) >>> me array([[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 1, 2], [1, 1, 1, 2], [1, 1, 1, 2], [1, 1, 1, 1]]) The following are cases of 'hemi-parental' inheritance, where progeny appear to have inherited two copies of an allele found only once in one of the parents:: >>> genotypes = np.array([ ... # aa x ab -> bb ... [[0, 0], [0, 1], [1, 1], [-1, -1]], ... [[0, 0], [0, 2], [2, 2], [-1, -1]], ... [[1, 1], [0, 1], [0, 0], [-1, -1]], ... # ab x bc -> aa or cc ... [[0, 1], [1, 2], [0, 0], [2, 2]], ... [[0, 1], [0, 2], [1, 1], [2, 2]], ... [[0, 2], [1, 2], [0, 0], [1, 1]], ... # ab x cd -> aa or bb or cc or dd ... [[0, 1], [2, 3], [0, 0], [1, 1]], ... [[0, 1], [2, 3], [2, 2], [3, 3]], ... ]) >>> me = allel.stats.mendel_errors(genotypes[:, :2], genotypes[:, 2:]) >>> me array([[1, 0], [1, 0], [1, 0], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1]]) The following are cases of 'uni-parental' inheritance, where progeny appear to have inherited both alleles from a single parent:: >>> genotypes = np.array([ ... # aa x bb -> aa or bb ... [[0, 0], [1, 1], [0, 0], [1, 1]], ... [[0, 0], [2, 2], [0, 0], [2, 2]], ... [[1, 1], [2, 2], [1, 1], [2, 2]], ... # aa x bc -> aa or bc ... [[0, 0], [1, 2], [0, 0], [1, 2]], ... [[1, 1], [0, 2], [1, 1], [0, 2]], ... # ab x cd -> ab or cd ... [[0, 1], [2, 3], [0, 1], [2, 3]], ... ]) >>> me = allel.stats.mendel_errors(genotypes[:, :2], genotypes[:, 2:]) >>> me array([[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1]]) """ # setup parent_genotypes = GenotypeArray(parent_genotypes) progeny_genotypes = GenotypeArray(progeny_genotypes) check_ploidy(parent_genotypes.ploidy, 2) check_ploidy(progeny_genotypes.ploidy, 2) # transform into per-call allele counts max_allele = max(parent_genotypes.max(), progeny_genotypes.max()) parent_gc = parent_genotypes.to_allele_counts(max_allele=max_allele, dtype='i1') progeny_gc = progeny_genotypes.to_allele_counts(max_allele=max_allele, dtype='i1') # detect nonparental and hemiparental inheritance by comparing allele # counts between parents and progeny max_progeny_gc = parent_gc.clip(max=1).sum(axis=1) max_progeny_gc = max_progeny_gc[:, np.newaxis, :] me = (progeny_gc - max_progeny_gc).clip(min=0).sum(axis=2) # detect uniparental inheritance by finding cases where no alleles are # shared between parents, then comparing progeny allele counts to each # parent p1_gc = parent_gc[:, 0, np.newaxis, :] p2_gc = parent_gc[:, 1, np.newaxis, :] # find variants where parents don't share any alleles is_shared_allele = (p1_gc > 0) & (p2_gc > 0) no_shared_alleles = ~np.any(is_shared_allele, axis=2) # find calls where progeny genotype is identical to one or the other parent me[no_shared_alleles & (np.all(progeny_gc == p1_gc, axis=2) | np.all(progeny_gc == p2_gc, axis=2))] = 1 # retrofit where either or both parent has a missing call me[np.any(parent_genotypes.is_missing(), axis=1)] = 0 return me
# constants to represent inheritance states INHERIT_UNDETERMINED = 0 INHERIT_PARENT1 = 1 INHERIT_PARENT2 = 2 INHERIT_NONSEG_REF = 3 INHERIT_NONSEG_ALT = 4 INHERIT_NONPARENTAL = 5 INHERIT_PARENT_MISSING = 6 INHERIT_MISSING = 7
[docs]def paint_transmission(parent_haplotypes, progeny_haplotypes): """Paint haplotypes inherited from a single diploid parent according to their allelic inheritance. Parameters ---------- parent_haplotypes : array_like, int, shape (n_variants, 2) Both haplotypes from a single diploid parent. progeny_haplotypes : array_like, int, shape (n_variants, n_progeny) Haplotypes found in progeny of the given parent, inherited from the given parent. I.e., haplotypes from gametes of the given parent. Returns ------- painting : ndarray, uint8, shape (n_variants, n_progeny) An array of integers coded as follows: 1 = allele inherited from first parental haplotype; 2 = allele inherited from second parental haplotype; 3 = reference allele, also carried by both parental haplotypes; 4 = non-reference allele, also carried by both parental haplotypes; 5 = non-parental allele; 6 = either or both parental alleles missing; 7 = missing allele; 0 = undetermined. Examples -------- >>> import allel >>> haplotypes = allel.HaplotypeArray([ ... [0, 0, 0, 1, 2, -1], ... [0, 1, 0, 1, 2, -1], ... [1, 0, 0, 1, 2, -1], ... [1, 1, 0, 1, 2, -1], ... [0, 2, 0, 1, 2, -1], ... [0, -1, 0, 1, 2, -1], ... [-1, 1, 0, 1, 2, -1], ... [-1, -1, 0, 1, 2, -1], ... ], dtype='i1') >>> painting = allel.stats.paint_transmission(haplotypes[:, :2], ... haplotypes[:, 2:]) >>> painting array([[3, 5, 5, 7], [1, 2, 5, 7], [2, 1, 5, 7], [5, 4, 5, 7], [1, 5, 2, 7], [6, 6, 6, 7], [6, 6, 6, 7], [6, 6, 6, 7]], dtype=uint8) """ # check inputs parent_haplotypes = HaplotypeArray(parent_haplotypes) progeny_haplotypes = HaplotypeArray(progeny_haplotypes) if parent_haplotypes.n_haplotypes != 2: raise ValueError('exactly two parental haplotypes should be provided') # convenience variables parent1 = parent_haplotypes[:, 0, np.newaxis] parent2 = parent_haplotypes[:, 1, np.newaxis] progeny_is_missing = progeny_haplotypes < 0 parent_is_missing = np.any(parent_haplotypes < 0, axis=1) # need this for broadcasting, but also need to retain original for later parent_is_missing_bc = parent_is_missing[:, np.newaxis] parent_diplotype = GenotypeArray(parent_haplotypes[:, np.newaxis, :]) parent_is_hom_ref = parent_diplotype.is_hom_ref() parent_is_het = parent_diplotype.is_het() parent_is_hom_alt = parent_diplotype.is_hom_alt() # identify allele calls where inheritance can be determined is_callable = ~progeny_is_missing & ~parent_is_missing_bc is_callable_seg = is_callable & parent_is_het # main inheritance states inherit_parent1 = is_callable_seg & (progeny_haplotypes == parent1) inherit_parent2 = is_callable_seg & (progeny_haplotypes == parent2) nonseg_ref = (is_callable & parent_is_hom_ref & (progeny_haplotypes == parent1)) nonseg_alt = (is_callable & parent_is_hom_alt & (progeny_haplotypes == parent1)) nonparental = ( is_callable & (progeny_haplotypes != parent1) & (progeny_haplotypes != parent2) ) # record inheritance states # N.B., order in which these are set matters painting = np.zeros(progeny_haplotypes.shape, dtype='u1') painting[inherit_parent1] = INHERIT_PARENT1 painting[inherit_parent2] = INHERIT_PARENT2 painting[nonseg_ref] = INHERIT_NONSEG_REF painting[nonseg_alt] = INHERIT_NONSEG_ALT painting[nonparental] = INHERIT_NONPARENTAL painting[parent_is_missing] = INHERIT_PARENT_MISSING painting[progeny_is_missing] = INHERIT_MISSING return painting
[docs]def phase_progeny_by_transmission(g): """Phase progeny genotypes from a trio or cross using Mendelian transmission. Parameters ---------- g : array_like, int, shape (n_variants, n_samples, 2) Genotype array, with parents as first two columns and progeny as remaining columns. Returns ------- g : ndarray, int8, shape (n_variants, n_samples, 2) Genotype array with progeny phased where possible. Examples -------- >>> import allel >>> g = allel.GenotypeArray([ ... [[0, 0], [0, 0], [0, 0]], ... [[1, 1], [1, 1], [1, 1]], ... [[0, 0], [1, 1], [0, 1]], ... [[1, 1], [0, 0], [0, 1]], ... [[0, 0], [0, 1], [0, 0]], ... [[0, 0], [0, 1], [0, 1]], ... [[0, 1], [0, 0], [0, 1]], ... [[0, 1], [0, 1], [0, 1]], ... [[0, 1], [1, 2], [0, 1]], ... [[1, 2], [0, 1], [1, 2]], ... [[0, 1], [2, 3], [0, 2]], ... [[2, 3], [0, 1], [1, 3]], ... [[0, 0], [0, 0], [-1, -1]], ... [[0, 0], [0, 0], [1, 1]], ... ], dtype='i1') >>> g = allel.stats.phase_progeny_by_transmission(g) >>> print(g.to_str(row_threshold=None)) 0/0 0/0 0|0 1/1 1/1 1|1 0/0 1/1 0|1 1/1 0/0 1|0 0/0 0/1 0|0 0/0 0/1 0|1 0/1 0/0 1|0 0/1 0/1 0/1 0/1 1/2 0|1 1/2 0/1 2|1 0/1 2/3 0|2 2/3 0/1 3|1 0/0 0/0 ./. 0/0 0/0 1/1 >>> g.is_phased array([[False, False, True], [False, False, True], [False, False, True], [False, False, True], [False, False, True], [False, False, True], [False, False, True], [False, False, False], [False, False, True], [False, False, True], [False, False, True], [False, False, True], [False, False, False], [False, False, False]], dtype=bool) """ # setup g = GenotypeArray(g, dtype='i1', copy=True) check_ploidy(g.ploidy, 2) check_min_samples(g.n_samples, 3) # run the phasing is_phased = _opt_phase_progeny_by_transmission(g.values) g.is_phased = np.asarray(is_phased).view(bool) # outputs return g
[docs]def phase_parents_by_transmission(g, window_size): """Phase parent genotypes from a trio or cross, given progeny genotypes already phased by Mendelian transmission. Parameters ---------- g : GenotypeArray Genotype array, with parents as first two columns and progeny as remaining columns, where progeny genotypes are already phased. window_size : int Number of previous heterozygous sites to include when phasing each parent. A number somewhere between 10 and 100 may be appropriate, depending on levels of heterozygosity and quality of data. Returns ------- g : GenotypeArray Genotype array with parents phased where possible. """ # setup check_type(g, GenotypeArray) check_dtype(g.values, 'i1') check_ploidy(g.ploidy, 2) if g.is_phased is None: raise ValueError('genotype array must first have progeny phased by transmission') check_min_samples(g.n_samples, 3) # run the phasing is_phased = g.is_phased.view('u1') _opt_phase_parents_by_transmission(g.values, is_phased, window_size) # outputs return g
[docs]def phase_by_transmission(g, window_size, copy=True): """Phase genotypes in a trio or cross where possible using Mendelian transmission. Parameters ---------- g : array_like, int, shape (n_variants, n_samples, 2) Genotype array, with parents as first two columns and progeny as remaining columns. window_size : int Number of previous heterozygous sites to include when phasing each parent. A number somewhere between 10 and 100 may be appropriate, depending on levels of heterozygosity and quality of data. copy : bool, optional If False, attempt to phase genotypes in-place. Note that this is only possible if the input array has int8 dtype, otherwise a copy is always made regardless of this parameter. Returns ------- g : GenotypeArray Genotype array with progeny phased where possible. """ # setup g = GenotypeArray(g, dtype='i1', copy=copy) check_ploidy(g.ploidy, 2) check_min_samples(g.n_samples, 3) # phase the progeny is_phased = _opt_phase_progeny_by_transmission(g.values) g.is_phased = np.asarray(is_phased).view(bool) # phase the parents _opt_phase_parents_by_transmission(g.values, is_phased, window_size) return g