Source code for allel.model.chunked

# -*- coding: utf-8 -*-
"""This module is maintained for backwards-compatibility, however it is recommended
to migrate to use the equivalent classes from the :mod:`allel.model.dask` module
where possible.

This module provides alternative implementations of array and table
classes defined in the :mod:`allel.model.ndarray` module, using
chunked arrays for data storage. Chunked arrays can be compressed and
optionally stored on disk, providing a means for working with data too
large to fit uncompressed in main memory.

Either `Zarr <http://zarr.readthedocs.io>`_, HDF5 (via `h5py
<http://www.h5py.org/>`_), or `bcolz <http://bcolz.blosc.org>`_ can be used
as the underlying storage layer. The choice of storage layer can be made via
the `storage` keyword argument, which all class methods accept. This argument
can be either a string identifying one of the predefined storage layer
configurations or an object implementing the chunked storage API. For more
information about controlling storage, see the :mod:`allel.chunked` module.
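
For example, assuming ``g`` is a :class:`GenotypeChunkedArray` (see the class
documentation below for how to construct one), the storage layer for derived
results could be selected by name. This is an illustrative sketch only; the
storage classes actually used depend on which backends are installed::

    >>> gn = g.to_n_alt(storage='hdf5mem_zlib1')  # in-memory HDF5, zlib level 1
    >>> ac = g.count_alleles(storage='bcolzmem')  # in-memory bcolz carray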

"""
from __future__ import absolute_import, print_function, division


import numpy as np


from allel.compat import copy_method_doc, string_types
from allel import chunked as _chunked
from allel.chunked import ChunkedArrayWrapper, ChunkedTableWrapper
from allel.io.vcf_write import write_vcf_header, write_vcf_data, normalize_callset
from allel.util import check_ndim, check_integer_dtype
from allel.abc import DisplayAs2D
from .ndarray import (
    GenotypeVector, GenotypeArray, HaplotypeArray, AlleleCountsArray, VariantTable,
    SortedIndex, SortedMultiIndex, GenotypeAlleleCountsArray, GenotypeAlleleCountsVector
)
from .generic import (
    compress_genotypes, take_genotypes, concatenate_genotypes,
    index_genotype_array, subset_genotype_array, index_haplotype_array,
    compress_haplotype_array, take_haplotype_array, subset_haplotype_array,
    concatenate_haplotype_array, index_allele_counts_array, compress_allele_counts_array,
    take_allele_counts_array, concatenate_allele_counts_array, compress_genotype_ac,
    take_genotype_ac, concatenate_genotype_ac, subset_genotype_ac_array,
    index_genotype_ac_array
)


__all__ = ['GenotypeChunkedArray', 'HaplotypeChunkedArray',
           'AlleleCountsChunkedArray', 'VariantChunkedTable',
           'AlleleCountsChunkedTable', 'GenotypeAlleleCountsChunkedArray']


class GenotypeChunkedArray(ChunkedArrayWrapper, DisplayAs2D):
    """Alternative implementation of the
    :class:`allel.model.ndarray.GenotypeArray` class, wrapping a chunked
    array as the backing store.

    Parameters
    ----------
    data : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype data to be wrapped. May be a bcolz carray, h5py dataset, or
        anything providing a similar interface.

    Examples
    --------
    Wrap an HDF5 dataset::

        >>> import h5py
        >>> with h5py.File('callset.h5', mode='w') as h5f:
        ...     h5g = h5f.create_group('/3L/calldata')
        ...     h5g.create_dataset('genotype',
        ...                        data=[[[0, 0], [0, 1]],
        ...                              [[0, 1], [1, 1]],
        ...                              [[0, 2], [-1, -1]]],
        ...                        dtype='i1', chunks=(2, 2, 2),
        ...                        compression='gzip', compression_opts=1)
        ...
        <HDF5 dataset "genotype": shape (3, 2, 2), type "|i1">
        >>> import allel
        >>> callset = h5py.File('callset.h5', mode='r')
        >>> g = allel.GenotypeChunkedArray(callset['/3L/calldata/genotype'])
        >>> g
        <GenotypeChunkedArray shape=(3, 2, 2) dtype=int8 chunks=(2, 2, 2)
           nbytes=12 cbytes=30 cratio=0.4
           compression=gzip compression_opts=1
           values=h5py._hl.dataset.Dataset>
        >>> g.values
        <HDF5 dataset "genotype": shape (3, 2, 2), type "|i1">

    Obtain a numpy array by slicing, e.g.::

        >>> g[:]
        <GenotypeArray shape=(3, 2, 2) dtype=int8>
        0/0 0/1
        0/1 1/1
        0/2 ./.

    Note that most methods will return a chunked array, using whatever
    chunked storage is set as default (zarr array) or specified directly via
    the `storage` keyword argument. E.g.::

        >>> g.copy()
        <GenotypeChunkedArray shape=(3, 2, 2) dtype=int8 chunks=(3, 2, 2)
        ...
        >>> g.copy(storage='bcolzmem')
        <GenotypeChunkedArray shape=(3, 2, 2) dtype=int8 chunks=(4096, 2, 2)
        ...
        >>> g.copy(storage='hdf5mem_zlib1')
        <GenotypeChunkedArray shape=(3, 2, 2) dtype=int8 chunks=(3, 2, 2)
        ...

    """  # flake8: noqa

    def __init__(self, data):
        super(GenotypeChunkedArray, self).__init__(data)
        check_ndim(self.values, 3)
        check_integer_dtype(self.values)
        self._mask = None
        self._is_phased = None

    def __getitem__(self, item):
        return index_genotype_array(self, item, array_cls=GenotypeArray,
                                    vector_cls=GenotypeVector)

    @property
    def n_variants(self):
        """Number of variants."""
        return self.shape[0]

    @property
    def n_samples(self):
        """Number of samples."""
        return self.shape[1]

    @property
    def ploidy(self):
        """Sample ploidy."""
        return self.shape[-1]

    @property
    def n_allele_calls(self):
        """Total number of allele calls."""
        return np.prod(self.shape)

    @property
    def n_calls(self):
        """Total number of genotype calls."""
        return self.n_allele_calls // self.ploidy

    @property
    def mask(self):
        return self._mask

    @mask.setter
    def mask(self, mask):
        if not hasattr(mask, 'shape'):
            mask = np.asarray(mask, dtype=bool)
        if mask.shape != self.shape[:2]:
            raise ValueError('mask has incorrect shape')
        self._mask = mask

    @property
    def is_phased(self):
        return self._is_phased

    @is_phased.setter
    def is_phased(self, is_phased):
        if not hasattr(is_phased, 'shape'):
            is_phased = np.asarray(is_phased, dtype=bool)
        if is_phased.shape != self.shape[:2]:
            raise ValueError('is_phased has incorrect shape')
        self._is_phased = is_phased

    def fill_masked(self, value=-1, **kwargs):
        out = self.map_blocks_method('fill_masked', kwargs=dict(value=value),
                                     **kwargs)
        return GenotypeChunkedArray(out)

    def is_called(self, **kwargs):
        return self.map_blocks_method('is_called', **kwargs)

    def is_missing(self, **kwargs):
        return self.map_blocks_method('is_missing', **kwargs)

    def is_hom(self, allele=None, **kwargs):
        return self.map_blocks_method('is_hom', kwargs=dict(allele=allele),
                                      **kwargs)

    def is_hom_ref(self, **kwargs):
        return self.map_blocks_method('is_hom_ref', **kwargs)

    def is_hom_alt(self, **kwargs):
        return self.map_blocks_method('is_hom_alt', **kwargs)

    def is_het(self, allele=None, **kwargs):
        return self.map_blocks_method('is_het', kwargs=dict(allele=allele),
                                      **kwargs)

    def is_call(self, call, **kwargs):
        return self.map_blocks_method('is_call', kwargs=dict(call=call),
                                      **kwargs)

    def _count(self, method_name, axis, kwargs=None, **storage_kwargs):
        if kwargs is None:
            kwargs = dict()

        def mapper(block):
            method = getattr(block, method_name)
            return method(**kwargs)

        out = self.sum(axis=axis, mapper=mapper, **storage_kwargs)
        return out

    def count_called(self, axis=None, **kwargs):
        return self._count('is_called', axis, **kwargs)

    def count_missing(self, axis=None, **kwargs):
        return self._count('is_missing', axis, **kwargs)

    def count_hom(self, allele=None, axis=None, **kwargs):
        return self._count('is_hom', axis, kwargs=dict(allele=allele),
                           **kwargs)

    def count_hom_ref(self, axis=None, **kwargs):
        return self._count('is_hom_ref', axis, **kwargs)

    def count_hom_alt(self, axis=None, **kwargs):
        return self._count('is_hom_alt', axis, **kwargs)

    def count_het(self, allele=None, axis=None, **kwargs):
        return self._count('is_het', axis, kwargs=dict(allele=allele),
                           **kwargs)

    def count_call(self, call, axis=None, **kwargs):
        return self._count('is_call', axis, kwargs=dict(call=call),
                           **kwargs)

    def to_haplotypes(self, **kwargs):
        out = self.map_blocks_method('to_haplotypes', **kwargs)
        return HaplotypeChunkedArray(out)

    def to_n_ref(self, fill=0, dtype='i1', **kwargs):
        out = self.map_blocks_method('to_n_ref',
                                     kwargs=dict(fill=fill, dtype=dtype),
                                     **kwargs)
        return out

    def to_n_alt(self, fill=0, dtype='i1', **kwargs):
        out = self.map_blocks_method('to_n_alt',
                                     kwargs=dict(fill=fill, dtype=dtype),
                                     **kwargs)
        return out

    def to_allele_counts(self, max_allele=None, **kwargs):
        # determine alleles to count
        if max_allele is None:
            max_allele = self.max()
        out = self.map_blocks_method('to_allele_counts',
                                     kwargs=dict(max_allele=max_allele),
                                     **kwargs)
        out = GenotypeAlleleCountsChunkedArray(out)
        return out

    def to_packed(self, boundscheck=True, **kwargs):
        out = self.map_blocks_method('to_packed',
                                     kwargs=dict(boundscheck=boundscheck),
                                     **kwargs)
        return out

    @classmethod
    def from_packed(cls, packed, **kwargs):
        def f(block):
            return GenotypeArray.from_packed(block)
        out = _chunked.map_blocks(packed, f, **kwargs)
        return cls(out)

    def count_alleles(self, max_allele=None, subpop=None, **kwargs):
        if max_allele is None:
            max_allele = self.max()
        out = self.map_blocks_method('count_alleles',
                                     kwargs=dict(max_allele=max_allele,
                                                 subpop=subpop),
                                     **kwargs)
        return AlleleCountsChunkedArray(out)

    def count_alleles_subpops(self, subpops, max_allele=None, **kwargs):
        if max_allele is None:
            max_allele = self.max()

        def f(block):
            return block.count_alleles_subpops(subpops,
                                               max_allele=max_allele)

        out = _chunked.map_blocks(self, f, create='table', **kwargs)
        return AlleleCountsChunkedTable(out)

    def to_gt(self, max_allele=None, **kwargs):
        out = self.map_blocks_method('to_gt',
                                     kwargs=dict(max_allele=max_allele),
                                     **kwargs)
        return out

    def map_alleles(self, mapping, **kwargs):
        def f(block, bmapping):
            return block.map_alleles(bmapping, copy=False)
        domain = (self, mapping)
        out = _chunked.map_blocks(domain, f, **kwargs)
        return GenotypeChunkedArray(out)

    def compress(self, condition, axis=0, out=None, **kwargs):
        return compress_genotypes(self, condition, axis=axis,
                                  wrap_axes={0, 1}, cls=type(self),
                                  compress=_chunked.compress, out=out,
                                  **kwargs)

    def take(self, indices, axis=0, out=None, mode='raise', **kwargs):
        return take_genotypes(self, indices, axis=axis, wrap_axes={0, 1},
                              cls=type(self), take=_chunked.take, out=out,
                              mode=mode, **kwargs)

    def subset(self, sel0=None, sel1=None, **kwargs):
        return subset_genotype_array(self, sel0, sel1, cls=type(self),
                                     subset=_chunked.subset, **kwargs)

    def concatenate(self, others, axis=0, **kwargs):
        return concatenate_genotypes(self, others, axis=axis,
                                     wrap_axes={0, 1}, cls=type(self),
                                     concatenate=_chunked.concatenate,
                                     **kwargs)


# copy docstrings
copy_method_doc(GenotypeChunkedArray.fill_masked, GenotypeArray.fill_masked)
copy_method_doc(GenotypeChunkedArray.subset, GenotypeArray.subset)
copy_method_doc(GenotypeChunkedArray.is_called, GenotypeArray.is_called)
copy_method_doc(GenotypeChunkedArray.is_missing, GenotypeArray.is_missing)
copy_method_doc(GenotypeChunkedArray.is_hom, GenotypeArray.is_hom)
copy_method_doc(GenotypeChunkedArray.is_hom_ref, GenotypeArray.is_hom_ref)
copy_method_doc(GenotypeChunkedArray.is_hom_alt, GenotypeArray.is_hom_alt)
copy_method_doc(GenotypeChunkedArray.is_het, GenotypeArray.is_het)
copy_method_doc(GenotypeChunkedArray.is_call, GenotypeArray.is_call)
copy_method_doc(GenotypeChunkedArray.to_haplotypes,
                GenotypeArray.to_haplotypes)
copy_method_doc(GenotypeChunkedArray.to_n_ref, GenotypeArray.to_n_ref)
copy_method_doc(GenotypeChunkedArray.to_n_alt, GenotypeArray.to_n_alt)
copy_method_doc(GenotypeChunkedArray.to_allele_counts,
                GenotypeArray.to_allele_counts)
copy_method_doc(GenotypeChunkedArray.to_packed, GenotypeArray.to_packed)
# TODO
# copy_method_doc(GenotypeChunkedArray.from_packed, GenotypeArray.from_packed)
copy_method_doc(GenotypeChunkedArray.count_alleles,
                GenotypeArray.count_alleles)
copy_method_doc(GenotypeChunkedArray.count_alleles_subpops,
                GenotypeArray.count_alleles_subpops)
copy_method_doc(GenotypeChunkedArray.to_gt, GenotypeArray.to_gt)
copy_method_doc(GenotypeChunkedArray.map_alleles, GenotypeArray.map_alleles)
copy_method_doc(GenotypeChunkedArray.concatenate, GenotypeArray.concatenate)
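
# Usage sketch (illustrative only; assumes ``g`` is a GenotypeChunkedArray as
# constructed in the class docstring above). The ``mask`` and ``is_phased``
# properties expect boolean arrays of shape (n_variants, n_samples), and
# ``fill_masked`` then produces a new chunked array with masked calls
# replaced, e.g.:
#
#     mask = np.zeros(g.shape[:2], dtype=bool)
#     mask[0, 1] = True              # mask the call for variant 0, sample 1
#     g.mask = mask
#     gf = g.fill_masked(value=-1)   # masked call filled with -1 (missing)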


class HaplotypeChunkedArray(ChunkedArrayWrapper, DisplayAs2D):
    """Alternative implementation of the
    :class:`allel.model.ndarray.HaplotypeArray` class, using a chunked array
    as the backing store.

    Parameters
    ----------
    data : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype data to be wrapped. May be a bcolz carray, h5py dataset, or
        anything providing a similar interface.

    """

    def __init__(self, data):
        super(HaplotypeChunkedArray, self).__init__(data)
        check_ndim(self.values, 2)
        check_integer_dtype(self.values)

    def __getitem__(self, item):
        return index_haplotype_array(self, item, cls=HaplotypeArray)

    @property
    def n_variants(self):
        """Number of variants."""
        return self.shape[0]

    @property
    def n_haplotypes(self):
        """Number of haplotypes."""
        return self.shape[1]

    def to_genotypes(self, ploidy=2, **kwargs):

        # check ploidy is compatible
        if (self.n_haplotypes % ploidy) > 0:
            raise ValueError('incompatible ploidy')

        # build output
        def f(block):
            return block.to_genotypes(ploidy)
        out = self.map_blocks(f, **kwargs)

        return GenotypeChunkedArray(out)

    def is_called(self, **kwargs):
        return self.__ge__(0, **kwargs)

    def is_missing(self, **kwargs):
        return self.__lt__(0, **kwargs)

    def is_ref(self, **kwargs):
        return self.__eq__(0, **kwargs)

    def is_alt(self, **kwargs):
        return self.__gt__(0, **kwargs)

    def is_call(self, allele, **kwargs):
        return self.__eq__(allele, **kwargs)

    def _count(self, method_name, axis, kwargs=None, **storage_kwargs):
        if kwargs is None:
            kwargs = dict()

        def mapper(block):
            method = getattr(block, method_name)
            return method(**kwargs)

        out = self.sum(axis=axis, mapper=mapper, **storage_kwargs)
        return out

    def count_called(self, axis=None, **kwargs):
        return self._count('is_called', axis=axis, **kwargs)

    def count_missing(self, axis=None, **kwargs):
        return self._count('is_missing', axis=axis, **kwargs)

    def count_ref(self, axis=None, **kwargs):
        return self._count('is_ref', axis=axis, **kwargs)

    def count_alt(self, axis=None, **kwargs):
        return self._count('is_alt', axis=axis, **kwargs)

    def count_call(self, allele, axis=None, **kwargs):
        return self._count('is_call', axis=axis,
                           kwargs=dict(allele=allele), **kwargs)

    def count_alleles(self, max_allele=None, subpop=None, **kwargs):

        # if max_allele not specified, count all alleles
        if max_allele is None:
            max_allele = self.max()

        def f(block):
            return block.count_alleles(max_allele=max_allele, subpop=subpop)
        out = self.map_blocks(f, **kwargs)

        return AlleleCountsChunkedArray(out)

    def count_alleles_subpops(self, subpops, max_allele=None, **kwargs):
        if max_allele is None:
            max_allele = self.max()

        def f(block):
            return block.count_alleles_subpops(subpops,
                                               max_allele=max_allele)

        out = _chunked.map_blocks(self, f, create='table', **kwargs)
        return AlleleCountsChunkedTable(out)

    def map_alleles(self, mapping, **kwargs):
        def f(block, bmapping):
            return block.map_alleles(bmapping, copy=False)
        domain = (self, mapping)
        out = _chunked.map_blocks(domain, f, **kwargs)
        return HaplotypeChunkedArray(out)

    def compress(self, condition, axis=0, out=None, **kwargs):
        return compress_haplotype_array(self, condition, axis=axis,
                                        cls=type(self),
                                        compress=_chunked.compress, out=out,
                                        **kwargs)

    def take(self, indices, axis=0, out=None, mode='raise', **kwargs):
        return take_haplotype_array(self, indices, axis=axis, cls=type(self),
                                    take=_chunked.take, out=out, mode=mode,
                                    **kwargs)

    def subset(self, sel0=None, sel1=None, **kwargs):
        return subset_haplotype_array(self, sel0, sel1, cls=type(self),
                                      subset=_chunked.subset, **kwargs)

    def concatenate(self, others, axis=0, **kwargs):
        return concatenate_haplotype_array(self, others, axis=axis,
                                           cls=type(self),
                                           concatenate=_chunked.concatenate,
                                           **kwargs)
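
# Usage sketch (illustrative only; assumes ``h`` is a HaplotypeChunkedArray
# wrapping an (n_variants, n_haplotypes) dataset). ``to_genotypes`` requires
# n_haplotypes to be an exact multiple of the requested ploidy, otherwise a
# ValueError is raised:
#
#     g = h.to_genotypes(ploidy=2)        # fold haplotype pairs into genotypes
#     ac = h.count_alleles(max_allele=3)  # counts for alleles 0..3 per variant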


# copy docstrings
copy_method_doc(HaplotypeChunkedArray.to_genotypes,
                HaplotypeArray.to_genotypes)
copy_method_doc(HaplotypeChunkedArray.count_alleles,
                HaplotypeArray.count_alleles)
copy_method_doc(HaplotypeChunkedArray.count_alleles_subpops,
                HaplotypeArray.count_alleles_subpops)
copy_method_doc(HaplotypeChunkedArray.map_alleles, HaplotypeArray.map_alleles)
copy_method_doc(HaplotypeChunkedArray.subset, HaplotypeArray.subset)


class AlleleCountsChunkedArray(ChunkedArrayWrapper, DisplayAs2D):
    """Alternative implementation of the
    :class:`allel.model.ndarray.AlleleCountsArray` class, using a chunked
    array as the backing store.

    Parameters
    ----------
    data : array_like, int, shape (n_variants, n_alleles)
        Allele counts data to be wrapped. May be a bcolz carray, h5py
        dataset, or anything providing a similar interface.

    """

    def __init__(self, data):
        super(AlleleCountsChunkedArray, self).__init__(data)
        check_ndim(self.values, 2)
        check_integer_dtype(self.values)

    def __getitem__(self, item):
        return index_allele_counts_array(self, item, cls=AlleleCountsArray)

    def __add__(self, other, **kwargs):
        ret = super(AlleleCountsChunkedArray, self).__add__(other, **kwargs)
        if hasattr(ret, 'shape') and ret.shape == self.shape:
            ret = AlleleCountsChunkedArray(ret)
        return ret

    def __sub__(self, other, **kwargs):
        ret = super(AlleleCountsChunkedArray, self).__sub__(other, **kwargs)
        if hasattr(ret, 'shape') and ret.shape == self.shape:
            ret = AlleleCountsChunkedArray(ret)
        return ret

    @property
    def n_variants(self):
        """Number of variants (length of first array dimension)."""
        return self.shape[0]

    @property
    def n_alleles(self):
        """Number of alleles (length of second array dimension)."""
        return self.shape[1]

    def to_frequencies(self, fill=np.nan, **kwargs):
        out = self.map_blocks_method('to_frequencies', kwargs=dict(fill=fill),
                                     **kwargs)
        return ChunkedArrayWrapper(out)

    def allelism(self, **kwargs):
        out = self.map_blocks_method('allelism', **kwargs)
        return ChunkedArrayWrapper(out)

    def max_allele(self, **kwargs):
        out = self.map_blocks_method('max_allele', **kwargs)
        return ChunkedArrayWrapper(out)

    def is_variant(self, **kwargs):
        out = self.map_blocks_method('is_variant', **kwargs)
        return ChunkedArrayWrapper(out)

    def is_non_variant(self, **kwargs):
        out = self.map_blocks_method('is_non_variant', **kwargs)
        return ChunkedArrayWrapper(out)

    def is_segregating(self, **kwargs):
        out = self.map_blocks_method('is_segregating', **kwargs)
        return ChunkedArrayWrapper(out)

    def is_non_segregating(self, allele=None, **kwargs):
        out = self.map_blocks_method('is_non_segregating',
                                     kwargs=dict(allele=allele), **kwargs)
        return ChunkedArrayWrapper(out)

    def is_singleton(self, allele=1, **kwargs):
        out = self.map_blocks_method('is_singleton',
                                     kwargs=dict(allele=allele), **kwargs)
        return ChunkedArrayWrapper(out)

    def is_doubleton(self, allele=1, **kwargs):
        out = self.map_blocks_method('is_doubleton',
                                     kwargs=dict(allele=allele), **kwargs)
        return ChunkedArrayWrapper(out)

    def is_biallelic(self, **kwargs):
        out = self.map_blocks_method('is_biallelic', **kwargs)
        return ChunkedArrayWrapper(out)

    def is_biallelic_01(self, min_mac=None, **kwargs):
        out = self.map_blocks_method('is_biallelic_01',
                                     kwargs=dict(min_mac=min_mac), **kwargs)
        return ChunkedArrayWrapper(out)

    def _count(self, method_name, kwargs=None, **storage_kwargs):
        if kwargs is None:
            kwargs = dict()

        def mapper(block):
            method = getattr(block, method_name)
            return method(**kwargs)

        out = self.sum(mapper=mapper, **storage_kwargs)
        return out

    def count_variant(self, **kwargs):
        return self._count('is_variant', **kwargs)

    def count_non_variant(self, **kwargs):
        return self._count('is_non_variant', **kwargs)

    def count_segregating(self, **kwargs):
        return self._count('is_segregating', **kwargs)

    def count_non_segregating(self, allele=None, **kwargs):
        return self._count('is_non_segregating', kwargs=dict(allele=allele),
                           **kwargs)

    def count_singleton(self, allele=1, **kwargs):
        return self._count('is_singleton', kwargs=dict(allele=allele),
                           **kwargs)

    def count_doubleton(self, allele=1, **kwargs):
        return self._count('is_doubleton', kwargs=dict(allele=allele),
                           **kwargs)

    def map_alleles(self, mapping, max_allele=None, **kwargs):
        if max_allele is None:
            max_allele = np.max(mapping)

        def f(block, bmapping):
            return block.map_alleles(bmapping, max_allele=max_allele)

        domain = (self, mapping)
        out = _chunked.map_blocks(domain, f, **kwargs)
        return AlleleCountsChunkedArray(out)

    def compress(self, condition, axis=0, out=None, **kwargs):
        return compress_allele_counts_array(self, condition, axis=axis,
                                            cls=type(self),
                                            compress=_chunked.compress,
                                            out=out, **kwargs)

    def take(self, indices, axis=0, out=None, mode='raise', **kwargs):
        return take_allele_counts_array(self, indices, axis=axis,
                                        cls=type(self), take=_chunked.take,
                                        out=out, mode=mode, **kwargs)

    def concatenate(self, others, axis=0, **kwargs):
        return concatenate_allele_counts_array(
            self, others, axis=axis, cls=type(self),
            concatenate=_chunked.concatenate, **kwargs)
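
# Usage sketch (illustrative only; assumes ``ac`` is an
# AlleleCountsChunkedArray). Per-variant methods such as ``to_frequencies``
# and ``is_segregating`` map over chunks and return chunked results, while
# the ``count_*`` methods reduce to a single number via ``sum``:
#
#     af = ac.to_frequencies()        # allele frequencies, same shape as ac
#     flt = ac.is_segregating()       # boolean, one value per variant
#     n_seg = ac.count_segregating()  # scalar count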


copy_method_doc(AlleleCountsChunkedArray.allelism, AlleleCountsArray.allelism)
copy_method_doc(AlleleCountsChunkedArray.max_allele,
                AlleleCountsArray.max_allele)
copy_method_doc(AlleleCountsChunkedArray.map_alleles,
                AlleleCountsArray.map_alleles)
copy_method_doc(AlleleCountsChunkedArray.to_frequencies,
                AlleleCountsArray.to_frequencies)
copy_method_doc(AlleleCountsChunkedArray.is_variant,
                AlleleCountsArray.is_variant)
copy_method_doc(AlleleCountsChunkedArray.is_non_variant,
                AlleleCountsArray.is_non_variant)
copy_method_doc(AlleleCountsChunkedArray.is_segregating,
                AlleleCountsArray.is_segregating)
copy_method_doc(AlleleCountsChunkedArray.is_non_segregating,
                AlleleCountsArray.is_non_segregating)
copy_method_doc(AlleleCountsChunkedArray.is_singleton,
                AlleleCountsArray.is_singleton)
copy_method_doc(AlleleCountsChunkedArray.is_doubleton,
                AlleleCountsArray.is_doubleton)
copy_method_doc(AlleleCountsChunkedArray.is_biallelic,
                AlleleCountsArray.is_biallelic)
copy_method_doc(AlleleCountsChunkedArray.is_biallelic_01,
                AlleleCountsArray.is_biallelic_01)


class GenotypeAlleleCountsChunkedArray(ChunkedArrayWrapper, DisplayAs2D):

    def __init__(self, data):
        super(GenotypeAlleleCountsChunkedArray, self).__init__(data)
        check_ndim(self.values, 3)
        check_integer_dtype(self.values)

    def __getitem__(self, item):
        return index_genotype_ac_array(
            self, item, array_cls=GenotypeAlleleCountsArray,
            vector_cls=GenotypeAlleleCountsVector)

    @property
    def n_variants(self):
        """Number of variants."""
        return self.shape[0]

    @property
    def n_samples(self):
        """Number of samples."""
        return self.shape[1]

    @property
    def n_alleles(self):
        """Number of alleles."""
        return self.shape[2]

    def is_called(self, **kwargs):
        return self.map_blocks_method('is_called', **kwargs)

    def is_missing(self, **kwargs):
        return self.map_blocks_method('is_missing', **kwargs)

    def is_hom(self, allele=None, **kwargs):
        return self.map_blocks_method('is_hom', kwargs=dict(allele=allele),
                                      **kwargs)

    def is_hom_ref(self, **kwargs):
        return self.map_blocks_method('is_hom_ref', **kwargs)

    def is_hom_alt(self, **kwargs):
        return self.map_blocks_method('is_hom_alt', **kwargs)

    def is_het(self, allele=None, **kwargs):
        return self.map_blocks_method('is_het', kwargs=dict(allele=allele),
                                      **kwargs)

    def count_alleles(self, subpop=None, **kwargs):
        out = self.map_blocks_method('count_alleles',
                                     kwargs=dict(subpop=subpop), **kwargs)
        return AlleleCountsChunkedArray(out)

    def to_gt(self, max_allele=None, **kwargs):
        out = self.map_blocks_method('to_gt',
                                     kwargs=dict(max_allele=max_allele),
                                     **kwargs)
        return out

    def compress(self, condition, axis=0, out=None, **kwargs):
        return compress_genotype_ac(self, condition=condition, axis=axis,
                                    wrap_axes={0, 1}, cls=type(self),
                                    compress=_chunked.compress, out=out,
                                    **kwargs)

    def take(self, indices, axis=0, out=None, mode='raise', **kwargs):
        return take_genotype_ac(self, indices=indices, axis=axis,
                                wrap_axes={0, 1}, cls=type(self),
                                take=_chunked.take, out=out, mode=mode,
                                **kwargs)

    def concatenate(self, others, axis=0, **kwargs):
        return concatenate_genotype_ac(self, others=others, axis=axis,
                                       wrap_axes={0, 1}, cls=type(self),
                                       concatenate=_chunked.concatenate,
                                       **kwargs)

    def subset(self, sel0=None, sel1=None, **kwargs):
        return subset_genotype_ac_array(self, sel0, sel1, cls=type(self),
                                        subset=_chunked.subset, **kwargs)
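
# Usage sketch (illustrative only; assumes ``gac`` is a
# GenotypeAlleleCountsChunkedArray of shape (n_variants, n_samples,
# n_alleles), e.g. as produced by GenotypeChunkedArray.to_allele_counts):
#
#     het = gac.is_het()        # boolean, per variant and sample
#     ac = gac.count_alleles()  # AlleleCountsChunkedArray, aggregated over samples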


class VariantChunkedTable(ChunkedTableWrapper):
    """Alternative implementation of the
    :class:`allel.model.ndarray.VariantTable` class, using a chunked table as
    the backing store.

    Parameters
    ----------
    data: table_like
        Data to be wrapped. May be a tuple or list of columns (array-like),
        a dict mapping names to columns, a bcolz ctable, h5py group, numpy
        recarray, or anything providing a similar interface.
    names : sequence of strings
        Column names.

    Examples
    --------
    Wrap columns stored as datasets within an HDF5 group::

        >>> import h5py
        >>> chrom = [b'chr1', b'chr1', b'chr2', b'chr2', b'chr3']
        >>> pos = [2, 7, 3, 9, 6]
        >>> dp = [35, 12, 78, 22, 99]
        >>> qd = [4.5, 6.7, 1.2, 4.4, 2.8]
        >>> ac = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)]
        >>> with h5py.File('callset.h5', mode='w') as h5f:
        ...     h5g = h5f.create_group('/3L/variants')
        ...     h5g.create_dataset('CHROM', data=chrom, chunks=True)
        ...     h5g.create_dataset('POS', data=pos, chunks=True)
        ...     h5g.create_dataset('DP', data=dp, chunks=True)
        ...     h5g.create_dataset('QD', data=qd, chunks=True)
        ...     h5g.create_dataset('AC', data=ac, chunks=True)
        ...
        <HDF5 dataset "CHROM": shape (5,), type "|S4">
        <HDF5 dataset "POS": shape (5,), type "<i8">
        <HDF5 dataset "DP": shape (5,), type "<i8">
        <HDF5 dataset "QD": shape (5,), type "<f8">
        <HDF5 dataset "AC": shape (5, 2), type "<i8">
        >>> import allel
        >>> callset = h5py.File('callset.h5', mode='r')
        >>> vt = allel.VariantChunkedTable(callset['/3L/variants'],
        ...                                names=['CHROM', 'POS', 'AC', 'QD', 'DP'])
        >>> vt
        <VariantChunkedTable shape=(5,) dtype=[('CHROM', 'S4'), ('POS', '<i8'), ('AC', ...
        ...

    Obtain a single row::

        >>> vt[0]
        row(CHROM=b'chr1', POS=2, AC=array([1, 2]), QD=4.5, DP=35)

    Obtain a numpy array by slicing::

        >>> vt[:]  # doctest: +ELLIPSIS
        <VariantTable shape=(5,) dtype=(numpy.record, [('CHROM', 'S4'), ('POS', '<i8'), ...
        [(b'chr1', 2, [ 1, 2], 4.5, 35) (b'chr1', 7, [ 3, 4], 6.7, 12)
         (b'chr2', 3, [ 5, 6], 1.2, 78) (b'chr2', 9, [ 7, 8], 4.4, 22)
         (b'chr3', 6, [ 9, 10], 2.8, 99)]

    Access a subset of columns::

        >>> vt[['CHROM', 'POS']]
        <VariantChunkedTable shape=(5,) dtype=[('CHROM', 'S4'), ('POS', '<i8')]
           nbytes=60 cbytes=60 cratio=1.0
           values=builtins.list>

    Note that most methods will return a chunked table, using whatever
    chunked storage is set as default (zarr table) or specified directly via
    the `storage` keyword argument. E.g.::

        >>> vt.copy()  # doctest: +ELLIPSIS
        <VariantChunkedTable shape=(5,) dtype=[('CHROM', 'S4'), ('POS', '<i8'), ('AC', ...
           nbytes=220 cbytes=... cratio=...
           values=allel.chunked.storage_zarr.ZarrTable>
        >>> vt.copy(storage='bcolzmem')  # doctest: +ELLIPSIS
        <VariantChunkedTable shape=(5,) dtype=[('CHROM', 'S4'), ('POS', '<i8'), ('AC', ...
           nbytes=220 cbytes=... cratio=...
           values=bcolz.ctable.ctable>
        >>> vt.copy(storage='hdf5mem_zlib1')  # doctest: +ELLIPSIS
        <VariantChunkedTable shape=(5,) dtype=[('CHROM', 'S4'), ('POS', '<i8'), ('AC', ...
           nbytes=220 cbytes=... cratio=...
           values=h5py._hl.files.File>

    """

    array_cls = VariantTable

    def __init__(self, data, names=None, index=None):
        super(VariantChunkedTable, self).__init__(data, names=names)
        self.index = None
        if index is not None:
            self.set_index(index)

    @property
    def n_variants(self):
        return len(self)

    def set_index(self, spec):
        if isinstance(spec, string_types):
            self.index = SortedIndex(self[spec][:], copy=False)
        elif isinstance(spec, (tuple, list)) and len(spec) == 2:
            self.index = SortedMultiIndex(self[spec[0]][:], self[spec[1]][:],
                                          copy=False)
        else:
            raise ValueError('invalid index argument, expected string or '
                             'pair of strings, found %s' % repr(spec))

    def to_vcf(self, path, rename=None, number=None, description=None,
               fill=None, blen=None, write_header=True):
        names, callset = normalize_callset(self)
        with open(path, 'w') as vcf_file:
            if write_header:
                write_vcf_header(vcf_file, names, callset, rename=rename,
                                 number=number, description=description)
            blen = _chunked.get_blen_table(self, blen)
            for i in range(0, len(self), blen):
                j = min(i+blen, len(self))
                block = self[i:j]
                write_vcf_data(vcf_file, names, block, rename=rename,
                               fill=fill)
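
# Usage sketch (illustrative only; assumes ``vt`` is a VariantChunkedTable
# with 'CHROM' and 'POS' columns as in the class docstring above).
# ``set_index`` accepts either a single column name or a pair of names:
#
#     vt.set_index('POS')              # vt.index becomes a SortedIndex
#     vt.set_index(('CHROM', 'POS'))   # vt.index becomes a SortedMultiIndex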


class AlleleCountsChunkedTable(ChunkedTableWrapper):

    def __getitem__(self, item):
        out = super(AlleleCountsChunkedTable, self).__getitem__(item)
        if isinstance(item, string_types):
            # rewrap
            out = AlleleCountsChunkedArray(out.values)
        return out

    @property
    def n_variants(self):
        return len(self)