Source code for allel.stats.decomposition

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import numpy as np
import scipy.linalg


from allel.stats.preprocessing import get_scaler


def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2):
    """Perform principal components analysis of genotype data, via singular
    value decomposition.

    Parameters
    ----------
    gn : array_like, float, shape (n_variants, n_samples)
        Genotypes at biallelic variants, coded as the number of alternate
        alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    n_components : int, optional
        Number of components to keep.
    copy : bool, optional
        If False, data passed to fit are overwritten.
    scaler : {'patterson', 'standard', None}
        Scaling method; 'patterson' applies the method of Patterson et al
        2006; 'standard' scales to unit variance; None centers the data only.
    ploidy : int, optional
        Sample ploidy, only relevant if 'patterson' scaler is used.

    Returns
    -------
    coords : ndarray, float, shape (n_samples, n_components)
        Transformed coordinates for the samples.
    model : GenotypePCA
        Model instance containing the variance ratio explained and the stored
        components (a.k.a. loadings). Can be used to project further data
        into the same principal components space via the transform() method.

    Notes
    -----
    Genotype data should be filtered prior to using this function to remove
    variants in linkage disequilibrium.

    See Also
    --------
    randomized_pca, allel.stats.ld.locate_unlinked

    """

    # set up the model
    model = GenotypePCA(n_components, copy=copy, scaler=scaler, ploidy=ploidy)

    # fit the model and project the input data onto the new dimensions
    coords = model.fit_transform(gn)

    return coords, model

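A minimal usage sketch (illustrative only, not part of the module source): a toy
array of alternate allele counts, coded as described in the docstring above, is
passed to pca() and per-sample coordinates come back together with the fitted model.
The array contents and shapes are hypothetical.

    >>> import numpy as np
    >>> from allel.stats.decomposition import pca
    >>> gn = np.array([[0, 1, 2, 1],
    ...                [2, 1, 0, 1],
    ...                [1, 0, 1, 2]], dtype='f4')  # 3 variants x 4 samples
    >>> coords, model = pca(gn, n_components=2)
    >>> coords.shape                               # one row per sample
    (4, 2)
    >>> model.explained_variance_ratio_.shape      # one entry per retained component
    (2,)
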
class GenotypePCA(object):

    def __init__(self, n_components=10, copy=True, scaler='patterson',
                 ploidy=2):
        self.n_components = n_components
        self.copy = copy
        self.scaler = scaler
        self.scaler_ = get_scaler(scaler, copy, ploidy)

    def fit(self, gn):
        self._fit(gn)
        return self

    def fit_transform(self, gn):
        u, s, v = self._fit(gn)
        u = u[:, :self.n_components]
        u *= s[:self.n_components]
        return u

    def _fit(self, gn):

        # apply scaling
        gn = self.scaler_.fit(gn).transform(gn)

        # transpose for svd
        # TODO eliminate need for transposition
        x = gn.T
        n_samples, n_features = x.shape

        # singular value decomposition
        u, s, v = scipy.linalg.svd(x, full_matrices=False)

        # calculate explained variance
        explained_variance_ = (s ** 2) / n_samples
        explained_variance_ratio_ = (explained_variance_ /
                                     explained_variance_.sum())

        # store variables
        n_components = self.n_components
        self.components_ = v[:n_components]
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]

        return u, s, v

    def transform(self, gn, copy=None):
        if not hasattr(self, 'components_'):
            raise ValueError('model has not been fitted')

        # scaling
        gn = self.scaler_.transform(gn, copy=copy)

        # transpose for transformation
        # TODO eliminate need for transposition
        x = gn.T

        # apply transformation
        x_transformed = np.dot(x, self.components_.T)

        return x_transformed

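The class can also be used directly when further samples, genotyped at the same
variants, need to be projected into an already fitted components space via
transform(). A sketch reusing the toy gn array from the example above; gn_new is a
hypothetical array holding two additional samples at the same three variants.

    >>> from allel.stats.decomposition import GenotypePCA
    >>> gn_new = np.array([[2, 0],
    ...                    [0, 2],
    ...                    [1, 1]], dtype='f4')    # same variants, 2 new samples
    >>> model = GenotypePCA(n_components=2).fit(gn)
    >>> new_coords = model.transform(gn_new)       # project into the fitted space
    >>> new_coords.shape
    (2, 2)
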
def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
                   random_state=None, scaler='patterson', ploidy=2):
    """Perform principal components analysis of genotype data, via an
    approximate truncated singular value decomposition using randomization
    to speed up the computation.

    Parameters
    ----------
    gn : array_like, float, shape (n_variants, n_samples)
        Genotypes at biallelic variants, coded as the number of alternate
        alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    n_components : int, optional
        Number of components to keep.
    copy : bool, optional
        If False, data passed to fit are overwritten.
    iterated_power : int, optional
        Number of iterations for the power method.
    random_state : int or RandomState instance or None (default)
        Pseudo Random Number generator seed control. If None, use the
        numpy.random singleton.
    scaler : {'patterson', 'standard', None}
        Scaling method; 'patterson' applies the method of Patterson et al
        2006; 'standard' scales to unit variance; None centers the data only.
    ploidy : int, optional
        Sample ploidy, only relevant if 'patterson' scaler is used.

    Returns
    -------
    coords : ndarray, float, shape (n_samples, n_components)
        Transformed coordinates for the samples.
    model : GenotypeRandomizedPCA
        Model instance containing the variance ratio explained and the stored
        components (a.k.a. loadings). Can be used to project further data
        into the same principal components space via the transform() method.

    Notes
    -----
    Genotype data should be filtered prior to using this function to remove
    variants in linkage disequilibrium.

    Based on the :class:`sklearn.decomposition.RandomizedPCA` implementation.

    See Also
    --------
    pca, allel.stats.ld.locate_unlinked

    """

    # set up the model
    model = GenotypeRandomizedPCA(n_components, copy=copy,
                                  iterated_power=iterated_power,
                                  random_state=random_state, scaler=scaler,
                                  ploidy=ploidy)

    # fit the model and project the input data onto the new dimensions
    coords = model.fit_transform(gn)

    return coords, model

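The randomized variant takes the same genotype input and additionally accepts
iterated_power and random_state; it requires scikit-learn. A sketch reusing the toy
gn array defined above, with random_state fixed only so that repeated runs give the
same result.

    >>> from allel.stats.decomposition import randomized_pca
    >>> coords, model = randomized_pca(gn, n_components=2, random_state=42)
    >>> coords.shape
    (4, 2)
    >>> model.explained_variance_ratio_.shape
    (2,)
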
class GenotypeRandomizedPCA(object):

    def __init__(self, n_components=10, copy=True, iterated_power=3,
                 random_state=None, scaler='patterson', ploidy=2):
        self.n_components = n_components
        self.copy = copy
        self.iterated_power = iterated_power
        self.random_state = random_state
        self.scaler = scaler
        self.scaler_ = get_scaler(scaler, copy, ploidy)

    def fit(self, gn):
        self._fit(gn)
        return self

    def fit_transform(self, gn):
        u, s, v = self._fit(gn)
        u *= s
        return u

    def _fit(self, gn):
        from sklearn.utils.validation import check_random_state
        from sklearn.utils.extmath import randomized_svd

        # apply scaling
        gn = self.scaler_.fit(gn).transform(gn)

        # transpose for svd
        # TODO eliminate need for transposition
        x = gn.T
        n_samples, n_features = x.shape

        # intermediates
        random_state = check_random_state(self.random_state)
        n_components = self.n_components

        # singular value decomposition
        u, s, v = randomized_svd(x, n_components,
                                 n_iter=self.iterated_power,
                                 random_state=random_state)

        # calculate explained variance
        self.explained_variance_ = exp_var = (s ** 2) / n_samples
        full_var = np.var(x, axis=0).sum()
        self.explained_variance_ratio_ = exp_var / full_var

        # store components
        self.components_ = v

        return u, s, v

    def transform(self, gn, copy=None):
        if not hasattr(self, 'components_'):
            raise ValueError('model has not been fitted')

        # scaling
        gn = self.scaler_.transform(gn, copy=copy)

        # transpose for transformation
        # TODO eliminate need for transposition
        x = gn.T

        # apply transformation
        x_transformed = np.dot(x, self.components_.T)

        return x_transformed
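
One difference worth noting when reading the two classes side by side: GenotypePCA
normalises explained_variance_ratio_ by the summed variance of all components from
the full SVD, whereas GenotypeRandomizedPCA only ever computes n_components singular
values and therefore normalises by the total variance of the scaled, transposed
data. The class can also be used directly, mirroring GenotypePCA; a sketch reusing
the toy gn and gn_new arrays from the examples above.

    >>> from allel.stats.decomposition import GenotypeRandomizedPCA
    >>> model = GenotypeRandomizedPCA(n_components=2, random_state=42).fit(gn)
    >>> new_coords = model.transform(gn_new)   # project additional samples
    >>> new_coords.shape
    (2, 2)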