# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import subprocess
import gzip
import numpy as np
from allel.compat import PY2, unquote_plus
def gff3_parse_attributes(attributes_string):
"""Parse a string of GFF3 attributes ('key=value' pairs delimited by ';')
and return a dictionary."""
attributes = dict()
fields = attributes_string.split(';')
for f in fields:
if '=' in f:
key, value = f.split('=')
key = unquote_plus(key).strip()
value = unquote_plus(value.strip())
attributes[key] = value
elif len(f) > 0:
# not strictly kosher
attributes[unquote_plus(f).strip()] = True
return attributes
[docs]def iter_gff3(path, attributes=None, region=None, score_fill=-1,
phase_fill=-1, attributes_fill=b'.', tabix='tabix'):
"""Iterate over records in a GFF3 file.
Parameters
----------
path : string
Path to input file.
attributes : list of strings, optional
List of columns to extract from the "attributes" field.
region : string, optional
Genome region to extract. If given, file must be position
sorted, bgzipped and tabix indexed. Tabix must also be installed
and on the system path.
score_fill : int, optional
Value to use where score field has a missing value.
phase_fill : int, optional
Value to use where phase field has a missing value.
attributes_fill : object or list of objects, optional
Value(s) to use where attribute field(s) have a missing value.
tabix : string
Tabix command.
Returns
-------
Iterator
"""
# prepare fill values for attributes
if attributes is not None:
attributes = list(attributes)
if isinstance(attributes_fill, (list, tuple)):
if len(attributes) != len(attributes_fill):
raise ValueError('number of fills does not match attributes')
else:
attributes_fill = [attributes_fill] * len(attributes)
# open input stream
if region is not None:
cmd = [tabix, path, region]
buffer = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout
elif path.endswith('.gz') or path.endswith('.bgz'):
buffer = gzip.open(path, mode='rb')
else:
buffer = open(path, mode='rb')
try:
for line in buffer:
if line[0] == b'>':
# assume begin embedded FASTA
raise StopIteration
if line[0] == b'#':
# skip comment lines
continue
vals = line.split(b'\t')
if len(vals) == 9:
# unpack for processing
fseqid, fsource, ftype, fstart, fend, fscore, fstrand, fphase, fattrs = vals
# convert numerics
fstart = int(fstart)
fend = int(fend)
if fscore == b'.':
fscore = score_fill
else:
fscore = float(fscore)
if fphase == b'.':
fphase = phase_fill
else:
fphase = int(fphase)
if not PY2:
fseqid = str(fseqid, 'ascii')
fsource = str(fsource, 'ascii')
ftype = str(ftype, 'ascii')
fstrand = str(fstrand, 'ascii')
fattrs = str(fattrs, 'ascii')
rec = (fseqid, fsource, ftype, fstart, fend, fscore, fstrand, fphase)
if attributes is not None:
dattrs = gff3_parse_attributes(fattrs)
vattrs = tuple(
dattrs.get(k, f)
for k, f in zip(attributes, attributes_fill)
)
rec += vattrs
yield rec
finally:
buffer.close()
# TODO dry docstrings
[docs]def gff3_to_recarray(path, attributes=None, region=None, score_fill=-1,
phase_fill=-1, attributes_fill=b'.', tabix='tabix', dtype=None):
"""Load data from a GFF3 into a NumPy recarray.
Parameters
----------
path : string
Path to input file.
attributes : list of strings, optional
List of columns to extract from the "attributes" field.
region : string, optional
Genome region to extract. If given, file must be position
sorted, bgzipped and tabix indexed. Tabix must also be installed
and on the system path.
score_fill : int, optional
Value to use where score field has a missing value.
phase_fill : int, optional
Value to use where phase field has a missing value.
attributes_fill : object or list of objects, optional
Value(s) to use where attribute field(s) have a missing value.
tabix : string, optional
Tabix command.
dtype : dtype, optional
Override dtype.
Returns
-------
np.recarray
"""
# read records
recs = list(iter_gff3(path, attributes=attributes, region=region,
score_fill=score_fill, phase_fill=phase_fill,
attributes_fill=attributes_fill, tabix=tabix))
if not recs:
return None
# determine dtype
if dtype is None:
dtype = [('seqid', object),
('source', object),
('type', object),
('start', int),
('end', int),
('score', float),
('strand', object),
('phase', int)]
if attributes:
for n in attributes:
dtype.append((n, object))
a = np.rec.fromrecords(recs, dtype=dtype)
return a
[docs]def gff3_to_dataframe(path, attributes=None, region=None, score_fill=-1,
phase_fill=-1, attributes_fill=b'.', tabix='tabix', **kwargs):
"""Load data from a GFF3 into a pandas DataFrame.
Parameters
----------
path : string
Path to input file.
attributes : list of strings, optional
List of columns to extract from the "attributes" field.
region : string, optional
Genome region to extract. If given, file must be position
sorted, bgzipped and tabix indexed. Tabix must also be installed
and on the system path.
score_fill : int, optional
Value to use where score field has a missing value.
phase_fill : int, optional
Value to use where phase field has a missing value.
attributes_fill : object or list of objects, optional
Value(s) to use where attribute field(s) have a missing value.
tabix : string, optional
Tabix command.
Returns
-------
pandas.DataFrame
"""
import pandas
# read records
recs = list(iter_gff3(path, attributes=attributes, region=region,
score_fill=score_fill, phase_fill=phase_fill,
attributes_fill=attributes_fill, tabix=tabix))
# load into pandas
columns = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase']
if attributes:
columns += list(attributes)
df = pandas.DataFrame.from_records(recs, columns=columns, **kwargs)
return df