# Copyright (c) 2015, 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Module containing the `ExpGenome` class."""
from __future__ import (absolute_import, division,
print_function, unicode_literals)
_oldstr = str
from builtins import *
import os
import logging
import hashlib
from collections import OrderedDict, Iterable, Counter
import copy
import six
import pandas as pd
import unicodecsv as csv
import numpy as np
from .gene import ExpGene
logger = logging.getLogger(__name__)
[docs]class ExpGenome(object):
"""A complete set of genes in a gene expression analysis.
The class represents a "genome" in the form of an ordered set of genes.
This means that each gene has an index value, i.e. an integer indicating
its 0-based position in the genome.
Parameters
----------
genes : Iterable of `ExpGene` objects
See :attr:`genes` attribute.
Attributes
----------
genes : list of `ExpGene`
The genes in the genome.
Notes
-----
The implementation is very similar to the
`genometools.basic.GeneSetCollection` class. It uses ordered dictionaries
to support efficient access by gene name or index, as well as looking up
the index of specific gene.
"""
def __init__(self, genes):
assert isinstance(genes, Iterable)
self._genes = OrderedDict([g, None] for g in genes)
for g in self._genes.keys():
assert isinstance(g, ExpGene)
"""
# check for duplicate gene names
counts = Counter(g.name for g in genes)
for k, v in counts.items():
if v > 1:
raise ValueError('Cannot create ExpGenome: more than one gene '
'with name "%s".' % k)
# check for duplicate Ensembl IDs
counts = Counter(g.ensembl_id for g in self.genes)
for k, v in counts.items():
if v > 1 and k is not None:
raise ValueError('Cannot create ExpGenome: more than one gene '
'with Ensembl ID "%s"' % k)
"""
# auxiliary variables
self._genes_by_name = dict([g.name, g] for g in self._genes.keys())
self._genes_by_index = dict([i, g]
for i, g in
enumerate(self._genes.keys()))
self._gene_indices = dict([g, i]
for i, g in enumerate(self._genes.keys()))
logger.debug('Initialized ExpGenome with %d genes.', len(self))
def __repr__(self):
return '<%s object (%d genes, hash="%s")>' \
% (self.__class__.__name__, len(self), self.hash)
def __str__(self):
return '<%s object with %d genes>' \
% (self.__class__.__name__, len(self))
def __len__(self):
return len(self._genes)
def __iter__(self):
return iter(self._genes.keys())
def __getitem__(self, key):
"""Simple interface for accessing genes.
Depending on whether key is an integer or not, look up a gene
either by index, or by name.
"""
if isinstance(key, (int, np.integer)):
if key < 0 or key >= len(self):
raise ValueError('Index %d out of bounds!' % key)
return self._genes_by_index[key]
elif isinstance(key, (str, _oldstr)):
try:
return self._genes_by_name[key]
except KeyError:
raise ValueError('No gene with name "%s"!' % key)
else:
raise TypeError('Key must be int or str.')
def __eq__(self, other):
if self is other:
return True
elif type(self) is type(other):
return repr(self) == repr(other)
else:
return NotImplemented
def __ne__(self, other):
return not self.__eq__(other)
def __contains__(self, gene_or_name):
if isinstance(gene_or_name, (str, _oldstr)):
return gene_or_name in self._genes_by_name
elif isinstance(gene_or_name, ExpGene):
return gene_or_name in self.genes
else:
raise TypeError('Must be string or ExpGene instance.')
@property
def hash(self):
"""Returns an MD5 hash value for the genome."""
data_str = ';'.join(repr(g) for g in self.genes)
data = data_str.encode('UTF-8')
return str(hashlib.md5(data).hexdigest())
@property
def genes(self):
"""Returns a list with all genes."""
return list(self._genes.keys())
@property
def gene_names(self):
"""Returns a list of all gene names."""
return [g.name for g in self._genes.keys()]
@property
def gene_set(self):
"""Returns a set of all genes."""
return set(self._genes.keys())
@classmethod
[docs] def from_gene_names(cls, names):
"""Generate a genome from a list of gene names.
Parameters
----------
names : Iterable of str
The list of gene names.
Returns
-------
ExpGenome
The genome.
"""
assert isinstance(names, Iterable)
genome = cls([ExpGene(n) for n in names])
return genome
[docs] def index(self, gene_or_name):
"""Returns the index of a given gene.
The index is 0-based, so the first gene in the genome has the index 0,
and the last one has index ``len(genome) - 1``.
Parameters
----------
gene_or_name : str
The gene or its name (symbol).
Returns
-------
int
The gene index.
"""
if isinstance(gene_or_name, (str, _oldstr)):
# name specified
try:
gene = self._genes_by_name[gene_or_name]
except KeyError:
raise ValueError('No gene with name "%s"!' % gene_or_name)
elif isinstance(gene_or_name, ExpGene):
gene = gene_or_name
else:
raise TypeError('Must be a gene name or an ExpGene instance.')
return self._gene_indices[gene]
@classmethod
[docs] def read_tsv(cls, path, encoding='UTF-8'):
"""Read genes from tab-delimited text file.
Parameters
----------
path : str
The path of the text file.
encoding : str, optional
The file encoding. ('UTF-8')
Returns
-------
None
"""
df = pd.read_csv(path, sep='\t', na_values=['nan', 'NaN'], keep_default_na=False)
df.columns = [c.lower() for c in df.columns]
genes = []
for _, row in df.iterrows():
genes.append(ExpGene.from_dict(row.to_dict()))
return cls(genes)
[docs] def write_tsv(self, path, encoding='UTF-8', overwrite=False):
"""Write genes to tab-delimited text file in alphabetical order.
Parameters
----------
path : str
The path of the output file.
encoding: str, optional
The file encoding. ["UTF-8"]
Returns
-------
None
"""
# create pandas data frame from the genes
data = OrderedDict([i, g.to_dict()]
for i, g in enumerate(self._genes.keys()))
df = pd.DataFrame.from_dict(data, orient='index')
# write to tab-delimited text file
sep = '\t'
if six.PY2:
sep = sep.encode('UTF-8')
df.to_csv(path, sep=sep, index=False)
logger.info('Wrote %d genes to file "%s".', len(self), path)