Source code for genometools.expression.genome

# Copyright (c) 2015, 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module containing the `ExpGenome` class."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
_oldstr = str
from builtins import *

import os
import logging
import hashlib
from collections import OrderedDict, Iterable, Counter
import copy

import six
import pandas as pd
import unicodecsv as csv
import numpy as np

from .gene import ExpGene

logger = logging.getLogger(__name__)


[docs]class ExpGenome(object): """A complete set of genes in a gene expression analysis. The class represents a "genome" in the form of an ordered set of genes. This means that each gene has an index value, i.e. an integer indicating its 0-based position in the genome. Parameters ---------- genes : Iterable of `ExpGene` objects See :attr:`genes` attribute. Attributes ---------- genes : list of `ExpGene` The genes in the genome. Notes ----- The implementation is very similar to the `genometools.basic.GeneSetCollection` class. It uses ordered dictionaries to support efficient access by gene name or index, as well as looking up the index of specific gene. """ def __init__(self, genes): assert isinstance(genes, Iterable) self._genes = OrderedDict([g, None] for g in genes) for g in self._genes.keys(): assert isinstance(g, ExpGene) """ # check for duplicate gene names counts = Counter(g.name for g in genes) for k, v in counts.items(): if v > 1: raise ValueError('Cannot create ExpGenome: more than one gene ' 'with name "%s".' % k) # check for duplicate Ensembl IDs counts = Counter(g.ensembl_id for g in self.genes) for k, v in counts.items(): if v > 1 and k is not None: raise ValueError('Cannot create ExpGenome: more than one gene ' 'with Ensembl ID "%s"' % k) """ # auxiliary variables self._genes_by_name = dict([g.name, g] for g in self._genes.keys()) self._genes_by_index = dict([i, g] for i, g in enumerate(self._genes.keys())) self._gene_indices = dict([g, i] for i, g in enumerate(self._genes.keys())) logger.debug('Initialized ExpGenome with %d genes.', len(self)) def __repr__(self): return '<%s object (%d genes, hash="%s")>' \ % (self.__class__.__name__, len(self), self.hash) def __str__(self): return '<%s object with %d genes>' \ % (self.__class__.__name__, len(self)) def __len__(self): return len(self._genes) def __iter__(self): return iter(self._genes.keys()) def __getitem__(self, key): """Simple interface for accessing genes. Depending on whether key is an integer or not, look up a gene either by index, or by name. """ if isinstance(key, (int, np.integer)): if key < 0 or key >= len(self): raise ValueError('Index %d out of bounds!' % key) return self._genes_by_index[key] elif isinstance(key, (str, _oldstr)): try: return self._genes_by_name[key] except KeyError: raise ValueError('No gene with name "%s"!' % key) else: raise TypeError('Key must be int or str.') def __eq__(self, other): if self is other: return True elif type(self) is type(other): return repr(self) == repr(other) else: return NotImplemented def __ne__(self, other): return not self.__eq__(other) def __contains__(self, gene_or_name): if isinstance(gene_or_name, (str, _oldstr)): return gene_or_name in self._genes_by_name elif isinstance(gene_or_name, ExpGene): return gene_or_name in self.genes else: raise TypeError('Must be string or ExpGene instance.') @property def hash(self): """Returns an MD5 hash value for the genome.""" data_str = ';'.join(repr(g) for g in self.genes) data = data_str.encode('UTF-8') return str(hashlib.md5(data).hexdigest()) @property def genes(self): """Returns a list with all genes.""" return list(self._genes.keys()) @property def gene_names(self): """Returns a list of all gene names.""" return [g.name for g in self._genes.keys()] @property def gene_set(self): """Returns a set of all genes.""" return set(self._genes.keys()) @classmethod
[docs] def from_gene_names(cls, names): """Generate a genome from a list of gene names. Parameters ---------- names : Iterable of str The list of gene names. Returns ------- ExpGenome The genome. """ assert isinstance(names, Iterable) genome = cls([ExpGene(n) for n in names]) return genome
[docs] def index(self, gene_or_name): """Returns the index of a given gene. The index is 0-based, so the first gene in the genome has the index 0, and the last one has index ``len(genome) - 1``. Parameters ---------- gene_or_name : str The gene or its name (symbol). Returns ------- int The gene index. """ if isinstance(gene_or_name, (str, _oldstr)): # name specified try: gene = self._genes_by_name[gene_or_name] except KeyError: raise ValueError('No gene with name "%s"!' % gene_or_name) elif isinstance(gene_or_name, ExpGene): gene = gene_or_name else: raise TypeError('Must be a gene name or an ExpGene instance.') return self._gene_indices[gene]
@classmethod
[docs] def read_tsv(cls, path, encoding='UTF-8'): """Read genes from tab-delimited text file. Parameters ---------- path : str The path of the text file. encoding : str, optional The file encoding. ('UTF-8') Returns ------- None """ df = pd.read_csv(path, sep='\t', na_values=['nan', 'NaN'], keep_default_na=False) df.columns = [c.lower() for c in df.columns] genes = [] for _, row in df.iterrows(): genes.append(ExpGene.from_dict(row.to_dict())) return cls(genes)
[docs] def write_tsv(self, path, encoding='UTF-8', overwrite=False): """Write genes to tab-delimited text file in alphabetical order. Parameters ---------- path : str The path of the output file. encoding: str, optional The file encoding. ["UTF-8"] Returns ------- None """ # create pandas data frame from the genes data = OrderedDict([i, g.to_dict()] for i, g in enumerate(self._genes.keys())) df = pd.DataFrame.from_dict(data, orient='index') # write to tab-delimited text file sep = '\t' if six.PY2: sep = sep.encode('UTF-8') df.to_csv(path, sep=sep, index=False) logger.info('Wrote %d genes to file "%s".', len(self), path)