Source code for genometools.expression.profile

# Copyright (c) 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module containing the `ExpProfile` class."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
_oldstr = str
from builtins import *

import hashlib
import importlib
import logging
try:
    # ``collections.Iterable`` was removed in Python 3.10.
    from collections.abc import Iterable
except ImportError:  # Python 2
    from collections import Iterable

import six
import pandas as pd
import numpy as np

from . import ExpGene, ExpGenome
matrix = importlib.import_module('.matrix', package='genometools.expression')
# "from . import matrix" does not work, due to cyclical imports

logger = logging.getLogger(__name__)


class ExpProfile(pd.Series):
    """A gene expression profile.

    This class inherits from `pandas.Series`.

    Parameters
    ----------
    x : 1-dimensional `numpy.ndarray`
        See :attr:`x` attribute.

    Keyword-only Parameters
    -----------------------
    genes : list or tuple of str
        See :attr:`genes` attribute.
    label : str
        See :attr:`label` attribute.
    gene_label : str
        Name to assign to the index. If omitted and the index has no
        name, the index is named "Genes".

    Additional Parameters
    ---------------------
    All `pandas.Series` parameters.

    Attributes
    ----------
    x : 1-dimensional `numpy.ndarray`
        The vector with expression values.
    genes : `pandas.Index`
        Alias for :attr:`pandas.Series.index`. Contains the names of the
        genes in the profile.
    label : str
        Alias for :attr:`pandas.Series.name`. The sample label.
    """

    def __init__(self, *args, **kwargs):
        # Pop every custom keyword *before* calling the base constructor:
        # `pandas.Series.__init__` rejects keywords it does not know.
        x = kwargs.pop('x', None)
        genes = kwargs.pop('genes', None)
        label = kwargs.pop('label', None)
        gene_label = kwargs.pop('gene_label', None)

        if x is not None:
            assert isinstance(x, np.ndarray) and x.ndim == 1
            kwargs['data'] = x

        if genes is not None:
            assert isinstance(genes, Iterable)

        # call base class constructor
        pd.Series.__init__(self, *args, **kwargs)

        if genes is not None:
            # set (overwrite) index with user-provided list
            self.index = genes

        if label is not None:
            # set (overwrite) series name with user-provided sample label
            self.name = label

        # an explicit gene label wins; otherwise default to "Genes"
        if gene_label is not None:
            self.index.name = gene_label
        elif self.index.name is None:
            self.index.name = 'Genes'

    def __eq__(self, other):
        """Compare with `other`.

        Two objects of the exact same type are compared as a whole and a
        single bool is returned (label, genes and values must all match).
        Any other operand is handled by the element-wise comparison of
        `pandas.Series`.
        """
        if self is other:
            return True
        elif type(self) is type(other):
            return (self.label == other.label and
                    self.index.equals(other.index) and
                    self.equals(other))
        else:
            return pd.Series.__eq__(self, other)

    def __ne__(self, other):
        """Inverse of :meth:`__eq__`.

        Only the whole-object comparison yields a bool that can be negated;
        for other operands, `__eq__` returns a Series, so the element-wise
        `pandas.Series.__ne__` is used instead of ``not``.
        """
        if self is other or type(self) is type(other):
            return not self.__eq__(other)
        return pd.Series.__ne__(self, other)

    def __repr__(self):
        return '<%s instance (label="%s", p=%d, hash="%s")>' \
                % (self.__class__.__name__, self._label_str,
                   self.p, self.hash)

    @property
    def _label_str(self):
        """The label as a string (empty string if there is no label)."""
        return str(self.label) if self.label is not None else ''

    @property
    def _constructor(self):
        # makes pandas operations return `ExpProfile` objects
        return ExpProfile

    @property
    def _constructor_expanddim(self):
        # type used by pandas when the profile is expanded to 2 dimensions
        return matrix.ExpMatrix

    @property
    def hash(self):
        """MD5 hex digest computed from the label, the genes and the values."""
        # warning: involves copying all the data
        gene_str = ','.join([str(g) for g in self.genes])
        data_str = ';'.join([self._label_str, gene_str]) + ';'
        data = data_str.encode('UTF-8') + self.x.tobytes()
        return str(hashlib.md5(data).hexdigest())

    @property
    def p(self):
        """The number of genes."""
        return self.shape[0]

    @property
    def genes(self):
        """Alias for `Series.index`."""
        return self.index

    @genes.setter
    def genes(self, gene_list):
        self.index = gene_list

    @property
    def label(self):
        """Alias for `Series.name`."""
        return self.name

    @label.setter
    def label(self, label):
        self.name = label

    @property
    def x(self):
        """Alias for `Series.values`."""
        return self.values

    @x.setter
    def x(self, x):
        # overwrite the values in place instead of rebinding the array
        self.x[:] = x

    @property
    def genome(self):
        """Get an `ExpGenome` representation of the genes in the profile."""
        genes = [ExpGene(g) for g in self.genes]
        return ExpGenome(genes)

    def sort_genes(self, inplace=False):
        """Sort the rows of the profile alphabetically by gene name.

        Parameters
        ----------
        inplace: bool, optional
            If set to True, perform the sorting in-place.

        Returns
        -------
        `ExpProfile` or None
            The sorted profile if ``inplace`` is False, otherwise None.

        Notes
        -----
        pandas 0.18.0's `Series.sort_index` method does not support the
        ``kind`` keyword, which is needed to select a stable sort algorithm.
        """
        if inplace:
            self.sort_index(inplace=True)
            return None
        # hand the sorted copy back to the caller; discarding it would
        # turn the non-inplace call into a no-op
        return self.sort_index()

    def filter_against_genome(self, genome):
        """Filter the expression profile against a genome (set of genes).

        Parameters
        ----------
        genome: `genometools.expression.ExpGenome`
            The genome to filter the genes against.

        Returns
        -------
        `ExpProfile`
            The filtered expression profile.
        """
        assert isinstance(genome, ExpGenome)

        # Explicit set intersection; the ``&`` operator on an Index no
        # longer performs a set operation in recent pandas versions.
        shared = self.index.intersection(genome.gene_names)
        return self.loc[shared]

    @classmethod
    def read_tsv(cls, path, genome=None, encoding='UTF-8'):
        """Read expression profile from a tab-delimited text file.

        Parameters
        ----------
        path: str
            The path of the text file.
        genome: `ExpGenome` object, optional
            The set of valid genes. If given, the genes in the text file will
            be filtered against this set of genes. (None)
        encoding: str, optional
            The file encoding. ("UTF-8")

        Returns
        -------
        `ExpProfile`
            The expression profile.
        """
        # checks
        assert isinstance(path, (str, _oldstr))
        if genome is not None:
            assert isinstance(genome, ExpGenome)
        assert isinstance(encoding, (str, _oldstr))

        data = pd.read_csv(path, sep='\t', index_col=0, header=0,
                           encoding=encoding)
        # Reduce a single data column to a Series. This replaces the
        # ``squeeze=True`` option of `read_csv`, which newer pandas
        # versions no longer accept.
        if data.shape[1] == 1:
            data = data.iloc[:, 0]

        e = cls(data)

        if genome is not None:
            # filter genes
            e = e.filter_against_genome(genome)

        return e

    def write_tsv(self, path, encoding='UTF-8'):
        """Write expression profile to a tab-delimited text file.

        Values are written with five decimal places, preceded by a header
        line.

        Parameters
        ----------
        path: str
            The path of the output file.
        encoding: str, optional
            The file encoding. ("UTF-8")

        Returns
        -------
        None
        """
        assert isinstance(path, (str, _oldstr))
        assert isinstance(encoding, (str, _oldstr))

        sep = '\t'
        if six.PY2:
            # on Python 2 the separator is passed as a byte string
            sep = sep.encode('UTF-8')

        self.to_csv(
            path, sep=sep, float_format='%.5f', mode='w',
            encoding=encoding, header=True
        )

        logger.info('Wrote expression profile "%s" with %d genes to "%s".',
                    self.name, self.p, path)