# Source code for genometools.expression.profile

# Copyright (c) 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module containing the `ExpProfile` class."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
_oldstr = str
from builtins import *

import logging
import importlib
import hashlib
try:
    # the ABCs live in collections.abc on Python 3 (required from 3.10 on)
    from collections.abc import Iterable
except ImportError:  # Python 2
    from collections import Iterable

import six
import pandas as pd
import numpy as np

from . import ExpGene, ExpGenome
matrix = importlib.import_module('.matrix', package='genometools.expression')
# "from . import matrix" does not work, due to cyclical imports

logger = logging.getLogger(__name__)


class ExpProfile(pd.Series):
    """A gene expression profile.

    This class inherits from `pandas.Series`.

    Parameters
    ----------
    x : 1-dimensional `numpy.ndarray`
        See :attr:`x` attribute. Passed on to `pandas.Series` as ``data``.

    Keyword-only Parameters
    -----------------------
    genes : list or tuple of str
        See :attr:`genes` attribute. If given, overwrites the index.
    label : str
        See :attr:`label` attribute. If given, overwrites the series name.
    gene_label : str
        The name to assign to the index. If not given, an index without a
        name is named "Genes".

    Additional Parameters
    ---------------------
    All `pandas.Series` parameters.

    Attributes
    ----------
    x : 1-dimensional `numpy.ndarray`
        The vector with expression values.
    genes : `pandas.Index`
        Alias for :attr:`pandas.Series.index`. Contains the names of the genes
        in the profile.
    label : str
        Alias for :attr:`pandas.Series.name`. The sample label.
    """
    def __init__(self, *args, **kwargs):
        # Remove all ExpProfile-specific keywords *before* calling the base
        # class constructor; `pandas.Series` raises a TypeError for keyword
        # arguments it does not know.
        x = kwargs.pop('x', None)
        genes = kwargs.pop('genes', None)
        label = kwargs.pop('label', None)
        gene_label = kwargs.pop('gene_label', None)

        if x is not None:
            assert isinstance(x, np.ndarray) and x.ndim == 1
            kwargs['data'] = x

        if genes is not None:
            assert isinstance(genes, Iterable)

        # call base class constructor
        pd.Series.__init__(self, *args, **kwargs)

        if genes is not None:
            # set (overwrite) index with user-provided list
            self.index = genes

        if label is not None:
            # set (overwrite) series name with user-provided sample label
            self.name = label

        # name the index; fall back to "Genes" only if it has no name yet
        if gene_label is not None:
            self.index.name = gene_label
        elif self.index.name is None:
            self.index.name = 'Genes'

    def __eq__(self, other):
        """Compare with another object.

        Two `ExpProfile` objects are compared as a whole (label, genes and
        values) and yield a single bool. Any other operand is handed to
        `pandas.Series.__eq__`.
        """
        if self is other:
            return True
        elif type(self) is type(other):
            return (self.label == other.label and
                    self.index.equals(other.index) and
                    self.equals(other))
        else:
            return pd.Series.__eq__(self, other)

    def __ne__(self, other):
        """Inverse of :meth:`__eq__`.

        Only the whole-object comparison produces a bool that can be
        negated; for other operands the comparison is delegated to
        `pandas.Series.__ne__` (negating the Series returned by `__eq__`
        would raise a ValueError).
        """
        if self is other:
            return False
        elif type(self) is type(other):
            return not self.__eq__(other)
        else:
            return pd.Series.__ne__(self, other)

    def __repr__(self):
        return '<%s instance (label="%s", p=%d, hash="%s")>' \
               % (self.__class__.__name__, self._label_str,
                  self.p, self.hash)

    @property
    def _label_str(self):
        """The label as a string (empty string if there is no label)."""
        return str(self.label) if self.label is not None else ''

    @property
    def _constructor(self):
        # makes pandas operations return `ExpProfile` objects
        return ExpProfile

    @property
    def _constructor_expanddim(self):
        # `matrix` is imported lazily at module level (cyclical imports)
        return matrix.ExpMatrix

    @property
    def hash(self):
        """MD5 hex digest computed from the label, the genes and the values."""
        # warning: involves copying all the data
        gene_str = ','.join([str(g) for g in self.genes])
        data_str = ';'.join([self._label_str, gene_str]) + ';'
        data = data_str.encode('UTF-8') + self.x.tobytes()
        return str(hashlib.md5(data).hexdigest())

    @property
    def p(self):
        """The number of genes."""
        return self.shape[0]

    @property
    def genes(self):
        """Alias for `Series.index`."""
        return self.index

    @genes.setter
    def genes(self, gene_list):
        self.index = gene_list

    @property
    def label(self):
        """Alias for `Series.name`."""
        return self.name

    @label.setter
    def label(self, label):
        self.name = label

    @property
    def x(self):
        """Alias for `Series.values`."""
        return self.values

    @x.setter
    def x(self, x):
        # writes into the existing value array instead of replacing it
        self.x[:] = x

    @property
    def genome(self):
        """Get an `ExpGenome` representation of the genes in the profile."""
        genes = [ExpGene(g) for g in self.genes]
        return ExpGenome(genes)

    def sort_genes(self, inplace=False):
        """Sort the rows of the profile alphabetically by gene name.

        Parameters
        ----------
        inplace: bool, optional
            If set to True, perform the sorting in-place. (False)

        Returns
        -------
        `ExpProfile` or None
            The sorted profile, or None if ``inplace`` is True.

        Notes
        -----
        pandas 0.18.0's `Series.sort_index` method does not support the
        ``kind`` keyword, which is needed to select a stable sort algorithm.
        """
        # `sort_index` returns the sorted copy (or None when sorting
        # in-place); hand it back so that the default call has an effect.
        return self.sort_index(inplace=inplace)

    def filter_against_genome(self, genome):
        """Filter the expression profile against a genome (set of genes).

        Parameters
        ----------
        genome: `genometools.expression.ExpGenome`
            The genome to filter the genes against.

        Returns
        -------
        `ExpProfile`
            The filtered expression profile.
        """
        assert isinstance(genome, ExpGenome)

        # `Index.intersection` is the explicit form of the set operation
        # that `index & other` performed in older pandas versions.
        # NOTE(review): assumes `genome.gene_names` is list-like -- confirm.
        shared = self.index.intersection(genome.gene_names)
        return self.loc[shared]

    @classmethod
    def read_tsv(cls, path, genome=None, encoding='UTF-8'):
        """Read expression profile from a tab-delimited text file.

        Parameters
        ----------
        path: str
            The path of the text file.
        genome: `ExpGenome` object, optional
            The set of valid genes. If given, the genes in the text file will
            be filtered against this set of genes. (None)
        encoding: str, optional
            The file encoding. ("UTF-8")

        Returns
        -------
        `ExpProfile`
            The expression profile.
        """
        # checks
        assert isinstance(path, (str, _oldstr))
        if genome is not None:
            assert isinstance(genome, ExpGenome)
        assert isinstance(encoding, (str, _oldstr))

        data = pd.read_csv(path, sep='\t', index_col=0, header=0,
                           encoding=encoding)
        if data.shape[1] == 1:
            # Equivalent to the "squeeze=True" option of `read_csv` (no
            # longer available in recent pandas versions): a table with a
            # single data column is reduced to a series.
            data = data.iloc[:, 0]
        e = cls(data)

        if genome is not None:
            # filter genes
            e = e.filter_against_genome(genome)

        return e

    def write_tsv(self, path, encoding='UTF-8'):
        """Write expression profile to a tab-delimited text file.

        Values are written with five decimal places, preceded by a header
        row.

        Parameters
        ----------
        path: str
            The path of the output file.
        encoding: str, optional
            The file encoding. ("UTF-8")

        Returns
        -------
        None
        """
        assert isinstance(path, (str, _oldstr))
        assert isinstance(encoding, (str, _oldstr))

        sep = '\t'
        if six.PY2:
            # under Python 2 the separator is passed as a byte string
            sep = sep.encode('UTF-8')

        self.to_csv(
            path, sep=sep, float_format='%.5f', mode='w',
            encoding=encoding, header=True
        )

        logger.info('Wrote expression profile "%s" with %d genes to "%s".',
                    self.name, self.p, path)