Source code for genometools.expression.matrix

# Copyright (c) 2015, 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module containing the `ExpMatrix` class."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
_oldstr = str
from builtins import *

import logging
import importlib
import hashlib
from collections import Iterable

import pandas as pd
import numpy as np
import unicodecsv as csv
import six

# from .. import misc
from . import ExpGene, ExpGenome
profile = importlib.import_module('.profile', package='genometools.expression')
# - "import profile" is not possible, since a "profile" module exists
#    in the standard library
# - "from . import profile" fails due to cyclical imports

logger = logging.getLogger(__name__)


[docs]class ExpMatrix(pd.DataFrame):
    """A gene expression matrix.

    This class inherits from `pandas.DataFrame`.

    Parameters
    ----------
    X : 2-dimensional `numpy.ndarray`
        See :attr:`X` attribute.

    Keyword-only Parameters
    -----------------------
    genes : list or tuple of str
        See :attr:`genes` attribute.
    samples : list or tuple of str
        See :attr:`samples` attribute.

    Additional Parameters
    ---------------------
    All `pandas.DataFrame` parameters.

    Attributes
    ----------
    genes : tuple of str
        The names of the genes (rows) in the matrix.
    samples : tuple of str
        The names of the samples (columns) in the matrix.
    X : 2-dimensional `numpy.ndarray`
        The matrix of expression values.
    """
    def __init__(self, *args, **kwargs):
        
        # check if user provided "X" keyword argument
        X = kwargs.pop('X', None)
        if X is not None:
            assert isinstance(X, np.ndarray) and X.ndim == 2
            kwargs['data'] = X

        # check if user provided "genes" keyword argument
        genes = kwargs.pop('genes', None)
        if genes is not None:
            assert isinstance(genes, Iterable)

        # check if user provided "samples" keyword argument
        samples = kwargs.pop('samples', None)
        if samples is not None:
            assert isinstance(samples, Iterable)

        # call base class constructor
        pd.DataFrame.__init__(self, *args, **kwargs)

        if genes is not None:
            self.index = genes

        if samples is not None:
            self.columns = samples

        # set default index name to "Genes"
        gene_label = kwargs.pop('gene_label', None)
        if gene_label is not None:
            self.index.name = gene_label
        elif self.index.name is None:
            self.index.name = 'Genes'

        # set default column name to "Samples"
        sample_label = kwargs.pop('sample_label', None)
        if sample_label is not None:
            self.columns.name = sample_label
        elif self.columns.name is None:
            self.columns.name = 'Samples'


    def __eq__(self, other):
        if self is other:
            return True
        elif type(self) is type(other):
            return (self.index.equals(other.index) and \
                    self.columns.equals(other.columns) and \
                    self.equals(other))
        else:
            return pd.DataFrame.__eq__(self, other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return '<%s instance (p=%d, n=%d, hash="%s">' \
               % (self.__class__.__name__, self.p, self.n, self.hash)

    #def __str__(self):
    #    return '<%s object with p=%d genes and n=%d samples>' \
    #           % (self.__class__.__name__, self.p, self.n)

    @property
    def hash(self):
        # warning: involves copying all the data
        gene_str = ','.join(str(s) for s in self.genes)
        sample_str = ','.join(str(s) for s in self.samples)
        data_str = ';'.join([gene_str, sample_str]) + ';'
        data = data_str.encode('UTF-8') + self.X.tobytes()
        return str(hashlib.md5(data).hexdigest())

    @property
    def _constructor(self):
        return ExpMatrix
    
    @property
    def _constructor_sliced(self):
        return profile.ExpProfile

    @property
    def p(self):
        """The number of genes."""
        return self.shape[0]

    @property
    def n(self):
        """The number of samples."""
        return self.shape[1]

    @property
    def genes(self):
        """Alias for `DataFrame.index`."""
        # return tuple(str(g) for g in self.index)
        return self.index

    @genes.setter
    def genes(self, gene_list):
        self.index = gene_list

    @property
    def samples(self):
        """Alias for `DataFrame.columns`."""
        # return tuple(str(s) for s in self.columns)
        return self.columns

    @samples.setter
    def samples(self, sample_list):
        self.columns = sample_list

    @property
    def X(self):
        """Alias for `DataFrame.values`."""
        return self.values

    @property
    def genome(self):
        """Get an `ExpGenome` representation of the genes in the matrix."""
        return ExpGenome.from_gene_names(self.genes.tolist())


    def filter_variance(self, top):
        from .filter import filter_variance
        if top > self.p:
            raise ValueError('filter_variance() called with top=%d, but '
                             'there are only %d genes in the matrix.'
                             %(top, self.p))
        return filter_variance(self, top)
        

[docs]    def get_heatmap(self, highlight_genes=None, highlight_samples=None,
                          highlight_color=None, **kwargs):
        """Generate a heatmap (`ExpHeatmap`) of the matrix.

        See :class:`ExpHeatmap` constructor for keyword arguments.

        Parameters
        ----------
        highlight_genes : list of str
            List of genes to highlight
        highlight_color : str
            Color to use for highlighting

        Returns
        -------
        `ExpHeatmap`
            The heatmap.
        """
        from .visualize import ExpHeatmap
        from .visualize import HeatmapGeneAnnotation
        from .visualize import HeatmapSampleAnnotation

        if highlight_genes is not None:
            assert isinstance(highlight_genes, Iterable)
        if highlight_samples is not None:
            assert isinstance(highlight_genes, Iterable)
        if highlight_color is not None:
            assert isinstance(highlight_color, (str, _oldstr))

        if highlight_color is None:
            highlight_color = 'blue'

        if highlight_genes is None:
            highlight_genes = []

        if highlight_samples is None:
            highlight_samples = []

        gene_annotations = kwargs.pop('gene_annotations', [])
        for g in highlight_genes:
            gene_annotations.append(
                HeatmapGeneAnnotation(g, highlight_color, label=g))

        sample_annotations = kwargs.pop('sample_annotations', [])
        for s in highlight_samples:
            sample_annotations.append(
                HeatmapSampleAnnotation(s, highlight_color, label=s)
            )

        return ExpHeatmap(self,
                          gene_annotations=gene_annotations,
                          sample_annotations=sample_annotations,
                          **kwargs)


[docs]    def get_figure(self, heatmap_kw=None, **kwargs):
        """Generate a plotly figure showing the matrix as a heatmap.

        This is a shortcut for ``ExpMatrix.get_heatmap(...).get_figure(...)``.

        See :func:`ExpHeatmap.get_figure` for keyword arguments.

        Parameters
        ----------
        heatmap_kw : dict or None
            If not None, dictionary containing keyword arguments to be passed
            to the `ExpHeatmap` constructor.

        Returns
        -------
        `plotly.graph_objs.Figure`
            The plotly figure.
        """
        if heatmap_kw is not None:
            assert isinstance(heatmap_kw, dict)

        if heatmap_kw is None:
            heatmap_kw = {}

        return self.get_heatmap(**heatmap_kw).get_figure(**kwargs)


[docs]    def sort_genes(self, stable=True, inplace=False, ascending=True):
        """Sort the rows of the matrix alphabetically by gene name.

        Parameters
        ----------
        stable: bool, optional
            Whether to use a stable sorting algorithm. [True]
        inplace: bool, optional
            Whether to perform the operation in place.[False]
        ascending: bool, optional
            Whether to sort in ascending order [True]
        
        Returns
        -------
        `ExpMatrix`
            The sorted matrix.
        """
        kind = 'quicksort'
        if stable:
            kind = 'mergesort'
        return self.sort_index(kind=kind, inplace=inplace, ascending=ascending)

[docs]    def sort_samples(self, stable=True, inplace=False, ascending=True):
        """Sort the columns of the matrix alphabetically by sample name.

        Parameters
        ----------
        stable: bool, optional
            Whether to use a stable sorting algorithm. [True]
        inplace: bool, optional
            Whether to perform the operation in place.[False]
        ascending: bool, optional
            Whether to sort in ascending order [True]

        Returns
        -------
        `ExpMatrix`
            The sorted matrix.
        """
        kind = 'quicksort'
        if stable:
            kind = 'mergesort'
        return self.sort_index(axis=1, kind=kind, inplace=inplace,
                               ascending=ascending)

[docs]    def center_genes(self, use_median=False, inplace=False):
        """Center the expression of each gene (row)."""
        if use_median:
            X = self.X - \
                np.tile(np.median(self.X, axis=1), (self.n, 1)).T
        else:
            X = self.X - \
                np.tile(np.mean(self.X, axis=1), (self.n, 1)).T

        if inplace:
            self.X[:,:] = X
            matrix = self
        else:
            matrix = ExpMatrix(genes=self.genes, samples=self.samples,
                               X=X)
        return matrix

[docs]    def standardize_genes(self, inplace=False):
        """Standardize the expression of each gene (row)."""
        matrix = self.center_genes(inplace=inplace)
        matrix.X[:,:] = matrix.X / \
            np.tile(np.std(matrix.X, axis=1, ddof=1), (matrix.n, 1)).T
        return matrix

[docs]    def filter_against_genome(self, genome, inplace=False):
        """Filter the expression matrix against a _genome (set of genes).

        Parameters
        ----------
        genome: `genometools.expression.ExpGenome`
            The genome to filter the genes against.
        inplace: bool, optional
            Whether to perform the operation in-place.

        Returns
        -------
        ExpMatrix
            The filtered expression matrix.
        """
        assert isinstance(genome, ExpGenome)

        return self.drop(set(self.genes) - set(genome.gene_names),
                         inplace=inplace)

    @property
    def sample_correlations(self):
        """Returns an `ExpMatrix` containing all pairwise sample correlations.

        Returns
        -------
        `ExpMatrix`
            The sample correlation matrix.

        """
        C = np.corrcoef(self.X.T)
        corr_matrix = ExpMatrix(genes=self.samples, samples=self.samples, X=C)
        return corr_matrix

    @classmethod
[docs]    def read_tsv(cls, path, genome=None, encoding='UTF-8'):
        """Read expression matrix from a tab-delimited text file.

        Parameters
        ----------
        path: str
            The path of the text file.
        genome: `ExpGenome` object, optional
            The set of valid genes. If given, the genes in the text file will
            be filtered against this set of genes. (None)
        encoding: str, optional
            The file encoding. ("UTF-8")

        Returns
        -------
        `ExpMatrix`
            The expression matrix.
        """
        # checks
        assert isinstance(path, (str, _oldstr))
        if genome is not None:
            assert isinstance(genome, ExpGenome)
        assert isinstance(encoding, (str, _oldstr))

        # use pd.read_csv to parse the tsv file into a DataFrame
        matrix = cls(pd.read_csv(path, sep='\t', index_col=0, header=0,
                                 encoding=encoding))

        # parse index column separately
        # (this seems to be the only way we can prevent pandas from converting
        #  "nan" or "NaN" to floats in the index)
        ind = pd.read_csv(path, sep='\t', usecols=[0, ], header=0,
                          encoding=encoding, na_filter=False)

        matrix.index = ind.iloc[:, 0]

        if genome is not None:
            # filter genes
            matrix = matrix.filter_against_genome(genome)

        return matrix

[docs]    def write_tsv(self, path, encoding='UTF-8'):
        """Write expression matrix to a tab-delimited text file.

        Parameters
        ----------
        path: str
            The path of the output file.
        encoding: str, optional
            The file encoding. ("UTF-8")

        Returns
        -------
        None
        """
        assert isinstance(path, (str, _oldstr))
        assert isinstance(encoding, (str, _oldstr))

        # sep = str('\t')
        sep = '\t'
        if six.PY2:
            sep = sep.encode('UTF-8')

        self.to_csv(
            path, sep=sep, float_format='%.5f', mode='w',
            encoding=encoding, quoting=csv.QUOTE_NONE
        )

        logger.info('Wrote %d x %d expression matrix to "%s".',
                    self.p, self.n, path)