Source code for genometools.enrichment.analysis

# Copyright (c) 2015, 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module containing the `GeneSetEnrichmentAnalysis` class."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
_oldstr = str
from builtins import *

import logging
from math import ceil
import copy
from collections import Iterable
import sys

import numpy as np
from scipy.stats import hypergeom

import xlmhg

# from ..basic import GeneSet, GeneSetCollection
from ..basic import GeneSetCollection
from ..expression import ExpGenome
from . import StaticGSEResult, RankBasedGSEResult

logger = logging.getLogger(__name__)


[docs]class GeneSetEnrichmentAnalysis(object):
    """Test a set of genes or a ranked list of genes for gene set enrichment.

    Parameters
    ----------
    genome: `ExpGenome` object
        See :attr:`_genome` attribute.
    gene_set_coll: `GeneSetCollection` object
        See :attr:`_gene_set_coll` attribute.

    Attributes
    ----------
    _genome: `ExpGenome` object
        The universe of genes.
    _gene_set_coll: `GeneSetCollection` object
        The list of gene sets to be tested.

    Notes
    -----
    The class is initialized with a set of valid gene names (an `ExpGenome`
    object), as well as a set of gene sets (a `GeneSetCollection` object).
    During initialization, a binary "gene-by-gene set" matrix is constructed,
    which stores information about which gene is contained in each gene set.
    This matrix is quite sparse, and requires a significant amount of memory.
    As an example, for a set of p = 10,000 genes and n = 10,000 gene sets, this
    matrix is of size 100 MB in the memory (i.e., p x n bytes).

    Once the class has been initialized, the function `get_static_enrichment`
    can be used to test a set of genes for gene set enrichment, and the
    function `get_rank_based_enrichment` can be used to test a ranked list of
    genes for gene set enrichment.

    Note also that two conventions get mixed here: In the `GeneSet` class, a
    gene simply corresponds to a string containing the gene name, whereas in
    the `ExpGenome` class, a gene is an `ExpGene` object. Here, we represent
    genes as simple strings, since the user can always obtain the
    corresponding `ExpGene` object from the `ExpGenome` genome.
    """

    def __init__(self, genome, gene_set_coll):

        assert isinstance(genome, ExpGenome)
        assert isinstance(gene_set_coll, GeneSetCollection)

        self._genome = genome
        self._gene_set_coll = gene_set_coll

        # generate annotation matrix by going over all gene sets
        logger.info('Generating gene-by-gene set membership matrix...')
        gene_memberships = np.zeros((len(genome), gene_set_coll.n),
                                    dtype=np.uint8)
        for j, gs in enumerate(self._gene_set_coll.gene_sets):
            for g in gs.genes:
                try:
                    idx = self._genome.index(g)
                except ValueError:
                    pass
                else:
                    gene_memberships[idx, j] = 1
        self._gene_memberships = gene_memberships

    def __repr__(self):
        return '<%s object (genome=%s; gene_set_coll=%s)>' \
               % (self.__class__.__name__,
                  repr(self._genome), repr(self._gene_set_coll))

    def __str__(self):
        return '<%s object (%d genes in _genome; %d gene sets)>' \
               % (self.__class__.__name__,
                  len(self._genome), len(self._gene_set_coll))

    @property
    def genome(self):
        return copy.deepcopy(self._genome)

[docs]    def get_static_enrichment(
            self, genes, pval_thresh, adjust_pval_thresh=True, K_min=3,
            gene_set_ids=None):
        """Find enriched gene sets in a set of genes.

        Parameters
        ----------
        genes : set of str
            The set of genes to test for gene set enrichment.
        pval_thresh : float
            The significance level (p-value threshold) to use in the analysis.
        adjust_pval_thresh : bool, optional
            Whether to adjust the p-value threshold using a Bonferroni
            correction. (Warning: This is a very conservative correction!)
            [True]
        K_min : int, optional
            The minimum number of gene set genes present in the analysis. [3]
        gene_set_ids : Iterable or None
            A list of gene set IDs to test. If ``None``, all gene sets are
            tested that meet the :attr:`K_min` criterion.

        Returns
        -------
        list of `StaticGSEResult`
            A list of all significantly enriched gene sets. 
        """
        assert isinstance(genes, set)
        assert isinstance(pval_thresh, (float, np.float))
        assert isinstance(K_min, (int, np.integer))
        if gene_set_ids is not None:
            assert isinstance(gene_set_ids, Iterable)

        gene_set_coll = self._gene_set_coll
        gene_sets = self._gene_set_coll.gene_sets
        gene_memberships = self._gene_memberships
        sorted_genes = sorted(genes)

        # test only some terms?
        if gene_set_ids is not None:
            gs_indices = np.int64([self._gene_set_coll.index(id_)
                                   for id_ in gene_set_ids])
            gene_sets = [gene_set_coll[id_] for id_ in gene_set_ids]
            # gene_set_coll = GeneSetCollection(gene_sets)
            gene_memberships = gene_memberships[:, gs_indices]  # not a view!

        # determine K's
        K_vec = np.sum(gene_memberships, axis=0, dtype=np.int64)

        # exclude terms with too few genes
        sel = np.nonzero(K_vec >= K_min)[0]
        K_vec = K_vec[sel]
        gene_sets = [gene_sets[j] for j in sel]
        gene_memberships = gene_memberships[:, sel]

        # determine k's, ignoring unknown genes
        unknown = 0
        sel = []
        filtered_genes = []
        logger.debug('Looking up indices for %d genes...', len(sorted_genes))
        for i, g in enumerate(sorted_genes):
            assert isinstance(g, (str, _oldstr))
            try:
                idx = self._genome.index(g)
            except ValueError:
                unknown += 1
            else:
                sel.append(idx)
                filtered_genes.append(g)

        sel = np.int64(sel)
        gene_indices = np.int64(sel)
        # gene_memberships = gene_memberships[sel, :]
        k_vec = np.sum(gene_memberships[sel, :], axis=0, dtype=np.int64)
        if unknown > 0:
            logger.warn('%d / %d unknown genes (%.1f %%), will be ignored.',
                        unknown, len(genes),
                        100 * (unknown / float(len(genes))))

        # determine n and N
        n = len(filtered_genes)
        N, m = gene_memberships.shape
        logger.info('Conducting %d tests.', m)

        # correct p-value threshold, if specified
        final_pval_thresh = pval_thresh
        if adjust_pval_thresh:
            final_pval_thresh /= float(m)
            logger.info('Using Bonferroni-corrected p-value threshold: %.1e',
                        final_pval_thresh)

        # calculate p-values and get significantly enriched gene sets
        enriched = []

        logger.debug('N=%d, n=%d', N, n)
        sys.stdout.flush()
        for j in range(m):
            pval = hypergeom.sf(k_vec[j] - 1, N, K_vec[j], n)
            if pval <= final_pval_thresh:
                # found significant enrichment
                # sel_genes = [filtered_genes[i] for i in np.nonzero(gene_memberships[:, j])[0]]
                sel_genes = [self._genome[i] for i in
                             np.nonzero(gene_memberships[gene_indices, j])[0]]
                enriched.append(
                    StaticGSEResult(gene_sets[j], N, n, set(sel_genes), pval))

        return enriched


[docs]    def get_rank_based_enrichment(
            self, ranked_genes, pval_thresh=0.05,
            X_frac=0.25, X_min=5, L=None,
            adjust_pval_thresh=True, escore_pval_thresh=None,
            exact_pval='always', gene_set_ids=None, table=None):
        """Test for gene set enrichment at the top of a ranked list of genes.

        This function uses the XL-mHG test to identify enriched gene sets.

        This function also calculates XL-mHG E-scores for the enriched gene
        sets, using ``escore_pval_thresh`` as the p-value threshold "psi".

        Parameters
        ----------
        ranked_genes : list of str
            The ranked list of genes.
        pval_thresh : float, optional
            The p-value threshold used to determine significance.
            See also ``adjust_pval_thresh``. [0.05]
        X_frac : float, optional
            The min. fraction of genes from a gene set required for enrichment. [0.25]
        X_min : int, optional
            The min. no. of genes from a gene set required for enrichment. [5]
        L : int, optional
            The lowest cutoff to test for enrichment. If ``None``,
            int(0.25*(no. of genes)) will be used. [None]
        adjust_pval_thresh : bool, optional
            Whether to adjust the p-value thershold for multiple testing,
            using the Bonferroni method. [True]
        escore_pval_thresh : float or None, optional
            The "psi" p-value threshold used in calculating E-scores. [None]
        exact_pval : str
            Choices are: "always", "if_significant", "if_necessary". Parameter
            will be passed to `xlmhg.get_xlmhg_test_result`. ["always"]
        gene_set_ids : list of str or None, optional
            A list of gene set IDs to specify which gene sets should be tested for enrichment. If ``None``, all gene sets will be tested. [None]
        table : 2-dim numpy.ndarray of type numpy.longdouble or None, optional
            The dynamic programming table used by the algorithm for calculating XL-mHG p-values. Passing this avoids memory re-allocation when calling this function repetitively. [None]

        Returns
        -------
        list of `RankBasedGSEResult`
            A list of all significantly enriched gene sets. 
        """
        if isinstance(X_frac, (int, np.integer)):
            X_frac = float(X_frac)

        # type checks
        assert isinstance(ranked_genes, Iterable)
        assert isinstance(pval_thresh, (float, np.float))
        assert isinstance(X_frac, (float, np.float))
        assert isinstance(X_min, (int, np.integer))
        if L is not None:
            assert isinstance(L, (int, np.integer))
        assert isinstance(adjust_pval_thresh, bool)
        assert isinstance(exact_pval, (str, _oldstr))

        if escore_pval_thresh is not None:
            assert isinstance(escore_pval_thresh, (float, np.float))
        if gene_set_ids is not None:
            assert isinstance(gene_set_ids, Iterable)
        if table is not None:
            assert isinstance(table, np.ndarray) and \
                   np.issubdtype(table.dtype, np.longdouble)

        if L is None:
            L = int(len(ranked_genes)/4.0)

        gene_set_coll = self._gene_set_coll
        gene_memberships = self._gene_memberships

        # postpone this
        if escore_pval_thresh is None:
            # if no separate E-score p-value threshold is specified, use the
            # p-value threshold (this results in conservative E-scores)
            logger.warning('No E-score p-value threshold supplied. '
                           'The E-score p-value threshold will be set to the'
                           'global significance threshold. This will result '
                           'in conservative E-scores.')

        # test only some terms?
        if gene_set_ids is not None:
            gs_indices = np.int64([self._gene_set_coll.index(id_)
                                   for id_ in gene_set_ids])
            gene_sets = [gene_set_coll[id_] for id_ in gene_set_ids]
            gene_set_coll = GeneSetCollection(gene_sets)
            gene_memberships = gene_memberships[:, gs_indices]  # not a view!

        # reorder rows in annotation matrix to match the given gene ranking
        # also exclude genes not in the ranking
        unknown = 0
        L_adj = L
        sel = []
        filtered_genes = []
        logger.debug('Looking up indices for %d genes...' % len(ranked_genes))
        for i, g in enumerate(ranked_genes):
            assert isinstance(g, (str, _oldstr))
            try:
                idx = self._genome.index(g)
            except ValueError:
                unknown += 1
                # adjust L if the gene was above the original L cutoff
                if i < L:
                    L_adj -= 1
            else:
                sel.append(idx)
                filtered_genes.append(g)
        sel = np.int64(sel)
        logger.debug('Adjusted L: %d', L_adj)

        # the following also copies the data (not a view)
        gene_memberships = gene_memberships[sel, :]
        N, m = gene_memberships.shape
        if unknown > 0:
            # Some genes in the ranked list were unknown (i.e., not present in
            # the specified genome).
            logger.warn('%d / %d unknown genes (%.1f %%), will be ignored.',
                        unknown, len(ranked_genes),
                        100 * (unknown / float(len(ranked_genes))))


        # Determine the number of gene set genes above the L'th cutoff,
        # for all gene sets. This quantity is useful, because we don't need
        # to perform any more work for gene sets that have less than X genes
        # above the cutoff.
        k_above_L = np.sum(gene_memberships[:L_adj, :], axis=0, dtype=np.int64)

        # Determine the number of genes below the L'th cutoff, for all gene
        # sets.
        k_below_L = np.sum(gene_memberships[L_adj:, :], axis=0, dtype=np.int64)

        # Determine the total number K of genes in each gene set that are
        # present in the ranked list (this is equal to k_above_L + k_below_L)
        K_vec = k_above_L + k_below_L

        # Determine the largest K across all gene sets.
        K_max = np.amax(K_vec)

        # Determine X for all gene sets.
        X = np.amax(
            np.c_[np.tile(X_min, m), np.int64(np.ceil(X_frac * K_vec))],
            axis=1)

        # Determine the number of tests (we do not conduct a test if the
        # total number of gene set genes in the ranked list is below X).
        num_tests = np.sum(K_vec-X >= 0)
        logger.info('Conducting %d tests.', num_tests)

        # determine Bonferroni-corrected p-value, if desired
        final_pval_thresh = pval_thresh
        if adjust_pval_thresh and num_tests > 0:
            final_pval_thresh /= float(num_tests)
            logger.info('Using Bonferroni-corrected p-value threshold: %.1e',
                        final_pval_thresh)

        if escore_pval_thresh is None:
            escore_pval_thresh = final_pval_thresh

        elif escore_pval_thresh < final_pval_thresh:
            logger.warning('The E-score p-value threshold is smaller than '
                           'the p-value threshold. Setting E-score p-value '
                           'threshold to the p-value threshold.')
            escore_pval_thresh = final_pval_thresh

        # Prepare the matrix that holds the dynamic programming table for
        # the calculation of the XL-mHG p-value.
        if table is None:
            table = np.empty((K_max+1, N+1), dtype=np.longdouble)
        else:
            if table.shape[0] < K_max+1 or table.shape[1] < N+1:
                raise ValueError(
                    'The supplied array is too small (%d x %d) to hold the '
                    'entire dynamic programming table. The required size is'
                    '%d x %d (rows x columns).'
                    % (table.shape[0], table.shape[1], K_max+1, N+1))

        # find enriched GO terms
        # logger.info('Testing %d gene sets for enrichment...', m)
        logger.debug('(N=%d, X_frac=%.2f, X_min=%d, L=%d; K_max=%d)',
                     len(ranked_genes), X_frac, X_min, L, K_max)

        enriched = []
        num_tests = 0  # number of tests conducted
        for j in range(m):
            # determine gene set-specific value for X
            X = max(X_min, int(ceil(X_frac * float(K_vec[j]))))

            # Determine significance of gene set enrichment using the XL-mHG
            # test (only if there are at least X gene set genes in the list).
            if K_vec[j] >= X:
                num_tests += 1

                # We only need to perform the XL-mHG test if there are enough
                # gene set genes above the L'th cutoff (otherwise, pval = 1.0).
                if k_above_L[j] >= X:
                    # perform test

                    # Determine the ranks of the gene set genes in the
                    # ranked list.
                    indices = np.uint16(np.nonzero(gene_memberships[:, j])[0])
                    res = xlmhg.get_xlmhg_test_result(
                        N, indices, X, L, pval_thresh=final_pval_thresh,
                        escore_pval_thresh=escore_pval_thresh,
                        exact_pval=exact_pval, table=table)

                    # check if gene set is significantly enriched
                    if res.pval <= final_pval_thresh:
                        # generate RankedGSEResult
                        ind_genes = [ranked_genes[i] for i in indices]
                        gse_result = RankBasedGSEResult(
                            gene_set_coll[j], N, indices, ind_genes,
                            X, L, res.stat, res.cutoff, res.pval,
                            escore_pval_thresh=escore_pval_thresh
                        )
                        enriched.append(gse_result)

        # report results
        q = len(enriched)
        ignored = m - num_tests
        if ignored > 0:
            logger.debug('%d / %d gene sets (%.1f%%) had less than X genes '
                         'annotated with them and were ignored.',
                         ignored, m, 100 * (ignored / float(m)))

        logger.info('%d / %d gene sets were found to be significantly '
                    'enriched (p-value <= %.1e).', q, m, final_pval_thresh)

        return enriched