Source code for genometools.enrichment.analysis

# Copyright (c) 2015, 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module containing the `GeneSetEnrichmentAnalysis` class."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
_oldstr = str
from builtins import *

import logging
from math import ceil
import copy
from collections import Iterable
import sys

import numpy as np
from scipy.stats import hypergeom

import xlmhg

# from ..basic import GeneSet, GeneSetCollection
from ..basic import GeneSetCollection
from ..expression import ExpGenome
from . import StaticGSEResult, RankBasedGSEResult

logger = logging.getLogger(__name__)


[docs]class GeneSetEnrichmentAnalysis(object): """Test a set of genes or a ranked list of genes for gene set enrichment. Parameters ---------- genome: `ExpGenome` object See :attr:`_genome` attribute. gene_set_coll: `GeneSetCollection` object See :attr:`_gene_set_coll` attribute. Attributes ---------- _genome: `ExpGenome` object The universe of genes. _gene_set_coll: `GeneSetCollection` object The list of gene sets to be tested. Notes ----- The class is initialized with a set of valid gene names (an `ExpGenome` object), as well as a set of gene sets (a `GeneSetCollection` object). During initialization, a binary "gene-by-gene set" matrix is constructed, which stores information about which gene is contained in each gene set. This matrix is quite sparse, and requires a significant amount of memory. As an example, for a set of p = 10,000 genes and n = 10,000 gene sets, this matrix is of size 100 MB in the memory (i.e., p x n bytes). Once the class has been initialized, the function `get_static_enrichment` can be used to test a set of genes for gene set enrichment, and the function `get_rank_based_enrichment` can be used to test a ranked list of genes for gene set enrichment. Note also that two conventions get mixed here: In the `GeneSet` class, a gene simply corresponds to a string containing the gene name, whereas in the `ExpGenome` class, a gene is an `ExpGene` object. Here, we represent genes as simple strings, since the user can always obtain the corresponding `ExpGene` object from the `ExpGenome` genome. """ def __init__(self, genome, gene_set_coll): assert isinstance(genome, ExpGenome) assert isinstance(gene_set_coll, GeneSetCollection) self._genome = genome self._gene_set_coll = gene_set_coll # generate annotation matrix by going over all gene sets logger.info('Generating gene-by-gene set membership matrix...') gene_memberships = np.zeros((len(genome), gene_set_coll.n), dtype=np.uint8) for j, gs in enumerate(self._gene_set_coll.gene_sets): for g in gs.genes: try: idx = self._genome.index(g) except ValueError: pass else: gene_memberships[idx, j] = 1 self._gene_memberships = gene_memberships def __repr__(self): return '<%s object (genome=%s; gene_set_coll=%s)>' \ % (self.__class__.__name__, repr(self._genome), repr(self._gene_set_coll)) def __str__(self): return '<%s object (%d genes in _genome; %d gene sets)>' \ % (self.__class__.__name__, len(self._genome), len(self._gene_set_coll)) @property def genome(self): return copy.deepcopy(self._genome)
[docs] def get_static_enrichment( self, genes, pval_thresh, adjust_pval_thresh=True, K_min=3, gene_set_ids=None): """Find enriched gene sets in a set of genes. Parameters ---------- genes : set of str The set of genes to test for gene set enrichment. pval_thresh : float The significance level (p-value threshold) to use in the analysis. adjust_pval_thresh : bool, optional Whether to adjust the p-value threshold using a Bonferroni correction. (Warning: This is a very conservative correction!) [True] K_min : int, optional The minimum number of gene set genes present in the analysis. [3] gene_set_ids : Iterable or None A list of gene set IDs to test. If ``None``, all gene sets are tested that meet the :attr:`K_min` criterion. Returns ------- list of `StaticGSEResult` A list of all significantly enriched gene sets. """ assert isinstance(genes, set) assert isinstance(pval_thresh, (float, np.float)) assert isinstance(K_min, (int, np.integer)) if gene_set_ids is not None: assert isinstance(gene_set_ids, Iterable) gene_set_coll = self._gene_set_coll gene_sets = self._gene_set_coll.gene_sets gene_memberships = self._gene_memberships sorted_genes = sorted(genes) # test only some terms? if gene_set_ids is not None: gs_indices = np.int64([self._gene_set_coll.index(id_) for id_ in gene_set_ids]) gene_sets = [gene_set_coll[id_] for id_ in gene_set_ids] # gene_set_coll = GeneSetCollection(gene_sets) gene_memberships = gene_memberships[:, gs_indices] # not a view! # determine K's K_vec = np.sum(gene_memberships, axis=0, dtype=np.int64) # exclude terms with too few genes sel = np.nonzero(K_vec >= K_min)[0] K_vec = K_vec[sel] gene_sets = [gene_sets[j] for j in sel] gene_memberships = gene_memberships[:, sel] # determine k's, ignoring unknown genes unknown = 0 sel = [] filtered_genes = [] logger.debug('Looking up indices for %d genes...', len(sorted_genes)) for i, g in enumerate(sorted_genes): assert isinstance(g, (str, _oldstr)) try: idx = self._genome.index(g) except ValueError: unknown += 1 else: sel.append(idx) filtered_genes.append(g) sel = np.int64(sel) gene_indices = np.int64(sel) # gene_memberships = gene_memberships[sel, :] k_vec = np.sum(gene_memberships[sel, :], axis=0, dtype=np.int64) if unknown > 0: logger.warn('%d / %d unknown genes (%.1f %%), will be ignored.', unknown, len(genes), 100 * (unknown / float(len(genes)))) # determine n and N n = len(filtered_genes) N, m = gene_memberships.shape logger.info('Conducting %d tests.', m) # correct p-value threshold, if specified final_pval_thresh = pval_thresh if adjust_pval_thresh: final_pval_thresh /= float(m) logger.info('Using Bonferroni-corrected p-value threshold: %.1e', final_pval_thresh) # calculate p-values and get significantly enriched gene sets enriched = [] logger.debug('N=%d, n=%d', N, n) sys.stdout.flush() for j in range(m): pval = hypergeom.sf(k_vec[j] - 1, N, K_vec[j], n) if pval <= final_pval_thresh: # found significant enrichment # sel_genes = [filtered_genes[i] for i in np.nonzero(gene_memberships[:, j])[0]] sel_genes = [self._genome[i] for i in np.nonzero(gene_memberships[gene_indices, j])[0]] enriched.append( StaticGSEResult(gene_sets[j], N, n, set(sel_genes), pval)) return enriched
[docs] def get_rank_based_enrichment( self, ranked_genes, pval_thresh=0.05, X_frac=0.25, X_min=5, L=None, adjust_pval_thresh=True, escore_pval_thresh=None, exact_pval='always', gene_set_ids=None, table=None): """Test for gene set enrichment at the top of a ranked list of genes. This function uses the XL-mHG test to identify enriched gene sets. This function also calculates XL-mHG E-scores for the enriched gene sets, using ``escore_pval_thresh`` as the p-value threshold "psi". Parameters ---------- ranked_genes : list of str The ranked list of genes. pval_thresh : float, optional The p-value threshold used to determine significance. See also ``adjust_pval_thresh``. [0.05] X_frac : float, optional The min. fraction of genes from a gene set required for enrichment. [0.25] X_min : int, optional The min. no. of genes from a gene set required for enrichment. [5] L : int, optional The lowest cutoff to test for enrichment. If ``None``, int(0.25*(no. of genes)) will be used. [None] adjust_pval_thresh : bool, optional Whether to adjust the p-value thershold for multiple testing, using the Bonferroni method. [True] escore_pval_thresh : float or None, optional The "psi" p-value threshold used in calculating E-scores. [None] exact_pval : str Choices are: "always", "if_significant", "if_necessary". Parameter will be passed to `xlmhg.get_xlmhg_test_result`. ["always"] gene_set_ids : list of str or None, optional A list of gene set IDs to specify which gene sets should be tested for enrichment. If ``None``, all gene sets will be tested. [None] table : 2-dim numpy.ndarray of type numpy.longdouble or None, optional The dynamic programming table used by the algorithm for calculating XL-mHG p-values. Passing this avoids memory re-allocation when calling this function repetitively. [None] Returns ------- list of `RankBasedGSEResult` A list of all significantly enriched gene sets. """ if isinstance(X_frac, (int, np.integer)): X_frac = float(X_frac) # type checks assert isinstance(ranked_genes, Iterable) assert isinstance(pval_thresh, (float, np.float)) assert isinstance(X_frac, (float, np.float)) assert isinstance(X_min, (int, np.integer)) if L is not None: assert isinstance(L, (int, np.integer)) assert isinstance(adjust_pval_thresh, bool) assert isinstance(exact_pval, (str, _oldstr)) if escore_pval_thresh is not None: assert isinstance(escore_pval_thresh, (float, np.float)) if gene_set_ids is not None: assert isinstance(gene_set_ids, Iterable) if table is not None: assert isinstance(table, np.ndarray) and \ np.issubdtype(table.dtype, np.longdouble) if L is None: L = int(len(ranked_genes)/4.0) gene_set_coll = self._gene_set_coll gene_memberships = self._gene_memberships # postpone this if escore_pval_thresh is None: # if no separate E-score p-value threshold is specified, use the # p-value threshold (this results in conservative E-scores) logger.warning('No E-score p-value threshold supplied. ' 'The E-score p-value threshold will be set to the' 'global significance threshold. This will result ' 'in conservative E-scores.') # test only some terms? if gene_set_ids is not None: gs_indices = np.int64([self._gene_set_coll.index(id_) for id_ in gene_set_ids]) gene_sets = [gene_set_coll[id_] for id_ in gene_set_ids] gene_set_coll = GeneSetCollection(gene_sets) gene_memberships = gene_memberships[:, gs_indices] # not a view! # reorder rows in annotation matrix to match the given gene ranking # also exclude genes not in the ranking unknown = 0 L_adj = L sel = [] filtered_genes = [] logger.debug('Looking up indices for %d genes...' % len(ranked_genes)) for i, g in enumerate(ranked_genes): assert isinstance(g, (str, _oldstr)) try: idx = self._genome.index(g) except ValueError: unknown += 1 # adjust L if the gene was above the original L cutoff if i < L: L_adj -= 1 else: sel.append(idx) filtered_genes.append(g) sel = np.int64(sel) logger.debug('Adjusted L: %d', L_adj) # the following also copies the data (not a view) gene_memberships = gene_memberships[sel, :] N, m = gene_memberships.shape if unknown > 0: # Some genes in the ranked list were unknown (i.e., not present in # the specified genome). logger.warn('%d / %d unknown genes (%.1f %%), will be ignored.', unknown, len(ranked_genes), 100 * (unknown / float(len(ranked_genes)))) # Determine the number of gene set genes above the L'th cutoff, # for all gene sets. This quantity is useful, because we don't need # to perform any more work for gene sets that have less than X genes # above the cutoff. k_above_L = np.sum(gene_memberships[:L_adj, :], axis=0, dtype=np.int64) # Determine the number of genes below the L'th cutoff, for all gene # sets. k_below_L = np.sum(gene_memberships[L_adj:, :], axis=0, dtype=np.int64) # Determine the total number K of genes in each gene set that are # present in the ranked list (this is equal to k_above_L + k_below_L) K_vec = k_above_L + k_below_L # Determine the largest K across all gene sets. K_max = np.amax(K_vec) # Determine X for all gene sets. X = np.amax( np.c_[np.tile(X_min, m), np.int64(np.ceil(X_frac * K_vec))], axis=1) # Determine the number of tests (we do not conduct a test if the # total number of gene set genes in the ranked list is below X). num_tests = np.sum(K_vec-X >= 0) logger.info('Conducting %d tests.', num_tests) # determine Bonferroni-corrected p-value, if desired final_pval_thresh = pval_thresh if adjust_pval_thresh and num_tests > 0: final_pval_thresh /= float(num_tests) logger.info('Using Bonferroni-corrected p-value threshold: %.1e', final_pval_thresh) if escore_pval_thresh is None: escore_pval_thresh = final_pval_thresh elif escore_pval_thresh < final_pval_thresh: logger.warning('The E-score p-value threshold is smaller than ' 'the p-value threshold. Setting E-score p-value ' 'threshold to the p-value threshold.') escore_pval_thresh = final_pval_thresh # Prepare the matrix that holds the dynamic programming table for # the calculation of the XL-mHG p-value. if table is None: table = np.empty((K_max+1, N+1), dtype=np.longdouble) else: if table.shape[0] < K_max+1 or table.shape[1] < N+1: raise ValueError( 'The supplied array is too small (%d x %d) to hold the ' 'entire dynamic programming table. The required size is' '%d x %d (rows x columns).' % (table.shape[0], table.shape[1], K_max+1, N+1)) # find enriched GO terms # logger.info('Testing %d gene sets for enrichment...', m) logger.debug('(N=%d, X_frac=%.2f, X_min=%d, L=%d; K_max=%d)', len(ranked_genes), X_frac, X_min, L, K_max) enriched = [] num_tests = 0 # number of tests conducted for j in range(m): # determine gene set-specific value for X X = max(X_min, int(ceil(X_frac * float(K_vec[j])))) # Determine significance of gene set enrichment using the XL-mHG # test (only if there are at least X gene set genes in the list). if K_vec[j] >= X: num_tests += 1 # We only need to perform the XL-mHG test if there are enough # gene set genes above the L'th cutoff (otherwise, pval = 1.0). if k_above_L[j] >= X: # perform test # Determine the ranks of the gene set genes in the # ranked list. indices = np.uint16(np.nonzero(gene_memberships[:, j])[0]) res = xlmhg.get_xlmhg_test_result( N, indices, X, L, pval_thresh=final_pval_thresh, escore_pval_thresh=escore_pval_thresh, exact_pval=exact_pval, table=table) # check if gene set is significantly enriched if res.pval <= final_pval_thresh: # generate RankedGSEResult ind_genes = [ranked_genes[i] for i in indices] gse_result = RankBasedGSEResult( gene_set_coll[j], N, indices, ind_genes, X, L, res.stat, res.cutoff, res.pval, escore_pval_thresh=escore_pval_thresh ) enriched.append(gse_result) # report results q = len(enriched) ignored = m - num_tests if ignored > 0: logger.debug('%d / %d gene sets (%.1f%%) had less than X genes ' 'annotated with them and were ignored.', ignored, m, 100 * (ignored / float(m))) logger.info('%d / %d gene sets were found to be significantly ' 'enriched (p-value <= %.1e).', q, m, final_pval_thresh) return enriched