# Copyright (c) 2015, 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Module containing the `StaticGSEResult` and `RankBasedGSEResult` classes."""
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
import logging
import hashlib
from collections import Iterable
import numpy as np
from xlmhg import mHGResult
from ..basic import GeneSet
logger = logging.getLogger(__name__)
[docs]class StaticGSEResult(object):
"""Result of a hypergeometric test for gene set enrichment.
Parameters
----------
gene_set : `genometools.basic.GeneSet`
See :attr:`gene_set`.
N : int
See :attr:`N`.
n : int
See :attr:`n`.
selected_genes : iterable of `ExpGene`
See :attr:`selected_genes`.
pval : float
See :attr:`pval`.
Attributes
----------
gene_set : `genometools.basic.GeneSet`
The gene set.
N : int
The total number of genes in the analysis.
n : int
The number of genes selected.
selected_genes : set of `ExpGene`
The genes from the gene set found present.
pval : float
The hypergeometric p-value.
"""
def __init__(self, gene_set, N, n, selected_genes, pval):
assert isinstance(gene_set, GeneSet)
assert isinstance(N, (int, np.integer))
assert isinstance(n, int)
assert isinstance(selected_genes, Iterable)
assert isinstance(pval, (float, np.float))
self.gene_set = gene_set
self.N = N
self.n = n
self.selected_genes = set(selected_genes)
self.pval = pval
def __repr__(self):
return '<%s object (pval=%.1e, gene_set_id="%s", hash="%s">' \
% (self.__class__.__name__,
self.pval, self.gene_set._id, self.hash)
def __str__(self):
return '<%s object (pval=%.1e; N=%d; K=%d; n=%d; k=%d; gene_set=%s)>' \
% (self.__class__.__name__, self.pval, self.N, self.K, self.n,
self.k, str(self.gene_set))
def __eq__(self, other):
if self is other:
return True
elif type(self) is type(other):
return self.hash == other.hash
else:
return NotImplemented
def __ne__(self, other):
return not self.__eq__(other)
@property
def hash(self):
data_str = ';'.join(
[str(repr(var)) for var in
[self.gene_set, self.N, self.n, sorted(self.selected_genes),
self.pval]])
data = data_str.encode('UTF-8')
return str(hashlib.md5(data).hexdigest())
@property
def K(self):
return self.gene_set.size
@property
def k(self):
return len(self.selected_genes)
@property
def fold_enrichment(self):
"""Returns the fold enrichment of the gene set.
Fold enrichment is defined as ratio between the observed and the
expected number of gene set genes present.
"""
expected = self.K * (self.n/float(self.N))
return self.k / expected
[docs]class RankBasedGSEResult(mHGResult):
"""Result of an XL-mHG-based test for gene set enrichment.
This class inherits from `xlmhg.mHGResult`.
Parameters
----------
gene_set : `genometools.basic.GeneSet`
See :attr:`gene_set` attribute.
N: int
The total number of genes in the ranked list.
See also :attr:`xlmhg.mHGResult.N`.
indices: `numpy.ndarray` of integers
The indices of the gene set genes in the ranked list.
ind_genes: list of str
See :attr:`ind_genes` attribute.
X: int
The XL-mHG X parameter.
L: int
The XL-mHG L parameter.
stat: float
The XL-mHG test statistic.
cutoff: int
The cutoff at which the XL-mHG test statistic was attained.
pval: float
The XL-mHG p-value.
pval_thresh: float, optional
The p-value threshold used in the analysis. [None]
escore_pval_thresh: float, optional
The hypergeometric p-value threshold used for calculating the E-score.
If not specified, the XL-mHG p-value will be used, resulting in a
conservative E-score. [None]
escore_tol: float, optional
The tolerance used for calculating the E-score. [None]
Attributes
----------
gene_set : `genometools.basic.GeneSet`
The gene set.
ind_genes : list of str
The names of the genes corresponding to the indices.
"""
def __init__(self, gene_set, N, indices, ind_genes, X, L,
stat, cutoff, pval,
pval_thresh=None, escore_pval_thresh=None, escore_tol=None):
# call parent constructor
mHGResult.__init__(self, N, indices, X, L, stat, cutoff, pval,
pval_thresh=pval_thresh,
escore_pval_thresh=escore_pval_thresh,
escore_tol=escore_tol)
# type checks
assert isinstance(gene_set, GeneSet)
assert isinstance(ind_genes, Iterable)
if len(ind_genes) != indices.size:
raise ValueError('The number of genes must match the number of '
'indices.')
self.gene_set = gene_set
self.ind_genes = list(ind_genes)
# @classmethod
# def from_mHGResult(...)
def __repr__(self):
return '<%s object (N=%d, gene_set_id="%s", hash="%s">' \
% (self.__class__.__name__,
self.N, self.gene_set._id, self.hash)
def __str__(self):
return '<%s object (gene_set=%s, pval=%.1e)>' \
% (self.__class__.__name__, str(self.gene_set), self.pval)
def __eq__(self, other):
if self is other:
return True
elif type(self) is type(other):
return self.hash == other.hash
else:
return NotImplemented
def __ne__(self, other):
return not self.__eq__(other)
@property
def hash(self):
data_str = ';'.join(
[super().hash] +
[str(repr(var)) for var in [self.gene_set, self.ind_genes]])
data = data_str.encode('UTF-8')
return str(hashlib.md5(data).hexdigest())
@property
def genes_above_cutoff(self):
return self.ind_genes[:self.k]
def get_pretty_format(self, omit_param=True, max_name_length=0):
# TO-DO: clean up, commenting
gs_name = self.gene_set._name
if max_name_length > 0 and len(gs_name) > max_name_length:
assert max_name_length >= 3
gs_name = gs_name[:(max_name_length - 3)] + '...'
gs_str = gs_name + ' (%d / %d @ %d)' % \
(self.k, len(self.ind_genes), self.cutoff)
param_str = ''
if not omit_param:
param_str = ' [X=%d,L=%d,N=%d]' % (self.X, self.L, self.N)
details = ', p=%.1e, e=%.1fx%s' % (self.pval, self.escore, param_str)
return '%s%s' % (gs_str, details)