Source code for genometools.basic.gene_set_collection

# Copyright (c) 2015, 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module containing the `GeneSetCollection` class.

Class supports unicode using UTF-8.

"""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
_oldstr = str
from builtins import *

import os
import io
import logging
import hashlib
from collections import OrderedDict, Iterable

import numpy as np
import xmltodict
import unicodecsv as csv

from . import GeneSet

logger = logging.getLogger(__name__)


[docs]class GeneSetCollection(object): """A collection of gene sets. This is a class that basically just contains a list of gene sets, and supports different ways of accessing individual gene sets. The gene sets are ordered, so each gene set has a unique position (index) in the database. Parameters ---------- gene_sets: list or tuple of `GeneSet` See :attr:`gene_sets` attribute. Attributes ---------- gene_sets: tuple of `GeneSet` The list of gene sets in the database. Note that this is a read-only property. """ def __init__(self, gene_sets): assert isinstance(gene_sets, Iterable) gene_sets = list(gene_sets) for gs in gene_sets: assert isinstance(gs, GeneSet) # make sure all IDs are unique all_ids = [gs.id for gs in gene_sets] if len(all_ids) != len(set(all_ids)): raise ValueError('Cannot create GeneSetCollection:' 'gene set IDs are not unique!') self._gene_sets = OrderedDict([gs.id, gs] for gs in gene_sets) self._gene_set_ids = list(self._gene_sets.keys()) self._gene_set_indices = OrderedDict( [gs.id, i] for i, gs in enumerate(self._gene_sets.values()) ) def __repr__(self): return '<%s object (n=%d; hash=%s)>' \ % (self.__class__.__name__, self.n, self.hash) def __str__(self): return '<%s object (n=%d)>' % (self.__class__.__name__, self.n) def __len__(self): return len(self._gene_sets) def __iter__(self): return iter(self._gene_sets.values()) def __getitem__(self, key): """Simple interface for querying the database. Depending on whether key is an integer or not, look up a gene set either by index, or by ID. """ if isinstance(key, (int, np.integer)): return self.get_by_index(key) else: return self.get_by_id(key) def __eq__(self, other): if self is other: return True elif type(self) is type(other): return self.__dict__ == other.__dict__ else: return NotImplemented def __ne__(self, other): return not self.__eq__(other) @property def hash(self): data = ';'.join(repr(gs) for gs in self.gene_sets) return str(hashlib.md5(data.encode('ascii')).hexdigest()) @property def gene_sets(self): return list(self._gene_sets.values()) @property def n(self): """The number of gene sets in the database.""" return len(self) def add_gene_set(self, gs, overwrite=False): if gs.id in self._gene_sets and not overwrite: raise ValueError('Gene set with this ID already exists, and ' '`overwrite` was not set to ``True``.') self._gene_sets[gs.id] = gs self._gene_set_ids.append(gs) self._gene_set_indices[gs.id] = len(self)-1
[docs] def get_by_id(self, id_): """Look up a gene set by its ID. Parameters ---------- id_: str The ID of the gene set. Returns ------- GeneSet The gene set. Raises ------ ValueError If the given ID is not in the database. """ try: return self._gene_sets[id_] except KeyError: raise ValueError('No gene set with ID "%s"!' % id_)
[docs] def get_by_index(self, i): """Look up a gene set by its index. Parameters ---------- i: int The index of the gene set. Returns ------- GeneSet The gene set. Raises ------ ValueError If the given index is out of bounds. """ if i >= self.n: raise ValueError('Index %d out of bounds ' % i + 'for database with %d gene sets.' % self.n) return self._gene_sets[self._gene_set_ids[i]]
[docs] def index(self, id_): """Get the index corresponding to a gene set, identified by its ID. Parameters ---------- id_: str The ID of the gene set. Returns ------- int The index of the gene set. Raises ------ ValueError If the given ID is not in the database. """ try: return self._gene_set_indices[id_] except KeyError: raise ValueError('No gene set with ID "%s"!' % id_)
@classmethod
[docs] def read_tsv(cls, path, encoding='utf-8'): """Read a gene set database from a tab-delimited text file. Parameters ---------- path: str The path name of the the file. encoding: str The encoding of the text file. Returns ------- None """ gene_sets = [] n = 0 with open(path, 'rb') as fh: reader = csv.reader(fh, dialect='excel-tab', encoding=encoding) for l in reader: n += 1 gs = GeneSet.from_list(l) gene_sets.append(gs) logger.debug('Read %d gene sets.', n) logger.debug('Size of gene set list: %d', len(gene_sets)) return cls(gene_sets)
[docs] def write_tsv(self, path): """Write the database to a tab-delimited text file. Parameters ---------- path: str The path name of the file. Returns ------- None """ with open(path, 'wb') as ofh: writer = csv.writer( ofh, dialect='excel-tab', quoting=csv.QUOTE_NONE, lineterminator=os.linesep ) for gs in self._gene_sets.values(): writer.writerow(gs.to_list())
@classmethod
[docs] def read_msigdb_xml(cls, path, entrez2gene, species=None): # pragma: no cover """Read the complete MSigDB database from an XML file. The XML file can be downloaded from here: http://software.broadinstitute.org/gsea/msigdb/download_file.jsp?filePath=/resources/msigdb/5.0/msigdb_v5.0.xml Parameters ---------- path: str The path name of the XML file. entrez2gene: dict or OrderedDict (str: str) A dictionary mapping Entrez Gene IDs to gene symbols (names). species: str, optional A species name (e.g., "Homo_sapiens"). Only gene sets for that species will be retained. (None) Returns ------- GeneSetCollection The gene set database containing the MSigDB gene sets. """ # note: is XML file really encoded in UTF-8? assert isinstance(path, (str, _oldstr)) assert isinstance(entrez2gene, (dict, OrderedDict)) assert species is None or isinstance(species, (str, _oldstr)) logger.debug('Path: %s', path) logger.debug('entrez2gene type: %s', str(type(entrez2gene))) i = [0] gene_sets = [] total_gs = [0] total_genes = [0] species_excl = [0] unknown_entrezid = [0] src = 'MSigDB' def handle_item(pth, item): # callback function for xmltodict.parse() total_gs[0] += 1 data = pth[1][1] spec = data['ORGANISM'] # filter by species if species is not None and spec != species: species_excl[0] += 1 return True id_ = data['SYSTEMATIC_NAME'] name = data['STANDARD_NAME'] coll = data['CATEGORY_CODE'] desc = data['DESCRIPTION_BRIEF'] entrez = data['MEMBERS_EZID'].split(',') genes = [] for e in entrez: total_genes[0] += 1 try: genes.append(entrez2gene[e]) except KeyError: unknown_entrezid[0] += 1 if not genes: logger.warning('Gene set "%s" (%s) has no known genes!', name, id_) return True gs = GeneSet(id_, name, genes, source=src, collection=coll, description=desc) gene_sets.append(gs) i[0] += 1 return True # parse the XML file using the xmltodict package with io.open(path, 'rb') as fh: xmltodict.parse(fh.read(), encoding='UTF-8', item_depth=2, item_callback=handle_item) # report some statistics if species_excl[0] > 0: kept = total_gs[0] - species_excl[0] perc = 100 * (kept / float(total_gs[0])) logger.info('%d of all %d gene sets (%.1f %%) belonged to the ' 'specified species.', kept, total_gs[0], perc) if unknown_entrezid[0] > 0: unkn = unknown_entrezid[0] # known = total_genes[0] - unknown_entrezid[0] perc = 100 * (unkn / float(total_genes[0])) logger.warning('%d of a total of %d genes (%.1f %%) had an ' + 'unknown Entrez ID.', unkn, total_genes[0], perc) logger.info('Parsed %d entries, resulting in %d gene sets.', total_gs[0], len(gene_sets)) return cls(gene_sets)