# Source code for genometools.basic.gene_set

# Copyright (c) 2015, 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module containing the `GeneSet` class."""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
_oldstr = str
from builtins import *

import hashlib
try:
    # The ABC aliases in `collections` were removed in Python 3.10.
    from collections.abc import Iterable
except ImportError:  # Python 2
    from collections import Iterable


class GeneSet(object):
    """A gene set.

    A gene set is just what the name implies: A set of genes. Usually, gene
    sets are used to group genes that share a certain property (e.g., genes
    that perform related functions, or genes that are frequently
    co-expressed). The genes in the gene set are not ordered.

    GeneSet instances are hashable and should therefore be considered to be
    immutable.

    Parameters
    ----------
    id: str
        See :attr:`id` attribute.
    name: str
        See :attr:`name` attribute.
    genes: set, list or tuple of str
        See :attr:`genes` attribute.
    source: str, optional
        See :attr:`source` attribute. (None)
    collection: str, optional
        See :attr:`collection` attribute. (None)
    description: str, optional
        See :attr:`description` attribute. (None)

    Attributes
    ----------
    id: str
        The (unique) ID of the gene set.
    name: str
        The name of the gene set.
    genes: frozenset of str
        The (unordered) genes in the gene set.
    source: None or str
        The source / origin of the gene set (e.g., "MSigDB")
    collection: None or str
        The collection that the gene set belongs to (e.g., "c4" for gene
        sets from MSigDB).
    description: None or str
        The description of the gene set.
    """

    def __init__(self, id, name, genes,
                 source=None, collection=None, description=None):
        # Assertions (rather than raised TypeErrors) are kept on purpose so
        # that the exception type seen by existing callers does not change.
        assert isinstance(id, (str, _oldstr))
        assert isinstance(name, (str, _oldstr))
        assert isinstance(genes, Iterable)
        if source is not None:
            assert isinstance(source, (str, _oldstr))
        if collection is not None:
            assert isinstance(collection, (str, _oldstr))
        if description is not None:
            assert isinstance(description, (str, _oldstr))

        self._id = id
        self._name = name
        # frozenset: removes duplicates and keeps the instance immutable
        self._genes = frozenset(genes)
        self._source = source
        self._collection = collection
        self._description = description

    @staticmethod
    def _quote(value):
        """Return `value` in double quotes, or the text ``None`` if unset."""
        return '"%s"' % value if value is not None else 'None'

    @property
    def _gene_str(self):
        """Comma-separated, quoted and sorted list of genes."""
        return ', '.join('"%s"' % g for g in sorted(self._genes))

    @property
    def _source_str(self):
        """Quoted source, or ``None``."""
        return self._quote(self._source)

    @property
    def _coll_str(self):
        """Quoted collection, or ``None``."""
        return self._quote(self._collection)

    @property
    def _desc_str(self):
        """Quoted description, or ``None``."""
        return self._quote(self._description)

    def __repr__(self):
        return ('<%s instance (id="%s", name="%s", genes=[%s], source=%s, '
                'collection=%s, description=%s)>'
                % (self.__class__.__name__,
                   self._id, self._name, self._gene_str,
                   self._source_str, self._coll_str, self._desc_str))

    def __str__(self):
        return ('<%s "%s" (id=%s, source=%s, collection=%s, size=%d)>'
                % (self.__class__.__name__, self._name, self._id,
                   self._source_str, self._coll_str, self.size))

    def __eq__(self, other):
        if self is other:
            return True
        elif type(self) is type(other):
            return self.__dict__ == other.__dict__
        else:
            return NotImplemented

    def __ne__(self, other):
        result = self.__eq__(other)
        # Propagate NotImplemented instead of negating it: `not
        # NotImplemented` is False, which would wrongly report a gene set as
        # "not unequal" to an unrelated object.
        if result is NotImplemented:
            return NotImplemented
        return not result

    @property
    def _data(self):
        """UTF-8 encoded serialization of all fields, used for hashing.

        The genes are sorted first: the iteration order of a frozenset is
        not guaranteed to be the same for two equal sets (nor across
        interpreter runs), and equal gene sets must produce equal hashes.
        """
        fields = [self._id, self._name, sorted(self._genes),
                  self._source, self._collection, self._description]
        data_str = ';'.join(str(repr(var)) for var in fields)
        return data_str.encode('UTF-8')

    def __hash__(self):
        return hash(self._data)

    @property
    def hash(self):
        """MD5 hash value for the gene set."""
        return str(hashlib.md5(self._data).hexdigest())

    @property
    def id(self):
        """The (unique) ID of the gene set."""
        return self._id

    @property
    def name(self):
        """The name of the gene set."""
        return self._name

    @property
    def genes(self):
        """The genes in the gene set (a frozenset)."""
        return self._genes

    @property
    def source(self):
        """The source / origin of the gene set, or None."""
        return self._source

    @property
    def collection(self):
        """The collection that the gene set belongs to, or None."""
        return self._collection

    @property
    def description(self):
        """The description of the gene set, or None."""
        return self._description

    @property
    def size(self):
        """The size of the gene set (i.e., the number of genes in it)."""
        return len(self._genes)

    def to_list(self):
        """Converts the GeneSet object to a flat list of strings.

        Note: see also :meth:`from_list`.

        Returns
        -------
        list of str
            The data from the GeneSet object as a flat list, in the order:
            ID, source, collection, name, genes (sorted, comma-separated),
            description. Unset optional fields are represented by empty
            strings.
        """
        src = self._source or ''
        coll = self._collection or ''
        desc = self._description or ''
        return [self._id, src, coll, self._name,
                ','.join(sorted(self._genes)), desc]

    @classmethod
    def from_list(cls, l):
        """Generate a GeneSet object from a list of strings.

        Note: See also :meth:`to_list`.

        Parameters
        ----------
        l: list or tuple of str
            A list of strings representing, in this order: gene set ID,
            source, collection, name, genes, and description. The genes
            must be comma-separated. Empty strings for source, collection
            and description are converted to None.

        Returns
        -------
        `genometools.basic.GeneSet`
            The gene set.
        """
        id_ = l[0]
        name = l[3]
        # ''.split(',') would yield [''], i.e. one bogus empty gene name,
        # so an empty genes field must map to an empty gene set.
        genes = l[4].split(',') if l[4] else []
        src = l[1] or None
        coll = l[2] or None
        desc = l[5] or None
        return cls(id_, name, genes, src, coll, desc)