# Source code for genometools.expression.normalize

# Copyright (c) 2015, 2016 Florian Wagner
#
# This file is part of GenomeTools.
#
# GenomeTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License, Version 3,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Functions for normalizing expression data.

Note: Currently, only quantile normalization is implemented.
"""

import logging

import numpy as np

from . import ExpMatrix

logger = logging.getLogger(__name__)


def quantile_normalize(matrix, inplace=False, target=None):
    """Quantile normalization, allowing for missing values (NaN).

    In case of NaN values, this implementation calculates evenly spaced
    quantiles of the observed values in each column and fills in the
    missing data with those values. Quantile normalization is then
    performed on the filled-in data, and the NaN values are restored
    afterwards.

    Columns that consist entirely of NaN values cannot be filled in. They
    are excluded when the target distribution is calculated from the matrix
    (so they do not turn every other sample into NaN), and they are returned
    as all-NaN columns.

    Parameters
    ----------
    matrix: `ExpMatrix`
        The expression matrix (rows = genes, columns = samples).
    inplace: bool
        Whether or not to perform the operation in-place. [False]
    target: `numpy.ndarray`
        Target distribution to use. Needs to be a one-dimensional
        floating-point vector whose length matches the number of rows of
        the expression matrix. It is sorted before use. If ``None``, the
        target distribution is calculated based on the matrix itself
        (row-wise mean of the column-sorted data). [None]

    Returns
    -------
    `ExpMatrix`
        The normalized matrix. This is the object passed in if ``inplace``
        is ``True``, and a modified copy otherwise.

    Raises
    ------
    AssertionError
        If an argument has the wrong type.
    ValueError
        If ``target`` is not a vector with one entry per matrix row.
    """
    assert isinstance(matrix, ExpMatrix)
    assert isinstance(inplace, bool)
    if target is not None:
        # np.floating (not the removed alias np.float) accepts any float
        # precision.
        assert isinstance(target, np.ndarray) and \
               np.issubdtype(target.dtype, np.floating)

    if not inplace:
        # make a copy of the original data
        matrix = matrix.copy()

    # Do all computations on a private float array. Whether ``matrix.X`` is
    # a writable view of the underlying data or a copy depends on pandas, so
    # the matrix itself is only written to once, at the very end.
    X = np.array(matrix.X, dtype=np.float64)
    p, n = X.shape

    if target is not None and (target.ndim != 1 or target.shape[0] != p):
        # Checked before anything is modified, so that an in-place call
        # never leaves the matrix half-processed.
        raise ValueError(
            'Target distribution must be a vector of length %d (the number '
            'of rows of the matrix), but has shape %s.'
            % (p, str(target.shape)))

    # fill in missing values with evenly spaced quantiles
    nan = []
    for j in range(n):
        col = X[:, j]  # view: assignments below modify X
        missing = np.flatnonzero(np.isnan(col))
        nan.append(missing)
        if 0 < missing.size < p:
            q = np.arange(1, missing.size + 1, dtype=np.float64) / \
                (missing.size + 1.0)
            col[missing] = np.nanpercentile(col, 100 * q)

    # columns with at least one observed value
    observed = np.array([missing.size < p for missing in nan], dtype=bool)

    # generate sorting indices
    A = np.argsort(X, axis=0, kind='mergesort')  # mergesort is stable

    # determine target distribution
    if target is None:
        # No target distribution is specified, calculate one based on the
        # expression matrix (ignoring all-NaN columns).
        if observed.any():
            target = np.mean(np.sort(X[:, observed], axis=0), axis=1)
        else:
            target = np.full(p, np.nan)
    else:
        # Use specified target distribution (after sorting).
        target = np.sort(target)

    # generate indices to reverse sorting (= rank of each value within its
    # column)
    A = np.argsort(A, axis=0, kind='mergesort')  # mergesort is stable

    # quantile-normalize: every value is replaced by the target value of
    # the same rank
    normalized = target[A]

    # set missing values to NaN again
    for j in range(n):
        if nan[j].size > 0:
            normalized[nan[j], j] = np.nan

    # write the result back into the expression matrix
    for j in range(n):
        matrix.iloc[:, j] = normalized[:, j]

    return matrix