Source code for scedar.eda.stats

import numpy as np


[docs]def gc1d(x):
    """
    Compute Gini Index for 1D array.

    References
    ----------
    [1] http://mathworld.wolfram.com/GiniCoefficient.html

    [2] Damgaard, C. and Weiner, J. "Describing Inequality in Plant Size
    or Fecundity." Ecology 81, 1139-1142, 2000.

    [3] Dixon, P. M.; Weiner, J.; Mitchell-Olds, T.; and Woodley, R.
    "Bootstrapping the Gini Coefficient of Inequality."
    Ecology 68, 1548-1551, 1987.

    [4] Dixon, P. M.; Weiner, J.; Mitchell-Olds, T.; and Woodley, R.
    "Erratum to 'Bootstrapping the Gini Coefficient of Inequality.' "
    Ecology 69, 1307, 1988.

    [5] https://en.wikipedia.org/wiki/Gini_coefficient

    [6] https://github.com/oliviaguest/gini/blob/master/gini.py
    """
    x = np.array(x, dtype="float64")
    if x.ndim != 1:
        raise ValueError("Only support 1D array.")
    if x.shape[0] == 0:
        raise ValueError("Array size cannot be 0.")
    if x.shape[0] == 1:
        return np.nan
    # sort into ascending order
    xs = np.sort(x)
    if xs[0] == xs[-1]:
        return 0
    # Wrap with np.int64 to prevent division by 0
    n = np.int64(x.shape[0])
    xs_ranks = np.arange(1, n+1)
    xmean = x.mean()
    gc = ((2 * xs_ranks - n - 1) * xs).sum() / (n * (n-1) * xmean)
    return gc


# Multiple testing correction adapted from
# https://github.com/CoBiG2/cobig_misc_scripts/blob/master/FDR.py
#
# Following is the lincense info:
# Copyright 2017 Francisco Pina Martins <f.pinamartins@gmail.com>
# This file is part of structure_threader.
# structure_threader is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# structure_threader is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with structure_threader. If not, see <http://www.gnu.org/licenses/>.
#
# Taken from https://stackoverflow.com/a/21739593/3091595, ported to python 3
# and improved readability.
[docs]def multiple_testing_correction(pvalues, correction_type="FDR"):
    """
    Consistent with R.

    correct_pvalues_for_multiple_testing([0.0, 0.01, 0.029, 0.03, 0.031, 0.05,
                                          0.069, 0.07, 0.071, 0.09, 0.1])
    """
    pvalues = np.array(pvalues)
    sample_size = pvalues.shape[0]
    qvalues = np.empty(sample_size)
    if correction_type == "Bonferroni":
        # Bonferroni correction
        qvalues = sample_size * pvalues
    elif correction_type == "FDR":
        # Benjamini-Hochberg, AKA - FDR test
        values = [(pvalue, i) for i, pvalue in enumerate(pvalues)]
        values.sort()
        values.reverse()
        new_values = []
        for i, vals in enumerate(values):
            rank = sample_size - i
            pvalue, index = vals
            new_values.append((sample_size/rank) * pvalue)
        for i in range(0, int(sample_size)-1):
            if new_values[i] < new_values[i+1]:
                new_values[i+1] = new_values[i]
        for i, vals in enumerate(values):
            pvalue, index = vals
            qvalues[index] = new_values[i]
    else:
        raise ValueError("Not supported correction type "
                         "{}".format(correction_type))
    return np.clip(qvalues, 0, 1)


# lower upper bound
# start, end, lb, ub should all be scalar
[docs]def bidir_ReLU(x, start, end, lb=0, ub=1):
    if start > end:
        raise ValueError("start should <= end"
                         "start: {}. end: {}.".format(start, end))

    if lb > ub:
        raise ValueError("lb should <= ub"
                         "lower bound: {}. "
                         "upper bound: {}. ".format(start, end))

    if start < end:
        width = end - start
        height = ub - lb
        return np.clip(a=height * (x - start) / width + lb,
                       a_min=lb, a_max=ub)
    else:
        # start == end
        return np.where(x >= start, ub, lb)