Source code for scedar.eda.slcs

import inspect
import itertools
from collections import defaultdict, namedtuple

import numpy as np

from scipy.stats import ks_2samp

from sklearn.model_selection import train_test_split
import sklearn.utils

import matplotlib as mpl
import matplotlib.colors
import seaborn as sns

import xgboost as xgb

from scedar.eda.sdm import SampleDistanceMatrix
from scedar.eda.plot import swarm
from scedar.eda.plot import heatmap
from scedar.eda import mdl
from scedar.eda import mtype
from scedar import utils


class SingleLabelClassifiedSamples(SampleDistanceMatrix):
    """Data structure of single label classified samples

    Attributes:
        _x (2D number array): (n_samples, n_features) data matrix.
        _d (2D number array): (n_samples, n_samples) distance matrix.
        _labs (list of labels): list of labels in the same type, int or str.
        _fids (list of feature IDs): list of feature IDs in the same type,
            int or str.
        _sids (list of sample IDs): list of sample IDs in the same type,
            int or str.
        _metric (str): Distance metric.

    Note:
        If sorted by labels, the samples will be reordered, so that samples
        from left to right go from one label to another.
    """
    # sid, lab, fid, x

    def __init__(self, x, labs, sids=None, fids=None, d=None,
                 metric="cosine", use_pdist=True, nprocs=None):
        # sids: sample IDs. String or int.
        # labs: sample classified labels. String or int.
        # x: (n_samples, n_features)
        super(SingleLabelClassifiedSamples, self).__init__(
            x=x, d=d, metric=metric, use_pdist=use_pdist,
            sids=sids, fids=fids, nprocs=nprocs)
        mtype.check_is_valid_labs(labs)
        labs = np.array(labs)
        if self._sids.shape[0] != labs.shape[0]:
            raise ValueError("sids must have the same length as labs")
        self._labs = labs
        self._set_up_lab_rel_attrs()
        # keep a copy of the original labels
        self._orig_labs = labs
        self._xgb_lut = {}
        return

    def _set_up_lab_rel_attrs(self):
        """Set up label-related attributes"""
        self._uniq_labs, self._uniq_lab_cnts = np.unique(
            self._labs, return_counts=True)
        # {lab: array([sid0, ...]), ...}
        sid_lut = {}
        for ulab in self._uniq_labs:
            sid_lut[ulab] = self._sids[self._labs == ulab]
        self._sid_lut = sid_lut
        # {sid1: lab1, ...}
        lab_lut = {}
        # sids only contain unique values
        for i in range(self._sids.shape[0]):
            lab_lut[self._sids[i]] = self._labs[i]
        self._lab_lut = lab_lut
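    # Usage sketch (editor's addition, not part of the original module):
    # a minimal, hypothetical construction with a toy matrix and labels.
    # All values below are illustrative assumptions.
    #
    # >>> import numpy as np
    # >>> from scedar.eda.slcs import SingleLabelClassifiedSamples
    # >>> slcs = SingleLabelClassifiedSamples(
    # ...     np.arange(10).reshape(5, 2), labs=[0, 0, 1, 1, 2],
    # ...     sids=list("abcde"), fids=["f1", "f2"])
    # >>> slcs.labs
    # [0, 0, 1, 1, 2]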
    def sort_by_labels(self):
        """
        Return a copy with sample indices sorted by labels and distances.
        """
        labels = np.array(self.labs)
        # slcs is empty
        if len(labels) == 0 or self._x.size == 0:
            return self.ind_x()
        uniq_labs = np.unique(labels)
        s_ind_lut = dict([(ulab, np.where(labels == ulab)[0])
                          for ulab in uniq_labs])
        # sort within each label
        for ulab in uniq_labs:
            # get sample indices of that class
            s_inds = s_ind_lut[ulab]
            # sort that class by distance to the first sample
            # get a list of distances to the first sample
            s_dist_to_s0_list = [self._d[s_inds[0], s_inds[i]]
                                 for i in range(len(s_inds))]
            # sort indices by distances
            sorted_s_inds = s_inds[np.argsort(s_dist_to_s0_list,
                                              kind="mergesort")]
            # update lut
            s_ind_lut[ulab] = sorted_s_inds
        # sort classes by distances of first samples
        # first sample indices
        lab_fs_inds = [s_ind_lut[ulab][0] for ulab in uniq_labs]
        # distance of first samples to the first class's first sample
        lab_fs_dist_to_fc_list = [self._d[lab_fs_inds[0], lab_fs_inds[i]]
                                  for i in range(len(lab_fs_inds))]
        sorted_ulabs = uniq_labs[np.argsort(lab_fs_dist_to_fc_list,
                                            kind="mergesort")]
        sorted_s_inds = np.concatenate([s_ind_lut[ulab]
                                        for ulab in sorted_ulabs])
        return self.ind_x(sorted_s_inds)
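    # Usage sketch (editor's addition): sorting groups samples of the same
    # label into contiguous blocks; within a label, samples are ordered by
    # distance to that label's first sample, and labels are ordered by the
    # distances between their first samples. Assuming `slcs` from above:
    #
    # >>> sorted_slcs = slcs.sort_by_labels()
    # >>> sorted_slcs.labs   # contiguous label blocks, e.g. [0, 0, 1, 1, 2]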
    def filter_min_class_n(self, min_class_n):
        """Keep samples whose label has at least min_class_n members."""
        uniq_lab_cnts = np.unique(self._labs, return_counts=True)
        nf_sid_ind = np.in1d(
            self._labs,
            (uniq_lab_cnts[0])[uniq_lab_cnts[1] >= min_class_n])
        return self.ind_x(nf_sid_ind)
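    # Usage sketch (editor's addition): with labs [0, 0, 1, 1, 2],
    # requiring at least 2 samples per class drops the sample labeled 2.
    #
    # >>> filtered = slcs.filter_min_class_n(2)
    # >>> filtered.labs
    # [0, 0, 1, 1]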
    def labs_to_sids(self, labs):
        return tuple(tuple(self._sid_lut[y].tolist()) for y in labs)
    def sids_to_labs(self, sids):
        return np.array([self._lab_lut[x] for x in sids])
    def ind_x(self, selected_s_inds=None, selected_f_inds=None):
        """
        Subset samples by (sample indices, feature indices).

        Parameters
        ----------
        selected_s_inds: int array
            Index array of selected samples. If None, select all.
        selected_f_inds: int array
            Index array of selected features. If None, select all.

        Returns
        -------
        subset: SingleLabelClassifiedSamples
        """
        if selected_s_inds is None:
            selected_s_inds = slice(None, None)
        if selected_f_inds is None:
            selected_f_inds = slice(None, None)
        if self._use_pdist:
            d = self._d[selected_s_inds, :][:, selected_s_inds].copy()
        else:
            d = None
        return SingleLabelClassifiedSamples(
            x=self._x[selected_s_inds, :][:, selected_f_inds].copy(),
            d=d,
            labs=self._labs[selected_s_inds].tolist(),
            metric=self._metric,
            use_pdist=self._use_pdist,
            sids=self._sids[selected_s_inds].tolist(),
            fids=self._fids[selected_f_inds].tolist(),
            nprocs=self._nprocs)
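    # Usage sketch (editor's addition): integer-index subsetting returns a
    # new SingleLabelClassifiedSamples whose labels, sids, and fids are
    # subset consistently with x and d.
    #
    # >>> sub = slcs.ind_x([0, 2], [1])   # samples 0 and 2, feature 1
    # >>> sub.labs
    # [0, 1]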
    def merge_labels(self, orig_labs_to_merge, new_lab):
        """Merge selected labels into a new label

        Args:
            orig_labs_to_merge (list of unique labels): original labels to
                be merged into the new label
            new_lab (label): new label of the merged labels

        Returns:
            None

        Note:
            Updates labels in place.
        """
        if not mtype.is_valid_lab(new_lab):
            raise ValueError("new_lab, {}, must be str or "
                             "int".format(new_lab))
        mtype.check_is_valid_labs(orig_labs_to_merge)
        # all labs must be unique
        if len(orig_labs_to_merge) != len(np.unique(orig_labs_to_merge)):
            raise ValueError("orig_labs_to_merge must all be unique")
        for ulab in orig_labs_to_merge:
            if ulab not in self._uniq_labs:
                raise ValueError("label {} not in original unique "
                                 "labels".format(ulab))
        updated_labs = self._labs.copy()
        for i, lab in enumerate(self._labs):
            if lab in orig_labs_to_merge:
                updated_labs[i] = new_lab
        self._labs = updated_labs
        self._set_up_lab_rel_attrs()
        return
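    # Usage sketch (editor's addition): merging updates labels in place,
    # e.g. collapsing labels 0 and 1 into a new label 9 while 2 stays.
    #
    # >>> slcs2 = slcs.ind_x()      # work on a copy
    # >>> slcs2.merge_labels([0, 1], 9)
    # >>> slcs2.labs
    # [9, 9, 9, 9, 2]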
    def relabel(self, labels):
        """
        Return a new SingleLabelClassifiedSamples with new labels.
        """
        return SingleLabelClassifiedSamples(
            x=self._x.copy(), labs=labels, d=self._d.copy(),
            sids=self._sids.tolist(), fids=self._fids.tolist(),
            metric=self._metric, nprocs=self._nprocs)
    def id_x(self, selected_sids=None, selected_fids=None):
        """
        Subset samples by (sample IDs, feature IDs).

        Parameters
        ----------
        selected_sids: id array
            IDs of selected samples. If None, select all.
        selected_fids: id array
            IDs of selected features. If None, select all.

        Returns
        -------
        subset: SingleLabelClassifiedSamples
        """
        if selected_sids is None:
            selected_s_inds = None
        else:
            selected_s_inds = self.s_id_to_ind(selected_sids)
        if selected_fids is None:
            selected_f_inds = None
        else:
            selected_f_inds = self.f_id_to_ind(selected_fids)
        return self.ind_x(selected_s_inds, selected_f_inds)
    @staticmethod
    def select_labs_bool_inds(ref_labs, selected_labs):
        if selected_labs is None:
            return np.repeat(True, len(ref_labs))
        if np.isscalar(selected_labs):
            selected_labs = [selected_labs]
        ref_uniq_labs = np.unique(ref_labs).tolist()
        if not all([x in ref_uniq_labs for x in selected_labs]):
            raise ValueError("selected_labs: {} do not all exist in the "
                             "unique ref labels "
                             "{}".format(selected_labs, ref_uniq_labs))
        lab_selected_s_bool_inds = np.in1d(ref_labs, selected_labs)
        return lab_selected_s_bool_inds
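    # Usage sketch (editor's addition): the static helper maps selected
    # labels to a boolean mask over ref_labs; None selects all samples.
    #
    # >>> SingleLabelClassifiedSamples.select_labs_bool_inds(
    # ...     [0, 0, 1, 1, 2], 1)
    # array([False, False,  True,  True, False])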
    def lab_x_bool_inds(self, selected_labs):
        return self.select_labs_bool_inds(self._labs, selected_labs)
    def lab_x(self, selected_labs):
        lab_selected_s_bool_inds = self.lab_x_bool_inds(selected_labs)
        return self.ind_x(lab_selected_s_bool_inds)
    @staticmethod
    def _xgb_train_runner(x, lab_inds, fids, test_size=0.3,
                          num_boost_round=10, nprocs=1, random_state=None,
                          silent=1, xgb_params=None):
        """
        Run xgboost train with prepared data and parameters.

        Returns
        -------
        sorted_fscore_list: list
            Important features and their fscores, in descending order.
        bst: xgb.Booster
            Fitted model.
        eval_stats: list
            Final test and train evaluation metrics.
        """
        n_uniq_labs = len(np.unique(lab_inds))
        # xgb only takes strings as feature names
        str_fids = list(map(str, fids))
        orig_fid_lut = dict(zip(str_fids, fids))
        # Prepare default xgboost parameters
        xgb_random_state = 0 if random_state is None else random_state
        if xgb_params is None:
            xgb_params = {
                "eta": 0.3,
                "max_depth": 6,
                "silent": silent,
                "nthread": nprocs,
                "alpha": 1,
                "lambda": 0,
                "seed": xgb_random_state
            }
            if n_uniq_labs == 2:
                # binary classification
                xgb_params["objective"] = "binary:logistic"
                xgb_params["eval_metric"] = "error"
            else:
                # multi-class classification
                xgb_params["num_class"] = n_uniq_labs
                xgb_params["objective"] = "multi:softmax"
                xgb_params["eval_metric"] = "merror"
        # split training and testing
        # random state determined by numpy
        train_x, test_x, train_labs, test_labs = train_test_split(
            x, lab_inds, test_size=test_size)
        # xgb data structures to hold data and labels
        dtrain = xgb.DMatrix(train_x, train_labs, feature_names=str_fids)
        dtest = xgb.DMatrix(test_x, test_labs, feature_names=str_fids)
        # list of data to evaluate
        eval_list = [(dtest, "test"), (dtrain, "train")]
        evals_result = {}
        if silent:
            verbose_eval = False
        else:
            verbose_eval = True
        # bst is the trained boosted tree model
        bst = xgb.train(xgb_params, dtrain, num_boost_round, eval_list,
                        evals_result=evals_result,
                        verbose_eval=verbose_eval)
        # Turn dict into a list
        # [ [('train...', float), ...],
        #   [('test...', float), ...] ]
        eval_stats = [[(eval_name + " " + mname,
                        mval_list[num_boost_round - 1])
                       for mname, mval_list in eval_dict.items()]
                      for eval_name, eval_dict in evals_result.items()]
        # {feature_name: fscore, ...}
        fscore_dict = bst.get_fscore()
        orig_fid_fscore_list = [(orig_fid_lut[istr_fid], ifscore)
                                for istr_fid, ifscore
                                in fscore_dict.items()]
        sorted_fscore_list = sorted(orig_fid_fscore_list,
                                    key=lambda t: t[1], reverse=True)
        return sorted_fscore_list, bst, eval_stats
    def feature_importance_across_labs(self, selected_labs, test_size=0.3,
                                       num_boost_round=10, nprocs=1,
                                       random_state=None, silent=1,
                                       xgb_params=None,
                                       num_bootstrap_round=0,
                                       bootstrap_size=None,
                                       shuffle_features=False):
        """
        Use xgboost to determine the importance of features in
        distinguishing samples with different labels.

        Run cross validation on the dataset and obtain important features.

        Parameters
        ----------
        selected_labs: label list
            Labels to compare using xgboost.
        test_size: float in range (0, 1)
            Ratio of samples to be used for testing.
        num_bootstrap_round: int
            Perform `num_bootstrap_round` rounds of simple bootstrapping
            if `num_bootstrap_round > 0`.
        bootstrap_size: int
            The number of samples for each bootstrapping run.
        shuffle_features: bool
        num_boost_round: int
            The number of rounds for xgboost training.
        random_state: int
        nprocs: int
        xgb_params: dict
            Parameters for the xgboost run. If None, defaults will be
            used. If provided, they will be used directly for the
            xgbooster.

        Returns
        -------
        feature_importance_list: list of feature importance of each run
            [(feature_id, mean of fscore across all bootstrapping rounds,
              standard deviation of fscore across all bootstrapping
              rounds, number of times used across all bootstrapping
              rounds), ...]
        bst_list: list of xgb Booster
            Fitted boosted tree models.

        Notes
        -----
        If multiple features are highly correlated, they may not all show
        up in the resulting tree. You could try to reduce redundant
        features before comparing different clusters, or you could further
        interpret the important features after obtaining them.

        For details about xgboost parameters, check the following links:

        [1] https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

        [2] http://xgboost.readthedocs.io/en/latest/python/python_intro.html

        [3] http://xgboost.readthedocs.io/en/latest/parameter.html

        [4] https://xgboost.readthedocs.io/en/latest/how_to/param_tuning.html
        """
        num_boost_round = int(num_boost_round)
        if num_boost_round <= 0:
            raise ValueError("num_boost_round must be >= 1")
        # This is for implementing caching in the future.
        selected_uniq_labs = np.unique(selected_labs).tolist()
        # subset SLCS
        lab_selected_slcs = self.lab_x(selected_uniq_labs)
        # unique labels in SLCS after subsetting
        # Since lab_x checks whether selected labels all exist, the unique
        # labels of the subset are equivalent to the input selected labels.
        uniq_labs = lab_selected_slcs._uniq_labs.tolist()
        # convert labels into indices from 0 to n_classes
        n_uniq_labs = len(uniq_labs)
        if n_uniq_labs <= 1:
            raise ValueError("The number of unique labels should be > 1. "
                             "Provided unique labs: "
                             "{}".format(uniq_labs))
        lab_ind_lut = dict(zip(uniq_labs, range(n_uniq_labs)))
        lab_inds = [lab_ind_lut[lab] for lab in lab_selected_slcs._labs]
        np.random.seed(random_state)
        # shuffle features if necessary
        fids = lab_selected_slcs.fids
        if shuffle_features:
            feature_inds = np.arange(lab_selected_slcs._x.shape[1])
            feature_inds, fids = sklearn.utils.shuffle(feature_inds, fids)
        else:
            feature_inds = slice(None, None)
        # perform bootstrapping if necessary
        num_bootstrap_round = int(num_bootstrap_round)
        if num_bootstrap_round <= 0:
            # no bootstrapping
            # _xgb_train_runner returns (fscores, bst, eval_stats)
            fscores, bst, eval_stats = self._xgb_train_runner(
                lab_selected_slcs._x[:, feature_inds], lab_inds, fids,
                test_size=test_size, num_boost_round=num_boost_round,
                xgb_params=xgb_params, random_state=random_state,
                nprocs=nprocs, silent=silent)
            # make sorted_fs_list consistent with the bootstrapped one
            fs_list = [(t[0], t[1], 0, 1, [t[1]]) for t in fscores]
            sorted_fs_list = sorted(fs_list, key=lambda t: (t[3], t[1]),
                                    reverse=True)
            print(eval_stats)
            bst_list = [bst]
        else:
            # do bootstrapping
            # ([dict of scores], [list of bsts], dict of eval stats)
            fs_dict = defaultdict(list)
            bst_list = []
            eval_stats_dict = defaultdict(list)
            if bootstrap_size is None:
                bootstrap_size = lab_selected_slcs._x.shape[0]
            sample_inds = np.arange(lab_selected_slcs._x.shape[0])
            # bootstrapping rounds
            for i in range(num_bootstrap_round):
                # random state determined by numpy
                # ensure all labels are present
                # initialize resampled sample indices and labels
                bs_s_inds, bs_lab_inds = sklearn.utils.resample(
                    sample_inds, lab_inds, replace=True,
                    n_samples=bootstrap_size)
                while len(np.unique(bs_lab_inds)) != n_uniq_labs:
                    bs_s_inds, bs_lab_inds = sklearn.utils.resample(
                        sample_inds, lab_inds, replace=True,
                        n_samples=bootstrap_size)
                fscores, bst, eval_stats = self._xgb_train_runner(
                    lab_selected_slcs._x[bs_s_inds, :][:, feature_inds],
                    bs_lab_inds, fids, test_size=test_size,
                    num_boost_round=num_boost_round,
                    xgb_params=xgb_params, random_state=random_state,
                    nprocs=nprocs, silent=silent)
                # collect fscores
                for fid, fs in fscores:
                    fs_dict[fid] += [fs]
                bst_list.append(bst)
                # eval_stats:
                # [ [('train...', float), ...],
                #   [('test...', float), ...] ]
                for elist in eval_stats:
                    for ename, evalue in elist:
                        eval_stats_dict[ename].append(evalue)
                if shuffle_features:
                    feature_inds, fids = sklearn.utils.shuffle(
                        feature_inds, fids)
            # score summary: average, std, number of times used
            fid_s_list = []
            for fid, fs in fs_dict.items():
                fid_s_list.append((fid, np.mean(fs), np.std(fs, ddof=0),
                                   len(fs), fs))
            sorted_fs_list = sorted(fid_s_list,
                                    key=lambda t: (t[3], t[1]),
                                    reverse=True)
            # print mean and std of eval stats
            for ename, evalue_list in eval_stats_dict.items():
                print("{}: mean {}, std {}".format(
                    ename, np.mean(evalue_list),
                    np.std(evalue_list, ddof=1)))
        # return the same things for both branches
        return sorted_fs_list, bst_list
    def feature_importance_distintuishing_labs(self, selected_labs,
                                               test_size=0.3,
                                               num_boost_round=10,
                                               nprocs=1, random_state=None,
                                               silent=1, xgb_params=None,
                                               num_bootstrap_round=0,
                                               bootstrap_size=None,
                                               shuffle_features=False):
        """
        Use xgboost to compare the selected labels against all others.
        """
        selected_s_bool_inds = self.lab_x_bool_inds(selected_labs)
        # binary labels distinguishing selected and non-selected samples
        io_bin_lab_arr = ["selected" if s else "non-selected"
                          for s in selected_s_bool_inds]
        # create a new SLCS instance with the new labels
        nl_slcs = self.relabel(io_bin_lab_arr)
        fi_res = nl_slcs.feature_importance_across_labs(
            ["selected", "non-selected"], test_size=test_size,
            num_boost_round=num_boost_round, nprocs=nprocs,
            random_state=random_state, silent=silent,
            xgb_params=xgb_params,
            num_bootstrap_round=num_bootstrap_round,
            bootstrap_size=bootstrap_size,
            shuffle_features=shuffle_features)
        return fi_res
    def feature_importance_each_lab(self, test_size=0.3,
                                    num_boost_round=10, nprocs=1,
                                    random_state=None, silent=1,
                                    xgb_params=None, num_bootstrap_round=0,
                                    bootstrap_size=None,
                                    shuffle_features=False):
        """
        Use xgboost to compare each label with all others. Experimental.
        """
        # Construct important feature lut
        # {ulab0: [if1, if2, ...], ...}
        ulab_fi_lut = defaultdict(list)
        for ulab in self._uniq_labs:
            # get bool indices of the current label
            ulab_s_bool_inds = self.lab_x_bool_inds(ulab)
            # compare the current label with all other samples
            fi_res = self.feature_importance_distintuishing_labs(
                ulab, test_size=test_size,
                num_boost_round=num_boost_round, nprocs=nprocs,
                random_state=random_state, silent=silent,
                xgb_params=xgb_params,
                num_bootstrap_round=num_bootstrap_round,
                bootstrap_size=bootstrap_size,
                shuffle_features=shuffle_features)
            for fid in [t[0] for t in fi_res[0]]:
                fx = self.f_id_x_vec(fid)
                # current label values
                ulab_x = fx[ulab_s_bool_inds]
                # other values
                other_x = fx[np.logical_not(ulab_s_bool_inds)]
                # current label mean
                ulab_x_mean = np.mean(ulab_x)
                # other mean
                other_x_mean = np.mean(other_x)
                # relative difference of means ("mean fold change")
                ulab_mfc = (ulab_x_mean - other_x_mean) / ulab_x_mean
                # KS test result
                ks_res = ks_2samp(ulab_x, other_x)
                ulab_fi_lut[ulab].append((fid, ulab_mfc, ks_res.pvalue))
            ulab_fi_lut[ulab].sort(key=lambda t: t[1], reverse=True)
        return ulab_fi_lut
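    # Worked note (editor's addition): ulab_mfc is a relative difference
    # of means rather than a classic fold change. E.g. a label mean of 4.0
    # against an other-sample mean of 1.0 gives
    # ulab_mfc = (4.0 - 1.0) / 4.0 = 0.75; it is undefined when the label
    # mean is 0.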
    def tsne_plot(self, gradient=None, labels=None, selected_labels=None,
                  shuffle_label_colors=False, title=None, xlab=None,
                  ylab=None, figsize=(20, 20), add_legend=True,
                  n_txt_per_cluster=3, alpha=1, s=0.5, random_state=None,
                  **kwargs):
        """
        Plot the last t-SNE projection with the provided gradient as color.
        """
        if labels is None:
            labels = self.labs
        return super(SingleLabelClassifiedSamples, self).tsne_plot(
            gradient=gradient, labels=labels,
            selected_labels=selected_labels,
            shuffle_label_colors=shuffle_label_colors, title=title,
            xlab=xlab, ylab=ylab, figsize=figsize, add_legend=add_legend,
            n_txt_per_cluster=n_txt_per_cluster, alpha=alpha, s=s,
            random_state=random_state, **kwargs)
    def tsne_feature_gradient_plot(self, fid, transform=None, labels=None,
                                   selected_labels=None,
                                   shuffle_label_colors=False, title=None,
                                   xlab=None, ylab=None, figsize=(20, 20),
                                   add_legend=True, n_txt_per_cluster=3,
                                   alpha=1, s=0.5, random_state=None,
                                   **kwargs):
        """
        Plot the last t-SNE projection with the provided feature gradient
        as color.

        Parameters
        ----------
        fid: feature id scalar
            ID of the feature to be used for the gradient plot.
        transform: callable
            Transform applied to the feature values before plotting.
        """
        if labels is None:
            labels = self.labs
        return super(SingleLabelClassifiedSamples,
                     self).tsne_feature_gradient_plot(
            fid=fid, transform=transform, labels=labels,
            selected_labels=selected_labels,
            shuffle_label_colors=shuffle_label_colors, title=title,
            xlab=xlab, ylab=ylab, figsize=figsize, add_legend=add_legend,
            n_txt_per_cluster=n_txt_per_cluster, alpha=alpha, s=s,
            random_state=random_state, **kwargs)
    def feature_swarm_plot(self, fid, transform=None, labels=None,
                           selected_labels=None, title=None, xlab=None,
                           ylab=None, figsize=(10, 10)):
        f_ind = self.f_id_to_ind([fid])[0]
        fx = self.f_ind_x_vec(f_ind)
        if transform is not None:
            if callable(transform):
                fx = np.array(list(map(transform, fx)))
            else:
                raise ValueError("transform must be a callable")
        if labels is not None:
            if len(labels) != fx.shape[0]:
                raise ValueError("labels ({}) must have the same length "
                                 "as n_samples.".format(labels))
        else:
            labels = self.labs
        return swarm(fx, labels=labels, selected_labels=selected_labels,
                     title=title, xlab=xlab, ylab=ylab, figsize=figsize)
    def dmat_heatmap(self, selected_labels=None, col_labels=None,
                     transform=None, title=None, xlab=None, ylab=None,
                     figsize=(10, 10), **kwargs):
        """
        Plot the distance matrix as a heatmap, with rows colored by the
        current labels.
        """
        selected_s_bool_inds = self.lab_x_bool_inds(selected_labels)
        selected_labels = self._labs[selected_s_bool_inds].tolist()
        selected_d = self._d[selected_s_bool_inds, :][:,
                                                      selected_s_bool_inds]
        return heatmap(selected_d, row_labels=selected_labels,
                       col_labels=col_labels, transform=transform,
                       title=title, xlab=xlab, ylab=ylab, figsize=figsize,
                       **kwargs)
    def xmat_heatmap(self, selected_labels=None, selected_fids=None,
                     col_labels=None, transform=None, title=None,
                     xlab=None, ylab=None, figsize=(10, 10), **kwargs):
        """
        Plot x as a heatmap.
        """
        selected_s_bool_inds = self.lab_x_bool_inds(selected_labels)
        selected_s_ids = self._sids[selected_s_bool_inds]
        selected_slcs = self.id_x(selected_s_ids, selected_fids)
        return heatmap(selected_slcs._x, row_labels=selected_slcs.labs,
                       col_labels=col_labels, transform=transform,
                       title=title, xlab=xlab, ylab=ylab, figsize=figsize,
                       **kwargs)
    @property
    def labs(self):
        return self._labs.tolist()

    # Sort the clustered sample IDs with the reference order of another.
    #
    # Sort sids according to labs.
    # If ref_sid_order is not None:
    #     sort sids further according to ref_sid_order.
    def lab_sorted_sids(self, ref_sid_order=None):
        sep_lab_sid_list = []
        sep_lab_list = []
        for iter_lab in sorted(self._sid_lut.keys()):
            iter_sid_arr = self._sid_lut[iter_lab]
            sep_lab_sid_list.append(iter_sid_arr)
            sep_lab_list.append(np.repeat(iter_lab, len(iter_sid_arr)))

        if ref_sid_order is not None:
            mtype.check_is_valid_sfids(ref_sid_order)
            ref_sid_order = np.array(ref_sid_order)

            # sort r according to q
            # assumes:
            # - r contains all elements in q
            # - r is a 1d np array
            def sort_flat_sids(query_sids, ref_sids):
                return ref_sids[np.in1d(ref_sids, query_sids)]

            # sort each inner sid list while maintaining the order of
            # sep_lab_list
            sep_lab_sid_list = [sort_flat_sids(x, ref_sid_order)
                                for x in sep_lab_sid_list]
            sep_lab_min_sid_list = [x[0] for x in sep_lab_sid_list]
            sorted_sep_lab_min_sid_list = list(
                sort_flat_sids(sep_lab_min_sid_list, ref_sid_order))
            min_sid_sorted_sep_lab_ind_list = [
                sep_lab_min_sid_list.index(x)
                for x in sorted_sep_lab_min_sid_list
            ]
            sep_lab_list = [sep_lab_list[i]
                            for i in min_sid_sorted_sep_lab_ind_list]
            sep_lab_sid_list = [sep_lab_sid_list[i]
                                for i in min_sid_sorted_sep_lab_ind_list]

        lab_sorted_sid_arr = np.concatenate(sep_lab_sid_list)
        lab_sorted_lab_arr = np.concatenate(sep_lab_list)

        # check that sorted sids are the same set as the original
        np.testing.assert_array_equal(
            np.sort(lab_sorted_sid_arr), np.sort(self._sids))
        # check that sorted labs are the same set as the original
        np.testing.assert_array_equal(
            np.sort(lab_sorted_lab_arr), np.sort(self._labs))
        # check that sorted (sid, lab) pairs match the original
        np.testing.assert_array_equal(
            lab_sorted_lab_arr[np.argsort(lab_sorted_sid_arr)],
            self._labs[np.argsort(self._sids)])

        return (lab_sorted_sid_arr, lab_sorted_lab_arr)
    # See how well two clustering criteria match each other.
    def cross_labs(self, q_slc_samples):
        if not isinstance(q_slc_samples, SingleLabelClassifiedSamples):
            raise TypeError("Query should be an instance of "
                            "SingleLabelClassifiedSamples")
        try:
            ref_labs = np.array([self._lab_lut[x]
                                 for x in q_slc_samples.sids])
        except KeyError as e:
            raise ValueError("query sid {} is not in ref sids.".format(e))
        query_labs = np.array(q_slc_samples.labs)
        uniq_rlabs, uniq_rlab_cnts = np.unique(ref_labs,
                                               return_counts=True)
        cross_lab_lut = {}
        for i in range(len(uniq_rlabs)):
            # ref cluster i. query unique labs.
            ref_ci_quniq = tuple(map(list, np.unique(
                query_labs[np.where(np.array(ref_labs) == uniq_rlabs[i])],
                return_counts=True)))
            cross_lab_lut[uniq_rlabs[i]] = (uniq_rlab_cnts[i],
                                            tuple(map(tuple,
                                                      ref_ci_quniq)))
        return cross_lab_lut
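    # Usage sketch (editor's addition): compare two labelings of the same
    # samples. The returned lut maps each reference label to its sample
    # count and the query-label composition of those samples.
    #
    # >>> q = slcs.relabel(["x", "x", "x", "y", "y"])
    # >>> slcs.cross_labs(q)   # e.g. {0: (2, (('x',), (2,))), ...}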
class MDLSingleLabelClassifiedSamples(SingleLabelClassifiedSamples):
    """
    MDLSingleLabelClassifiedSamples inherits SingleLabelClassifiedSamples
    to offer MDL operations.

    Args:
        x (2d number array): data matrix
        labs (list of str or int): labels
        sids (list of str or int): sample ids
        fids (list of str or int): feature ids
        encode_type ("auto", "data", or "distance"): Type of values to
            encode. If "auto", encode data when n_features <= 100.
        mdl_method (mdl.Mdl): If None, use ZeroIGKdeMdl for encoded values
            with >= 50% zeros, and GKdeMdl otherwise.
        d (2d number array): distance matrix
        metric (str): distance metric for scipy
        nprocs (int)

    Attributes:
        _mdl_method (.mdl.Mdl)
    """
    LabMdlResult = namedtuple("LabMdlResult",
                              ["ulab_mdl_sum", "ulab_s_inds", "ulab_cnts",
                               "ulab_mdls", "cluster_mdl"])

    def __init__(self, x, labs, sids=None, fids=None, encode_type="data",
                 mdl_method=mdl.ZeroIGKdeMdl, d=None, metric="correlation",
                 nprocs=None):
        super(MDLSingleLabelClassifiedSamples, self).__init__(
            x=x, labs=labs, sids=sids, fids=fids, d=d, metric=metric,
            nprocs=nprocs)
        # initialize encode type
        if encode_type not in ("auto", "data", "distance"):
            raise ValueError("encode_type must be in "
                             "('auto', 'data', 'distance'). "
                             "Provided: {}".format(encode_type))
        if encode_type == "auto":
            if self._x.shape[1] > 100:
                encode_type = "distance"
            else:
                encode_type = "data"
        self._encode_type = encode_type
        # initialize mdl method
        if mdl_method is None:
            if encode_type == "data":
                ex = self._x
            else:
                ex = self._d
            if ex.size == 0:
                # empty matrix
                mdl_method = mdl.GKdeMdl
            else:
                n_nonzero = np.count_nonzero(ex)
                if n_nonzero / ex.size > 0.5:
                    mdl_method = mdl.GKdeMdl
                else:
                    mdl_method = mdl.ZeroIGKdeMdl
        self._mdl_method = mdl_method
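    # Usage sketch (editor's addition): a hypothetical instance encoding
    # raw data values; toy values are illustrative assumptions.
    #
    # >>> import numpy as np
    # >>> from scedar.eda.slcs import MDLSingleLabelClassifiedSamples
    # >>> mslcs = MDLSingleLabelClassifiedSamples(
    # ...     np.arange(10).reshape(5, 2), labs=[0, 0, 1, 1, 2],
    # ...     encode_type="data")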
    @staticmethod
    def per_col_encoders(x, encode_type, mdl_method=mdl.ZeroIGKdeMdl,
                         nprocs=1, verbose=False):
        """Compute an mdl encoder for each column

        Args:
            x (2d number array)
            encode_type ("data" or "distance")
            mdl_method (mdl.Mdl)
            nprocs (int)
            verbose (bool): Not implemented.

        Returns:
            list: column mdl encoders of x
        """
        if not inspect.isclass(mdl_method):
            raise ValueError("method must be a subclass of eda.mdl.Mdl")
        if not issubclass(mdl_method, mdl.Mdl):
            raise ValueError("method must be a subclass of eda.mdl.Mdl")
        if x.ndim != 2:
            raise ValueError("x should be 2D. x.shape: {}".format(x.shape))
        nprocs = max(int(nprocs), 1)
        if encode_type == "data":
            col_encoders = utils.parmap(
                lambda x1d: mdl_method(x1d), x.T, nprocs)
        elif encode_type == "distance":
            # for each sample, encode its distances to all other samples
            s_inds = list(range(x.shape[0]))
            xs_for_map = [x[s_inds[i], s_inds[:i] + s_inds[i+1:]]
                          for i in s_inds]

            def single_s_mdl_encoder(x1d):
                # copy indices for parallel processing
                return mdl_method(x1d)

            col_encoders = utils.parmap(
                single_s_mdl_encoder, xs_for_map, nprocs)
        else:
            raise NotImplementedError("unknown encode_type: "
                                      "{}".format(encode_type))
        return col_encoders
    def no_lab_mdl(self, nprocs=1, verbose=False):
        """Compute the mdl of the matrix without separating samples by
        labels

        Args:
            nprocs (int)
            verbose (bool): Not implemented.

        Returns:
            float: mdl of the matrix without separating samples by labels
        """
        # TODO: implement verbose
        if self._encode_type == "data":
            col_encoders = self.per_col_encoders(
                self._x, self._encode_type, self._mdl_method, nprocs,
                verbose)
            col_mdl_sum = np.sum(list(map(lambda e: e.mdl, col_encoders)))
        elif self._encode_type == "distance":
            col_encoders = self.per_col_encoders(
                self._d, self._encode_type, self._mdl_method, nprocs,
                verbose)
            col_mdl_sum = 0
            ulab_s_ind_list = [np.where(self._labs == ulab)[0].tolist()
                               for ulab in self._uniq_labs]
            for s_inds in ulab_s_ind_list:
                n_s_inds = len(s_inds)
                rn_inds = list(range(n_s_inds))
                # [(encoder, x), ...]
                enc_x_tups = []
                for i in rn_inds:
                    i_s_ind = s_inds[i]
                    non_i_s_inds = s_inds[:i] + s_inds[i+1:]
                    enc_x_tups.append((col_encoders[i_s_ind],
                                       self._d[non_i_s_inds, i_s_ind]))
                mdls = utils.parmap(lambda ext: ext[0].encode(ext[1]),
                                    enc_x_tups, nprocs)
                col_mdl_sum += sum(mdls)
        else:
            raise NotImplementedError("Do not change encode_type after "
                                      "init. Unknown encode type "
                                      "{}".format(self._encode_type))
        return col_mdl_sum
    @staticmethod
    def compute_cluster_mdl(labs, cl_mdl_scale_factor=1):
        """Additional MDL for encoding the clustering

        - labels are encoded by a multinomial distribution
        - parameters are encoded by 32-bit floats:
          np.log(2**32) = 22.18070977791825
        - scaled by cl_mdl_scale_factor

        TODO: formalize parameter mdl
        """
        uniq_labs, uniq_lab_cnts = np.unique(labs, return_counts=True)
        n_uniq_labs = len(uniq_lab_cnts)
        # make a flat list of integer labels
        int_labs = list(itertools.chain.from_iterable(
            [[i] * uniq_lab_cnts[i] for i in range(n_uniq_labs)]))
        mn_mdl = mdl.MultinomialMdl(int_labs).mdl
        param_mdl = 22.18070977791825 * n_uniq_labs
        return (mn_mdl + param_mdl) * cl_mdl_scale_factor
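    # Worked note (editor's addition): the parameter cost per unique label
    # is np.log(2**32) = 32 * ln(2) ≈ 22.1807 nats. For labs [0, 0, 1]
    # there are 2 unique labels, so param_mdl ≈ 44.3614, and the
    # multinomial code length of the label sequence is added on top
    # before scaling by cl_mdl_scale_factor.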
    def lab_mdl(self, cl_mdl_scale_factor=1, nprocs=1, verbose=False,
                ret_internal=False):
        """Compute the mdl of the matrix after separating samples by
        labels

        Args:
            cl_mdl_scale_factor (float): multiplies cluster-related mdl
                by this number
            nprocs (int)
            verbose (bool): Not implemented.

        Returns:
            LabMdlResult: mdl of the matrix after separating samples by
            labels
        """
        # compute cluster label overhead mdl
        cluster_mdl = self.compute_cluster_mdl(
            self._labs, cl_mdl_scale_factor=cl_mdl_scale_factor)
        # compute mdl for data points
        n_uniq_labs = self._uniq_labs.shape[0]
        ulab_s_ind_list = [np.where(self._labs == ulab)[0].tolist()
                           for ulab in self._uniq_labs]
        # summarize mdls of all clusters
        if self._encode_type == "distance":
            ulab_mdls = [MDLSingleLabelClassifiedSamples(
                self._x[s_inds], labs=[0] * len(s_inds),
                encode_type=self._encode_type,
                mdl_method=self._mdl_method,
                d=self._d[s_inds][:, s_inds], metric=self._metric,
                nprocs=self._nprocs).no_lab_mdl()
                for s_inds in ulab_s_ind_list]
        elif self._encode_type == "data":
            # do not pass d
            ulab_mdls = [MDLSingleLabelClassifiedSamples(
                self._x[s_inds], labs=[0] * len(s_inds),
                encode_type=self._encode_type,
                mdl_method=self._mdl_method, metric=self._metric,
                nprocs=self._nprocs).no_lab_mdl()
                for s_inds in ulab_s_ind_list]
        else:
            raise NotImplementedError("Do not change encode_type after "
                                      "init. Unknown encode type "
                                      "{}".format(self._encode_type))
        # add a share of the cluster overhead mdl to each cluster
        ulab_cnt_ratios = self._uniq_lab_cnts / np.int_(self._x.shape[0])
        ulab_cl_mdls = [ulab_mdls[i] + cluster_mdl * ulab_cnt_ratios[i]
                        for i in range(n_uniq_labs)]
        ulab_mdl_sum = np.sum(ulab_cl_mdls)
        lab_mdl_res = self.LabMdlResult(ulab_mdl_sum, ulab_s_ind_list,
                                        self._uniq_lab_cnts.tolist(),
                                        ulab_cl_mdls, cluster_mdl)
        if ret_internal:
            return lab_mdl_res, ulab_mdls
        else:
            return lab_mdl_res
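    # Usage sketch (editor's addition): compare code lengths with and
    # without the labeling; a smaller labeled mdl suggests the labels
    # compress the data well. Assuming `mslcs` from above:
    #
    # >>> res = mslcs.lab_mdl()            # LabMdlResult namedtuple
    # >>> res.ulab_mdl_sum, mslcs.no_lab_mdl()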
    def encode(self, qx, col_summary_func=sum, non_zero_only=False,
               nprocs=1, verbose=False):
        """Encode input array qx with the fitted code without labels

        Args:
            qx (2d np number array)
            col_summary_func (callable): function applied to column mdls
            non_zero_only (bool): whether to encode non-zero entries only
            nprocs (int)
            verbose (bool)

        Returns:
            float: mdl for encoding qx
        """
        if not callable(col_summary_func):
            raise ValueError("col_summary_func must be callable")
        if self._encode_type == "data":
            ex = self._x
        elif self._encode_type == "distance":
            ex = self._d
        else:
            raise NotImplementedError("Do not change encode_type after "
                                      "init. Unknown encode type "
                                      "{}".format(self._encode_type))
        if qx.ndim != 2 or qx.shape[1] != ex.shape[1]:
            raise ValueError("Array to encode should have the same number "
                             "of columns as the encoded x")
        col_encoders = self.per_col_encoders(
            ex, self._encode_type, self._mdl_method, nprocs,
            verbose=verbose)
        ncols = ex.shape[1]
        q_x_cols = []
        for i in range(ncols):
            x_col = qx[:, i]
            # mdl_method is validated by per_col_encoders
            if non_zero_only:
                q_x_cols.append(x_col[np.nonzero(x_col)])
            else:
                # all mdl methods here are valid
                # this branch is GKdeMdl
                q_x_cols.append(x_col)
        # (encoder, qx column) tuples
        mdl_qxcol_tups = list(zip(col_encoders, q_x_cols))
        encode_q_col_mdls = utils.parmap(
            lambda ext: ext[0].encode(ext[1]), mdl_qxcol_tups,
            nprocs=nprocs)
        encode_x_mdl = col_summary_func(encode_q_col_mdls)
        return encode_x_mdl
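    # Usage sketch (editor's addition): encode a query matrix with the
    # per-column code fitted on this instance's data; the query must have
    # the same number of columns. Assuming `mslcs` from above:
    #
    # >>> qx_mdl = mslcs.encode(np.arange(6).reshape(3, 2))  # code length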