# Source code for scedar.cluster.mirac

import itertools
import numpy as np

from scedar import eda
from scedar.eda.slcs import MDLSingleLabelClassifiedSamples as MDLSLCS
from scedar.eda.slcs import SingleLabelClassifiedSamples as SLCS
from scedar import utils

class MIRAC(object):
    """
    MIRAC: MDL iteratively regularized agglomerative clustering.

    Args
    ----
    x : float array
        Data matrix.
    d : float array
        Distance matrix.
    metric : str
        Type of distance metric.
    sids : sid list
        List of sample ids.
    fids : fid list
        List of feature ids.
    hac_tree : HCTree
        Hierarchical tree built by agglomerative clustering to divide in
        MIRAC. If provided, distance matrix will not be used for building
        another tree.
    nprocs : int
        Number of processes to run MIRAC parallely.
    cl_mdl_scale_factor : float
        Scale factor of cluster overhead mdl.
    min_cl_n : int
        Minimum # samples in a cluster.
    encode_type : {"auto", "data", or "distance"}
        Type of values to encode. If "auto", encode data when
        n_features <= 100.
    mdl_method : mdl.Mdl
        If None, use ZeroIGKdeMdl for encoded values with >= 50% zeros,
        and use GKdeMdl otherwise.
    min_split_mdl_red_ratio : float
        Two adjacent clusters are kept separate only when their labeled
        MDL is not greater than ``(1 - min_split_mdl_red_ratio)`` times
        their unlabeled MDL (``1 + ratio`` is used when the unlabeled MDL
        is negative). Otherwise they are merged.
    soft_min_subtree_size : int
        Forwarded to ``bi_partition`` of the hierarchical clustering tree
        when splitting.
    linkage : str
        Linkage type for generating the hierarchy.
    optimal_ordering : bool
        To require hierarchical clustering tree with optimal ordering.
        Default value is False.
    dim_reduct_method : {"PCA", "t-SNE", "UMAP", None}
        If None, no dimensionality reduction before clustering.
    verbose : bool
        Print stats for each iteration.

    Attributes
    ----------
    _sdm : SampleDistanceMatrix
        Data and distance matrices.
    _min_cl_n : int
        Stored parameter.
    _encode_type : str
        Encode type. If "auto" provided, this attribute will store the
        determined encode type.
    _mdl_method : mdl.Mdl
        Mdl method. If None is provided, this attribute will store the
        determined mdl method.
    labs : label list
        Labels of clustered samples. 1-to-1 matching to samples, from
        first to last.
    _hac_tree : eda.hct.HClustTree
        Root node of the hierarchical agglomerative clustering tree.
    _run_log : str
        String containing the log of the MIRAC run.

    TODO:
    * Dendrogram representation of the splitting process.
    * Take HCTree as parameter. Computing it is non-trivial when n is
      large.
    * Simplify splitting criteria.
    """

    # TODO: use PCA/tsne/umap
    def __init__(self, x, d=None, metric="cosine", sids=None, fids=None,
                 hac_tree=None, nprocs=1, cl_mdl_scale_factor=1,
                 min_cl_n=25, encode_type="auto", mdl_method=None,
                 min_split_mdl_red_ratio=0.2, soft_min_subtree_size=1,
                 linkage="complete", optimal_ordering=False,
                 dim_reduct_method=None, verbose=False):
        """Build the data/distance container and the HAC tree, then run.

        Raises
        ------
        ValueError
            If ``dim_reduct_method`` is not supported, if ``hac_tree``
            does not have as many leaves as there are samples, or if the
            tuning parameters are invalid (see ``_set_parameters``).
        """
        super().__init__()
        # Reject an unsupported method before doing any expensive work.
        if (dim_reduct_method is not None
                and dim_reduct_method not in ("PCA", "t-SNE", "UMAP")):
            raise ValueError("Not supported dimensionality reduction "
                             "method: {}".format(dim_reduct_method))
        # initialize simple attributes
        self._nprocs = max(int(nprocs), 1)
        self._is_euc_dist = metric == "euclidean"
        self._verbose = verbose
        self._linkage = linkage
        self._optimal_ordering = optimal_ordering
        self._dim_reduct_method = dim_reduct_method

        if dim_reduct_method is None:
            data_x = x
        else:
            # TODO: use pdist if provided
            dim_red_sdm = eda.SampleDistanceMatrix(
                x=x, metric=metric, use_pdist=False, nprocs=nprocs)
            if dim_reduct_method == "PCA":
                data_x = dim_red_sdm._pca_x
            elif dim_reduct_method == "t-SNE":
                data_x = dim_red_sdm.tsne(
                    n_iter=3000, random_state=17, verbose=verbose)
            else:
                # "UMAP": the only remaining validated option.
                data_x = dim_red_sdm._umap_x

        # All samples start with the same label for computing MDL.
        self._sdm = MDLSLCS(x=data_x, labs=[0]*data_x.shape[0], d=d,
                            metric=metric, sids=sids, fids=fids,
                            encode_type=encode_type, mdl_method=mdl_method,
                            nprocs=nprocs)
        self._encode_type = self._sdm._encode_type
        self._mdl_method = self._sdm._mdl_method

        # initialize hierarchical clustering tree
        if hac_tree is not None:
            n_leaf_nodes = len(hac_tree.leaf_ids())
            if n_leaf_nodes != self._sdm._x.shape[0]:
                raise ValueError("hac_tree should have same number of "
                                 "samples as x.")
        else:
            hac_tree = eda.HClustTree.hclust_tree(
                self._sdm._d, linkage=self._linkage,
                optimal_ordering=self._optimal_ordering,
                is_euc_dist=self._is_euc_dist, verbose=self._verbose)
        self._hac_tree = hac_tree

        # run
        self.tune_parameters(cl_mdl_scale_factor, min_cl_n,
                             min_split_mdl_red_ratio, soft_min_subtree_size,
                             self._verbose)

    def _set_parameters(self, cl_mdl_scale_factor=1, min_cl_n=25,
                        min_split_mdl_red_ratio=0.2,
                        soft_min_subtree_size=1):
        """Validate and store the tunable clustering parameters.

        ``min_cl_n`` is truncated with ``int()`` before validation.

        Raises
        ------
        ValueError
            If ``cl_mdl_scale_factor`` is negative or ``min_cl_n`` is not
            positive.
        """
        if cl_mdl_scale_factor < 0:
            raise ValueError("cl_mdl_scale_factor should >= 0. "
                             "cl_mdl_scale_factor: "
                             "{}".format(cl_mdl_scale_factor))
        self._cl_mdl_scale_factor = cl_mdl_scale_factor

        min_cl_n = int(min_cl_n)
        if min_cl_n <= 0:
            raise ValueError("min_cl_n should > 0. "
                             "min_cl_n: {}".format(min_cl_n))
        self._min_cl_n = min_cl_n

        self._min_split_mdl_red_ratio = min_split_mdl_red_ratio
        self._soft_min_subtree_size = soft_min_subtree_size

    def tune_parameters(self, cl_mdl_scale_factor=1, min_cl_n=25,
                        min_split_mdl_red_ratio=0.2,
                        soft_min_subtree_size=1, verbose=False):
        """Re-run MIRAC on the existing tree with new parameters.

        Stores the parameters, overwrites the verbosity flag with
        ``verbose``, runs the clustering and replaces the stored labels.
        """
        self._verbose = verbose
        self._set_parameters(cl_mdl_scale_factor, min_cl_n,
                             min_split_mdl_red_ratio, soft_min_subtree_size)
        # run MIRAC with initialized parameters
        s_inds, s_labs = self._mirac()
        # s_inds is in leaf order; reorder labels to match sample order.
        self._labs = s_labs[np.argsort(s_inds, kind="mergesort")].tolist()

    def dmat_heatmap(self, selected_labels=None, col_labels=None,
                     transform=None, title=None, xlab=None, ylab=None,
                     figsize=(10, 10), **kwargs):
        """Plot the distance matrix with samples in tree leaf order.

        Samples are ordered as the leaves of the hierarchical clustering
        tree, optionally restricted to ``selected_labels`` (as interpreted
        by ``SLCS.select_labs_bool_inds``). Remaining arguments are
        forwarded to ``eda.heatmap``.

        Returns
        -------
        The value returned by ``eda.heatmap``, or None if the tree has no
        leaves.

        Raises
        ------
        ValueError
            If samples with the same label are not adjacent in leaf order.
        """
        # leaf sample inds ordered from left to right
        leaf_order = self._hac_tree.leaf_ids()
        if len(leaf_order) == 0:
            return None
        # leaf labels from left to right
        leaf_ordered_labs = np.array(self._labs)[leaf_order].tolist()
        # Same labels must be adjacent to each other in leaf order.
        # Examples:
        # - good: [1] and [1, 1, 2, 2, 3]
        # - bad: [1, 2, 1, 1, 3, 3]
        curr_lab = leaf_ordered_labs[0]
        lab_set = {curr_lab}
        for ilab in leaf_ordered_labs:
            if ilab == curr_lab:
                continue
            # reached the next group of labels
            if ilab in lab_set:
                raise ValueError("Same labels should be grouped "
                                 "together.\n\t"
                                 "iterating lab: {}\n\t"
                                 "iterated lab set: {}\n\t"
                                 "leaf order: {}\n\t"
                                 "leaf ordered labs: {}".format(
                                     ilab, lab_set, leaf_order,
                                     leaf_ordered_labs))
            lab_set.add(ilab)
            curr_lab = ilab
        # select labels to plot
        s_lab_bool_inds = SLCS.select_labs_bool_inds(
            leaf_ordered_labs, selected_labels)
        s_leaf_order = list(itertools.compress(leaf_order, s_lab_bool_inds))
        s_leaf_ordered_labs = list(itertools.compress(leaf_ordered_labs,
                                                      s_lab_bool_inds))
        s_d = self._sdm._d[s_leaf_order][:, s_leaf_order]
        return eda.heatmap(s_d, row_labels=s_leaf_ordered_labs,
                           col_labels=col_labels, transform=transform,
                           title=title, xlab=xlab, ylab=ylab,
                           figsize=figsize, **kwargs)

    @property
    def labs(self):
        """Copy of the cluster labels, one per sample."""
        return self._labs.copy()

    def _sub_mdlslcs(self, s_inds, labs):
        """Build an MDLSLCS restricted to the samples in ``s_inds``.

        The distance sub-matrix is only passed (and ``self._sdm._d`` only
        accessed) when encoding distances; otherwise ``d`` is not passed.
        """
        kwargs = {}
        if self._encode_type == "distance":
            kwargs["d"] = self._sdm._d[s_inds][:, s_inds]
        return MDLSLCS(x=self._sdm._x[s_inds], labs=labs,
                       metric=self._sdm._metric,
                       encode_type=self._encode_type,
                       mdl_method=self._mdl_method,
                       nprocs=self._nprocs, **kwargs)

    def _split_into_subclusters(self):
        """Split the tree into leaf-ordered groups of < min_cl_n samples.

        Every tree is bi-partitioned; a subtree with fewer than
        ``min_cl_n`` leaves becomes a sub-cluster, any other subtree is
        split again. Each split is appended to ``self._run_log``.
        """
        leaf_order = self._hac_tree.leaf_ids()
        curr_trees = [self._hac_tree]
        split_s_inds = []
        while len(curr_trees) != 0:
            next_trees = []
            for tree in curr_trees:
                # lst, rst: left_sub_tree, right_sub_tree
                _, s_inds, lst, rst = tree.bi_partition(
                    soft_min_subtree_size=self._soft_min_subtree_size,
                    return_subtrees=True)
                s_cnt = len(s_inds)
                subtree_s_cnt_list = []
                subtree_split_list = []
                for subtree in (lst, rst):
                    st_s_inds = subtree.leaf_ids()
                    subtree_s_cnt_list.append(len(st_s_inds))
                    if len(st_s_inds) < self._min_cl_n:
                        subtree_split_list.append("min-cl-size")
                        split_s_inds.append(st_s_inds)
                    else:
                        subtree_split_list.append("split")
                        next_trees.append(subtree)
                self._run_log += (
                    "subtree_n: {}, "
                    "subtree_n/n: {},"
                    "split: {}.\n").format(
                        subtree_s_cnt_list,
                        [n / s_cnt for n in subtree_s_cnt_list],
                        subtree_split_list)
            curr_trees = next_trees
        # Order sub-clusters by the leaf position of their first sample.
        # The map keeps the first occurrence, like list.index, but avoids
        # a linear scan per sub-cluster.
        leaf_pos = {}
        for pos, sid in enumerate(leaf_order):
            leaf_pos.setdefault(sid, pos)
        sub_cl_order = np.argsort([leaf_pos[s[0]] for s in split_s_inds])
        return [split_s_inds[i] for i in sub_cl_order]

    def _mirac(self):
        """Run MIRAC on the stored tree with the stored parameters.

        1. Split the tree into leaf-ordered sub-clusters with fewer than
           ``min_cl_n`` samples.
        2. Merge sub-clusters at both ends until the first and last have
           at least ``min_cl_n`` samples.
        3. Sweep from left to right: grow the inspected sub-cluster to at
           least ``min_cl_n`` samples, then merge it with its left
           neighbor or keep them apart according to MDL.

        Returns
        -------
        s_inds : array
            Sample indices in leaf order.
        labs : array
            Cluster label of each entry of ``s_inds``.
        """
        self._run_log = ""
        split_s_inds = self._split_into_subclusters()

        # merge subclusters by mdl
        # start with merging clusters at beginning to form a cluster with
        # more than min_cl_n samples.
        while (len(split_s_inds) > 1
               and len(split_s_inds[0]) < self._min_cl_n):
            # pop first 2 sub-clusters and insert their concatenation back
            # at the beginning
            split_s_inds.insert(0,
                                split_s_inds.pop(0) + split_s_inds.pop(0))
        # Then merge clusters at the end to form a cluster with more than
        # min_cl_n samples
        while (len(split_s_inds) > 1
               and len(split_s_inds[-1]) < self._min_cl_n):
            # pop last 2 sub-clusters and append their concatenation
            split_s_inds.append(split_s_inds.pop(-2) + split_s_inds.pop(-1))

        # merge other sub-clusters
        # m_ind points to the left sub-cluster (scl_left) of the
        # sub-cluster currently being inspected (scl_insp)
        m_ind = 0
        while m_ind < len(split_s_inds) - 1:
            scl_left = split_s_inds.pop(m_ind)
            scl_insp = split_s_inds.pop(m_ind)
            while (len(scl_insp) < self._min_cl_n
                   and len(split_s_inds) > m_ind):
                scl_right = split_s_inds.pop(m_ind)
                # rhs sub-cluster with >= min_cl_n samples for mdl encoding
                # += edits list in-place, so copy is necessary
                scl_r_minimax = scl_right.copy()
                # concatenate sub-clusters after scl_right
                scl_r_enc_i = m_ind
                while (len(scl_r_minimax) < self._min_cl_n
                       and scl_r_enc_i < len(split_s_inds)):
                    scl_r_minimax += split_s_inds[scl_r_enc_i]
                    scl_r_enc_i += 1

                scl_ns = [len(scl_left), len(scl_insp),
                          len(scl_right), len(scl_r_minimax)]
                # TODO: decide mdl by linkage
                left_mdlslcs = self._sub_mdlslcs(
                    scl_left, [0]*len(scl_left))
                r_minimax_mdlslcs = self._sub_mdlslcs(
                    scl_r_minimax, [0]*len(scl_r_minimax))
                if self._encode_type == "distance":
                    left_enc_insp_mdl = left_mdlslcs.encode(
                        self._sdm._d[scl_insp][:, scl_left],
                        col_summary_func=max)
                    r_minimax_enc_insp_mdl = r_minimax_mdlslcs.encode(
                        self._sdm._d[scl_insp][:, scl_r_minimax],
                        col_summary_func=max)
                else:
                    # data
                    left_enc_insp_mdl = left_mdlslcs.encode(
                        self._sdm._x[scl_insp], nprocs=self._nprocs)
                    r_minimax_enc_insp_mdl = r_minimax_mdlslcs.encode(
                        self._sdm._x[scl_insp], nprocs=self._nprocs)

                # decide merging direction
                if left_enc_insp_mdl < r_minimax_enc_insp_mdl:
                    # inspected more similar to left
                    scl_left = scl_left + scl_insp
                    scl_insp = scl_right
                    merge_type = "m left"
                else:
                    scl_insp = scl_insp + scl_right
                    merge_type = "m right"

                curr_iter_run_log = (
                    "{}, {} -- sub-cl sizes: {}, "
                    "eval mdl: {}, {}").format(
                        m_ind, sum(map(len, split_s_inds[:m_ind])), scl_ns,
                        [float(left_enc_insp_mdl),
                         float(r_minimax_enc_insp_mdl),
                         float(left_enc_insp_mdl
                               / r_minimax_enc_insp_mdl)],
                        merge_type)
                self._run_log += curr_iter_run_log
                if self._verbose:
                    print(curr_iter_run_log)

            # insp cluster has >= min_cl_n samples, check whether merge
            # with left or not
            # In this scenario, rhs sub-clusters are non-informative,
            # because we do not know their cluster belongings yet.
            # If we still assume that rhs cluster is just above min_cl_n,
            # we may underestimate the rhs true cluster size, thus causing
            # undesired behavior.
            scl_left_n = np.int_(len(scl_left))
            scl_insp_n = np.int_(len(scl_insp))
            scl_left_ratio = scl_left_n / (scl_left_n + scl_insp_n)
            scl_insp_ratio = scl_insp_n / (scl_left_n + scl_insp_n)
            scl_left_insp = scl_left + scl_insp

            l_i_mdl_slcs = self._sub_mdlslcs(
                scl_left_insp, [0]*scl_left_n + [1]*scl_insp_n)
            left_insp_no_lab_mdl = l_i_mdl_slcs.no_lab_mdl(
                nprocs=self._nprocs, verbose=self._verbose)
            left_insp_lab_mdl_res = l_i_mdl_slcs.lab_mdl(
                cl_mdl_scale_factor=self._cl_mdl_scale_factor,
                nprocs=self._nprocs, verbose=self._verbose)
            # TODO: validate ulab_mdls order
            left_split_mdl = left_insp_lab_mdl_res.ulab_mdls[0]
            insp_split_mdl = left_insp_lab_mdl_res.ulab_mdls[1]
            cluster_mdl = left_insp_lab_mdl_res.cluster_mdl
            left_insp_lab_mdl = left_insp_lab_mdl_res.ulab_mdl_sum

            # Splitting must reduce the unlabeled MDL by at least the
            # configured ratio; the sign flip keeps the threshold below a
            # negative unlabeled MDL.
            if left_insp_no_lab_mdl < 0:
                min_merge_mdl = ((1 + self._min_split_mdl_red_ratio)
                                 * left_insp_no_lab_mdl)
            else:
                min_merge_mdl = ((1 - self._min_split_mdl_red_ratio)
                                 * left_insp_no_lab_mdl)

            if left_insp_lab_mdl > min_merge_mdl:
                # merge; m_ind stays so the merged cluster becomes the
                # next left cluster
                merge_type = "merge"
                split_s_inds.insert(m_ind, scl_left + scl_insp)
            else:
                # do not merge; inspected cluster becomes the next left
                merge_type = "split"
                split_s_inds.insert(m_ind, scl_insp)
                split_s_inds.insert(m_ind, scl_left)
                m_ind += 1

            curr_iter_run_log = (
                "{}, {} -- no lab mdl: {}, [left, insp] mdl: {}, "
                "cluster_mdl: {}, \n[left, insp] n: {}, "
                "[left, insp] ratio: {}, \n"
                "lab mdl: {}, split/merge: {}, \n"
                "{}.\n\n").format(
                    m_ind, sum(map(len, split_s_inds[:m_ind])),
                    float(left_insp_no_lab_mdl),
                    [float(left_split_mdl), float(insp_split_mdl)],
                    cluster_mdl,
                    [int(scl_left_n), int(scl_insp_n)],
                    [float(scl_left_ratio), float(scl_insp_ratio)],
                    float(left_insp_lab_mdl),
                    [float(left_insp_lab_mdl / left_insp_no_lab_mdl),
                     float((left_insp_lab_mdl - cluster_mdl)
                           / left_insp_no_lab_mdl),
                     float(left_split_mdl / left_insp_no_lab_mdl),
                     float(insp_split_mdl / left_insp_no_lab_mdl)],
                    merge_type)
            self._run_log += curr_iter_run_log
            if self._verbose:
                print(curr_iter_run_log)

        labs = np.concatenate([[i] * len(s_inds)
                               for i, s_inds in enumerate(split_s_inds)])
        s_inds = np.concatenate(split_s_inds)
        return s_inds, labs