Source code for scedar.cluster.community

import numpy as np

from scedar.eda import SampleDistanceMatrix
from scedar.eda.slcs import SingleLabelClassifiedSamples as SLCS
from scedar import utils

import leidenalg as la

class Community(object):
    """
    Community clustering

    Args
    ----
    x : float array
        Data matrix.
    d : float array
        Distance matrix.
    graph : igraph.Graph
        Needs to have a weight attribute storing affinities. If this
        argument is not None, the graph will be used directly for
        community clustering.
    metric : {'cosine', 'euclidean'}
        Metric used for nearest neighbor computation.
    sids : sid list
        List of sample ids.
    fids : fid list
        List of feature ids.
    use_pdist : boolean
        To use the pairwise distance matrix or not. The pairwise
        distance matrix may be too large to store for datasets with a
        large number of cells.
    k : int
        The number of nearest neighbors.
    use_pca : bool
        Use PCA for nearest neighbors or not.
    use_hnsw : bool
        Use Hierarchical Navigable Small World graph to compute
        approximate nearest neighbors.
    index_params : dict
        Parameters used by HNSW in indexing.

        efConstruction : int
            Default 100. Higher value improves the quality of the
            constructed graph and leads to higher search accuracy, but
            also to longer indexing time. The reasonable range of
            values is 100-2000.
        M : int
            Default 5. Higher value leads to better recall and shorter
            retrieval times, at the expense of longer indexing time.
            The reasonable range of values is 5-100.
        delaunay_type : {0, 1, 2, 3}
            Default 2. Pruning heuristic, which affects the trade-off
            between retrieval performance and indexing time. The
            default is usually quite good.
        post : {0, 1, 2}
            Default 0. The amount and type of postprocessing applied to
            the constructed graph. 0 means no processing. 2 means more
            processing.
        indexThreadQty : int
            Default self._nprocs. The number of threads used.

    query_params : dict
        Parameters used by HNSW in querying.

        efSearch : int
            Default 100. Higher value improves recall at the expense of
            longer retrieval time. The reasonable range of values is
            100-2000.

    aff_scale : float > 0
        Scaling factor used for converting distance to affinity.
        Affinity = (max(distance) - distance) * aff_scale.
    partition_method : str
        The following methods are implemented in the leidenalg package:

        - RBConfigurationVertexPartition: only well-defined for
          positive edge weights.
        - RBERVertexPartition: well-defined only for positive edge
          weights.
        - CPMVertexPartition: well-defined for both positive and
          negative edge weights.
        - SignificanceVertexPartition: well-defined only for unweighted
          graphs.
        - SurpriseVertexPartition: well-defined only for positive edge
          weights.

    resolution : float > 0
        Resolution used for community clustering. Higher values produce
        more clusters.
    random_state : int
        Random number generator seed used for community clustering.
    n_iter : int
        Number of iterations used for community clustering.
    nprocs : int > 0
        The number of processes/cores used for community clustering.
    verbose : bool
        Print progress or not.

    Attributes
    ----------
    labs : label list
        Labels of clustered samples. 1-to-1 matching to samples, from
        first to last.
    _sdm : SampleDistanceMatrix
        Data and distance matrices.
    _graph : igraph.Graph
        Graph used for clustering.
    _la_res : leidenalg.VertexPartition
        Partition results computed by leidenalg.
    _k
    _use_pca
    _use_hnsw
    _index_params
    _query_params
    _aff_scale
    """

    def __init__(self, x, d=None, graph=None, metric="cosine",
                 sids=None, fids=None, use_pdist=False,
                 k=15, use_pca=True, use_hnsw=True,
                 index_params=None, query_params=None, aff_scale=1,
                 partition_method="RBConfigurationVertexPartition",
                 resolution=1, random_state=None, n_iter=2,
                 nprocs=1, verbose=False):
        super().__init__()
        if aff_scale <= 0:
            raise ValueError("Affinity scaling (aff_scale) should be > 0.")
        if metric not in ("cosine", "euclidean"):
            raise ValueError("Metric only supports cosine and euclidean.")

        self._sdm = SampleDistanceMatrix(
            x=x, d=d, metric=metric, use_pdist=use_pdist,
            sids=sids, fids=fids, nprocs=nprocs)

        if graph is None:
            # Build a k-nearest-neighbor connectivity matrix and convert it
            # into a weighted affinity graph.
            knn_conn_mat = self._sdm.s_knn_connectivity_matrix(
                k=k, use_pca=use_pca, use_hnsw=use_hnsw,
                index_params=index_params, query_params=query_params,
                verbose=verbose)
            graph = SampleDistanceMatrix.knn_conn_mat_to_aff_graph(
                knn_conn_mat, aff_scale=aff_scale)

        if partition_method == "RBConfigurationVertexPartition":
            la_part_cls = la.RBConfigurationVertexPartition
        elif partition_method == "RBERVertexPartition":
            la_part_cls = la.RBERVertexPartition
        elif partition_method == "CPMVertexPartition":
            la_part_cls = la.CPMVertexPartition
        elif partition_method == "SignificanceVertexPartition":
            la_part_cls = la.SignificanceVertexPartition
        elif partition_method == "SurpriseVertexPartition":
            la_part_cls = la.SurpriseVertexPartition
        else:
            raise ValueError(
                "Unknown partition method: {}".format(partition_method))

        # Run Leiden community detection with the selected partition class.
        la_res = la.find_partition(
            graph, la_part_cls, n_iterations=n_iter, seed=random_state,
            weights='weight', resolution_parameter=resolution)

        # keep track of results and parameters
        self._graph = graph
        self._la_res = la_res
        self._labs = la_res.membership
        self._k = k
        self._use_pca = use_pca
        self._use_hnsw = use_hnsw
        self._index_params = index_params
        self._query_params = query_params
        self._aff_scale = aff_scale

    @property
    def labs(self):
        return self._labs.copy()
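
# Minimal usage sketch of the Community class above; the synthetic matrix and
# the parameter values below are illustrative assumptions only.
if __name__ == "__main__":
    np.random.seed(0)
    # 100 samples x 20 features of synthetic non-negative values.
    demo_x = np.random.rand(100, 20)
    # Inside the class, distances are converted to affinities as
    # affinity = (max(distance) - distance) * aff_scale; e.g. for
    # d = [0.1, 0.4, 0.9] and aff_scale = 2 the affinities are [1.6, 1.0, 0.0].
    cm = Community(demo_x, metric="cosine", k=15, aff_scale=1,
                   partition_method="RBConfigurationVertexPartition",
                   resolution=1, random_state=0)
    # labs gives one cluster label per sample, in input order.
    print(cm.labs[:10])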