Source code for scedar.cluster.community

import numpy as np

from scedar.eda import SampleDistanceMatrix
from scedar.eda.slcs import SingleLabelClassifiedSamples as SLCS
from scedar import utils

import leidenalg as la

class Community(object):
    """
    Community clustering

    Args
    ----
    x : float array
        Data matrix.
    d : float array
        Distance matrix.
    graph : igraph.Graph
        Needs to have a weight attribute storing affinities. If this
        argument is not None, the graph will be used directly for
        community clustering.
    metric : {'cosine', 'euclidean'}
        Metric used for nearest neighbor computation.
    sids : sid list
        List of sample ids.
    fids : fid list
        List of feature ids.
    use_pdist : boolean
        To use the pairwise distance matrix or not. The pairwise
        distance matrix may be too large to store for datasets with a
        large number of cells.
    k : int
        The number of nearest neighbors.
    use_pca : bool
        Use PCA for nearest neighbors or not.
    use_hnsw : bool
        Use Hierarchical Navigable Small World graph to compute
        approximate nearest neighbors.
    index_params : dict
        Parameters used by HNSW in indexing.

        efConstruction : int
            Default 100. Higher value improves the quality of the
            constructed graph and leads to higher search accuracy, but
            also to longer indexing time. The reasonable range of
            values is 100-2000.
        M : int
            Default 5. Higher value leads to better recall and shorter
            retrieval times, at the expense of longer indexing time.
            The reasonable range of values is 5-100.
        delaunay_type : {0, 1, 2, 3}
            Default 2. Pruning heuristic, which affects the trade-off
            between retrieval performance and indexing time. The
            default is usually quite good.
        post : {0, 1, 2}
            Default 0. The amount and type of postprocessing applied to
            the constructed graph. 0 means no processing. 2 means more
            processing.
        indexThreadQty : int
            Default self._nprocs. The number of threads used.

    query_params : dict
        Parameters used by HNSW in querying.

        efSearch : int
            Default 100. Higher value improves recall at the expense of
            longer retrieval time. The reasonable range of values is
            100-2000.

    aff_scale : float > 0
        Scaling factor used for converting distance to affinity.
        Affinity = (max(distance) - distance) * aff_scale.
    partition_method : str
        The following methods are implemented in the leidenalg package:

        - RBConfigurationVertexPartition: only well-defined for
          positive edge weights.
        - RBERVertexPartition: well-defined only for positive edge
          weights.
        - CPMVertexPartition: well-defined for both positive and
          negative edge weights.
        - SignificanceVertexPartition: well-defined only for unweighted
          graphs.
        - SurpriseVertexPartition: well-defined only for positive edge
          weights.

    resolution : float > 0
        Resolution used for community clustering. Higher values produce
        more clusters.
    random_state : int
        Random number generator seed used for community clustering.
    n_iter : int
        Number of iterations used for community clustering.
    nprocs : int > 0
        The number of processes/cores used for community clustering.
    verbose : bool
        Print progress or not.

    Attributes
    ----------
    labs : label list
        Labels of clustered samples. 1-to-1 matching to samples, from
        first to last.
    _sdm : SampleDistanceMatrix
        Data and distance matrices.
    _graph : igraph.Graph
        Graph used for clustering.
    _la_res : leidenalg.VertexPartition
        Partition results computed by leidenalg.
    _k
    _use_pca
    _use_hnsw
    _index_params
    _query_params
    _aff_scale
    """

    def __init__(self, x, d=None, graph=None, metric="cosine",
                 sids=None, fids=None, use_pdist=False,
                 k=15, use_pca=True, use_hnsw=True,
                 index_params=None, query_params=None, aff_scale=1,
                 partition_method="RBConfigurationVertexPartition",
                 resolution=1, random_state=None, n_iter=2,
                 nprocs=1, verbose=False):
        super().__init__()
        if aff_scale <= 0:
            raise ValueError("Affinity scaling (aff_scale) should be > 0.")
        if metric not in ("cosine", "euclidean"):
            raise ValueError("Metric only supports cosine and euclidean.")

        self._sdm = SampleDistanceMatrix(
            x=x, d=d, metric=metric, use_pdist=use_pdist,
            sids=sids, fids=fids, nprocs=nprocs)

        if graph is None:
            # Build a k-nearest-neighbor connectivity matrix and convert it
            # into a weighted affinity graph.
            knn_conn_mat = self._sdm.s_knn_connectivity_matrix(
                k=k, use_pca=use_pca, use_hnsw=use_hnsw,
                index_params=index_params, query_params=query_params,
                verbose=verbose)
            graph = SampleDistanceMatrix.knn_conn_mat_to_aff_graph(
                knn_conn_mat, aff_scale=aff_scale)

        if partition_method == "RBConfigurationVertexPartition":
            la_part_cls = la.RBConfigurationVertexPartition
        elif partition_method == "RBERVertexPartition":
            la_part_cls = la.RBERVertexPartition
        elif partition_method == "CPMVertexPartition":
            la_part_cls = la.CPMVertexPartition
        elif partition_method == "SignificanceVertexPartition":
            la_part_cls = la.SignificanceVertexPartition
        elif partition_method == "SurpriseVertexPartition":
            la_part_cls = la.SurpriseVertexPartition
        else:
            raise ValueError(
                "Unknown partition method: {}".format(partition_method))

        # Run Leiden community detection with the selected partition class.
        la_res = la.find_partition(
            graph, la_part_cls, n_iterations=n_iter, seed=random_state,
            weights='weight', resolution_parameter=resolution)

        # keep track of results and parameters
        self._graph = graph
        self._la_res = la_res
        self._labs = la_res.membership
        self._k = k
        self._use_pca = use_pca
        self._use_hnsw = use_hnsw
        self._index_params = index_params
        self._query_params = query_params
        self._aff_scale = aff_scale

    @property
    def labs(self):
        return self._labs.copy()
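
# Minimal usage sketch of the Community class above; the synthetic matrix and
# the parameter values below are illustrative assumptions only.
if __name__ == "__main__":
    np.random.seed(0)
    # 100 samples x 20 features of synthetic non-negative values.
    demo_x = np.random.rand(100, 20)
    # Inside the class, distances are converted to affinities as
    # affinity = (max(distance) - distance) * aff_scale; e.g. for
    # d = [0.1, 0.4, 0.9] and aff_scale = 2 the affinities are [1.6, 1.0, 0.0].
    cm = Community(demo_x, metric="cosine", k=15, aff_scale=1,
                   partition_method="RBConfigurationVertexPartition",
                   resolution=1, random_state=0)
    # labs gives one cluster label per sample, in input order.
    print(cm.labs[:10])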