Source code for scedar.eda.plot

import numpy as np
import pandas as pd

import matplotlib as mpl
mpl.use("agg", warn=False)  # noqa
import matplotlib.pyplot as plt
import matplotlib.colors
import matplotlib.patches
import matplotlib.gridspec
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

import seaborn as sns

import networkx as nx

from scedar.eda import mtype

from collections import OrderedDict

sns.set(style="ticks")


def labs_to_cmap(labels, return_lut=False, shuffle_colors=False,
                 random_state=None):
    """Build a discrete colormap that assigns one color to each unique label.

    Args:
        labels (list of labels): sample labels. Validated with
            ``mtype.check_is_valid_labs``.
        return_lut (bool): also return the per-sample label indices and the
            lookup tables needed to build a labeled legend.
        shuffle_colors (bool): shuffle the palette so that labels adjacent in
            sorted order do not get neighbouring hues.
        random_state (int): seed used when ``shuffle_colors`` is True.

    Returns:
        ``(lab_cmap, lab_norm)`` when ``return_lut`` is False, otherwise
        ``(lab_cmap, lab_norm, lab_ind_arr, lab_col_lut, uniq_lab_lut)``:

        - lab_cmap: ``ListedColormap`` with one color per unique label.
        - lab_norm: ``BoundaryNorm`` mapping label index ``i`` to color ``i``.
        - lab_ind_arr: array with the unique-label index of every sample.
        - lab_col_lut: dict mapping each unique label to its color.
        - uniq_lab_lut: dict mapping each unique-label index to its label.
    """
    mtype.check_is_valid_labs(labels)
    labels = np.array(labels)
    # np.unique returns the labels sorted; index i always refers to
    # uniq_lab_arr[i] below.
    uniq_lab_arr = np.unique(labels)
    num_uniq_labs = len(uniq_lab_arr)

    lab_col_list = list(sns.hls_palette(num_uniq_labs))
    if shuffle_colors:
        # A private generator yields the same permutation as seeding the
        # global one would, without resetting the caller's global
        # np.random state as a side effect.
        np.random.RandomState(random_state).shuffle(lab_col_list)
    lab_cmap = mpl.colors.ListedColormap(lab_col_list)

    # norm separates cmap to different indices
    # https://matplotlib.org/tutorials/colors/colorbar_only.html
    boundaries = list(range(num_uniq_labs)) + [lab_cmap.N]
    lab_norm = mpl.colors.BoundaryNorm(boundaries, lab_cmap.N)

    if not return_lut:
        return lab_cmap, lab_norm

    # Keep track of the order of unique labels, so that a labeled legend can
    # be generated.
    uniq_lab_lut = dict(enumerate(uniq_lab_arr))
    uniq_ind_lut = {lab: ind for ind, lab in enumerate(uniq_lab_arr)}
    lab_ind_arr = np.array([uniq_ind_lut[lab] for lab in labels])
    lab_col_lut = dict(zip(uniq_lab_arr, lab_col_list))
    return lab_cmap, lab_norm, lab_ind_arr, lab_col_lut, uniq_lab_lut
def _cluster_scatter_marker_groups(labels, plot_different_markers,
                                   label_markers):
    """Map ``(legend entry, marker)`` to the sample indices drawn with it."""
    groups = {}
    if plot_different_markers and label_markers is not None:
        # User provided markers: the legend shows the marker rather than the
        # label.
        for marker in set(label_markers):
            groups[(marker, marker)] = [
                i for i, m in enumerate(label_markers) if m == marker]
        return groups
    # Either cycle through the filled markers below (one per unique label),
    # or draw every label with "o":
    # "o": circle, "s": square, "^": triangle_up, "D": diamond, "x": x,
    # "v": triangle_down, "d": thin_diamond, "+": plus,
    # ">": triangle_right, "p": pentagon, "h": hexagon1,
    # "<": triangle_left, "H": hexagon2, "*": star
    # ref: https://matplotlib.org/api/markers_api.html
    m_cycle = "os^Dxvd+>ph<H*" if plot_different_markers else "o"
    for i, ulab in enumerate(np.unique(labels)):
        marker = m_cycle[i % len(m_cycle)]
        groups[(ulab, marker)] = np.flatnonzero(labels == ulab)
    return groups


def cluster_scatter(projection2d, labels=None, selected_labels=None,
                    plot_different_markers=False, label_markers=None,
                    shuffle_label_colors=False, gradient=None,
                    xlim=None, ylim=None, title=None, xlab=None, ylab=None,
                    figsize=(20, 20), add_legend=True, n_txt_per_cluster=3,
                    alpha=1, s=0.5, random_state=None, **kwargs):
    """Scatter plot for clustering illustration

    Args:
        projection2d (2 col numeric array): (n, 2) matrix to plot
        labels (list of labels): labels of n samples
        selected_labels (list of labels): selected labels to plot
        plot_different_markers (bool): plot different markers for samples
            with different labels
        label_markers (list of marker shapes): one marker per sample, passed
            to matplotlib. Only used when plot_different_markers is True.
        shuffle_label_colors (bool): shuffle the color of labels to avoid
            similar colors show up in close clusters
        gradient (list of number): color gradient of n samples
        xlim: x axis limits; left unchanged when None
        ylim: y axis limits; left unchanged when None
        title (str)
        xlab (str): x axis label
        ylab (str): y axis label
        figsize (tuple of two number): (width, height)
        add_legend (bool): add the label legend and/or the gradient colorbar
        n_txt_per_cluster (number): the number of text to plot per cluster.
            Could be 0.
        alpha (number)
        s (number): size of the points
        random_state (int): random seed used to shuffle label colors and to
            pick the annotated samples
        **kwargs: passed to matplotlib scatter

    Return:
        matplotlib figure of the created scatter plot

    Raises:
        ValueError: if the shapes or lengths of the inputs are inconsistent,
            or if label dependent options are given without labels.
    """
    kwargs = kwargs.copy()
    # A private generator is used to
    # - select labels for annotation if required
    # - shuffle colors if required
    # It produces the same draws as seeding the global numpy generator, but
    # leaves the caller's global random state untouched.
    rng = np.random.RandomState(random_state)

    # check projection2d
    projection2d = np.array(projection2d, dtype="float")
    if (projection2d.ndim != 2) or (projection2d.shape[1] != 2):
        raise ValueError("projection2d matrix should have shape "
                         "(n_samples, 2). {}".format(projection2d))
    # check gradient length
    if gradient is not None:
        gradient = np.array(gradient)
        if gradient.ndim != 1:
            raise ValueError("gradient must be 1d.")
        if gradient.shape[0] != projection2d.shape[0]:
            raise ValueError("gradient should have the same length ({}) as "
                             "n_samples in projection2d "
                             "(shape {})".format(gradient.shape[0],
                                                 projection2d.shape[0]))
    # check label length
    if labels is not None:
        mtype.check_is_valid_labs(labels)
        labels = np.array(labels)
        if labels.shape[0] != projection2d.shape[0]:
            raise ValueError("labels should have the same length ({}) as "
                             "n_samples in projection2d "
                             "(shape {})".format(labels.shape[0],
                                                 projection2d.shape[0]))
    # check markers
    if label_markers is not None:
        if labels is None:
            raise ValueError("labels should not be None when label_markers "
                             "is provided.")
        if len(label_markers) != len(labels):
            raise ValueError("labels should have the same length as "
                             "label_markers")
    # keep selected labels only
    if selected_labels is not None:
        if labels is None:
            raise ValueError("selected_labels needs labels to be "
                             "provided.")
        uniq_selected_labels = np.unique(selected_labels).tolist()
        uniq_labels = np.unique(labels).tolist()
        # np.in1d(uniq_selected_labels, uniq_labels) will cause
        # future warning:
        # https://stackoverflow.com/a/46721064/4638182
        if not all(sl in uniq_labels for sl in uniq_selected_labels):
            raise ValueError("selected_labels: {} must all "
                             "be included in the labels: "
                             "{}.".format(uniq_selected_labels,
                                          uniq_labels))
        keep = np.array([lab in uniq_selected_labels
                         for lab in labels.tolist()], dtype=bool)
        labels = labels[keep]
        projection2d = projection2d[keep]
        if gradient is not None:
            gradient = gradient[keep]
        if label_markers is not None:
            # Markers are per sample, so they must be filtered together with
            # the samples; otherwise marker indices point at wrong rows.
            label_markers = [m for m, k in zip(label_markers, keep) if k]

    fig, ax = plt.subplots(figsize=figsize)
    # All drawing below goes through `ax` / `fig` explicitly rather than
    # pyplot's "current axes", which changes once the colorbar inset is
    # added.
    if labels is not None:
        # An empty scatter plot is returned if there is no point to plot.
        uniq_labels = np.unique(labels)
        # lab_m_s_ind_lut = {(lab1, m1): [s_inds]}
        lab_m_s_ind_lut = _cluster_scatter_marker_groups(
            labels, plot_different_markers, label_markers)
        ordered_groups = sorted(lab_m_s_ind_lut.items(),
                                key=lambda kv: kv[0])
        # legend entries and their matplotlib PathCollection handles
        lgd_ulabs = []
        lgd_mpcs = []
        if gradient is not None:
            cmap = kwargs.pop("cmap", "viridis")
            # Every group is a separate scatter call. Without a shared color
            # range each call would scale its own subset independently, and
            # the single colorbar would only describe the last group.
            if ("norm" not in kwargs and gradient.size > 0
                    and gradient.dtype.kind in "iuf"):
                kwargs.setdefault("vmin", np.nanmin(gradient))
                kwargs.setdefault("vmax", np.nanmax(gradient))
            for (ulab, ulab_m), s_inds in ordered_groups:
                mpc = ax.scatter(x=projection2d[s_inds, 0],
                                 y=projection2d[s_inds, 1],
                                 c=gradient[s_inds], cmap=cmap,
                                 marker=ulab_m, s=s, alpha=alpha, **kwargs)
                lgd_ulabs.append(ulab)
                lgd_mpcs.append(mpc)
            if add_legend and len(labels) != 0:
                # Shrink current axis by 30% to fit legend and colorbar
                box = ax.get_position()
                ax.set_position([box.x0, box.y0,
                                 box.width*0.7, box.height])
                ax.legend(handles=lgd_mpcs, labels=lgd_ulabs,
                          bbox_to_anchor=(1.25, 1), loc=2,
                          borderaxespad=0.)
                # colorbar location
                # ref:
                # https://matplotlib.org/gallery/axes_grid1/
                # demo_colorbar_with_inset_locator.html
                cb_axins = inset_axes(
                    ax,
                    width="5%",
                    height="100%",
                    loc=2,
                    bbox_to_anchor=(1.05, 0., 1, 1),
                    bbox_transform=ax.transAxes,
                    borderpad=0)
                fig.colorbar(lgd_mpcs[-1], cax=cb_axins)
        else:
            label_color_arr = np.array(
                sns.color_palette("hls", len(uniq_labels)))
            if shuffle_label_colors:
                rng.shuffle(label_color_arr)
            color_lut = dict(zip(uniq_labels, label_color_arr))
            s_col_arr = np.array([color_lut[lab] for lab in labels])
            for (ulab, ulab_m), s_inds in ordered_groups:
                mpc = ax.scatter(x=projection2d[s_inds, 0],
                                 y=projection2d[s_inds, 1],
                                 c=s_col_arr[s_inds],
                                 marker=ulab_m, s=s, alpha=alpha, **kwargs)
                lgd_ulabs.append(ulab)
                lgd_mpcs.append(mpc)
            # Add legend
            # Shrink current axis by 20%
            if add_legend and len(labels) != 0:
                box = ax.get_position()
                ax.set_position([box.x0, box.y0,
                                 box.width*0.8, box.height])
                ax.legend(handles=lgd_mpcs, labels=lgd_ulabs,
                          bbox_to_anchor=(1.05, 1), loc=2,
                          borderaxespad=0.)
        # add text annotation: n_txt_per_cluster samples are drawn (with
        # replacement) from each label and annotated with their label
        for ulab in uniq_labels:
            anno_inds = rng.choice(np.where(labels == ulab)[0],
                                   n_txt_per_cluster)
            for i in map(int, anno_inds):
                ax.annotate(labels[i],
                            (projection2d[i, 0], projection2d[i, 1]))
    elif gradient is None:
        ax.scatter(x=projection2d[:, 0], y=projection2d[:, 1],
                   s=s, alpha=alpha, **kwargs)
    else:
        cmap = kwargs.pop("cmap", "viridis")
        mpc = ax.scatter(x=projection2d[:, 0], y=projection2d[:, 1],
                         c=gradient, cmap=cmap, s=s, alpha=alpha, **kwargs)
        if add_legend:
            fig.colorbar(mpc, ax=ax)

    if title is not None:
        ax.set_title(title)
    if xlab is not None:
        ax.set_xlabel(xlab)
    if ylab is not None:
        ax.set_ylabel(ylab)
    if xlim is not None:
        ax.set_xlim(xlim)
    if ylim is not None:
        ax.set_ylim(ylim)
    # Close this specific figure so that pyplot does not keep or display it;
    # the figure object itself stays usable by the caller.
    plt.close(fig)
    return fig
def regression_scatter(x, y, title=None, xlab=None, ylab=None,
                       figsize=(5, 5), alpha=1, s=0.5, ax=None, **kwargs):
    """Paired vector scatter plot drawn with ``seaborn.regplot``.

    Args:
        x: values on the x axis.
        y: values on the y axis.
        title (str): axes title.
        xlab (str): x axis label. When given, x is wrapped in a named
            pandas Series before plotting.
        ylab (str): y axis label. When given, y is wrapped in a named
            pandas Series before plotting.
        figsize (tuple of two number): (width, height) applied to the figure
            that owns the axes.
        alpha (number): currently not used by this function.
        s (number): currently not used by this function.
        ax (matplotlib axes): axes to draw on. A new figure is created when
            None.
        **kwargs: passed to ``seaborn.regplot``.

    Returns:
        matplotlib figure that contains the plot.
    """
    if xlab is not None:
        x = pd.Series(x, name=xlab)
    if ylab is not None:
        y = pd.Series(y, name=ylab)
    # initialize a new figure
    if ax is None:
        _, ax = plt.subplots()
    ax = sns.regplot(x=x, y=y, ax=ax, **kwargs)
    fig = ax.get_figure()
    if title is not None:
        ax.set_title(title)
    if xlab is not None:
        ax.set_xlabel(xlab)
    if ylab is not None:
        ax.set_ylabel(ylab)
    fig.set_size_inches(*figsize)
    # Close the figure that was drawn on. A bare plt.close() closes whatever
    # figure is current, which is a different one when the caller supplied
    # `ax` from a non-current figure.
    plt.close(fig)
    return fig
def hist_dens_plot(x, title=None, xlab=None, ylab=None, figsize=(5, 5),
                   ax=None, **kwargs):
    """Plot histogram and density plot of x with ``seaborn.distplot``.

    Args:
        x: values to plot.
        title (str): axes title.
        xlab (str): x axis label.
        ylab (str): y axis label.
        figsize (tuple of two number): (width, height) applied to the figure
            that owns the axes.
        ax (matplotlib axes): axes to draw on. A new figure is created when
            None.
        **kwargs: passed to ``seaborn.distplot``.

    Returns:
        matplotlib figure that contains the plot.
    """
    if ax is None:
        _, ax = plt.subplots()
    ax = sns.distplot(x, norm_hist=None, ax=ax, **kwargs)
    fig = ax.get_figure()
    if title is not None:
        ax.set_title(title)
    if xlab is not None:
        ax.set_xlabel(xlab)
    if ylab is not None:
        ax.set_ylabel(ylab)
    fig.set_size_inches(*figsize)
    # Close the figure that was drawn on rather than whichever figure pyplot
    # considers current (they differ when `ax` belongs to another figure).
    plt.close(fig)
    return fig
def swarm(x, labels=None, selected_labels=None,
          title=None, xlab=None, ylab=None, figsize=(10, 10), ax=None,
          **kwargs):
    """Swarm plot of the values in x, grouped by label.

    Args:
        x (1d numeric array): values of n samples.
        labels (list of labels): labels of n samples. When None, all samples
            are put in a single group labeled 0.
        selected_labels (list of labels): only samples with these labels are
            plotted. Requires labels.
        title (str): axes title.
        xlab (str): x axis label.
        ylab (str): y axis label.
        figsize (tuple of two number): (width, height) applied to the figure
            that owns the axes.
        ax (matplotlib axes): axes to draw on. A new figure is created when
            None.
        **kwargs: passed to ``seaborn.swarmplot``.

    Returns:
        matplotlib figure that contains the plot.

    Raises:
        ValueError: if x is not a non-empty 1d array, labels has a different
            length than x, selected_labels is given without labels or
            contains unknown labels, or no value is left after selection.
    """
    # check x
    x = np.array(x, dtype="float")
    if x.ndim != 1:
        raise ValueError("x should be 1d and have shape "
                         "(n_samples,). {}".format(x))
    if x.shape[0] == 0:
        raise ValueError("x must be non-empty.")
    # check label length
    if labels is not None:
        mtype.check_is_valid_labs(labels)
        labels = np.array(labels)
        if labels.shape[0] != x.shape[0]:
            raise ValueError("labels should have the same length ({}) as "
                             "n_samples in x "
                             "({})".format(labels.shape[0], x.shape[0]))
    else:
        if selected_labels is not None:
            raise ValueError("selected_labels needs labels to be "
                             "provided.")
        labels = np.repeat(0, x.shape[0])
    # keep selected labels only
    if selected_labels is not None:
        # labels can only be existing
        uniq_selected_labels = np.unique(selected_labels).tolist()
        uniq_labels = np.unique(labels).tolist()
        # np.in1d(uniq_selected_labels, uniq_labels) will cause
        # future warning:
        # https://stackoverflow.com/a/46721064/4638182
        if not all(sl in uniq_labels for sl in uniq_selected_labels):
            raise ValueError("selected_labels: {} must all "
                             "be included in the labels: "
                             "{}.".format(uniq_selected_labels,
                                          uniq_labels))
        keep = np.array([lab in uniq_selected_labels
                         for lab in labels.tolist()], dtype=bool)
        labels = labels[keep]
        x = x[keep]
    if len(x) == 0:
        raise ValueError("No value selected.")

    plt_df = pd.DataFrame({"labels": labels, "val": x})
    if ax is None:
        _, ax = plt.subplots()
    ax = sns.swarmplot(x="labels", y="val", data=plt_df, ax=ax, **kwargs)
    fig = ax.get_figure()
    if title is not None:
        ax.set_title(title)
    if xlab is not None:
        ax.set_xlabel(xlab)
    if ylab is not None:
        ax.set_ylabel(ylab)
    fig.set_size_inches(*figsize)
    # Close the figure that was drawn on rather than whichever figure pyplot
    # considers current (they differ when `ax` belongs to another figure).
    plt.close(fig)
    return fig
def heatmap(x, row_labels=None, col_labels=None, title=None, xlab=None,
            ylab=None, figsize=(20, 20), transform=None,
            shuffle_row_colors=False, shuffle_col_colors=False,
            random_state=None, row_label_order=None, col_label_order=None,
            **kwargs):
    """Heatmap of a 2D matrix with optional row / column color labels.

    The figure is a 2x2 grid: colorbar (upper left), column color strip and
    its legend (upper right), row color strip and its legend (lower left),
    and the heatmap itself (lower right).

    Args:
        x (2D numeric array): matrix to plot.
        row_labels (list of labels): one label per row of x.
        col_labels (list of labels): one label per column of x.
        title (str): figure title.
        xlab (str): x axis label of the heatmap axes.
        ylab (str): y axis label of the heatmap axes.
        figsize (tuple of two number): (width, height).
        transform (callable): applied to a float copy of x before plotting.
        shuffle_row_colors (bool): shuffle the colors of row labels.
        shuffle_col_colors (bool): shuffle the colors of column labels.
        random_state (int): seed passed to ``labs_to_cmap``.
        row_label_order (list of labels): order of the row legend entries.
            Sorted unique labels are used when None.
        col_label_order (list of labels): order of the column legend
            entries. Sorted unique labels are used when None.
        **kwargs: passed to ``imshow`` of the heatmap. ``cmap`` defaults to
            "magma" and ``interpolation`` to "nearest".

    Returns:
        matplotlib figure that contains the heatmap.

    Raises:
        ValueError: if x is not a non-empty 2D array, transform is not
            callable, or the label lengths do not match the shape of x.
    """
    x = np.array(x, dtype="float")
    if x.ndim != 2:
        raise ValueError("x should be 2D array. {}".format(x))
    if x.size == 0:
        raise ValueError("x cannot be empty.")

    if transform is not None:
        if not callable(transform):
            raise ValueError("transform must be callable. It will be "
                             "on x.")
        # now x must be float, so copy and transform will not cause side
        # effects on original array.
        x = transform(x.copy())

    if row_labels is not None:
        mtype.check_is_valid_labs(row_labels)
        if len(row_labels) != x.shape[0]:
            raise ValueError("length of row_labels should be the same as the "
                             "number of rows in x."
                             " row_labels: {}. x: {}".format(len(row_labels),
                                                             x.shape))
    if col_labels is not None:
        mtype.check_is_valid_labs(col_labels)
        if len(col_labels) != x.shape[1]:
            raise ValueError("length of col_labels should be the same as the "
                             "number of columns in x."
                             " col_labels: {}. x: {}".format(len(col_labels),
                                                             x.shape))

    kwargs.setdefault("interpolation", "nearest")
    im_cmap = kwargs.pop("cmap", "magma")

    fig = plt.figure(figsize=figsize)
    if title is not None:
        fig.suptitle(title)
    # outer 2x2 grid
    gs = mpl.gridspec.GridSpec(2, 2,
                               width_ratios=[1, 4],
                               height_ratios=[1, 4],
                               wspace=0.0, hspace=0.0)
    # inner upper right for color labels and legends
    ur_gs = mpl.gridspec.GridSpecFromSubplotSpec(2, 1,
                                                 height_ratios=[3, 1],
                                                 subplot_spec=gs[1],
                                                 wspace=0.0, hspace=0.0)
    # inner lower left for color labels and legends
    ll_gs = mpl.gridspec.GridSpecFromSubplotSpec(1, 2,
                                                 width_ratios=[3, 1],
                                                 subplot_spec=gs[2],
                                                 wspace=0.0, hspace=0.0)
    ax_lut = {
        "cb_ax": fig.add_subplot(gs[0]),
        "hm_ax": fig.add_subplot(gs[3]),
        "lcol_ax": fig.add_subplot(ll_gs[1]),
        "ucol_ax": fig.add_subplot(ur_gs[1]),
        "llgd_ax": fig.add_subplot(ll_gs[0]),
        "ulgd_ax": fig.add_subplot(ur_gs[0])
    }
    # remove frames and ticks
    for iax in ax_lut.values():
        iax.set_xticks([])
        iax.set_yticks([])
        iax.axis("off")

    # lower right heatmap
    imgp = ax_lut["hm_ax"].imshow(x, cmap=im_cmap, aspect="auto", **kwargs)
    if xlab is not None:
        ax_lut["hm_ax"].set_xlabel(xlab)
    if ylab is not None:
        ax_lut["hm_ax"].set_ylabel(ylab)

    # upper left colorbar
    fig.colorbar(imgp, cax=ax_lut["cb_ax"])
    ax_lut["cb_ax"].set_aspect(5, anchor="W")
    ax_lut["cb_ax"].yaxis.tick_left()
    ax_lut["cb_ax"].axis("on")

    # color labels and legends
    ax_lut["ucol_ax"].set_anchor("S")
    ax_lut["lcol_ax"].set_anchor("E")
    # (labels, shuffle, legend order, strip axes, legend axes,
    #  strip shape, legend kwargs); columns first, then rows
    label_specs = (
        (col_labels, shuffle_col_colors, col_label_order,
         ax_lut["ucol_ax"], ax_lut["ulgd_ax"], (1, -1),
         {"loc": "center", "ncol": 6}),
        (row_labels, shuffle_row_colors, row_label_order,
         ax_lut["lcol_ax"], ax_lut["llgd_ax"], (-1, 1),
         {"loc": "upper center", "ncol": 1}),
    )
    for (labs, shuffle, lab_order, col_ax, lgd_ax,
         strip_shape, lgd_kwargs) in label_specs:
        if labs is None:
            continue
        cmap, norm, lab_inds, ulab_col_lut, ulab_lut = labs_to_cmap(
            labs, return_lut=True, shuffle_colors=shuffle,
            random_state=random_state)
        if lab_order is None:
            lab_order = sorted(ulab_lut.values())
        lgd_patches = [mpl.patches.Patch(color=ulab_col_lut[ulab],
                                         label=ulab)
                       for ulab in lab_order]
        # one-pixel-wide strip whose cells are colored by label index
        col_ax.imshow(lab_inds.reshape(strip_shape), cmap=cmap, norm=norm,
                      aspect="auto", interpolation="nearest")
        lgd_ax.legend(handles=lgd_patches, **lgd_kwargs)

    # Close this specific figure so that pyplot does not keep or display it.
    plt.close(fig)
    return fig
def networkx_graph(ng, pos=None, alpha=0.05, figsize=(20, 20), gradient=None,
                   labels=None, different_label_markers=True,
                   node_size=30, node_with_labels=False,
                   nx_draw_kwargs=None):
    """Draw a networkx graph, optionally styled by node labels / gradient.

    Args:
        ng: networkx graph to draw.
        pos: node positions, passed to the networkx drawing functions.
        alpha (number): transparency of nodes and edges.
        figsize (tuple of two number): (width, height).
        gradient (list of number): per-node values mapped to node colors.
        labels (list of labels): per-node labels. Position ``j`` in this
            list is used as the node id ``j`` when drawing, so nodes are
            presumably expected to be the integers ``0..n-1`` -- confirm
            against callers.
        different_label_markers (bool): use a different marker for each
            unique label instead of "o" for all.
        node_size (number): size of the nodes.
        node_with_labels (bool): draw node names. Only takes effect when
            neither labels nor gradient is given.
        nx_draw_kwargs (dict): extra keyword arguments for networkx drawing.
            Only used when labels is None. The dict is not modified.

    Returns:
        matplotlib figure that contains the drawn graph.
    """
    # TODO: offset labels
    fig = plt.figure(figsize=figsize)
    # Work on a copy: entries are popped below, and the caller's dict must
    # not change as a side effect.
    nx_draw_kwargs = {} if nx_draw_kwargs is None else dict(nx_draw_kwargs)

    if labels is None:
        if gradient is None:
            # no label. no gradient.
            # plot all nodes as blue.
            node_color = nx_draw_kwargs.pop("node_color", "b")
            cmap = nx_draw_kwargs.pop("cmap", None)
            nx.draw_networkx(ng, pos, alpha=alpha, node_color=node_color,
                             cmap=cmap, node_size=node_size,
                             with_labels=node_with_labels,
                             **nx_draw_kwargs)
        else:
            # no label. has gradient.
            cmap = nx_draw_kwargs.pop("cmap", "viridis")
            nx.draw_networkx_edges(ng, pos, alpha=alpha)
            # with_labels is not an option of draw_networkx_nodes; newer
            # networkx versions reject unknown keywords, so it is not
            # forwarded here.
            mcp = nx.draw_networkx_nodes(ng, pos, alpha=alpha,
                                         node_color=gradient,
                                         cmap=cmap,
                                         node_size=node_size,
                                         **nx_draw_kwargs)
            plt.colorbar(mcp)
    else:
        # prepare markers and colors for each unique label
        if different_label_markers:
            # cycle use the following filled markers:
            # "o": circle, "s": square, "^": triangle_up, "D": diamond,
            # "x": x, "v": triangle_down, "d": thin_diamond, "+": plus,
            # ">": triangle_right, "p": pentagon, "h": hexagon1,
            # "<": triangle_left, "H": hexagon2, "*": star
            # ref: https://matplotlib.org/api/markers_api.html
            m_cycle = "os^Dxvd+>ph<H*"
        else:
            # all labels use "o"
            m_cycle = "o"
        uniq_labels = sorted(set(labels))
        ulab_colors = sns.hls_palette(len(uniq_labels))
        # {(label, marker, color): [node indices]}, in sorted label order
        lab_m_s_ind_lut = OrderedDict()
        for i, ulab in enumerate(uniq_labels):
            ulab_m = m_cycle[i % len(m_cycle)]
            ulab_c = ulab_colors[i]
            lab_m_s_ind_lut[(ulab, ulab_m, ulab_c)] = [
                j for j in range(len(labels)) if labels[j] == ulab]

        nx.draw_networkx_edges(ng, pos, alpha=alpha)
        if gradient is None:
            # has label. no gradient.
            # plot different labels with different colors and markers.
            for (ulab, ulab_m, ulab_c), ulab_s_inds in lab_m_s_ind_lut.items():
                nx.draw_networkx_nodes(ng, pos, alpha=alpha,
                                       nodelist=ulab_s_inds,
                                       node_color=ulab_c,
                                       node_shape=ulab_m,
                                       node_size=node_size,
                                       label=ulab)
        else:
            # has label. has gradient.
            gradient = np.array(gradient)
            cmap = nx_draw_kwargs.pop("cmap", "viridis")
            # Each label is drawn separately. Use one color range for all of
            # them, so that colors are comparable across labels and the
            # colorbar is valid for every group rather than only the last.
            vmin = vmax = None
            if gradient.size > 0 and gradient.dtype.kind in "iuf":
                vmin = np.nanmin(gradient)
                vmax = np.nanmax(gradient)
            mcp = None
            for (ulab, ulab_m, ulab_c), ulab_s_inds in lab_m_s_ind_lut.items():
                mcp = nx.draw_networkx_nodes(ng, pos, alpha=alpha,
                                             nodelist=ulab_s_inds,
                                             node_color=gradient[ulab_s_inds],
                                             node_shape=ulab_m,
                                             node_size=node_size,
                                             label=ulab,
                                             cmap=cmap,
                                             vmin=vmin, vmax=vmax)
            # TODO: move legend out of the graph
            if mcp is not None:
                plt.colorbar(mcp)
        # TODO: move legend out of the graph
        plt.legend(scatterpoints=1)
    plt.close(fig)
    return fig