Example of the SuperCell pipeline

Installing SuperCell package from gitHub

if (!requireNamespace("remotes")) install.packages("remotes")
remotes::install_github("GfellerLab/SuperCell")
library(SuperCell)
#> Thank you for installing SuperCell! SuperCell is developed by the group of David Gfeller at University of Lausanne.
#> 
#> SuperCell can be used freely by academic groups for non-commercial purposes (see license). The product is provided free of charge, and, therefore, on an 'as is' basis, without warranty of any kind.
#> 
#> FOR-PROFIT USERS
#> 
#> If you plan to use SuperCell or any data provided with the script in any for-profit application, you are required to obtain a separate license. To do so, please contact eauffarth@licr.org at the Ludwig Institute for Cancer Research Ltd.
#> 
#> If required, FOR-PROFIT USERS are also expected to have proper licenses for the tools used in SuperCell, including the R packages igraph, RANN, WeightedCluster, corpora, weights, Hmisc, Matrix, ply, irlba, grDevices, patchwork, ggplot2 and velocyto.R
#> 
#> For scientific questions, please contact Mariia Bilous (mariia.bilous@unil.ch) or David Gfeller (David.Gfeller@unil.ch).

Analysis

Load scRNA-seq data of 5 cancer cell lines from Tian et al., 2019.

Data available at authors’ GitHub under file name sincell_with_class_5cl.Rdata.

data(cell_lines) # list with GE - gene expression matrix (logcounts), meta - cell meta data
GE <- cell_lines$GE
dim(GE) # genes as rows and cells as columns
#> Loading required package: Matrix
#> NULL
cell.meta <- cell_lines$meta

Simplify single-cell data at the graining level \(gamma = 20\)

(i.e., 20 times less metacells (called ‘supercells’ in the package functions) than single cells) by first building a kNN (\(k=5\)) network using top \(n.var.genes=1000\) most variable genes for dimentionality reduction. Function SCimplify() computes the partition into metacells, this information is available with the field membership.

gamma <- 20 # graining level
k.knn <- 5

# Building metacells from gene expressio (GE)
SC <- SCimplify(GE,  # gene expression matrix 
                k.knn = k.knn, # number of nearest neighbors to build kNN network
                gamma = gamma, # graining level
                n.var.genes = 1000 # number of the top variable genes to use for dimentionality reduction 
)


# Alternative, metacells can be build from low-dimensional embedding.
# For this, first compute low-dim embedding and pass it into `SCimplify_from_embedding()`
if(0){
  SC_alt <- SCimplify_from_embedding(
    X = stats::prcomp(Matrix::t(GE[SC$genes.use,]), rank. = 10)$x, # PCA embedding
    k.knn = k.knn, # number of nearest neighbors to build kNN network
    gamma = gamma # graining level)
  )
}

# plot network of metacells

supercell_plot(SC$graph.supercells, # network
               color.use = "gray", # color of the nodes
               main = paste("Metacell network, gamma =", gamma), 
               seed = 1) 


# plot single-cell network
supercell_plot(SC$graph.singlecell, # network
               group = cell.meta, # colored by cell line assignment
               do.frames = F, # not drawing frames around each node 
               main = paste("Single-cell network, N =", dim(GE)[2]), 
               lay.method = "components") # method to compute the network 2D embedding 

Compute gene expression for simplified data

To get a gene expression of metacells, we need to average gene expressions within each metacell with function supercell_GE()

SC.GE <- supercell_GE(GE, SC$membership)
dim(SC.GE) 
#> [1] 2000   98

Map each metacell to a particular cell line

We now assign each metcell to a particular cell line based on the cell line data, for this, we use function supercell_assign(). By default, this function assign each metacell to a cluster with the largest Jaccard coefficient to avoid biases towards very rare or very abundant clusters. Alternatively, assigmnent can be performed using relative (may cause biase towards very small populations) or absolute (may cause bias towards large populations) abundance with method = "relative" or method = "absolute", respectively.

SC$cell_line <- supercell_assign(clusters = cell.meta, # single-cell assigment to cell lines (clusters)
                                 supercell_membership = SC$membership, # single-cell assignment to metacells
                                 method = "jaccard")


seed <- 1 # seed for network plotting 

# plot network of metacells colored by cell line assignment 
supercell_plot(SC$graph.supercells, 
               group = SC$cell_line, 
               seed = seed, 
               main = "Metacells colored by cell line assignment")

The quality of assignment can be evaluated with metacell purity (function supercell_purity()) that returns the proportion of the most abundant cell type (in this case, cell line) in each metacell.

# compute purity of metacells in terms of cell line composition
purity <- supercell_purity(clusters = cell.meta, 
                           supercell_membership = SC$membership, method = 'max_proportion')
hist(purity, main = "Purity of metacells \nin terms of cell line composition")

SC$purity <- purity

Some options to plot networks of metacells

## rotate network to be more consistent with the single-cell one
supercell_plot(SC$graph.supercells, 
               group = SC$cell_line, 
               seed = seed, 
               alpha = -pi/2,
               main  = "Metacells colored by cell line assignment (rotated)")


## alternatively, any layout can be provided as 2xN numerical matrix, where N is number of nodes (cells)

## Let's plot metacell network using the layout of the single-cell network:
## 1) get single-cell network layout 
my.lay.sc <- igraph::layout_components(SC$graph.singlecell) 

## 2) compute metacell network layout averaging coordinates withing metacells
my.lay.SC <- Matrix::t(supercell_GE(ge = t(my.lay.sc), groups = SC$membership))

## 3) provide layout with the parameter $lay$
supercell_plot(SC$graph.supercells, 
               group = SC$cell_line, 
               lay = my.lay.SC,
               main  = "Metacells colored by cell line assignment (averaged coordinates)")

Cluster metacell data

#dimensionality reduction 
SC.PCA         <- supercell_prcomp(Matrix::t(SC.GE), # metacell gene expression matrix
                                   genes.use = SC$genes.use, # genes used for the coarse-graining, but any set can be provided
                                   supercell_size = SC$supercell_size, # sample-weighted pca
                                   k = 20) 
## compute distance
D              <- dist(SC.PCA$x)

## cluster metacells
SC.clusters    <- supercell_cluster(D = D, k = 5, supercell_size = SC$supercell_size) 
SC$clustering  <- SC.clusters$clustering

Map clusters of metacells to cell lines

## mapping metacell cluster to cell line 
map.cluster.to.cell.line    <- supercell_assign(supercell_membership = SC$clustering, clusters  = SC$cell_line)
## clustering as cell line
SC$clustering_reordered     <- map.cluster.to.cell.line[SC$clustering]

supercell_plot(SC$graph.supercells, 
               group = SC$clustering_reordered, 
               seed = seed,
               alpha = -pi/2,
               main = "Metacells colored by cluster")

Differential expression analysis of clustered metacell data

markers.all.positive <- supercell_FindAllMarkers(ge = SC.GE, # metacell gene expression matrix
                                                 supercell_size = SC$supercell_size, # size of metacell for sample-weighted method
                                                 clusters = SC$clustering_reordered, # clustering
                                                 logfc.threshold = 1, # mininum log fold-change
                                                 only.pos = T) # keep only upregulated genes
markers.all.positive$H2228[1:20,]
#>          p.value adj.p.value pct.1     pct.2    logFC w.mean.1   w.mean.2
#> S100A9         0           0     1 0.9821429 4.900415 3.407261 0.50021541
#> CD74           0           0     1 0.9515306 4.706122 5.160220 0.33755798
#> SAA1           0           0     1 0.9623724 4.645834 5.692237 0.70177437
#> CXCL1          0           0     1 0.9815051 4.597609 5.086265 0.34463066
#> CXCL6          0           0     1 0.9081633 3.909404 3.755125 0.15881505
#> LCN2           0           0     1 1.0000000 3.870886 6.867711 1.49507670
#> HLA-B          0           0     1 1.0000000 3.565492 5.059566 1.22965559
#> HLA-C          0           0     1 0.9751276 3.384311 4.180295 0.73496480
#> CXCL8          0           0     1 0.8641582 3.156071 3.184266 0.14175113
#> HLA-DRA        0           0     1 0.8131378 3.072904 2.809587 0.09010310
#> IGFBP3         0           0     1 1.0000000 3.024813 4.496049 1.31677725
#> BIRC3          0           0     1 0.8463010 2.994637 3.182946 0.21117832
#> HLA-DRB1       0           0     1 0.7295918 2.885790 2.644877 0.07274146
#> CCL20          0           0     1 0.8469388 2.814461 2.752270 0.08244648
#> HLA-A          0           0     1 0.9929847 2.656370 4.532330 1.91215341
#> MMP7           0           0     1 0.9317602 2.645386 3.085071 0.53916665
#> SAA2           0           0     1 0.7646684 2.597852 2.576263 0.15414246
#> WFDC2          0           0     1 0.6721939 2.376167 2.171900 0.10202708
#> HLA-DRB5       0           0     1 0.5943878 2.358504 2.256209 0.04683749
#> HLA-DPA1       0           0     1 0.6543367 2.316558 2.109882 0.05200388

Some additional plotting options

genes.to.plot <- c("DHRS2", "MT1P1", "TFF1", "G6PD", "CD74", "CXCL8")

supercell_VlnPlot(ge = SC.GE, 
                  supercell_size = SC$supercell_size, 
                  clusters = SC$clustering_reordered,
                  features = genes.to.plot,
                  idents = c("H1975", "H2228", "A549"), 
                  ncol = 3)


supercell_GeneGenePlot(ge = SC.GE, 
                       gene_x = genes.to.plot[1:3],
                       gene_y = genes.to.plot[4:6],
                       supercell_size = SC$supercell_size, 
                       clusters = SC$clustering_reordered,)
#> $p

#> 
#> $w.cor
#> $w.cor$TFF1_CXCL8
#> [1] -0.2530462
#> 
#> $w.cor$DHRS2_G6PD
#> [1] -0.2143222
#> 
#> $w.cor$MT1P1_CD74
#> [1] 0.08085401
#> 
#> 
#> $w.pval
#> $w.pval$TFF1_CXCL8
#> [1] 5.350818e-30
#> 
#> $w.pval$DHRS2_G6PD
#> [1] 8.66973e-22
#> 
#> $w.pval$MT1P1_CD74
#> [1] 0.0003406761

SuperCell graining level can be quickly chaged with supercell_rescale() function

SC10 <- supercell_rescale(SC, gamma = 10)

SC10$cell_line <- supercell_assign(clusters = cell.meta, # single-cell assigment to cell lines (clusters)
                                 supercell_membership = SC10$membership, # single-cell assignment to metacells
                                 method = "jaccard")

supercell_plot(SC10$graph.supercells, 
               group = SC10$cell_line, 
               seed = 1,
               main  = "Metacells at gamma = 10 colored by cell line assignment")


### don't forget to recompute metacell gene expression matrix for a new grainig level with 
# GE10 <- supercell_GE(GE, SC10$membership)
### if you are going to perform downstream analyses at the new graining level

P.S.: SuperCell to Seurat object

In case you want to perform other analyses available with Seurat package, we can convert SuperCell to Seurat object with function supercell_2_Seurat() or to SingleCellExperiment object with function ‘supercell_2_sce()’. Let consider a Seurat example.

#install.packages("Seurat")
library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.4.0 but the current version is
#> 4.4.1; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#> 
#> Attaching package: 'SeuratObject'
#> The following objects are masked from 'package:base':
#> 
#>     intersect, t

m.seurat <- supercell_2_Seurat(SC.GE = as.matrix(SC.GE), SC = SC, fields = c("cell_line", "clustering","clustering_reordered"))
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> Normalizing layer: counts
#> Done: NormalizeData
#> Doing: data to normalized data
#> Doing: weighted scaling
#> Done: weighted scaling
#> Warning in irlba(A = t(x = object), nv = npcs, ...): You're computing too large
#> a percentage of total singular values, use a standard svd instead.
#> Computing nearest neighbor graph
#> Computing SNN

Note: since metacells have different size (consist of different number of cells), we apply sample-weighted algorithms at most af the steps of the downstream analyses. Thus, when coercing SuperCell to Seurat, we replaced PCA, saling and kNN graph of Seurat object with those obtained applying sample-weighted version of PCA, scaling or SuperCell graph (i.e., metacell network), respectively. If you then again apply RunPCA, ScaleData, or FindNeighbors, the result will be rewritten, but you will be able to access them with Embeddings(m.seurat, reduction = "pca_weigted"), m.seurat@assays$RNA@misc[["scale.data.weighted"]], or m.seurat@graphs$RNA_super_cells, respectively.

PCAPlot(m.seurat)


### cluster SuperCell network (unweighted clustering)
m.seurat <- FindClusters(m.seurat, graph.name = "RNA_nn") # now RNA_nn is metacell network
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 98
#> Number of edges: 255
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8185
#> Number of communities: 7
#> Elapsed time: 0 seconds
#> 1 singletons identified. 6 final clusters.

m.seurat <- FindNeighbors(m.seurat, verbose = FALSE)  # RNA_nn has been replaced with kNN graph of metacell (unweigted)
m.seurat <- FindClusters(m.seurat, graph.name = "RNA_nn") 
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 98
#> Number of edges: 960
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7410
#> Number of communities: 4
#> Elapsed time: 0 seconds