Content Comparison

I've made available some publicly available data containing single cell RNA-Seq data for 5k immune cells as well as a script that runs the Seurat workflow to define cell-type clusters in this data.

You can get the data, R script and results from ls6 here:

Script and Data locations

Code Block

/work2/projects/BioITeam/projects/courses/rnaseq_course/day_5_single_cell_data
	Script: seuratTotalScript5k.mod.R
	Data (after preprocessing by cellRanger): filtered_feature_bc_matrix
	Results generated: results

#You can copy over the data to your directory:
cds
cd my_rnaseq_course
cp -r /work2/projects/BioITeam/projects/courses/rnaseq_course/day_5_single_cell_data .

The script we are going to run is an older version (Because Seurat on TACC is an older version). You can kick it off by doing the following:

Load modules and execute R script

Code Block
module load seurat-scripts/ctr-0.0.5--r341_0 #OPEN AN IDEV SESSION: idev -m 120 -q normal -A OTH21164 -r rna-seq-class-0613 R CMD BATCH seuratTotalScript5k.mod.3.4.R &

The script does the following (newer versions of the functions given here):

Load required libraries

Code Block
library(Seurat) library(ggplot2) library(dplyr) library(data.table) #if doing sctransform with newer version: library()

Load Data and Create Seurat Object

Load data and create Seurat object

Code Block
pbmc_data <- Read10X(data.dir = "filtered_feature_bc_matrix") pbmc <- CreateSeuratObject(pbmc_data)

Check what the counts matrix looks like

Look at counts matrix

Code Block

pbmc_data[c("CD3D", "TCL1A", "MS4A1"), 1:30]

##output                                                                   
## CD3D  4 . 10 . . 1 2 3 1 . . 2 7 1 . . 1 3 . 2  3 . . . . . 3 4 1 5
## TCL1A . .  . . . . . . 1 . . . . . . . . . . .  . 1 . . . . . . . .
## MS4A1 . 6  . . . . . . 1 1 1 . . . . . . . . . 36 1 2 . . 2 . . . .

Check what the seurat object looks like

Look at seurat object

Code Block
pbmc An object of class seurat in project SeuratProject 33538 genes across 5155 samples. #right now, it just has counts, but as we process the data, many other slots will get added.

Normalize Data Using SCTransform and Regress out Genes Related to Cell Cycle

Normalize data

Code Block

s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
pbmc <- CellCycleScoring(pbmc, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
pbmc$CC.Difference <- pbmc$S.Score - pbmc$G2M.Score
pbmc <- SCTransform(pbmc, vars.to.regress = "CC.Difference")

Identify Highly Variable Features

Identify Highly Variable Features and Generate Plots

Code Block

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(pbmc), 10)
pdf("variableFeaturesPlot.pdf")
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
CombinePlots(plots = list(plot1, plot2))
dev.off()

Scale the Data

Scale data

Code Block
all.genes <- rownames(pbmc) pbmc <- ScaleData(pbmc, features = all.genes)

Select Dimensions using PCA (using variable features)

Perform PCA for Dimensionality Reduction

Code Block

#Perform PCA for Dimensionality Reduction
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

#Print and Examine PCA Results in Multiple Formats
pdf("findDimensions.pdf")
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca")

#Generate Principal Component Heatmaps through PC_30
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 16:30, cells = 500, balanced = TRUE)

#Create Elbow Plot of Principal Components through PC_30
ElbowPlot(pbmc, ndims = 30, reduction = "pca")
dev.off()

Cluster the cells (using the selected number of dimensions)

Cluster the Cells for the selected numeber of Principal Components

Code Block

#Cluster the Cells for the selected numeber of Principal Components (we selected 14 for 5k Data)
pbmc <- FindNeighbors(pbmc, dims = 1:14)
pbmc <- FindClusters(pbmc, resolution = 0.5)

#View the Cluster IDs of the First 5 Cells
head(Idents(pbmc), 5)

Visualize the clusters using tSNE or UMAP

Visualize using tSNE

Code Block
pdf("tsne.pdf") pbmcTSNE <- RunTSNE(pbmc, dims = 1:14) DimPlot(pbmcTSNE, reduction = "tsne") dev.off()

Find markers for each cluster

Find Markers for Every Cluster and Print Top 2 Per Cluster

Code Block

pbmc.markers <- FindAllMarkers(pbmcTSNE, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
write.csv(pbmc.markers, file = "markersAll.csv")

#Generate Heatmap of Top 10 Markers per Cluster
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pbmcTSNE, features = top10$gene, size = 3) + theme(axis.text.y = element_text(size = 5)) + NoLegend()

Identify cells expressing genes of interest

Identify Clusters for Marker Genes of Interest

Code Block

markerGenes <- c("CD27", "CCR7", "CD8A", "CD8B", "IL7R", "GZMK", "CD79A", "CD37", "CD160", "NKG7", "GNL$
seuratClusters <- function(output, genes) {
        for (gene in genes) {
        print(output[output$gene == gene,]) }
        }
seuratClusters(pbmc.markers, markerGenes)

#Visualize Feature Expression on TSNE Plot of Top Gene per Cluster
FeaturePlot(pbmcTSNE, features = c("LTB","S100A8","CCL5","NOSIP"))
FeaturePlot(pbmcTSNE, features = c("LINC02446","IGKC","GNLY","MT-CO3"))
FeaturePlot(pbmcTSNE, features = c("CST3","NKG7","KLRB1","LST1"))
FeaturePlot(pbmcTSNE, features = c("PPBP","AC084033.3"))

#Visualize Expression Probability Distribution Across Clusters for Top Gene per Cluster
VlnPlot(pbmcTSNE, features = c("LTB","S100A8","CCL5","NOSIP"))
VlnPlot(pbmcTSNE, features = c("LINC02446","IGKC","GNLY","MT-CO3"))
VlnPlot(pbmcTSNE, features = c("CST3","NKG7","KLRB1","LST1"))
VlnPlot(pbmcTSNE, features = c("PPBP","AC084033.3"))

Label clusters as cell types based on expression of known marker genes.

Generate Labeled TSNE Plot

Code Block

new.cluster.ids <- c("CD4+ Memory Cells", "CD16+ and CD14+ Monocytes", "Regulatory T Cells", "CD8+ T Cells", "N/A (Cluster 4)", "NK Cells", "B Cells", "Monocyte Derived Dendritic Cells", "N/A (Cluster 8)", "N/A (Cluster 9)", "Monocyte Derived Dendritic Cells", "N/A (Cluster 11)", "N/A (Cluster 12)", "Megakaryocyte Progenitors")
names(new.cluster.ids) <- levels(pbmcTSNE)
pbmcTSNE <- RenameIdents(pbmcTSNE, new.cluster.ids)
DimPlot(pbmcTSNE, reduction = "tsne", label = TRUE)

Let's look at the results:

Download this zip file and unzip it on your computer to view the files (Most computers will automatically unzip files).

results.zip

Version	Old Version 22	New Version 23
Changes made by	Dhivya Arasappan	Dhivya Arasappan
Saved on	Jun 13, 2025	Jun 13, 2025

Versions Compared

Key

Script and Data locations

Load modules and execute R script