Quality Control

Objective

Prior to conducting the basic Seurat analysis workflow, perform quality control (QC) to remove low-quality cells.

Quality Control Workflow

Automated quality control using “ddqc”
Removal of doublet cells using “DoubletFinder”

Load Required Libraries

# Load necessary libraries
library(Seurat)
library(tidyverse)       # Includes dplyr, ggplot2, etc.

library(ddqcR)           # Automated Quality Control tool
library(DoubletFinder)   # Doublet removal tool

Load Data

# Load the Seurat object containing cell metadata
created_seurat_obj_list <- readRDS("./output/01_load/obj/created_seurat_obj_list.RDS")

# Load the original expression matrix without preprocessing
orig_mtx_list <- readRDS("./output/01_load/obj/orig_mtx_list.RDS")

Calculate the Proportion of Mitochondrial and Ribosomal Genes

Before conducting QC, calculate the proportions of mitochondrial and ribosomal genes.

for(smn in names(created_seurat_obj_list)) {
  # Mitochondrial gene percentage
  created_seurat_obj_list[[smn]][["bfqc_percent.mt"]] <- 
    PercentageFeatureSet(created_seurat_obj_list[[smn]], pattern = "^mt-")
  
  # Ribosomal gene percentage
  created_seurat_obj_list[[smn]][["bfqc_percent.rp"]] <- 
    PercentageFeatureSet(created_seurat_obj_list[[smn]], pattern = "(?i)^RP[SL][[:digit:]]|^RPLP[[:digit:]]|^RPSA")
}

Automated Quality Control using ddqc

Filtering criteria:
- Cells with values significantly higher (percent.mito) or lower (nCount_RNA, nFeature_RNA) than x MADs (Median Absolute Deviation, default x = 2) from the cluster median are removed.
- If the computed threshold within a cluster is greater than 200 for nFeature_RNA or less than 10% for percent.mito, they are set to 200 and 10%, respectively.
Generated plots:
- A log2-scaled nFeature_RNA distribution per cluster with a red line at 200 genes (log2 scale: 7.64).
- A mitochondrial gene percentage distribution per cluster with a red line at 10%.

ddqc_seurat_obj_list <- list()
ddqc_df_list <- list()

for(smn in names(created_seurat_obj_list)){
  print(paste0("ddqc: ", smn))
  tmpdata <- initialQC(created_seurat_obj_list[[smn]])
  ddqc_df_list[[smn]] <- ddqc.metrics(tmpdata)
  ddqc_seurat_obj_list[[smn]] <- filterData(tmpdata, ddqc_df_list[[smn]])
}

## [1] "ddqc: P1_cortex_WT"

## [1] "ddqc: P1_cortex_Fezf2KO"

Check the updated metadata:

head(ddqc_seurat_obj_list[[1]][[]])

Save the filtered Seurat objects:

saveRDS(ddqc_seurat_obj_list, "./output/02_quality_control/obj/ddqc_seurat_obj_list.RDS")
saveRDS(ddqc_df_list, "./output/02_quality_control/obj/ddqc_df_list.RDS")

Doublet Removal using DoubletFinder

Preprocessing for Doublet Detection

Before running “DoubletFinder”, a basic Seurat analysis workflow must be performed:

for_doubletfinder_seurat_obj_list <- lapply(ddqc_seurat_obj_list, NormalizeData)
for_doubletfinder_seurat_obj_list <- lapply(for_doubletfinder_seurat_obj_list, FindVariableFeatures)
for_doubletfinder_seurat_obj_list <- lapply(for_doubletfinder_seurat_obj_list, ScaleData)
for_doubletfinder_seurat_obj_list <- lapply(for_doubletfinder_seurat_obj_list, RunPCA)
for_doubletfinder_seurat_obj_list <- lapply(for_doubletfinder_seurat_obj_list, RunUMAP, dims = 1:10)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

for_doubletfinder_seurat_obj_list <- lapply(for_doubletfinder_seurat_obj_list, FindNeighbors)
for_doubletfinder_seurat_obj_list <- lapply(for_doubletfinder_seurat_obj_list, FindClusters)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6398
## Number of edges: 227149
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8837
## Number of communities: 18
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 7795
## Number of edges: 269490
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9108
## Number of communities: 20
## Elapsed time: 0 seconds

Detecting Doublets with DoubletFinder

set.seed(1)
doublet_finder_res_df_list <- list()

for(nm in names(for_doubletfinder_seurat_obj_list)){
  print(paste0("DoubletFinder: ", nm))
  tmp_seurat_obj <- for_doubletfinder_seurat_obj_list[[nm]]
  
  sweep.res.list_pmbc <- paramSweep(tmp_seurat_obj, PCs = 1:10, sct = FALSE)
  sweep.stats_pmbc <- summarizeSweep(sweep.res.list_pmbc, GT = FALSE)
  bcmvn_pmbc <- find.pK(sweep.stats_pmbc)
  
  homotypic.prop <- modelHomotypic(tmp_seurat_obj[[]]$seurat_clusters)
  nExp_poi <- round(0.075 * nrow(tmp_seurat_obj[[]]))
  nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))
  
  pN <- 0.25
  pK <- bcmvn_pmbc %>% arrange(desc(BCmetric)) %>% pull(pK) %>% as.character %>% as.numeric %>% .[1]
  
  base_char <- paste("DF.classifications", pN, pK, nExp_poi, sep = "_")
  adj_char  <- paste("DF.classifications", pN, pK, nExp_poi.adj, sep = "_")
  char_reuse_pANN <- paste("pANN", pN, pK, nExp_poi, sep = "_")
  
  tmp_seurat_obj <- doubletFinder(tmp_seurat_obj, PCs = 1:10, pN = pN, pK = pK, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
  tmp_seurat_obj <- doubletFinder(tmp_seurat_obj, PCs = 1:10, pN = pN, pK = pK, nExp = nExp_poi.adj, reuse.pANN = char_reuse_pANN, sct = FALSE)
  
  tmp_doublet_finder_res_df <- tmp_seurat_obj[[]][c(base_char, adj_char)]
  colnames(tmp_doublet_finder_res_df) <- c("double_flg", "doublet_adj_flg")
  doublet_finder_res_df_list[[nm]] <- tmp_doublet_finder_res_df
}

Removing Cells Identified as Doublets by “DoubleFinder”

# code:2-9
doubletfinder_ddqc_seurat_obj_list <- list()
for (nm in names(ddqc_seurat_obj_list)) {
  # Add DoubletFinder results to metadata.
  doubletfinder_ddqc_seurat_obj_list[[nm]] <-
    AddMetaData(
      ddqc_seurat_obj_list[[nm]],
      doublet_finder_res_df_list[[nm]]
    )
  # Extract only cells identified as singlets by DoubletFinder.
  doubletfinder_ddqc_seurat_obj_list[[nm]] <- subset(doubletfinder_ddqc_seurat_obj_list[[nm]], subset = doublet_adj_flg %in% c("Singlet"))
}

# Verify metadata to ensure DoubletFinder results have been added.
head(doubletfinder_ddqc_seurat_obj_list[[1]][[]])

Saving Seurat Objects After Doublet Removal

# code:2-10
saveRDS(doubletfinder_ddqc_seurat_obj_list, "./output/02_quality_control/obj/doubletfinder_ddqc_seurat_obj_list.RDS")

Quality Control Evaluation

Perform summarization before and after Quality Control (QC). To do so, collect metadata from each Seurat object.

# code:2-11
get_metadata <- function(smn, obj){
  return(obj[[smn]][[]])
}
before_qc_metadata_df <- do.call(bind_rows, lapply(names(created_seurat_obj_list), get_metadata, obj = created_seurat_obj_list))
colnames(before_qc_metadata_df)[which(colnames(before_qc_metadata_df) %in% c("bfqc_percent.mt", "bfqc_percent.rp"))] <- c("percent.mt", "percent.rb")
after_qc_metadata_df  <- do.call(bind_rows, lapply(names(ddqc_seurat_obj_list), get_metadata, obj = ddqc_seurat_obj_list))
after_qc_doubletf_metadata_df <- do.call(bind_rows, lapply(names(doubletfinder_ddqc_seurat_obj_list), get_metadata, obj = doubletfinder_ddqc_seurat_obj_list))

Visualizing Distribution Changes in “nCount_RNA”, “nFeature_RNA”, “percent.mt”, and “percent.rb” Using Violin Plots

Aggregate necessary information for ViolinPlot visualization.

# code:2-12
plot_vars <- c("nCount_RNA", "nFeature_RNA", "percent.mt", "percent.rb")
# Consolidate metadata information.
qc_result_metadata_df <- bind_rows(
  before_qc_metadata_df %>%
    dplyr::select(condition, {{ plot_vars }}) %>%
    mutate(qc="before_qc"),
  after_qc_metadata_df %>%
    dplyr::select(condition, {{ plot_vars }}) %>%
    mutate(qc="after_ddqc"),
  after_qc_doubletf_metadata_df %>%
    dplyr::select(condition, {{ plot_vars }}) %>%
    mutate(qc="after_ddqc_doubletfinder")
)
# Convert the "qc" column to a factor and set category order.
qc_result_metadata_df$qc <- factor(qc_result_metadata_df$qc, levels = c("before_qc", "after_ddqc", "after_ddqc_doubletfinder"))
qc_result_metadata_df$condition <- factor(qc_result_metadata_df$condition, levels = c("WT", "Fezf2KO"))

Plot Violin Plots for “nCount_RNA”, “nFeature_RNA”, “percent.mt”, and “percent.rb”.

# code:2-13
violinplot_list <- list()
for(var in plot_vars){
  print(var)
  target_var <- sym(var)  # Convert string to symbol
  violinplot_list[[var]] <-
    ggplot(data = qc_result_metadata_df, mapping = aes(x = condition, y = {{ target_var }}, fill = qc)) +
    geom_violin(position = position_dodge(width = 0.9), width = 1.0, draw_quantiles = c(0.25, 0.5, 0.75)) +
    ggtitle(paste("QC result: ", var)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

## [1] "nCount_RNA"
## [1] "nFeature_RNA"
## [1] "percent.mt"
## [1] "percent.rb"

Verify Violin Plots.

# code:2-14
violinplot_list

## $nCount_RNA

## 
## $nFeature_RNA

## 
## $percent.mt

## 
## $percent.rb

Visualizing Changes in Gene and Cell Counts Using Bar Plots

Collect gene and cell count information from Seurat objects.

# code:2-15
data_vec <- c("rawdata", "before_qc", "after_ddqc", "after_ddqc_doubletfinder")
# Collect gene and cell count information from Seurat objects.
gene_cell_pop_list <- list(
  as.data.frame(t(as.data.frame(lapply(orig_mtx_list, dim)))),
  as.data.frame(t(as.data.frame(lapply(created_seurat_obj_list, dim)))),
  as.data.frame(t(as.data.frame(lapply(ddqc_seurat_obj_list, dim)))),
  as.data.frame(t(as.data.frame(lapply(doubletfinder_ddqc_seurat_obj_list, dim))))
)

Prepare for Bar Plot visualization.

# code:2-16
names(gene_cell_pop_list) <- data_vec
for(dtn in data_vec) {
  colnames(gene_cell_pop_list[[dtn]]) <- c("gene_number", "cell_number")
  gene_cell_pop_list[[dtn]] <-
    gene_cell_pop_list[[dtn]] %>%
    mutate(data_name = dtn)
  gene_cell_pop_list[[dtn]]["sample"] <- rownames(gene_cell_pop_list[[dtn]])
  row.names(gene_cell_pop_list[[dtn]]) <- NULL
}
gene_cell_pop_df <- 
  do.call(bind_rows, gene_cell_pop_list) %>%
  mutate(data_name = factor(data_name, levels = c("rawdata", "before_qc", "after_ddqc", "after_ddqc_doubletfinder"))) %>%
  separate(col = sample, into = c("age", "area", "condition"), sep = "_") %>%
  mutate(condition = factor(condition, levels = c("WT", "Fezf2KO")))

Generate Bar Plots for gene and cell counts.

# code:2-17
plot_vars <- c("gene_number", "cell_number")
barplot_list <- list()
for(var in plot_vars){
  target_var <- sym(var)  # 文字列をシンボルに変換
  barplot_list[[var]] <-
    ggplot(data = gene_cell_pop_df, mapping = aes(x = condition, y = {{ target_var }}, fill = data_name)) +
    geom_col(position = position_dodge()) +
    ggtitle(paste("QC result: ", var)) + # タイトルを追加
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

Verify Bar Plots.

# code:2-18
barplot_list

## $gene_number

## 
## $cell_number

Merging Seurat Objects

Merge all scRNA-seq data into a single Seurat object, with each dataset stored in a layer.

# code:2-19
merged4norm_seurat_obj <- doubletfinder_ddqc_seurat_obj_list[[1]]
for (snm in names(doubletfinder_ddqc_seurat_obj_list)[2:length(doubletfinder_ddqc_seurat_obj_list)]) {
  merged4norm_seurat_obj <- merge(merged4norm_seurat_obj, doubletfinder_ddqc_seurat_obj_list[[snm]])
}

Save the merged Seurat object.

# code:2-20
saveRDS(merged4norm_seurat_obj, "./output/02_quality_control/obj/merged4norm_seurat_obj.RDS")

Quality Control

Yoshinori Hayakawa

Quality Control

Objective

Quality Control Workflow

Load Required Libraries

Load Data

Calculate the Proportion of Mitochondrial and Ribosomal Genes

Automated Quality Control using ddqc

Doublet Removal using DoubletFinder

Preprocessing for Doublet Detection

Detecting Doublets with DoubletFinder

Removing Cells Identified as Doublets by “DoubleFinder”

Saving Seurat Objects After Doublet Removal

Quality Control Evaluation

Visualizing Distribution Changes in “nCount_RNA”, “nFeature_RNA”, “percent.mt”, and “percent.rb” Using Violin Plots

Visualizing Changes in Gene and Cell Counts Using Bar Plots

Merging Seurat Objects