R-based single cell analysis using Seurat v5 for .rds/.rdata files
This skill provides specialized capabilities for single cell RNA-seq analysis using Seurat v5 in the system R environment. It is specifically designed to work with .rds and .rdata files.
执行方式:必须使用 Here Document(<< 'EOF')方式执行R代码;禁止使用 R -e "..." 或 R --slave -e "..." 方式。
/Library/Frameworks/R.framework/Resources/bin/R --slave --no-save --no-restore  # ✅ 唯一允许的执行方式
/Library/Frameworks/R.framework/Resources/bin/R --slave --no-save --no-restore << 'EOF'
# === R代码必须写在EOF之间 ===
library(Seurat)
library(dplyr)
library(clusterProfiler)
# 您的分析代码
print("Hello from Seurat!")
EOF
# 🚫 严格禁止:这些方式都不允许使用
/Library/Frameworks/R.framework/Resources/bin/R --slave -e "library(Seurat); print('error')"
R -e "library(Seurat); print('error')"
R --slave -e 'library(Seurat); print("error")'
/Library/Frameworks/R.framework/Resources/bin/R << EOF # 🚫 必须使用单引号'EOF'
# code
EOF
--no-save --no-restore 确保干净的R会话;使用 layer="counts" 读取原始counts,不转换标准化数据。
This skill should be used when:
This skill requires:
- /Library/Frameworks/R.framework/Resources/bin/R with Seurat v5, clusterProfiler, and ggplot2 packages installed
- scanpy for H5AD conversion support (used by the reticulate method)
Load single cell data from .rds or .rdata files using Seurat v5.
Perform quality control analysis including:
Before using this skill, ensure the system R environment is available with all required packages:
R Environment Configuration:
# Use the standard R path for this skill
R_EXECUTABLE <- "/Library/Frameworks/R.framework/Resources/bin/R"
# Load required libraries
library(Seurat)
library(clusterProfiler)
library(ggplot2)
# Check that packages are loaded
packageVersion("Seurat")
packageVersion("clusterProfiler")
packageVersion("ggplot2")
Conda Environment for Python Integration:
# Required: Source conda first, then activate scanpy environment
source ~/miniconda3/etc/profile.d/conda.sh
conda activate scanpy
⚠️ 注意:以下模板是唯一允许的执行方式
# 所有Seurat分析的标准模板(强制使用)
/Library/Frameworks/R.framework/Resources/bin/R --slave --no-save --no-restore << 'EOF'
# === 加载库 ===
library(Seurat)
library(dplyr)
library(clusterProfiler)
# === 数据加载 ===
seurat_obj <- readRDS('input_file.rds')
# === 数据检查 ===
cat('数据维度:', nrow(seurat_obj), '×', ncol(seurat_obj), '\n')
print(table(seurat_obj$celltype))
# === 分析流程 ===
# ... 您的分析代码
# === 结果导出 ===
write.table(results, 'output.tsv', sep = '\t', quote = FALSE, row.names = FALSE)
EOF
🚨 重要提醒:技能中的所有代码示例都必须遵循Here Document执行方式!
任何Seurat数据读取后,必须首先检查数据状态,确保使用正确的数据类型:
# Data-state check function.
#
# Checks whether a Seurat object's 'counts' layer holds raw (integer) counts.
#
# @param seurat_obj A Seurat (v5) object.
# @param verbose    Print diagnostic messages? Defaults to TRUE.
# @return One of:
#   "raw_counts" - counts layer contains integers with a large max; raw counts
#   "normalized" - counts layer appears already normalized; handle with care
#   "unknown"    - data state cannot be determined from the heuristics
#   "no_counts"  - no 'counts' layer is present in the object
#
# NOTE: the original body opened with a Python-style """docstring""", which is
# a syntax error in R (adjacent string constants); documentation lives in
# comments instead.
check_seurat_data_state <- function(seurat_obj, verbose = TRUE) {
  if (verbose) {
    cat('🔍 检查Seurat数据状态...\n')
    cat('数据维度:', nrow(seurat_obj), '基因 ×', ncol(seurat_obj), '细胞\n')
  }
  # Inspect the counts layer via the Seurat v5 layer API.
  if ('counts' %in% Layers(seurat_obj)) {
    counts_matrix <- GetAssayData(seurat_obj, layer = 'counts')
    max_val <- max(counts_matrix)
    # Raw counts are whole numbers; normalized values generally are not.
    is_integer <- all(counts_matrix == round(counts_matrix))
    if (verbose) {
      cat('Counts layer信息:\n')
      cat(' 最大值:', max_val, '\n')
      cat(' 是否为整数:', is_integer, '\n')
      cat(' 数据类型:', class(counts_matrix), '\n')
    }
    # Heuristic: raw counts are integers with a large maximum, while
    # log-normalized data typically stays at or below ~10.
    if (max_val > 20 && is_integer) {
      if (verbose) cat('✅ 检测到原始counts数据\n')
      return('raw_counts')
    } else if (max_val <= 10) {
      if (verbose) cat('⚠️ 检测到已标准化数据\n')
      return('normalized')
    } else {
      if (verbose) cat('❌ 数据状态未知\n')
      return('unknown')
    }
  } else {
    if (verbose) cat('❌ 未找到counts layer\n')
    return('no_counts')
  }
}
# 使用示例
data_state <- check_seurat_data_state(seurat_obj)
if (data_state != 'raw_counts') {
stop('❌ 数据不是原始counts,请检查数据来源')
}
# For .rds files
data <- readRDS("path/to/your/file.rds")
# For .rdata files
load("path/to/your/file.rdata")
# The loaded object will be available in the environment
# ⚠️ 重要:加载后立即检查数据状态
data_state <- check_seurat_data_state(data)
if (data_state == 'raw_counts') {
cat('✅ 数据验证通过,进行默认标准化...\n')
# 🔄 **默认标准化流程**: 检查后自动进行标准化
data <- NormalizeData(data, normalization.method = "LogNormalize", scale.factor = 10000)
data <- FindVariableFeatures(data, selection.method = "vst", nfeatures = 2000)
cat('✅ 标准化完成,data layer已创建\n')
} else if (data_state == 'normalized') {
cat('✅ 数据已标准化,跳过标准化步骤\n')
} else {
stop('❌ 数据验证失败,终止分析')
}
Perform quality control with the qc_analysis function:
# Calculate mitochondrial percentage
pbmc[['percent.mt']] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# Filter cells based on QC metrics
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
Normalize data and identify highly variable genes:
# Normalize data
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
# Identify highly variable features
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
Scale data and perform PCA:
# Scale the data
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
# Run PCA
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
Find neighbors, clusters, and create UMAP visualization:
# Find neighbors
pbmc <- FindNeighbors(pbmc, dims = 1:10)
# Find clusters
pbmc <- FindClusters(pbmc, resolution = 0.5)
# Run UMAP
pbmc <- RunUMAP(pbmc, dims = 1:10)
# Plot clusters
DimPlot(pbmc, reduction = "umap")
⚠️ 前置条件: FindAllMarkers需要data layer存在,必须先进行标准化!
经过实践验证,推荐使用以下标准流程来正确提取特定cluster的标记基因:
# 标准FindAllMarkers分析流程
library(Seurat)
# 1. 设置ident(如果还没有设置)
Idents(seurat) <- 'celltype.clean' # 或您的cluster列名
# 2. 确保数据已标准化(检查data layer是否存在)
if (!'data' %in% Layers(seurat)) {
cat('⚠️ 缺少data layer,进行标准化...\n')
seurat <- NormalizeData(seurat, normalization.method = "LogNormalize", scale.factor = 10000)
seurat <- FindVariableFeatures(seurat, selection.method = "vst", nfeatures = 2000)
}
# 3. 运行FindAllMarkers(推荐使用严格参数)
all.markers <- FindAllMarkers(seurat,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.25)
# 4. 正确提取特定cluster的标记基因
# 方法:直接使用subset函数(推荐)
cluster.markers <- subset(all.markers, cluster == "Fibro3") # 替换为您的cluster名
# 5. 筛选显著性基因(可选但推荐)
significant.markers <- subset(cluster.markers, p_val_adj < 0.05)
# 6. 按p值排序
cluster.markers.sorted <- significant.markers[order(significant.markers$p_val_adj), ]
# 7. 保存结果(默认TSV格式)
write.table(cluster.markers.sorted, 'cluster_markers.tsv', sep = '\t', quote = FALSE, row.names = FALSE)
write.table(cluster.markers.sorted$gene, 'gene_list.txt', quote = FALSE, row.names = FALSE, col.names = FALSE)
# 宽松条件(更多基因)
all.markers <- FindAllMarkers(seurat, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.1)
# 严格条件(更少但更可靠基因)
all.markers <- FindAllMarkers(seurat, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
# 极严格条件(核心标记基因)
all.markers <- FindAllMarkers(seurat, only.pos = TRUE, min.pct = 0.5, logfc.threshold = 1.0)
Find marker genes for clusters:
# Find all markers (using default parameters)
markers <- FindAllMarkers(pbmc)
# Find markers for specific cluster (using default parameters)
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1)
Perform Gene Ontology enrichment analysis on marker genes from a specific cluster:
# Find all markers across all clusters with stringent thresholds
all.markers <- FindAllMarkers(pbmc,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.25)
# Extract marker genes for a specific cluster (e.g., cluster 1)
cluster1.markers <- subset(all.markers, cluster == 1)
# Filter for significant genes (adjusted p-value < 0.05)
sig_genes <- subset(cluster1.markers, p_val_adj < 0.05)
# Create gene list for enrichment analysis (using avg_log2FC as weights)
gene_list <- sig_genes$avg_log2FC
names(gene_list) <- sig_genes$gene
# Convert gene symbols to ENTREZ IDs (required for clusterProfiler)
library(org.Hs.eg.db)
entrez_ids <- bitr(names(gene_list), fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
# Perform GO enrichment analysis for all three ontologies
ego_bp <- enrichGO(gene = entrez_ids$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
ego_cc <- enrichGO(gene = entrez_ids$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
ego_mf <- enrichGO(gene = entrez_ids$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
# Convert ENTREZ IDs back to gene symbols for readable results
ego_bp_readable <- setReadable(ego_bp, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
ego_cc_readable <- setReadable(ego_cc, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
ego_mf_readable <- setReadable(ego_mf, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
# View results
head(ego_bp_readable)
head(ego_cc_readable)
head(ego_mf_readable)
# Visualize results for each ontology
barplot(ego_bp_readable, showCategory = 10) + ggtitle("GO Biological Process")
dotplot(ego_bp_readable, showCategory = 10) + ggtitle("GO Biological Process")
barplot(ego_cc_readable, showCategory = 10) + ggtitle("GO Cellular Component")
dotplot(ego_cc_readable, showCategory = 10) + ggtitle("GO Cellular Component")
barplot(ego_mf_readable, showCategory = 10) + ggtitle("GO Molecular Function")
dotplot(ego_mf_readable, showCategory = 10) + ggtitle("GO Molecular Function")
# Save results as TSV files with readable gene names
write.table(as.data.frame(ego_bp_readable),
file = "cluster1_GO_BP_enrichment_results.tsv",
sep = "\t", quote = FALSE, row.names = FALSE)
write.table(as.data.frame(ego_cc_readable),
file = "cluster1_GO_CC_enrichment_results.tsv",
sep = "\t", quote = FALSE, row.names = FALSE)
write.table(as.data.frame(ego_mf_readable),
file = "cluster1_GO_MF_enrichment_results.tsv",
sep = "\t", quote = FALSE, row.names = FALSE)
The skill now includes two powerful GO visualization functions that create publication-quality plots with fixed formatting:
问题: 如果发现图中不同Count值的点大小区分度不够(如24个基因和62个基因的点看起来差不多大)
解决方案: 修改 go_visualization_separate.R 文件中的第53行:
# 原始设置(区分度较小)
scale_size(range = c(5, 8.5), name = "Count",
# 改进设置(区分度更大)
scale_size(range = c(3, 12), name = "Count",
📋 参数说明:
- c(5, 8.5): 点大小范围5-8.5,适合Count值差异较小的情况
- c(3, 12): 点大小范围3-12,适合Count值差异较大的情况
- c(2, 15): 点大小范围2-15,极大区分度,适合Count值范围很宽的情况
🎯 推荐设置:
- Count值差异较小: c(5, 8.5) 或 c(4, 10)
- Count值差异较大: c(3, 12)
- Count值范围很宽: c(2, 15)
# Load the separate visualization functions
source("/Users/zhy/.claude/skills/seurat-single-cell/go_visualization_separate.R")
# Prepare data in the required format (columns: Description, p.adjust, Count, ONTOLOGY)
# Example: Convert clusterProfiler results to the required format
bp_data <- data.frame(
Description = ego_bp_readable$Description,
p.adjust = ego_bp_readable$p.adjust,
Count = ego_bp_readable$Count,
ONTOLOGY = "BP"
)
cc_data <- data.frame(
Description = ego_cc_readable$Description,
p.adjust = ego_cc_readable$p.adjust,
Count = ego_cc_readable$Count,
ONTOLOGY = "CC"
)
mf_data <- data.frame(
Description = ego_mf_readable$Description,
p.adjust = ego_mf_readable$p.adjust,
Count = ego_mf_readable$Count,
ONTOLOGY = "MF"
)
# Create separate plots for each GO category
separate_plots <- create_all_separate_go_plots(
bp_data = bp_data,
cc_data = cc_data,
mf_data = mf_data,
output_dir = "separate_go_plots",
filename_prefix = "Cluster1_GO",
width = 11,
height = 8
)
# Results:
# - Cluster1_GO_BP_enrichment.svg/png/pdf
# - Cluster1_GO_CC_enrichment.svg/png/pdf
# - Cluster1_GO_MF_enrichment.svg/png/pdf
# For best results, use the r_gground conda environment:
# conda activate r_gground
# Load the combined visualization functions
source("/Users/zhy/.claude/skills/seurat-single-cell/go_visualization_combined.R")
# Method 1: Read from CSV files (recommended for consistency)
# First save your data as CSV files
write.csv(bp_data, "Proteome_GO_BP.csv", row.names = FALSE)
write.csv(cc_data, "Proteome_GO_CC.csv", row.names = FALSE)
write.csv(mf_data, "Proteome_GO_MF.csv", row.names = FALSE)
# Create combined plot with oxidative stress term prioritization
combined_plot <- create_combined_go_plot(
go_data = rbind(bp_data, cc_data, mf_data),
output_dir = "combined_go_plots",
filename_prefix = "Cluster1_GO"
)
# Results:
# - Cluster1_GO_Visualization.svg/png/pdf (all categories in one figure)
# Method 2: Direct data input
combined_plot <- create_combined_go_plot(
go_data = rbind(bp_data, cc_data, mf_data),
output_dir = "combined_go_plots",
filename_prefix = "Cluster1_GO"
)
Both visualization functions require data frames with these exact column names:
- Description: GO term description
- p.adjust: Adjusted p-value (numeric)
- Count: Number of genes in the term (numeric)
- ONTOLOGY: Category ("BP", "CC", or "MF")
Use the r_gground conda environment for full features: conda activate r_gground
- geom_round_col() for rounded bars
- geom_round_rect() for rounded category labels
- theme_prism() for professional styling
# Complete workflow from Seurat to publication-ready GO plots
library(Seurat)
library(clusterProfiler)
# 1. Get marker genes
markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
# 2. Perform GO enrichment for cluster 1
cluster1_markers <- subset(markers, cluster == 1 & p_val_adj < 0.05)
gene_list <- cluster1_markers$avg_log2FC
names(gene_list) <- cluster1_markers$gene
# 3. Convert to ENTREZ IDs and run enrichment
library(org.Hs.eg.db)
entrez_ids <- bitr(names(gene_list), fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
ego_bp <- enrichGO(gene = entrez_ids$ENTREZID, OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH")
ego_cc <- enrichGO(gene = entrez_ids$ENTREZID, OrgDb = org.Hs.eg.db, ont = "CC", pAdjustMethod = "BH")
ego_mf <- enrichGO(gene = entrez_ids$ENTREZID, OrgDb = org.Hs.eg.db, ont = "MF", pAdjustMethod = "BH")
# 4. Convert to readable format
ego_bp_readable <- setReadable(ego_bp, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
ego_cc_readable <- setReadable(ego_cc, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
ego_mf_readable <- setReadable(ego_mf, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
# 5. Load visualization functions
source("/Users/zhy/.claude/skills/seurat-single-cell/go_visualization_separate.R")
source("/Users/zhy/.claude/skills/seurat-single-cell/go_visualization_combined.R")
# 6. Create publication-ready visualizations
# Separate plots
bp_data <- data.frame(Description = ego_bp_readable$Description, p.adjust = ego_bp_readable$p.adjust, Count = ego_bp_readable$Count, ONTOLOGY = "BP")
cc_data <- data.frame(Description = ego_cc_readable$Description, p.adjust = ego_cc_readable$p.adjust, Count = ego_cc_readable$Count, ONTOLOGY = "CC")
mf_data <- data.frame(Description = ego_mf_readable$Description, p.adjust = ego_mf_readable$p.adjust, Count = ego_mf_readable$Count, ONTOLOGY = "MF")
separate_plots <- create_all_separate_go_plots(bp_data, cc_data, mf_data, output_dir = "cluster1_go_separate")
# Combined plot
combined_plot <- create_combined_go_plot(rbind(bp_data, cc_data, mf_data), output_dir = "cluster1_go_combined")
# 7. Results ready for publication!
This section provides the custom conversion functions for converting between Seurat objects (.rds/.rdata) and H5AD format. These functions are fully tested and provide 100% compatibility with Seurat v5.
✅ Advantages of Custom Functions:
The following custom functions provide reliable Seurat ↔ H5AD conversion for Seurat v5. All issues have been resolved through extensive testing.
# Three main functions (fully tested and working)
- seurat_to_h5ad(seurat_obj, output_file, verbose = TRUE)
- h5ad_to_seurat(h5ad_file, verbose = TRUE)
- validate_conversion(seurat_obj, h5ad_file, verbose = TRUE)
1. Python KeysView Object Handling
Problem: adata$obsm$keys() returns a Python KeysView object that cannot be used directly in R.
Solution: use reticulate::import_builtins()$list() to convert the KeysView to a Python list, then to an R vector.
builtins <- reticulate::import_builtins()
key_list <- builtins$list(adata$obsm$keys())
obsm_keys <- unlist(reticulate::py_to_r(key_list), use.names = FALSE)
2. AnnData obsm Dictionary Assignment
Problem: adata$obsm[["X_umap"]] <- data fails in reticulate.
Solution: use reticulate::py_set_item() for proper dictionary assignment.
umap_py <- r_to_py(umap_coords) # Convert to Python object
reticulate::py_set_item(adata$obsm, "X_umap", umap_py)
R packages required:
library(Seurat)
library(reticulate)
# Python environment
use_condaenv("scanpy", required = TRUE)
sc <- import("scanpy")
ad <- import("anndata")
⚠️ 重要原则: 只转换原始counts数据
# Load custom functions from skill directory
source("/Users/zhy/.claude/skills/seurat-single-cell/custom_seurat_h5ad_conversion.R")
# Convert Seurat to H5AD (只使用原始counts)
seurat_obj <- readRDS("your_seurat_object.rds") # or load from .Rdata
output_file <- "your_output.h5ad"
success <- seurat_to_h5ad(seurat_obj, output_file, verbose = TRUE)
if (success) {
cat("✅ Conversion completed successfully!\n")
cat("📁 Output file:", output_file, "\n")
cat("📋 转换说明: 只保存原始counts,标准化由Scanpy处理\n")
} else {
cat("❌ Conversion failed\n")
}
为什么只转换原始counts?
# Convert H5AD back to Seurat
h5ad_file <- "your_data.h5ad"
seurat_obj <- h5ad_to_seurat(h5ad_file, verbose = TRUE)
if (!is.null(seurat_obj)) {
cat("✅ Conversion completed successfully!\n")
cat("📊 Result: ", ncol(seurat_obj), "cells × ", nrow(seurat_obj), "genes\n")
# Save as RDS for future use
saveRDS(seurat_obj, "converted_seurat.rds")
} else {
cat("❌ Conversion failed\n")
}
# Validate bidirectional conversion
validation <- validate_conversion(original_seurat, h5ad_file)
if (validation$success) {
cat("🎉 Perfect conversion achieved!\n")
cat("✅ All data preserved:\n")
cat(" - Cells:", validation$original_cells, "→", validation$back_cells, "\n")
cat(" - Genes:", validation$original_genes, "→", validation$back_genes, "\n")
} else {
cat("⚠️ Conversion issues detected\n")
}