前言
Hello小伙伴們大家好,我是生信技能樹的小學(xué)徒”我才不吃蛋黃“。今天是胃癌單細(xì)胞數(shù)據(jù)集GSE163558復(fù)現(xiàn)系列第六期。第五期我們通過(guò)繪制餅圖、堆積柱狀圖、氣泡圖等,比較不同分組之間細(xì)胞比例差異。本期,我們將下載胃癌bulk數(shù)據(jù)(TCGA-STAD),篩選胃癌相關(guān)差異表達(dá)基因,根據(jù)差異基因list,使用AddModuleScore_UCell函數(shù)給上皮細(xì)胞亞群Seurat對(duì)象添加惡性評(píng)分,以此協(xié)助區(qū)分正常上皮和惡性上皮。
1.背景介紹
眾所周知,胃癌細(xì)胞是胃上皮細(xì)胞發(fā)生了惡性病變。在第三期對(duì)細(xì)胞分群注釋時(shí),我們并未對(duì)上皮細(xì)胞進(jìn)行細(xì)分,這里面就包括非惡性上皮和惡性上皮細(xì)胞。那么該如何識(shí)別惡性上皮細(xì)胞?本文作者根據(jù)TCGA數(shù)據(jù)庫(kù)中胃癌和正常胃組織之間的差異表達(dá)基因,定義了每個(gè)上皮細(xì)胞的惡性和非惡性評(píng)分,從而區(qū)分惡性上皮和非惡性上皮細(xì)胞。當(dāng)然,我們還可以利用copykat和infercnv從單細(xì)胞轉(zhuǎn)錄組數(shù)據(jù)推斷腫瘤拷貝數(shù)變異,從而區(qū)分上皮惡性與否(后期更新,敬請(qǐng)期待)。
2.數(shù)據(jù)下載
在正式分析之前,我們先從TCGA數(shù)據(jù)庫(kù)獲取胃癌患者的表達(dá)數(shù)據(jù)和臨床數(shù)據(jù)。
首先加載工作目錄,創(chuàng)建新的文件夾,加載R包:
rm(list=ls())
getwd()
setwd("/home/data/t020505/GSE163558-GC代碼公眾號(hào)復(fù)現(xiàn)版")
dir.create("6-TCGA_STAD")
setwd('6-TCGA_STAD/')
source('../scRNA_scripts/mycolors.R')
library(tidyverse)
library(data.table)
#如果需要下載TCGA其他癌種,請(qǐng)更換proj
proj = "TCGA-STAD"
dir.create("input")
在R中利用代碼一鍵下載TCGA數(shù)據(jù)庫(kù)胃癌患者的表達(dá)數(shù)據(jù)及臨床數(shù)據(jù),并將下載好了數(shù)據(jù)存放在input文件夾:
if(T){
download.file(url = paste0("https://gdc./download/",proj, ".htseq_counts.tsv.gz"),destfile = paste0("input/",proj,".htseq_counts.tsv.gz")) ##表達(dá)數(shù)據(jù)
download.file(url = paste0("https://gdc./download/",proj, ".GDC_phenotype.tsv.gz"),destfile = paste0("input/",proj,".GDC_phenotype.tsv.gz")) ##臨床數(shù)據(jù)
download.file(url = paste0("https://gdc./download/",proj, ".survival.tsv"),destfile = paste0("input/",proj,".survival.tsv")) ##生存數(shù)據(jù)
}
clinical = read.delim(paste0("input/",proj,".GDC_phenotype.tsv.gz"),fill = T,header = T,sep = "\t")
surv = read.delim(paste0("input/",proj,".survival.tsv"),header = T)
head(surv) #生存數(shù)據(jù)os和os.time
生存數(shù)據(jù)包括os和os.time:
3.數(shù)據(jù)整理
在下載完數(shù)據(jù)之后,我們需要對(duì)數(shù)據(jù)進(jìn)行初步的整理。
3.1 處理表達(dá)矩陣
首先處理表達(dá)矩陣:
dat = read.table(paste0("input/",proj,".htseq_counts.tsv.gz"),check.names = F,row.names = 1,header = T)
dat <- data.table::fread(paste0("input/",proj,".htseq_counts.tsv.gz"),
data.table = F) #全部是symbol
a = dat[60483:60488,]
dat = dat[1:60483,]
rownames(dat) = dat$Ensembl_ID
a = dat
a = a[,-1]
exp = a
去除在所有樣本里表達(dá)量都為零的基因:
exp = exp[rowSums(exp)>0,]
nrow(exp)
僅保留在一半以上樣本里表達(dá)的基因:
exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]
nrow(exp)
表達(dá)矩陣行名ID轉(zhuǎn)換:
library(stringr)
head(rownames(exp))
library(AnnoProbe)
rownames(exp) = substr(rownames(exp), 1, 15)
##rownames(exp) = str_split(rownames(exp),"\\.",simplify = T)[,1];head(rownames(exp))
#annoGene(rownames(exp),ID_type = "ENSEMBL")
re = annoGene(rownames(exp),ID_type = "ENSEMBL");head(re)
#if(!require(tinyarray))devtools::install_github("xjsun1221/tinyarray",upgrade = FALSE,dependencies = TRUE)
library(tinyarray)
exp = trans_array(exp,ids = re,from = "ENSEMBL",to = "SYMBOL")
exp[1:4,1:4]
3.2 整理分組信息
然后整理分組信息:
根據(jù)樣本ID的第14-15位,給樣本分組(tumor和normal):
table(str_sub(colnames(exp),14,15))
table(str_sub(colnames(exp),14,16))
Group = ifelse(as.numeric(str_sub(colnames(exp),14,15)) < 10,'tumor','normal')
Group = factor(Group,levels = c("normal","tumor"))
table(Group)
group = make_tcga_group(exp)
table(group)
保存數(shù)據(jù):
save(exp,Group,proj,clinical,surv,file = paste0(proj,".Rdata"))
4.數(shù)據(jù)分析
4.1 TCGA差異分析
首先清除系統(tǒng)環(huán)境變量,加載TCGA數(shù)據(jù),加載差異分析R包“edgeR”:
rm(list = ls())
load("TCGA-STAD.Rdata")
table(Group)
library(edgeR)
開始進(jìn)行胃癌和癌旁組織差異基因分析:
dge <- DGEList(counts=exp,group=Group)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~Group)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
fit <- glmLRT(fit)
DEG2=topTags(fit, n=Inf)
class(DEG2)
DEG2=as.data.frame(DEG2)
head(DEG2)
添加change列標(biāo)記基因上調(diào)下調(diào),在這里,我們對(duì)差異分析篩選的閾值為:“| log2 fold change |>1, pvalue <0.05”,最終在胃癌中篩選到了2213個(gè)上調(diào)基因,142個(gè)下調(diào)基因。
logFC_t = 1
pvalue_t = 0.05
k1 = (DEG2$PValue < pvalue_t)&(DEG2$logFC < -logFC_t)
k2 = (DEG2$PValue < pvalue_t)&(DEG2$logFC > logFC_t)
DEG2$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
head(DEG2)
table(DEG2$change)
保存差異分析結(jié)果:
save(DEG2,Group,file = paste0(proj,"_DEG.Rdata"))
4.2.差異基因可視化
差異基因可視化選擇了熱圖和火山圖:
library(ggplot2)
library(tinyarray)
dat = log2(cpm(exp)+1)
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(proj,"_pcaplot.Rdata"))
cg2 = rownames(DEG2)[DEG2$change !="NOT"]
h2 = draw_heatmap(dat[cg2,],Group)
v2 = draw_volcano(DEG2,pkg = 2,logFC_cutoff = logFC_t)
library(patchwork)
h2 + v2 +plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(proj,"_heat_vo.png"),width = 16,height = 9)
4.3 提取上皮細(xì)胞亞群
設(shè)置隨機(jī)種子,讀取第三期細(xì)胞分群注釋后的單細(xì)胞文件:
set.seed(12345)
sce.all=readRDS( "../3-Celltype/sce_celltype.rds")
table(sce.all$celltype)
提取出其中的上皮細(xì)胞:
sce1 = sce.all[, sce.all$celltype %in% c( 'Epithelial' )]
然后對(duì)上皮細(xì)胞亞群走Seurat V5標(biāo)準(zhǔn)流程(同第二期):
names(sce1@assays$RNA@layers)
sce1[["RNA"]]$counts
LayerData(sce1, assay = "RNA", layer = "counts")
dim(sce1[["RNA"]]$counts )
as.data.frame(sce1@assays$RNA$counts[1:10, 1:2])
head(sce1@meta.data, 10)
table(sce1$orig.ident)
sce = sce1
sce <- NormalizeData(sce, normalization.method = "LogNormalize", scale.factor = 1e4)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce, selection.method = "vst",
nfeatures = 2000)
sce <- ScaleData(sce)
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce))
DimHeatmap(sce, dims = 1:12, cells = 100, balanced = TRUE)
ElbowPlot(sce)
分辨率設(shè)置為0.1:
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.1)
table(sce@meta.data$RNA_snn_res.0.1)
數(shù)據(jù)降維:
set.seed(321)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
sce <- RunUMAP(object = sce, dims = 1:5, do.fast = TRUE)
分群可視化:
p = DimPlot(sce,reduction = "tsne",label=T,cols = mycolors)
然后保存上皮細(xì)胞rds文件:
saveRDS(sce,file = "Epithelial.rds")
4.4 AddModuleScore_UCell函數(shù)惡性評(píng)分
AddModuleScore_UCell函數(shù)可以實(shí)現(xiàn)基因集合打分,通過(guò)評(píng)估單細(xì)胞數(shù)據(jù)集中特定基因特征的表達(dá)模式,我們可以量化生物學(xué)信號(hào)的強(qiáng)度,并鑒定細(xì)胞的亞型和狀態(tài)。在胃癌組織中取上調(diào)排名前50的差異基因用作惡性評(píng)分,下調(diào)排名前50的差異基因用作正常評(píng)分。
#sce = readRDS('Epithelial.rds')
head(DEG2)
table(DEG2$change)
UP = DEG2[DEG2$change == 'UP' & DEG2$FDR <0.01,]
DOWN = DEG2[DEG2$change == 'DOWN' & DEG2$FDR <0.01,]
##用作惡性評(píng)分
sorted_up = UP[order(UP$FDR),]
top_50_upgene = rownames(sorted_up[1:50, ])
##用作正常評(píng)分
sorted_down = DOWN[order(DOWN$FDR),]
top_50_downgene = rownames(sorted_down[1:50, ])
library(UCell)
marker <- list(top_50_upgene,top_50_downgene)#將基因整成list
names(marker)[1] <- 'tumor'
names(marker)[2] <- 'normal'
score <- AddModuleScore_UCell(sce,features=marker)
計(jì)算每個(gè)上皮細(xì)胞的惡性評(píng)分減去非惡性評(píng)分,并將評(píng)分從小到大排序以擬合生長(zhǎng)曲線。然后,根據(jù)生長(zhǎng)曲線拐點(diǎn)附近的最大間隙,將細(xì)胞分為得分較高的惡性細(xì)胞(≥0.02)或得分較低的非惡性細(xì)胞(≤0.02)。
sce2 = score
df = sce2@meta.data
df$score = df$tumor_UCell - df$normal_UCell
table(df$score)
dim(sce)
df$score1 = 'unknown'
df[df$score > -0.02,]$score1 = 'malignant'
table(df$score1)
df[df$score < -0.02,]$score1 = 'nonmalignant'
table(df$score1)
sce2@meta.data = df
DimPlot(sce2, reduction = "tsne",cols = mycolors,pt.size = 0.8,
group.by = "score1",label = T,label.box = T)
然后繪制氣泡圖和堆積性柱狀圖可視化分組細(xì)胞比例(同第五期 胃癌單細(xì)胞數(shù)據(jù)集GSE163558復(fù)現(xiàn)(五):細(xì)胞比例):
tb=table(sce2$tissue, sce2$score1)
head(tb)
library (gplots)
balloonplot(tb)
bar_data <- as.data.frame(tb)
bar_per <- bar_data %>%
group_by(Var1) %>%
mutate(sum(Freq)) %>%
mutate(percent = Freq / `sum(Freq)`)
head(bar_per)
#write.csv(bar_per,file = "celltype_by_group_percent.csv")
col =c("#3176B7","#F78000","#3FA116","#CE2820","#9265C1",
"#885649","#DD76C5","#BBBE00","#41BED1")
colnames(bar_per)
library(ggthemes)
p1 = ggplot(bar_per, aes(x = percent, y = Var1)) +
geom_bar(aes(fill = Var2) , stat = "identity") + coord_flip() +
theme(axis.ticks = element_line(linetype = "blank"),
legend.position = "top",
panel.grid.minor = element_line(colour = NA,linetype = "blank"),
panel.background = element_rect(fill = NA),
plot.background = element_rect(colour = NA)) +
labs(y = " ", fill = NULL)+labs(x = 'Relative proportion(%)')+
scale_fill_manual(values=col)+
theme_few()+
theme(plot.title = element_text(size=12,hjust=0.5))
p1
tb=table(sce2$sample, sce2$score1)
head(tb)
malignant nonmalignant
adjacent_nontumor 84 92
GC 2805 4
GC_liver_metastasis 39 0
GC_lymph_metastasis 82 0
GC_ovary_metastasis 578 0
GC_peritoneum_metastasis 258 0
library (gplots)
balloonplot(tb)
bar_data <- as.data.frame(tb)
bar_per <- bar_data %>%
group_by(Var1) %>%
mutate(sum(Freq)) %>%
mutate(percent = Freq / `sum(Freq)`)
head(bar_per)
colnames(bar_per)
library(ggthemes)
p2 = ggplot(bar_per, aes(x = percent, y = Var1)) +
geom_bar(aes(fill = Var2) , stat = "identity") + coord_flip() +
theme(axis.ticks = element_line(linetype = "blank"),
legend.position = "top",
panel.grid.minor = element_line(colour = NA,linetype = "blank"),
panel.background = element_rect(fill = NA),
plot.background = element_rect(colour = NA)) +
labs(y = " ", fill = NULL)+labs(x = 'Relative proportion(%)')+
scale_fill_manual(values=col)+
theme_few()+
theme(plot.title = element_text(size=12,hjust=0.5)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2
p1+p2
ggsave(filename="prop.pdf",width = 12,height = 8)
saveRDS(sce2,file = "tumorEpithelial.rds")
4.5 提取惡性上皮亞群
在添加惡性評(píng)分之后,我們可以在從上皮細(xì)胞中提取惡性上皮亞群,繼續(xù)走Seurat V5標(biāo)準(zhǔn)流程:
table(sce2$score1)
tumor = sce2[, sce2$score1 %in% c( 'malignant' )]
set.seed(1234567)
tumor[["RNA"]]$counts
LayerData(tumor, assay = "RNA", layer = "counts")
tumor <- JoinLayers(tumor)
dim(tumor[["RNA"]]$counts )
as.data.frame(tumor@assays$RNA$counts[1:10, 1:2])
head(tumor@meta.data, 10)
table(tumor$orig.ident)
tumor <- NormalizeData(tumor, normalization.method = "LogNormalize",
scale.factor = 1e4)
GetAssay(tumor,assay = "RNA")
tumor <- FindVariableFeatures(tumor,
selection.method = "vst", nfeatures = 2000)
tumor <- ScaleData(tumor)
tumor <- RunPCA(object = tumor, pc.genes = VariableFeatures(tumor))
DimHeatmap(tumor, dims = 1:12, cells = 100, balanced = TRUE)
ElbowPlot(tumor)
tumor <- FindNeighbors(tumor, dims = 1:15)
tumor <- FindClusters(tumor, resolution = 0.1)
table(tumor@meta.data$RNA_snn_res.0.1)
set.seed(321)
tumor <- RunTSNE(object = tumor, dims = 1:15, do.fast = TRUE)
tumor <- RunUMAP(object = tumor, dims = 1:5, do.fast = TRUE)
mycolors <-c('#E64A35','#4DBBD4' ,'#01A187' ,'#3C5588' ,'#F29F80' ,
'#8491B6','#91D0C1','#7F5F48','#AF9E85','#4F4FFF','#CE3D33',
'#739B57','#EFE685','#446983','#BB6239','#5DB1DC','#7F2268','#6BD66B','#800202','#D8D8CD','pink'
)
p2 = DimPlot(tumor,reduction = "tsne",label=T,cols = mycolors)+
theme_few()
p2
保存惡性上皮細(xì)胞rds文件
table(tumor$seurat_clusters)
0 1 2 3 4
2275 579 482 314 196
tumor$celltype = paste0('G',tumor$seurat_clusters)
table(tumor$celltype)
G0 G1 G2 G3 G4
2275 579 482 314 196
saveRDS(tumor,file = "malignant.rds")
4.6 富集分析
提取惡性上皮細(xì)胞之后,我們接著從單細(xì)胞層面篩選出了特征基因,并在此基礎(chǔ)上進(jìn)行了富集分析。
首先讀取惡性上皮細(xì)胞rds文件,加載R包:
#sce =readRDS('malignant.rds')
sce = tumor
###熱圖表示GO富集分析
library(grid)
library(ggplot2)
library(gridExtra)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ggplot2)
library(org.Hs.eg.db)
library(GOplot)
然后使用Seurat內(nèi)置函數(shù)FindAllMarkers篩選特征基因:
markers <- FindAllMarkers(object = sce, only.pos = TRUE,
min.pct = 0.25,
thresh.use = 0.25)
write.csv(markers,file='markers.csv')
#markers = read.csv('markers.csv',row.names = 1)
table(markers$cluster)
library(dplyr)
gene = markers[markers$p_val <0.01 & markers$avg_log2FC >2,]$gene
entrezIDs = bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb= "org.Hs.eg.db", drop = TRUE)
gene<- entrezIDs$ENTREZID
marker_new = markers[markers$gene %in% entrezIDs$SYMBOL,]
marker_new = marker_new[!duplicated(marker_new$gene),]
identical(marker_new$gene, entrezIDs$SYMBOL)
p = identical(marker_new$gene, entrezIDs$SYMBOL);p
if(!p) entrezIDs = entrezIDs[match(marker_new$gene,entrezIDs$SYMBOL),]
marker_new$ID = entrezIDs$ENTREZID
GO富集分析:
gcSample=split(marker_new$ID, marker_new$cluster)
###參數(shù)可以更改,看看?compareCluster
xx <- compareCluster(gcSample,
fun = "enrichGO",
OrgDb = "org.Hs.eg.db",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05
)
p <- dotplot(xx)
library(ggthemes)
p + theme(axis.text.x = element_text(
angle = 45,
vjust = 0.5, hjust = 0.5
))+theme_few()
還可以使用熱圖展示GO富集:
a = xx@compareClusterResult
colnames(a)
a$p = -log10(a$pvalue)
b = a[,c(1,3,11)]
library(reshape2)
colnames(b)
library(tidyr)
wide_data <- pivot_wider(b, names_from = Cluster, values_from = p)
wide_data = as.data.frame(wide_data)
rownames(wide_data) = wide_data$Description
wide = wide_data[,-1]
wide = wide[c(1:13),]
rownames(wide) <- sapply(rownames(wide), function(x) paste0(substr(x, 1, 40), "..."))
wide = as.matrix(wide)
library(pheatmap)
B <- pheatmap(wide, show_rownames = T,
show_colnames = T,
col = colorRampPalette(c("white","firebrick3"))(255),
annotation_names_row = T,#不顯示行注釋信息
annotation_names_col = T ,#不顯示列注釋信息
column_title = NULL,#不顯示列標(biāo)題
cluster_rows = F,
cluster_cols = F,
scale = 'column',
row_title = NULL)#不顯示行標(biāo)題scale = "row"
B
結(jié)語(yǔ)
本期,我們根據(jù)TCGA數(shù)據(jù)庫(kù)中胃癌和正常胃組織之間的差異表達(dá)基因,定義了每個(gè)上皮細(xì)胞的惡性和非惡性評(píng)分。下一期,我們將在本期基礎(chǔ)上,分析惡性上皮細(xì)胞G0-G4的Marker基因并繪制熱圖和小提琴圖,此外,我們還將使用AddModuleScore_UCell函數(shù)計(jì)算細(xì)胞的增殖和遷移評(píng)分。干貨滿滿,歡迎大家持續(xù)追更,謝謝!