【原】胃癌單細(xì)胞數(shù)據(jù)集GSE163558復(fù)現(xiàn)(六)：聯(lián)合TCGA-STAD數(shù)據(jù)添加惡性評(píng)分

健明 2024-07-08 發(fā)布于廣東

展開全文

前言

Hello小伙伴們大家好，我是生信技能樹的小學(xué)徒”我才不吃蛋黃“。今天是胃癌單細(xì)胞數(shù)據(jù)集GSE163558復(fù)現(xiàn)系列第六期。第五期我們通過(guò)繪制餅圖、堆積柱狀圖、氣泡圖等，比較不同分組之間細(xì)胞比例差異。本期，我們將下載胃癌bulk數(shù)據(jù)（TCGA-STAD），篩選胃癌相關(guān)差異表達(dá)基因，根據(jù)差異基因list，使用AddModuleScore_UCell函數(shù)給上皮細(xì)胞亞群Seurat對(duì)象添加惡性評(píng)分，以此協(xié)助區(qū)分正常上皮和惡性上皮。

1.背景介紹

眾所周知，胃癌細(xì)胞是胃上皮細(xì)胞發(fā)生了惡性病變。在第三期對(duì)細(xì)胞分群注釋時(shí)，我們并未對(duì)上皮細(xì)胞進(jìn)行細(xì)分，這里面就包括非惡性上皮和惡性上皮細(xì)胞。那么該如何識(shí)別惡性上皮細(xì)胞？本文作者根據(jù)TCGA數(shù)據(jù)庫(kù)中胃癌和正常胃組織之間的差異表達(dá)基因，定義了每個(gè)上皮細(xì)胞的惡性和非惡性評(píng)分，從而區(qū)分惡性上皮和非惡性上皮細(xì)胞。當(dāng)然，我們還可以利用copykat和infercnv從單細(xì)胞轉(zhuǎn)錄組數(shù)據(jù)推斷腫瘤拷貝數(shù)變異，從而區(qū)分上皮惡性與否（后期更新，敬請(qǐng)期待）。

2.數(shù)據(jù)下載

在正式分析之前，我們先從TCGA數(shù)據(jù)庫(kù)獲取胃癌患者的表達(dá)數(shù)據(jù)和臨床數(shù)據(jù)。首先加載工作目錄，創(chuàng)建新的文件夾，加載R包：

rm(list=ls())
getwd()
setwd("/home/data/t020505/GSE163558-GC代碼公眾號(hào)復(fù)現(xiàn)版")
dir.create("6-TCGA_STAD")
setwd('6-TCGA_STAD/')
source('../scRNA_scripts/mycolors.R')
library(tidyverse)
library(data.table) 
#如果需要下載TCGA其他癌種，請(qǐng)更換proj
proj = "TCGA-STAD"
dir.create("input")

在R中利用代碼一鍵下載TCGA數(shù)據(jù)庫(kù)胃癌患者的表達(dá)數(shù)據(jù)及臨床數(shù)據(jù)，并將下載好了數(shù)據(jù)存放在input文件夾：

if(T){
  download.file(url = paste0("https://gdc./download/",proj, ".htseq_counts.tsv.gz"),destfile = paste0("input/",proj,".htseq_counts.tsv.gz"))  ##表達(dá)數(shù)據(jù)
  download.file(url = paste0("https://gdc./download/",proj, ".GDC_phenotype.tsv.gz"),destfile = paste0("input/",proj,".GDC_phenotype.tsv.gz")) ##臨床數(shù)據(jù)
  download.file(url = paste0("https://gdc./download/",proj, ".survival.tsv"),destfile = paste0("input/",proj,".survival.tsv")) ##生存數(shù)據(jù)
}
clinical = read.delim(paste0("input/",proj,".GDC_phenotype.tsv.gz"),fill = T,header = T,sep = "\t")
surv = read.delim(paste0("input/",proj,".survival.tsv"),header = T) 
head(surv) #生存數(shù)據(jù)os和os.time

生存數(shù)據(jù)包括os和os.time：

3.數(shù)據(jù)整理

在下載完數(shù)據(jù)之后，我們需要對(duì)數(shù)據(jù)進(jìn)行初步的整理。

3.1 處理表達(dá)矩陣

首先處理表達(dá)矩陣：

dat = read.table(paste0("input/",proj,".htseq_counts.tsv.gz"),check.names = F,row.names = 1,header = T)
dat <- data.table::fread(paste0("input/",proj,".htseq_counts.tsv.gz"),
                         data.table = F) #全部是symbol

a = dat[60483:60488,]
dat = dat[1:60483,]
rownames(dat) = dat$Ensembl_ID
a = dat
a = a[,-1]
exp = a

去除在所有樣本里表達(dá)量都為零的基因：

exp = exp[rowSums(exp)>0,]
nrow(exp)

僅保留在一半以上樣本里表達(dá)的基因：

exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]
nrow(exp)

表達(dá)矩陣行名ID轉(zhuǎn)換：

library(stringr)
head(rownames(exp))
library(AnnoProbe)
rownames(exp) = substr(rownames(exp), 1, 15)
##rownames(exp) = str_split(rownames(exp),"\\.",simplify = T)[,1];head(rownames(exp))
#annoGene(rownames(exp),ID_type = "ENSEMBL") 
re = annoGene(rownames(exp),ID_type = "ENSEMBL");head(re)
#if(!require(tinyarray))devtools::install_github("xjsun1221/tinyarray",upgrade = FALSE,dependencies = TRUE)
library(tinyarray)
exp = trans_array(exp,ids = re,from = "ENSEMBL",to = "SYMBOL")
exp[1:4,1:4]

3.2 整理分組信息

然后整理分組信息:

根據(jù)樣本ID的第14-15位，給樣本分組（tumor和normal）：

01A：癌癥組織
01B：福爾馬林浸泡樣本
02A：復(fù)發(fā)組織
06A：轉(zhuǎn)移組織
11A：癌旁組織

table(str_sub(colnames(exp),14,15))  
table(str_sub(colnames(exp),14,16)) 
Group = ifelse(as.numeric(str_sub(colnames(exp),14,15)) < 10,'tumor','normal')  
Group = factor(Group,levels = c("normal","tumor"))
table(Group)
group = make_tcga_group(exp)
table(group)

保存數(shù)據(jù)：

save(exp,Group,proj,clinical,surv,file = paste0(proj,".Rdata"))

4.數(shù)據(jù)分析

4.1 TCGA差異分析

首先清除系統(tǒng)環(huán)境變量，加載TCGA數(shù)據(jù)，加載差異分析R包“edgeR”：

rm(list = ls())
load("TCGA-STAD.Rdata")
table(Group)
library(edgeR)

開始進(jìn)行胃癌和癌旁組織差異基因分析：

dge <- DGEList(counts=exp,group=Group)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 
design <- model.matrix(~Group)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
fit <- glmLRT(fit) 
DEG2=topTags(fit, n=Inf)
class(DEG2)
DEG2=as.data.frame(DEG2)
head(DEG2)

添加change列標(biāo)記基因上調(diào)下調(diào)，在這里，我們對(duì)差異分析篩選的閾值為：“| log2 fold change |>1, pvalue <0.05”,最終在胃癌中篩選到了2213個(gè)上調(diào)基因，142個(gè)下調(diào)基因。

logFC_t = 1
pvalue_t = 0.05
k1 = (DEG2$PValue < pvalue_t)&(DEG2$logFC < -logFC_t)
k2 = (DEG2$PValue < pvalue_t)&(DEG2$logFC > logFC_t)
DEG2$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
head(DEG2)
table(DEG2$change)

保存差異分析結(jié)果：

save(DEG2,Group,file = paste0(proj,"_DEG.Rdata"))

4.2.差異基因可視化

差異基因可視化選擇了熱圖和火山圖：

library(ggplot2)
library(tinyarray)
dat = log2(cpm(exp)+1)
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(proj,"_pcaplot.Rdata"))
cg2 = rownames(DEG2)[DEG2$change !="NOT"]
h2 = draw_heatmap(dat[cg2,],Group)
v2 = draw_volcano(DEG2,pkg = 2,logFC_cutoff = logFC_t)
library(patchwork)
h2 + v2 +plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(proj,"_heat_vo.png"),width = 16,height = 9)

4.3 提取上皮細(xì)胞亞群

設(shè)置隨機(jī)種子，讀取第三期細(xì)胞分群注釋后的單細(xì)胞文件：

set.seed(12345)
sce.all=readRDS( "../3-Celltype/sce_celltype.rds")
table(sce.all$celltype)

提取出其中的上皮細(xì)胞：

sce1 = sce.all[, sce.all$celltype %in% c( 'Epithelial' )]

然后對(duì)上皮細(xì)胞亞群走Seurat V5標(biāo)準(zhǔn)流程（同第二期）：

names(sce1@assays$RNA@layers)
sce1[["RNA"]]$counts 
LayerData(sce1, assay = "RNA", layer = "counts")
dim(sce1[["RNA"]]$counts )
as.data.frame(sce1@assays$RNA$counts[1:10, 1:2])
head(sce1@meta.data, 10)
table(sce1$orig.ident) 
sce = sce1
sce <- NormalizeData(sce, normalization.method =  "LogNormalize",               scale.factor = 1e4)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce, selection.method = "vst",
                            nfeatures = 2000)  
sce <- ScaleData(sce) 
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce)) 
DimHeatmap(sce, dims = 1:12, cells = 100, balanced = TRUE)
ElbowPlot(sce)

分辨率設(shè)置為0.1：

sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.1)
table(sce@meta.data$RNA_snn_res.0.1)

數(shù)據(jù)降維：

set.seed(321)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
sce <- RunUMAP(object = sce, dims = 1:5, do.fast = TRUE)

分群可視化：

p = DimPlot(sce,reduction = "tsne",label=T,cols = mycolors)

然后保存上皮細(xì)胞rds文件：

saveRDS(sce,file = "Epithelial.rds")

4.4 AddModuleScore_UCell函數(shù)惡性評(píng)分

AddModuleScore_UCell函數(shù)可以實(shí)現(xiàn)基因集合打分，通過(guò)評(píng)估單細(xì)胞數(shù)據(jù)集中特定基因特征的表達(dá)模式，我們可以量化生物學(xué)信號(hào)的強(qiáng)度，并鑒定細(xì)胞的亞型和狀態(tài)。在胃癌組織中取上調(diào)排名前50的差異基因用作惡性評(píng)分，下調(diào)排名前50的差異基因用作正常評(píng)分。

#sce = readRDS('Epithelial.rds')
head(DEG2)
table(DEG2$change)
UP = DEG2[DEG2$change == 'UP' & DEG2$FDR <0.01,]
DOWN = DEG2[DEG2$change == 'DOWN' & DEG2$FDR <0.01,]
##用作惡性評(píng)分
sorted_up = UP[order(UP$FDR),]
top_50_upgene = rownames(sorted_up[1:50, ]) 
##用作正常評(píng)分
sorted_down = DOWN[order(DOWN$FDR),]
top_50_downgene = rownames(sorted_down[1:50, ])
library(UCell)
marker <- list(top_50_upgene,top_50_downgene)#將基因整成list
names(marker)[1] <- 'tumor'
names(marker)[2] <- 'normal'
score <- AddModuleScore_UCell(sce,features=marker)

計(jì)算每個(gè)上皮細(xì)胞的惡性評(píng)分減去非惡性評(píng)分，并將評(píng)分從小到大排序以擬合生長(zhǎng)曲線。然后，根據(jù)生長(zhǎng)曲線拐點(diǎn)附近的最大間隙，將細(xì)胞分為得分較高的惡性細(xì)胞（≥0.02）或得分較低的非惡性細(xì)胞（≤0.02）。

sce2 = score
df = sce2@meta.data
df$score = df$tumor_UCell - df$normal_UCell
table(df$score)
dim(sce)
df$score1 = 'unknown'
df[df$score > -0.02,]$score1 = 'malignant'
table(df$score1)
df[df$score < -0.02,]$score1 = 'nonmalignant'
table(df$score1)
sce2@meta.data = df
DimPlot(sce2, reduction = "tsne",cols = mycolors,pt.size = 0.8,
              group.by = "score1",label = T,label.box = T)

然后繪制氣泡圖和堆積性柱狀圖可視化分組細(xì)胞比例（同第五期胃癌單細(xì)胞數(shù)據(jù)集GSE163558復(fù)現(xiàn)(五)：細(xì)胞比例）：

tb=table(sce2$tissue, sce2$score1)
head(tb)
library (gplots) 
balloonplot(tb)

bar_data <- as.data.frame(tb)

bar_per <- bar_data %>% 
  group_by(Var1) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per) 
#write.csv(bar_per,file = "celltype_by_group_percent.csv")
col =c("#3176B7","#F78000","#3FA116","#CE2820","#9265C1",
       "#885649","#DD76C5","#BBBE00","#41BED1")
colnames(bar_per)

library(ggthemes)
p1 = ggplot(bar_per, aes(x = percent, y = Var1)) +
  geom_bar(aes(fill = Var2) , stat = "identity") + coord_flip() +
  theme(axis.ticks = element_line(linetype = "blank"),
        legend.position = "top",
        panel.grid.minor = element_line(colour = NA,linetype = "blank"), 
        panel.background = element_rect(fill = NA),
        plot.background = element_rect(colour = NA)) +
  labs(y = " ", fill = NULL)+labs(x = 'Relative proportion(%)')+
  scale_fill_manual(values=col)+
  theme_few()+
  theme(plot.title = element_text(size=12,hjust=0.5))
p1

tb=table(sce2$sample, sce2$score1)
head(tb)
                          
                           malignant nonmalignant
  adjacent_nontumor               84           92
  GC                            2805            4
  GC_liver_metastasis             39            0
  GC_lymph_metastasis             82            0
  GC_ovary_metastasis            578            0
  GC_peritoneum_metastasis       258            0
library (gplots) 
balloonplot(tb)

bar_data <- as.data.frame(tb)

bar_per <- bar_data %>% 
  group_by(Var1) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per) 
colnames(bar_per)

library(ggthemes)
p2 = ggplot(bar_per, aes(x = percent, y = Var1)) +
  geom_bar(aes(fill = Var2) , stat = "identity") + coord_flip() +
  theme(axis.ticks = element_line(linetype = "blank"),
        legend.position = "top",
        panel.grid.minor = element_line(colour = NA,linetype = "blank"), 
        panel.background = element_rect(fill = NA),
        plot.background = element_rect(colour = NA)) +
  labs(y = " ", fill = NULL)+labs(x = 'Relative proportion(%)')+
  scale_fill_manual(values=col)+
  theme_few()+
  theme(plot.title = element_text(size=12,hjust=0.5)) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p2
p1+p2
ggsave(filename="prop.pdf",width = 12,height = 8)
saveRDS(sce2,file = "tumorEpithelial.rds")

4.5 提取惡性上皮亞群

在添加惡性評(píng)分之后，我們可以在從上皮細(xì)胞中提取惡性上皮亞群，繼續(xù)走Seurat V5標(biāo)準(zhǔn)流程：

table(sce2$score1)
tumor = sce2[, sce2$score1 %in% c( 'malignant' )]
set.seed(1234567)
tumor[["RNA"]]$counts 
LayerData(tumor, assay = "RNA", layer = "counts")
tumor <- JoinLayers(tumor)
dim(tumor[["RNA"]]$counts )
as.data.frame(tumor@assays$RNA$counts[1:10, 1:2])
head(tumor@meta.data, 10)
table(tumor$orig.ident) 
tumor <- NormalizeData(tumor, normalization.method =  "LogNormalize",  
                       scale.factor = 1e4)
GetAssay(tumor,assay = "RNA")
tumor <- FindVariableFeatures(tumor, 
                              selection.method = "vst", nfeatures = 2000)  
tumor <- ScaleData(tumor) 
tumor <- RunPCA(object = tumor, pc.genes = VariableFeatures(tumor)) 
DimHeatmap(tumor, dims = 1:12, cells = 100, balanced = TRUE)
ElbowPlot(tumor)

tumor <- FindNeighbors(tumor, dims = 1:15)
tumor <- FindClusters(tumor, resolution = 0.1)
table(tumor@meta.data$RNA_snn_res.0.1)  
set.seed(321)
tumor <- RunTSNE(object = tumor, dims = 1:15, do.fast = TRUE)

tumor <- RunUMAP(object = tumor, dims = 1:5, do.fast = TRUE)

mycolors <-c('#E64A35','#4DBBD4' ,'#01A187'  ,'#3C5588'  ,'#F29F80'  ,
             '#8491B6','#91D0C1','#7F5F48','#AF9E85','#4F4FFF','#CE3D33',
             '#739B57','#EFE685','#446983','#BB6239','#5DB1DC','#7F2268','#6BD66B','#800202','#D8D8CD','pink'
)

p2 = DimPlot(tumor,reduction = "tsne",label=T,cols = mycolors)+
  theme_few()
p2

保存惡性上皮細(xì)胞rds文件

table(tumor$seurat_clusters)
   0    1    2    3    4 
2275  579  482  314  196 
tumor$celltype = paste0('G',tumor$seurat_clusters)
table(tumor$celltype)
  G0   G1   G2   G3   G4 
2275  579  482  314  196 
saveRDS(tumor,file = "malignant.rds")

4.6 富集分析

提取惡性上皮細(xì)胞之后，我們接著從單細(xì)胞層面篩選出了特征基因，并在此基礎(chǔ)上進(jìn)行了富集分析。

首先讀取惡性上皮細(xì)胞rds文件，加載R包：

#sce =readRDS('malignant.rds') 
sce = tumor
###熱圖表示GO富集分析
library(grid)
library(ggplot2)
library(gridExtra)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ggplot2)
library(org.Hs.eg.db)
library(GOplot)

然后使用Seurat內(nèi)置函數(shù)FindAllMarkers篩選特征基因：

markers <- FindAllMarkers(object = sce, only.pos = TRUE, 
                          min.pct = 0.25, 
                          thresh.use = 0.25)
write.csv(markers,file='markers.csv')

#markers = read.csv('markers.csv',row.names = 1)
table(markers$cluster)
library(dplyr) 
gene = markers[markers$p_val <0.01 & markers$avg_log2FC >2,]$gene
entrezIDs = bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb= "org.Hs.eg.db", drop = TRUE)
gene<- entrezIDs$ENTREZID
marker_new = markers[markers$gene %in% entrezIDs$SYMBOL,]
marker_new = marker_new[!duplicated(marker_new$gene),]
identical(marker_new$gene, entrezIDs$SYMBOL)
p = identical(marker_new$gene, entrezIDs$SYMBOL);p
if(!p) entrezIDs = entrezIDs[match(marker_new$gene,entrezIDs$SYMBOL),]
marker_new$ID = entrezIDs$ENTREZID

GO富集分析：

gcSample=split(marker_new$ID, marker_new$cluster) 
###參數(shù)可以更改，看看?compareCluster
xx <- compareCluster(gcSample,
                     fun = "enrichGO",
                     OrgDb = "org.Hs.eg.db",
                     ont = "BP",
                     pAdjustMethod = "BH",
                     pvalueCutoff = 0.01,
                     qvalueCutoff = 0.05
)
p <- dotplot(xx)
library(ggthemes)
p + theme(axis.text.x = element_text(
  angle = 45,
  vjust = 0.5, hjust = 0.5
))+theme_few()

還可以使用熱圖展示GO富集：

a = xx@compareClusterResult
colnames(a)
a$p = -log10(a$pvalue)
b = a[,c(1,3,11)]
library(reshape2)
colnames(b)
library(tidyr)
wide_data <- pivot_wider(b, names_from = Cluster, values_from = p)

wide_data = as.data.frame(wide_data)
rownames(wide_data) = wide_data$Description
wide = wide_data[,-1]
wide = wide[c(1:13),]
rownames(wide) <- sapply(rownames(wide), function(x) paste0(substr(x, 1, 40), "..."))
wide = as.matrix(wide)
library(pheatmap)
B <- pheatmap(wide, show_rownames = T,
              show_colnames = T,
              col = colorRampPalette(c("white","firebrick3"))(255),
              annotation_names_row = T,#不顯示行注釋信息
              annotation_names_col = T ,#不顯示列注釋信息
              column_title = NULL,#不顯示列標(biāo)題
              cluster_rows = F,
              cluster_cols = F,
              scale = 'column',
              row_title = NULL)#不顯示行標(biāo)題scale = "row"
B

結(jié)語(yǔ)

本期，我們根據(jù)TCGA數(shù)據(jù)庫(kù)中胃癌和正常胃組織之間的差異表達(dá)基因，定義了每個(gè)上皮細(xì)胞的惡性和非惡性評(píng)分。下一期，我們將在本期基礎(chǔ)上，分析惡性上皮細(xì)胞G0-G4的Marker基因并繪制熱圖和小提琴圖，此外，我們還將使用AddModuleScore_UCell函數(shù)計(jì)算細(xì)胞的增殖和遷移評(píng)分。干貨滿滿，歡迎大家持續(xù)追更，謝謝！