功能富集分析 在得到了差異基因的基礎之上,進一步進行功能富集分析,這里我們使用clusterprofiler包 本文將對差異基因進行 GO, KEGG注釋并完成可視化,GSEA分析
Sys.setlocale('LC_ALL','C') library(tidyverse) library(ggplot2) library(dplyr) library(clusterProfiler) library(org.Hs.eg.db) load(file = "DEG_all.Rdata") head(DEG) ##定義 篩選logFC>=1, P<=0.05的差異基因 DEG_2<-DEG %>% rownames_to_column("genename") #%>% as_tibble() %>% filter(abs(logFC)>=0.58,P.Value<=0.05) genelist<-DEG_2$genename
## ID轉(zhuǎn)換 genelist<- genelist %>% bitr(fromType = "SYMBOL", toType = c("ENSEMBL", "ENTREZID"), OrgDb = org.Hs.eg.db) head(genelist) dim(genelist)
GO富集分析注意clusterprofiler中使用的富集ID是ENTREZID,如果不是需要進行轉(zhuǎn)換,也很簡單 我們這里其實可以用我們自帶的probe中的ID 使用了semi_join這個篩選連接 注意這里我故意使用的是篩選連接,其實inner_join, 等等都可以,我就是要用這些不熟悉的 但是這里的probe其實基因名是有重復的,我們還要進行去重的操作
如果自己有probe信息if(F){ load("probe.Rdata") names(probe)[2]<-"genename" DEG_2<-as_tibble(rownames_to_column(DEG,"genename")) probe<-probe[!duplicated(probe$genename),]
##設定差異閾值篩選 DEG_2<-DEG_2 %>% arrange(abs(logFC)) %>% filter(abs(logFC)>=0.58,P.Value<=0.05) dim(DEG_2)
gene<-probe %>%as_tibble() %>% dplyr::semi_join(DEG_2,by="genename") %>% .$ENTREZ_GENE_ID head(gene) length(gene)##相當于是取了交集,用DEG_2中的基因來做篩選,得到了12549個基因 }
我們假設沒有probe注釋信息ego <- enrichGO(gene = genelist$ENSEMBL, OrgDb = org.Hs.eg.db, keyType = 'ENSEMBL',##指定gene id的類型 ont = "all",##GO分類 pAdjustMethod = "BH", pvalueCutoff = 0.01,##設置閾值 qvalueCutoff = 0.05, readable=TRUE) write.csv(ego,file = "ego.csv") ego[1:5,1:5] ## ONTOLOGY ID GO:0019882 BP GO:0019882 GO:0048002 BP GO:0048002 GO:0019884 BP GO:0019884 GO:0002478 BP GO:0002478 GO:0070482 BP GO:0070482 Description GO:0019882 antigen processing and presentation GO:0048002 antigen processing and presentation of peptide antigen GO:0019884 antigen processing and presentation of exogenous antigen GO:0002478 antigen processing and presentation of exogenous peptide antigen GO:0070482 response to oxygen levels GeneRatio BgRatio GO:0019882 353/12683 385/19623 GO:0048002 320/12683 347/19623 GO:0019884 314/12683 341/19623 GO:0002478 307/12683 334/19623 GO:0070482 372/12683 420/19623 ############ class(ego) str(ego) ## 三個結果整合在一起 go_dot<-dotplot(ego,split="ONTOLOGY",showCategory=5)+ facet_grid(ONTOLOGY~.) go_dot ## 單個繪制 ego_MF<-enrichGO(gene = genelist$ENSEMBL, OrgDb = org.Hs.eg.db, keyType = 'ENSEMBL',##指定gene id的類型 ont = "MF",##GO分類 pAdjustMethod = "BH", pvalueCutoff = 0.01,##設置閾值 qvalueCutoff = 0.05, readable=TRUE) dotplot(ego_MF,showCategory=6)
KEGG富集分析代碼的風格都是類似的
kk <- enrichKEGG(gene = genelist$ENTREZID, organism = 'hsa', pvalueCutoff = 0.05 ) head(kk)
## 打開網(wǎng)頁瀏覽通路 browseKEGG(kk,"hsa04151") ## 點圖 dotplot(kk) ## 柱狀圖 barplot(kk)
點圖
dotplot(kk)
柱狀圖
barplot(kk)
Pathview包用于通路可視化先調(diào)整數(shù)據(jù)
library(pathview) pathview(genelist$ENTREZID,pathway.id ="04151",species = "hsa")
## DEG_2<-DEG %>% rownames_to_column("genename") DEG_2<-genelist %>% rename(SYMBOL='genename') %>% inner_join(DEG_2,by='genename') %>% ## select配合everything排序把,改變變量順序 dplyr::select(ENTREZID,logFC,everything()) head(DEG_2) ### ENTREZID logFC genename ENSEMBL AveExpr t P.Value 1 218 -3.227263 ALDH3A1 ENSG00000108602 10.302323 -10.710306 4.482850e-05 2 1545 -3.033684 CYP1B1 ENSG00000138061 13.287607 -10.505888 4.995753e-05 3 1543 -9.003353 CYP1A1 ENSG00000140465 11.481268 -8.371476 1.762905e-04 4 11148 -1.550587 HHLA2 ENSG00000114455 6.595658 -7.443431 3.337672e-04 5 8140 -2.470333 SLC7A5 ENSG00000103257 13.628775 -7.298868 3.708688e-04 6 25976 -1.581274 TIPARP ENSG00000163659 12.764218 -7.024252 4.552834e-04 adj.P.Val B 1 0.3134585 -4.048355 2 0.3134585 -4.049713 3 0.7374232 -4.069681 4 0.9308066 -4.083411 5 0.9308066 -4.085966 6 0.9522252 -4.091192 ###
構建geneList對象這是比較關鍵的一步,需要構建geneList對象,至少需要包括ID與排序好的數(shù)值型變量 數(shù)值變量降序排列 sign.pos參數(shù)包括的位置有4種:"bottomleft", "bottomright", "topleft" and "topright".
geneList<-DEG_2$logFC names(geneList)<-as.character(DEG_2$ENTREZID) geneList<-sort(geneList,decreasing=TRUE) #geneList<-geneList[!duplicated(names(geneList))] head(geneList)##構建geneList對象成功
1580 7025 54474 9247 56603 4907 ## geneid 2.378155 2.365342 2.258682 2.180382 2.143749 2.030780 ## logFC
## 這樣logFC值就能反應在通路的變化中了 pathview(geneList,pathway.id ="04151",species = "hsa",same.layer = F )
調(diào)整函數(shù)的參數(shù),改變圖形比較重要的是kegg.native = F,可以調(diào)整輸出為矢量圖
## 看不出變化的參數(shù)kegg.native,默認設置為T pathview(geneList,pathway.id ="04151",species = "hsa", out.suffix = "KEGG",##輸出文件后綴 kegg.native = T, same.layer = F )
## 可以產(chǎn)生矢量圖pdf文檔, pathview(geneList,pathway.id ="04151",species = "hsa", out.suffix = "KEGG",##輸出文件后綴 kegg.native = F, split.group = T,#是否分開節(jié)點組 sign.pos= "topleft",##圖注的位置 same.layer = F )
GSEA分析GSEA分析在高分文章中還是一個很常見的分析,之前認為官方軟件的作圖達不到很高的要求,現(xiàn)在終于可以解決了clusterprofiler包更新的畫圖功能模仿官網(wǎng)的圖,顏值更高,操作更簡單,大家都值得擁有。
KEGG的GSEA富集分析head(geneList)
## 1580 7025 54474 9247 56603 4907 2.378155 2.365342 2.258682 2.180382 2.143749 2.030780 ##
#geneList<-geneList[!duplicated(geneList)] kk2<- gseKEGG(geneList = geneList, organism = 'hsa', nPerm = 1000, minGSSize = 120, pvalueCutoff = 0.5, verbose = FALSE) head(kk2)
## ID Description setSize enrichmentScore hsa04080 hsa04080 Neuroactive ligand-receptor interaction 293 -0.3593278 hsa04024 hsa04024 cAMP signaling pathway 185 -0.3656357 hsa04060 hsa04060 Cytokine-cytokine receptor interaction 244 -0.3358228 hsa04020 hsa04020 Calcium signaling pathway 172 -0.3566872 hsa04062 hsa04062 Chemokine signaling pathway 165 0.3112935 hsa05202 hsa05202 Transcriptional misregulation in cancer 163 -0.3537420 NES pvalue p.adjust qvalues rank hsa04080 -1.481227 0.002649007 0.1562914 0.1422098 1912 hsa04024 -1.426245 0.007062147 0.2083333 0.1895629 1697 hsa04060 -1.356610 0.012345679 0.2427984 0.2209227 2055 hsa04020 -1.378406 0.017118402 0.2524964 0.2297470 1748 hsa04062 1.323383 0.025806452 0.2676695 0.2435530 1332 hsa05202 -1.361718 0.027220630 0.2676695 0.2435530 2463 leading_edge hsa04080 tags=31%, list=16%, signal=27% hsa04024 tags=26%, list=14%, signal=23% hsa04060 tags=30%, list=17%, signal=25% hsa04020 tags=26%, list=15%, signal=22% hsa04062 tags=19%, list=11%, signal=17% hsa05202 tags=32%, list=21%, signal=26% ##
GSEA圖可視化之前的圖確實顏值不夠
gseaplot(kk2,geneSetID = "hsa04080")
新功能gseaplot2-模仿GSEA軟件的圖-確實更好看了
library(enrichplot) gseaplot2(kk2,##KEGG gse的對象 geneSetID = "hsa04080", pvalue_table=T)
修改線條顏色現(xiàn)在的圖-高端大氣上檔次
gseaplot2(kk2,##KEGG gse的對象 geneSetID = "hsa04080", color="red", pvalue_table=T)
MgsiDB的GSEA富集分析gmtfile <- system.file("extdata", "c5.cc.v5.0.entrez.gmt", package="clusterProfiler") c5 <- read.gmt(gmtfile) gene <- names(geneList) egmt <- enricher(gene, TERM2GENE=c5) head(egmt)
## ID Description GeneRatio BgRatio CELL_FRACTION CELL_FRACTION CELL_FRACTION 451/4461 493/5270 MEMBRANE_FRACTION MEMBRANE_FRACTION MEMBRANE_FRACTION 309/4461 339/5270 pvalue p.adjust qvalue CELL_FRACTION 1.764189e-06 0.0003775364 0.00036398 MEMBRANE_FRACTION 1.870439e-04 0.0200136995 0.01929506 ##
#write.csv(egmt,file = "egmt.csv")
GSEA
egmt2 <- GSEA(geneList, TERM2GENE=c5, pvalueCutoff = 0.5,# verbose=FALSE) head(egmt2) gseaplot2(egmt2, geneSetID = 1, title = egmt2$Description[1],pvalue_table=T)
還可以同時畫多個
gseaplot2(egmt2, geneSetID = 1:3,pvalue_table=T)
Gene-Concept Network這里的輸入egmt2可以是ego,kk,edo都可以,進行可視化
edox <- setReadable(egmt2, 'org.Hs.eg.db', 'ENTREZID') cnetplot(edox, foldChange=geneList)
換個輸出形狀
cnetplot(edox, foldChange=geneList, circular = TRUE, colorEdge = TRUE)
|