1. 準備工作 1.1 測試數據集 set.seed(123456) dir.create('Results')
library(Seurat)
library(ggplot2) library(png) sce = Read10X('./pbmc3k_filtered_gene_bc_matrices/filtered_gene_bc_matrices/hg19/')
library(dplyr)
sce_final <- sce %>% CreateSeuratObject(min.cells = 3, min.features = 200) %>% PercentageFeatureSet(pattern = '^MT',col.name = 'percent.mt') select_c <- WhichCells(sce_final,expression = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) sce_final = subset(sce_final,cells = select_c) dim(sce_final)
# [1] 13714 2626
sce_final <- sce_final %>% NormalizeData() %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA() %>% FindNeighbors(dims = 1:10) %>% FindClusters(resolution = 0.5) %>% RunTSNE(dims = 1:10) %>% RunUMAP(dims = 1:10)
new.cluster.ids <- c('Naive CD4 T', 'CD14_Mono', 'Memory CD4 T', 'B', 'CD8 T', 'FCGR3A_Mono', 'NK', 'DC', 'Platelet') names(new.cluster.ids) <- levels(sce_final) sce_final <- RenameIdents(sce_final, new.cluster.ids)
sce_final$celltype <- Idents(sce_final) sce_final$celltype <- factor(sce_final$celltype,levels = c('Naive CD4 T','Memory CD4 T','CD8 T','B','NK','CD14_Mono','FCGR3A_Mono','DC', 'Platelet'))
my9color <- c('#5470c6','#91cc75','#fac858','#ee6666','#73c0de','#3ba272','#fc8452','#9a60b4','#ea7ccc') p <- DimPlot(sce_final, reduction = 'umap', label = TRUE, pt.size = 0.5,label.size = 5,cols = my9color) NoLegend() NoAxes() ggsave('./Results/umap.png',width = 5,height = 5)
## [1] 'AAK1' 'ACVR1' 'AK5' 'AKTIP' 'ANXA1' 'AP3M2' ## [7] 'APBA2' 'AQP3' 'ARL4C' 'ARNTL' 'ARRB1' 'ATP10A' ## [13] 'ATP1A1' 'ATP2B4' 'ATP6V0E2' 'BCL11B' 'BEX3' 'BLMH' ## [19] 'BNIP3' 'C1orf216' 'CALM1' 'CAMK2G' 'CCL5' 'CCND2' ## [25] 'CCND3' 'CCR2' 'CCR5' 'CCSER2' 'CD2' 'CD247' ## [31] 'CD28' 'CD3G' 'CD6' 'CDR2' 'CERK' 'CHST7' ## [37] 'CISH' 'CLUAP1' 'CORO2A' 'CPD' 'CYB561' 'CYLD' ## [43] 'DEXI' 'DNMT1' 'DOCK9' 'DPP4' 'DPYD' 'DUSP7' ## [49] 'DYRK2' 'FAM102A' 'FBXL8' 'FHL1' 'FKBP5' 'FLT3LG' ## [55] 'FYB1' 'FYN' 'GATA3' 'GIMAP4' 'GIMAP5' 'GIMAP6' ## [61] 'GNAQ' 'GOLGA7' 'GPD1L' 'GSTK1' 'GZMA' 'GZMK' ## [67] 'HDAC4' 'HPGD' 'ICOS' 'ID2' 'IFITM1' 'IGF2R' ## [73] 'IL10RA' 'IL11RA' 'IL2RB' 'INPP4A' 'IPCEF1' 'ITGA6' ## [79] 'ITK' 'ITM2A' 'ITPKB' 'KLRB1' 'KLRG1' 'LCK' ## [85] 'LDHB' 'LDLRAP1' 'LEF1' 'LEPROTL1' 'LGALS8' 'LIMA1' ## [91] 'LINC00623' 'LPAR6' 'LRIG1' 'MAF' 'MAL' 'MGAT4A' ## [97] 'MICAL2' 'MLLT3' 'MT1F' 'MT1X' 'MYBL1' 'NACC2' ## [103] 'NCALD' 'NELL2' 'NOSIP' 'OPTN' 'P2RX4' 'PDE3B' ## [109] 'PDE4D' 'PHACTR2' 'PIK3IP1' 'PIK3R1' 'PIM1' 'PIP4K2A' ## [115] 'PLAAT4' 'PLCG1' 'PLCL1' 'PLIN2' 'PLK3' 'PRDX2' ## [121] 'PRF1' 'PRKACB' 'PRKCA' 'PRKCH' 'PRKCQ' 'PRMT2' ## [127] 'PRORP' 'PTGER2' 'PTPN13' 'PTPN4' 'PXN' 'RAB33A' ## [133] 'RASGRP1' 'RCAN3' 'RGCC' 'RGS10' 'RIC8B' 'RNF144A' ## [139] 'RORA' 'RPS6KA3' 'RUNX1' 'S100A4' 'SAMHD1' 'SELPLG' ## [145] 'SEMA4C' 'SEMA4D' 'SERINC5' 'SFI1' 'SH2D1A' 'SH2D2A' ## [151] 'SIRPG' 'SKAP1' 'SLC39A8' 'SLC9A3R1' 'SOCS2' 'SPATS2L' ## [157] 'SRGN' 'STAT4' 'STAT5B' 'STOM' 'STXBP1' 'SVIL' ## [163] 'TBC1D2B' 'TBC1D4' 'TGFBR3' 'TIAM1' 'TIMP1' 'TKTL1' ## [169] 'TMEM123' 'TMEM30B' 'TMEM43' 'TNFAIP3' 'TNFRSF25' 'TNFSF10' ## [175] 'TNFSF14' 'TNFSF8' 'TNIK' 'TOB1' 'TRAT1' 'TRAV8-3' ## [181] 'TRBC1' 'TRBV10-2' 'TRDV2' 'TRPV2' 'TTLL1' 'TXK' ## [187] 'UBASH3A' 'UPP1' 'URI1' 'VAMP5' 'VIM' 'VPS26C' ## [193] 'WDR1' 'WDR82' 'WWP1' 'ZAP70' 'ZMYM6' 'ZNF277'
## [1] 'orig.ident' 'nCount_RNA' ## [3] 'nFeature_RNA' 'percent.mt' ## [5] 'RNA_snn_res.0.5' 'seurat_clusters' ## [7] 'celltype' 'CD4_TCELL_VS_BCELL_UP_score'
my_comparisons <- list(c('Naive CD4 T','B'),c('Memory CD4 T','B')) library(ggpubr) p.percentage <- ggviolin(sce@meta.data, x = 'celltype', y = 'CD4_TCELL_VS_BCELL_UP_score', color = 'celltype',add = 'mean_sd',fill = 'celltype', add.params = list(color = 'black')) stat_compare_means(comparisons = my_comparisons,label = 'p.signif') scale_color_manual(values = my9color) scale_fill_manual(values = my9color) theme(axis.text.x.bottom = element_text(angle = 90,vjust = 0.5,hjust = 1)) NoLegend() labs(x = '') ggsave('./Results/percentagefeatureset_score_vln.png',width = 8,height = 4.5) p2 <- FeaturePlot(sce,'CD4_TCELL_VS_BCELL_UP_score') ggsave('./Results/percentagefeatureset_score_umap.png',width = 5.5,height = 5)
## [1] 'orig.ident' 'nCount_RNA' ## [3] 'nFeature_RNA' 'percent.mt' ## [5] 'RNA_snn_res.0.5' 'seurat_clusters' ## [7] 'celltype' 'CD4_TCELL_VS_BCELL_UP_score1'
my_comparisons <- list(c('Naive CD4 T','B'),c('Memory CD4 T','B')) library(ggpubr) p.AddModuleScore <- ggviolin(sce@meta.data, x = 'celltype', y = 'CD4_TCELL_VS_BCELL_UP_score1', color = 'celltype',add = 'mean_sd',fill = 'celltype', add.params = list(color = 'black')) stat_compare_means(comparisons = my_comparisons,label = 'p.signif') scale_color_manual(values = my9color) scale_fill_manual(values = my9color) theme(axis.text.x.bottom = element_text(angle = 90,vjust = 0.5,hjust = 1)) NoLegend() labs(x = '') ggsave('./Results/addmodulescore_score_vln.png',width = 8,height = 4.5) p2 <- FeaturePlot(sce,'CD4_TCELL_VS_BCELL_UP_score1') ggsave('./Results/addmodulescore_score_umap.png',width = 5.5,height = 5)
library(AUCell) library(clusterProfiler)
library(ggplot2) library(Seurat) library(GSEABase)
sce <- sce_final
# AUCell_buildRankings cells_rankings <- AUCell_buildRankings(sce@assays$RNA@counts,nCores=1, plotStats=TRUE)
## min 1% 5% 10% 50% 100% ## 212.00 341.00 460.25 557.50 819.00 2413.00
## 制作基因集 geneSets <- a extraGeneSets <- c(GeneSet(sample(geneSets),setName='CD4_TCELL_VS_BCELL_UP_gene'))
geneSets1 <- GeneSetCollection(extraGeneSets) names(geneSets1)
## 計算AUC cells_AUC <- AUCell_calcAUC(geneSets1, cells_rankings) cells_AUC
pdf('./Results/cells_assignment.pdf',width = 6,height = 5) cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist=TRUE, assign=TRUE) dev.off()
##set gene set of interest here for plotting geneSet <- 'CD4_TCELL_VS_BCELL_UP_gene' aucs <- as.numeric(getAUC(cells_AUC)[geneSet, ]) sce$AUC <- aucs df<- data.frame(sce@meta.data, sce@reductions[['umap']]@cell.embeddings) colnames(df)
#我們看到每個細胞現(xiàn)在都加上AUC值了,下面做一下可視化。 class_avg <- df %>% group_by(celltype) %>% summarise( UMAP_1 = median(UMAP_1), UMAP_2 = median(UMAP_2) ) p1 <- ggplot(df, aes(UMAP_1, UMAP_2)) geom_point(aes(colour = AUC)) viridis::scale_color_viridis(option='H') ggrepel::geom_text_repel(aes(label = celltype), data = class_avg, size = 6, segment.color = NA) theme(legend.position = 'none') theme_classic() ggsave('./Results/umap_auc.png',width = 5.5,height = 5)
p.auc <- ggviolin(df, x = 'celltype', y = 'AUC', color = 'celltype',add = 'mean_sd',fill = 'celltype', add.params = list(color = 'black')) stat_compare_means(comparisons = my_comparisons,label = 'p.signif') scale_color_manual(values = my9color) scale_fill_manual(values = my9color) theme(axis.text.x.bottom = element_text(angle = 90,vjust = 0.5,hjust = 1)) NoLegend() labs(x = '') ggsave('./Results/vio_auc.png',width = 8,height = 4.5)
gene.expr <- as.matrix(sce_final[['RNA']]@data) dim(gene.expr)
## Estimating ssGSEA scores for 1 gene sets.
## orig.ident nCount_RNA nFeature_RNA percent.mt ## AAACATACAACCAC-1 SeuratProject 2419 779 3.100455 ## AAACATTGAGCTAC-1 SeuratProject 4903 1352 3.875178 ## AAACATTGATCAGC-1 SeuratProject 3147 1129 1.080394 ## AAACCGTGCTTCCG-1 SeuratProject 2639 960 1.970443
2.5 GSVA GSVA方法Biomamba生信基地之前發(fā)過推文的噢!鏈接在單細胞中應該如何做GSVA?這里參照之前的流程進行分析。 2.5.1 加載包 library(GSVA) library(ggplot2) 2.5.2 GSVA分析
## [1] 13714 2626
## Estimating GSVA scores for 1 gene sets. ## Estimating ECDFs with Gaussian kernels ## | | | 0% | |======================================================================| 100% 2.5.3 可視化
## orig.ident nCount_RNA nFeature_RNA percent.mt ## AAACATACAACCAC-1 SeuratProject 2419 779 3.100455 ## AAACATTGAGCTAC-1 SeuratProject 4903 1352 3.875178 ## AAACATTGATCAGC-1 SeuratProject 3147 1129 1.080394 ## AAACCGTGCTTCCG-1 SeuratProject 2639 960 1.970443
library(patchwork) (p.percentage|p.AddModuleScore)/(p.auc|p.GSVA|p.ssGSEA)
## R version 4.0.3 (2020-10-10) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 19044) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=Chinese (Simplified)_China.936 ## [2] LC_CTYPE=Chinese (Simplified)_China.936 ## [3] LC_MONETARY=Chinese (Simplified)_China.936 ## [4] LC_NUMERIC=C ## [5] LC_TIME=Chinese (Simplified)_China.936 ## ## attached base packages: ## [1] stats4 parallel stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] patchwork_1.1.1 GSVA_1.38.2 GSEABase_1.52.1 ## [4] graph_1.68.0 annotate_1.68.0 XML_3.99-0.7 ## [7] AnnotationDbi_1.52.0 IRanges_2.24.1 S4Vectors_0.28.1 ## [10] Biobase_2.50.0 BiocGenerics_0.36.1 clusterProfiler_3.18.1 ## [13] AUCell_1.12.0 ggpubr_0.4.0 readxl_1.3.1 ## [16] dplyr_1.0.7 png_0.1-7 ggplot2_3.3.5 ## [19] SeuratObject_4.0.2 Seurat_4.0.2 ## ## loaded via a namespace (and not attached): ## [1] utf8_1.2.2 reticulate_1.20 ## [3] R.utils_2.10.1 tidyselect_1.1.1 ## [5] RSQLite_2.2.8 htmlwidgets_1.5.4 ## [7] BiocParallel_1.24.1 grid_4.0.3 ## [9] Rtsne_0.15 scatterpie_0.1.7 ## [11] munsell_0.5.0 codetools_0.2-16 ## [13] ragg_1.2.2 ica_1.0-2 ## [15] future_1.22.1 miniUI_0.1.1.1 ## [17] withr_2.4.2 GOSemSim_2.16.1 ## [19] colorspace_2.0-2 highr_0.9 ## [21] knitr_1.34 ROCR_1.0-11 ## [23] ggsignif_0.6.3 tensor_1.5 ## [25] DOSE_3.16.0 listenv_0.8.0 ## [27] MatrixGenerics_1.2.1 labeling_0.4.2 ## [29] GenomeInfoDbData_1.2.4 polyclip_1.10-0 ## [31] bit64_4.0.5 farver_2.1.0 ## [33] downloader_0.4 parallelly_1.28.1 ## [35] vctrs_0.3.8 generics_0.1.0 ## [37] xfun_0.25 R6_2.5.1 ## [39] GenomeInfoDb_1.26.7 graphlayouts_0.7.1 ## [41] openintro_2.2.0 fgsea_1.16.0 ## [43] DelayedArray_0.16.3 bitops_1.0-7 ## [45] spatstat.utils_2.2-0 cachem_1.0.6 ## [47] assertthat_0.2.1 promises_1.2.0.1 ## [49] cherryblossom_0.1.0 scales_1.1.1 ## [51] ggraph_2.0.5 enrichplot_1.10.2 ## [53] gtable_0.3.0 globals_0.14.0 ## [55] goftest_1.2-2 tidygraph_1.2.0 ## [57] rlang_0.4.11 systemfonts_1.0.4 ## [59] splines_4.0.3 rstatix_0.7.0 ## [61] lazyeval_0.2.2 spatstat.geom_2.3-2 ## [63] broom_0.7.9 BiocManager_1.30.16 ## [65] yaml_2.2.1 reshape2_1.4.4 ## [67] abind_1.4-5 backports_1.2.1 ## [69] httpuv_1.6.2 qvalue_2.22.0 ## [71] tools_4.0.3 ellipsis_0.3.2 ## [73] spatstat.core_2.3-0 jquerylib_0.1.4 ## [75] RColorBrewer_1.1-2 ggridges_0.5.3 ## [77] Rcpp_1.0.7 plyr_1.8.6 ## [79] zlibbioc_1.36.0 purrr_0.3.4 ## [81] RCurl_1.98-1.4 rpart_4.1-15 ## [83] deldir_1.0-6 viridis_0.6.1 ## [85] pbapply_1.5-0 cowplot_1.1.1 ## [87] zoo_1.8-9 SummarizedExperiment_1.20.0 ## [89] haven_2.4.3 ggrepel_0.9.1 ## [91] cluster_2.1.0 magrittr_2.0.1 ## [93] data.table_1.14.2 RSpectra_0.16-0 ## [95] scattermore_0.7 DO.db_2.9 ## [97] openxlsx_4.2.4 lmtest_0.9-38 ## [99] RANN_2.6.1 airports_0.1.0 ## [101] fitdistrplus_1.1-6 matrixStats_0.60.1 ## [103] hms_1.1.0 mime_0.11 ## [105] evaluate_0.14 xtable_1.8-4 ## [107] rio_0.5.27 gridExtra_2.3 ## [109] compiler_4.0.3 tibble_3.1.5 ## [111] shadowtext_0.1.0 KernSmooth_2.23-17 ## [113] crayon_1.4.1 R.oo_1.24.0 ## [115] htmltools_0.5.2 segmented_1.3-4 ## [117] ggfun_0.0.4 mgcv_1.8-33 ## [119] later_1.3.0 tzdb_0.1.2 ## [121] tidyr_1.1.4 DBI_1.1.1 ## [123] tweenr_1.0.2 MASS_7.3-53 ## [125] Matrix_1.3-4 car_3.0-11 ## [127] readr_2.1.1 R.methodsS3_1.8.1 ## [129] igraph_1.2.6 GenomicRanges_1.42.0 ## [131] forcats_0.5.1 pkgconfig_2.0.3 ## [133] rvcheck_0.1.8 foreign_0.8-80 ## [135] plotly_4.10.0 spatstat.sparse_2.0-0 ## [137] bslib_0.3.1 XVector_0.30.0 ## [139] stringr_1.4.0 digest_0.6.27 ## [141] sctransform_0.3.2 RcppAnnoy_0.0.19 ## [143] spatstat.data_2.1-0 fastmatch_1.1-3 ## [145] rmarkdown_2.10 cellranger_1.1.0 ## [147] leiden_0.3.9 usdata_0.2.0 ## [149] uwot_0.1.10 kernlab_0.9-29 ## [151] curl_4.3.2 shiny_1.7.1 ## [153] lifecycle_1.0.1 nlme_3.1-149 ## [155] jsonlite_1.7.2 carData_3.0-4 ## [157] viridisLite_0.4.0 fansi_0.5.0 ## [159] pillar_1.6.4 lattice_0.20-41 ## [161] GO.db_3.12.1 fastmap_1.1.0 ## [163] httr_1.4.2 survival_3.2-7 ## [165] glue_1.4.2 zip_2.2.0 ## [167] bit_4.0.4 mixtools_1.2.0 ## [169] ggforce_0.3.3 stringi_1.7.4 ## [171] sass_0.4.0 blob_1.2.2 ## [173] textshaping_0.3.6 memoise_2.0.0 ## [175] irlba_2.3.3 future.apply_1.8.1 5.總結 我們可以看到,這些方法所得到的結果均具有很好的一致性;在這個前提下,我們應該選擇哪個方法可能是值得商榷的問題。 ① PercentageFeatureSet函數并未對結果進行處理,raw結果的比較分析可能會受到離散值的影響,所以個人覺得可能不是很客觀;相比之下,其他方法均不同程度的去除了離散值的影響,相對來說更為客觀。 ② 值得一提的是,GSEA使用的情況與其他函數/算法并不相同,其首先需要進行差異基因的計算,然后再去觀察針對特定基因集差異基因所富集的情況。其他算法則不需要進行差異基因計算,但總而言之,評分/富集程度的差異也同時反映了不同細胞/評估對象基因表達的差異。萬變不離其宗。 你覺得那種方法更簡單呢? 如果是小編,我會更喜歡AddModuleScore函數。一行搞定,不耗時!耗時短除了PercentageFeatureSet函數可以相媲美(當然缺點很明顯),其他方法相對AddModuleScore來說都過于復雜了。 不短了,今天就總結到這里吧,希望這個教程能幫到你們這些愛學習生信的小可愛們,一起加油吧~~ |
|