Advertisement

差异基因富集分析(R语言——GO&KEGG&GSEA)

阅读量:

随后上期内容中提到

1.准备差异基因集

我直接将上次分享的内容带到此处。在分析过程中,默认将差异基因划分为上调和下调两类,并进行通路富集分析。接下来编写代码时可能会融入一些个人工作习惯,请稍候。具体筛选标准则由个人需求决定。

复制代码
    ##载入所需R包
    library(readxl)
    library(DOSE)
    library(org.Hs.eg.db)
    library(topGO)
    library(pathview)
    library(ggplot2)
    library(GSEABase)
    library(limma) 
    library(clusterProfiler)
    library(enrichplot)
    
    
    ##edger
    edger_diff <- diff_gene_Group
    edger_diff_up <- rownames(edger_diff[which(edger_diff$logFC > 0.584962501),])
    edger_diff_down <- rownames(edger_diff[which(edger_diff$logFC < -0.584962501),])
    
    ##deseq2
    deseq2_diff <- diff_gene_Group2
    deseq2_diff_up <- rownames(deseq2_diff[which(deseq2_diff$log2FoldChange > 0.584962501),])
    deseq2_diff_down <- rownames(deseq2_diff[which(deseq2_diff$log2FoldChange < -0.584962501),])
    
    ##将差异基因集保存为一个list
    gene_diff_edger_deseq2 <- list()
    gene_diff_edger_deseq2[["edger_diff_up"]] <- edger_diff_up
    gene_diff_edger_deseq2[["edger_diff_down"]] <- edger_diff_down
    gene_diff_edger_deseq2[["deseq2_diff_up"]] <- deseq2_diff_up
    gene_diff_edger_deseq2[["deseq2_diff_down"]] <- deseq2_diff_down
2.进行通路富集分析

本节将涵盖普通GO&KEGG&GSEA分析中的基本原理及操作步骤,并结合实际案例进行阐述。在具体实施时, 筛选显著富集通路的标准并非唯一, 通常由研究者的具体需求来决定其筛选标准, 一般设定为矫正后的P值小于等于0.05. 为了提高效率而编写了各列表循环处理代码, 这样可以避免逐一分析每个基因组学数据表带来的繁琐工作

复制代码
    for (i in 1:length(gene_diff_edger_deseq2)){
      keytypes(org.Hs.eg.db)
      
      entrezid_all = mapIds(x = org.Hs.eg.db,  
                        keys = gene_diff_edger_deseq2[[i]], 
                        keytype = "SYMBOL", #输入数据的类型
                        column = "ENTREZID")#输出数据的类型
      entrezid_all  = na.omit(entrezid_all)  #na省略entrezid_all中不是一一对应的数据情况
      entrezid_all = data.frame(entrezid_all) 
      
      ##GO富集##
      GO_enrich = enrichGO(gene = entrezid_all[,1],
                       OrgDb = org.Hs.eg.db, 
                       keyType = "ENTREZID", #输入数据的类型
                       ont = "ALL", #可以输入CC/MF/BP/ALL
                       #universe = 背景数据集我没用到它。
                       pvalueCutoff = 1,qvalueCutoff = 1, #表示筛选的阈值,阈值设置太严格可导致筛选不到基因。可指定 1 以输出全部
                       readable = T) #是否将基因ID映射到基因名称。
      
      GO_enrich_data  = data.frame(GO_enrich)
      write.csv(GO_enrich_data,paste('GO_enrich_',names(gene_diff_edger_deseq2)[i], '.csv', sep = ""))
      GO_enrich_data <- GO_enrich_data[which(GO_enrich_data$p.adjust < 0.05),]
      write.csv(GO_enrich_data,paste('GO_enrich_',names(gene_diff_edger_deseq2)[i], '_filter.csv', sep = ""))
      
      
      ###KEGG富集分析###
      KEGG_enrich = enrichKEGG(gene = entrezid_all[,1], #即待富集的基因列表
                           keyType = "kegg",
                           pAdjustMethod = 'fdr',  #指定p值校正方法
                           organism= "human",  #hsa,可根据你自己要研究的物种更改,可在https://www.kegg.jp/brite/br08611中寻找
                           qvalueCutoff = 1, #指定 p 值阈值(可指定 1 以输出全部)
                           pvalueCutoff=1) #指定 q 值阈值(可指定 1 以输出全部)
      KEGG_enrich_data  = data.frame(KEGG_enrich)
      write.csv(KEGG_enrich_data, paste('KEGG_enrich_',names(gene_diff_edger_deseq2)[i], '.csv', sep = ""))
      KEGG_enrich_data <- KEGG_enrich_data[which(KEGG_enrich_data$p.adjust < 0.05),]
      write.csv(KEGG_enrich_data, paste('KEGG_enrich_',names(gene_diff_edger_deseq2)[i], '_filter.csv', sep = ""))
    }
3.通路富集情况可视化

这里只介绍一种简单的气泡图,当然还有其他的自己去了解吧。

复制代码
    ##GO&KEGG富集BPCCMFKEGG分面绘图需要分开处理一下,富集结果里的ONTOLOGYL列修改
    GO_enrich_data_BP <- subset(GO_enrich_data, subset = GO_enrich_data$ONTOLOGY == "BP")
    GO_enrich_data_CC <- subset(GO_enrich_data, subset = GO_enrich_data$ONTOLOGY == "CC")
    GO_enrich_data_MF <- subset(GO_enrich_data, subset = GO_enrich_data$ONTOLOGY == "MF")
    
    ##提取GO富集BPCCMF的top5
    GO_enrich_data_filter <- rbind(GO_enrich_data_BP[1:5,], GO_enrich_data_CC[1:5,], GO_enrich_data_MF[1:5,])
    
    ##重新整合进富集结果
    GO_enrich@result <- GO_enrich_data_filter
    
    ##处理KEGG富集结果
    KEGG_enrich@result <- KEGG_enrich_data
    ncol(KEGG_enrich@result)
    KEGG_enrich@result$ONTOLOGY <- "KEGG"
    KEGG_enrich@result <- KEGG_enrich@result[,c(10,1:9)]
    
    ##整合GO KEGG富集结果
    ego_GO_KEGG <- GO_enrich
    ego_GO_KEGG@result <- rbind(ego_GO_KEGG@result, KEGG_enrich@result[1:5,])
    ego_GO_KEGG@result$ONTOLOGY <- factor(ego_GO_KEGG@result$ONTOLOGY, levels = c("BP", "CC", "MF","KEGG"))##规定分组顺序
    
    ##简单画图
    pdf("edger_diff_up_dotplot.pdf", width = 7, height = 7)
    dotplot(ego_GO_KEGG, split = "ONTOLOGY", title="UP-GO&KEGG", label_format = 60, color = "pvalue") + 
      facet_grid(ONTOLOGY~., scale = "free_y")+
      theme(plot.title = element_text(hjust = 0.5, size = 13, face = "bold"), axis.text.x = element_text(angle = 90, hjust = 1))
    dev.off()
4.气泡图如图所示

进行了初步处理后,在视觉效果上采用了真实图片的形式展现;值得注意的是,在左侧Pathway与右侧气泡之间存在一一对应的关系;除此之外,在可视化方法上还有更多可能性等待我们去发现;感谢您的关注!

5.GSEA富集分析

这里也是做一下简单的GSEA

复制代码
    ##GSEA官方网站下载背景gmt文件并读入
    geneset <- list()
    geneset[["c2_cp"]] <- read.gmt("c2.cp.v2023.2.Hs.symbols.gmt")
    geneset[["c2_cp_kegg_legacy"]] <- read.gmt("c2.cp.kegg_legacy.v2023.2.Hs.symbols.gmt")
    geneset[["c2_cp_kegg_medicus"]] <- read.gmt("c2.cp.kegg_medicus.v2023.2.Hs.symbols.gmt")
    geneset[["c2_cp_reactome"]] <- read.gmt("c2.cp.reactome.v2023.2.Hs.symbols.gmt")
    geneset[["c3_tft"]] <- read.gmt("c3.tft.v2023.2.Hs.symbols.gmt")
    geneset[["c4_cm"]] <- read.gmt("c4.cm.v2023.2.Hs.symbols.gmt")
    geneset[["c5_go_bp"]] <- read.gmt("c5.go.bp.v2023.2.Hs.symbols.gmt")
    geneset[["c5_go_cc"]] <- read.gmt("c5.go.cc.v2023.2.Hs.symbols.gmt")
    geneset[["c5_go_mf"]] <- read.gmt("c5.go.mf.v2023.2.Hs.symbols.gmt")
    geneset[["c6"]] <- read.gmt("c6.all.v2023.2.Hs.symbols.gmt")
    geneset[["c7"]] <- read.gmt("c7.all.v2023.2.Hs.symbols.gmt")
    
    ##进行GSEA富集分析,这里也是写了个循环
    gsea_results <- list()
    for (i in names(gene_diff)){
      geneList <- gene_diff[[i]]$logFC
      names(geneList) <- toupper(rownames(gene_diff[[i]]))
      geneList <- sort(geneList,decreasing = T)
      for (j in names(geneset)){
    listnames <- paste(i,j,sep = "_")
    gsea_results[[listnames]] <- GSEA(geneList = geneList,
                         TERM2GENE = geneset[[j]],
                         verbose = F,
                         pvalueCutoff = 0.1,
                         pAdjustMethod = "none",
                         eps=0)
      }
    }
    
    ##批量绘图,注意这里如果有空富集通路,会报错!
    for (j in 1:nrow(gsea_results[[i]]@result)) {
    p <- gseaplot2(x=gsea_results[[i]],geneSetID=gsea_results[[i]]@result$ID[j], title = 
    gsea_results[[i]]@result$ID[j]) 
    
    pdf(paste(paste(names(gsea_results)[i], gsea_results[[i]]@result$ID[j], sep = 
    "_"),".pdf",sep = ""))
    print(p)
    dev.off()
      }

6.GSEA富集最简单图形如下

分享到此结束了,希望对大家有所帮助。

全部评论 (0)

还没有任何评论哟~