Advertisement

KEGG基因富集分析

阅读量:
复制代码
 #kegg:基因功能存储在pathway数据库里

    
 rm(list = ls())  ## 魔幻操作,一键清空~
    
  
    
 load(file = 'deg.Rdata')
    
  
    
 head(deg)
    
  
    
 ## 不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。
    
  
    
 logFC_t=1.5
    
 deg=nrDEG
    
  
    
 deg$g=ifelse(deg$P.Value>0.05,'stable',
    
          
    
          ifelse( deg$logFC > logFC_t,'UP',
    
                  
    
                  ifelse( deg$logFC < -logFC_t,'DOWN','stable') )
    
          
    
 )
    
  
    
 table(deg$g)
    
  
    
 head(deg)
    
  
    
 deg$symbol=rownames(deg)
    
  
    
 library(ggplot2)
    
  
    
 library(clusterProfiler)
    
  
    
 library(org.Hs.eg.db)
    
  
    
 df <- bitr(unique(deg$symbol), fromType = "SYMBOL",
    
        
    
        toType = c( "ENTREZID"),
    
        
    
        OrgDb = org.Hs.eg.db)
    
  
    
 head(df)
    
  
    
 DEG=deg
    
  
    
 head(DEG)
    
  
    
 #merge函数:将两个数据集合并
    
 #用于指定依据哪个列合并,常用于当两个数据集公共列名不一样的时候;
    
 DEG=merge(DEG,df,by.y='SYMBOL',by.x='symbol')
    
  
    
 head(DEG)
    
 #标注好的差异基因
    
 save(DEG,file = 'anno_DEG.Rdata')
    
  
    
  
    
  
    
  
    
  
    
 gene_up= DEG[DEG$g == 'UP','ENTREZID'] 
    
  
    
 gene_down=DEG[DEG$g == 'DOWN','ENTREZID'] 
    
  
    
 gene_diff=c(gene_up,gene_down)
    
  
    
 gene_all=as.character(DEG[ ,'ENTREZID'] )
    
  
    
 data(geneList, package="DOSE")
    
  
    
 head(geneList)
    
  
    
 boxplot(geneList)
    
  
    
 boxplot(DEG$logFC)
    
  
    
  
    
  
    
 geneList=DEG$logFC
    
  
    
 names(geneList)=DEG$ENTREZID
    
  
    
 geneList=sort(geneList,decreasing = T)
    
  
    
  
    
  
    
  
    
  
    
 ## KEGG pathway analysis
    
  
    
 ### 做KEGG数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
    
 library(org.Hs.eg.db)
    
 library("clusterProfiler")
    
 if(T){
    
   
    
   ###   over-representation test 超几何分布检验分析
    
   #kegg基因富集
    
   kk.up <- enrichKEGG(gene         = gene_up,
    
                   
    
                   organism     = 'hsa',
    
                   
    
                   universe     = gene_all,
    
                   
    
                   pvalueCutoff = 0.9,
    
                   
    
                   qvalueCutoff =0.9)
    
   
    
   head(kk.up)[,1:6]
    
   
    
   dotplot(kk.up )
    
   library(ggplot2)
    
   ggsave('kk.up.dotplot.png')
    
   #分析下调基因
    
   kk.down <- enrichKEGG(gene         =  gene_down,
    
                     
    
                     organism     = 'hsa',
    
                     
    
                     universe     = gene_all,
    
                     
    
                     pvalueCutoff = 0.9,
    
                     
    
                     qvalueCutoff =0.9)
    
   
    
   head(kk.down)[,1:6]
    
   
    
   dotplot(kk.down );ggsave('kk.down.dotplot.png')
    
   
    
   kk.diff <- enrichKEGG(gene         = gene_diff,
    
                     
    
                     organism     = 'hsa',
    
                     
    
                     pvalueCutoff = 0.05)
    
   
    
   head(kk.diff)[,1:6]
    
   
    
   dotplot(kk.diff );ggsave('kk.diff.dotplot.png')
    
   
    
  
    
   kegg_diff_dt <- as.data.frame(kk.diff)
    
   
    
   kegg_down_dt <- as.data.frame(kk.down)
    
   
    
   kegg_up_dt <- as.data.frame(kk.up)
    
   
    
   down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,]
    
   down_kegg$group=-1
    
   
    
   up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,]
    
   up_kegg$group=1
    
   #调用其他文件中的函数
    
   source('functions.R')
    
   
    
   g_kegg=kegg_plot(up_kegg,down_kegg)
    
   
    
   print(g_kegg)
    
   
    
   
    
   
    
   ggsave(g_kegg,filename = 'kegg_up_down.png')
    
   
    
   
    
   
    
   ###  GSEA 基因富集
    
   
    
   kk_gse <- gseKEGG(geneList     = geneList,
    
                 
    
                 organism     = 'hsa',
    
                 
    
                 nPerm        = 1000,
    
                 
    
                 minGSSize    = 120,
    
                 
    
                 pvalueCutoff = 0.9,
    
                 
    
                 verbose      = FALSE)
    
   
    
   head(kk_gse)[,1:6]
    
   
    
   gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
    
   
    
   #处理数据,挑选一部分数据
    
   
    
   down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,]
    
   down_kegg$group=-1
    
   
    
   up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,]
    
   up_kegg$group=1
    
   
    
   
    
   
    
   g_kegg=kegg_plot(up_kegg,down_kegg)
    
   
    
   print(g_kegg)
    
   
    
   ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
    
   
    
   
    
   
    
   
    
   
    
 }
    
  
    
  
    
  
    
 ### GO database analysis 
    
  
    
 ### 做GO数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
    
  
    
 {
    
   
    
   
    
   
    
   g_list=list(gene_up=gene_up,
    
           
    
           gene_down=gene_down,
    
           
    
           gene_diff=gene_diff)
    
   
    
   
    
   
    
   if(F){
    
     
    
     go_enrich_results <- lapply( g_list , function(gene) {
    
       
    
     ego <- enrichGO(gene          = gene,
    
                     
    
                     universe      = gene_all,
    
                     
    
                     OrgDb         = org.Hs.eg.db,
    
                     
    
                     ont           = ont ,
    
                     
    
                     pAdjustMethod = "BH",
    
                     
    
                     pvalueCutoff  = 0.99,
    
                     
    
                     qvalueCutoff  = 0.99,
    
                     
    
                     readable      = TRUE)
    
     
    
     
    
     
    
     print( head(ego) )
    
     
    
     return(ego)
    
     
    
       })
    
       
    
     })
    
     
    
     save(go_enrich_results,file = 'go_enrich_results.Rdata')
    
     
    
     
    
     
    
   
    
     
    
   
    
   
    
   
    
   
    
   
    
   load(file = 'go_enrich_results.Rdata')
    
   
    
   
    
   
    
   n1= c('gene_up','gene_down','gene_diff')
    
   
    
   n2= c('BP','MF','CC') 
    
   
    
   for (i in 1:3){
    
     
    
     for (j in 1:3){
    
       
    
       fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
    
       
    
       cat(paste0(fn,'\n'))
    
       
    
       png(fn,res=150,width = 1080)
    
       
    
       print( dotplot(go_enrich_results[[i]][[j]] ))
    
       
    
       dev.off()
    
       
    
     }
    
     
    
   }
    
   
    
   }
    
  
    
 #Step5:GSEA基因富集 
    
 library(ggplot2)
    
  
    
 library(clusterProfiler)
    
  
    
 library(org.Hs.eg.db)
    
  
    
  
    
  
    
 ### 对 MigDB中的全部基因集 做GSEA分析。
    
  
    
 # http://www.bio-info-trainee.com/2105.html
    
  
    
 # http://www.bio-info-trainee.com/2102.html 
    
  
    
 {
    
   
    
   load(file = 'anno_DEG.Rdata')
    
   
    
   geneList=DEG$logFC
    
   
    
   names(geneList)=DEG$symbol
    
   
    
   geneList=sort(geneList,decreasing = T)
    
   
    
   #选择gmt文件(MigDB中的全部基因集)
    
   
    
   d='./GEO-master/MsigDB/symbols'
    
   
    
   gmts <- list.files(d,pattern = 'all')
    
   
    
   gmts
    
   
    
   #GSEA分析
    
   
    
   library(GSEABase) # BiocManager::install('GSEABase')
    
   
    
   ## 下面使用lapply循环读取每个gmt文件,并且进行GSEA分析
    
   
    
   ## 如果存在之前分析后保存的结果文件,就不需要重复进行GSEA分析。
    
   
    
   f='gsea_results.Rdata'
    
   
    
   if(!file.exists(f)){
    
     #laaply循环读取,要求输入数据必须是list
    
     #laaply 高批量处理函数, 遍历列表向量内的每个元素,并且使用指定函数来对其元素进行处理。返回列表向量。
    
     
    
     gsea_results <- lapply(gmts, function(gmtfile){
    
       
    
       # gmtfile=gmts[2]
    
       
    
       geneset <- read.gmt(file.path(d,gmtfile)) 
    
       
    
       print(paste0('Now process the ',gmtfile))
    
       
    
       egmt <- GSEA(geneList, TERM2GENE=geneset, verbose=FALSE)
    
       
    
       head(egmt)
    
       
    
       # gseaplot(egmt, geneSetID = rownames(egmt[1,]))
    
       
    
       
    
       
    
       return(egmt)
    
       
    
     })
    
     
    
     # 上面的代码耗时,所以保存结果到本地文件
    
     
    
     save(gsea_results,file = f)
    
     
    
   }
    
   
    
   load(file = f)
    
   
    
   
    
   #提取gsea结果,熟悉这个对象
    
   
    
   gsea_results_list<- lapply(gsea_results, function(x){
    
     
    
     cat(paste(dim(x@result)),'\n')
    
     
    
     x@result
    
     
    
   })
    
   
    
   gsea_results_df <- do.call(rbind, gsea_results_list)
    
   
    
   gseaplot(gsea_results[[2]],'KEGG_CELL_CYCLE') 
    
   
    
   gseaplot(gsea_results[[2]],'FARMER_BREAST_CANCER_CLUSTER_6') 
    
   
    
   gseaplot(gsea_results[[5]],'GO_CONDENSED_CHROMOSOME_OUTER_KINETOCHORE') 
    
   
    
   
    
   
    
 }

全部评论 (0)

还没有任何评论哟~