Advertisement

TCGA-HCC-差异分析-富集分析-pathview

阅读量:
复制代码
    library(pathview)
    library(TCGAbiolinks)
    library(SummarizedExperiment)
    library(clusterProfiler)
    #下载HCC的Gene expression文件
    query <- GDCquery(project = "TCGA-LIHC",
                  data.category = "Gene expression",
                  data.type = "Gene expression quantification",
                  platform = "Illumina HiSeq", 
                  file.type  = "results", 
                  sample.type = c("normal"),
                  legacy = TRUE)
    
    GDCdownload(query)
    #获得表达文件
    HCC.exp <- GDCprepare(query, 
                      save = TRUE, 
                      summarizedExperiment = TRUE, 
                      save.filename = "LIHCIllumina_HiSeq.rda")
    
    dataPrep_HCC <- TCGAanalyze_Preprocessing(object = HCC.exp,
                                          cor.cut = 0.6,    
                                          datatype = "raw_count",
                                          filename = "HCC_IlluminaHiSeq_RNASeqV2.png")
    dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep_HCC,
                                      geneInfo = TCGAbiolinks::geneInfo,
                                      method = "gcContent") #18323   672
    
    dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
                                  method = "quantile",
                                  qnt.cut =  0.25)
    HCC_clin <- GDCquery_clinic(project = "TCGA-LIHC", type = "Clinical")
    dataFiltHCC <- subset(dataFilt, 
                      select = substr(colnames(dataFilt),1,12) %in% HCC_clin$bcr_patient_barcode)
    
    
    
    #下载正常的样本
    query <- GDCquery(project = "TCGA-LIHC",
                  data.category = "Gene expression",
                  data.type = "Gene expression quantification",
                  platform = "Illumina HiSeq", 
                  file.type  = "results", 
                  sample.type = c("Solid Tissue Normal"),
                  legacy = TRUE)
    
    GDCdownload(query)
    #获得表达文件
    HCC.normal.exp <- GDCprepare(query, 
                      save = TRUE, 
                      summarizedExperiment = TRUE, 
                      save.filename = "LIHC_normal_Illumina_HiSeq.rda")
    dataPrep_HCC_normal <- TCGAanalyze_Preprocessing(object = HCC.normal.exp,
                                          cor.cut = 0.6,    
                                          datatype = "raw_count",
                                          filename = "HCC_noraml_IlluminaHiSeq_RNASeqV2.png")
    dataNorm_normal <- TCGAanalyze_Normalization(tabDF = dataPrep_HCC_normal,
                                      geneInfo = TCGAbiolinks::geneInfo,
                                      method = "gcContent") 
    dataFilt_normal <- TCGAanalyze_Filtering(tabDF = dataNorm_normal,
                                  method = "quantile",
                                  qnt.cut =  0.25) 
    #取正常样本与肿瘤样本中相同的gene交集
    c <- intersect(row.names(dataFiltHCC),row.names(dataFilt_normal))
    dim(dataFiltHCC)
    class(dataFiltHCC)
    dataFiltHCC <- as.data.frame(dataFiltHCC)
    dataFiltHCC$geneid <- row.names(dataFiltHCC)
    library(dplyr)
    length(c)
    dataFiltHCC1 <- dataFiltHCC %>%
      filter(geneid %in% c)
    row.names(dataFiltHCC1) <- dataFiltHCC1$geneid
    names(dataFiltHCC1)
    dataFiltHCC1 <- dataFiltHCC1[,-372]
    dataFiltHCC1 <- as.matrix(dataFiltHCC1)
    
    dataFilt_normal <- as.data.frame(dataFilt_normal)
    dataFilt_normal$geneid <- row.names(dataFilt_normal)
    names(dataFilt_normal)
    dataFilt_normal1 <- dataFilt_normal %>%
      filter(geneid %in% c)
    row.names(dataFilt_normal1) <- dataFilt_normal1$geneid
    colnames(dataFilt_normal1)
    row.names(dataFilt_normal1)
    dataFilt_normal1 <- dataFilt_normal1[,-51]
    dataFilt_normal1 <- as.matrix(dataFilt_normal1)
    row.names(dataFilt_normal1)
    #排序
    dataFilt_normal1 <- dataFilt_normal1[order(row.names(dataFilt_normal1),decreasing=FALSE),]
    dataFiltHCC1 <- dataFiltHCC1[order(row.names(dataFiltHCC1),decreasing=FALSE),]
    #  should be TRUE,TRUE说明两个矩阵的行名一模一样
    all(row.names(dataFilt_normal1) == row.names(dataFiltHCC1))
    #差异分析
    dataDEGs <- TCGAanalyze_DEA(mat1 = dataFiltHCC1,
                            mat2 = dataFilt_normal1,
                            Cond1type = "Tumor",
                            Cond2type = "Normal",
                            fdr.cut = 0.01 ,
                            logFC.cut = 1,
                            method = "glmLRT")
    > head(dataDEGs)
           logFC   logCPM        LR       PValue          FDR
    155060 -2.571642 2.321918 156.36454 7.047070e-36 7.143353e-35
    90288  -1.676365 1.398311  28.34992 1.012510e-07 1.675663e-07
    A4GALT -1.396513 2.638181  53.84503 2.169434e-13 4.951607e-13
    AACS   -1.616439 3.692283 117.12599 2.693991e-27 1.477036e-26
    AADAT   2.179292 3.500549 135.76657 2.244217e-31 1.638009e-30
    AAGAB  -1.150286 4.838394 244.67963 3.752987e-55 2.604312e-53
    > dim(dataDEGs)
    [1] 5461    5
    
    dataDEGs$ID <- row.names(dataDEGs)
    dataDEGs <- dataDEGs[,-6]
    d <- intersect(MethylMixResults$MethylationDrivers,dataDEGs$ID )
    HCC_GO <- dataDEGs %>%
      dplyr::filter(ID %in% d )
    HCC_GO <- data.frame(ID = HCC_GO $ID,logFC =HCC_GO $logFC)
    library(org.Hs.eg.db)
    library(topGO)
    library(GSEABase)
    library(clusterProfiler)
    #将txt中的一行或一列标成一个向量
    x <- HCC_GO$ID #total_RNA是一个只有一行基因标准名的txt文件
    #将标准的gene转化为ENTREZID格式
    eg = bitr(x, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
    #进行GO富集
    
    ego <- enrichGO(gene         = eg$ENTREZID,
                OrgDb         = org.Hs.eg.db,
                ont           = "ALL",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.2,
                qvalueCutoff  = 0.2,
                readable = T)
    df_ego <- as.data.frame(ego)
    write.table(df_ego,"sample.csv",sep=",")
    
    sample <- read.csv("I:/test/sample.csv", stringsAsFactors=FALSE)
    #生成GOplot图
    library(GOplot)
    # Generate the plotting object
    circ <- circle_dat(sample, HCC_GO)
    #sample的列名为 :Category ID Term Genes adj_pval
    #GO_log的列名为:ID logFC
    #绘制GOplot
    chord <- chord_dat(data = circ, genes = HCC_GO, process = sample$Term)
    GOChord(chord, space = 0.02, gene.order = 'logFC',gene.space = 0.25, gene.size = 2,process.label = 6)
    
    # DEGs TopTable
    dataDEGsFiltLevel <- TCGAanalyze_LevelTab(dataDEGs,"Tumor","Normal",
                                          dataFiltHCC1[,colnames(dataFiltHCC1)],
                                          dataFilt_normal1[,colnames(dataFilt_normal1)])
    dataDEGsFiltLevel$GeneID <- 0
    > head(dataDEGsFiltLevel)
       mRNA     logFC          FDR      Tumor    Normal     Delta GeneID
    ALB     ALB  1.159244 7.042187e-16 2491918.90 5975905.3 2888743.2      0
    HP       HP  1.738657 1.612502e-16  640987.55 2297407.6 1114457.4      0
    FTL     FTL -1.144761 3.796445e-10  515336.83  250421.6  589937.3      0
    GLUL   GLUL -2.998764 5.611403e-14  187833.75   25261.8  563269.1      0
    SPP1   SPP1 -5.972776 6.830736e-21   71733.61    1227.8  428448.7      0
    APOA1 APOA1  1.226569 1.459487e-09  294893.27  740950.3  361706.8      0
    
    dataDEGsFiltLevel$ID <- row.names(dataDEGsFiltLevel)
    head(dataDEGsFiltLevel)
    > head(dataDEGsFiltLevel)
       mRNA     logFC          FDR      Tumor    Normal     Delta GeneID
    ALB     ALB  1.159244 7.042187e-16 2491918.90 5975905.3 2888743.2      0
    HP       HP  1.738657 1.612502e-16  640987.55 2297407.6 1114457.4      0
    FTL     FTL -1.144761 3.796445e-10  515336.83  250421.6  589937.3      0
    GLUL   GLUL -2.998764 5.611403e-14  187833.75   25261.8  563269.1      0
    SPP1   SPP1 -5.972776 6.830736e-21   71733.61    1227.8  428448.7      0
    APOA1 APOA1  1.226569 1.459487e-09  294893.27  740950.3  361706.8      0
    dataDEGsFiltLevel <- dataDEGsFiltLevel[,-8]
    dataDEGsFiltLevel1 <- dataDEGsFiltLevel %>%
      dplyr::filter(mRNA %in% d)
    
    a <- c(dataDEGsFiltLevel$mRNA)
    dataDEGsFiltLevel$mRNA <- a
    
    dim(dataDEGsFiltLevel1)
    
    # Converting Gene symbol to geneID
    eg = as.data.frame(bitr(dataDEGsFiltLevel1$mRNA,
                        fromType="SYMBOL",
                        toType="ENTREZID",
                        OrgDb="org.Hs.eg.db"))
    eg <- eg[!duplicated(eg$SYMBOL),]
    
    
    dataDEGsFiltLevel1 <- dataDEGsFiltLevel1[dataDEGsFiltLevel1$mRNA %in% eg$SYMBOL,]
    
    dataDEGsFiltLevel1 <- dataDEGsFiltLevel1[order(dataDEGsFiltLevel1$mRNA,decreasing=FALSE),]
    eg <- eg[order(eg$SYMBOL,decreasing=FALSE),]
    
    # table(eg$SYMBOL == dataDEGsFiltLevel$mRNA) should be TRUE
    all(eg$SYMBOL == dataDEGsFiltLevel1$mRNA)
    
    
    dataDEGsFiltLevel1$GeneID <- eg$ENTREZID
    
    dataDEGsFiltLevel_sub <- subset(dataDEGsFiltLevel1, select = c("GeneID", "logFC"))
    genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC)
    names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID
    
    library(pathview)
    # pathway.id: hsa05214 is the glioma pathway
    # limit: sets the limit for gene expression legend and color
    > head(genelistDEGs)
     4363       215     84519     54988    113622     60312 
    -1.731574 -1.392554 -1.294594  1.384184 -1.312951 -1.418266 
    hsa05225 <- pathview::pathview(gene.data  = genelistDEGs,
                               pathway.id = "hsa05225",
                               species    = "hsa",
                               limit = list(gene=as.integer(max(abs(genelistDEGs)))))

全部评论 (0)

还没有任何评论哟~