Advertisement

单细胞RNA测序(scRNA-seq)Seurat分析流程入门

阅读量:

单细胞RNA测序(scRNA-seq)基础可查内容看以下文章:

单细胞RNA测序(scRNA-seq)工作流程入门

单细胞RNA测序(scRNA-seq)细胞分离与扩增

单细胞RNA测序(scRNA-seq)SRA数据下载及fastq-dumq数据拆分

单细胞RNA测序(scRNA-seq)Cellranger流程入门和数据质控

单细胞RNA测序(scRNA-seq)构建人类参考基因组注释

单细胞RNA测序(scRNA-seq)cellranger count的细胞定量和aggr整合

Seurat分析流程入门

1. 数据与R包准备

以下代码在RStudio中实现。

1.1 PMBC数据下载

下载2700个10X单细胞-外周血单核细胞(PBMC)数据集。

复制代码
    # Seurat_1.R
    ################### 1. 数据获取及读取 ###################
    # 切换至工作目录
    setwd("F:/scRNA-seq/Seurat4.0")
    
    # 下载PMBC数据
    download.file("https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz", 
              destfile = "pbmc3k_filtered_gene_bc_matrices.tar.gz")
    
    # 解压
    untar(tarfile = "pbmc3k_filtered_gene_bc_matrices.tar.gz", exdir = "./")
    
    # 查看当前目录文件
    dir()
    # [1] "filtered_gene_bc_matrices"     "pbmc3k_filtered_gene_bc_matrices.tar.gz"
    # [3] "Seurat_1.R"
    
    # 查看解压目录文件
    dir("./filtered_gene_bc_matrices/hg19")
    # [1] "barcodes.tsv" "genes.tsv"    "matrix.mtx"
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    

1.2 分析R包安装

复制代码
    # 选择清华镜像安装
    options("repos"=c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
    install.packages("BiocManager")
    
    BiocManager::install("dplyr")
    BiocManager::install("Seurat")
    BiocManager::install("patchwork")
    BiocManager::install("ggplot2")
    
    
      
      
      
      
      
      
      
      
    

2. 数据预处理

2.1 构建单细胞Seurat对象

Read10X函数读取,返回独特的分子识别(UMI)计数矩阵
矩阵中的行值表示每个功能(即基因)列值表示在每个细胞中检测到的分子数量

复制代码
    ################### 2. 数据预处理 ###################
    library(dplyr)
    library(Seurat)
    library(patchwork)
    
    # 读取当前目录 2700个10X单细胞-外周血单核细胞(PBMC)数据集
    pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19")
    
    # 查看部分数据
    pbmc.data[1:5, 1:3]
    # 5 x 3 sparse Matrix of class "dgCMatrix"
    # AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
    # MIR1302-10                  .                .                .
    # FAM138A                     .                .                .
    # OR4F5                       .                .                .
    # RP11-34P13.7                .                .                .
    # RP11-34P13.8                .                .                .
    
    
    # 创建Seurat对象
    # 每个基因至少在3个细胞中表达
    # 每个细胞至少检测到200个基因
    pbmc.obj <- CreateSeuratObject(counts = pbmc.data, 
                               project = "pbmc3k",
                               min.cell = 3,
                               min.features = 200)
    
    pbmc.obj
    # An object of class Seurat 
    # 13714 features across 2700 samples within 1 assay 
    # Active assay: RNA (13714 features, 0 variable features)
    # 1 layer present: counts
    
    # 细胞名称重命名
    pbmc.obj <- RenameCells(pbmc.obj, add.cell.id = "PBMC")
    
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    

2.2 单细胞数据质控

为分析得到高质量和可靠的单细胞数据,保证后续分析结果的准确度,通常会设置以下QC指标过滤细胞:

常见QC指标:

在每个细胞中检测到的基因数量
低质量细胞或空液滴通常只有很少的基因; 细胞双倍或多胞可能表现出异常高的基因计数。

细胞内检测到的分子总数 (与独特的基因密切相关)

细胞中的线粒体基因组的百分比
低质量/死细胞经常表现出线粒体污染;使用PercentageFeatureSet()函数计算线粒体 QC 指标。

2.2.1 通过细胞counts数和feature数的分布确定过滤条件阈值

复制代码
    head(pbmc.obj@meta.data)
    # orig.ident nCount_RNA nFeature_RNA percent.mt
    # PBMC_AAACATACAACCAC-1     pbmc3k       2419          779  3.0177759
    # PBMC_AAACATTGAGCTAC-1     pbmc3k       4903         1352  3.7935958
    # PBMC_AAACATTGATCAGC-1     pbmc3k       3147         1129  0.8897363
    # PBMC_AAACCGTGCTTCCG-1     pbmc3k       2639          960  1.7430845
    # PBMC_AAACCGTGTATGCG-1     pbmc3k        980          521  1.2244898
    # PBMC_AAACGCACTGGTAC-1     pbmc3k       2163          781  1.6643551
    
    # 细胞counts数和feature数的分布
    VlnPlot(pbmc.obj, features = c("nCount_RNA","nFeature_RNA"))
    
    
    
      
      
      
      
      
      
      
      
      
      
      
      
    
counts数和feature数的分布图

2.2.2 细胞线粒体基因和和核糖体基因含量获取

复制代码
    # 获取基因名以MT-开头的线粒体基因
    pbmc.obj[["percent.mt"]] <- PercentageFeatureSet(pbmc.obj,
                                                 pattern = "^MT-")
    
    head(pbmc.obj@meta.data, 5)
    # orig.ident nCount_RNA nFeature_RNA percent.mt
    # PBMC_AAACATACAACCAC-1       PMBC       2419          779  3.0177759
    # PBMC_AAACATTGAGCTAC-1       PMBC       4903         1352  3.7935958
    # PBMC_AAACATTGATCAGC-1       PMBC       3147         1129  0.8897363
    # PBMC_AAACCGTGCTTCCG-1       PMBC       2639          960  1.7430845
    # PBMC_AAACCGTGTATGCG-1       PMBC        980          521  1.2244898
    
    # 统计核糖体含量, 核糖体基因命名规则为"RPS"或"RPL"
    rRNA.genes <- rownames(pbmc.obj)[grep("^RP[SL]", 
                                      rownames(pbmc.obj),
                                      ignore.case=TRUE)]
    pbmc.obj$percent.rRNA <- PercentageFeatureSet(pbmc.obj, features = rRNA.genes)
    
    pbmc.obj[["percent.rRNA"]] <- PercentageFeatureSet(pbmc.obj, pattern = "^RP[SL]")
    
    head(pbmc.obj@meta.data)
    # orig.ident nCount_RNA nFeature_RNA percent.mt percent.rRNA
    # PBMC_AAACATACAACCAC-1       PMBC       2419          779  3.0177759     43.69574
    # PBMC_AAACATTGAGCTAC-1       PMBC       4903         1352  3.7935958     42.40261
    # PBMC_AAACATTGATCAGC-1       PMBC       3147         1129  0.8897363     31.68097
    # PBMC_AAACCGTGCTTCCG-1       PMBC       2639          960  1.7430845     24.25161
    # PBMC_AAACCGTGTATGCG-1       PMBC        980          521  1.2244898     14.89796
    # PBMC_AAACGCACTGGTAC-1       PMBC       2163          781  1.6643551     36.19972
    
    VlnPlot(pbmc.obj, features = c("percent.mt","percent.rRNA"), pt.size = 1)
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
细胞线粒体基因和和核糖体基因含量

2.2.3 过滤不符合质控指标的细胞

复制代码
    # 标题显示数字为相关性
    p1 <- FeatureScatter(pbmc.obj, 
                     feature1 = "nCount_RNA",
                     feature2 = "nFeature_RNA")
    p2 <- FeatureScatter(pbmc.obj, 
                     feature1 = "nCount_RNA", 
                     feature2 = "percent.mt")
    p1 + p2
    #相关性为0.95和-0.13
    
    
      
      
      
      
      
      
      
      
      
    
可视化QC指标散点图
复制代码
    # 使用subset函数过滤seurat对象
    # 过滤具有UMI计数超过 2500 或小于 200的细胞, >5%线粒体的细胞
    pbmc.obj<- subset(pbmc.obj, 
                      subset = nCount_RNA > 300 & nCount_RNA < 10000 & 
                        nFeature_RNA > 200 & nFeature_RNA < 2500 & 
                        percent.mt < 5)
                        
    pbmc.obj
    # An object of class Seurat 
    # 13714 features across 2638 samples within 1 assay 
    # Active assay: RNA (13714 features, 0 variable features)
    # 1 layer present: count
    
    
    # 过滤后可视化QC指标 "nFeature_RNA", "nCount_RNA", "percent.mt"
    VlnPlot(pbmc.obj, features = c("nFeature_RNA","nCount_RNA","percent.mt"),
        cols = 3, pt.size = 1)
    
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
 可视化QC指标

3 单细胞数据的标准化和均一化

标准化和均一化的处理目的是:

1. 使细胞间表达具有可比性
2. 使表达量分布符合统计学分布

3.1 数据标准化

最常用的方法是对测序深度进行标准化使每个细胞具有相同的reads数据量
Seurat默认的标准化方法为 LogNormalize : 以e为底数,log(每个细胞中基因的nCount_RNA / 该细胞内总Count*10000 + 1)

复制代码
    ################### 3. 数据标准化和均一化 ###################
    
    pbmc.obj <- NormalizeData(pbmc.obj, 
                          normalization.method = 'LogNormalize',
                          scale.factor = 10000)
    
    pbmc.obj@assays$RNA$data[0:5, 0:3]
    # 5 x 3 sparse Matrix of class "dgCMatrix"
    # PBMC_AAACATACAACCAC-1 PBMC_AAACATTGAGCTAC-1 PBMC_AAACATTGATCAGC-1
    # AL627309.1                        .                     .                     .
    # AP006222.2                        .                     .                     .
    # RP11-206L10.2                     .                     .                     .
    # RP11-206L10.9                     .                     .                     .
    # LINC00115                         .                     .                     .
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    

3.2 筛选高变基因HGVS

高变基因 : 计算数据集中显示高变异的特征子集 (即在某些细胞中表达强烈,在另一些细胞中表达得很低); 一般使用均值与方差之间的关系 来挑选高变基因。

为了减轻下游分析工具的计算负担、减少数据中的噪声并方便数据可视化, 在下游分析中关注这些基因有助于突出单细胞数据集中的生物信号 ,。

筛选高变基因方法:

vst (默认):首先利用loess回归对log(variance)和log(mean)拟合一条直线,然后利用观测均值和期望方差对基因表达量进行标准化,最后根据标准化后的表达量计算方差。

mean.var.plot (mvp): 首先分别计算每个基因的平均表达量和离散情况,然后根据平均表达量将基因们分散到一定数量(默认是20个)的小区间(bin)中,并且计算每个区间的z-score。

dispersion (disp): 挑选离差值最高的基因。

复制代码
    # 默认情况下,使用每个数据集的 2,000 个基因
    pbmc.obj <- FindVariableFeatures(pbmc.obj,
                                 selection.method = "vst",
                                 nfeatures = 2000)
    
    # 获取前10个高变基因
    top10.hgvs <- head(VariableFeatures(pbmc.obj), 10)
    # [1] "PPBP"   "LYZ"    "S100A9" "IGLL5"  "GNLY"   "FTL"    "PF4"    "FTH1"   "GNG11" 
    # [10] "S100A8"
    
    p3 <- VariableFeaturePlot(pbmc.obj)
    p4 <- LabelPoints(plot = p3, points = top10.hgvs, repel = TRUE)
    p3 + p4
    
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
高变基因HGVS

3.3 数据均一化

均一化对表达矩阵进行scale处理scale之后的矩阵每个基因表达均值为0 。 经过scale之后,所有基因的表达分布基本一致 ,有助于后续的降维聚类。

复制代码
    # 均一化
    all.genes <- rownames(pbmc.obj)
    
    pbmc.obj <- ScaleData(pbmc.obj, features = all.genes)
    
    pbmc.obj@assays$RNA$scale.data[0:5, 0:3]
    # PBMC_AAACATACAACCAC-1 PBMC_AAACATTGAGCTAC-1 PBMC_AAACATTGATCAGC-1
    # AL627309.1              -0.05812316           -0.05812316           -0.05812316
    # AP006222.2              -0.03357571           -0.03357571           -0.03357571
    # RP11-206L10.2           -0.04166819           -0.04166819           -0.04166819
    # RP11-206L10.9           -0.03364562           -0.03364562           -0.03364562
    # LINC00115               -0.08223981           -0.08223981           -0.08223981
    
    
    ####### 根据需求选择 ######
    # 移除细胞周期等因素影响
    s.genes <- cc.genes$s.genes
    g2m.genes <- cc.genes$g2m.genes
    
    pbmc.obj2 <- CellCycleScoring(pbmc.obj, s.features = s.genes, g2m.features = g2m.genes)
    head(pbmc.obj2@meta.data)
    # orig.ident nCount_RNA nFeature_RNA percent.mt percent.rRNA
    # PBMC_AAACATACAACCAC-1       PMBC       2419          779  3.0177759     43.69574
    # PBMC_AAACATTGAGCTAC-1       PMBC       4903         1352  3.7935958     42.40261
    # PBMC_AAACATTGATCAGC-1       PMBC       3147         1129  0.8897363     31.68097
    # PBMC_AAACCGTGCTTCCG-1       PMBC       2639          960  1.7430845     24.25161
    # PBMC_AAACCGTGTATGCG-1       PMBC        980          521  1.2244898     14.89796
    # PBMC_AAACGCACTGGTAC-1       PMBC       2163          781  1.6643551     36.19972
    # S.Score    G2M.Score Phase
    # PBMC_AAACATACAACCAC-1  0.09853841 -0.044716507     S
    # PBMC_AAACATTGAGCTAC-1 -0.02364305 -0.046889929    G1
    # PBMC_AAACATTGATCAGC-1 -0.02177266  0.074841537   G2M
    # PBMC_AAACCGTGCTTCCG-1  0.03794398  0.006575446     S
    # PBMC_AAACCGTGTATGCG-1 -0.03309970  0.027910063   G2M
    # PBMC_AAACGCACTGGTAC-1 -0.04814181 -0.078164839    G1
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    

4 单细胞数据降维

降维是通过算法最优地保留原始数据的一些关键属性来将数据投影到更低的维度空间中,便于可视化和后续分析

4.1 PCA降维分析

PCA降维时, Secuat包提供VizDimLoadings(), DimPlot()DimHeatmap() 函数用于单细胞 cells and features 的可视化。

复制代码
    ################### 4. 数据降维 ###################
    
    hgvs <- VariableFeatures(object = pbmc.obj)
    
    # PCA分析
    pbmc.obj <- RunPCA(pbmc.obj, features = hgvs)
    
    # 打印部分PCA结果
    print(pbmc.obj[["pca"]], dims = 1:3, nfeatures = 5)
    # PC_ 1 
    # Positive:  CST3, TYROBP, LST1, AIF1, FTL 
    # Negative:  MALAT1, LTB, IL32, IL7R, CD2 
    # PC_ 2 
    # Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1 
    # Negative:  NKG7, PRF1, CST7, GZMB, GZMA 
    # PC_ 3 
    # Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPB1 
    # Negative:  PPBP, PF4, SDPR, SPARC, GNG11
    
    # 不同函数的可视化结果
    VizDimLoadings(pbmc.obj, dims = 1:2 , reduction = "pca")
    
    DimPlot(pbmc.obj,  reduction = "pca", pt.size = 1)
    
    DimHeatmap(pbmc.obj, dims = 1:6 , reduction = "pca", balanced = TRUE)
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    

VizDimLoadings
DimPlot
DimHeatmap

4.2 选取合适的维度

维度的选择标准是在保证足够的信息同时减少噪音

复制代码
    # 展示维度信息, 选取20较为合适
    ElbowPlot(pbmc.obj)
    
    
      
      
    
ElbowPlot
复制代码
    # 确定主成分个数,耗时长
    pbmc.obj <- JackStraw(pbmc.obj, num.replicate = 100)
    pbmc.obj <- ScoreJackStraw(pbmc.obj, dims = 1:20)
    
    # 可视化
    JackStrawPlot(pbmc.obj, dims = 1:20)
    
    
      
      
      
      
      
      
    
JackStrawPlot

5. 单细胞聚类

基于细胞基因表达谱的相似性 ,将细胞聚类成簇 。通常是任何单细胞分析的第一个中间结果,细胞聚类可以帮助我们推断数据集中各细胞的类别。

5.1 聚类分簇

表达谱的相似性采用降维之后表达空间上的欧氏距离度量

复制代码
    ################### 5. 数据聚类 ###################
    
    # 单细胞聚类
    # 最近邻算法对细胞进行聚类
    pbmc.obj <- FindNeighbors(pbmc.obj, reduction = "pca", dims = 1:10)
    
    # 使用多分辨率模块优化算法,迭代将细胞进行分类
    pbmc.obj <- FindClusters(pbmc.obj, resolution = 0.5)
    
    head(pbmc.obj@meta.data)
    # orig.ident nCount_RNA nFeature_RNA percent.mt percent.rRNA
    # PBMC_AAACATACAACCAC-1       PMBC       2419          779  3.0177759     43.69574
    # PBMC_AAACATTGAGCTAC-1       PMBC       4903         1352  3.7935958     42.40261
    # PBMC_AAACATTGATCAGC-1       PMBC       3147         1129  0.8897363     31.68097
    # PBMC_AAACCGTGCTTCCG-1       PMBC       2639          960  1.7430845     24.25161
    # PBMC_AAACCGTGTATGCG-1       PMB        980          521  1.2244898     14.89796
    # PBMC_AAACGCACTGGTAC-1       PMBC       2163          781  1.6643551     36.19972
    # RNA_snn_res.0.5 seurat_clusters
    # PBMC_AAACATACAACCAC-1               2               2
    # PBMC_AAACATTGAGCTAC-1               3               3
    # PBMC_AAACATTGATCAGC-1               2               2
    # PBMC_AAACCGTGCTTCCG-1               1               1
    # PBMC_AAACCGTGTATGCG-1               6               6
    # PBMC_AAACGCACTGGTAC-1               2               2
    
    table(pbmc.obj@meta.data$seurat_clusters)
    # 0   1   2   3   4   5   6   7   8 
    # 711 480 472 344 279 162 144  32  14
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    

5.2 UMAP 可视化

Seurat R包提供UMAPt-SNE 两种降维可视化算法。

复制代码
    # UMAP聚类可视化
    pbmc.obj <- RunUMAP(pbmc.obj, reduction = "pca",
                    dims = 1:10, verbose = TRUE)
    
    DimPlot(pbmc.obj, reduction = "umap", 
        label = TRUE, pt.size = 1.5)
    
    head(pbmc.obj@reductions$umap@cell.embeddings)
    # umap_1     umap_2
    # PBMC_AAACATACAACCAC-1 -4.611246 -3.0574470
    # PBMC_AAACATTGAGCTAC-1 -2.061398  9.7727327
    # PBMC_AAACATTGATCAGC-1 -5.386918 -6.3924266
    # PBMC_AAACCGTGCTTCCG-1 10.931999  3.5166690
    # PBMC_AAACCGTGTATGCG-1 -9.780782  0.6403372
    # PBMC_AAACGCACTGGTAC-1 -3.459144 -4.8853937
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
UMAP DimPlot

5.3 tSNE 可视化

复制代码
    # tSNE聚类可视化
    pbmc.obj <- RunTSNE(pbmc.obj, reduction = "pca", dims = 1:10)
    
    DimPlot(pbmc.obj, reduction = "tsne", 
        label = TRUE, pt.size = 1.5)
    
    # 自定义颜色, 聚类靠前颜色显著
    # DimPlot(pbmc.obj, reduction = "tsne", cols = c("red", "blue", 3:10),
    #         label = TRUE, pt.size = 1.5)
    
    head(pbmc.obj@reductions$tsne@cell.embeddings)
    # tSNE_1     tSNE_2
    # PBMC_AAACATACAACCAC-1 -11.701490   7.120466
    # PBMC_AAACATTGAGCTAC-1 -21.981401 -21.330793
    # PBMC_AAACATTGATCAGC-1  -1.292978  23.822324
    # PBMC_AAACCGTGCTTCCG-1  30.877776  -9.926240
    # PBMC_AAACCGTGTATGCG-1 -34.619197   8.773753
    # PBMC_AAACGCACTGGTAC-1  -3.046157  13.013471
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
tSNE DimPlot

自定义颜色可视化图

红色和蓝色为自定义颜色。
tSNE DimPlot 自定义颜色

6. 单细胞聚类簇鉴定

在基因水平上,对每个簇的marker gene进行分析 ,这些marker gene包含细胞簇的特征,可帮助根据细胞簇生物学功能等特性进行分类。

6.1 单细胞差异基因分析

min.pct: 在2组细胞簇中检测特征的最小百分比。

复制代码
    # 单细胞聚簇分类
    # 细胞簇差异基因(DEGs)分析
    cluster1.marker <- FindMarkers(pbmc.obj, ident.1 = 0, 
                               min.pct = 0.1, logfc.threshold = 0.25)
    
    head(cluster1.marker)
    # p_val avg_log2FC pct.1 pct.2     p_val_adj
    # RPS12 1.806317e-144  0.7439459 1.000 0.991 2.477183e-140
    # RPS6  7.135900e-142  0.6862148 1.000 0.995 9.786173e-138
    # RPS27 5.257820e-140  0.7298479 0.999 0.992 7.210575e-136
    # RPL32 4.229582e-136  0.6184804 0.999 0.995 5.800448e-132
    # RPS14 1.799019e-130  0.6283021 1.000 0.994 2.467175e-126
    # CYBA  1.235790e-129 -1.7444394 0.658 0.917 1.694762e-125
    
    
    # 从cluster0到cluster3中区分cluster5 marker gene
    cluster5.marker <- FindMarkers(pbmc.obj, ident.1 = 5, 
                               ident.2 = c(0,3), min.pct = 0.25)
                               
    head(cluster5.marker)
    # p_val avg_log2FC pct.1 pct.2     p_val_adj
    # FCGR3A        2.150929e-209   6.832372 0.975 0.039 2.949784e-205
    # IFITM3        6.103366e-199   6.181000 0.975 0.048 8.370156e-195
    # CFD           8.891428e-198   6.052575 0.938 0.037 1.219370e-193
    # CD68          2.374425e-194   5.493138 0.926 0.035 3.256286e-190
    # RP11-290F20.3 9.308287e-191   6.335402 0.840 0.016 1.276538e-186
    # IFI30         1.575100e-181   6.382751 0.796 0.014 2.160093e-17
    
    
    # 比较剩余的细胞簇找到marker gene
    # only.pos为TRUE表示仅记录阳性结果
    all.marker <- FindAllMarkers(pbmc.obj, only.pos = TRUE,
                             min.pct = 0.1 , logfc.threshold = 0.25)
                             
    head(all.marker)
    # p_val avg_log2FC pct.1 pct.2     p_val_adj cluster  gene
    # RPS12 1.806317e-144  0.7439459 1.000 0.991 2.477183e-140       0 RPS12
    # RPS6  7.135900e-142  0.6862148 1.000 0.995 9.786173e-138       0  RPS6
    # RPS27 5.257820e-140  0.7298479 0.999 0.992 7.210575e-136       0 RPS27
    # RPL32 4.229582e-136  0.6184804 0.999 0.995 5.800448e-132       0 RPL32
    # RPS14 1.799019e-130  0.6283021 1.000 0.994 2.467175e-126       0 RPS14
    # RPS25 5.507298e-123  0.7635765 0.997 0.975 7.552709e-119       0 RPS25
    
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    

6.2 mark gene可视化

mark gene为已知的特定细胞中特异性表达的基因 ,根据已知的数据库(如CellMarker、PanglaoDB等)和发表的文献

与mark gene可视化相关函数:

VlnPlot() (shows expression probability distributions across clusters)

FeaturePlot() (visualizes feature expression on a tSNE or PCA plot) are our most commonly used visualizations.

DoHeatmap() generates an expression heatmap for given cells and features.

RidgePlot(), CellScatter(), and DotPlot() as additional methods to view your dataset.

复制代码
    library(dplyr)
    library(ggplot2)
    
    # marker基因可视化
    marker.show <- c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", 
                 "FCGR3A", "LYZ", "PPBP", "CD8A")
    
    VlnPlot(pbmc.obj, features = marker.show)
    
    
      
      
      
      
      
      
      
      
    
mark gene
复制代码
    VlnPlot(pbmc.obj, features = marker.show, slot = "counts", log = TRUE)
    
    
    
      
      
    
mark gene
复制代码
    FeaturePlot(pbmc.obj, features = marker.show)
    
    
    
      
      
    
mark gene
复制代码
    # 根据中cluster列分组,选取按avg_log2FC排序
    top10.marker <- all.marker %>% group_by(cluster) %>% top_n(n=10, wt = avg_log2FC)
    
    DoHeatmap(pbmc.obj, features = top10.marker$ge) + NoLegend()
    
    
    
      
      
      
      
      
    
top10 marker gene
复制代码
    # 点图, 颜色深浅表示表达量的高低,点大小表示表达量占比
    DotPlot(pbmc.obj, features = marker.show) +
      theme(axis.text.x = element_text(angle=45, hjust=1))
    
    
    
      
      
      
      
    
DotPlot
复制代码
    # 山脊图
    RidgePlot(pbmc.obj, features = marker.show)
    
    
      
      
    
RidgePlot

6.3 细胞类型注释

根据细胞中特异性表达的marker gene来注释细胞簇的类型

使用marker来轻松地将聚类与已知的单元类型进行匹配:
细胞类型注释

复制代码
    cluster.cell <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", 
                  "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet")
    
    names(cluster.cell) <- levels(pbmc.obj)
    
    cluster.cell
    # 0              1              2              3              4              5 
    # "Naive CD4 T" "Memory CD4 T"   "CD14+ Mono"            "B"        "CD8 T" "FCGR3A+ Mono" 
    # 6              7              8 
    # "NK"           "DC"     "Platelet" 
    
    # 重命名Idents
    pbmc.obj <- RenameIdents(pbmc.obj, cluster.cell)
    
    # 可视化
    DimPlot(pbmc.obj, reduction = "umap", label=TRUE, pt.size=1)
    
    # 保存为.RDS结果
    saveRDS(pbmc.obj, "pbmc.final.RDS")
    
    head(pbmc.obj@active.ident)
    # PBMC_AAACATACAACCAC-1 PBMC_AAACATTGAGCTAC-1 PBMC_AAACATTGATCAGC-1 PBMC_AAACCGTGCTTCCG-1 
    # CD14+ Mono                     B            CD14+ Mono          Memory CD4 T 
    # PBMC_AAACCGTGTATGCG-1 PBMC_AAACGCACTGGTAC-1 
    # NK            CD14+ Mono 
    # Levels: Naive CD4 T Memory CD4 T CD14+ Mono B CD8 T FCGR3A+ Mono NK DC Platelet
    
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
细胞类型注释

完整代码下载请关注微信公账号【生信与基因组学】。

全部评论 (0)

还没有任何评论哟~