ChIP-Seq分析流程
发布时间
阅读量:
阅读量
1.FastqC质控
fastqc *fq.gz -o raw -t 4
multiqc /. 20230204_fastqc.zip
2.Trim_galore去接头(×)
paste <(ls *1.fq.gz) <(ls *2.fq.gz) >config
cat config |while read id;do arr=$id;fq1=${arr[0]};fq2=${arr[1]};nohup trim_galore --gzip --paired $fq1 $fq2 -o ./raw/trim.log & done
2.Fastp(√)
paste <(ls *1.clean.fq.gz) <(ls *2.clean.fq.gz) >config
cat config |while read id;
do arr=($id);fq1=${arr[0]};fq2=${arr[1]};
fastp -i $fq1 -I $fq2 -o ./fastp/$fq1 -O ./fastp/$fq2 \
-f 15 \ # 双端测序修剪1端上游15bp
-F 15 \ # 修剪2端上游15bp
-q 20 \ # reads最低质量,一般在20以上即可
-l 75 \ # 修剪后reads长度在75bp以上
-u 20 \ # 限制低质量碱基的百分比,默认40,表示40%
-w 20 # 线程数
done
# 再次fastqc质控
fastqc -t 20 -o fastp/fastqc fastp/*.gz
multiqc -o fastqc ./fastp
3.Bowtie2比对
bowtie2_mm10_idx=/public/ojsys/management/wbhou/reference/mouse/genome/mm10/mm10-bowtie2
paste <(ls *.clean.fq.gz | cut -d "_" -f 1 | sort | uniq) <(ls *1.clean.fq.gz) <(ls *2.clean.fq.gz) >config
cat config | while read id;do arr=($id);sample=${arr[0]};fq1=${arr[1]};fq2=${arr[2]};
bowtie2 -p 100 -1 $fq1 -2 $fq2 -x bowtie2_mm10_idx -S ../bowtie2/sam/$sample.sam
4. sam转bam,排序 ---Samtools
samtools view -@ 100 -bhS ../bowtie2/sam/$sample.sam -o ../bowtie2/bam/$sample.bam
samtools sort -@ 100 -o ../bowtie2/bam/$sample_sorted.bam ../bowtie2/bam/$sample.bam
done
5.Picard去重
picard MarkDuplicates \
VALIDATION_STRINGENCY=SILENT \
REMOVE_DUPLICATES=true \ # 不将重复写入输出文件,而设置适当的标志来写入它们,默认值为false
READ_NAME_REGEX=null \ # 正则表达式,可用于解析传入的SAM文件中的读名称。读取的名称被解析为提取三个变量:tile/region、x坐标和y坐标。这些值用于估计光学复制率,以便给出更准确的库大小估计。将此选项设置为null可禁用光学重复检测
ASSUME_SORTED=true \ # 如果为true,假设输入文件按坐标排序
I=../bowtie2/bam/$sample_sorted.bam \ # input
O=../bowtie2/bam/$sample_sorted_rmdup.bam \ # output
M=../bowtie2/bam/$sample_dup_metrics.txt # 相当于log文件
Tips:如需过滤二次比对得到unique map reads可用下列代码
samtools view -bhS -@ 8 -F 1796 -q 20 ../bowtie2/bam/$sample_sorted_rmdup.bam | samtools sort -@ 12 > ../bowtie2/bam/$sample_sort.bam
6.为bam文件建立索引
samtools index -@ 100 ../bowtie2/bam/$sample_sorted_rmdup.bam
samtools index -@ 100 ../bowtie2/bam/$sample_sort.bam
7.相关性分析(bam文件),根据相关性分析结果可选择是否merge
multiBamSummary bins --bamfiles ./*sort.bam --binSize 5000 --numberOfProcessors 100 --outRawCounts ./correlation/results.txt -o ./correlation/results.npz
plotCorrelation -in ./correlation/results.npz --corMethod spearman --skipZeros --plotTitle "Sperman Correlation of Read Counts" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o ./correlation/heatmap_SpearmanCorr.pdf --outFileCorMatrix ./correlation/SpearmanCorr_readCounts.tab
Tips:merge相同处理组不同重复的bam文件
samtools merge ./bw-merge/CUT-MI-1000-1-2.bam CUT-MI-1000-1_sorted_rmdup.bam CUT-MI-1000-1_sorted_rmdup.bam
7. bam转bw---deeptols
paste <(ls *rmdup.bam | cut -d "_" -f1 | sort | uniq) >config
cat config | while read id;
do
bamCoverage --bam ${id}_sorted_rmdup.bam -o ./bw/${id}.bw --binSize 50 --normalizeUsing RPKM --effectiveGenomeSize 2652783500
done
8. 根据mm10.bed文件绘制基因上H4K16ac信号的富集图
computeMatrix reference-point -S CUT-GV-1000-100.bw CUT-GV-1000-10.bw CUT-GV-500-100.bw CUT-GV-500-10.bw CUT-GV-Ctrl-1.bw -R mm10.bed --referencePoint TSS -a 10000 -b 10000 --numberOfProcessors 100 -out ./signal2/GV-H4K16ac.gz
# plot:
plotProfile -m ./signal2/GV-H4K16ac.gz --perGroup -out ./signal2/GV-H4K16ac.pdf --numPlotsPerRow 8 --plotTitle GV-H4K16ac
# heatmap+plot
plotHeatmap -m upMI-H4K16ac-merge.gz -out heatmap-MI-UP-K16-merge.svg --heatmapHeight 12 --refPointLabel TSS --zMin 1 --zMax 4 --colorMap OrRd --dpi 750 --regionsLabel genes --plotTitle MI-UP-Genes-H4K16ac-merge
Tips:mm10.bed是所有基因list,DESeq得到差异基因后可根据基因名获得感兴趣基因的bed文件
cat up_MIgene.txt | while read line;
do
grep $line mm10.bed >> up-GV.bed
done
9. Callpeaks---Macs2
macs2 callpeak -t H3K4me3-rep1-sorted_rmp.bam -c H3K4me3-input-sorted_rmp.bam -f BAM -g mm --outdir ./callpeaks/ -n H3K4me3-1.bam -B -q 0.01
Tips:合并在一起操作
samtools merge chip.merge.bam morula-H3K4me3-rep1-sorted_rmp.bam morula-H3K4me3-rep2-sorted_rmp.bam
macs2 callpeak -t chip.merge.bam -c morula-input-sorted_rmp.bam -f BAM -g mm --outdir ./ -n morula-H3K4me3 -B -q 0.01
9.Chipseeker注释
BiocManager::install("ChIPseeker")
library(CHIPseeker)
BiocManager::install("ChIPseeker")
BiocManager::install('TxDb.Hsapiens.UCSC.hg19.knownGene', ask=F,suppressUpdates=T)
BiocManager::install('TxDb.Mmusculus.UCSC.mm10.knownGene', ask=F,suppressUpdates=T)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
H3K4me3_1_peak <- readPeakFile("D:/Rstudio/fenxishuju/ChIP-seq/2023_7_15/macs2_callpeak/morula-H3K4me3-1_summits.bed")
require(TxDb.Mmusculus.UCSC.mm10.knownGene)
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
H3K4me3_1_peakAnno <- annotatePeak(H3K4me3_1_peak, tssRegion = c(-1000, 1000), TxDb = txdb)
setwd("D:/Rstudio/fenxishuju/ChIP-seq/2023_7_15/peakanno/")
write.table(H3K4me3_1_peakAnno, file = "H3K4me3_1_peak.txt",sep = '\t', quote = FALSE, row.names = FALSE)
plotAnnoPie(H3K4me3_1_peakAnno)
全部评论 (0)
还没有任何评论哟~
