Advertisement

利用gtf文件从基因组中提取序列

阅读量:

承接上一篇采用类对代码进行封装

复制代码
    from Bio import SeqIO
    import pandas as pd
    from Bio.Seq import Seq
    
    class GeneSeqExtractor:
    def __init__(self, gtf_file, fasta_file):
        self.gtf_file = gtf_file
        self.fasta_file = fasta_file
        self.gene_info = None
        self.chrom_seqs = None
        self.gene_seqs = None
    
    def parse_gtf(self):
        self.gene_info = pd.read_csv(self.gtf_file, sep='\t', comment='#', header=None,
                                     usecols=[0, 2, 3, 4, 6, 8], names=['chrom', 'type', 'start', 'end', 'strand', 'info'])
        self.gene_info['gene_id'] = self.gene_info['info'].str.extract('gene_id "(.*?)";', expand=False)
    
    def parse_fasta(self):
        self.chrom_seqs = {}
        for record in SeqIO.parse(self.fasta_file, 'fasta'):
            self.chrom_seqs[record.id] = str(record.seq)
    
    def extract_gene_seqs(self):
        self.gene_seqs = {}
        for i, row in self.gene_info.iterrows():
            chrom = row['chrom']
            type = row['type']
            start = int(row['start']) - 1  # Convert to 0-based index
            end = int(row['end'])
            strand = row['strand']
            gene_id = row['gene_id']
    
            # Extract sequence
            if chrom in self.chrom_seqs:
                chrom_seq = self.chrom_seqs[chrom]
                gene_seq = chrom_seq[start:end]
                if strand == '-':
                    gene_seq = Seq(gene_seq).reverse_complement()
                gene_type = f"{gene_id}{type}"
                self.gene_seqs[gene_type] = str(gene_seq)
    
    def write_fasta(self, output_file):
        with open(output_file, 'w') as f:
            for gene_id, gene_seq in self.gene_seqs.items():
                f.write(f'>{gene_id}\n')
                f.write(f'{gene_seq}\n')
    
    extractor = GeneSeqExtractor(gtf_file='Athaliana_447_Araport11.gene.gtf', fasta_file='Athaliana_447_TAIR10.fa')
    
    extractor.parse_gtf()
    extractor.parse_fasta()
    extractor.extract_gene_seqs()
    extractor.write_fasta(output_file='gene_sequences.fasta')

参考:https://mp.weixin.qq.com/s/nv2GqxXCd1kgGo1IwHL4uw

全部评论 (0)

还没有任何评论哟~