利用gtf文件从基因组中提取序列
发布时间
阅读量:
阅读量
承接上一篇采用类对代码进行封装
from Bio import SeqIO
import pandas as pd
from Bio.Seq import Seq
class GeneSeqExtractor:
def __init__(self, gtf_file, fasta_file):
self.gtf_file = gtf_file
self.fasta_file = fasta_file
self.gene_info = None
self.chrom_seqs = None
self.gene_seqs = None
def parse_gtf(self):
self.gene_info = pd.read_csv(self.gtf_file, sep='\t', comment='#', header=None,
usecols=[0, 2, 3, 4, 6, 8], names=['chrom', 'type', 'start', 'end', 'strand', 'info'])
self.gene_info['gene_id'] = self.gene_info['info'].str.extract('gene_id "(.*?)";', expand=False)
def parse_fasta(self):
self.chrom_seqs = {}
for record in SeqIO.parse(self.fasta_file, 'fasta'):
self.chrom_seqs[record.id] = str(record.seq)
def extract_gene_seqs(self):
self.gene_seqs = {}
for i, row in self.gene_info.iterrows():
chrom = row['chrom']
type = row['type']
start = int(row['start']) - 1 # Convert to 0-based index
end = int(row['end'])
strand = row['strand']
gene_id = row['gene_id']
# Extract sequence
if chrom in self.chrom_seqs:
chrom_seq = self.chrom_seqs[chrom]
gene_seq = chrom_seq[start:end]
if strand == '-':
gene_seq = Seq(gene_seq).reverse_complement()
gene_type = f"{gene_id}{type}"
self.gene_seqs[gene_type] = str(gene_seq)
def write_fasta(self, output_file):
with open(output_file, 'w') as f:
for gene_id, gene_seq in self.gene_seqs.items():
f.write(f'>{gene_id}\n')
f.write(f'{gene_seq}\n')
extractor = GeneSeqExtractor(gtf_file='Athaliana_447_Araport11.gene.gtf', fasta_file='Athaliana_447_TAIR10.fa')
extractor.parse_gtf()
extractor.parse_fasta()
extractor.extract_gene_seqs()
extractor.write_fasta(output_file='gene_sequences.fasta')
全部评论 (0)
还没有任何评论哟~
