Advertisement

Python生物信息学数据管理——第3、4章课后习题

阅读量:

#初学小白,仅供参考
3.1读取和写入文件

复制代码
    fo=open('neuron_data.txt','r')
    fi=open('neuron_data_副本.txt','w')
    txt=fo.read()
    fi.write(txt)
    fo.close()
    fi.close()

3.2计算平均值和标准差

复制代码
    import math
    fo=open('neuron_data.txt','r')
    fi=open('结果.txt','w')
    txt=fo.readlines()
    data=[]
    for i in txt:
    data.append(float(i.strip()))
    average=sum(data)/len(data)
    total=0
    for value in data:
    total+=(value-average)**2
    stddev=math.sqrt(total/len(data)) 
    fi.write('平均长度'+str(average)+'\n')
    fi.write('标准差'+str(round(stddev,2)))
    fo.close()
    fi.close()

3.3/4/5核苷酸的频率、DNA序列的GC含量

复制代码
    fo=open('a序列.txt','r')
    fi=open('结果.txt','w')
    txt=fo.read()
    data={}
    for i in txt:
    data[i]=data.get(i,0)+1
    del data['\n']
    total=sum(data.values())
    maxfre=0
    for i in data:
    fi.write('{}的频率为{:.1%}\n'.format(i,data[i]/total))
    if data[i]>maxfre:
        maxfre=data[i]
        base=i
    fi.write('{}出现的频率最高:{:.1%}\n'.format(base,maxfre/total))
    fi.write('GC含量为{:.1%}'.format((data['G']+data['C'])/total))
    fo.close()
    fi.close()

4.1读取和写入多序列FASTA文件

复制代码
    fasta_file=open('input.fasta','r')
    for line in fasta_file:
    if line[0]=='>':
        AC=line.split('|')[1]
        outfile=open(AC+'.fasta','w')
        outfile.write(line)
    if line[0]!='>':
        outfile.write(line)
    outfile.close()

4.2读取和过滤FASTA文件

复制代码
    fasta_file=open('input.fasta','r')
    out_file=open('result.fasta','w')
    seq=''
    for line in fasta_file:
    if line[0]=='>' and seq=='':
        header=line
    elif line[0]!='>':
        seq+=line
    else:
        if seq[0]=='M' and seq.count('W')>1:
            out_file.write(header+seq)
        seq=''
        header=line
    if seq[0]=='M' and seq.count('W')>1:
            out_file.write(header+seq)
    fasta_file.close()
    out_file.close()

4.3计算在FASTA格式中单个DNA序列的核苷酸频率

复制代码
    input_file=open('a序列.txt','r')
    seq=''
    for line in input_file:
    if line[0]=='>':
        header=line
    elif line[0]!='>':
        seq+=line
    Total=float(len(seq))
    count_A=seq.count('A')
    count_T=seq.count('T')
    count_C=seq.count('C')
    count_G=seq.count('G')
    print('A的频率{:.2}\nT的频率{:.2}\nC的频率{:.2}\nG的频率{:.2}'.format(count_A/Total,count_T/Total,count_C/Total,count_G/Total))
    input_file.close()

4.4计算在FASTA格式中多个DNA序列的核苷酸频率
4.5计算在GenBank中的多个DNA序列的核苷酸频率

复制代码
    fasta_file=open('a序列.txt','r')
    seq=''
    for line in fasta_file:
    if line[0]=='>' and seq=='':
        AC=line.split('|')[1].split('_cds_')[0]
    elif line[0]!='>':
        seq+=line
    else:
        a=seq.count('A')/float(len(seq))
        t=seq.count('T')/float(len(seq))
        c=seq.count('C')/float(len(seq))
        g=seq.count('G')/float(len(seq))
        print('{}\tA:{:.2}\tT:{:.2}\tC:{:.2}\tG:{:.2}'.format(AC,a,t,c,g))
        seq=''
        AC=line.split('|')[1].split('_cds_')[0]
    a=seq.count('A')/float(len(seq))
    t=seq.count('T')/float(len(seq))
    c=seq.count('C')/float(len(seq))
    g=seq.count('G')/float(len(seq))
    print('{}\tA:{:.2}\tT:{:.2}\tC:{:.2}\tG:{:.2}'.format(AC,a,t,c,g))
    fasta_file.close()

全部评论 (0)

还没有任何评论哟~