HIT 生物信息学 实验三:基于de Bruijn graph的基因组片段拼接系统 参考代码
目录
实验目的:
实验原理:
Error correction
构建De Bruijn Graph
Refine去除tips和bubbles路径。
细节:
测试结果及分析
经验体会:
参考代码:
实验目的:
给定来自大肠杆菌E. Coli(长度~5Mbp)的基因组测序片段作为输入。
建立算法和系统,可以完成对基因组测序片段的拼接,形成contig一系列序列。
采用基于de Bruijn graph和欧拉路径的拼接算法。
根据输入数据,自行构建de Bruijn graph,并在其中找到所有的unipath作为contigs输出。
预先对de Bruijn graph的所有节点进行处理,去除putative false vertices(即缺少足够read支持的节点,加分项)。
对de Bruijn graph进行处理和分析,去除tips和bubbles路径(加分项)。
实验原理:
基于de Bruijn graph的组装:
- Error correction
- De Bruijn Graph
- Refine
Error correction
第一步Error correction:去除putative false vertices,即缺少足够read支持的节点。思路:错误倾向于将频繁的k-mers转换为不频繁的k-mers,故应该将出现次数不频繁的k-mers删除。
利用Hash表记录不同k-mer的数量。从文件中读入read,记录k-mer的数量,将总数较少的k-mer删除,保留有足够read支持的k-mer。
实现:
函数CreatTable:
功能:记录各个k-mer出现的数量在哈希表中
输入:file:read文件路径
h:用于记录各个k-mer出现的数量的hash表。
输出:无
实现:读取每条read,提取read的每条k-mer,在hash表中检查该k-mer是否存在,若存在则将该k-mer的次数加1。若不存在则将该k-mer存储在hash表中。
函数Errorcorection:
功能:去除hash表中缺少足够read支持的k-mer
输入:hash:记录各个k-mer出现的数量的hash表。
输出:无
实现:遍历hash表的所有节点,将出现次数小于一定值的节点删除。
构建De Bruijn Graph
De Bruijn Graph:顶点为k-1-mer,边为k-mer的有向加权图。
函数CreatGraph:
功能:建造De Bruijn Graph
输入:pG:De Bruijn Graph
h:记录k-mer的hash表
H:记录k-1-mer的hash表
输出:无
实现:遍历h的每个k-mer,利用addedge函数将该k-mer记录在图中。
函数:addedge:
功能:将k-mer添加在De Bruijn Graph中。
输入:h:记录k-1-mer的hash表
pG:De Bruijn Graph
k-mer:添加的k-mer
v:k-mer的出现次数
输出:无
实现:将k-mer分解成lk-mer与rk-mer。在hash表h中寻找lk-mer与rk-mer,若两者都存在,则在两者的结点处加一条权重为v的边,方向为从lk-mer到rk-mer;若两者都不存在,则先加入两者的结点,再加边;若两者只存在一个,则加入hash中不存在的结点,再加边。
Refine去除tips和bubbles路径。
函数:deletetips
输入:pG:De Bruijn Graph
输出:无
实现:遍历图中的结点,若该结点的入度或出度大于1,则监听该结点分出的两个路径,不断更新结点直到其中一条路径的结点不是单入单出的结点。若一条路径终止,而另一条路径未终止,则判断该结点存在一个tips,并去除长度较短的路径;若两个路径同时终止且终止在同一个结点,则判断该结点存在一个bubble,并去除权重和最小的路径。
细节:
Hash表:Hash表结构:

Hash表插入方法:头插法
De Bruijn Graph表示结构:十字链表

De Bruijn Graph加入边:头插法
De Bruijn Graph删除入边:修改两个指针的指向。
k-mer存储:设定的k-mer长度为23bp,将23bp分解为15bp与8bp,按照A-00,C-01,G-10,T-11的编码规则编码,将编码的结果转化为int类型。
k-1-mer存储:与k-mer存储同理,将22bp分解为15bp与7bp。
k-mer变k-1-mer:利用k-mer的数值分别构建两个k-1-mer的数值

输出contig:遍历De Bruijn Graph寻找起点,将第一个结点的序列完全输出,其余结点只输出最后一个碱基,直到结束。
测试结果及分析
读取read文件记录k-mer:

Errorcorection:

构建De Bruijn Graph:

去除tips和bubbles(部分):


将contig文件写入:


进程内存:

所用时间:

生成contig文件:

8384个contig。共7097630个碱基。
部分contig:

最长contig:28549bp
最短contig:23bp
平均contig:847bp
分析:
在生成的contig中仍然有比较多的较短的contig,也有一些比较长的contig,可以看出去除tips与bubbles没有去除完全,或者说在寻找tips与bubbles时出现了一些遗漏,实际上,对于该实验代码中记录两个路径的寻找tips与bubble方法,在有些情况下无法很好的处理,例如在结点的入度或出度大于2时,在前两个路径无法去除的情况下,后面的路径没有机会去遍历,再例如当bubble中套bubble的情况下也不能去除bubble,虽然这些情况出现的几率比较小,但也是存在的,直接影响了生成的contig的质量,采用课件上的广度优先遍历可以解决这些问题,但广度优先搜索会使程序的时间复杂度与空间复杂度提高,还要格外记录路径,代码也会更复杂,故没有采用。
在程序过程中会越来越慢,有一定原因是因为当内存使用完时就用磁盘保存数据。在进程时,数据在磁盘和内存之间来回移动。内存管理硬件负责把虚拟地址翻译为物理地址,并让一个进程始终于系统的真正内存中,进程在磁盘与内存之间来回切换。这种数据的来回切换造成时间的浪费。
经验体会:
通过此次实验让我了解了基因组片段拼接的De Bruijn Graph的实现的过程,提高了我的代码能力以及对基因组拼接的理解。完整地完成基因组拼接算法所有模块(数据读入,Errorcorection,构建De Bruijn Graph,去除tips和bubbles),去除false vertices将缺少足够read支持的节点去除,基本去除tips和bubbles,拼接形成的contigs序列足够长且序列正确,程序正确。
参考代码:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#define SIZEOFKMER 23
#define MAXHASHSIZE 99999983
#define maxn 18133313
typedef struct element
{
int key;
int data2;
} Element;
typedef struct node
{
Element* data;
struct node* nexthash;
int sum;
} Node;
typedef struct _WNode
{
struct node* nexthash;
} WNode;
typedef struct hash_table
{
int size;
int length;
WNode head[MAXHASHSIZE];
} Hash_table;
Hash_table* Creat_Table()
{
Hash_table* h;
h = (Hash_table*)malloc(sizeof(Hash_table));
memset(h, 0, sizeof(Hash_table));
h->size = MAXHASHSIZE;
h->length = 0;
for (int i = 0; i < h->size; i++)
{
h->head[i].nexthash = NULL;
}
return h;
}
int CaculateHash(int key)
{
return key % MAXHASHSIZE;
}
char* datatosubdata(char* data, int start, int end)
{
char* subdata = (char*)malloc((sizeof(char)) * (end - start + 1 + 1));
for (int i = start; i <= end; i++)
{
subdata[i - start] = data[i];
}
subdata[end - start + 1] = '\0';
return subdata;
}
char* subdatatobindata(char* subdata)
{
int L = strlen(subdata);
char* bindata = (char*)malloc(sizeof(char) * 2 * L + 1);
for (int i = 0; i < L; i++)
{
if (subdata[i] == 'A')
{
bindata[2 * i] = '0';
bindata[2 * i + 1] = '0';
}
else if (subdata[i] == 'C')
{
bindata[2 * i] = '0';
bindata[2 * i + 1] = '1';
}
else if (subdata[i] == 'G')
{
bindata[2 * i] = '1';
bindata[2 * i + 1] = '0';
}
else if (subdata[i] == 'T')
{
bindata[2 * i] = '1';
bindata[2 * i + 1] = '1';
}
else if (subdata[i] == 'N')
{
bindata[2 * i] = 'N';
bindata[2 * i + 1] = 'N';
}
}
bindata[2 * L] = '\0';
return bindata;
}
int bindatatoint(char* bindata)
{
int num = 0;
int L = strlen(bindata);
for (int j = L - 1; j >= 0; j--)
{
if (bindata[j] == '1')
{
num += (int)pow(2, L - 1 - j);
}
}
return num;
}
int chartokey(char* data)
{
char* subdata = datatosubdata(data, 0, 14);
char* bindata = subdatatobindata(subdata);
int key = bindatatoint(bindata);
free(subdata);
free(bindata);
return key;
}
int chartodata2(char* data)
{
char* subdata = datatosubdata(data, 15, 22);
char* bindata = subdatatobindata(subdata);
int data2 = bindatatoint(bindata);
free(subdata);
free(bindata);
return data2;
}
char* inttobin(int data, int length) {
char* c = (char*)malloc(sizeof(char) * length * 2 + 1);
for (int i = 0;i < length * 2;i++) {
if (data / (int)pow(2, 2 * length - 1 - i) == 1) {
data -= pow(2, 2 * length - 1 - i);
c[i] = '1';
}
else {
c[i] = '0';
}
}
c[length * 2] = '\0';
return c;
}
char* dattochar(Element* data) {
char* a = inttobin(data->key, 15);
char* b = inttobin(data->data2, 7);
char* ch = (char*)malloc(sizeof(char) * SIZEOFKMER);
for (int i = 0;i < 15;i++) {
if (a[2 * i] == '0' && a[2 * i + 1] == '0') {
ch[i] = 'A';
}
else if (a[2 * i] == '0' && a[2 * i + 1] == '1') {
ch[i] = 'C';
}
else if (a[2 * i] == '1' && a[2 * i + 1] == '0') {
ch[i] = 'G';
}
else if (a[2 * i] == '1' && a[2 * i + 1] == '1') {
ch[i] = 'T';
}
}
for (int i = 0;i < 7;i++) {
if (b[2 * i] == '0' && b[2 * i + 1] == '0') {
ch[i + 15] = 'A';
}
else if (b[2 * i] == '0' && b[2 * i + 1] == '1') {
ch[i + 15] = 'C';
}
else if (b[2 * i] == '1' && b[2 * i + 1] == '0') {
ch[i + 15] = 'G';
}
else if (b[2 * i] == '1' && b[2 * i + 1] == '1') {
ch[i + 15] = 'T';
}
}
ch[SIZEOFKMER - 1] = '\0';
return ch;
//return "h";
}
char* dattochar1(Element* data) {
char* a = inttobin(data->key, 15);
char* b = inttobin(data->data2, 8);
char* ch = (char*)malloc(sizeof(char) * (SIZEOFKMER+1));
for (int i = 0;i < 15;i++) {
if (a[2 * i] == '0' && a[2 * i + 1] == '0') {
ch[i] = 'A';
}
else if (a[2 * i] == '0' && a[2 * i + 1] == '1') {
ch[i] = 'C';
}
else if (a[2 * i] == '1' && a[2 * i + 1] == '0') {
ch[i] = 'G';
}
else if (a[2 * i] == '1' && a[2 * i + 1] == '1') {
ch[i] = 'T';
}
}
for (int i = 0;i < 8;i++) {
if (b[2 * i] == '0' && b[2 * i + 1] == '0') {
ch[i + 15] = 'A';
}
else if (b[2 * i] == '0' && b[2 * i + 1] == '1') {
ch[i + 15] = 'C';
}
else if (b[2 * i] == '1' && b[2 * i + 1] == '0') {
ch[i + 15] = 'G';
}
else if (b[2 * i] == '1' && b[2 * i + 1] == '1') {
ch[i + 15] = 'T';
}
}
ch[SIZEOFKMER ] = '\0';
return ch;
//return "h";
}
char* dattosubchar(Element* data) {
char* c = inttobin(data->data2, 7);
if (c[12] == '0' && c[13] == '0') {
return "A";
}
else if (c[12] == '0' && c[13] == '1') {
return "C";
}
else if (c[12] == '1' && c[13] == '0') {
return "G";
}
else if (c[12] == '1' && c[13] == '1') {
return "T";
}
return "H";
}
char* re(char* data) {
int L = strlen(data);
char* r = (char*)malloc(sizeof(char)*(L+1));
for (int i = 0;i < L;i++) {
r[i] = data[L - i-1];
}
r[L] = '\0';
for (int i = 0;i < L;i++) {
if (r[i] == 'A') {
r[i] = 'T';
}
else if (r[i] == 'C') {
r[i] = 'G';
}
else if (r[i] == 'G') {
r[i] = 'T';
}
else if (r[i] == 'T') {
r[i] = 'A';
}
}
return r;
}
Node* findinhash(Hash_table* h, char* data)
{
int key = chartokey(data);
int data2 = chartodata2(data);
int position = CaculateHash(key);
Node* q = h->head[position].nexthash;
while (q != NULL)
{
if (q->data->data2 == data2 && q->data->key == key)
{
return q;
}
q = q->nexthash;
}
return NULL;
}
void InsertHash(Hash_table* h, Element* k, int location)
{
Node* q = (Node*)malloc(sizeof(Node));
q->data = k;
q->sum = location;
int position;
position = CaculateHash(k->key);
q->nexthash = h->head[position].nexthash;
h->head[position].nexthash = q;
h->length += 1;
}
void CreatTable(char* file, Hash_table* h)
{
FILE* r;
fopen_s(&r,file, "r");
int hy = 0;
char* s = (char*)malloc(sizeof(char) * 160);
while (fgets(s, 170, r) != NULL)
{
if ((s[0] == 'A' || s[0] == 'G' || s[0] == 'C' || s[0] == 'T') && (s[1] == 'A' || s[1] == 'G' || s[1] == 'C' || s[1] == 'T'))
{
s[strlen(s)] = '\0';
//printf("%d\n", hy);
hy++;
//printf("%s\n",s);
for (int i = 0; i < strlen(s) - SIZEOFKMER; i++)
{
char* kmer = (char*)malloc(sizeof(char) * (SIZEOFKMER + 1));
for (int j = 0; j < SIZEOFKMER; j++)
{
kmer[j] = s[i + j];
}
kmer[SIZEOFKMER] = '\0';
//printf("%s\n",kmer);
Node* n = findinhash(h,kmer);
if (n==NULL) {
Element* k = (Element*)malloc(sizeof(Element));
k->key = chartokey(kmer);
k->data2 = chartodata2(kmer);
InsertHash(h,k,1);
}
else {
n->sum++;
}
free(kmer);
}
}
}
fclose(r);
}
void Creattable(char* file, Hash_table* h)
{
FILE* r;
fopen_s(&r, file, "r");
int hy = 0;
char* s = (char*)malloc(sizeof(char) * 160);
while (fgets(s, 160, r) != NULL)
{
if ((s[0] == 'A' || s[0] == 'G' || s[0] == 'C' || s[0] == 'T') && (s[1] == 'A' || s[1] == 'G' || s[1] == 'C' || s[1] == 'T'))
{
s[strlen(s)] = '\0';
//printf("%d\n", hy);
hy++;
// printf("%s\n",s);
for (int i = 0; i <strlen(s) - SIZEOFKMER; i++)
{
char* kmer = (char*)malloc(sizeof(char) * (SIZEOFKMER + 1));
for (int j = 0; j < SIZEOFKMER; j++)
{
kmer[j] = s[i + j];
}
kmer[SIZEOFKMER] = '\0';
char* kkmer = re(kmer);
Node* N = findinhash(h, kkmer);
if (N == NULL) {
Element* k = (Element*)malloc(sizeof(Element));
k->key = chartokey(kkmer);
k->data2 = chartodata2(kkmer);
InsertHash(h, k, 1);
}
else {
N->sum++;
}
free(kkmer);
free(kmer);
}
}
}
fclose(r);
}
void Errorcorrection(Hash_table* h)
{
for (int i = 0;i < MAXHASHSIZE;i++) {
if (h->head[i].nexthash!=NULL) {
while(h->head[i].nexthash!=NULL&&(h->head[i].nexthash->sum == 1 || h->head[i].nexthash->sum == 2)) {
//printf("1");
Node* n = h->head[i].nexthash;
h->head[i].nexthash = h->head[i].nexthash->nexthash;
free(n);
}
Node* N = h->head[i].nexthash;
while (N!=NULL&&N->nexthash!= NULL) {
if (N->nexthash->sum == 1 || N->nexthash->sum == 2) {
Node* n = N->nexthash;
N->nexthash = N->nexthash->nexthash;
free(n);
}
else {
N = N->nexthash;
}
}
}
}
}
Node* Findinhash(Hash_table* h, Element* data)
{
int key = data->key;
int data2 = data->data2;
int position = CaculateHash(key);
Node* q = h->head[position].nexthash;
while (q != NULL)
{
if (q->data->data2 == data2 && q->data->key == key)
{
return q;
}
q = q->nexthash;
}
return NULL;
}
typedef struct nodes
{
int data1;
int data2;
int value;
struct nodes* nextinD;
struct nodes* nextoutD;
} Nodes;
typedef struct _VNode
{
Element* data;
Nodes* in_edge;
Nodes* out_edge;
} VNode;
typedef struct _LGraph
{
int vex_num;
int edg_num;
VNode vexs[maxn];
} LGraph;
LGraph* create()
{
LGraph* pG;
pG = (LGraph*)malloc(sizeof(LGraph));
pG->vex_num = 0; //顶点数
pG->edg_num = 0; //边数
for (int i = 0; i < maxn; ++i) //初始化定点表的指针域为空
{
pG->vexs[i].in_edge = NULL;
pG->vexs[i].out_edge = NULL;
}
return pG;
}
Element* kmertorkmer(Element* kmer) {
Element* rkmer = (Element*)malloc(sizeof(Element));
rkmer->key = (kmer->key % (int)pow(2, 28)) * 4 + kmer->data2 / (int)pow(2,14);
rkmer->data2=kmer->data2% (int)pow(2, 14);
return rkmer;
}
Element* kmertolkmer(Element* kmer) {
Element* lkmer = (Element*)malloc(sizeof(Element));
lkmer->key = kmer->key;
lkmer->data2 = kmer->data2 /(int)pow(2, 2);
return lkmer;
}
LGraph* pG;
void addedge(Hash_table* h, LGraph* pG, Element* kmer,int v)
{
Element* lkmer = kmertolkmer(kmer);
Element* rkmer = kmertorkmer(kmer);
Node* flaga = Findinhash(h, lkmer);
Node* flagb = Findinhash(h, rkmer);
if (flaga == NULL && flagb == NULL)
{
InsertHash(h, lkmer, pG->vex_num);
InsertHash(h, rkmer, (pG->vex_num) + 1);
pG->vexs[pG->vex_num].data = lkmer;
pG->vexs[pG->vex_num + 1].data = rkmer;
Nodes* p = (Nodes*)malloc(sizeof(Nodes));
p->value = v;
p->data1 = pG->vex_num;
p->data2 = pG->vex_num + 1;
p->nextinD = pG->vexs[pG->vex_num].in_edge;
pG->vexs[pG->vex_num].in_edge = p;
p->nextoutD = pG->vexs[pG->vex_num + 1].out_edge;
pG->vexs[pG->vex_num + 1].out_edge = p;
pG->edg_num++;
pG->vex_num = pG->vex_num + 2;
}
else if (flaga == NULL && flagb != NULL)
{
free(rkmer);
InsertHash(h, lkmer, pG->vex_num);
pG->vexs[pG->vex_num].data = lkmer;
Nodes* p = (Nodes*)malloc(sizeof(Nodes));
p->value = v;
p->data1 = pG->vex_num;
p->data2 = flagb->sum;
p->nextinD = pG->vexs[pG->vex_num].in_edge;
pG->vexs[pG->vex_num].in_edge = p;
p->nextoutD = pG->vexs[flagb->sum].out_edge;
pG->vexs[flagb->sum].out_edge = p;
pG->edg_num++;
pG->vex_num++;
}
else if (flaga != NULL && flagb == NULL)
{
free(lkmer);
InsertHash(h, rkmer, pG->vex_num);
pG->vexs[pG->vex_num].data = rkmer;
Nodes* p = (Nodes*)malloc(sizeof(Nodes));
p->value = v;
p->data1 = flaga->sum;
p->data2 = pG->vex_num;
p->nextinD = pG->vexs[flaga->sum].in_edge;
pG->vexs[flaga->sum].in_edge = p;
p->nextoutD = pG->vexs[pG->vex_num].out_edge;
pG->vexs[pG->vex_num].out_edge = p;
pG->edg_num++;
pG->vex_num++;
}
else if (flaga != NULL && flagb != NULL)
{
free(lkmer);
free(rkmer);
Nodes* p = (Nodes*)malloc(sizeof(Nodes));
p->value = v;
p->data1= flaga->sum;
p->data2 = flagb->sum;
p->nextinD = pG->vexs[flaga->sum].in_edge;
pG->vexs[flaga->sum].in_edge = p;
p->nextoutD = pG->vexs[flagb->sum].out_edge;
pG->vexs[flagb->sum].out_edge = p;
pG->edg_num++;
}
}
void CreatGraph(Hash_table* h, Hash_table* H, LGraph* pG) {
for (int i = 0;i < MAXHASHSIZE;i++) {
if (h->head[i].nexthash != NULL) {
Node* n = h->head[i].nexthash;
while (n != NULL) {
addedge(H, pG, n->data,n->sum);
n = n->nexthash;
}
}
}
for (int i = 0;i < MAXHASHSIZE;i++) {
while (h->head[i].nexthash != NULL) {
Node* n= h->head[i].nexthash;
h->head[i].nexthash = h->head[i].nexthash->nexthash;
free(n);
}
}
}
int isoneinineout(VNode p)
{
if (p.in_edge != NULL && p.in_edge->nextinD == NULL && p.out_edge != NULL && p.out_edge->nextoutD == NULL)
{
return 1;
}
return 0;
}
int isend(VNode p)
{
if (p.in_edge == NULL)
{
return 1;
}
return 0;
}
int ismutend(VNode p)
{
if (p.in_edge == NULL && p.out_edge != NULL && p.out_edge->nextoutD != NULL)
{
return 1;
}
return 0;
}
int isstart(VNode p)
{
if (p.out_edge == NULL)
{
return 1;
}
return 0;
}
int ismutstart(VNode p)
{
if (p.out_edge == NULL && p.in_edge != NULL && p.in_edge->nextinD != NULL)
{
return 1;
}
return 0;
}
int ismutiin(VNode p)
{
if (p.in_edge != NULL && p.in_edge->nextinD == NULL && p.out_edge != NULL && p.out_edge->nextoutD != NULL)
{
return 1;
}
return 0;
}
int ismutiout(VNode p)
{
if (p.out_edge != NULL && p.out_edge->nextoutD == NULL && p.in_edge != NULL && p.in_edge->nextinD != NULL)
{
return 1;
}
return 0;
}
int isNULL(VNode p)
{
if (p.in_edge == NULL && p.out_edge == NULL)
{
return 1;
}
return 0;
}
void Deleteout(LGraph* pG, Nodes* deletenode)
{
int position = deletenode->nextinD->data2;
if (pG->vexs[position].out_edge->data1 == deletenode->nextinD->data1)
{
pG->vexs[position].out_edge = pG->vexs[position].out_edge->nextoutD;
}
else
{
Nodes* p = pG->vexs[position].out_edge;
while (p != NULL && p->nextoutD)
{
if (p->nextoutD->data1 == deletenode->nextinD->data1)
{
p->nextoutD = p->nextoutD->nextoutD;
break;
}
p = p->nextoutD;
}
}
Nodes* N = deletenode->nextinD;
deletenode->nextinD = deletenode->nextinD->nextinD;
free(N);
}
void Deleteouts(LGraph* pG, int i)
{
int position = pG->vexs[i].in_edge->data2;
if (pG->vexs[position].out_edge->data1== pG->vexs[i].in_edge->data1)
{
pG->vexs[position].out_edge = pG->vexs[position].out_edge->nextoutD;
}
else
{
Nodes* p = pG->vexs[position].out_edge;
while (p != NULL && p->nextoutD)
{
if (p->nextoutD->data1 == pG->vexs[i].in_edge->data1)
{
p->nextoutD = p->nextoutD->nextoutD;
break;
}
p = p->nextoutD;
}
}
Nodes* N = pG->vexs[i].in_edge;
pG->vexs[i].in_edge = pG->vexs[i].in_edge->nextinD;
free(N);
}
void deletetips(LGraph* pG) {
int L = pG->vex_num;
for (int k = 0;k < 10;k++) {
for (int i = 0; i < L; i++)
{
if (isNULL(pG->vexs[i]) == 0)
{
if (ismutiout(pG->vexs[i]) == 1 || ismutstart(pG->vexs[i]) == 1)
{
VNode p = pG->vexs[pG->vexs[i].in_edge->data2];
VNode q = pG->vexs[pG->vexs[i].in_edge->nextinD->data2];
int a[10000];
int suma = 0;
int b[10000];
int sumb = 0;
int num = 1;
a[0] = pG->vexs[i].in_edge->data2;
b[0] = pG->vexs[i].in_edge->nextinD->data2;
while (isoneinineout(p) && isoneinineout(q))
{
suma += p.in_edge->value;
sumb += q.in_edge->value;
a[num] = p.in_edge->data2;
b[num] = q.in_edge->data2;
p = pG->vexs[p.in_edge->data2];
q = pG->vexs[q.in_edge->data2];
num++;
}
suma += pG->vexs[i].in_edge->value;
sumb += pG->vexs[i].in_edge->nextinD->value;
if (isend(p)&& isend(q)==0)
{
printf("find a tip at %d!\n", i);
for (int j = 0; j < num - 1; j++)
{
Deleteouts(pG, a[j]);
}
Deleteouts(pG, i);
pG->edg_num -= suma;
pG->vex_num -= num;
}
else if (isend(q)&&isend(p)==0)
{
printf("find a tip at %d!\n", i);
for (int j = 0; j < num - 1; j++)
{
Deleteouts(pG, b[j]);
}
Deleteout(pG, pG->vexs[i].in_edge);
pG->edg_num -= sumb;
pG->vex_num -= num;
}
else if (isend(q)&&isend(p)) {
printf("find a tip at %d!\n", i);
if (suma >= sumb) {
for (int j = 0; j < num - 1; j++)
{
Deleteouts(pG, b[j]);
}
Deleteout(pG, pG->vexs[i].in_edge);
pG->edg_num -= sumb;
pG->vex_num -= num;
}
else if (sumb > suma) {
for (int j = 0; j < num - 1; j++)
{
Deleteouts(pG, a[j]);
}
Deleteouts(pG, i);
pG->edg_num -= suma;
pG->vex_num -= num;
}
}
else
{
if (a[num - 1] == b[num - 1])
{
printf("find a Bubble at %d!\n", i);
if (suma >= sumb)
{
for (int j = 0; j < num - 1; j++)
{
Deleteouts(pG, b[j]);
}
Deleteout(pG, pG->vexs[i].in_edge);
pG->edg_num -= sumb;
pG->vex_num -= num;
}
else
{
for (int j = 0; j < num - 1; j++)
{
Deleteouts(pG, a[j]);
}
Deleteouts(pG, i);
pG->edg_num -= suma;
pG->vex_num -= num;
}
}
}
}
else if (ismutiin(pG->vexs[i]) == 1 && ismutiin(pG->vexs[i]) == 1)
{
VNode p = pG->vexs[pG->vexs[i].out_edge->data1];
VNode q = pG->vexs[pG->vexs[i].out_edge->nextoutD->data1];
int a[10000];
int suma = 0;
int b[10000];
int sumb = 0;
int num = 1;
a[0] = pG->vexs[i].out_edge->data1;
b[0] = pG->vexs[i].out_edge->nextoutD->data1;
while (isoneinineout(p) && isoneinineout(q))
{
suma += p.out_edge->value;
sumb += q.out_edge->value;
a[num] = p.out_edge->data1;
b[num] = q.out_edge->data1;
p = pG->vexs[ p.out_edge->data1];
q = pG->vexs[ q.out_edge->data1];
num++;
}
suma += pG->vexs[i].out_edge->value;
sumb += pG->vexs[i].out_edge->nextoutD->value;
if (isstart(p)&&isstart(q)==0)
{
printf("find a tip at %d!\n", i);
for (int i = 0; i < num; i++)
{
Deleteouts(pG, a[i]);
}
pG->edg_num -= suma;
pG->vex_num -= num;
}
else if (isstart(q)&&isstart(p)==0)
{
printf("find a tip at %d!\n", i);
for (int i = 0; i < num; i++)
{
Deleteouts(pG, b[i]);
}
pG->edg_num -= sumb;
pG->vex_num -= num;
}
else if (isstart(q) && isstart(p)) {
printf("find a tip at %d!\n", i);
if (suma >= sumb)
{
for (int i = 0; i < num; i++)
{
Deleteouts(pG, b[i]);
}
pG->edg_num -= sumb;
pG->vex_num -= num;
}
else if(sumb>suma)
{
for (int i = 0; i < num; i++)
{
Deleteouts(pG, a[i]);
}
pG->edg_num -= suma;
pG->vex_num -= num;
}
}
else
{
if (a[num - 1] == b[num - 1])
{
printf("find a Bubble at %d!\n", i);
if (suma >= sumb)
{
for (int i = 0; i < num; i++)
{
Deleteouts(pG, b[i]);
}
pG->edg_num -= sumb;
pG->vex_num -= num;
}
else
{
for (int i = 0; i < num; i++)
{
Deleteouts(pG, a[i]);
}
pG->edg_num -= suma;
pG->vex_num -= num;
}
}
}
}
}
}
}
}
void Print(LGraph* pG, char* filename) {
FILE* r;
fopen_s(&r, filename, "w");
//fputs('A',r);
printf("%d\n",pG->vex_num);
for (int i = 0;i <pG->vex_num;i++) {
if (isNULL(pG->vexs[i]) == 0) {
if (isstart(pG->vexs[i])||ismutiin(pG->vexs[i])) {
// printf("%s", dattochar(pG->vexs[i].data));
fputs(dattochar(pG->vexs[i].data),r);
VNode n = pG->vexs[pG->vexs[i].in_edge->data2];
while (isoneinineout(n)) {
fputs(dattosubchar(n.data), r);
// printf("%s\n", dattosubchar(n.data));
n = pG->vexs[n.in_edge->data2];
}
//printf("%s\n", dattosubchar(n.data));
fputs(dattosubchar(n.data),r);
fputs("\n",r);
// printf("\n");
}
if (ismutstart(pG->vexs[i]) || ismutiout(pG->vexs[i])) {
Nodes* N = pG->vexs[i].in_edge;
while (N != NULL) {
fputs(dattochar(pG->vexs[i].data), r);
VNode n = pG->vexs[N->data2];
while (isoneinineout(n)) {
fputs(dattosubchar(n.data), r);
// printf("%s", dattosubchar(n.data));
n = pG->vexs[n.in_edge->data2];
}
//printf("%s", dattosubchar(n.data));
fputs(dattosubchar(n.data), r);
fputs("\n", r);
N = N->nextinD;
}
}
}
}
fclose(r);
}
int main() {
Hash_table* h;
h = Creat_Table();
printf("StartCreatTable !\n");
CreatTable("C:/Users/86136/Desktop/xinjianwenjianjia/swxxxlab3/NC_008253_2.fastq", h);
CreatTable("C:/Users/86136/Desktop/xinjianwenjianjia/swxxxlab3/NC_008253_1.fastq", h);
printf("StartCreatTable ok!\n");
printf("StartErrorcorrection !\n");
Errorcorrection(h);
printf("Errorcorrection OK!\n");
printf("Start creatgraph !\n");
Hash_table* H;
H = Creat_Table();
pG = create();
CreatGraph(h, H, pG);
printf("creatgraph ok!\n");
printf("start deletetips!\n");
deletetips(pG);
printf("deletetips ok!\n");
printf("start printf");
Print(pG, "C:/Users/86136/Desktop/xinjianwenjianjia/swxxxlab3/result.txt");
printf("printf ok\n");
printf("huiyi\n");
}

