Advertisement

C++ 重复的DNA序列

阅读量:

将DNA序列视为仅由A、C、G、T这四个字母构成的字符串,并对给定的一个DNA字符串进行处理。
具体来说,在该字符串中寻找所有长度为10且至少出现两次以上的子串。
例如:
假设输入字符串 s = "AAAAACCCCC" + "AAA" + "CC", 则返回值应包含两个满足条件的部分。
再如:
当输入全由A组成时,
即 s = "AAAAAAAAAA",
则返回值仅为这一种情况。

方法一:

复制代码
    #include <string>
    #include<vector>
    #include<map>
    class Solution
    {
    public:
     Solution() {}
     ~Solution() {}
     std::vector<std::string> findRepeatedDnaSequences(std::string s)
     {
      std::vector<std::string> result;
      std::map<std::string, int> word_map;
      for (int i = 0; i < s.length(); i++)
      {
       std::string word = s.substr(i, 10);
       if (word_map.find(word) != word_map.end())
       {
    word_map[word]+=1;
       }
       else
       {
    word_map[word] = 1;
       }
      }
      std::map<std::string, int>::iterator it;
      for (it = word_map.begin(); it != word_map.end(); it++)
      {
       if (it->second > 1)
       {
    result.push_back(it->first);
       }
      }
      return result;
     }
    };
    int main()
    {
     std::string s = "AAAAACCCCCAAAAACCCCCAAAAAGGGTTT";
     Solution solve;
     std::vector<std::string> result = solve.findRepeatedDnaSequences(s);
     for (int i = 0; i < result.size(); i++)
     {
      printf("%s\n", result[i].c_str());
     }
     return 0;
    }

运行结果为:

复制代码
    AAAAACCCCC
    AAAACCCCCA
    AAACCCCCAA
    AACCCCCAAA
    ACCCCCAAAA
    CCCCCAAAAA

方法二:

复制代码
    #include <string>
    #include <vector>
    int g_hash_map[1048576] = { 0 };
    std::string change_int_to_DNA(int DNA)
    {
     static const char DNA_CHAR[] = { 'A','C','G','T' };
     std::string str;
     for (int  i = 0; i < 10; i++)
     {
      str += DNA_CHAR[DNA & 3];
      DNA = DNA >> 2;
     }
     return str;
    }
    class Solution
    {
    public:
     Solution(){}
     ~Solution(){}
     std::vector<std::string> findRepeatedDnaSequences(std::string s)
     {
      std::vector<std::string> result;
      if (s.length()<10)
      {
       return result;
      }
      for (int i = 0; i < 1048576; i++)
      {
       g_hash_map[i] = 0;
      }
      int char_map[128] = { 0 };
      char_map['A'] = 0;
      char_map['C'] = 1;
      char_map['G'] = 2;
      char_map['T'] = 3;
      int key = 0;
      for (int i = 9; i >=0; i--)
      {
       key = (key << 2) + char_map[s[i]];
      }
      g_hash_map[key] = 1;
      for (int i = 10; i < s.length(); i++)
      {
       key = key >> 2;
       key = key | (char_map[s[i]] << 18);
       g_hash_map[key]++;
      }
      for (int i = 0; i < 1048576; i++)
      {
       if (g_hash_map[i]>1)
       {
    result.push_back(change_int_to_DNA(i));
       }
      }
      return result;
     }
    };
    int main()
    {
     std::string s = "AAAAACCCCCAAAAACCCCCAAAAAGGGTTT";
     Solution solve;
     std::vector<std::string> result = solve.findRepeatedDnaSequences(s);
     for (int i = 0; i < result.size(); i++)
     {
      printf("%s\n", result[i].c_str());
     }
     return 0;
    }

运行结果为:

复制代码
    CCCCCAAAAA
    ACCCCCAAAA
    AACCCCCAAA
    AAACCCCCAA
    AAAACCCCCA
    AAAAACCCCC

全部评论 (0)

还没有任何评论哟~