Advertisement

MAX-MINER 频繁模式挖掘 Apriori算法

阅读量:

这几天小白让我做一个max-miner 最大项集的挖掘,一般的算法有apriori和FP-TREE 考虑到用FP-TREE 可能有点复杂就用apriori算法先测试下,在小样本测试的时候速度非常快,当对一个5W行的文本测试效率变得不可承受了,因此对算法进行了分析,假设每行有100个单词当用apriori算法对3-候选集进行count的时候每一个候选集需要:
5W * 100*3 = 1500W 次的比较 当3-候选集有1000个那么需要150亿次比较效率变得不可接受即使用优化的算法在1天也很难得到结果。因此在样本很大的情况下Apriori算法并不适用。
原始Apriori算法借鉴了别人的一个程序进行了自己的改进,找不到原作者了在这里对作者表示感谢。

下面是代码:

复制代码
 #ifndef APRIOR_H

    
 #define APRIOR_H
    
 #include <vector>
    
 #include <map>
    
  
    
 using namespace std;
    
 class aprior{
    
 	public:
    
 		aprior(int supp, string name):support(supp),file(name){}
    
 		~aprior();
    
 		void openData();
    
 		void generate_1Itemset();
    
 		void generate_Alternative2();
    
 		void generate_Alternative();
    
 		void generate_ItemSet();
    
 		void get_Big_set();
    
 		void get2B();
    
 		void begin();
    
 		void countWord(char s);
    
 	private:
    
 		void countSupport();
    
 		void getNextSet();
    
 		void output1();
    
 		void output2();
    
 		bool compareS(const vector<string> s1, const vector<string> s2);
    
 		string getRes();
    
 		struct item{
    
 			string key;
    
 			int value;
    
 		};
    
  
    
 		struct mutiItem{
    
 			vector<string> key;
    
 			int value;
    
 		};
    
 		int support;
    
 		vector<mutiItem> vec_n_current;
    
 		vector<mutiItem> vec_muti_pre;
    
 		vector<mutiItem> vec_muti_current;
    
 		vector<item> vec_item;
    
 		vector<string> vec_str;
    
 		map<string, int> sequence;
    
 		string file;
    
 };
    
  
    
 #endif //end APRIOR_H
    
    
    
    
复制代码
 //aprior.cpp

    
 #include <algorithm>
    
 #include <iostream>
    
 #include <map>
    
 #include <sstream>
    
 #include <vector>
    
 #include <fstream>
    
 #include "aprior.h"
    
 #include <string>
    
 #include <cassert>
    
 #include <cstdio>
    
  
    
 using namespace std;
    
  
    
 //open file and each line of file as input database;
    
 void aprior::openData()
    
 {
    
 	ifstream in(file.c_str());
    
 	assert(in != NULL);
    
 	string word;
    
  
    
 	while(getline(in,word))
    
 	{
    
 		vec_str.push_back(word);
    
 	}
    
 }
    
  
    
 //1 genetative whit structure map
    
 void aprior::countWord(char split)
    
 {
    
 	string line, word;
    
 	for(vector<string>::iterator iter = vec_str.begin(); iter != vec_str.end(); ++iter){
    
 		line = *iter;
    
 		while(line.find(split) != -1)
    
 		{
    
 			word = line.substr(0, line.find(split));
    
 			++sequence[word];
    
  
    
 			line = line.substr(line.find(split)+1, line.size()-1);
    
 		}
    
 		++sequence[line];
    
 	}
    
 }
    
  
    
  
    
 void aprior::generate_1Itemset()
    
 {
    
 	item ite;
    
 	for(map<string,int>::iterator iter = sequence.begin(); iter !=sequence.end(); ++iter)
    
 	{
    
 		if(iter->second >= support)
    
 		{
    
 			ite.key = iter->first;
    
 			ite.value = iter->second;
    
 			vec_item.push_back(ite);
    
 		}
    
 	}
    
 }
    
  
    
 void aprior::generate_Alternative2()
    
 {
    
 	mutiItem mutiTmp;
    
 	vector<string> vecTmp;
    
  
    
 	for(vector<item>::iterator iter1 = vec_item.begin(); iter1 != vec_item.end() -1 ; ++iter1){
    
 		vecTmp.push_back(iter1->key);
    
 		for(vector<item>::iterator iter2 = iter1+1; iter2 !=vec_item.end(); ++iter2){
    
 			vecTmp.push_back(iter2->key);
    
 			mutiTmp.key = vecTmp;
    
 			mutiTmp.value = 0;
    
 			vec_muti_pre.push_back(mutiTmp);
    
 			vecTmp.pop_back();
    
 		}
    
  
    
 		vecTmp.clear();
    
 	}
    
 }
    
  
    
 void aprior::countSupport()
    
 {
    
 	bool flag = true;
    
 	vector<string> vec;
    
 	for(vector<mutiItem>::iterator iter1 = vec_muti_pre.begin(); iter1 != vec_muti_pre.end(); ++iter1){
    
 		vec = iter1->key;
    
 		for(vector<string>::iterator iter2 = vec_str.begin(); iter2 != vec_str.end(); ++iter2)
    
 		{
    
 			flag = true;
    
 			for(vector<string>::iterator iter3 = vec.begin(); iter3 != vec.end(); ++iter3){
    
 				if(iter2->find(*iter3) == -1)
    
 				{
    
 					flag =false;
    
 					break;
    
 				}
    
 			}
    
 				if(flag)
    
 					++iter1->value;
    
 		}
    
 	}
    
 }
    
  
    
 void aprior::get2B()
    
 {
    
 	ofstream out;
    
 	out.open(getRes().c_str(),ios::app);
    
 	out<<endl;
    
 	out<<"***************************************"<<endl;
    
 	vector<string> vecTmp;
    
 	bool flag = true;
    
 	for(vector<item>::iterator iter1 = vec_item.begin(); iter1 != vec_item.end(); ++iter1)
    
 	{
    
 		flag = true;
    
 		for(vector<mutiItem>::iterator iter = vec_muti_current.begin(); iter != vec_muti_current.end(); ++iter)
    
 		{
    
 			vecTmp =iter->key;
    
 			if(find(vecTmp.begin(),vecTmp.end(),iter1->key) != vecTmp.end())
    
 			{
    
 				flag =false;
    
 				break;
    
 			}
    
 		}
    
 		if(flag)
    
 			out<<iter1->key<<endl;
    
 	}
    
 	out<<"****************************************"<<endl;
    
 	out<<endl;
    
  
    
 	out.close();
    
 }
    
  
    
 void aprior::generate_ItemSet()
    
 {
    
 	vec_muti_current.clear();
    
 	for(vector<mutiItem>::iterator iter = vec_muti_pre.begin(); iter != vec_muti_pre.end(); ++iter){
    
 		if(iter->value >= support){
    
 			vec_muti_current.push_back(*iter);
    
 		}
    
 	}
    
 }
    
  
    
 void aprior::getNextSet()
    
 {
    
 	vec_n_current.clear();
    
 	vec_n_current = vec_muti_current;
    
 }
    
  
    
 void aprior::generate_Alternative(){
    
  
    
 	vec_muti_pre.clear();
    
 	int diff = 0;
    
 	vector<string> vecTmp;
    
 	string tmp;
    
 	mutiItem muti;
    
 	bool sa;
    
 	for(vector<mutiItem>::iterator iter1 = vec_muti_current.begin(); iter1 != vec_muti_current.end()-1; ++iter1)
    
 	{
    
  
    
 		vecTmp = iter1->key;
    
 		for(vector<mutiItem>::iterator iter2 = iter1+1; iter2 != vec_muti_current.end(); ++iter2){
    
 			diff = 0;
    
 			for(vector<string>::iterator iter3 = iter2->key.begin(); iter3 != iter2->key.end(); ++iter3){
    
 				if(find(vecTmp.begin(),vecTmp.end(),*iter3) == vecTmp.end())
    
 				{
    
 					++diff;
    
 					tmp = *iter3;				
    
 				}
    
 			}
    
 			if(diff == 1)
    
 			{
    
 				vecTmp.push_back(tmp);
    
 				muti.key = vecTmp;
    
 				muti.value = 0;
    
 				sa = true;
    
 				for(vector<mutiItem>::iterator itersa = vec_muti_pre.begin(); itersa != vec_muti_pre.end(); ++itersa)
    
 				{
    
 					if(compareS(itersa->key,muti.key))
    
 					{
    
 						sa = false;
    
 						break;
    
 					}
    
 				}
    
 				if(sa)
    
 				vec_muti_pre.push_back(muti);
    
 				vecTmp.pop_back();
    
 			}
    
 		}
    
 	}
    
 }
    
  
    
 bool aprior::compareS(vector<string> s1, vector<string> s2)
    
 {
    
 	for(vector<string>::iterator iter1 = s1.begin(); iter1 != s1.end(); ++iter1)
    
 	{
    
 			if(find(s2.begin(),s2.end(),*iter1)==s2.end())
    
 				return false;
    
 	}
    
  
    
 	return true;
    
 }
    
  
    
  
    
 void aprior::get_Big_set()
    
 {
    
 	ofstream out;
    
 	out.open(getRes().c_str(),ios::app);
    
 	out<<endl;
    
 	out<<"****************************************************************"<<endl;
    
 	bool flag = false;
    
 	vector<string> vecTmp;
    
 	for(vector<mutiItem>::iterator iter1 = vec_n_current.begin(); iter1 != vec_n_current.end(); ++iter1){
    
 		vecTmp = iter1->key;
    
 		flag = false;
    
 		for(vector<mutiItem>::iterator iter2 = vec_muti_current.begin(); iter2 != vec_muti_current.end(); ++iter2)
    
 		{
    
 			if(compareS(vecTmp,iter2->key))
    
 				flag = true;
    
 		}
    
 		if(!flag)
    
 		{
    
 			for(vector<string>::iterator iter = vecTmp.begin(); iter != vecTmp.end(); ++iter)
    
 				out<<*iter<<"\t";
    
 			out<<endl;
    
 		}
    
 	}
    
 	out<<"*************************************************************"<<endl;
    
 	out<<endl;
    
 }
    
  
    
 void aprior::begin()
    
 {
    
 	generate_1Itemset();
    
 	output1();
    
  
    
 	generate_Alternative2();
    
 	countSupport();
    
 	generate_ItemSet();
    
 	output2();
    
 	get2B();
    
  
    
 	while(1)
    
 	{
    
 		generate_Alternative();
    
 		getNextSet();
    
 		countSupport();
    
 		generate_ItemSet();
    
 	
    
 		if(vec_muti_current.size() == 0)
    
 			return;
    
 		output2();
    
 		get_Big_set();
    
 	}
    
 }
    
  
    
 string aprior::getRes()
    
 {
    
 	string tmp;
    
 	stringstream ss;
    
 	ss<<support;
    
 	tmp = ss.str();
    
 	tmp = file+tmp;
    
 	return tmp;
    
 }
    
 void aprior::output1()
    
 {
    
 	ofstream out;
    
 	string tmp = getRes();
    
 	out.open(tmp.c_str(),ios::app);
    
 	for(vector<item>::iterator iter = vec_item.begin(); iter != vec_item.end(); ++iter)
    
 	out<<iter->key<<'\t'<<iter->value<<endl;
    
  
    
 	out.close();
    
 }
    
  
    
 void aprior::output2()
    
 {
    
 	ofstream out;
    
 	out.open(getRes().c_str(),ios::app);
    
 	vector<string> vecTmp;
    
 	string tmp;
    
 	for(vector<mutiItem>::iterator iter = vec_muti_current.begin(); iter != vec_muti_current.end(); ++iter){
    
 		vecTmp = iter->key;
    
 		for(vector<string>::iterator iters = vecTmp.begin(); iters!= vecTmp.end(); ++iters)
    
 		out<<*iters<<"\t";
    
 		out<<iter->value<<endl;
    
  
    
 	}
    
 	out.close();
    
 }
    
  
    
 aprior::~aprior()
    
 {
    
  
    
 }
    
    
    
    
复制代码
 //main.cpp

    
 #include <iostream>
    
 #include "aprior.h"
    
  
    
 using namespace std;
    
  
    
 void test(int support,string file)
    
 {
    
 	aprior ap(support,file);
    
 	ap.openData();
    
 	ap.countWord(' ');
    
 	ap.begin();
    
 }
    
 int main()
    
 {
    
 	test(50,"data.txt");
    
 	return 0;
    
 }
    
    
    
    

在编译时候用g++ -c aprior.cpp main.cpp 就可以

要生成可执行文件需要g++ aprior.cpp main.cpp -o aprior就可以了。

在接下来要写FP-TREE的算法对样本就行测试对比效率。

全部评论 (0)

还没有任何评论哟~