MAX-MINER 频繁模式挖掘 Apriori算法
发布时间
阅读量:
阅读量
这几天小白让我做一个max-miner 最大项集的挖掘,一般的算法有apriori和FP-TREE 考虑到用FP-TREE 可能有点复杂就用apriori算法先测试下,在小样本测试的时候速度非常快,当对一个5W行的文本测试效率变得不可承受了,因此对算法进行了分析,假设每行有100个单词当用apriori算法对3-候选集进行count的时候每一个候选集需要:
5W * 100*3 = 1500W 次的比较 当3-候选集有1000个那么需要150亿次比较效率变得不可接受即使用优化的算法在1天也很难得到结果。因此在样本很大的情况下Apriori算法并不适用。
原始Apriori算法借鉴了别人的一个程序进行了自己的改进,找不到原作者了在这里对作者表示感谢。
下面是代码:
#ifndef APRIOR_H
#define APRIOR_H
#include <vector>
#include <map>
using namespace std;
class aprior{
public:
aprior(int supp, string name):support(supp),file(name){}
~aprior();
void openData();
void generate_1Itemset();
void generate_Alternative2();
void generate_Alternative();
void generate_ItemSet();
void get_Big_set();
void get2B();
void begin();
void countWord(char s);
private:
void countSupport();
void getNextSet();
void output1();
void output2();
bool compareS(const vector<string> s1, const vector<string> s2);
string getRes();
struct item{
string key;
int value;
};
struct mutiItem{
vector<string> key;
int value;
};
int support;
vector<mutiItem> vec_n_current;
vector<mutiItem> vec_muti_pre;
vector<mutiItem> vec_muti_current;
vector<item> vec_item;
vector<string> vec_str;
map<string, int> sequence;
string file;
};
#endif //end APRIOR_H
//aprior.cpp
#include <algorithm>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <fstream>
#include "aprior.h"
#include <string>
#include <cassert>
#include <cstdio>
using namespace std;
//open file and each line of file as input database;
void aprior::openData()
{
ifstream in(file.c_str());
assert(in != NULL);
string word;
while(getline(in,word))
{
vec_str.push_back(word);
}
}
//1 genetative whit structure map
void aprior::countWord(char split)
{
string line, word;
for(vector<string>::iterator iter = vec_str.begin(); iter != vec_str.end(); ++iter){
line = *iter;
while(line.find(split) != -1)
{
word = line.substr(0, line.find(split));
++sequence[word];
line = line.substr(line.find(split)+1, line.size()-1);
}
++sequence[line];
}
}
void aprior::generate_1Itemset()
{
item ite;
for(map<string,int>::iterator iter = sequence.begin(); iter !=sequence.end(); ++iter)
{
if(iter->second >= support)
{
ite.key = iter->first;
ite.value = iter->second;
vec_item.push_back(ite);
}
}
}
void aprior::generate_Alternative2()
{
mutiItem mutiTmp;
vector<string> vecTmp;
for(vector<item>::iterator iter1 = vec_item.begin(); iter1 != vec_item.end() -1 ; ++iter1){
vecTmp.push_back(iter1->key);
for(vector<item>::iterator iter2 = iter1+1; iter2 !=vec_item.end(); ++iter2){
vecTmp.push_back(iter2->key);
mutiTmp.key = vecTmp;
mutiTmp.value = 0;
vec_muti_pre.push_back(mutiTmp);
vecTmp.pop_back();
}
vecTmp.clear();
}
}
void aprior::countSupport()
{
bool flag = true;
vector<string> vec;
for(vector<mutiItem>::iterator iter1 = vec_muti_pre.begin(); iter1 != vec_muti_pre.end(); ++iter1){
vec = iter1->key;
for(vector<string>::iterator iter2 = vec_str.begin(); iter2 != vec_str.end(); ++iter2)
{
flag = true;
for(vector<string>::iterator iter3 = vec.begin(); iter3 != vec.end(); ++iter3){
if(iter2->find(*iter3) == -1)
{
flag =false;
break;
}
}
if(flag)
++iter1->value;
}
}
}
void aprior::get2B()
{
ofstream out;
out.open(getRes().c_str(),ios::app);
out<<endl;
out<<"***************************************"<<endl;
vector<string> vecTmp;
bool flag = true;
for(vector<item>::iterator iter1 = vec_item.begin(); iter1 != vec_item.end(); ++iter1)
{
flag = true;
for(vector<mutiItem>::iterator iter = vec_muti_current.begin(); iter != vec_muti_current.end(); ++iter)
{
vecTmp =iter->key;
if(find(vecTmp.begin(),vecTmp.end(),iter1->key) != vecTmp.end())
{
flag =false;
break;
}
}
if(flag)
out<<iter1->key<<endl;
}
out<<"****************************************"<<endl;
out<<endl;
out.close();
}
void aprior::generate_ItemSet()
{
vec_muti_current.clear();
for(vector<mutiItem>::iterator iter = vec_muti_pre.begin(); iter != vec_muti_pre.end(); ++iter){
if(iter->value >= support){
vec_muti_current.push_back(*iter);
}
}
}
void aprior::getNextSet()
{
vec_n_current.clear();
vec_n_current = vec_muti_current;
}
void aprior::generate_Alternative(){
vec_muti_pre.clear();
int diff = 0;
vector<string> vecTmp;
string tmp;
mutiItem muti;
bool sa;
for(vector<mutiItem>::iterator iter1 = vec_muti_current.begin(); iter1 != vec_muti_current.end()-1; ++iter1)
{
vecTmp = iter1->key;
for(vector<mutiItem>::iterator iter2 = iter1+1; iter2 != vec_muti_current.end(); ++iter2){
diff = 0;
for(vector<string>::iterator iter3 = iter2->key.begin(); iter3 != iter2->key.end(); ++iter3){
if(find(vecTmp.begin(),vecTmp.end(),*iter3) == vecTmp.end())
{
++diff;
tmp = *iter3;
}
}
if(diff == 1)
{
vecTmp.push_back(tmp);
muti.key = vecTmp;
muti.value = 0;
sa = true;
for(vector<mutiItem>::iterator itersa = vec_muti_pre.begin(); itersa != vec_muti_pre.end(); ++itersa)
{
if(compareS(itersa->key,muti.key))
{
sa = false;
break;
}
}
if(sa)
vec_muti_pre.push_back(muti);
vecTmp.pop_back();
}
}
}
}
bool aprior::compareS(vector<string> s1, vector<string> s2)
{
for(vector<string>::iterator iter1 = s1.begin(); iter1 != s1.end(); ++iter1)
{
if(find(s2.begin(),s2.end(),*iter1)==s2.end())
return false;
}
return true;
}
void aprior::get_Big_set()
{
ofstream out;
out.open(getRes().c_str(),ios::app);
out<<endl;
out<<"****************************************************************"<<endl;
bool flag = false;
vector<string> vecTmp;
for(vector<mutiItem>::iterator iter1 = vec_n_current.begin(); iter1 != vec_n_current.end(); ++iter1){
vecTmp = iter1->key;
flag = false;
for(vector<mutiItem>::iterator iter2 = vec_muti_current.begin(); iter2 != vec_muti_current.end(); ++iter2)
{
if(compareS(vecTmp,iter2->key))
flag = true;
}
if(!flag)
{
for(vector<string>::iterator iter = vecTmp.begin(); iter != vecTmp.end(); ++iter)
out<<*iter<<"\t";
out<<endl;
}
}
out<<"*************************************************************"<<endl;
out<<endl;
}
void aprior::begin()
{
generate_1Itemset();
output1();
generate_Alternative2();
countSupport();
generate_ItemSet();
output2();
get2B();
while(1)
{
generate_Alternative();
getNextSet();
countSupport();
generate_ItemSet();
if(vec_muti_current.size() == 0)
return;
output2();
get_Big_set();
}
}
string aprior::getRes()
{
string tmp;
stringstream ss;
ss<<support;
tmp = ss.str();
tmp = file+tmp;
return tmp;
}
void aprior::output1()
{
ofstream out;
string tmp = getRes();
out.open(tmp.c_str(),ios::app);
for(vector<item>::iterator iter = vec_item.begin(); iter != vec_item.end(); ++iter)
out<<iter->key<<'\t'<<iter->value<<endl;
out.close();
}
void aprior::output2()
{
ofstream out;
out.open(getRes().c_str(),ios::app);
vector<string> vecTmp;
string tmp;
for(vector<mutiItem>::iterator iter = vec_muti_current.begin(); iter != vec_muti_current.end(); ++iter){
vecTmp = iter->key;
for(vector<string>::iterator iters = vecTmp.begin(); iters!= vecTmp.end(); ++iters)
out<<*iters<<"\t";
out<<iter->value<<endl;
}
out.close();
}
aprior::~aprior()
{
}
//main.cpp
#include <iostream>
#include "aprior.h"
using namespace std;
void test(int support,string file)
{
aprior ap(support,file);
ap.openData();
ap.countWord(' ');
ap.begin();
}
int main()
{
test(50,"data.txt");
return 0;
}
在编译时候用g++ -c aprior.cpp main.cpp 就可以
要生成可执行文件需要g++ aprior.cpp main.cpp -o aprior就可以了。
在接下来要写FP-TREE的算法对样本就行测试对比效率。
全部评论 (0)
还没有任何评论哟~
