Naive Bayes Classifier — a Python Implementation
 # -*- coding: cp936 -*-

 #Implementation of a naive Bayes classifier

 #Usage:

 #reload(docclass)

 #c1=docclass.naivebayes(docclass.getwords)

 #docclass.sampletrain(c1)

 #c1.classify('quick rabbit',default='unknown')
    
 #构建训练样本
    
 def sampletrain(c1):
    
     c1.train('Nobody owns the water.','good')
    
     c1.train('the quick rabbit jumps fences','good')
    
     c1.train('buy pharmaceuticals now','bad')
    
     c1.train('make quick money at the online casino','bad')
    
     c1.train('the quick brown fox jumps','good')
    
 import re
    
 import math
    
 #从文本中提取特征
    
 def getwords(doc):
    
     splitter=re.compile('\ W*')
    
     #根据非字母字典进行单词拆分
    
     words=[s.lower() for s in splitter.split(doc) if len(s)>2 and len(s)<20]
    
     #只返回一组不重复的单词
    
     return dict([(w,1) for w in words])
    
  
    
 #分类器
    
 class classifier:
    
     def __init__(self,getfeatures):
    
     self.fc={}
    
     self.cc={}
    
     self.getfeatures=getfeatures
    
     #classifier.__init__(self,getfeatures)
    
     self.thresholds={}
    
  
    
     def setthreshold(self,cat,t):
    
     self.thresholds[cat]=t
    
  
    
     def getthreshold(self,cat):
    
     if cat not in self.thresholds:
    
         return 1.0
    
     return self.thresholds[cat]
    
  
    
     def incf(self,f,cat):
    
     self.fc.setdefault(f,{})
    
     self.fc[f].setdefault(cat,0)
    
     self.fc[f][cat]+=1
    
  
    
     def incc(self,cat):
    
     self.cc.setdefault(cat,0)
    
     self.cc[cat]+=1
    
  
    
     def fcount(self,f,cat):
    
     if f in self.fc and cat in self.fc[f]:
    
         return float(self.fc[f][cat])
    
     return 0.0
    
  
    
     def catcount(self,cat):
    
     if cat in self.cc:
    
         return float(self.cc[cat])
    
     return 0
    
  
    
     def totalcount(self):
    
     return sum(self.cc.values())
    
  
    
     def categories(self):
    
     return self.cc.keys()
    
  
    
     def train(self,item,cat):
    
     features=self.getfeatures(item)
    
     for f in features:
    
         self.incf(f,cat)
    
     self.incc(cat)
    
  
    
     def fprob(self,f,cat):
    
     if self.catcount(cat)==0:
    
         return 0
    
     return self.fcount(f,cat)/self.catcount(cat)
    
  
    
     def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
    
     basicprob=prf(f,cat)
    
  
    
     totals=sum([self.fcount(f,c) for c in self.categories()])
    
  
    
     bp=((weight*ap)+(totals*basicprob))/(weight+totals)
    
     return bp
    
  
    
     def classify(self,item,default=None):
    
     probs={}
    
  
    
     max=0.0
    
     for cat in self.categories():
    
         probs[cat]=self.prob(item,cat)
    
         if probs[cat]>max:
    
             max=probs[cat]
    
             best=cat
    
             
    
     for cat in probs:
    
         if cat==best:
    
             continue
    
         if probs[cat]*self.getthreshold(best)>probs[best]:
    
             return default
    
         return best
    
         
    
 class naivebayes(classifier):
    
     
    
     def docprob(self,item,cat):
    
     features=self.getfeatures(item)
    
  
    
     p=1
    
     for f in features:
    
         p*=self.weightedprob(f,cat,self.fprob)
    
     return p
    
  
    
     def prob(self,item,cat):
    
     catprob=self.catcount(cat)/self.totalcount()
    
     docprob=self.docprob(item,cat)
    
     return docprob*catprob

All comments (0)

No comments yet.