
A Programmer's Guide to Data Mining: Reading Notes 6


1. Before We Begin

The source code and data used in the book are available at:

http://guidetodatamining.com/

https://www.zybuluo.com/hainingwyx/note/559139

2. Naive Bayes and Text

Training phase

  1. Combine the documents labeled with the same hypothesis into a single text file.

  2. Count the number of word occurrences n in that file, building a vocabulary as you go.

  3. For each word w_k in the vocabulary, count its occurrences in the text, denoted n_k.

  4. For each word w_k in the vocabulary (stopwords removed), compute:

$$P(w_k \mid h) = \frac{n_k + 1}{n + |\text{Vocabulary}|}$$
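A minimal sketch of step 4 with toy counts (the words and numbers below are invented for illustration, not taken from the book):

# Toy illustration of P(w_k | h) = (n_k + 1) / (n + |Vocabulary|).
# All counts here are made up for illustration only.
vocabulary = {"engine": 4, "ride": 3, "god": 1}   # word -> n_k for hypothesis h
n = sum(vocabulary.values())                      # total word count, n = 8
vocab_size = len(vocabulary)                      # |Vocabulary| = 3

for word, n_k in vocabulary.items():
    p = float(n_k + 1) / (n + vocab_size)
    print("P(%s | h) = %.3f" % (word, p))         # e.g. P(engine | h) = 5/11 = 0.455

# add-one smoothing keeps unseen words from zeroing out the whole product:
print("P(unseen | h) = %.3f" % (float(0 + 1) / (n + vocab_size)))   # 1/11 = 0.091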

3. Learning

import os, codecs

class BayesText:

    def __init__(self, trainingdir, stopwordlist):
        """This class implements a naive Bayes approach to text
        classification.

        trainingdir is the training data. Each subdirectory of
        trainingdir is titled with the name of a classification
        category -- those subdirectories in turn contain the text
        files for that category.

        stopwordlist is a file of words (one per line) that will be
        removed before any counting takes place.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}          # stopword dictionary
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # filter out files that are not directories
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            print('    ' + category)
            # per-category word counts and the category's total word count
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)
        # I am going to eliminate any word in the shared vocabulary
        # that doesn't occur at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark word for deletion
                # can't delete now because you can't delete
                # from a dict you are currently iterating over
                toDelete.append(word)
        # now delete
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print('    ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                # conditional probability with add-one smoothing
                self.prob[category][word] = (float(count + 1)
                                             / denominator)
        print("DONE TRAINING\n\n")

    # input:  trainingdir -- directory of training files,
    #         category -- the category (subdirectory) to count
    # return: (counts, total) -- per-word counts for this category
    #         and the total number of words seen in it
    def train(self, trainingdir, category):
        """counts word occurrences for a particular category"""
        currentdir = trainingdir + category
        files = os.listdir(currentdir)
        counts = {}
        total = 0
        for file in files:
            #print(currentdir + '/' + file)
            f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1')
            for line in f:
                tokens = line.split()
                for token in tokens:
                    # get rid of punctuation and lowercase token
                    token = token.strip('\'".,?:-')
                    token = token.lower()
                    if token != '' and token not in self.stopwords:
                        self.vocabulary.setdefault(token, 0)
                        self.vocabulary[token] += 1   # counts across every document
                        counts.setdefault(token, 0)
                        counts[token] += 1            # counts for the current category
                        total += 1                    # total words in this category
            f.close()
        return (counts, total)

# test code (trainingDir and stoplistfile must point at your local data)
bT = BayesText(trainingDir, stoplistfile)
bT.prob['rec.motorcycles']["god"]
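Once training finishes, bT.prob maps each category to a word-to-probability dictionary, which makes it easy to inspect what the model learned. A minimal sketch (the category name assumes the 20-newsgroups layout from the test code above). Note the iteration over bT.vocabulary: after low-frequency words are pruned, self.prob[category] still carries raw counts for the pruned words, so only vocabulary entries hold true probabilities.

# ten highest-probability vocabulary words for one category
cat = 'rec.motorcycles'      # assumes the 20-newsgroups training directories
top = sorted(((w, bT.prob[cat][w]) for w in bT.vocabulary),
             key=lambda pair: pair[1], reverse=True)[:10]
for word, p in top:
    print("%-15s %.6f" % (word, p))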

Classification phase:

$$h_{MAP} = \operatorname*{arg\,max}_{h \in H} \; P(h) \prod_{k} P(w_k \mid h)$$

If the probabilities are extremely small, Python may fail to handle the computation (the product underflows to zero); converting them to log values and summing solves this problem. The concept of a stopword also needs re-examining in certain special cases: for example, in datasets involving sex crimes, the frequency of certain specific words has been found to rise significantly.
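A minimal sketch of the problem and the fix (the numbers are illustrative): multiplying a few thousand small conditional probabilities underflows to 0.0, while summing their logs stays comfortably in range.

import math

probs = [0.0001] * 2000          # e.g. 2000 words, each with P(w|h) = 0.0001

product = 1.0
for p in probs:
    product *= p
print(product)                   # 0.0 -- underflowed (true value is 1e-8000)

log_sum = sum(math.log(p) for p in probs)
print(log_sum)                   # about -18420.7 -- easily representable

Since log is monotonic, the category with the largest log sum is also the one with the largest product, so the argmax is unchanged.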

    # classify for mixed data: itemVector holds the categorical attribute
    # values, numVector the numeric ones (uses math from the standard library)
    def classify(self, itemVector, numVector):
        """Return the class we think itemVector is in"""
        results = []
        sqrt2pi = math.sqrt(2 * math.pi)
        for (category, prior) in self.prior.items():
            prob = prior
            col = 1
            for attrValue in itemVector:
                if attrValue not in self.conditional[category][col]:
                    # we did not find any instances of this attribute value
                    # occurring with this category, so prob = 0
                    prob = 0
                else:
                    prob = prob * self.conditional[category][col][attrValue]
                col += 1
            col = 1
            for x in numVector:
                # normal (Gaussian) density for each numeric attribute
                mean = self.means[category][col]
                ssd = self.ssd[category][col]
                ePart = math.pow(math.e, -(x - mean)**2 / (2 * ssd**2))
                prob = prob * ((1.0 / (sqrt2pi * ssd)) * ePart)
                col += 1
            results.append((prob, category))
        # return the category with the highest probability
        #print(results)
        return max(results)[1]

# test code -- note this call matches the text classifier's
# classify(filename) defined in the ten-fold version below
bT.classify(testDir + 'rec.motorcycles/104673')
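For the numeric attributes, the loop over numVector evaluates the normal (Gaussian) probability density, with the mean mu taken from self.means[category][col] and the sample standard deviation sigma from self.ssd[category][col]:

$$P(x \mid \text{category}) = \frac{1}{\sqrt{2\pi}\,\sigma}\; e^{-\frac{(x-\mu)^2}{2\sigma^2}}$$

In the code, ePart is the exponential factor and 1.0 / (sqrt2pi * ssd) is the normalizing constant in front.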

10-fold cross-validation
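The ten-fold code below assumes the data have already been divided into ten numbered buckets: each category directory contains subdirectories 0 through 9, and each fold holds one bucket out for testing while training on the other nine. Given the review_polarity paths at the bottom of the listing, the implied layout is roughly:

data/review_polarity/txt_sentoken/
    neg/
        0/  1/  2/  ...  9/
    pos/
        0/  1/  2/  ...  9/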

from __future__ import print_function

import os, codecs, math

class BayesText:

    # input: training-file directory, stopword list,
    #        and the bucket (file subset) to hold out
    def __init__(self, trainingdir, stopwordlist, ignoreBucket):
        """This class implements a naive Bayes approach to text
        classification.

        trainingdir is the training data. Each subdirectory of
        trainingdir is titled with the name of a classification
        category -- those subdirectories in turn contain the text
        files for that category.

        stopwordlist is a file of words (one per line) that will be
        removed before any counting takes place.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # filter out files that are not directories
        # (in this program: neg and pos)
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            #print('    ' + category)
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category,
                                                 ignoreBucket)
        # I am going to eliminate any word in the vocabulary
        # that doesn't occur at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark word for deletion
                # can't delete now because you can't delete
                # from a dict you are currently iterating over
                toDelete.append(word)
        # now delete
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        #print("Computing probabilities:")
        for category in self.categories:
            #print('    ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                self.prob[category][word] = (float(count + 1)
                                             / denominator)
        #print("DONE TRAINING\n\n")

    def train(self, trainingdir, category, bucketNumberToIgnore):
        """counts word occurrences for a particular category"""
        ignore = "%i" % bucketNumberToIgnore
        currentdir = trainingdir + category
        directories = os.listdir(currentdir)
        counts = {}
        total = 0
        for directory in directories:
            if directory != ignore:
                currentBucket = trainingdir + category + "/" + directory
                files = os.listdir(currentBucket)
                #print("   " + currentBucket)
                for file in files:
                    f = codecs.open(currentBucket + '/' + file, 'r', 'iso8859-1')
                    for line in f:
                        tokens = line.split()
                        for token in tokens:
                            # get rid of punctuation and lowercase token
                            token = token.strip('\'".,?:-')
                            token = token.lower()
                            if token != '' and token not in self.stopwords:
                                self.vocabulary.setdefault(token, 0)
                                self.vocabulary[token] += 1
                                counts.setdefault(token, 0)
                                counts[token] += 1
                                total += 1
                    f.close()
        return (counts, total)

    def classify(self, filename):
        results = {}
        for category in self.categories:
            results[category] = 0
        f = codecs.open(filename, 'r', 'iso8859-1')
        for line in f:
            tokens = line.split()
            for token in tokens:
                #print(token)
                token = token.strip('\'".,?:-').lower()
                if token in self.vocabulary:
                    for category in self.categories:
                        if self.prob[category][token] == 0:
                            print("%s %s" % (category, token))
                        results[category] += math.log(
                            self.prob[category][token])
        f.close()
        results = list(results.items())
        results.sort(key=lambda pair: pair[1], reverse=True)
        # for debugging I can change this to give me the entire list
        return results[0][0]

    # input:  direc -- the test directory for one category,
    #         category -- the current category,
    #         bucketNumber -- the held-out bucket
    # return: classification counts for this category, e.g. {'neg': 12, 'pos': 23}
    def testCategory(self, direc, category, bucketNumber):
        results = {}
        directory = direc + ("%i/" % bucketNumber)
        #print("Testing " + directory)
        files = os.listdir(directory)
        total = 0
        #correct = 0
        for file in files:
            total += 1
            result = self.classify(directory + file)
            results.setdefault(result, 0)
            results[result] += 1
            #if result == category:
            #    correct += 1
        return results

    # input:  testdir -- the test-file directory,
    #         bucketNumber -- the held-out bucket
    # return: classification counts for every category,
    #         e.g. {'neg': {'neg': 12, 'pos': 23}, ...}
    def test(self, testdir, bucketNumber):
        """Test all files in the test directory--that directory is
        organized into subdirectories--each subdir is a classification
        category"""
        results = {}
        categories = os.listdir(testdir)
        # filter out files that are not directories
        categories = [filename for filename in categories if
                      os.path.isdir(testdir + filename)]
        for category in categories:
            #print(".", end="")
            results[category] = self.testCategory(
                testdir + category + '/', category, bucketNumber)
        return results

def tenfold(dataPrefix, stoplist):
    # accumulate a confusion matrix over the ten folds
    results = {}
    for i in range(0, 10):
        bT = BayesText(dataPrefix, stoplist, i)
        r = bT.test(theDir, i)
        for (key, value) in r.items():
            results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                results[key].setdefault(ckey, 0)
                results[key][ckey] += cvalue
    categories = list(results.keys())
    categories.sort()
    print("\n       Classified as: ")
    header = "          "
    subheader = "        +"
    for category in categories:
        header += "% 2s   " % category
        subheader += "-----+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s    |" % category
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
            else:
                count = 0
            row += " %3i |" % count
            total += count
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)

# change these to match your directory structure
prefixPath = "data/review_polarity/"
theDir = prefixPath + "/txt_sentoken/"
stoplistfile = prefixPath + "stopwords25.txt"
tenfold(theDir, stoplistfile)
