新闻文章分类
 发布时间 
 阅读量: 
 阅读量 
    记录一下自然语言处理课程的作业。
    
    
      
    
        一、爬取数据集
1.爬取差别比较明显的五个类别
    import math
    import os
    import urllib.request
    import re
    from bs4 import BeautifulSoup
    
    
    # One category is crawled per run; the category is switched by hand.
    # Index -> category: 0 sports, 1 entertainment, 2 education, 3 technology, 4 stocks
    urllist = [
        'https://sports.163.com/',
        'https://ent.163.com/',
        'https://edu.163.com/',
        'https://tech.163.com/',
        'https://money.163.com/stock/',
    ]
    
    
    def get_urls(category=4):
        """Collect the article links found on one category landing page.

        Args:
            category: Index into ``urllist`` selecting the page to crawl.
                Defaults to 4, the index the script previously hard-coded.

        Returns:
            list: The matching ``href`` values, de-duplicated and kept in
            order of first appearance on the page.
        """
        # Use a context manager so the HTTP response is always closed.
        with urllib.request.urlopen(urllist[category]) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        # Keep only anchors whose href matches the article URL pattern.
        links = soup.find_all(name="a", attrs={"href": re.compile("https://www.163.com/dy/article/")})
        # dict.fromkeys drops duplicates while preserving first-seen order,
        # replacing the quadratic set() + sort(key=list.index) combination.
        return list(dict.fromkeys(link.get("href") for link in links))
    
    def get_article(urls, out_path="article/股票.txt", max_articles=10000):
        """Download each article and append its paragraphs to a text file.

        Args:
            urls: Iterable of article URLs to fetch.
            out_path: File the paragraph text is appended to, one paragraph
                per line. Defaults to the path the script previously
                hard-coded; change it to match the category being crawled.
            max_articles: Stop after this many articles have been processed.
                Defaults to 10000, the limit the script previously hard-coded.
        """
        for count, url in enumerate(urls, start=1):
            # Context managers guarantee the response and the file are closed
            # even if decoding or writing fails part-way through.
            with urllib.request.urlopen(url) as response:
                html = response.read().decode("utf-8")
            soup = BeautifulSoup(html, 'html.parser')
            # Select the <p> elements whose id starts with "0"; their text is
            # what gets saved.
            paragraphs = soup.find_all(name="p", attrs={"id": re.compile("^0")})
            with open(out_path, "a+", encoding="utf-8") as out_file:
                out_file.writelines("%s\n" % p.get_text() for p in paragraphs)
            # Stop once the configured number of articles has been read.
            if count >= max_articles:
                break
    
    
    if __name__ == "__main__":
        # Crawl the category landing page for article links, then download
        # every article and append its text to the output file.
        urls = get_urls()
        get_article(urls)
        print('爬取完成')
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
        2.将txt文件转换为csv文件,方便查看数据共有多少条
    import pandas as pd
    
    # Convert the crawled text file to CSV. The index column written by
    # to_csv makes it easy to see how many rows were collected.
    # The file is read line by line instead of via read_csv(sep='\n'):
    # recent pandas versions reject a newline separator, and header inference
    # would otherwise consume the first paragraph as the column name.
    with open("article/股票.txt", encoding="utf-8") as txt_file:
        # Skip blank lines so every row holds actual text.
        paragraphs = [line.rstrip("\n") for line in txt_file if line.strip()]
    temp = pd.DataFrame({"content": paragraphs})
    temp.to_csv("article/股票.csv")
    
    
      
      
      
      
      
      
      
    
        3.数据集展示

二、模型训练
1.读取数据,划分训练集和测试集
    import jieba
    import pandas as pd
    import random
    from sklearn.model_selection import train_test_split
    
    def _load_contents(csv_path, limit=500):
        """Return up to ``limit`` values of the ``content`` column of a CSV file.

        Rows containing any missing value are dropped before the slice is taken.
        """
        frame = pd.read_csv(csv_path, encoding='utf-8').dropna()
        return frame.content.values.tolist()[0:limit]


    # Load the samples of every category (at most 500 per category).
    sports = _load_contents("./article/体育.csv")
    ent = _load_contents("./article/娱乐.csv")
    edu = _load_contents("./article/教育.csv")
    tech = _load_contents("./article/科技.csv")
    money = _load_contents("./article/股票.csv")


    # Load the stop words used to filter tokens; they are read into a single
    # 'stopword' column and kept as a plain array of values.
    stopwords = pd.read_csv(
        "stopwords.txt",
        index_col=False,
        quoting=3,
        sep="\t",
        names=['stopword'],
        encoding='utf-8',
    )['stopword'].values
    
    # Segment the text into words and attach the category label
    def preprocess_text(content_lines, sentences, category):
        """Segment each line with jieba and append ``(text, category)`` pairs.

        Tokens of length 1 and tokens found in the module-level ``stopwords``
        are dropped; the remaining tokens are joined with single spaces.

        Args:
            content_lines: Iterable of raw text lines to segment.
            sentences: List that receives one ``(segmented_text, category)``
                tuple per successfully processed line (mutated in place).
            category: Label stored alongside every line.

        Returns:
            None. Lines that cannot be segmented are printed and skipped.
        """
        # Build the lookup set once: membership tests against it are O(1),
        # whereas testing against the raw array scans it for every token.
        stopword_set = set(stopwords)
        for line in content_lines:
            try:
                segs = jieba.lcut(line)
            except Exception:
                # The original clause was `except Exception and e`, which
                # raised NameError (undefined `e`) instead of skipping the line.
                print(line)
                continue
            kept = [seg for seg in segs if len(seg) > 1 and seg not in stopword_set]
            sentences.append((" ".join(kept), category))
    
    # Segment every category's samples and tag them with their label
    sentences = []
    for corpus, label in (
        (sports, '体育'),
        (ent, '娱乐'),
        (edu, '教育'),
        (tech, '科技'),
        (money, '股票'),
    ):
        preprocess_text(corpus, sentences, label)

    # Shuffle so the categories are interleaved before splitting
    random.shuffle(sentences)

    # Separate texts from labels, then split into training and test sets
    x, y = zip(*sentences)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

    print(sentences)
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
        可以看到训练集里面的格式:分词并去除停用词后的文本 + 类别标签

2.贝叶斯分类器训练模型,joblib保存模型
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
    from sklearn.naive_bayes import MultinomialNB 
    from sklearn import metrics 
    import joblib 
    
    # Turn the training texts into a TF-IDF weighted term matrix
    count_vec = TfidfVectorizer()
    train_feature = count_vec.fit_transform(x_train)

    # Train the multinomial naive Bayes classifier
    clf = MultinomialNB(alpha=0.01)
    clf.fit(train_feature, y_train)

    # Measure accuracy on the test set, using the vocabulary fitted above
    test_feature = count_vec.transform(x_test)
    predict_labels = clf.predict(test_feature)
    scorce = metrics.accuracy_score(y_true=y_test, y_pred=predict_labels)
    print(scorce)

    # Persist the classifier and the vectoriser; the file names must match
    # the ones the prediction UI loads.
    joblib.dump(value=clf, filename='bayes.pkl')
    joblib.dump(value=count_vec, filename='cout_vec.pkl')
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
        打印出模型在测试集上的准确率

三、成果展示
使用Python自带的GUI库tkinter实现界面
    import joblib
    import jieba
    import tkinter
    
    
    # tkinter UI: create the main window (Tk), then set its title and size
    window = tkinter.Tk()
    window.title('新闻文章分类器')
    window.geometry('800x300')
    
    def result():
        """Classify the text typed into the entry box and display the label.

        Reads the article from the module-level entry ``e1``, segments it with
        jieba, vectorises it with the saved vectoriser, predicts with the saved
        classifier, prints the label and writes it into the text box ``t``.
        """
        article = e1.get()
        # Load the persisted classifier and vectoriser (re-read on every call).
        classifier = joblib.load('bayes.pkl')
        vectorizer = joblib.load('cout_vec.pkl')
        # The vectoriser expects space-separated tokens.
        segmented = ' '.join(jieba.cut(article))
        features = vectorizer.transform([segmented])
        label = classifier.predict(features)[0]
        print(label)
        # Clear the previous result before showing the new one.
        t.delete(1.0, 'end')
        t.insert('insert', label)
    
    # Prompt label shown above the input box
    l1 = tkinter.Label(window, text='请输入新闻文章:')
    l1.pack()
    # Input box for the article text (read by result())
    e1 = tkinter.Entry(window, width=100)
    e1.pack()
    
    # Button that runs result() when clicked
    b1 = tkinter.Button(window, text="预测", command=result)
    b1.pack()
    # Text box in which result() writes the predicted label
    t = tkinter.Text(window, 
                 state='normal',  # either 'disabled' or 'normal'; 'normal' is the default
                 width=15, height=2
                 )
    t.pack()
    # Start the tkinter event loop
    window.mainloop()
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
        输入文章:体育类

输入文章:娱乐类

全部评论 (0)
 还没有任何评论哟~ 
