京东评论爬取和词云绘制
发布时间
阅读量:
阅读量
以某男士腕表为例
要爬取的网址:https://item.jd.com/44238727209.html
打开浏览器开发者工具的Network面板,观察返回的数据是否为JSON

这一步并非多此一举:
如果请求采用GET方法,可以直接把Request URL复制到浏览器地址栏中打开,查看返回结果;
如果采用POST方法,则需要另想办法,把表单数据填写好之后再提交。
这里发现了一个有趣的现象:评论数据来自 https://club.jd.com/comment/productPageComments.action 这个URL,弄清楚它的相关参数(如callback、productId等)的含义,有助于定位并解析用户评论数据。

带有callback参数时,返回内容的首尾会多出一些字符(JSONP包装);把URL中的callback参数去掉之后,就可以直接返回JSON了。
import requests
import pandas as pd
import os
import json
def get_page(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the decoded JSON.

    Args:
        url: Address of the JSON endpoint to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON payload, or an empty string when the request fails,
        the server answers with an error status, or the body is not valid
        JSON. The falsy empty string is kept so existing callers see the
        same failure value as before.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # stalled connection, which would freeze the whole scrape.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Let requests guess the charset from the body before decoding.
        r.encoding = r.apparent_encoding
        return r.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers network/HTTP failures; ValueError covers
        # a body that cannot be decoded as JSON.
        print("error", e)
        return ""
def parse_page(data_json):
    """Convert one page of the comment payload into a DataFrame.

    Args:
        data_json: Decoded JSON object whose 'comments' entry is a list of
            comment dicts. Anything that is not a dict (for example the
            empty string returned by a failed fetch) or that lacks
            'comments' is treated as a page without comments.

    Returns:
        pandas.DataFrame with the columns 用户名, 评价内容, 星级评价, 评价日期 and
        评价视频url, one row per comment. The columns are present even when
        the page is empty.

    Raises:
        KeyError: If a comment lacks nickname, content, score or
            creationTime.
    """
    columns = ['用户名', '评价内容', '星级评价', '评价日期', '评价视频url']
    comments = data_json.get('comments') if isinstance(data_json, dict) else None
    rows = []
    for item in comments or []:
        # 'videos' sits next to 'nickname' on the comment itself; it may be
        # absent or an empty list, and both mean "no video".
        videos = item.get('videos')
        rows.append({
            '用户名': item['nickname'],
            '评价内容': item['content'],
            '星级评价': item['score'],
            '评价日期': item['creationTime'],
            '评价视频url': videos[0]['remark'] if videos else '没有视频',
        })
    return pd.DataFrame(rows, columns=columns)
def save_file(data_df):
    """Append the scraped comments to the Excel workbook and report success.

    DataFrame.to_excel always rewrites the target file, so rows that are
    already stored are read back and written again together with the new
    ones; the header row is therefore always emitted.

    Args:
        data_df: DataFrame holding the comment columns listed in ``colums``.
            Missing columns are written as empty cells.
    """
    colums = ['用户名', '评价内容', '星级评价', '评价日期', '评价视频url']
    path = '京东某手表评价信息.xlsx'
    combined = data_df.reindex(columns=colums)
    if os.path.exists(path):
        previous = pd.read_excel(path).reindex(columns=colums)
        combined = pd.concat([previous, combined], ignore_index=True)
    # The ``encoding`` keyword of to_excel was removed in pandas 2.0 and
    # never affected .xlsx output, so it is no longer passed.
    combined.to_excel(path, index=False, header=True)
    print("保存成功")
import requests
import pandas as pd
import os
import json
def get_page(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the decoded JSON.

    Args:
        url: Address of the JSON endpoint to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON payload, or an empty string when the request fails,
        the server answers with an error status, or the body is not valid
        JSON. The falsy empty string is kept so existing callers see the
        same failure value as before.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # stalled connection, which would freeze the whole scrape.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Let requests guess the charset from the body before decoding.
        r.encoding = r.apparent_encoding
        return r.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers network/HTTP failures; ValueError covers
        # a body that cannot be decoded as JSON.
        print("error", e)
        return ""
def parse_page(data_json):
    """Convert one page of the comment payload into a DataFrame.

    Args:
        data_json: Decoded JSON object whose 'comments' entry is a list of
            comment dicts. Anything that is not a dict (for example the
            empty string returned by a failed fetch) or that lacks
            'comments' is treated as a page without comments.

    Returns:
        pandas.DataFrame with the columns 用户名, 评价内容, 星级评价, 评价日期 and
        评价视频url, one row per comment. The columns are present even when
        the page is empty.

    Raises:
        KeyError: If a comment lacks nickname, content, score or
            creationTime.
    """
    columns = ['用户名', '评价内容', '星级评价', '评价日期', '评价视频url']
    comments = data_json.get('comments') if isinstance(data_json, dict) else None
    rows = []
    for item in comments or []:
        # 'videos' sits next to 'nickname' on the comment itself; it may be
        # absent or an empty list, and both mean "no video".
        videos = item.get('videos')
        rows.append({
            '用户名': item['nickname'],
            '评价内容': item['content'],
            '星级评价': item['score'],
            '评价日期': item['creationTime'],
            '评价视频url': videos[0]['remark'] if videos else '没有视频',
        })
    return pd.DataFrame(rows, columns=columns)
def save_file(data_df):
    """Append the scraped comments to the Excel workbook and report success.

    DataFrame.to_excel always rewrites the target file, so rows that are
    already stored are read back and written again together with the new
    ones; the header row is therefore always emitted.

    Args:
        data_df: DataFrame holding the comment columns listed in ``colums``.
            Missing columns are written as empty cells.
    """
    colums = ['用户名', '评价内容', '星级评价', '评价日期', '评价视频url']
    path = '京东某手表评价信息.xlsx'
    combined = data_df.reindex(columns=colums)
    if os.path.exists(path):
        previous = pd.read_excel(path).reindex(columns=colums)
        combined = pd.concat([previous, combined], ignore_index=True)
    # The ``encoding`` keyword of to_excel was removed in pandas 2.0 and
    # never affected .xlsx output, so it is no longer passed.
    combined.to_excel(path, index=False, header=True)
    print("保存成功")
if __name__ == "__main__":
    # Scrape the inclusive page range typed by the user and store every
    # comment with a single save at the end.
    begin = int(input('请输入起始页:'))
    end = int(input('请输入结束页:'))
    frames = []
    for i in range(begin, end + 1):
        # The page number is sent as i-1, so the user's page 1 maps to page=0.
        url = "https://club.jd.com/comment/productPageComments.action?productId=44238727209&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1".format(i - 1)
        data_json = get_page(url)
        if not data_json:
            # get_page reports a failed fetch with a falsy value; skip the
            # page instead of crashing while parsing it.
            continue
        frames.append(parse_page(data_json))
    if frames:
        # DataFrame.append was removed in pandas 2.0; concatenating once is
        # also cheaper than growing a frame page by page.
        all_data = pd.concat(frames)
        save_file(all_data)
    else:
        print("没有获取到任何评论")
#### 生成词云
```python
import wordcloud
import matplotlib.pyplot as plt
import jieba
import pandas as pd
from collections import Counter

# Load the comment sheet.
# When reading a CSV instead, pass encoding='utf-8-sig' or Chinese text is
# garbled, and pass engine='python' if the file name contains Chinese.
full_data = pd.read_excel('comments.xlsx')
# Alternative: read only the comment column by position with usecols=[1].
# Blank cells come back as NaN and number-only comments as numbers, neither
# of which jieba can segment, so drop the former and coerce the rest to str.
data = full_data['comment'].dropna().astype(str)

# Segment every comment and count words longer than one character.
counts = Counter()
for text in data:
    counts.update(word for word in jieba.lcut(text) if len(word) > 1)

# Draw the word cloud.
pic = plt.imread('timg.jfif')  # image passed to WordCloud as the mask
w = wordcloud.WordCloud(
    mask=pic,
    background_color='white',  # background colour of the cloud
    font_path='C:/Windows/Fonts/simhei.TTF',
)
w.fit_words(counts)  # Counter is a dict of word -> frequency
plt.imshow(w)  # hand the cloud to matplotlib as image data
plt.axis('off')  # hide the x/y axes
w.to_file('comments.jpg')  # save the rendered cloud
# plt.savefig('comments.jpg') would save the matplotlib figure instead.
全部评论 (0)
还没有任何评论哟~
