Advertisement

scrapy爬取某网站景区评论爬虫

阅读量:

step1.研究网页结构,每个景点有一个景区的超链接 https://piao.ctrip.com/ticket/dest/t2286.html

step2.链接到景区页面后,评论在scrapy shell中不显示,推测应该是由ajax等方式发起的异步请求。

  1. 找到的地址是:https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList
  2. 请求体中包含景区的viewid(即景区链接里的2286),其他就是一些分页等的内容,可以自己设定。

step3.计划这个爬虫分2步

  1. 爬取景点的code
  2. 根据code爬取 景区的评论

step4.源码放到git上了:https://github.com/wenwen0220/xiechengDemo

主要代码如下:

爬取code:

复制代码
 import scrapy

    
 from xiechengDemo.items import SceneryCodeItem
    
 import random
    
 import re
    
 #爬取景区的code
    
 class SceneryCodeSpider(scrapy.Spider):
    
 	name = "sceneryCode"
    
 	#要爬取的url集合
    
 	# start_urls = ['https://you.ctrip.com/sightlist/shandong100039/s0-p2.html']
    
 	#可以直接读取文件
    
 	start_urls=[i.strip() for i in open('/Users/jw/python/xiechengDemo/urls.txt').readlines()]
    
  
    
 	def parse(slf,response):
    
 		# print(response)
    
 		#用xpath获取需要的内容
    
 		sceneryName_list=response.xpath('.//*[@class="list_mod2"]/div[2]/dl/dt/a/text()').extract()
    
 		#获取景区的url连接地址
    
 		sceneryUrl_list=response.xpath('.//*[@class="list_mod2"]/div[2]/dl/dt/a/@href').extract()
    
 		# print(sceneryName_list)
    
 		list=[]
    
  
    
 		for i,j in zip(sceneryName_list,sceneryUrl_list):
    
 			#将url切分,获取景区code与城市名称
    
 			uri=j.split("/")
    
 			sceneryItem=SceneryCodeItem()
    
 			# item['_id']=str(random.randint(1,1000))
    
 			sceneryItem['provinceName']= "shandong"
    
 			#获取所有非数字的,正则表达式(qingdao)
    
 			sceneryItem['cityName']= re.findall("\D+",uri[2])[0]
    
 			sceneryItem['sceneryName']=i
    
 			#获取所有数字的,正则表达式(1234)
    
 			sceneryItem['sceneryCode']=re.findall("\d+",uri[3])[0]
    
 			print(sceneryItem)
    
 			yield sceneryItem
    
 		# 	list.append(sceneryItem)
    
 		# return list

爬取评论

复制代码
 import scrapy

    
 from xiechengDemo.items import SceneryCommentsItem
    
 import random
    
 import json
    
 import re
    
 import datetime
    
 from datetime import date
    
  
    
 #根据景区的id爬取景区的评论
    
 class SceneryCommentSpider(scrapy.Spider):
    
 	name = "sceneryComment"
    
  
    
 	def start_requests(self):
    
  
    
 		postUrl="https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList"
    
 		for data in self.getBody():
    
 			#FormRequest方法的content-type默认是“application/x-www-form-urlencoded”,请求会返回空,用下边的方法替换。
    
 			# yield scrapy.FormRequest(url=postUrl,formdata=data,callback=self.parse) 
    
 			yield scrapy.Request(
    
 				postUrl, 
    
 				body=json.dumps(data[0]), 
    
 				method='POST', 
    
 				headers={'Content-Type': 'application/json'},
    
 				callback=lambda response,sceneryCode=data[1],sceneryName=data[2]: self.parse(response,sceneryCode,sceneryName))
    
  
    
 	def parse(slf,response,sceneryCode,sceneryName):
    
 		# print(response.text)
    
  
    
 		# date=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
    
 		#获取今天的时间
    
 		# today = date.today()
    
 		beginDate=date(2019,1,1)
    
  
    
  
    
 		jsonArray=json.loads(response.body)['data']['comments']
    
 		for i in jsonArray:
    
 			#评论日期
    
 			# commentDate=datetime.datetime.strptime(i['date'],'%Y-%m-%d')
    
 			#获取年-月-日,格式是str
    
 			commentDateStr=datetime.datetime.strptime(i['date'], '%Y-%m-%d %H:%M').strftime('%Y-%m-%d')
    
 			#str转换成datetime
    
 			b=datetime.datetime.strptime(commentDateStr,'%Y-%m-%d')
    
 			#datetime转换成date
    
 			commentDate=datetime.datetime.date(b)
    
 			# print("------is",commentDate)
    
 			#不是2019年的就跳出
    
 			if commentDate<beginDate :
    
 				continue
    
  
    
 			sceneryCommentsItem=SceneryCommentsItem()
    
 			sceneryCommentsItem['id']=i['id']
    
 			sceneryCommentsItem['uid']=i['uid']
    
 			sceneryCommentsItem['title']=i['title']
    
 			sceneryCommentsItem['content']=i['content']
    
 			sceneryCommentsItem['date']=i['date']
    
 			sceneryCommentsItem['score']=i['score']
    
 			sceneryCommentsItem['sceneryCode']=sceneryCode
    
 			sceneryCommentsItem['sceneryName']=sceneryName
    
 			yield sceneryCommentsItem
    
  
    
 	#获取body的方法
    
 	def getBody(self):
    
 		# f=open("/Users/didi/jw/python/xiechengDemo/sceneryCode.json")
    
 		# res=f.read
    
 		# jsonArray=json.load(res)
    
 		#读取json文件
    
 		listData=[]
    
 		with open('/Users/jw/python/xiechengDemo/sceneryCode.json','r') as f:
    
 			#直接用load方法
    
 			jsonArray=json.load(f)
    
 		for i in jsonArray:
    
 			# print(i['sceneryCode'])
    
 			#请求的内容根据自己要爬取的页面数,与页面size自定义
    
 			data={
    
 				"pageid": "10650000804",
    
 			    "viewid": i['sceneryCode'],
    
 			    "tagid": "0",
    
 			    "pagenum": "1",
    
 			    "pagesize": "50",
    
 			    "contentType": "json",
    
 			    "head": {
    
 			        "appid": "100013776",
    
 			        "cid": "09031037211035410190",
    
 			        "ctok": "",
    
 			        "cver": "1.0",
    
 			        "lang": "01",
    
 			        "sid": "8888",
    
 			        "syscode": "09",
    
 			        "auth": "",
    
 			        "extension": [
    
 			            {
    
 			                "name": "protocal",
    
 			                "value": "https"
    
 			            }
    
 			        ]
    
 			    },
    
 			    "ver": "7.10.3.0319180000"
    
 			}
    
 			list=[]
    
 			list.append(data)
    
 			list.append(i['sceneryCode'])
    
 			list.append(i['sceneryName'])
    
 			listData.append(list)
    
 		return listData

最后的结果,写到了mysql如下:

全部评论 (0)

还没有任何评论哟~