Scraping NetEase News with Scrapy
1. Create a Scrapy project
# Run the following in cmd, one command at a time
scrapy startproject news
cd news
scrapy genspider -t crawl news163 news.163.com
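After these commands the generated layout should look roughly like this (standard Scrapy scaffolding; details may vary by version):

news/
    scrapy.cfg            # deployment config
    news/
        items.py          # item definitions (step 2)
        pipelines.py      # item pipelines (step 4)
        settings.py       # project settings (step 5)
        spiders/
            news163.py    # generated CrawlSpider skeleton (step 3)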

2. Define the fields to scrape in items.py
import scrapy

class NewsItem(scrapy.Item):
    news_thread = scrapy.Field()   # article id taken from the URL
    news_title = scrapy.Field()
    news_time = scrapy.Field()
    news_source = scrapy.Field()
    source_url = scrapy.Field()
    news_text = scrapy.Field()
    news_url = scrapy.Field()
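A NewsItem behaves like a dict but only accepts the declared fields; a quick illustrative check (not part of the project files):

from news.items import NewsItem

item = NewsItem()
item['news_title'] = 'demo'
print(dict(item))        # {'news_title': 'demo'}
# item['foo'] = 1        # would raise KeyError: NewsItem does not support field: foo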

3. Write the spider in spiders/news163.py
# Import the required libraries
import scrapy
from news.items import NewsItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Work out the regular expression for article links by comparing two article URLs:
#   https://news.163.com/20/0427/20/FB8E63MK00018AOR.html
#   https://news.163.com/20/0428/07/FB9K5VRS0001899O.html
# which gives the pattern (here pinned to articles from 2020-04-28):
#   https://news.163.com/20/0428/\d+/.*?html
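The pattern can be sanity-checked before wiring it into the crawl rule; an illustrative aside, run separately (not part of news163.py):

import re

pattern = r'https://news.163.com/20/0428/\d+/.*?html'
assert re.match(pattern, 'https://news.163.com/20/0428/07/FB9K5VRS0001899O.html')
# a URL from a different date does not match the pinned pattern:
assert not re.match(pattern, 'https://news.163.com/20/0427/20/FB8E63MK00018AOR.html')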
# Write News163Spider(CrawlSpider)
# Most of the skeleton is generated by genspider; only adjust it for the target site
class News163Spider(CrawlSpider):
    name = 'news163'
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com/']

    rules = (
        Rule(LinkExtractor(allow=r'https://news.163.com/20/0428/\d+/.*?html'),
             callback='parse_item', follow=True),
    )

    # Populate the item
    def parse_item(self, response):
        item = NewsItem()
        # the thread id is the last URL segment minus the '.html' suffix
        item['news_thread'] = response.url.strip().split('/')[-1][:-5]
        return item
    # Example: extracting the title. Inside parse_item, call:
    #     self.get_title(response, item)
    def get_title(self, response, item):
        title = response.css('title::text').extract()  # CSS selector written for this site
        if title:
            print('title:{}'.format(title[0]))
            item['news_title'] = title[0]
# Extracting the other fields works the same way; adjust the selectors to the
# actual pages. The complete callback methods:
    def parse_item(self, response):
        item = NewsItem()
        item['news_thread'] = response.url.strip().split('/')[-1][:-5]
        self.get_title(response, item)
        self.get_time(response, item)
        self.get_source(response, item)
        self.get_source_url(response, item)
        self.get_text(response, item)
        self.get_url(response, item)
        return item

    def get_url(self, response, item):
        url = response.url
        if url:
            item['news_url'] = url

    def get_text(self, response, item):
        text = response.css('.post_text p::text').extract()
        if text:
            print('text:{}'.format(text))
            item['news_text'] = text

    def get_source_url(self, response, item):
        source_url = response.css('#ne_article_source::attr(href)').extract()
        if source_url:
            print('source_url:{}'.format(source_url[0]))
            item['source_url'] = source_url[0]

    def get_source(self, response, item):
        source = response.css('#ne_article_source::text').extract()
        if source:
            print('source:{}'.format(source[0]))
            item['news_source'] = source[0]

    def get_title(self, response, item):
        title = response.css('title::text').extract()
        if title:
            print('title:{}'.format(title[0]))
            item['news_title'] = title[0]

    def get_time(self, response, item):
        time = response.css('div.post_time_source::text').extract()
        if time:
            # drop the '来源' ("Source") label and full-width spaces around the timestamp
            cleaned = time[0].strip().replace('来源', '').replace('\u3000', '')
            print('time:{}'.format(cleaned))
            item['news_time'] = cleaned
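The CSS selectors above are easiest to tune interactively with Scrapy's shell; for example, using one of the article URLs from earlier:

scrapy shell "https://news.163.com/20/0427/20/FB8E63MK00018AOR.html"
>>> response.css('title::text').extract()
>>> response.css('.post_text p::text').extract()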
4. Write the pipeline in pipelines.py to export the data to CSV
from scrapy.exporters import CsvItemExporter

class NewsPipeline(object):
    def __init__(self):
        # CsvItemExporter needs a binary file handle, hence 'wb'
        self.file = open('news_data.csv', 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='UTF-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
5. Configure settings.py
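At a minimum this means registering the pipeline so Scrapy actually runs it; a minimal sketch, assuming the pipeline class above lives in news/pipelines.py:

# settings.py
ITEM_PIPELINES = {
    'news.pipelines.NewsPipeline': 300,  # 0-1000; lower numbers run earlier
}
# tutorials commonly also disable robots.txt compliance; check the site's terms first
ROBOTSTXT_OBEY = False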

6. Run the crawl
Run scrapy crawl news163 in cmd.
A news_data.csv file appears in the project directory (in the file tree on the left of your IDE); it holds the scraped data.
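To spot-check the export, a short illustrative snippet (field names follow items.py above):

import csv

with open('news_data.csv', encoding='utf-8') as f:
    for i, row in enumerate(csv.DictReader(f)):
        print(row['news_title'], row['news_url'])
        if i >= 4:  # the first five rows are enough for a sanity check
            break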
