Crawling NetEase News with Scrapy

1. Create a Scrapy project
    # Run the following commands in cmd, one after another:
    scrapy startproject news
    cd news
    scrapy genspider -t crawl news163 news.163.com
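The genspider command with -t crawl creates a spider from the CrawlSpider template. The resulting project layout should look roughly like this (a sketch; minor files vary by Scrapy version):

    news/
        scrapy.cfg
        news/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                news163.py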
2. Define the fields to scrape in items.py
    import scrapy

    class NewsItem(scrapy.Item):
        news_thread = scrapy.Field()
        news_title = scrapy.Field()
        news_time = scrapy.Field()
        news_source = scrapy.Field()
        source_url = scrapy.Field()
        news_text = scrapy.Field()
        news_url = scrapy.Field()
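A NewsItem behaves like a dict restricted to the declared fields, which is what lets the spider below fill it in with item['news_title'] = ... (a quick illustrative check, not part of the project files; the 'author' key is a made-up example):

    from news.items import NewsItem

    item = NewsItem()
    item['news_title'] = 'example'   # fine: a declared field
    # item['author'] = 'x'           # would raise KeyError: undeclared field
    print(dict(item))                # {'news_title': 'example'}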

3. Analyze the page source and write news163.py

    # Import the required libraries
    import scrapy
    from news.items import NewsItem
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    # Work out the regular expression by comparing two article URLs:
    #   https://news.163.com/20/0427/20/FB8E63MK00018AOR.html
    #   https://news.163.com/20/0428/07/FB9K5VRS0001899O.html
    # which gives the pattern:
    #   https://news.163.com/20/0428/\d+/.*?html

    # Write News163Spider(CrawlSpider).
    # genspider generates most of this skeleton; only the rules and the
    # parsing methods need to be adapted to the target site.
    class News163Spider(CrawlSpider):
        name = 'news163'
        allowed_domains = ['news.163.com']
        start_urls = ['http://news.163.com/']

        rules = (
            Rule(LinkExtractor(allow=r'https://news.163.com/20/0428/\d+/.*?html'), callback='parse_item', follow=True),
        )

        # Build the item from the response
        def parse_item(self, response):
            item = NewsItem()
            # The URL's last segment without '.html', e.g. 'FB8E63MK00018AOR'
            item['news_thread'] = response.url.strip().split('/')[-1][:-5]
            # Example: extract the title
            self.get_title(response, item)
            return item

        def get_title(self, response, item):
            # CSS extraction rule written for this site
            title = response.css('title::text').extract()
            if title:
                print('title:{}'.format(title[0]))
                item['news_title'] = title[0]
    # The other fields are extracted in the same way; adjust the selectors to
    # the actual page. The full parse_item and its helpers:
    def parse_item(self, response):
        item = NewsItem()
        item['news_thread'] = response.url.strip().split('/')[-1][:-5]
        self.get_title(response, item)
        self.get_time(response, item)
        self.get_source(response, item)
        self.get_source_url(response, item)
        self.get_text(response, item)
        self.get_url(response, item)
        return item

    def get_url(self, response, item):
        url = response.url
        if url:
            item['news_url'] = url

    def get_text(self, response, item):
        text = response.css('.post_text p::text').extract()
        if text:
            print('text:{}'.format(text))
            item['news_text'] = text

    def get_source_url(self, response, item):
        source_url = response.css('#ne_article_source::attr(href)').extract()
        if source_url:
            print('source_url:{}'.format(source_url[0]))
            item['source_url'] = source_url[0]

    def get_source(self, response, item):
        source = response.css('#ne_article_source::text').extract()
        if source:
            print('source:{}'.format(source[0]))
            item['news_source'] = source[0]

    def get_title(self, response, item):
        title = response.css('title::text').extract()
        if title:
            print('title:{}'.format(title[0]))
            item['news_title'] = title[0]

    def get_time(self, response, item):
        time = response.css('div.post_time_source::text').extract()
        if time:
            # Strip whitespace and the '来源' (source) label from the raw text
            time = time[0].strip().replace('来源', '').replace('\u3000', '')
            print('time:{}'.format(time))
            item['news_time'] = time
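Before running the spider, you can sanity-check the link pattern against the two sample URLs with Python's re module (a quick illustrative check, not part of the project files):

    import re

    pattern = r'https://news.163.com/20/0428/\d+/.*?html'
    urls = [
        'https://news.163.com/20/0427/20/FB8E63MK00018AOR.html',
        'https://news.163.com/20/0428/07/FB9K5VRS0001899O.html',
    ]
    for url in urls:
        print(url, '->', bool(re.match(pattern, url)))
    # Only the 0428 URL matches. To crawl articles from other days, widen the
    # date part of the pattern, e.g. r'https://news.163.com/20/\d+/\d+/.*?html'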

4. Write pipelines.py

    from scrapy.exporters import CsvItemExporter

    class NewsPipeline(object):
        def __init__(self):
            self.file = open('news_data.csv', 'wb')
            self.exporter = CsvItemExporter(self.file, encoding="UTF-8")
            self.exporter.start_exporting()

        def close_spider(self, spider):
            self.exporter.finish_exporting()
            self.file.close()

        def process_item(self, item, spider):
            self.exporter.export_item(item)
            return item
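As a design note, a custom pipeline is only one way to produce CSV output; Scrapy's built-in feed export can write the same file with no pipeline code at all:

    scrapy crawl news163 -o news_data.csv

This tutorial uses the pipeline, which also requires registering it in settings.py, the next step.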

5. Configure settings.py

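A minimal sketch of the settings this project needs, assuming the default project names (registering the pipeline is required for NewsPipeline to run; the other values are common choices for demo crawls, not confirmed by the original):

    # settings.py
    BOT_NAME = 'news'

    # Demo crawls often disable robots.txt checking; an assumption, adjust as needed
    ROBOTSTXT_OBEY = False

    # Enable the CSV pipeline written in step 4
    ITEM_PIPELINES = {
        'news.pipelines.NewsPipeline': 300,
    }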

6. Run the crawl

    # In cmd, run:
    scrapy crawl news163

A news_data.csv file appears in the project directory (the file panel on the left of your editor); this is the scraped data.
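A quick way to inspect the exported file (assumes pandas is installed; the column names come from the item fields):

    import pandas as pd

    df = pd.read_csv('news_data.csv')
    print(df.shape)
    print(df[['news_title', 'news_time', 'news_source']].head())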

