Advertisement

Scrapy爬取京东图书信息

阅读量:

网页结构比较简单,实际操作中需要注意以下几点:首先,图书价格需要通过单独的价格接口(代码中为 p.3.cn)获取,因此要先找到该接口再发起请求;其次,通过 meta 传递 item 时要使用深拷贝(deepcopy),否则多个请求会共享同一个字典对象,后续循环中的赋值会覆盖之前的数据;最后,要保证 item 在各个回调函数之间传递时数据完整、一致。以下为完整的 Spider 源文件内容:

复制代码
    # -*- coding: utf-8 -*-
    import scrapy
    from copy import deepcopy
    import json
    
    
    class JsbookSpider(scrapy.Spider):
        """Crawl the JD.com book catalogue: categories, book listings, prices.

        The crawl runs in three stages, each handled by one callback:

        1. ``parse`` reads the category index page and follows every
           sub-category link.
        2. ``parse_book_page`` reads one listing page, schedules a price
           request per book and follows the "next page" link.
        3. ``get_book_price`` reads the price response and emits the item.

        Items are plain dicts handed from callback to callback through
        ``Request.meta['item']``. Every request receives its own copy of the
        dict so that later loop iterations cannot overwrite data that an
        already-scheduled request is still carrying.
        """

        name = 'jdbook'
        allowed_domains = ['jd.com', 'p.3.cn']
        start_urls = ['https://book.jd.com/booksort.html']

        def parse(self, response):
            """Yield one listing-page request per sub-category.

            Each request carries an item holding ``big_sort``, ``small_sort``
            and ``small_sort_href``. Sub-categories without a link are skipped,
            because a request cannot be built without a URL.
            """
            dt_list = response.xpath('//div[@id="booksort"]/div[2]/dl/dt')
            # Top-level categories.
            for dt in dt_list:
                big_sort = dt.xpath('./a/text()').extract_first()

                # Sub-categories live in the <dd> that follows each <dt>.
                em_list = dt.xpath('./following-sibling::dd[1]/em')
                for em in em_list:
                    href = em.xpath('./a/@href').extract_first()
                    if href is None:
                        continue

                    # A fresh dict per request: nothing is shared between
                    # the requests scheduled by this loop.
                    item = {
                        'big_sort': big_sort,
                        'small_sort': em.xpath('./a/text()').extract_first(),
                        # urljoin resolves protocol-relative ("//host/...")
                        # links against the scheme of the current page.
                        'small_sort_href': response.urljoin(href),
                    }
                    yield scrapy.Request(
                        url=item['small_sort_href'],
                        callback=self.parse_book_page,
                        meta={'item': item},
                    )

        def parse_book_page(self, response):
            """Yield a price request per listed book, then the next page.

            The category item received through ``response.meta`` is never
            modified; each book gets its own deep copy extended with
            ``book_name``, ``book_href`` and ``book_author``. Books without a
            ``data-sku`` attribute are skipped because the price URL is built
            from that value.
            """
            category = response.meta.get('item', {})

            # Book entries on the listing page.
            for li in response.xpath('//ul[@class="gl-warp clearfix"]/li'):
                book = deepcopy(category)

                # Book title.
                title = li.xpath(
                    './div/div[@class="p-name"]/a/em/text()'
                ).extract_first()
                book['book_name'] = title.strip() if title is not None else None

                # Book detail page URL.
                href = li.xpath(
                    './div/div[@class="p-name"]/a/@href'
                ).extract_first()
                book['book_href'] = (
                    response.urljoin(href) if href is not None else None
                )

                # Promo text (./div/div[@class="p-name"]/a/i/text()) is
                # intentionally not collected.

                # Author.
                book['book_author'] = li.xpath(
                    './div/div[@class="p-bookdetails"]/span/span/a/@title'
                ).extract_first()

                # The price comes from a separate endpoint keyed by the SKU.
                sku = li.xpath('./div/@data-sku').extract_first()
                if sku is None:
                    continue

                yield scrapy.Request(
                    url='https://p.3.cn/prices/mgets?&skuIds=J_{}'.format(sku),
                    callback=self.get_book_price,
                    meta={'item': book},
                )

            # Follow pagination, carrying only the category fields forward.
            next_url = response.xpath('//a[@class="pn-next"]/@href').extract_first()
            if next_url is not None:
                self.logger.info('获取下一页')
                yield scrapy.Request(
                    # Resolved against the current page so the scheme and
                    # host match the listing being paginated.
                    url=response.urljoin(next_url),
                    callback=self.parse_book_page,
                    meta={'item': deepcopy(category)},
                )

        def get_book_price(self, response):
            """Attach the price to the item and emit it.

            The response body is parsed as JSON and the ``op`` field of its
            first entry is stored as ``book_price``. If the body is not valid
            JSON or does not have that shape, ``book_price`` is set to ``None``
            and a warning is logged, so the book is still emitted.
            """
            item = response.meta.get('item', {})
            try:
                item['book_price'] = json.loads(response.text)[0]['op']
            except (ValueError, IndexError, KeyError, TypeError):
                # ValueError covers json.JSONDecodeError; the others cover an
                # empty list or an unexpected payload structure.
                self.logger.warning(
                    'Unexpected price payload from %s', response.url
                )
                item['book_price'] = None

            yield item
    
    
    python
    
    
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-08-18/wRuCfpLzNWIajOUhKlo7D2xt9PAZ.png)

全部评论 (0)

还没有任何评论哟~