
JD.com Book Spider


Result:

version_0

Analysis:

The category data comes back as JSON. The top-level categories are served by https://lapi.jd.com/pool?body={%22pid%22:%220101947%22}&source=book, and each poolId in the response maps to one top-level title. A sample (partial) entry:

```
title: "文学综合馆",
operate_title: "",
sort_num: 2,
fid: 0,
id: 7369,
ext_info: {
    poolId: "0101945"
}
```

Each title and its poolId live in the same dict, so once we have a top-level category's poolId we can build the next request, e.g. https://lapi.jd.com/pool?body={%22pid%22:%220101945%22}&source=book, which returns the mid-level titles, the low-level titles, and the URL belonging to each low-level title.

Requesting a low-level title's URL returns the book list, e.g. https://list.jd.com/listNew.php?cat=1713%2C3260%2C3345&page=150. The book information sits in that page, and so does the total page count, embedded as SEARCH.adv_param={page:"150",page_count:"200",psort:"0",cid1: 0, ...}, where page_count is the total number of pages and page is the current one. Prices need no separate request; they are included in the same page. That completes the page analysis; a short sketch of the request flow follows.
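To make the category-tree walk concrete, here is a minimal standalone sketch, separate from the spider below, using the requests library. The endpoint and the field names (data, title, ext_info.poolId, children, link_value) come from the analysis above; the helper name fetch_pool and the filtering on missing poolId/children are my own additions, and the exact entries JD returns may differ. The page_count extraction from the list pages is shown in the spider itself.

```
# Sketch only: walk the category tree described above with plain requests.
# fetch_pool is a hypothetical helper, not part of the spider.
import json
import requests

POOL_API = 'https://lapi.jd.com/pool?body={{%22pid%22:%22{}%22}}&source=book'


def fetch_pool(pool_id):
    """Fetch one pool and return its decoded JSON payload."""
    resp = requests.get(POOL_API.format(pool_id), timeout=10)
    return json.loads(resp.text)


# pool 0101947 holds the top-level categories
for entry in fetch_pool("0101947")["data"]:
    pool_id = (entry.get("ext_info") or {}).get("poolId")
    if not pool_id:
        continue  # skip entries that are not real categories
    print(entry["title"], "->", pool_id)
    # each top-level poolId resolves to another pool with mid/low-level categories
    for mid in fetch_pool(pool_id)["data"]:
        for child in mid.get("children") or []:
            print("  ", mid.get("title"), "/", child.get("title"), child.get("link_value"))
```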


Source code

# -*- coding: utf-8 -*-
import json
import re
from copy import deepcopy
from pprint import pprint
from urllib import parse

import scrapy


class JdtsSpider(scrapy.Spider):
    name = 'jdts'
    # allowed_domains = ['book.jd.com', 'lapi.jd.com', 'coll.jd.com', 'list.jd.com', 'search.jd.com']
    allowed_domains = ["jd.com"]
    start_urls = ['https://lapi.jd.com/pool?body={%22pid%22:%220101947%22}&source=book']

    def parse(self, response):
        """Parse the pool holding the top-level categories."""
        item = dict()
        json_obj = json.loads(response.body.decode())
        # drop the leading entry and the last four, which are not book categories
        data_list = json_obj["data"][1:-4]
        for data in data_list:
            # top-level category title and its poolId
            item["b_title"] = data["title"]
            item["poolId"] = data["ext_info"]["poolId"]
            # use the poolId to request the mid- and low-level categories
            temporary_var = '"pid":"{}"'.format(item["poolId"])
            next_url = 'https://lapi.jd.com/pool?body={' + temporary_var + '}&source=book'
            yield scrapy.Request(
                url=next_url,
                callback=self.parse_poolid,
                # copy the dict so later loop iterations do not overwrite it
                meta={"item": deepcopy(item)}
            )
            break  # only crawl the first top-level category while testing

    def parse_poolid(self, response):
        """Parse one pool: mid-level categories and their children."""
        item = response.meta["item"]
        json_obj = json.loads(response.body.decode())
        # the first two entries are not categories
        data_list = json_obj["data"][2:]
        # mid-level categories
        for data in data_list:
            item["m_title"] = data["title"]
            s_item_list = data["children"]
            # low-level categories
            for s_item in s_item_list:
                item["s_title"] = s_item["title"]
                item["s_href"] = s_item["link_value"]
                yield scrapy.Request(
                    url=item["s_href"],
                    callback=self.parse_s_href,
                    meta={"item": deepcopy(item)}
                )

    def parse_s_href(self, response):
        """Parse one book-list page and follow the pagination."""
        item = deepcopy(response.meta["item"])
        print("-" * 20)
        print("response url:", response.url)
        # total page count, taken from SEARCH.adv_param in the page source
        count_page = int(re.findall(r'page_count:\"(.*?)\"', response.body.decode(), re.S)[0])
        print("count_page:", count_page)
        # one HTML block per book on the list page
        content_list = re.findall(r'class="p-img">(.*?)<div class="p-commit">', response.body.decode(), re.S)
        for content in content_list:
            # book details scraped from the list page
            item["book_info"] = dict()
            item["book_info"]["book_title"] = re.findall(r'<em>(.*?)</em>', content, re.S)[1]
            item["book_info"]["book_href"] = re.findall(r'href="(.*?)"', content, re.S)[0]
            item["book_info"]["book_href"] = parse.urljoin(response.url, item["book_info"]["book_href"])
            item["book_info"]["book_price"] = re.findall(r'<i>(.*?)</i>', content, re.S)[0]
            yield scrapy.Request(
                url=item["book_info"]["book_href"],
                callback=self.parse_detail,
                meta={"item": deepcopy(item)}
            )
        # pages run from 1 to count_page; duplicates are dropped by Scrapy's dupefilter
        num = 2
        while num <= count_page:
            if "&page=" in response.url:
                next_page_href = response.url.split("&page", 1)[0] + "&page={}".format(num)
            else:
                next_page_href = response.url + "&page={}".format(num)
            yield scrapy.Request(
                url=next_page_href,
                callback=self.parse_s_href,
                meta={"item": response.meta["item"]}
            )
            num += 1

    def parse_detail(self, response):
        """Parse a book's detail page and build the description URL."""
        item = response.meta["item"]
        item["book_info"]["book_author"] = response.xpath("//div[@class='p-author']/a/text()").extract_first()
        # the description is served by one of two endpoints, e.g.
        #   https://dx.3.cn/desc/12234231                          (skuId only)
        #   https://cd.jd.com/description/channel?skuId=69612641897&mainSkuId=14541400416
        skuid = re.findall(r'com/(.*?)\.html', response.url, re.S)[0]
        mainskuid = re.findall(r"mainSkuId:'(.*?)'", response.body.decode(), re.S)
        if mainskuid:
            mainskuid = mainskuid[0]
            url = "https://cd.jd.com/description/channel?skuId={}&mainSkuId={}"
            next_url = url.format(skuid, mainskuid)
        else:
            url = "https://dx.3.cn/desc/{}"
            next_url = url.format(skuid)
        item["book_info"]["book_description"] = next_url
        print("*" * 20)
        print("description url:", next_url)
        print("*" * 20)
        pprint(item)
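Note that, as written, parse_detail only pprint()s the finished item. If you want Scrapy's item pipelines or feed exports to receive the data (for example via scrapy crawl jdts -o books.json), you would additionally need to yield the item at the end of parse_detail.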
