
Scraping 10,000+ JD.com Product Comments with Python


This post shows how to scrape product comments from JD.com with Python. A CommentSpider class implements the data preview, page fetching, and persistence. The code uses the requests, re, and json libraries for the HTTP requests and response parsing, stores each comment in a database, and supports fetching and processing multiple pages.
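Before the full listing, here is a minimal sketch of the core trick, assuming the endpoint still returns its payload wrapped in a fetchJSON_comment98(...) JSONP callback as it did when this was written. Headers are pared down here; the full code below shows the cookies JD actually expects.

    import json
    import re
    import requests

    productId = "10022981047603"
    url = ("https://club.jd.com/comment/productPageComments.action"
           "?callback=fetchJSON_comment98&productId={0}"
           "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1").format(productId)
    text = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
    # strip the JSONP wrapper, then parse the remaining JSON payload
    payload = re.search(r"fetchJSON_comment98\((.*)\);?\s*$", text, re.S).group(1)
    data = json.loads(payload)
    print(data["maxPage"], len(data["comments"]))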


    • Preview the data first
    • The code

Preview the data first

[Figure: a sample of the scraped comment data]

The code

    import time
    
    import requests
    import re
    import json
    from jd_comment import db_util  # the author's local DB helper module (a sketch of its interface follows the listing)
    
    
    class CommentSpider:
        """
        Scrape product comments for a JD.com item.
        """

        # JSONP endpoint for paginated comments ({0} = productId, {1} = page index)
        commentUrl = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&" \
                     "productId={0}&score=0&sortType=5&page={1}&pageSize=10&isShadowSku=0&rid=0&fold=1"

        # Referer template: the product detail page
        origin_reffer = "https://item.jd.com/{0}.html"

        # delay between page requests, in seconds
        sleep_seconds = 2

        def __init__(self, productId):
            self.productId = productId

        def build_headers(self, next_page_cookie=None):
            """Build request headers; follow-up pages carry the JSESSIONID set by the previous response."""
            Referer = self.origin_reffer.format(self.productId)

            if next_page_cookie is None:
                headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
                    "Referer": Referer,
                    "Host": "club.jd.com",
                    # NOTE: these cookies are session-specific; replace them with fresh values from your browser
                    "Cookie": "unpl=V2_ZzNtbRFfQBJzWxQEfB9UAWJQQF9KBBQVdVhOXHpOXwJkUUBfclRCFnUUR1RnGFQUZAEZXkJcRhZFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsYXgdgBhRVSlBzJXI4dmR9HlsCYQEiXHJWc1chVE9UeR1fBioDE19AUEYTfQBBZHopXw%3d%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_b9267bca67954bc39bf11a990f262cc3|1610205703260; __jdu=1033647996; areaId=2; PCSYCityID=CN_310000_310100_0; shshshfpb=wVjD8v2Dr7inEEgCOGiQ9kQ%3D%3D; shshshfpa=6792afdc-4156-d1ed-8c5a-86d979144193-1591804178; __jda=122270672.1033647996.1610205703.1610205703.1610205703.1; __jdc=122270672; shshshfp=4f2edd84f8946f1594a34d185b2d4b3b; 3AB9D23F7A4B3C9B=JVSHRSEP2KT6XOTDLFPMA3CYGKN3L5PI427XN6PJDRZ5PBUY6CV3KWZ6Q6YHQJLZI3BKFST2DHV55MHPYODYFB6MTA; ipLoc-djd=2-2830-51803-0; jwotest_product=99; shshshsID=690b740513ddb1e914cdc6870e46c538_4_1610206811306; __jdb=122270672.4.1033647996|1.1610205703; JSESSIONID=AE587338A97897165F8BCB899525EBF4.s1"
                }
            else:
                headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
                    "Referer": Referer,
                    "Host": "club.jd.com",
                    # same cookies, but with the JSESSIONID from the previous response interpolated at the end
                    "Cookie": "unpl=V2_ZzNtbRFfQBJzWxQEfB9UAWJQQF9KBBQVdVhOXHpOXwJkUUBfclRCFnUUR1RnGFQUZAEZXkJcRhZFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsYXgdgBhRVSlBzJXI4dmR9HlsCYQEiXHJWc1chVE9UeR1fBioDE19AUEYTfQBBZHopXw%3d%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_b9267bca67954bc39bf11a990f262cc3|1610205703260; __jdu=1033647996; areaId=2; PCSYCityID=CN_310000_310100_0; shshshfpb=wVjD8v2Dr7inEEgCOGiQ9kQ%3D%3D; shshshfpa=6792afdc-4156-d1ed-8c5a-86d979144193-1591804178;"
                              " __jda=122270672.1033647996.1610205703.1610205703.1610205703.1;"
                              " __jdc=122270672; shshshfp=4f2edd84f8946f1594a34d185b2d4b3b;"
                              " 3AB9D23F7A4B3C9B=JVSHRSEP2KT6XOTDLFPMA3CYGKN3L5PI427XN6PJDRZ5PBUY6CV3KWZ6Q6YHQJLZI3BKFST2DHV55MHPYODYFB6MTA;"
                              " ipLoc-djd=2-2830-51803-0; jwotest_product=99; shshshsID=690b740513ddb1e914cdc6870e46c538_4_1610206811306;"
                              " __jdb=122270672.4.1033647996|1.1610205703; "
                              "JSESSIONID={0}".format(next_page_cookie)
                }
            return headers

        def get_one_page_comment(self, page=0, next_page_cookie=None):
            """Fetch one page of comments; return (parsed JSON, JSESSIONID for the next request)."""
            url = self.commentUrl.format(self.productId, page)
            res = requests.get(url, headers=self.build_headers(next_page_cookie))
            coms_json = self.parse_text_res(res.text)
            # keep the previous JSESSIONID if this response did not set a new one
            cookie_dict = requests.utils.dict_from_cookiejar(res.cookies)
            next_page_cookie = cookie_dict.get("JSESSIONID", next_page_cookie)
            return coms_json, next_page_cookie

        def parse_text_res(self, text):
            """Strip the fetchJSON_comment98(...) JSONP wrapper and parse the payload as JSON."""
            match_com = re.findall("fetchJSON_comment98(.*)", text)[0][1:].replace(");", "")
            print(match_com)  # debug: raw JSON payload
            coms_json = json.loads(match_com)
            return coms_json

        def get_all_comments(self):
            # the first page also reports the total number of pages
            coms_json, next_page_cookie = self.get_one_page_comment()
            print(next_page_cookie)
            maxPage = coms_json['maxPage']
            print("Max pages: " + str(maxPage))
            comments = coms_json['comments']

            print("Page 1")
            # first page
            for comment in comments:
                id = comment['id']
                prd_id = self.productId
                content = comment['content'].replace("\n", " ")
                creationTime = comment['creationTime']
                print("Id:" + str(id))
                base_com = db_util.find_by_order_id(id)

                # insert only if this comment id has not been saved yet
                if len(base_com) == 0:
                    db_util.insert_order_detail_other([
                        id, prd_id, creationTime, content
                    ])
                else:
                    print("already exists")
                print()

            # remaining pages (the API numbers pages from 0, so this covers pages 2..maxPage)
            for page in range(1, int(maxPage)):
                print("Page {0}".format(page + 1))
                res, next_page_cookie = self.get_one_page_comment(page=page, next_page_cookie=next_page_cookie)
                time.sleep(self.sleep_seconds)
                comments = res['comments']
                for comment in comments:
                    id = comment['id']
                    prd_id = self.productId
                    print("Id:" + str(id))
                    content = comment['content'].replace("\n", " ")
                    creationTime = comment['creationTime']
                    print()
                    base_com = db_util.find_by_order_id(id)
                    if len(base_com) == 0:
                        db_util.insert_order_detail_other([
                            id, prd_id, creationTime, content
                        ])
                    else:
                        print("already exists")


    if __name__ == "__main__":
        productId = "10022981047603"
        commentSpider = CommentSpider(productId)
        commentSpider.get_all_comments()
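The db_util module imported at the top is the author's local helper and isn't shown in the post. As a rough idea of the interface the spider expects, here is a hypothetical SQLite-backed stand-in: the function names match the calls in the listing, but the table layout is an assumption inferred from the argument order [id, prd_id, creationTime, content].

    # hypothetical jd_comment/db_util.py -- NOT the author's actual module
    import sqlite3

    _conn = sqlite3.connect("jd_comments.db")
    _conn.execute(
        "CREATE TABLE IF NOT EXISTS comment ("
        "id TEXT PRIMARY KEY, prd_id TEXT, creation_time TEXT, content TEXT)"
    )

    def find_by_order_id(comment_id):
        # rows matching this comment id (an empty list means it was not saved yet)
        cur = _conn.execute("SELECT * FROM comment WHERE id = ?", (str(comment_id),))
        return cur.fetchall()

    def insert_order_detail_other(row):
        # row is [id, prd_id, creationTime, content], as passed by the spider
        _conn.execute(
            "INSERT INTO comment VALUES (?, ?, ?, ?)",
            (str(row[0]), row[1], row[2], row[3]),
        )
        _conn.commit()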
