Advertisement

爬取京东商品信息

阅读量:

爬取京东商品信息

环境:

  • Python 3.6
  • Pycharm
  • MYSQL

京东网页分析

主要抓取以下商品参数:

name、price、shop_name、location

爬取京东商品信息首先得有商品信息入口,以商品女装(关键字)为例,

复制代码
    url = 'https://search.jd.com/Search?keyword=%s&enc=utf-8&page=%s'
    # keyword 为搜索关键字
    # page 为页码

简单分析京东商品信息页面,发现商品讯息:

在这里插入图片描述

我们可以通过xpath找到我们想要的信息:

在这里插入图片描述

找到我们想要的信息后我们通过PyMySQL将商品信息存到数据库中:

复制代码
    def get_db(self):
        """Create and return a fresh PyMySQL connection to the result database."""
        conn_params = {
            'host': '你的ip',
            'port': 3306,
            'user': '用户名',
            'password': '密码',
            'db': '库名',
            'charset': 'utf8',
        }
        return pymysql.connect(**conn_params)
    
    def result_save(self, data):
        """Persist one page of scraped items into MySQL.

        data: dict with key 'name' (str, the search keyword) plus three
        parallel lists 'price', 'shop_name' and 'location', one entry per
        scraped item.

        Commits once for the whole page on success; rolls back on any error.
        """
        # 连接数据库
        db = self.get_db()
        # 创建游标
        cursor = db.cursor()

        # NOTE(review): the table is named 'taobao' although this scraper
        # targets JD (the full listing uses 'table_name') — confirm against
        # the actual schema before running.
        sql = 'insert into taobao (name, price, shop_name, location) values(%s, %s, %s, %s)'
        saved = 0
        try:
            # zip() stops at the shortest list, so a partially scraped page
            # (ragged lists) can no longer raise IndexError mid-insert.
            for price, shop, loc in zip(data['price'], data['shop_name'],
                                        data['location']):
                cursor.execute(sql, (data['name'], price, shop, loc))
                saved += 1
            # Commit once per page rather than once per row, so a failure
            # leaves the table all-or-nothing instead of half-written.
            db.commit()
            print('爬取储存成功,共%s条。' % saved)
        except Exception as e:
            print(e)
            print('爬取失败')
            db.rollback()
        finally:
            # Always release resources, even if an unexpected error escaped.
            # 关闭游标
            cursor.close()
            # 关闭连接
            db.close()

最终保存的数据:

在这里插入图片描述

好了到这里就结束了,来看看我们全部代码:

复制代码
    from lxml import etree
    import pymysql
    import requests
    from selenium.common.exceptions import TimeoutException
    
    
    class Jd:
        """Scrape JD.com search results for a keyword and store them in MySQL.

        Note: the original listing had the method bodies un-indented under the
        class (an IndentationError); the structure below is the intended one.
        """

        def get_db(self):
            """Open and return a new PyMySQL connection to the result database."""
            db = pymysql.connect(
                host='你的ip',
                port=3306,
                user='用户名',
                password='密码',
                db='库名',
                charset='utf8',
            )
            return db

        def result_save(self, data):
            """Insert one page of scraped rows; commit on success, roll back on error.

            data: dict with 'name' (str) plus parallel lists 'price',
            'shop_name' and 'location'.
            """
            db = self.get_db()
            cursor = db.cursor()

            sql = 'insert into table_name(name, price, shop_name, location) values(%s, %s, %s, %s)'
            saved = 0
            try:
                # zip() stops at the shortest list, so ragged scrape results
                # cannot raise IndexError mid-insert.
                for price, shop, loc in zip(data['price'], data['shop_name'],
                                            data['location']):
                    cursor.execute(sql, (data['name'], price, shop, loc))
                    saved += 1
                db.commit()  # one commit per page: all-or-nothing
                print('爬取储存成功,共%s条。' % saved)
            except Exception as e:
                print(e)
                print('爬取失败')
                db.rollback()
            finally:
                cursor.close()
                db.close()

        def page_get(self, page, keyword, retries=3):
            """Fetch one search-result page, extract item fields, and save them.

            page: 1-based page number; keyword: search term;
            retries: how many times to retry on a request timeout (the
            original recursed without bound; this caps it).
            """
            print('正在爬取第', page, '页')
            try:
                url = 'https://search.jd.com/Search?keyword=%s&enc=utf-8&page=%s' % (keyword, page)
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3',
                    'Referer': 'https://www.jd.com/',
                    'DNT': '1',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'TE': 'Trailers',
                }
                # timeout= is required for a Timeout to ever be raised; the
                # original caught selenium's TimeoutException, which
                # requests.get never raises, so the retry path was dead code.
                response = requests.get(url, headers=headers, timeout=10)
                html = etree.HTML(response.text)
                data = {
                    'name': keyword,
                    # NOTE(review): these xpaths read the <a> title and the
                    # <img> src — the labels 'shop_name'/'location' look
                    # mismatched with what is actually extracted; verify
                    # against the live page markup.
                    'shop_name': html.xpath("//div[@class='p-img']/a/@title"),
                    'price': html.xpath("//div/ul/li/div/div/strong/i/text()"),
                    'location': html.xpath("//div[@class='p-img']/a/img/@src"),
                }
                self.result_save(data)
            except requests.exceptions.Timeout:
                if retries > 0:
                    self.page_get(page, keyword, retries - 1)
    
    
    # Script entry point: prompt for page count and keyword, then scrape
    # pages 1..pages.  (The loop body was un-indented in the original
    # listing, which is an IndentationError.)
    jd = Jd()
    pages = int(input('请输入要爬取的页数:'))
    keyword = input('请输入要搜索的关键字:')
    for i in range(1, pages + 1):
        jd.page_get(i, keyword)

全部评论 (0)

还没有任何评论哟~