爬取京东商品信息
发布时间
阅读量:
阅读量
爬取京东商品信息
环境:
- Python 3.6
- Pycharm
- MYSQL
京东网页分析
主要抓取以下商品参数:
name(搜索关键字)、price(价格)、shop_name(店铺名)、location
爬取京东商品信息首先得有商品信息入口,以商品女装(关键字)为例,
url = 'https://search.jd.com/Search?keyword=%s&enc=utf-8&page=%s'
# keyword 为搜索关键字
# page 为页码
简单分析京东商品信息页面,发现商品讯息:

我们可以通过xpath找到我们想要的信息:

找到我们想要的信息后我们通过PyMysql将商品信息存到数据库中:
def get_db(self):
    """Open and return a fresh pymysql connection to the result database.

    Connection parameters are placeholders ('你的ip' etc.) that the
    reader is expected to fill in with their own credentials.
    """
    connection = pymysql.connect(
        host='你的ip',
        port=3306,
        user='用户名',
        password='密码',
        db='库名',
        charset='utf8',
    )
    return connection
def result_save(self, data):
    """Persist one page of scraped products to the database.

    data: dict with 'name' (str — the shared search keyword) and three
    parallel lists 'price', 'shop_name', 'location', one entry per item.
    Commits on success, rolls back on any error; always closes the
    cursor and connection.
    """
    # Open the connection
    db = self.get_db()
    # Create a cursor
    cursor = db.cursor()
    # Parameterized statement — scraped text never reaches SQL directly.
    # Table name matches the final version of the code ('table_name'),
    # fixing the stray 'taobao' left over from an earlier draft.
    sql = 'insert into table_name(name, price, shop_name, location) values(%s, %s, %s, %s)'
    try:
        for i in range(len(data['price'])):
            # 'name' is the keyword (same for every row); the rest are per-item.
            cursor.execute(sql, (
                data['name'], data['price'][i], data['shop_name'][i],
                data['location'][i]))
        db.commit()
        print('爬取储存成功,共%s条。' % len(data['price']))
    except Exception as e:
        print(e)
        print('爬取失败')
        db.rollback()
    finally:
        # BUG FIX: cleanup formerly ran outside the try block and was
        # skipped whenever commit/rollback itself raised; finally
        # guarantees the cursor and connection are released.
        cursor.close()
        db.close()
最终保存的数据:

好了到这里就结束了,来看看我们全部代码:
from lxml import etree
import pymysql
import requests
from selenium.common.exceptions import TimeoutException
class Jd:
    """Scrape JD search-result pages for a keyword and store rows via pymysql."""

    def get_db(self):
        """Open and return a fresh pymysql connection (credentials are placeholders)."""
        db = pymysql.connect(
            host='你的ip',
            port=3306,
            user='用户名',
            password='密码',
            db='库名',
            charset='utf8',
        )
        return db

    def result_save(self, data):
        """Insert one page of rows; commit on success, roll back on error.

        data: dict with 'name' (str keyword) plus parallel lists
        'price', 'shop_name', 'location'.
        """
        db = self.get_db()
        cursor = db.cursor()
        sql = 'insert into table_name(name, price, shop_name, location) values(%s, %s, %s, %s)'
        try:
            for i in range(len(data['price'])):
                cursor.execute(sql, (
                    data['name'], data['price'][i], data['shop_name'][i],
                    data['location'][i]))
            db.commit()
            print('爬取储存成功,共%s条。' % len(data['price']))
        except Exception as e:
            print(e)
            print('爬取失败')
            db.rollback()
        finally:
            # BUG FIX: close even when commit/rollback raises.
            cursor.close()
            db.close()

    def page_get(self, page, keyword):
        """Fetch search-result page `page` for `keyword`, parse it, save rows.

        Retries the same page once per timeout (recursively).
        """
        print('正在爬取第', page, '页')
        try:
            url = 'https://search.jd.com/Search?keyword=%s&enc=utf-8&page=%s' % (keyword, page)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3',
                'Referer': 'https://www.jd.com/',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'TE': 'Trailers',
            }
            # BUG FIX: without timeout= the request can block forever and no
            # timeout exception is ever raised, making the retry branch dead.
            browser = requests.get(url, headers=headers, timeout=10)
            html_str = browser.text
            html = etree.HTML(html_str)
            data = {}
            name = keyword
            shop_name = html.xpath("//div[@class='p-img']/a/@title")
            price = html.xpath("//div/ul/li/div/div/strong/i/text()")
            # NOTE(review): this xpath selects the product image src, not a
            # seller location — presumably mislabeled; verify against the
            # database schema before relying on the 'location' column.
            location = html.xpath("//div[@class='p-img']/a/img/@src")
            data['name'] = name
            data['price'] = price
            data['shop_name'] = shop_name
            data['location'] = location
            self.result_save(data)
        except requests.exceptions.Timeout:
            # BUG FIX: the original caught selenium's TimeoutException,
            # which requests.get never raises; catch the requests timeout
            # so the retry actually fires.
            self.page_get(page, keyword)
if __name__ == '__main__':
    # Guard the interactive driver so importing this module does not
    # prompt for input or start scraping.
    jd = Jd()
    pages = int(input('请输入要爬取的页数:'))
    keyword = input('请输入要搜索的关键字:')
    for i in range(1, pages + 1):
        jd.page_get(i, keyword)
全部评论 (0)
还没有任何评论哟~
