Crawler: Scraping Dangdang Clothing Listings (with Scrapy)
I. Overall Approach
1. Create the Scrapy project
2. Analyze the Dangdang specialty-goods category URLs
3. Work out the XPath expressions for the fields to extract
4. Write the item
5. Write the spider
6. Write the pipeline that stores the scraped data in MySQL
II. Implementation
1. Create the Scrapy project
scrapy startproject dangdang
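This creates the project skeleton; the spider file itself can then be generated with the standard genspider command (the name dd and the domain below match the spider code shown later in this post):

scrapy genspider dd dangdang.com

The resulting layout is the usual Scrapy one: scrapy.cfg at the top level, and items.py, pipelines.py, settings.py and a spiders/ package (containing dd.py) inside the inner dangdang/ directory.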
2. Analyze the Dangdang specialty-goods category URLs
Page 1: http://category.dangdang.com/pg1-cid4003844.html
Page 2: http://category.dangdang.com/pg2-cid4003844.html
Page 3: http://category.dangdang.com/pg3-cid4003844.html
Comparing the three, only the number after pg changes with the page.
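So the URL for any page can be built from the page number alone; a minimal sketch:

# build the URLs for pages 1-100 by substituting the page number
urls = ['http://category.dangdang.com/pg{}-cid4003844.html'.format(i) for i in range(1, 101)]
print(urls[0])  # http://category.dangdang.com/pg1-cid4003844.html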
3. Work out the XPath expressions for the fields to extract
item["comment"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
item["link"] = response.xpath("//a[@name='itemlist-title']/@href").extract()
item["title"] = response.xpath("//a[@name='itemlist-title']/text()").extract()
item['price'] = response.xpath("//span[@class='price_n']/text()").extract()
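These expressions can be checked interactively with Scrapy's shell before writing the spider (standard Scrapy tooling; the exact output depends on the live page):

scrapy shell "http://category.dangdang.com/pg1-cid4003844.html"
>>> response.xpath("//a[@name='itemlist-title']/text()").extract()[:3]   # first few titles
>>> response.xpath("//span[@class='price_n']/text()").extract()[:3]      # first few prices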
4. Project code
Project structure

(1) items.py
Define four fields to store title, link, comment, and price.
import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    comment = scrapy.Field()
    price = scrapy.Field()
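A scrapy.Item behaves like a dict restricted to the declared fields, so it can be filled and inspected with ordinary mapping syntax (a quick sketch with made-up values):

from dangdang.items import DangdangItem

item = DangdangItem()
item["title"] = "example product"
item["price"] = "¥99.00"
print(dict(item))  # {'title': 'example product', 'price': '¥99.00'}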
(2) dd.py (the spider)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from dangdang.items import DangdangItem


class DdSpider(scrapy.Spider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    # starting URL: page 1 of the category
    start_urls = ['http://category.dangdang.com/pg1-cid4003844.html']

    def parse(self, response):
        item = DangdangItem()
        item["comment"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
        item["link"] = response.xpath("//a[@name='itemlist-title']/@href").extract()
        item["title"] = response.xpath("//a[@name='itemlist-title']/text()").extract()
        item['price'] = response.xpath("//span[@class='price_n']/text()").extract()
        # yield the item straight to the pipelines
        yield item
        # reuse parse as the callback to crawl the remaining pages
        for i in range(2, 101):
            url = 'http://category.dangdang.com/pg' + str(i) + '-cid4003844.html'
            yield Request(url, callback=self.parse)
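Note that parse extracts four page-wide parallel lists, so each yielded item carries every product on the page and the pipelines re-align the lists by index. A per-product variant (a sketch, not the code used in this post; the li container selector is an assumption about Dangdang's listing markup) would keep each record together:

def parse(self, response):
    # iterate over each product node so one product's fields stay together
    for product in response.xpath("//ul[contains(@class, 'bigimg')]/li"):
        item = DangdangItem()
        item["title"] = product.xpath(".//a[@name='itemlist-title']/text()").extract_first()
        item["link"] = product.xpath(".//a[@name='itemlist-title']/@href").extract_first()
        item["price"] = product.xpath(".//span[@class='price_n']/text()").extract_first()
        item["comment"] = product.xpath(".//a[@name='itemlist-review']/text()").extract_first()
        yield item

With this variant the pipelines would receive one product per item (plain strings instead of lists), so their loops would need to be adjusted accordingly.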
(3) pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class DangdangPipeline(object):
    # first pipeline: only used for inspecting/printing the scraped fields
    def process_item(self, item, spider):
        for i in range(0, len(item["title"])):
            title = item["title"][i]
            link = item["link"][i]
            comment = item["comment"][i]
            price = item["price"][i]
            # print(title + " " + price + ":" + link + " " + comment)
        return item


class MysqlPipeline():
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings defined in settings.py
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # keyword arguments keep this working on newer pymysql releases as well
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # the item carries parallel lists, so re-align them by index
        for i in range(0, len(item["title"])):
            title = item["title"][i]
            link = item["link"][i]
            comment = item["comment"][i]
            price = item["price"][i]
            sql = "insert into goods(title,price,link,comment) values ('" + title + "','" + price + "','" + link + "','" + comment + "')"
            try:
                self.cursor.execute(sql)
                self.db.commit()
            except Exception as error:
                print(error)
        return item
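One weakness of the MysqlPipeline above is that the INSERT statement is built by string concatenation, which breaks as soon as a title contains a single quote and is open to SQL injection. A safer sketch of the loop body, using pymysql's parameter substitution through cursor.execute:

sql = "insert into goods(title, price, link, comment) values (%s, %s, %s, %s)"
try:
    self.cursor.execute(sql, (title, price, link, comment))
    self.db.commit()
except Exception as error:
    self.db.rollback()
    print(error)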
(4) settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for dangdang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'dangdang'
SPIDER_MODULES = ['dangdang.spiders']
NEWSPIDER_MODULE = 'dangdang.spiders'
"mysql的配置"
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'dd'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'fankai'
MYSQL_PORT = 3306
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dangdang (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'dangdang.middlewares.DangdangSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'dangdang.middlewares.DangdangDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# The lower the number, the closer the pipeline is to the engine and the earlier it runs
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
    'dangdang.pipelines.MysqlPipeline': 301,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Storing the data in MySQL
Create the database and the goods table first:
CREATE DATABASE dd;
USE dd;
CREATE TABLE goods(
    id INT(32) AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(100),
    price VARCHAR(100),
    link VARCHAR(100) UNIQUE,
    comment VARCHAR(100)
);
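With the table in place, the spider is started from the project root with the standard Scrapy command (dd is the name attribute of DdSpider):

scrapy crawl dd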
Execution result:

Key point: pagination is handled by importing Request and yielding new requests that reuse parse as the callback:

from scrapy.http import Request

# use a callback to handle pagination
for i in range(2, 101):
    url = 'http://category.dangdang.com/pg' + str(i) + '-cid4003844.html'
    yield Request(url, callback=self.parse)