
Web crawler: scraping Dangdang product listings with Scrapy


I. Overall approach

1. Create the Scrapy project
2. Analyze the URL pattern of Dangdang's specialty-goods category pages
3. Work out the XPath expressions for the fields to extract
4. Write the item definition
5. Write the spider
6. Write the pipeline that stores the scraped data in MySQL

II. Implementation

1. Create the Scrapy project

    scrapy startproject dangdang

2. Analyze the URL pattern of the specialty-goods pages

Page 1: http://category.dangdang.com/pg1-cid4003844.html

Page 2: http://category.dangdang.com/pg2-cid4003844.html

Page 3: http://category.dangdang.com/pg3-cid4003844.html

Comparing the three, only the number after "pg" changes, and it is simply the page number.
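As a quick sanity check on that pattern, the whole list of page URLs can be generated in a couple of lines (a minimal sketch; the cap of 100 pages matches the loop used in the spider below):

# only the number after "pg" changes from page to page
base = 'http://category.dangdang.com/pg{}-cid4003844.html'
urls = [base.format(page) for page in range(1, 101)]
print(urls[0])  # the first page, used later as start_urls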

3. Work out the XPath expressions for the fields

item["comment"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
item["link"] = response.xpath("//a[@name='itemlist-title']/@href").extract()
item["title"] = response.xpath("//a[@name='itemlist-title']/text()").extract()
item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
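These expressions are easiest to verify interactively in scrapy shell before they go into the spider; each extract() call returns a list, and the four lists are expected to line up index by index, which is what the pipeline relies on later:

# In a terminal:  scrapy shell "http://category.dangdang.com/pg1-cid4003844.html"
# then, at the prompt:
response.xpath("//a[@name='itemlist-title']/text()").extract()[:3]   # first few titles
response.xpath("//span[@class='price_n']/text()").extract()[:3]      # first few prices
len(response.xpath("//a[@name='itemlist-title']/@href").extract())   # should equal len(titles)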

4. Project code

Project structure
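For reference, the generated layout looks roughly like this (exact files vary slightly between Scrapy versions; dd.py is the spider added under spiders/, by hand or with scrapy genspider):

dangdang/
    scrapy.cfg
    dangdang/
        __init__.py
        items.py          # the item definition below
        middlewares.py
        pipelines.py      # DangdangPipeline and MysqlPipeline
        settings.py       # MySQL settings and ITEM_PIPELINES
        spiders/
            __init__.py
            dd.py         # the spider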

(1) items.py

Define four fields to hold the title, link, price, and comment count:

import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    comment = scrapy.Field()
    price = scrapy.Field()
(2) dd.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from dangdang.items import DangdangItem


class DdSpider(scrapy.Spider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    # starting URL
    start_urls = ['http://category.dangdang.com/pg1-cid4003844.html']

    def parse(self, response):
        item = DangdangItem()
        item["comment"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
        item["link"] = response.xpath("//a[@name='itemlist-title']/@href").extract()
        item["title"] = response.xpath("//a[@name='itemlist-title']/text()").extract()
        item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
        # yield item hands the scraped data straight to the pipelines
        yield item

        # follow the remaining pages, re-using parse() as the callback
        for i in range(2, 101):
            url = 'http://category.dangdang.com/pg' + str(i) + '-cid4003844.html'
            yield Request(url, callback=self.parse)
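Note that parse() above yields a single item per page whose fields are four parallel lists; the pipeline later walks those lists by index. A possible alternative, shown only as a sketch (it is not what the code above does, and the pipeline would then insert the scalar fields directly), is to zip the lists in the spider and yield one item per product:

def parse(self, response):
    # hypothetical replacement for DdSpider.parse: one item per product
    titles = response.xpath("//a[@name='itemlist-title']/text()").extract()
    links = response.xpath("//a[@name='itemlist-title']/@href").extract()
    comments = response.xpath("//a[@name='itemlist-review']/text()").extract()
    prices = response.xpath("//span[@class='price_n']/text()").extract()
    for title, link, comment, price in zip(titles, links, comments, prices):
        item = DangdangItem()
        item["title"], item["link"] = title, link
        item["comment"], item["price"] = comment, price
        yield item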

(3) pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class DangdangPipeline(object):
    def process_item(self, item, spider):
        # each field of the item is a list; walk them in parallel by index
        for i in range(0, len(item["title"])):
            title = item["title"][i]
            link = item["link"][i]
            comment = item["comment"][i]
            price = item["price"][i]
            # print(title + "  " + price + " : " + link + " " + comment)
        return item


class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings defined in settings.py
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # keyword arguments keep the connect() call working across pymysql versions
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        for i in range(0, len(item["title"])):
            title = item["title"][i]
            link = item["link"][i]
            comment = item["comment"][i]
            price = item["price"][i]
            sql = ("insert into goods(title,price,link,comment) values ('" +
                   title + "','" + price + "','" + link + "','" + comment + "')")
            try:
                self.cursor.execute(sql)
                self.db.commit()
            except Exception as error:
                # a duplicate link (UNIQUE) or a stray quote in a field ends up here
                print(error)
        return item
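Building the INSERT statement by string concatenation breaks as soon as a title or comment contains a quote character, and it is open to SQL injection. A safer drop-in for MysqlPipeline.process_item (a sketch over the same goods schema) passes the values as parameters and lets pymysql do the escaping:

def process_item(self, item, spider):
    sql = "insert into goods(title, price, link, comment) values (%s, %s, %s, %s)"
    # pair the parallel lists and let pymysql quote each value
    for row in zip(item["title"], item["price"], item["link"], item["comment"]):
        try:
            self.cursor.execute(sql, row)
            self.db.commit()
        except Exception as error:
            print(error)
    return item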

(4) settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for dangdang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dangdang'

SPIDER_MODULES = ['dangdang.spiders']
NEWSPIDER_MODULE = 'dangdang.spiders'

# MySQL configuration
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'dd'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'fankai'
MYSQL_PORT = 3306

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dangdang (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dangdang.middlewares.DangdangSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'dangdang.middlewares.DangdangDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Lower numbers sit closer to the engine and run earlier
ITEM_PIPELINES = {
   'dangdang.pipelines.DangdangPipeline': 300,
   'dangdang.pipelines.MysqlPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Store the scraped data in MySQL

Create the database and the goods table in MySQL:

    CREATE DATABASE dd;
    USE dd;
    CREATE TABLE goods (
        id INT(32) AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(100),
        price VARCHAR(100),
        link VARCHAR(100) UNIQUE,
        `comment` VARCHAR(100)
    );
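After a crawl finishes, a quick way to confirm that rows actually reached the table is a count query (a minimal sketch; the connection values mirror the ones in settings.py above):

import pymysql

db = pymysql.connect(host='localhost', user='root', password='fankai',
                     database='dd', charset='utf8', port=3306)
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM goods")
print(cursor.fetchone()[0], "rows in goods")
db.close()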

Execution result: after the crawl, the titles, prices, links, and comment counts appear as rows in the goods table.

