Advertisement

爬取生物医学信息网站植物基因序列信息(基于scrapy框架)

阅读量:

scrapy框架

scrapy框架是Python的主流爬虫框架之一,其主要功能包括爬取网络数据、提取结构化数据等。网上对于scrapy框架组件和工作流程的介绍数不胜数,这里就不班门弄斧了。此外,scrapy框架还提供了许多强大的命令行工具,如scrapy shell、scrapy parse、scrapy fetch、scrapy view等,这些工具对于编写和测试爬虫大有裨益。

爬取网站介绍与分析

核心代码

复制代码
    # -*- coding: utf-8 -*-
    
    import scrapy
    
    
    class GovspiderItem(scrapy.Item):
        """Item carrying one scraped sequence record."""

        # Raw text of the sequence response, as stored by the spider.
        ribonucleic_acid_sequence = scrapy.Field()
复制代码
    # -*- coding: utf-8 -*-
    import scrapy
    from govspider.items import GovspiderItem
    from govspider.settings import TERM,TOTAL_PAGE
    import re
    # 并发量32
    # 下载延迟1s
    
    
    class GovSpider(scrapy.Spider):
        """Crawl NCBI Nucleotide search results for ``TERM`` and emit sequences.

        Flow of callbacks:

        1. ``parse`` handles one page of search results, follows the second
           "dblinks" shortcut of every entry and then posts the paging form
           to obtain the next results page (up to ``TOTAL_PAGE`` pages).
        2. ``xpath_parse`` extracts the sequence id from the entry page and
           requests the sequence viewer with ``report=fasta``.
        3. ``another_parse`` wraps the viewer response in a ``GovspiderItem``.
        """

        name = 'gov'
        allowed_domains = ['ncbi.nlm.nih.gov']
        start_urls = ['https://www.ncbi.nlm.nih.gov/nuccore/?term={}'.format(TERM)]
        total_num = None  # ResultCount value read from the first results page
        key = None  # QueryKey value read from the first results page
        i = 0  # number of results pages parsed so far
        j = 1  # number of the results page being parsed / requested next

        def parse(self, response):
            """Parse one page of search results.

            Yields a ``scrapy.Request`` for every entry link found on the
            page and, while more pages are wanted, one ``scrapy.FormRequest``
            for the next results page.
            """
            if self.j > TOTAL_PAGE:
                return

            self.i += 1
            if self.i == 1:
                # The paging POST needs these two values; they are read once
                # from the first results page. ``.get()`` returns None rather
                # than raising when the input is missing.
                self.total_num = response.xpath('//div[@class="title_and_pager"]/div/input[@name="EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_ResultsController.ResultCount"]/@value').get()
                self.key = response.xpath('//div/input[@name="EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.QueryKey"]/@value').get()

            # Build one request per entry link on this page.
            links = response.xpath('//div[@class="shortcuts"]/a[@class="dblinks"][2]/@href').getall()
            for href in links:
                yield scrapy.Request(response.urljoin(href), callback=self.xpath_parse)

            self.j += 1
            if self.j > TOTAL_PAGE:
                # The last wanted page has just been handled; requesting one
                # more page would only be downloaded and thrown away.
                return

            if self.total_num is None or self.key is None:
                # Without these values the form data would contain None,
                # which cannot be encoded into a POST body.
                self.logger.warning(
                    'ResultCount/QueryKey not found on %s; pagination stopped',
                    response.url,
                )
                return

            self.logger.debug(
                'total_num=%s key=%s pages_parsed=%s',
                self.total_num, self.key, self.i,
            )
            # Turn the page: post the paging form and parse the response with
            # this same callback.
            yield scrapy.FormRequest('https://www.ncbi.nlm.nih.gov/nuccore/', formdata=self.url_parse(), callback=self.parse)

        def xpath_parse(self, response):
            """Extract the sequence id from an entry page and request its FASTA view."""
            match = re.search(r'val="(.*?)" SequenceSize=', response.text)
            if match is None:
                self.logger.warning('No sequence id found on %s', response.url)
                return
            params_id = match.group(1)
            self.logger.debug('sequence id: %s', params_id)
            new_url = 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id={}&db=nuccore&report=fasta&extrafeat=null&conwithfeat=on&hide-cdd=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000'.format(params_id)
            yield scrapy.Request(new_url, callback=self.another_parse)

        def another_parse(self, response):
            """Yield the viewer response text as an item for the pipeline."""
            text = response.text
            # Responses of 500 characters or fewer are dropped.
            # NOTE(review): presumably this filters empty/error pages - confirm.
            if len(text) > 500:
                item = GovspiderItem()
                item['ribonucleic_acid_sequence'] = text
                yield item

        def url_parse(self):
            """Return the form data for the POST that requests page ``self.j``.

            ``self.key`` and ``self.total_num`` must have been set by
            ``parse`` before this is called.
            """
            data = {
                'term': TERM,  # search term
                'EntrezSystem2.PEntrez.Nuccore.Sequence_PageController.PreviousPageName': 'results',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_Facets.FacetsUrlFrag': 'filters=',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_Facets.FacetSubmitted': 'false',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_Facets.BMFacets': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.sPresentation': 'docsum',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.sPageSize': '20',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.sSort': 'none',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.FFormat': 'docsum',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.FSort': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.CSFormat': 'fasta_cds_na',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.GFFormat': 'gene_fasta',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.Db': 'nuccore',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.QueryKey': self.key,  # required parameter
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.CurrFilter': 'all',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.ResultCount': self.total_num,  # total number of results
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.ViewerParams': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.FileFormat': 'docsum',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.LastPresentation': 'docsum',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.Presentation': 'docsum',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.PageSize': '20',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.LastPageSize': '20',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.Sort': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.LastSort': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.FileSort': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.Format': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.LastFormat': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.PrevPageSize': '20',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.PrevPresentation': 'docsum',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.PrevSort': '',
                'CollectionStartIndex': '1',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_ResultsController.ResultCount': self.total_num,  # total number of results
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_ResultsController.RunLastQuery': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_ResultsController.AccnsFromResult': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Entrez_Pager.CurrPage': str(self.j),   # page being requested
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Entrez_Pager.cPage': str(self.j-1),  # previous page
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.sPresentation2': 'docsum',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.sPageSize2': '20',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.sSort2': 'none',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.TopSendTo': 'genefeat',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.FFormat2': 'docsum',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.FSort2': '',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.CSFormat2': 'fasta_cds_na',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_DisplayBar.GFFormat2': 'gene_fasta',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_MultiItemSupl.Taxport.TxView': 'list',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_MultiItemSupl.Taxport.TxListSize': '5',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_MultiItemSupl.RelatedDataLinks.rdDatabase': 'rddbto',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Sequence_MultiItemSupl.RelatedDataLinks.DbName': 'nuccore',
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Discovery_SearchDetails.SearchDetailsTerm': '"{}"[Organism] OR {}[All Fields]'.format(TERM, TERM),  # search term
                'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.HistoryDisplay.Cmd': 'PageChanged',
                'EntrezSystem2.PEntrez.DbConnector.Db': 'nuccore',
                'EntrezSystem2.PEntrez.DbConnector.LastDb': 'nuccore',
                'EntrezSystem2.PEntrez.DbConnector.Term': TERM,  # search term
                'EntrezSystem2.PEntrez.DbConnector.LastTabCmd': '',
                'EntrezSystem2.PEntrez.DbConnector.LastQueryKey': self.key,
                'EntrezSystem2.PEntrez.DbConnector.IdsFromResult': '',
                'EntrezSystem2.PEntrez.DbConnector.LastIdsFromResult': '',
                'EntrezSystem2.PEntrez.DbConnector.LinkName': '',
                'EntrezSystem2.PEntrez.DbConnector.LinkReadableName': '',
                'EntrezSystem2.PEntrez.DbConnector.LinkSrcDb': '',
                'EntrezSystem2.PEntrez.DbConnector.Cmd': 'PageChanged',
                'EntrezSystem2.PEntrez.DbConnector.TabCmd': '',
                'EntrezSystem2.PEntrez.DbConnector.QueryKey': '',
                'p$a': 'EntrezSystem2.PEntrez.Nuccore.Sequence_ResultsPanel.Entrez_Pager.Page',
                'p$l': 'EntrezSystem2',
                'p$st': 'nuccore',
            }
            return data
复制代码
    # Only the core settings are shown here.

    # Search term: the kind of gene sequence to crawl.
    TERM = "Coelogyne"

    # Limits how many result pages are crawled.
    TOTAL_PAGE = 43

    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False

    # Enable or disable downloader middlewares.
    # Author's note: middlewares.py must define a Random_User_Agent class that
    # gives every request a different User-Agent, otherwise the remote server
    # recognises the client as a crawler.
    DOWNLOADER_MIDDLEWARES = {
        "govspider.middlewares.GovspiderDownloaderMiddleware": 543,
        "govspider.middlewares.Random_User_Agent": 500,
    }
复制代码
    import json
    from govspider.settings import TERM
    
    
    class GovspiderPipeline(object):
        """Append every scraped sequence to a text file named after ``TERM``."""

        def __init__(self):
            # Opened in append mode, so repeated runs add to the same file.
            # NOTE(review): the output directory is hard-coded to a Windows
            # desktop path and must already exist.
            self.file = open(r'C:\Users\User\Desktop\FASTA\{}.txt'.format(TERM), mode='a', encoding='utf-8')

        def process_item(self, item, spider):
            """Write one item to the output file and pass the item on.

            The sequence text has its newlines removed, is JSON-encoded as a
            single string and is followed by a separator line of asterisks.
            """
            sequence = item['ribonucleic_acid_sequence'].replace('\n', '')
            jsontext = json.dumps(sequence, ensure_ascii=False)
            self.file.write(jsontext + '\n\n*************\n')
            return item

        def close_spider(self, spider):
            """Close the output file so buffered data is flushed to disk."""
            self.file.close()

全部评论 (0)

还没有任何评论哟~