Advertisement

使用python抓取落网期刊图片

阅读量:

使用python抓取落网期刊图片

过去两年多以来,我一直使用Python进行开发。

复制代码
    #coding:gbk
    import os
    
    import bs4
    import requests
    
    
    # Directory where the downloaded journal cover images are saved;
    # change it to a path that suits your own machine.
    base_dir = r"F:\落网电台"
    
    def main():
        """Entry point: crawl the journal listing and save every cover image.

        Makes sure the output directory ``base_dir`` exists, then starts the
        crawl at the first listing page via ``parse_detail``.
        """
        start_url = 'http://www.luoo.net/music/'
        # The images are written straight into base_dir; without the directory
        # every open() would fail, so create it up front when it is missing.
        if not os.path.isdir(base_dir):
            os.makedirs(base_dir)
        parse_detail(start_url)
    
    def parse_detail(req_url):
        """Download the cover image of every journal on the listing pages.

        Starting at ``req_url``, each listing page is fetched and parsed. For
        every anchor matched by ``div.vol-list > div > a`` the direct ``<img>``
        children are passed to ``download_img`` together with the journal
        number (last path segment of the anchor's ``href``) and its ``title``.
        The ``div.paginator > a.next`` link is then followed until no such
        link remains.

        Pages are walked in a loop rather than by recursion, and a page URL
        that was already visited ends the crawl, so a "next" link that points
        back to an earlier page cannot cause endless work.

        Args:
            req_url: URL of the first listing page.

        Returns:
            True when the first page contains no journal anchors, otherwise
            None.

        Raises:
            requests.HTTPError: when a listing page answers with an error
                status (raised by ``raise_for_status``).
        """
        # Function-scope import so the helper is available on Python 2 and 3.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        visited = set()
        page_url = req_url
        while page_url and page_url not in visited:
            visited.add(page_url)

            resp = requests.get(page_url)
            resp.raise_for_status()
            resp.encoding = 'utf-8'

            soup = bs4.BeautifulSoup(resp.text, 'lxml')
            vol_items = soup.select('div.vol-list > div > a')
            print('当前页面期刊div.item节点数目:[%d]'%len(vol_items))
            if not vol_items:
                print('当前没有需要下载的期刊image..')
                # Only an empty *first* page is reported as True; an empty
                # later page simply ends the crawl with None.
                return True if page_url == req_url else None

            for item in vol_items:
                # Single formatted string so the line prints the same way
                # whether print is a statement or a function.
                print('%s %s %s' % (type(item), item.name, item.attrs))
                vol_title = item['title']
                vol_num = item['href'].split('/')[-1]

                for child in item.children:
                    if isinstance(child, bs4.element.Tag) and child.name == 'img':
                        # Resolve relative / protocol-relative sources against
                        # the page; absolute URLs pass through unchanged.
                        image_url = urljoin(page_url, child['src'])
                        print("当前图片的链接:[%s]"%image_url)
                        download_img(image_url, vol_num, vol_title)

            # Pagination: follow the "next" link when the page offers one.
            page_items = soup.select('div.paginator > a.next')
            if not page_items:
                break
            next_page_url = urljoin(page_url, page_items[0]['href'])
            print("下一页链接:[%s]"%next_page_url)
            page_url = next_page_url

        return None
    
    
    
    def download_img(vol_img_url, vol_num, vol_title):
        """Save one journal cover image into ``base_dir``.

        The file is named ``Vol.<vol_num> <vol_title>.jpg``. When a file with
        that name already exists nothing is downloaded. Otherwise the image is
        fetched and written in chunks; if writing fails, the error is printed
        and any partially written file is removed so that a later run does not
        mistake it for a finished download.

        Args:
            vol_img_url: URL of the cover image.
            vol_num: journal number used in the file name.
            vol_title: journal title used in the file name.

        Returns:
            None.

        Raises:
            requests.HTTPError: when the image request answers with an error
                status (raised by ``raise_for_status``).
        """
        image_name = u'Vol.{0} {1}.jpg'.format(vol_num, vol_title)
        # A byte-string base_dir (Python 2 source literal) cannot be joined
        # with a text name, so encode the name: GBK first, UTF-8 when the
        # title has characters GBK cannot represent. The choice depends only
        # on the name itself, never on whether the console could print it.
        if isinstance(base_dir, bytes) and not isinstance(image_name, bytes):
            try:
                image_name = image_name.encode('gbk')
            except UnicodeEncodeError:
                image_name = image_name.encode('utf-8')

        # Progress output must never abort a download because the console
        # encoding cannot show the title.
        try:
            print(image_name)
        except UnicodeEncodeError:
            print(repr(image_name))

        image_path = os.path.join(base_dir, image_name)
        # Skip the network request entirely when the image is already on disk.
        if os.path.exists(image_path):
            return

        print('开始下载:[%s]'%vol_img_url)
        resp = requests.get(vol_img_url)
        resp.raise_for_status()  # abort immediately when the download failed

        try:
            with open(image_path, 'wb') as image_handle:
                for chunk in resp.iter_content(10000):
                    image_handle.write(chunk)
        except IOError as err:
            print('保存图片失败:[%s]' % err)
            # Drop the incomplete file; it may not exist at all when open()
            # itself failed, hence the ignored OSError.
            try:
                os.remove(image_path)
            except OSError:
                pass
            return
    
    # Run the crawler only when the file is executed as a script.
    if __name__ == '__main__':
        main()


1. 下面是运行脚本后,在路径 base_dir 下生成的期刊图片:

这里写图片描述

2. 运行此脚本,需要安装依赖库:pip install bs4 requests lxml

全部评论 (0)

还没有任何评论哟~