使用python抓取落网期刊图片
发布时间
阅读量:
阅读量
使用python抓取落网期刊图片
过去两年多以来,我一直在使用Python开发。
#coding:gbk
import os
import bs4
import requests
# Directory where the magazine cover images are saved; change it to suit your setup.
base_dir = r"F:\落网电台"
def main():
    '''
    # Entry point: make sure the save directory exists, then start
    # crawling from the first magazine listing page.
    '''
    start_url = 'http://www.luoo.net/music/'
    # Create base_dir up front; download_img opens files inside it and
    # would otherwise hit IOError (and silently skip) on a fresh machine.
    if not os.path.isdir(base_dir):
        os.makedirs(base_dir)
    parse_detail(start_url)
def parse_detail(req_url):
    '''
    # Crawl a magazine listing page, download every issue's cover image,
    # then follow the "next page" link until pagination runs out.
    #
    # :param req_url: URL of the first listing page to process.
    # :raises requests.HTTPError: if a listing page returns a bad status.
    '''
    # Iterate instead of recursing so a site with many pages cannot
    # grow the call stack without bound.
    while req_url:
        resp = requests.get(req_url)
        resp.raise_for_status()
        # Force UTF-8 so the Chinese titles decode correctly.
        resp.encoding = 'utf-8'
        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        vol_items = soup.select('div.vol-list > div > a')
        print('当前页面期刊div.item节点数目:[%d]'%len(vol_items))
        if not vol_items:
            print('当前没有需要下载的期刊image..')
            return True
        for item in vol_items:
            vol_title = item['title']
            # The issue number is the last path segment of the link href.
            vol_num = item['href'].split('/')[-1]
            for child in item.children:
                if isinstance(child, bs4.element.Tag) and child.name == 'img':
                    image_url = child['src']
                    print("当前图片的链接:[%s]"%image_url)
                    download_img(image_url, vol_num, vol_title)
        # Follow pagination; stop the loop when there is no "next" link.
        page_items = soup.select('div.paginator > a.next')
        if page_items:
            req_url = page_items[0]['href']
            print("下一页链接:[%s]"%req_url)
        else:
            req_url = None
def download_img(vol_img_url, vol_num, vol_title):
    '''
    # Save one issue's cover image into base_dir.
    #
    # :param vol_img_url: direct URL of the cover image.
    # :param vol_num: issue number, used in the file name.
    # :param vol_title: issue title, used in the file name.
    # :raises requests.HTTPError: if the image request returns a bad status.
    '''
    image_name = 'Vol.{0} {1}.jpg'.format(vol_num, vol_title)
    image_path = os.path.join(base_dir, image_name)
    # Skip files we already have *before* touching the network, so a
    # re-run of the script does not re-download every image.
    if os.path.exists(image_path):
        return
    print('开始下载:[%s]'%vol_img_url)
    resp = requests.get(vol_img_url)
    resp.raise_for_status() # 下载出现问题,立即抛出异常
    try:
        # "with" guarantees the handle is closed even if a write fails.
        with open(image_path, 'wb') as image_handle:
            # Stream in 10 KB chunks instead of loading the whole image.
            for chunk in resp.iter_content(10000):
                image_handle.write(chunk)
    except IOError:
        # Best-effort: a title with characters illegal in file names
        # should not abort the whole crawl.
        return
# Script entry point: start crawling when run directly (not on import).
if __name__ == '__main__':
    main()
注
1. 下面是运行脚本后,在路径 base_dir 下生成的期刊图片:

2. 运行此脚本,需要安装依赖库:pip install requests bs4 lxml
全部评论 (0)
还没有任何评论哟~
