python爬取中国大学(高校)基本信息
发布时间
阅读量:
阅读量
Python爬取中国大学(高校)基本信息
python爬取中国大学(高校)基本信息
一个简单的小爬虫:抓取中国高校的基本信息,并将结果保存到 Excel 表格中。
一、输出到 Excel 表格的结果

二、代码
// A highlighted block
# -*- coding:utf-8 -*-
# author:zhang shuochuan
# datetime:2021/4/16 20:23
# software: PyCharm
from bs4 import BeautifulSoup
import urllib.request,urllib.error
import xlwt
from tqdm import tqdm
def getdata(pages=142, page_size=20):
    """Fetch the university listing pages and return their table rows.

    Each page is requested from the gaokao.chsi.com.cn search endpoint with
    a ``start`` offset of ``page_index * page_size``, decoded as UTF-8 and
    parsed with BeautifulSoup. Progress is displayed through ``tqdm``.

    Args:
        pages: Number of result pages to request. Defaults to 142, the page
            count the script was originally written against.
        page_size: Offset step between consecutive pages. Defaults to 20.

    Returns:
        A list of rows. Each row is a list holding the stripped text of
        every ``<td>`` found inside one ``<tr>``; ``<tr>`` elements that
        contain no ``<td>`` are skipped.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a request
            fails; no retry or per-page recovery is attempted.
        UnicodeDecodeError: If a response body is not valid UTF-8.
    """
    baseurl = 'https://gaokao.chsi.com.cn/sch/search.do?searchType=1&start='
    # The header never changes between requests, so build it once.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
    }
    datalist = []
    for i in tqdm(range(pages)):
        url = baseurl + str(i * page_size)
        request = urllib.request.Request(url, headers=head)
        # The context manager closes the HTTP response (and its socket)
        # even when reading or decoding raises.
        with urllib.request.urlopen(request, timeout=10) as response:
            html = response.read().decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('tr'):
            # U+E664 is a private-use character; presumably the site's
            # icon-font "yes" mark (TODO confirm). It is stored as "是".
            data = [
                str(td.text.strip()).replace("\ue664", "是")
                for td in item.find_all('td')
            ]
            if data:
                datalist.append(data)
    return datalist
def save(datalist):
    """Write the scraped rows to the Excel workbook ``中国大学.xls``.

    A single sheet is created whose first row holds the nine column titles.
    Every entry of ``datalist`` is then written as one row below it, one
    cell per element, with empty strings stored as ``'不是'``. The workbook
    is saved under a relative path, i.e. in the current working directory.

    Args:
        datalist: Iterable of rows, each an iterable of cell values.
    """
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('中国大学名单', cell_overwrite_ok=True)
    headers = ("院校名称", "院校所在地", "教育行政主管部门", "院校类型", "学历层次", "一流大学建设高校", "一流学科建设高校", "研究生院", "满意度")

    # Row 0 carries the column titles.
    for column, title in enumerate(headers):
        sheet.write(0, column, title)

    # Data rows start at row 1, directly beneath the titles.
    for row_number, record in enumerate(datalist, start=1):
        for column, value in enumerate(record):
            cell = '不是' if value == '' else value
            sheet.write(row_number, column, cell)

    workbook.save('中国大学.xls')
if __name__ == '__main__':
    # Script entry point: scrape every page, then export the rows to Excel.
    save(getdata())
全部评论 (0)
还没有任何评论哟~
