Advertisement

python爬取中国大学(高校)基本信息

阅读量:

Python爬取中国大学(高校)基本信息

python爬取中国大学(高校)基本信息

简单的一个小爬虫,获取中国高校基本信息

一、输出到excel表格结果

结果展示

二、代码

复制代码
    # A highlighted block
    # -*- coding:utf-8 -*-
    # author:zhang shuochuan
    # datetime:2021/4/16 20:23
    # software: PyCharm
    
    from bs4 import BeautifulSoup
    import urllib.request,urllib.error
    import xlwt
    from tqdm import tqdm
    
    def getdata(pages=142):
        """Download the university listing pages and return their table rows.

        Each page is requested from the CHSI search endpoint with a ``start``
        offset of ``page_index * 20``, parsed with BeautifulSoup, and every
        ``<tr>`` that contains at least one ``<td>`` becomes one row.

        Args:
            pages: Number of listing pages to fetch. Defaults to 142, the page
                count of the target site noted by the original author.

        Returns:
            A list of rows; each row is a list of the stripped text of the
            ``<td>`` cells of one ``<tr>``. Rows without ``<td>`` cells
            (e.g. header rows) are skipped.

        Raises:
            urllib.error.URLError: If a page cannot be fetched; the error is
                not caught here, so one failed page aborts the whole run.
        """
        baseurl = 'https://gaokao.chsi.com.cn/sch/search.do?searchType=1&start='
        # The headers never change between pages, so build them once.
        head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
        }
        datalist = []
        for i in tqdm(range(pages)):  # tqdm only adds a progress bar
            url = baseurl + str(i * 20)
            request = urllib.request.Request(url, headers=head)
            # The context manager closes the connection even if read/decode fails.
            with urllib.request.urlopen(request, timeout=10) as response:
                html = response.read().decode("utf-8")
            soup = BeautifulSoup(html, "html.parser")
            for item in soup.find_all('tr'):
                # "\ue664" is a private-use character that is mapped to "是" (yes);
                # presumably the site's icon-font tick mark -- TODO confirm.
                row = [
                    td.text.strip().replace("\ue664", "是")
                    for td in item.find_all('td')
                ]
                if row:
                    datalist.append(row)
        return datalist
    
    def save(datalist, filename='中国大学.xls'):
        """Write the scraped rows to an Excel workbook.

        Row 0 of the sheet holds the column titles; each entry of ``datalist``
        is written to the following rows in order, one cell per item.

        Args:
            datalist: Iterable of rows, each an iterable of cell values.
            filename: Path of the ``.xls`` file to write. Defaults to the
                original hard-coded name.
        """
        university = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = university.add_sheet('中国大学名单', cell_overwrite_ok=True)
        col = ("院校名称", "院校所在地", "教育行政主管部门", "院校类型", "学历层次", "一流大学建设高校", "一流学科建设高校", "研究生院", "满意度")
        for col_index, title in enumerate(col):
            sheet.write(0, col_index, title)
        # Data rows start at sheet row 1, directly below the header row.
        for row_index, data in enumerate(datalist, start=1):
            for col_index, value in enumerate(data):
                # An empty cell is stored as "不是" (no).
                if value == '':
                    value = '不是'
                sheet.write(row_index, col_index, value)
        university.save(filename)
    
    
    if __name__ == '__main__':
        # Script entry point: scrape all pages, then write them to the workbook.
        rows = getdata()
        save(rows)

全部评论 (0)

还没有任何评论哟~