Advertisement

学信网高校信息爬取

阅读量:

代码学习

这是一份从学信网获取高校数据的 Python 代码。代码写得不够理想,还请大家多多包涵。最近比较忙,之后会抽时间完善注释并优化代码。

复制代码
    import csv

    import bs4
    import requests #爬虫库
    from bs4 import BeautifulSoup #html文本解析库
    
    def fileclear(file):
        """Discard the entire content of an open, seekable file object.

        The position is moved to offset 0 and the file is truncated there,
        so the file ends up empty with its position at the start.
        Returns None.
        """
        file.seek(0)
        file.truncate()
    
    def gethtml(url):
        """Download *url* and return its body as a parsed BeautifulSoup tree.

        The request uses a 30-second timeout. The HTTP status is not
        checked, so an error page is parsed like any other response.
        Network failures propagate as ``requests`` exceptions.
        """
        response = requests.get(url, timeout=30)
        return BeautifulSoup(response.text, "html.parser")
    
    def get_trs(table):
        """Return every ``<tr>`` element found inside *table*.

        Anything that is not a bs4 ``Tag`` (for example the ``None`` that
        ``find`` returns when nothing matched) yields an empty list, so the
        caller can iterate over the result unconditionally.
        """
        if not isinstance(table, bs4.element.Tag):
            return []
        # Calling a Tag directly is shorthand for find_all(); spelled out here.
        return table.find_all('tr')
    
    def get_tds(table):
        """Return every ``<td>`` element found inside *table*.

        Despite the parameter name, the caller in this file passes a single
        table row. Anything that is not a bs4 ``Tag`` yields an empty list,
        so the caller can iterate over the result unconditionally.
        """
        if not isinstance(table, bs4.element.Tag):
            return []
        # Calling a Tag directly is shorthand for find_all(); spelled out here.
        return table.find_all('td')
    
    def gettime(name):
        """Look up the founding year of *name* on Baidu Baike.

        Fetches the encyclopedia entry for *name*, flattens its basic-info
        box to one line and returns the four characters that follow the
        "创办时间" (founding time) label.

        Returns an empty string when the page has no basic-info box or the
        label is absent; previously those cases raised AttributeError or
        returned an unrelated slice of the text. Network failures still
        propagate as ``requests`` exceptions.
        """
        response = requests.get(
            "https://baike.baidu.com/item/" + name,
            timeout=30,
            headers={'user-agent': 'Mozilla/5.0'},
        )
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, "html.parser")

        info_box = soup.find('div', "basic-info cmn-clearfix")
        if info_box is None:
            return ""
        context = info_box.text.replace('\n', '')

        label = "创办时间"
        start = context.find(label)
        if start == -1:
            # str.find signals "not found" with -1; slicing from there
            # would return unrelated characters.
            return ""
        value_start = start + len(label)
        return context[value_start:value_start + 4]
    
    def nametolocation1(name, city):
        """Return the location of *name* from the AMap place-search API.

        Searches for *name* within *city*, restricted to the "高等院校"
        (higher-education institution) type, asks for a single XML result
        and returns the text of the first ``<location>`` element.

        Best effort: any failure (network error, no match, unexpected
        response) returns the placeholder ``"0,0"`` instead of raising.
        """
        # Passing the query as a dict lets requests percent-encode each
        # value, so characters such as '&' or '#' in the name can no longer
        # corrupt the query string.
        params = {
            "keywords": name,
            "types": "高等院校",
            "city": city,
            "offset": 1,
            "page": 1,
            "extensions": "base",
            "output": "XML",
            # Placeholder ("your own key"): replace with a real AMap API key.
            "key": "自己的key",
        }
        try:
            response = requests.get(
                "https://restapi.amap.com/v3/place/text",
                params=params,
                timeout=30,
            )
            soup = BeautifulSoup(response.text, "html.parser")
            return soup.find('location').text
        except Exception:
            # Deliberate fallback; unlike a bare except this still lets
            # KeyboardInterrupt and SystemExit through.
            return "0,0"
    
    def nametolocation2(name, city):
        """Return the location of an address from the AMap geocoding API.

        Sends *name* as the ``address`` parameter (the caller in this file
        passes a street address) together with *city*, requests XML output
        and returns the text of the first ``<location>`` element.

        Best effort: any failure (network error, no match, unexpected
        response) returns the placeholder ``"0,0"`` instead of raising.
        """
        # Passing the query as a dict lets requests percent-encode each
        # value; a raw '#' or '&' in an address would otherwise cut the
        # query string short.
        params = {
            "address": name,
            "city": city,
            "output": "XML",
            # Placeholder ("your own key"): replace with a real AMap API key.
            "key": "自己的key",
        }
        try:
            response = requests.get(
                "https://restapi.amap.com/v3/geocode/geo",
                params=params,
                timeout=30,
            )
            soup = BeautifulSoup(response.text, "html.parser")
            return soup.find('location').text
        except Exception:
            # Deliberate fallback; unlike a bare except this still lets
            # KeyboardInterrupt and SystemExit through.
            return "0,0"
    
    # ---- Crawl the CHSI university listing and write one CSV row per school ----

    OUTPUT_PATH = 'F:/毕业设计/数据处理/数据/学信网高校数据.csv'
    # 17 columns; each location string returned by the AMap helpers is split
    # on ',' and fills two of them.
    HEADER = "编号,名称,城市,主管单位,院校类型,层次,一流大学,一流学科,研究生院,满意度,网址,电话,地址,dl经度,dl纬度,poi经度,poi纬度"
    PAGE_COUNT = 64  # number of listing pages to request
    PAGE_SIZE = 20   # step of the `start` query parameter between pages


    def _clean(text):
        """Remove spaces and line breaks from scraped text."""
        return text.replace(' ', '').replace('\n', '').replace('\r', '')


    def _clean_cell(text):
        """Clean a listing-table cell and turn tick glyphs into 'yes'."""
        # NOTE(review): the published code called .replace('', 'yes'), whose
        # search string was evidently lost; with an empty pattern it inserts
        # 'yes' between every character. It is assumed here that the lost
        # character was an icon-font glyph from the Unicode Private Use Area
        # marking the "yes" columns -- confirm against the live page.
        return ''.join(
            'yes' if '\ue000' <= ch <= '\uf8ff' else ch
            for ch in _clean(text)
        )


    # Mode "w" empties the file on open (the original opened in append mode and
    # then truncated); `with` guarantees the file is closed even if the crawl
    # fails part-way. newline="" is what the csv module expects.
    with open(OUTPUT_PATH, "w", encoding="utf-8", newline="") as textfile1:
        # csv.writer quotes fields that contain commas, so an address with a
        # comma no longer shifts every following column.
        writer = csv.writer(textfile1)
        writer.writerow(HEADER.split(','))

        num = 1
        for i in range(PAGE_COUNT):
            urlnum = i * PAGE_SIZE
            results = gethtml('https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start=' + str(urlnum))
            table = results.find('table')
            for tritem in get_trs(table):
                if tritem.td is None:
                    # Row without any <td> (presumably the header row): skip.
                    continue

                # Best effort: the detail page supplies website, phone and
                # address; any failure there falls back to "无" (none).
                try:
                    schurl = tritem.td.a['href']
                    schoolinfo = gethtml('https://gaokao.chsi.com.cn' + schurl)
                    schoolinfolist = schoolinfo.find('div', "mid")('div')
                    web_telelist = schoolinfolist[0]('span')
                    weburl = _clean(web_telelist[0].text)
                    telenum = _clean(web_telelist[1].text)
                    place = _clean(schoolinfolist[1].text)
                except Exception:
                    weburl = telenum = place = "无"

                row = [str(num)]
                num += 1
                row.extend(_clean_cell(tditem.text) for tditem in get_tds(tritem))
                row.extend([weburl, telenum, place])

                # Indexing the field list (instead of re-splitting a joined
                # string) stays correct when a field itself contains a comma.
                schname, city = row[1], row[2]
                print(city)

                # Address-based geocode first, then name-based POI search,
                # matching the dl* / poi* header columns.
                row.extend(nametolocation2(place, city).split(','))
                row.extend(nametolocation1(schname, city).split(','))
                print(','.join(row))
                writer.writerow(row)

全部评论 (0)

还没有任何评论哟~