Advertisement

python 爬虫 爬取高考录取分数线 信息

阅读量:

python 爬虫获取高考录取分数数据

上一篇: axios 原生上传xlsx文件

下一篇: pandas 表格 数据补全空值

网页

链接地址为:https://gkcx.eol.cn/school/search;访问该链接即可获取所需信息

完整资料共 1 GB 多

e5e26861bade61ed13fe6ea2a0804c709ca.jpg
5348286ab0b8bbc21256127bacd80a1eb0d.jpg

高校信息爬取接口

复制代码
 import requests_html

    
 import json
    
  
    
 # One HTTP session reused for the request in this snippet.
 sess = requests_html.HTMLSession()
 url = "https://api.eol.cn/gkcx/api/"
 # Form fields posted to the endpoint. "page" is the page number being
 # requested (the full script below loops over it); the filter fields are
 # left empty. "size" is presumably the page length -- TODO confirm.
 data = {
     "access_token": "",
     "admissions": "",
     "central": "",
     "department": "",
     "dual_class": "",
     "f211": "",
     "f985": "",
     "is_dual_class": "",
     "keyword": "",
     "page": 8,
     "province_id": "",
     "request_type": 1,
     "school_type": "",
     "size": 20,
     "sort": "view_total",
     "type": "",
     "uri": "apigkcx/api/school/hotlists",
 }
 res = sess.post(url, data)
 js = res.json()
 print(type(js))  # <class 'dict'>

 # js['data']['item'] is parsed JSON, not text: file.write() only accepts
 # str, so the value has to be serialised with json.dump instead.
 with open('./school.json', mode='w+') as f:
     json.dump(js['data']['item'], f)
    
    
    
    
    代码解释

省份信息:在带有省份下拉框的页面中查看网络请求,即可找到省份数据接口。

e87ee73ca5e76a926cf3c4432690c66a78e.jpg
复制代码
 def get_province():
     """Download the province configuration JSON and save it locally.

     Sends a GET request through the module-level ``sess``, prints the
     decoded JSON and writes it to ``./province.json``.
     """
     url = "https://static-data.eol.cn/www/config/detial/1.json"
     res = sess.get(url)
     js = res.json()
     print(js)
     with open('./province.json', mode='w+') as f:
         # The dump belongs inside the ``with`` body so the file is still open.
         json.dump(js, f)
    
  
    
    
    
    
    代码解释

接口对单次返回的数据量有限制,因此需要分页请求全部数据,再把各页的学校信息合并,最后导出为 .xlsx 文件。

完整代码

复制代码
 import requests_html

    
 import json
    
 import glob
    
 import pandas as pd
    
  
    
 # HTTP session shared by get_school() and get_province() below.
 sess = requests_html.HTMLSession()
    
  
    
  
    
 def get_school(page_count=300):
     """Fetch the school hot-list page by page and save each page as JSON.

     For every page number in ``range(page_count)`` the query form is POSTed
     through the module-level ``sess`` and ``js['data']['item']`` is written
     to ``./school/school_<page>.json``.

     Args:
         page_count: Number of pages to request, starting at page 0.
             Defaults to 300, the value the script originally hard-coded.
     """
     import os  # local import: this script's import block does not include os

     url = "https://api.eol.cn/gkcx/api/"
     # open() does not create missing directories, so make sure it exists.
     os.makedirs('./school', exist_ok=True)
     for pageNo in range(page_count):
         data = {
             "access_token": "",
             "admissions": "",
             "central": "",
             "department": "",
             "dual_class": "",
             "f211": "",
             "f985": "",
             "is_dual_class": "",
             "keyword": "",
             "page": pageNo,
             "province_id": "",
             "request_type": 1,
             "school_type": "",
             "size": 20,
             "sort": "view_total",
             "type": "",
             "uri": "apigkcx/api/school/hotlists",
         }
         res = sess.post(url, data)
         js = res.json()
         # Progress/debug output: the type of this page's payload.
         print(type(js['data']['item']))
         with open(f'./school/school_{pageNo}.json', mode='w+') as f:
             json.dump(js['data']['item'], f)
    
  
    
  
    
 def get_province():
     """Download the province configuration JSON and save it locally.

     Sends a GET request through the module-level ``sess``, prints the
     decoded JSON and writes it to ``./province.json``.
     """
     url = "https://static-data.eol.cn/www/config/detial/1.json"
     res = sess.get(url)
     js = res.json()
     print(js)
     with open('./province.json', mode='w+') as f:
         # The dump belongs inside the ``with`` body so the file is still open.
         json.dump(js, f)
    
  
    
  
    
 def show_province():
     """Load ``./province.json`` and print its decoded contents."""
     with open('./province.json') as f:
         # Reading must happen while the file is open, i.e. inside the block.
         data = json.load(f)
     print(data)
    
  
    
  
    
 def merge_school():
     """Merge the per-page school files into one JSON file and one spreadsheet.

     Every file matching ``./school/*`` is loaded and its list of schools is
     appended to a single list. The combined list is printed by length,
     written to ``./json/school.json`` and exported to ``./school.xlsx``.
     """
     import os  # local import: this script's import block does not include os

     all_school = []
     # sorted() gives a reproducible merge order; glob itself guarantees none.
     for path in sorted(glob.glob('./school/*')):
         with open(path) as f:
             all_school += json.load(f)

     print(len(all_school))  # the author's run printed 2855

     # open() does not create missing directories, so make sure it exists.
     os.makedirs('./json', exist_ok=True)
     with open('./json/school.json', mode='w+') as f:
         # Dump the list itself. Serialising it with json.dumps first and then
         # dumping that string stores one big JSON *string*, which loads back
         # as str instead of a list of school dicts.
         json.dump(all_school, f)

     # Build the frame straight from the records rather than re-parsing JSON.
     df = pd.DataFrame(all_school)
     df.to_excel('./school.xlsx')
    
  
    
  
    
 # Script entry point. The download steps (get_school, get_province) are
 # commented out, so running the file only merges the already-saved pages.
 if __name__ == '__main__':
    
     # get_school()
    
     # get_province()
    
     merge_school()
    
    
    
    
    代码解释

查询每个学校每年在各个省份的录取成绩

5b29eeb3b1a3d3571c138d32c453949eb47.jpg
复制代码
 import json

    
 import requests
    
 import os
    
 from multiprocessing import Pool
    
 import time
    
 import random
    
  
    
 # Admission years to query: 2014 through 2018 inclusive.
 years = range(2014, 2019)
 # Page numbers requested per query: 0 and 1. (The original note said three
 # pages are basically enough -- NOTE(review): the range only yields two.)
 pages = range(2)
 # Subject tracks sent as "local_type_id": 理科 (science) and 文科 (liberal arts).
 types = [
     {'code': 1, 'name': '理科'},
     {'code': 2, 'name': '文科'},
 ]
    
 # Province list iterated by get_score(), which reads each entry's 'name' and
 # 'code'. NOTE(review): get_province() in the previous script writes
 # ./province.json, so the file has to be placed under ./json/ first.
 with open('./json/province.json') as f:
    
     provinces = json.load(f)
    
 # School data produced by merge_school() in the previous script; iterated by
 # start() and thread_start().
 with open('./json/school.json') as f:
    
     schools = json.load(f)
    
  
    
 # Request headers passed to every POST issued by get_score().
 headers = {
     "Accept": "application/json, text/plain, */*",
     "Content-Type": "application/json;charset=UTF-8",
     "Origin": "https://gkcx.eol.cn",
     "Referer": "https://gkcx.eol.cn/school/332",
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
 }
    
  
    
  
    
 def get_score(school):
     """Download all admission scores for one school and save them as JSON.

     The result is a nested dict ``score[subject name][province name][year]``
     whose leaves are the lists of rows returned by the API. It is written to
     ``./score/<school name>.json``. If that file already exists the school
     is skipped, which lets an interrupted run be resumed.

     Args:
         school: Dict with at least the keys ``'name'`` and ``'school_id'``.
     """
     path = f"./score/{school['name']}.json"
     if os.path.exists(path):
         print(school['name'], 'exists')
         return

     # open() does not create missing directories, so make sure it exists.
     os.makedirs('./score', exist_ok=True)
     url = "https://api.eol.cn/gkcx/api/"
     score = {}
     # The context manager closes the session's connections when done.
     with requests.Session() as sess:
         # "subject" rather than "type", so the builtin is not shadowed.
         for subject in types:
             by_province = {}
             score[subject['name']] = by_province
             for province in provinces:
                 by_year = {}
                 by_province[province['name']] = by_year
                 for year in years:
                     by_year[str(year)] = _fetch_year_scores(
                         sess, url, school, province, subject, year)

     with open(path, mode='w+') as f:
         json.dump(score, f)
     print(school['name'], 'finished')


 def _fetch_year_scores(sess, url, school, province, subject, year):
     """Return the score rows for one school/province/subject/year query.

     Each page number in the module-level ``pages`` is requested in turn. The
     fetch is best-effort: if a request or a response lookup fails, the error
     is printed and the rows collected so far are returned.
     """
     rows = []
     try:
         for page in pages:
             data = {
                 "access_token": "",
                 "local_province_id": province['code'],  # student's province
                 "local_type_id": subject['code'],  # subject track code
                 "page": page,  # page number
                 "school_id": school['school_id'],  # school id
                 "size": 20,
                 "uri": "apidata/api/gk/score/special",
                 "year": year,  # admission year
             }
             # Optional throttle: time.sleep(random.random() * 0.1)
             # The timeout keeps a stalled connection from hanging the worker.
             js = sess.post(url, data, headers=headers, timeout=30).json()
             rows += js['data']['item']
     except Exception as e:
         # Deliberately broad: one bad query must not abort the whole school.
         print(school['name'], province['name'], subject['name'], year, e)
     return rows
    
  
    
  
    
 def start():
     """Fetch scores for the first ten schools, one after another."""
     for school in schools[:10]:
         print(school['name'], 'start')
         get_score(school)
    
  
    
  
    
 def thread_start():
     """Fetch scores for every school using a pool of four worker processes.

     Despite the name, ``multiprocessing.Pool`` runs processes, not threads.
     """
     # Author's note: with too many workers the server refuses the requests.
     p = Pool(4)
     for school in schools:
         print(school['name'], 'start')
         p.apply_async(
             get_score,
             args=(school,),
             # The AsyncResult is discarded, so without this callback an
             # exception raised inside a worker would never be reported.
             error_callback=lambda err, name=school['name']: print(
                 name, 'failed:', err),
         )
     p.close()  # close the pool: no further tasks are accepted
     p.join()  # block the main process until all pool workers have finished
    
  
    
  
    
 def main():
     """Run the scraper through thread_start(), the pooled variant."""
    
     # start()  # alternative: sequential run over the first ten schools
    
     thread_start()
    
  
    
  
    
 # Only run when executed as a script. thread_start() uses multiprocessing.Pool,
 # so the guard presumably also keeps worker processes that re-import this
 # module from re-running main() -- verify for the start method in use.
 if __name__ == '__main__':
    
     main()
    
    
    
    
    代码解释

全部评论 (0)

还没有任何评论哟~