Advertisement

Python学习笔记_使用Python爬学校老师信息

阅读量:

例如,下面的脚本从哈尔滨工程大学(哈工程)教师主页平台(http://homepage.hrbeu.edu.cn/irisweb/manage/resume/search/index-subject)按学院抓取每位教师的基本信息,并以"学院文件夹/教师姓名.txt"的形式保存到本地。

复制代码
    import requests
    from lxml import etree
    from bs4 import BeautifulSoup
    import re
    import os
    
    
    def makedir(filePath):
        """Create the directory ``filePath`` (including parents) if it is missing.

        Prints a confirmation when the directory is created, or a notice when
        the path already exists. The process working directory is left
        untouched: the previous ``os.chdir`` only ran on first creation and
        silently broke every later relative path.

        Args:
            filePath: Path of the directory to create.
        """
        if os.path.exists(filePath):
            print('文件夹已存在!')
            return
        # exist_ok guards against another process creating the directory
        # between the existence check above and this call.
        os.makedirs(filePath, exist_ok=True)
        print('文件夹<' + filePath + '>创建成功!')
    
    
    # Index page listing the colleges (subjects) of Harbin Engineering University.
    xueYuanUrl = 'http://homepage.hrbeu.edu.cn/irisweb/manage/resume/search/index-subject'
    # Endpoint that is POSTed a college code/name and answers with that
    # college's teacher list.
    teacherUrl = 'http://homepage.hrbeu.edu.cn/irisweb/manage/resume/search/ajaxSearchBySubjectNewPage'
    # Base URL of a teacher's detail page; a per-teacher identifier is appended.
    teacherDetailUrl = 'http://homepage.hrbeu.edu.cn/web/'

    # Root output directory; one sub-folder per college is created beneath it.
    filePath = 'F:/python/teacherInfo/'

    # Without a timeout, requests waits forever on an unresponsive server.
    webSide = requests.get(xueYuanUrl, timeout=30)
    # Let requests guess the charset from the body rather than trusting headers.
    webSide.encoding = webSide.apparent_encoding
    selector = etree.HTML(webSide.text)

    # College names and codes are read from the same <a> elements, so the two
    # lists are expected to line up index by index.
    xueYuanNames = selector.xpath('//*[@id="subject_detail_div"]/ul/li/a/text()')
    xueYuanCodes = selector.xpath('//*[@id="subject_detail_div"]/ul/li/a/@code')
    # Maps college code -> college name.
    xueYuanInfo = dict(zip(xueYuanCodes, xueYuanNames))
    # print(xueYuanInfo)
    
    # The teacher list of a college is embedded in the last <script> of the
    # response as a JSON string, shaped like:
    #   var data = JSON.parse('{"personList":[{"zhName":"...","psnCode":"...","zhNamePy":"..."}]}');
    # Both fields are pulled out with regexes, compiled once for all colleges.
    pinyinPattern = re.compile(r'"zhNamePy":"(.*?)"')
    zhNamePattern = re.compile(r'"zhName":"(.*?)"')
    # Absolute XPath of the profile box on a teacher's detail page.
    profileBox = '/html/body/div[2]/div[1]/div[2]/div[1]'

    for key, value in xueYuanInfo.items():
        params = {'subject_code': key, 'subject_name': value}
        xueYuanFilePath = filePath + value + '/'
        makedir(xueYuanFilePath)

        try:
            # A timeout keeps one dead endpoint from hanging the whole crawl.
            allTeacher = requests.post(url=teacherUrl, data=params, timeout=30)
        except requests.RequestException as exc:
            print('学院<' + value + '>教师列表请求失败: ' + str(exc))
            continue

        soup = BeautifulSoup(allTeacher.text, "html.parser")
        scripts = soup.select('script[type="text/javascript"]')
        if not scripts:
            # No script tag means no embedded teacher data; skip this college
            # instead of crashing on scripts[-1].
            print('学院<' + value + '>未找到教师数据,已跳过!')
            continue
        strr = scripts[-1].get_text()

        nameList = pinyinPattern.findall(strr)
        zhNameList = zhNamePattern.findall(strr)
        for name, zhName in zip(nameList, zhNameList):
            try:
                nameUrl = teacherDetailUrl + name
                teacherInfo = requests.get(nameUrl, timeout=30)
                teacherInfo.encoding = teacherInfo.apparent_encoding
                selector = etree.HTML(teacherInfo.text)

                # First line: the teacher's name from the <h1> heading.
                lines = ["".join(selector.xpath(profileBox + '/h1/text()'))]
                # Rows p[1]..p[10] each hold a <span> label and a <font> value.
                # Per the original author's labels they are, in order:
                # department, discipline, position, title, supervisor
                # qualification, phone, fax, e-mail, postal code, address.
                for row in range(1, 11):
                    # Row 5 spreads its value over two <font> nodes.
                    if row == 5:
                        cells = ('span[1]', 'font[1]', 'font[2]')
                    else:
                        cells = ('span[1]', 'font[1]')
                    lines.append("".join(
                        "".join(selector.xpath(
                            '%s/p[%d]/%s/text()' % (profileBox, row, cell)))
                        for cell in cells
                    ))

                # One file per teacher inside the college folder. Append mode
                # is kept from the original; `with` guarantees the handle is
                # closed even if the write fails.
                with open(xueYuanFilePath + zhName + '.txt', 'a',
                          encoding='utf-8') as fp:
                    fp.write("".join(line + '\n' for line in lines))
            except Exception as exc:
                # Best effort: report the failure and move on to the next
                # teacher. Unlike a bare `except`, this lets Ctrl-C and
                # SystemExit stop the crawl.
                print('教师<' + zhName + '>信息抓取失败: ' + str(exc))
                continue

        print('学院<'+value+'>'+'已打印完毕!!!')

代码控制台日志打印效果为:

在这里插入图片描述

文件夹生成如下:

在这里插入图片描述

教师信息截图:

在这里插入图片描述

文件打开效果为:

在这里插入图片描述

全部评论 (0)

还没有任何评论哟~