
Python: Scraping Lagou Python Job Listings with Selenium (Crawler)


Scraping Lagou's Python job postings


17/10, Thursday, sunny

Overall approach:

1. Use the selenium module we covered recently to drive a simulated browser and fetch the rendered pages.

2. Parse the pages with XPath through lxml (its C implementation keeps parsing fast); see the warm-up sketch after this list.

3. Save the results as a CSV file.
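As a warm-up, here is a minimal sketch of those three steps chained together. It assumes chromedriver is on your PATH, and it uses example.com, a throwaway XPath, and a demo.csv output file as placeholders rather than Lagou's real markup:

    import csv

    from lxml import etree
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    driver.get('https://example.com')                     # 1. fetch with a simulated browser
    html = etree.HTML(driver.page_source)                 # 2. parse the rendered HTML with XPath
    rows = [{'title': t} for t in html.xpath('//h1/text()')]
    driver.quit()

    with open('demo.csv', 'w', encoding='utf-8', newline='') as fb:  # 3. save as CSV
        writer = csv.DictWriter(fb, ['title'])
        writer.writeheader()
        writer.writerows(rows)

The full spider below follows exactly this shape, just with pagination, tab handling, and more fields.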

Required modules:

    import random
    import time
    import csv
    from urllib.parse import quote

    from lxml import etree
    from selenium import webdriver

Of these, selenium and lxml are third-party packages and must be installed with pip.
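If either is missing, both install in one command:

    pip install selenium lxml

You also need a chromedriver binary that matches your local Chrome version; its path is handed to webdriver.Chrome in the code below.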

    class LaGoSpider(object):
        '''Wrap the crawler in a class to keep the steps organised.'''

        def __init__(self):
            options = webdriver.ChromeOptions()
            options.add_argument('--headless')
            # Skip image loading to speed up page rendering
            options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
            self.driver = webdriver.Chrome(r'D:\外安装软件\selenium1\chromedriver_win32\chromedriver.exe', options=options)
            self.data_list = []

        def address_url(self):
            '''Build the listing URL for each city and walk its result pages.'''
            self.citys = ['全国', '北京', '深圳', '广州', '杭州', '成都', '南京', '上海', '厦门', '西安', '长沙']
            self.baseurl = 'https://www.lagou.com/jobs/list_python?px=default&city={}'
            for self.city in self.citys:
                self.url = self.baseurl.format(quote(self.city))
                self.driver.get(self.url)
                print('Crawling %s' % self.city)
                while True:
                    source = self.driver.page_source
                    self.position_url_parse(source)
                    # NOTE: the class names in the XPath expressions in this spider
                    # matched Lagou's markup when this was written; they may have changed.
                    next_page = self.driver.find_element_by_xpath('//div[@class="pager_container"]/span[last()]')
                    if 'pager_next_disabled' in next_page.get_attribute('class'):  # last page for this city?
                        print('%s finished' % self.city)
                        break
                    else:
                        self.driver.execute_script("arguments[0].click()", next_page)
                        print('---------------- next page ----------------')
                        time.sleep(random.randint(3, 5))
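Two details here are easy to miss: the next-page button is clicked through execute_script rather than next_page.click(), which sidesteps the "element click intercepted" errors that Lagou's floating overlays can cause, and the random 3-5 second sleep throttles the crawl so the site's anti-bot checks are less likely to fire.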
        def position_url_parse(self, source):
            '''Collect the detail-page URL of every position in the list.'''
            html = etree.HTML(source)
            lis = html.xpath('//ul[@class="item_con_list"]//li')
            for li in lis:
                position_url = li.xpath('.//a[@class="position_link"]//@href')[0]
                self.request_urls(position_url)
                time.sleep(random.randint(1, 3))

        def request_urls(self, list_url):
            '''Open a detail page in a new tab, parse it, then return to the list tab.'''
            self.driver.execute_script('window.open("%s")' % list_url)
            self.driver.switch_to.window(self.driver.window_handles[1])
            source = self.driver.page_source
            self.parse_position(source)
            time.sleep(random.randint(1, 3))
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
            time.sleep(random.randint(1, 3))
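Opening each detail page with window.open keeps the list page alive in the first tab: once parse_position finishes, the spider closes the detail tab and switches back, so pagination continues exactly where it left off.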
        def parse_position(self, source):
            '''Scrape the detail fields of one position.'''
            self.data = {}
            html = etree.HTML(source)
            company = html.xpath('//dl[@class="job_company"]/dt/a/img/@alt')[0]
            print(company)
            self.data['公司'] = company
            name = html.xpath('//div[@class="job-name"]//span[@class="name"]/text()')[0]
            self.data['名称'] = name
            salary = html.xpath('//dd[@class="job_request"]/p[1]/span[@class="salary"]/text()')[0]
            self.data['薪资'] = salary
            city = html.xpath('//dd[@class="job_request"]/p[1]/span[2]/text()')[0].replace('/', '').strip()
            self.data['城市'] = city
            jinyan = html.xpath('//dd[@class="job_request"]/p[1]/span[3]/text()')[0].replace('/', '').strip()
            self.data['经验'] = jinyan
            xueli = html.xpath('//dd[@class="job_request"]/p[1]/span[4]/text()')[0].replace('/', '').strip()
            self.data['学历'] = xueli
            zhihuo = html.xpath('//*[@id="job_detail"]/dd[1]/p/text()')[0]
            self.data['职位诱惑'] = zhihuo
            # Strip the boilerplate section headers out of the job description text
            zhimiao = ''.join(html.xpath('//div[@class="job-detail"]//p//text()')).replace('岗位职责: ', '').replace('岗位要求:', '').replace('岗位职责:', '').replace('工作职责:', '').replace('项目背景:', '').replace('-', '').strip()
            self.data['职位描述'] = zhimiao
            self.data_list.append(self.data)
            self.csv_()

        def csv_(self):
            '''Dump everything collected so far to a CSV file.'''
            header = ['公司', '名称', '薪资', '城市', '经验', '学历', '职位诱惑', '职位描述']
            # 'w' mode rewrites the file from scratch on every call
            with open('lagou_quanguo.csv', 'w', encoding='utf-8', newline='') as fb:
                writer = csv.DictWriter(fb, header)
                writer.writeheader()
                writer.writerows(self.data_list)


    if __name__ == '__main__':
        LG = LaGoSpider()
        LG.address_url()
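Note that csv_() runs after every single position and rewrites lagou_quanguo.csv from scratch each time. That is wasteful on large runs, but it means the file is always up to date if the crawl dies halfway. If Excel shows the Chinese headers as mojibake, switching the encoding to 'utf-8-sig' (an assumption about your Excel setup, not something the original code does) usually fixes it.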

May the years have you in them, and may we cherish the time together.
