Scraping Python Job Listings from Lagou.com with Python and Selenium (Web Scraping)

Scraping Python job postings from Lagou.com


**17/10** Thursday, sunny

Overall approach:
1. Use the selenium module (covered recently) to drive a real browser and scrape the pages
2. Parse the pages with XPath via lxml (implemented in C, so parsing is fast); a minimal sketch of this parsing pattern follows this list
3. Save the results as CSV
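To make point 2 concrete, here is a minimal, self-contained sketch of the lxml + XPath pattern the spider uses throughout. The HTML snippet and class names below are invented for illustration only:

```python
from lxml import etree

# Toy HTML standing in for a Lagou listing page (structure invented for illustration).
snippet = '''
<ul class="item_con_list">
  <li><a class="position_link" href="/jobs/1.html">Python dev</a></li>
  <li><a class="position_link" href="/jobs/2.html">Data engineer</a></li>
</ul>
'''

html = etree.HTML(snippet)                                 # parse the raw HTML string
links = html.xpath('//a[@class="position_link"]/@href')   # extract attribute values
print(links)  # ['/jobs/1.html', '/jobs/2.html']
```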
Required modules:
```python
import random
import time
import csv
from urllib.parse import quote

from lxml import etree
from selenium import webdriver
```
Of these, `selenium` and `lxml` are third-party packages and must be installed first (e.g. `pip install selenium lxml`).

```python
class LaGoSpider(object):
    '''Wrapped in a class to keep the spider easy to drive.'''

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # run Chrome without a visible window
        # Disable image loading to speed up page loads.
        options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2})
        self.driver = webdriver.Chrome(
            r'D:\外安装软件\selenium1\chromedriver_win32\chromedriver.exe',
            options=options)
        self.data_list = []

    def address_url(self):
        '''Build the target URL for each city and walk its result pages.'''
        self.citys = ['全国', '北京', '深圳', '广州', '杭州', '成都',
                      '南京', '上海', '厦门', '西安', '长沙']
        self.baseurl = 'https://www.lagou.com/jobs/list_python?px=default&city={}'
        for self.city in self.citys:
            # percent-encode the Chinese city name before splicing it into the URL
            self.url = self.baseurl.format(quote(self.city))
            self.driver.get(self.url)
            print('Scraping %s' % self.city)
            while True:
                source = self.driver.page_source
                self.position_url_parse(source)
                # NOTE: the class values inside the original post's XPath
                # expressions were stripped; the selectors here are
                # reconstructed from Lagou's markup of the time.
                next_page = self.driver.find_element_by_xpath(
                    '//span[contains(@class, "pager_next")]')
                if 'pager_next_disabled' in next_page.get_attribute('class'):
                    # The "next" button is disabled: last page for this city.
                    print('Finished %s' % self.city)
                    break
                else:
                    # Click via JavaScript to dodge overlay/interception issues.
                    self.driver.execute_script("arguments[0].click()", next_page)
                    print('---------------- next page --------------')
                    time.sleep(random.randint(3, 5))
```
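One detail worth calling out in `address_url`: `quote()` percent-encodes the Chinese city names before they are interpolated into the URL, which is what Lagou's `city=` query parameter expects. A quick illustration, using the same base URL as the spider:

```python
from urllib.parse import quote

base = 'https://www.lagou.com/jobs/list_python?px=default&city={}'
print(base.format(quote('北京')))
# https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC
```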

```python
    def position_url_parse(self, source):
        '''Pull the detail-page URL of every position on a listing page.'''
        html = etree.HTML(source)
        # Class names reconstructed; the original post's attribute values were stripped.
        lis = html.xpath('//ul[@class="item_con_list"]//li')
        for li in lis:
            position_url = li.xpath('.//a[@class="position_link"]/@href')[0]
            self.request_urls(position_url)
            time.sleep(random.randint(1, 3))

    def request_urls(self, list_url):
        '''Open a detail page in a new tab, scrape it, then return to the list tab.'''
        self.driver.execute_script('window.open("%s")' % list_url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        source = self.driver.page_source
        self.parse_position(source)
        time.sleep(random.randint(1, 3))
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
        time.sleep(random.randint(1, 3))
```
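`request_urls` relies on Selenium's window-handle bookkeeping: open the detail page in a new tab, switch focus to it, scrape, close it, and switch back so the listing page stays live. Stripped of the Lagou specifics, the pattern looks like this; `scrape_in_new_tab` is a hypothetical helper name, and it assumes an already-constructed `driver`:

```python
def scrape_in_new_tab(driver, url):
    """Open url in a new tab, grab its page source, close the tab, switch back."""
    driver.execute_script('window.open("%s")' % url)
    driver.switch_to.window(driver.window_handles[-1])  # newest tab is usually last
    source = driver.page_source
    driver.close()                                      # close the detail tab
    driver.switch_to.window(driver.window_handles[0])   # back to the list tab
    return source
```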

```python
    def parse_position(self, source):
        '''Extract the detail fields of a single position.'''
        self.data = {}
        html = etree.HTML(source)
        # As above, the class values in these selectors are reconstructed
        # from Lagou's detail-page markup of the time.
        company = html.xpath('//dl[@class="job_company"]/dt/a/img/@alt')[0]
        print(company)
        self.data['公司'] = company
        name = html.xpath('//div[@class="job-name"]//span[@class="name"]/text()')[0]
        self.data['名称'] = name
        salary = html.xpath('//dd[@class="job_request"]/p[1]/span[1]/text()')[0]
        self.data['薪资'] = salary
        city = html.xpath('//dd[@class="job_request"]/p[1]/span[2]/text()')[0].replace('/', '')
        self.data['城市'] = city
        jinyan = html.xpath('//dd[@class="job_request"]/p[1]/span[3]/text()')[0].replace('/', '')
        self.data['经验'] = jinyan
        xueli = html.xpath('//dd[@class="job_request"]/p[1]/span[4]/text()')[0].replace('/', '')
        self.data['学历'] = xueli
        zhihuo = html.xpath('//*[@id="job_detail"]/dd[1]/p/text()')[0]
        self.data['职位诱惑'] = zhihuo
        # Strip the boilerplate section labels out of the description text.
        zhimiao = ''.join(html.xpath('//div[@class="job-detail"]//p//text()')) \
            .replace('岗位职责: ', '').replace('岗位要求:', '').replace('岗位职责:', '') \
            .replace('工作职责:', '').replace('项目背景:', '').replace('-', '').strip()
        self.data['职位描述'] = zhimiao
        self.data_list.append(self.data)
        self.csv_()

    def csv_(self):
        '''Rewrite the CSV from everything collected so far.'''
        header = ['公司', '名称', '薪资', '城市', '经验', '学历', '职位诱惑', '职位描述']
        with open('lagou_quanguo.csv', 'w', encoding='utf-8', newline='') as fb:
            writer = csv.DictWriter(fb, header)
            writer.writeheader()
            writer.writerows(self.data_list)


if __name__ == '__main__':
    LG = LaGoSpider()
    LG.address_url()
```
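Note a design quirk: `csv_()` is called once per job, so the whole file is rewritten from `data_list` after every position. That is harmless at this scale and means a partial file survives a crash, but if you would rather append one row at a time, a minimal alternative sketch follows; `append_row` is a hypothetical replacement, not part of the original post:

```python
import csv
import os

def append_row(row, path='lagou_quanguo.csv',
               header=('公司', '名称', '薪资', '城市', '经验', '学历', '职位诱惑', '职位描述')):
    """Append a single job dict to the CSV, writing the header only once."""
    new_file = not os.path.exists(path)
    with open(path, 'a', encoding='utf-8', newline='') as fb:
        writer = csv.DictWriter(fb, fieldnames=header)
        if new_file:
            writer.writeheader()
        writer.writerow(row)
```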

The years have you in them; cherish the time we share.



