Advertisement

python抓取网页内容到excel_Python网页内容Selenium抓取+Excel输出

阅读量:

需要安装的Python库包括:

pip install selenium

pip install openpyxl

Selenium还需要安装Chrome WebDriver:可以从网络下载,然后拷贝到Chrome的安装路径下。

引用库

# import time library
import time

# import excel library
from openpyxl import load_workbook

# import selenium libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

等待页面加载完(指定内容元素出现)

# wait for report initial loading
def waitLoad(driver):
    """Block until the report page contains an element whose text is 'Past Due'.

    The wait only checks that the element is present in the DOM and
    polls for at most 200 seconds.

    Args:
        driver: Selenium WebDriver instance whose current page is polled.

    Raises:
        selenium.common.exceptions.TimeoutException: if the element has not
            appeared within the 200 second limit (raised by
            ``WebDriverWait.until``).
    """
    # The located element itself is not needed; only the wait matters.
    WebDriverWait(driver, 200).until(
        EC.presence_of_element_located((By.XPATH, ".//*[text() = 'Past Due']"))
    )

选择下拉菜单项目并点击按钮

# check/select application in application filter
def pickApp(driver, app):
    """Click the filter input for *app*, then click the filter button.

    Both clicks are dispatched through JavaScript (``execute_script``)
    instead of ``WebElement.click()``.

    Args:
        driver: Selenium WebDriver instance showing the page with the filter.
        app: Text matched against the ``value`` attribute of the ``<input>``
            element to click. It is concatenated into the XPath as-is, so a
            value containing a single quote produces an invalid expression
            (reported as ``False``).

    Returns:
        True when both clicks were dispatched; False when no matching input
        exists or any error occurred along the way.
    """
    try:
        elements = driver.find_elements(By.XPATH, ".//input[@value='" + app + "']")
        if not elements:
            # No such application in the filter list.
            return False
        driver.execute_script("arguments[0].click();", elements[0])
        btn_filter = driver.find_element(By.ID, "cp-filter-btn")
        driver.execute_script("arguments[0].click();", btn_filter)
        return True
    except Exception:
        # Best-effort by design: any failure is reported as False. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return False

初始化Web Driver并开始浏览网页

# Start a Chrome session through Selenium and open the report page.
driver = webdriver.Chrome()
driver.get('https://cionow.accenture.com/ComplianceOps')

读取Excel文件,获得工作簿对象

# Open the Excel workbook and look up the two worksheets by name.
# NOTE(review): `filename` is not defined in the visible code; it must be
# assigned before this point.
wb = load_workbook(filename)
ws_apps = wb["Apps"]
ws_status = wb["Status"]

遍历Excel行数据

# list all applications and deal with each app
# Iteration starts at row 2, so the first row of the "Apps" sheet is skipped
# (presumably a header row; confirm against the workbook).
for row in ws_apps.iter_rows(min_row=2):
    # Cells are read by 0-based position within the row; index 0 is not used.
    app_id = row[1].value  # application id
    app_name = row[2].value  # application name
    du = row[3].value  # delivery unit
    md = row[4].value  # managing director
    lead = row[5].value  # delivery unit lead
    mgr = row[6].value  # manager

通过XPath获得元素集合

# Collect every direct child of the <div> elements whose class contains
# 'compliance-row-1' or 'compliance-row-2'.
# Uses find_elements(By.XPATH, ...) because the find_elements_by_xpath helper
# was removed in Selenium 4.3; this also matches the locator style of pickApp.
compliance_row = driver.find_elements(
    By.XPATH,
    ".//div[contains(@class, 'compliance-row-1') or contains(@class, 'compliance-row-2')]/*",
)

处理指定Class的元素

# get level 1 metric name
# get_attribute() returns None when the element has no class attribute, so
# fall back to an empty string before the substring test.
if "level-1-metric" in (metric.get_attribute("class") or ""):
    level_1_metric_name = metric.text
    # A new level 1 metric starts, so the level 2 name is cleared.
    level_2_metric_name = ""

通过CSS Selector获得元素集合

# Look up the <td> cells of the current metric element by CSS class.
# Each call returns a (possibly empty) list of matching elements.
# find_elements(By.CSS_SELECTOR, ...) replaces find_elements_by_css_selector,
# which was removed in Selenium 4.3.
total = metric.find_elements(By.CSS_SELECTOR, "td.total")  # total vul number
metric_name = metric.find_elements(By.CSS_SELECTOR, "td.metric-name")  # level 3 metric name
action_req = metric.find_elements(By.CSS_SELECTOR, "td.action-req")  # action required number
app_due_date = metric.find_elements(By.CSS_SELECTOR, "td.app-due-date")  # app due date number
past_due = metric.find_elements(By.CSS_SELECTOR, "td.past-due")  # past due number
excep = metric.find_elements(By.CSS_SELECTOR, "td.excep")  # exception number

结束WebDriver会话(关闭浏览器)

# End the WebDriver session started by webdriver.Chrome(); this closes the browser.
driver.quit()

保存工作簿

# Write the workbook to `filename`, the same name passed to load_workbook().
wb.save(filename)

全部评论 (0)

还没有任何评论哟~