Python抓取网页内容到Excel:使用Selenium抓取网页内容并输出到Excel
需要安装的Python库包括:
pip install selenium
pip install openpyxl
Selenium需要安装Chrome WebDriver(ChromeDriver),可以从网络下载后拷贝到Chrome所在路径。
引用库
# import time library
import time

# import excel library
from openpyxl import load_workbook

# import selenium libraries
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
等待页面加载完(指定内容元素出现)
# wait for report initial loading
def waitLoad(driver, timeout=200):
    """Block until the report page has finished its initial load.

    The page is considered loaded once an element whose text is exactly
    'Past Due' is present in the DOM.

    Args:
        driver: Selenium WebDriver instance showing the report page.
        timeout: Maximum number of seconds to wait. Defaults to 200, the
            value that was previously hard-coded.

    Returns:
        The located 'Past Due' element.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element does
            not appear within ``timeout`` seconds.
    """
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, ".//*[text() = 'Past Due']"))
    )
选择下拉菜单项目并点击按钮
# check/select application in application filter
def pickApp(driver, app):
    """Select one application in the filter and apply the filter.

    Clicks the first ``<input>`` whose ``value`` attribute equals *app*,
    then clicks the element with id ``cp-filter-btn``. Both clicks are
    dispatched through JavaScript rather than WebElement.click().

    Args:
        driver: Selenium WebDriver instance showing the report page.
        app: Application identifier to match against the input's
            ``value`` attribute. Converted with ``str()`` so numeric
            values read from Excel are accepted.

    Returns:
        True if both clicks were issued; False if no matching input
        exists, the filter button cannot be found, or Selenium reports
        any other error.
    """
    try:
        candidates = driver.find_elements(
            By.XPATH, ".//input[@value='" + str(app) + "']"
        )
        if not candidates:
            # No input for this application: report failure explicitly
            # instead of relying on an IndexError being swallowed.
            return False
        driver.execute_script("arguments[0].click();", candidates[0])
        btn_filter = driver.find_element(By.ID, "cp-filter-btn")
        driver.execute_script("arguments[0].click();", btn_filter)
    except WebDriverException:
        # Best-effort contract: Selenium failures mean "not selected".
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        return False
    return True
初始化Web Driver并开始浏览网页
# Initialize the Selenium web driver and navigate to the report page.
# webdriver.Chrome() is called with no arguments, so the default Chrome
# driver configuration is used.
driver =webdriver.Chrome()
# Load the page that the rest of the script scrapes.
driver.get('https://cionow.accenture.com/ComplianceOps')
读取Excel文件,获得工作簿对象
# Initialize the Excel workbook and the two worksheets the script uses.
# NOTE(review): `filename` is not defined in the visible code; it must be
# set to the workbook path before this line runs.
wb =load_workbook(filename)
# "Apps" sheet: iterated row by row to obtain the applications to process.
ws_apps= wb["Apps"]
# "Status" sheet: presumably where scraped results are written -- its use is
# not shown in the visible code, confirm.
ws_status= wb["Status"]
遍历Excel行数据
# List all applications and deal with each app.
# min_row=2 starts at the second worksheet row (row 1 is presumably a
# header row -- confirm against the workbook).
for row in ws_apps.iter_rows(min_row=2):
    # `row` is a tuple of cells; indexes are zero-based, so row[1] is the
    # second column of the sheet.
    app_id = row[1].value    # application id
    app_name = row[2].value  # application name
    du = row[3].value        # delivery unit
    md = row[4].value        # managing director
    lead = row[5].value      # delivery unit lead
    mgr = row[6].value       # manager
通过XPath获得元素集合
# Collect every direct child element of the divs whose class contains
# 'compliance-row-1' or 'compliance-row-2'.
# Uses find_elements(By.XPATH, ...) -- the same locator API as pickApp --
# because the find_elements_by_* helpers were removed in Selenium 4.3.
compliance_row = driver.find_elements(
    By.XPATH,
    ".//div[contains(@class, 'compliance-row-1') or contains(@class, 'compliance-row-2')]/*",
)
处理指定Class的元素
# get level 1 metric name
# get_attribute() returns None when the element has no class attribute, so
# fall back to an empty string before the substring test.
if "level-1-metric" in (metric.get_attribute("class") or ""):
    level_1_metric_name = metric.text
    # A new level-1 metric starts: clear the level-2 name carried over
    # from the previous group.
    level_2_metric_name = ""
通过CSS Selector获得元素集合
# Look up the cells of the current metric element by CSS class.
# find_elements(By.CSS_SELECTOR, ...) replaces find_elements_by_css_selector,
# which was removed in Selenium 4.3; each call returns a (possibly empty) list.
total = metric.find_elements(By.CSS_SELECTOR, "td.total")  # total vul number
metric_name = metric.find_elements(By.CSS_SELECTOR, "td.metric-name")  # level 3 metric name
action_req = metric.find_elements(By.CSS_SELECTOR, "td.action-req")  # action required number
app_due_date = metric.find_elements(By.CSS_SELECTOR, "td.app-due-date")  # app due date number
past_due = metric.find_elements(By.CSS_SELECTOR, "td.past-due")  # past due number
excep = metric.find_elements(By.CSS_SELECTOR, "td.excep")  # exception number
结束WebDriver会话(关闭浏览器)
# Shut down the WebDriver session once scraping is finished.
driver.quit()
保存工作簿
# Write the workbook back to `filename`, the same name passed to
# load_workbook, so the source file is overwritten.
wb.save(filename)
