# Scrape Taobao product listings with Selenium.
# Automated, browser-driven data collection; there is no separate
# page-analysis step — elements are located live via XPath.
import csv
import time

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import conn
def login(name):
    """Log in to Taobao and work around the slider captcha.

    Opens the Taobao home page, searches for ``name`` (searching while
    anonymous redirects to the login form), fills in the credentials from
    the ``conn`` module, drags the slider-verification widget, and submits.

    Args:
        name: Product keyword typed into the search box.

    Note:
        Relies on the module-level ``driver`` created in ``__main__``.
    """
    driver.get('https://www.taobao.com/')
    driver.maximize_window()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, '//*[@id="q"]').send_keys(name)
    driver.find_element(By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button').click()
    driver.find_element(By.XPATH, '//*[@id="fm-login-id"]').send_keys(conn.TaoUser)
    time.sleep(1)
    driver.find_element(By.XPATH, '//*[@id="fm-login-password"]').send_keys(conn.TaoPwd)
    time.sleep(1)
    slider = driver.find_element(By.XPATH, '//*[@id="nc_1_n1z"]')
    time.sleep(1)
    # BUGFIX: the original queued release() AFTER perform(), so the mouse
    # button was never released and the slider could snap back. Build the
    # whole drag (press -> move -> pause -> release) and perform it once.
    action = ActionChains(driver)
    action.click_and_hold(slider) \
        .move_by_offset(xoffset=258, yoffset=0) \
        .pause(0.8) \
        .release() \
        .perform()
    driver.find_element(By.XPATH, '//*[@id="login-form"]/div[4]/button').click()
    driver.implicitly_wait(10)
def get_info():
    """Scrape one results page and append the products to a CSV file.

    For every product card on the current page, extract title, shop name,
    shipping location, price, payment count and product URL, and append one
    row per product to ``seleniumDemo\\shopinfo.csv``.

    Note:
        Relies on the module-level ``driver``. Uses ``find_elements``
        (plural) so an empty page simply yields zero rows.
    """
    divs = driver.find_elements(By.XPATH, '//div[@class="items"]/div[@class="item J_MouserOnverReq "]')
    # BUGFIX: the original reopened the CSV file once per product; open it a
    # single time per page instead. encoding='utf-8' keeps Chinese text
    # intact, newline='' is the csv-module requirement on Windows.
    with open(r'seleniumDemo\shopinfo.csv', mode='a', newline='', encoding='utf-8') as f:
        csvwrite = csv.writer(f, delimiter=',')
        for div in divs:
            title_link = div.find_element(By.XPATH, './/div[@class="row row-2 title"]/a')
            shop_name = title_link.text
            store = div.find_element(By.XPATH, './/div[@class="shop"]/a').text
            ship_area = div.find_element(By.XPATH, './/div[@class="location"]').text
            price = div.find_element(By.XPATH, './/div[@class="price g_price g_price-highlight"]/strong').text
            pay_num = div.find_element(By.XPATH, './/div[@class="deal-cnt"]').text
            shop_url = title_link.get_attribute('href')
            csvwrite.writerow([shop_name, store, ship_area, price, pay_num, shop_url])
    print('文件保存完成')
def turn_page():
    """Crawl every results page, scraping each one via ``get_info``.

    Reads the total page count from the pager widget, then loops: scrape the
    current page, click the "next page" link, wait, repeat.

    Note:
        Relies on the module-level ``driver``. Any error (stale element,
        missing pager, navigation failure) ends the loop early — it is
        printed, not re-raised, so partial data already written is kept.
    """
    # Pager text looks like "共 N 页"; the page count is the second token.
    all_page = driver.find_element(By.XPATH, '//div[@class="total"]').text.split(' ')[1]
    page = 1
    try:
        while page <= int(all_page):
            print(f'=========正在爬取第{page}页信息=========')
            driver.implicitly_wait(10)
            get_info()
            page += 1
            # Click through to the next page.
            driver.find_element(By.XPATH, '//a[@class="J_Ajax num icon-tag"]').click()
            time.sleep(5)  # crude rate limit; lets the next page render
    except Exception as e:
        # Best-effort crawl: report the failure and stop turning pages.
        print(e)
# Removed: a dead triple-quoted string that held an older get_info()
# variant (it wrote a CSV header line via f.write before the rows).
# Commented-out code is kept out of the file; recover it from history
# if the header-row behavior is ever wanted again.
if __name__ == '__main__':
    # Script entry point: ask for a keyword, log in, then crawl every page.
    choice = input('输入你想爬取的商品:')
    # Selenium 4 deprecated passing the chromedriver path positionally;
    # wrap it in a Service object instead.
    driver = webdriver.Chrome(service=Service(r'C:\Users\IT\Desktop\chromedriver.exe'))
    login(choice)
    turn_page()
    print('所有商品爬取完成。。。。')