
Scraping Taobao product information with Python


Taobao is a well-known e-commerce platform. After learning Selenium, I tried using it to collect product information from the site. Because the data is gathered through browser automation, the script has no separate page-analysis step.
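
The script imports a local conn module that stores the Taobao account credentials as conn.TaoUser and conn.TaoPwd; that file is not shown in the original post. A minimal sketch of what it is assumed to contain (the attribute names come from the script, the values are placeholders):

    # conn.py -- assumed local credentials module (not included in the original post)
    # Keep this file out of version control; the attribute names match what the script imports.
    TaoUser = 'your_taobao_username'
    TaoPwd = 'your_taobao_password'

The full script: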

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver import ActionChains
    import conn  # local module holding the Taobao login credentials (conn.TaoUser, conn.TaoPwd)
    import time
    import csv
    
    def login(name):
        '''Open Taobao, search for the product, then handle the login form and the slider captcha.'''
        driver.get('https://www.taobao.com/')
        driver.maximize_window()
        driver.implicitly_wait(10)
        # Searching while logged out redirects to the login page
        driver.find_element_by_xpath('//*[@id="q"]').send_keys(name)
        driver.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
        # Fill in the account and password stored in the local conn module
        driver.find_element_by_xpath('//*[@id="fm-login-id"]').send_keys(conn.TaoUser)
        time.sleep(1)
        driver.find_element_by_xpath('//*[@id="fm-login-password"]').send_keys(conn.TaoPwd)
        time.sleep(1)
        # Slider captcha: press and hold the slider, drag it 258px to the right, pause, then release
        action = ActionChains(driver)
        yzm = driver.find_element_by_xpath('//*[@id="nc_1_n1z"]')
        time.sleep(1)
        action.click_and_hold(yzm).move_by_offset(xoffset=258, yoffset=0)
        action.pause(0.8).release().perform()  # queue the release before perform() so the whole chain runs
        driver.find_element_by_xpath('//*[@id="login-form"]/div[4]/button').click()
        driver.implicitly_wait(10)
    
    
    def get_info():
        '''Scrape one page of product listings and append the rows to a CSV file.'''
        # Note: use the plural find_elements method to get an iterable of result cards
        divs = driver.find_elements_by_xpath('//div[@class="items"]/div[@class="item J_MouserOnverReq  "]')
        for div in divs:
            shop_name = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').text
            store = div.find_element_by_xpath('.//div[@class="shop"]/a').text
            ship_area = div.find_element_by_xpath('.//div[@class="location"]').text
            price = div.find_element_by_xpath('.//div[@class="price g_price g_price-highlight"]/strong').text
            pay_num = div.find_element_by_xpath('.//div[@class="deal-cnt"]').text
            shop_url = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').get_attribute('href')
            #print(shop_name, store, ship_area, price, pay_num, sep="|")
            with open(r'seleniumDemo\shopinfo.csv', mode='a', newline="") as f:
                csvwrite = csv.writer(f, delimiter=',')
                csvwrite.writerow([shop_name, store, ship_area, price, pay_num, shop_url])
        print('Page saved to shopinfo.csv')
    
    def turn_page():
        '''Page through the search results, scraping every page.'''
        # The total-page element's text is split on spaces; the second token is the page count
        all_page = driver.find_element_by_xpath('//div[@class="total"]').text.split(' ')[1]
        page = 1
        try:
            while page <= int(all_page):
                print(f'========= Scraping page {page} =========')
                driver.implicitly_wait(10)
                get_info()
                page += 1
                # Click the "next page" button
                driver.find_element_by_xpath('//a[@class="J_Ajax num icon-tag"]').click()
                time.sleep(5)

        except Exception as e:
            print(e)
    
    '''
    def get_info():
        """Variant that writes the CSV with a header row; to use it, move the file opening
        and the header write into the main entry point."""
        # Note: use the plural find_elements method to get an iterable of result cards
        divs = driver.find_elements_by_xpath('//div[@class="items"]/div[@class="item J_MouserOnverReq  "]')
        with open(r'seleniumDemo\shopinfo.csv', mode='w') as f:
            f.write("product_name,price,pay_num,store,ship_area,url\n")
            for div in divs:
                shop_name = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').text
                store = div.find_element_by_xpath('.//div[@class="shop"]/a').text
                ship_area = div.find_element_by_xpath('.//div[@class="location"]').text
                price = div.find_element_by_xpath('.//div[@class="price g_price g_price-highlight"]/strong').text
                pay_num = div.find_element_by_xpath('.//div[@class="deal-cnt"]').text
                shop_url = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').get_attribute('href')
                #print(shop_name, store, ship_area, price, pay_num, sep="|")
                f.write(f"{shop_name},{price},{pay_num},{store},{ship_area},{shop_url}\n")
        print('File saved')
    '''
    
    if __name__ == '__main__':
        '''Program entry point; scraping with Selenium feels rather slow.'''
        choice = input('Enter the product you want to scrape: ')
        driver = webdriver.Chrome(r'C:\Users\IT\Desktop\chromedriver.exe')
        login(choice)
        turn_page()
        print('All products scraped.')
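
One caveat: the script relies on the old find_element_by_xpath / find_elements_by_xpath helpers from Selenium 3, which have been removed in recent Selenium 4 releases. If you run it with a newer Selenium, the same locators are assumed to go through find_element / find_elements with By.XPATH instead, roughly like the illustrative fragments below (reusing names from the script above):

    from selenium.webdriver.common.by import By

    # Selenium 4 style equivalents of the calls used above; the XPath strings are unchanged
    driver.find_element(By.XPATH, '//*[@id="q"]').send_keys(choice)
    divs = driver.find_elements(By.XPATH, '//div[@class="items"]/div[@class="item J_MouserOnverReq  "]')

Newer Selenium also expects the chromedriver path to be wrapped in a Service object rather than passed positionally to webdriver.Chrome.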
