链家二手房100页Xpath爬取保存csv
发布时间
阅读量:
阅读量
import random
import time
import csv
import requests
from fake_useragent import UserAgent
from lxml import etree
class House:
    """Scrape Lianjia Beijing second-hand listings into a CSV file.

    ``serve_forever`` walks listing pages 1-100, ``get_html`` downloads one
    page, and ``parser_html`` extracts one CSV row per listing via XPath.
    The instance can also be used as a context manager so the output file
    is closed deterministically.
    """

    # Column names, in the same order as the data rows built in parser_html().
    FIELDS = ['title', 'name', 'area', 'model', 'size', 'face', 'decorate',
              'floor', 'year', 'type', 'follow', 'time', 'price', 'unit']

    def __init__(self):
        """Open the output CSV and prepare the page URL template."""
        # The ``{}`` placeholder is filled with the page number.
        self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'
        # newline='' is required by the csv module to avoid blank rows on
        # Windows; explicit UTF-8 keeps the Chinese text independent of the
        # platform's locale encoding.
        self.f = open('链家100.csv', 'w', newline='', encoding='utf-8')
        self.write = csv.writer(self.f)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the output file; calling it more than once is harmless."""
        self.f.close()

    @staticmethod
    def _first(values):
        """Return the first item of an XPath result list, or None if empty."""
        return values[0] if values else None

    @staticmethod
    def _split_house_info(info):
        """Split the ``houseInfo`` text into its seven ``|``-separated fields.

        Returns ``(model, size, face, decorate, floor, year, type)``. When
        *info* is empty or does not have exactly seven fields, every element
        is None, because the columns cannot be assigned reliably.
        """
        # e.g. 2室2厅 | 80.72平米 | 南 北 | 精装 | 高楼层(共6层) | 2004年建 | 板楼
        if not info:
            return (None,) * 7
        parts = [part.strip() for part in info.split('|')]
        if len(parts) != 7:
            return (None,) * 7
        return tuple(parts)

    @staticmethod
    def _split_follow_info(text):
        """Split the ``followInfo`` text on ``/`` into ``(follow, time)``.

        Returns ``(None, None)`` when *text* is None. When the text has no
        ``/`` separator the second element is None instead of raising
        ``IndexError``.
        """
        if text is None:
            return None, None
        parts = text.split('/')
        follow = parts[0].strip()
        published = parts[1].strip() if len(parts) > 1 else None
        return follow, published

    def get_html(self, url, timeout=10):
        """Download *url* and pass the response text to ``parser_html``.

        Args:
            url: Listing page URL to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl indefinitely.

        Raises:
            requests.RequestException: Propagated from ``requests.get`` on
                network errors or timeout.
        """
        # The HTTP header name is "User-Agent"; a key spelled "UserAgent"
        # is sent as an unrelated header and the random UA is not applied.
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(url=url, headers=headers, timeout=timeout).text
        self.parser_html(html)

    def parser_html(self, html):
        """Extract every listing in *html* and write one CSV row for each."""
        first = self._first
        root = etree.HTML(html)
        for li in root.xpath("//li[@class='clear LOGVIEWDATA LOGCLICKDATA']"):
            title = first(li.xpath(".//div[@class='title']/a/text()"))
            name = first(li.xpath(".//div[@class='positionInfo']/a[1]/text()"))
            area = first(li.xpath(".//div[@class='positionInfo']/a[2]/text()"))

            info = first(li.xpath(".//div[@class='houseInfo']/text()"))
            (model, size, face, decorate,
             floor, year, building_type) = self._split_house_info(info)

            follow_text = first(li.xpath(".//div[@class='followInfo']/text()"))
            follow, published = self._split_follow_info(follow_text)

            price_text = first(li.xpath(".//div[@class='totalPrice']/span/text()"))
            price = price_text + '万' if price_text is not None else None

            unit_text = first(li.xpath(".//div[@class='unitPrice']/span/text()"))
            # Drops a 2-character prefix and a 4-character suffix around the
            # number; presumably the text looks like "单价63522元/平米" --
            # TODO confirm against the live page.
            unit = unit_text[2:-4] if unit_text is not None else None

            self.write.writerow([
                title, name, area, model, size, face, decorate, floor, year,
                building_type, follow, published, price, unit,
            ])

    def serve_forever(self):
        """Write the header row, then crawl listing pages 1 through 100."""
        self.write.writerow(self.FIELDS)
        for page in range(1, 101):
            page_url = self.url.format(page)
            print(page_url)
            self.get_html(page_url)
            # Push finished pages to disk so an aborted crawl keeps its rows.
            self.f.flush()
            # Random sub-second pause between page requests.
            time.sleep(random.uniform(0, 1))
if __name__ == '__main__':
    # Script entry point: run the full crawl and write the CSV.
    house = House()
    try:
        house.serve_forever()
    finally:
        # Close the CSV handle explicitly so buffered rows reach disk even
        # when the crawl aborts with an exception.
        house.f.close()
全部评论 (0)
还没有任何评论哟~
