Advertisement

链家二手房 100 页 XPath 爬取并保存为 CSV

阅读量:
复制代码
    import random
    import time
    import csv
    import requests
    from fake_useragent import UserAgent
    from lxml import etree
    
    
    class House:
        """Scrape Lianjia (Beijing) second-hand housing listings into a CSV file.

        The crawler walks the paginated listing URL, extracts one row per
        listing with XPath, and appends it to the CSV opened in ``__init__``.
        Instances can be used as context managers so the output file is always
        closed::

            with House() as house:
                house.serve_forever()
        """

        def __init__(self, path='链家100.csv'):
            """Open the output CSV and prepare the page URL template.

            Args:
                path: Destination CSV path. Defaults to the original
                    hard-coded file name.
            """
            self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'
            # newline='' is what the csv module requires of its file object;
            # without it every row is followed by a blank line on Windows.
            # An explicit encoding keeps Chinese text from depending on the
            # platform's default code page.
            self.f = open(path, 'w', newline='', encoding='utf-8')
            self.write = csv.writer(self.f)

        def close(self):
            """Flush and close the output CSV file."""
            self.f.close()

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.close()

        @staticmethod
        def _first(nodes):
            """Return the first XPath result, or None when nothing matched."""
            return nodes[0] if nodes else None

        @staticmethod
        def _split_house_info(info):
            """Split the '|'-separated house description into its 7 fields.

            Returns a 7-tuple (model, size, face, decorate, floor, year, type);
            all fields are None unless the text has exactly seven parts.
            """
            # Example: 2室2厅 | 80.72平米 | 南 北 | 精装 | 高楼层(共6层) | 2004年建 | 板楼
            if info:
                parts = info.split("|")
                if len(parts) == 7:
                    return tuple(part.strip() for part in parts)
            return (None,) * 7

        @staticmethod
        def _split_follow_info(text):
            """Split 'followers / posted' text into a (follow, posted) pair.

            The second element is None when the text contains no '/', instead
            of raising IndexError.
            """
            if text is None:
                return None, None
            parts = text.split('/')
            follow = parts[0].strip()
            posted = parts[1].strip() if len(parts) > 1 else None
            return follow, posted

        def get_html(self, url):
            """Download one listing page and hand its HTML to ``parser_html``.

            Args:
                url: Fully formatted page URL.

            Raises:
                requests.RequestException: On connection errors or timeout.
            """
            # The HTTP header is spelled 'User-Agent'; the previous key
            # 'UserAgent' was sent as an unrelated custom header.
            headers = {'User-Agent': UserAgent().random}
            # A timeout keeps one stalled connection from hanging the crawl.
            html = requests.get(url=url, headers=headers, timeout=10).text
            self.parser_html(html)

        def parser_html(self, html):
            """Extract every listing on the page and write one CSV row each.

            Args:
                html: Raw HTML text of a listing page.
            """
            eobj = etree.HTML(html)
            for li in eobj.xpath("//li[@class='clear LOGVIEWDATA LOGCLICKDATA']"):
                title = self._first(li.xpath(".//div[@class='title']/a/text()"))
                name = self._first(
                    li.xpath(".//div[@class='positionInfo']/a[1]/text()"))
                area = self._first(
                    li.xpath(".//div[@class='positionInfo']/a[2]/text()"))

                info = self._first(li.xpath(".//div[@class='houseInfo']/text()"))
                (model, size, face, decorate,
                 floor, year, building_type) = self._split_house_info(info)

                follow_text = self._first(
                    li.xpath(".//div[@class='followInfo']/text()"))
                follow, posted = self._split_follow_info(follow_text)

                price_text = self._first(
                    li.xpath(".//div[@class='totalPrice']/span/text()"))
                price = price_text + '万' if price_text is not None else None

                unit_text = self._first(
                    li.xpath(".//div[@class='unitPrice']/span/text()"))
                # Drops the first 2 and last 4 characters of the unit-price
                # text; presumably a label prefix and a unit suffix - verify
                # against the live page markup.
                unit = unit_text[2:-4] if unit_text is not None else None

                self.write.writerow(
                    [title, name, area, model, size, face, decorate, floor,
                     year, building_type, follow, posted, price, unit])

        def serve_forever(self, pages=100):
            """Write the CSV header, then crawl listing pages 1..pages.

            Args:
                pages: Number of listing pages to fetch (default 100).
            """
            self.write.writerow(
                ['title', 'name', 'area', 'model', 'size', 'face', 'decorate',
                 'floor', 'year', 'type', 'follow', 'time', 'price', 'unit'])
            for page in range(1, pages + 1):
                page_url = self.url.format(page)
                print(page_url)
                self.get_html(page_url)
                # Random sub-second pause between requests.
                time.sleep(random.uniform(0, 1))
    
    
    if __name__ == '__main__':
        # Script entry point: crawl all pages into the CSV.
        house = House()
        try:
            house.serve_forever()
        finally:
            # Close the CSV even when a request or parse step fails, so the
            # rows written so far are flushed to disk.
            house.f.close()
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    

全部评论 (0)

还没有任何评论哟~