Scraping Beijing Second-Hand Housing Data with Python

Data Scraping

The Python script below collects listing URLs from Lianjia's Beijing second-hand housing (ershoufang) pages, scrapes the details of each listing, and appends them to a CSV file:

# coding: utf-8

from requests import get
from bs4 import BeautifulSoup as bs
from multiprocessing import Pool
from time import time
from os import listdir
from csv import writer


def url_spider(url):
    """
    Fetch a page and return its parsed HTML, or None on failure.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    try:
        response = get(url, headers=headers, verify=True, timeout=10)
        if response.status_code == 200:
            soup = bs(response.text, features="html.parser")
            return soup
        else:
            print("\n\n*** (%s) returned an abnormal status code ***\n\n" % url)
            return None  # return None for any non-200 status code
    except Exception as e:
        print('\n\n*** requests.get(%s) failed! ***\nThe program will try again later.\n\n' % url)
        print(e)
        return None


def url_list(n1, n2):
    """
    Collect the URL of every listing on result pages n1..n2 and build the URL pool.
    """
    lis = []
    print('\n******  Collecting listing URLs...')
    for i in range(n1, n2 + 1):
        url = 'https://bj.lianjia.com/ershoufang/pg' + str(i) + '/'
        print('******  Collecting listing URLs from page {}'.format(i))
        res = url_spider(url)
        if res is not None:
            infos = res.find_all('div', class_="item")
            for info in infos:
                href = info.find("a", class_="title").get("href")
                lis.append(href)
        else:
            print('\n******  Failed to collect listing URLs')
        # time.sleep(5)

    # Write the URLs to a file, skipping any that were saved on a previous run
    if 'urls.txt' not in listdir():
        with open('urls.txt', 'a', encoding="utf-8") as f:
            for i in lis:
                f.write(i + "\n")
    else:
        with open('urls.txt', 'r', encoding="utf-8") as f:
            fis = f.readlines()
        with open('urls.txt', 'a', encoding="utf-8") as f:
            for i in lis:
                k = i + "\n"
                if k not in fis:
                    f.write(k)


def soup_parser(soup):
    """
    Extract the details of a single listing from its parsed page.
    """
    data = [soup.find('div', class_="communityName").find('a', class_="info").text,
            soup.find('div', class_="areaName").find('span', class_="info").text,
            soup.find('span', class_="total").text,
            soup.find('span', class_="unitPriceValue").text
            ]
    base = soup.find('div', class_="base").find_all('li')
    tran = soup.find('div', class_="transaction").find_all('li')
    for i in base:
        data.append(i.text)
    for i in tran:
        data.append(i.text)
    # time.sleep(3)
    return data


def data_proce(data, label):
    """
    Normalize the raw field strings and strip the leading field names.
    """
    # print(data)
    lis = []
    for i in data:
        a = i.replace('\xa0', '_').replace('\n', '').replace('  ', '').replace(' ', '_')
        st = a[:4]  # field names on the detail page are four characters long
        if st in label:
            a = a.replace(st, '')
        lis.append(a)
    return lis


def write_csv(data_lis):
    """
    Append one listing to the CSV file; the header row is written only when the file is first created.
    """
    label = ["小区名称", "所在区域", "总价", "单价", "房屋户型", "所在楼层",
             "建筑面积", "户型结构", "套内面积", "建筑类型", "房屋朝向", "建筑结构",
             "装修情况", "梯户比例", "供暖方式", "配备电梯", "挂牌时间", "交易权属",
             "上次交易", "房屋用途", "房屋年限", "房权所属", "抵押信息", "房本备件"]
    data_lis = data_proce(data_lis, label)
    if 'all_data.csv' not in listdir():
        with open('all_data.csv', 'a', newline='', encoding='utf-8') as f:
            w = writer(f, delimiter=',')  # do not shadow the imported csv.writer
            w.writerow(label)
            w.writerow(data_lis)
    else:
        with open('all_data.csv', 'a', newline='', encoding='utf-8') as f:
            w = writer(f, delimiter=',')
            w.writerow(data_lis)


def main(url):
    """
    Fetch, parse and save a single listing.
    """
    url = url.replace('\n', '').replace('\t', '')
    res = url_spider(url)
    if res is not None:
        print("******  Fetching the listing at {}".format(url))
        data_csv = soup_parser(res)
        write_csv(data_csv)
    else:
        print("******  Failed to fetch the listing at {}".format(url))


def async_main(urls):
    """
    Process the listing URLs with a pool of four worker processes.
    """
    p = Pool(4)
    for url in urls:
        p.apply_async(main, (url,))
    p.close()
    p.join()


if __name__ == "__main__":
    start = time()
    n = input('Enter the page range to crawl (e.g. 1-10): ').split("-")
    (n1, n2) = (int(n[0]), int(n[1]))
    url_list(n1, n2)  # collect the listing URLs
    with open('urls.txt', 'r', encoding="utf-8") as f:
        urls = f.readlines()
    print('\n******  {} listings collected in total'.format(len(urls)))
    # main(urls[0])
    async_main(urls)

    end = time()
    print("Elapsed time: {}".format(end - start))
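To use the script, run it and enter a page range such as 1-10 when prompted; it writes the collected listing URLs to urls.txt and the parsed fields to all_data.csv. A quick way to sanity-check the output is to load the CSV afterwards, for example with pandas (a minimal sketch, assuming pandas is installed and all_data.csv was produced by the script above):

# Minimal sketch: load and inspect the scraped CSV (assumes pandas is installed)
import pandas as pd

df = pd.read_csv('all_data.csv', encoding='utf-8')
print(df.shape)                                          # rows = listings, columns = fields
print(df[['小区名称', '所在区域', '总价', '单价']].head())   # community, district, total price, unit price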