Scraping Beijing Second-Hand Housing Data with Python

Data Scraping

The Python script below collects listing URLs from Lianjia's Beijing second-hand housing (ershoufang) pages, scrapes the details of each listing, and appends them to a CSV file:

# coding: utf-8

from requests import get
from bs4 import BeautifulSoup as bs
from multiprocessing import Pool
from time import time
from os import listdir
from csv import writer


def url_spider(url):
    """
    Fetch a page and return its parsed HTML, or None on failure.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    try:
        response = get(url, headers=headers, verify=True, timeout=10)
        if response.status_code == 200:
            soup = bs(response.text, features="html.parser")
            return soup
        else:
            print("\n\n*** (%s) returned an abnormal status code ***\n\n" % url)
            return None  # return None for any non-200 status code
    except Exception as e:
        print('\n\n*** requests.get(%s) failed! ***\nThe program will try again later.\n\n' % url)
        print(e)
        return None


def url_list(n1, n2):
    """
    Collect the URL of every listing on result pages n1..n2 and build the URL pool.
    """
    lis = []
    print('\n******  Collecting listing URLs...')
    for i in range(n1, n2 + 1):
        url = 'https://bj.lianjia.com/ershoufang/pg' + str(i) + '/'
        print('******  Collecting listing URLs from page {}'.format(i))
        res = url_spider(url)
        if res is not None:
            infos = res.find_all('div', class_="item")
            for info in infos:
                href = info.find("a", class_="title").get("href")
                lis.append(href)
        else:
            print('\n******  Failed to collect listing URLs')
        # time.sleep(5)

    # Write the URLs to a file, skipping any that were saved on a previous run
    if 'urls.txt' not in listdir():
        with open('urls.txt', 'a', encoding="utf-8") as f:
            for i in lis:
                f.write(i + "\n")
    else:
        with open('urls.txt', 'r', encoding="utf-8") as f:
            fis = f.readlines()
        with open('urls.txt', 'a', encoding="utf-8") as f:
            for i in lis:
                k = i + "\n"
                if k not in fis:
                    f.write(k)


def soup_parser(soup):
    """
    Extract the details of a single listing from its parsed page.
    """
    data = [soup.find('div', class_="communityName").find('a', class_="info").text,
            soup.find('div', class_="areaName").find('span', class_="info").text,
            soup.find('span', class_="total").text,
            soup.find('span', class_="unitPriceValue").text
            ]
    base = soup.find('div', class_="base").find_all('li')
    tran = soup.find('div', class_="transaction").find_all('li')
    for i in base:
        data.append(i.text)
    for i in tran:
        data.append(i.text)
    # time.sleep(3)
    return data


def data_proce(data, label):
    """
    Normalize the raw field strings and strip the leading field names.
    """
    # print(data)
    lis = []
    for i in data:
        a = i.replace('\xa0', '_').replace('\n', '').replace('  ', '').replace(' ', '_')
        st = a[:4]  # field names on the detail page are four characters long
        if st in label:
            a = a.replace(st, '')
        lis.append(a)
    return lis


def write_csv(data_lis):
    """
    Append one listing to the CSV file; the header row is written only when the file is first created.
    """
    label = ["小区名称", "所在区域", "总价", "单价", "房屋户型", "所在楼层",
             "建筑面积", "户型结构", "套内面积", "建筑类型", "房屋朝向", "建筑结构",
             "装修情况", "梯户比例", "供暖方式", "配备电梯", "挂牌时间", "交易权属",
             "上次交易", "房屋用途", "房屋年限", "房权所属", "抵押信息", "房本备件"]
    data_lis = data_proce(data_lis, label)
    if 'all_data.csv' not in listdir():
        with open('all_data.csv', 'a', newline='', encoding='utf-8') as f:
            w = writer(f, delimiter=',')  # do not shadow the imported csv.writer
            w.writerow(label)
            w.writerow(data_lis)
    else:
        with open('all_data.csv', 'a', newline='', encoding='utf-8') as f:
            w = writer(f, delimiter=',')
            w.writerow(data_lis)


def main(url):
    """
    Fetch, parse and save a single listing.
    """
    url = url.replace('\n', '').replace('\t', '')
    res = url_spider(url)
    if res is not None:
        print("******  Fetching the listing at {}".format(url))
        data_csv = soup_parser(res)
        write_csv(data_csv)
    else:
        print("******  Failed to fetch the listing at {}".format(url))


def async_main(urls):
    """
    Process the listing URLs with a pool of four worker processes.
    """
    p = Pool(4)
    for url in urls:
        p.apply_async(main, (url,))
    p.close()
    p.join()


if __name__ == "__main__":
    start = time()
    n = input('Enter the page range to crawl (e.g. 1-10): ').split("-")
    (n1, n2) = (int(n[0]), int(n[1]))
    url_list(n1, n2)  # collect the listing URLs
    with open('urls.txt', 'r', encoding="utf-8") as f:
        urls = f.readlines()
    print('\n******  {} listings collected in total'.format(len(urls)))
    # main(urls[0])
    async_main(urls)

    end = time()
    print("Elapsed time: {}".format(end - start))
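To use the script, run it and enter a page range such as 1-10 when prompted; it writes the collected listing URLs to urls.txt and the parsed fields to all_data.csv. A quick way to sanity-check the output is to load the CSV afterwards, for example with pandas (a minimal sketch, assuming pandas is installed and all_data.csv was produced by the script above):

# Minimal sketch: load and inspect the scraped CSV (assumes pandas is installed)
import pandas as pd

df = pd.read_csv('all_data.csv', encoding='utf-8')
print(df.shape)                                          # rows = listings, columns = fields
print(df[['小区名称', '所在区域', '总价', '单价']].head())   # community, district, total price, unit price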