Advertisement

爬虫爬取链家二手房信息,对二手房做分析

阅读量:
复制代码
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from bs4 import BeautifulSoup
    import requests
    
    
复制代码
    def generate_web_link(districts):
    '''此函数生成武汉地区所有区域二手房网页链接地址'''
    page_urls = [] 
    base_url = 'https://wh.lianjia.com/ershoufang/{}'
    for district in districts:
        district_url = base_url.format(district)
        res = requests.get(district_url).content.decode('utf-8')
        soup = BeautifulSoup(res,'lxml')
        totalpage = int(eval(soup.find('div',{'class':'page-box house-lst-page-box'})['page-data'])['totalPage'])#找出每个区域总共有多少页
        #eval函数将字符转化为表达式,find找出的内容是字符串形式的字典'{"totalPage":100,"curPage":1}'
        for page in range(1,totalpage+1):
            page_url = district_url + '/pg{}'.format(page)
            page_urls.append((district,page_url))
    return page_urls
    
    AI写代码
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-05-31/CrtiUc9PVglRJ2vM4yKeSokuqYA3.png)
复制代码
    def house_info_spider(page_links):
    district_dicts = {'jiangan':'江岸','jianghan':'江汉','qiaokou':'硚口',
                    'dongxihu':'东西湖','wuchang':'武昌','qingshan':'青山',
                    'hongshan':'洪山','hanyang': '汉阳','donghugaoxin':'东湖高新',
                    'jiangxia':'江夏'}
    infos = pd.DataFrame()
    for page_link in page_links:
        res = requests.get(page_link[1]).content.decode('utf-8')
        soup = BeautifulSoup(res,'lxml')
        house_infos = [i.text for i in soup.find_all('div',{'class':'houseInfo'})]
        floors = [i.text for i in soup.find_all('div',{'class':'positionInfo'})]
        total_prices = [i.text for i in soup.find_all('div',{'class':'totalPrice'})]
        unit_prices = [i.text for i in soup.find_all('div',{'class':'unitPrice'})]
        house_districts = [district_dicts[page_link[0]]]*len(house_infos)
        for house_info,floor,total_price,unit_price,district in zip(house_infos,floors,total_prices,unit_prices,house_districts):
            infos = infos.append([[house_info,floor,total_price,unit_price,district]])
    infos.columns = ['信息','楼层','售价','单价','地区']
    return infos
    
    AI写代码
![](https://ad.itadn.com/c/weblog/blog-img/images/2025-05-31/HU0PIvtqk1WOK2hdCGw9D7jQr8Yi.png)
复制代码
    #开始采集数据
    if __name__ == '__main__':
    districts = ['jiangan','jianghan','qiaokou','dongxihu','wuchang','qingshan',
                 'hongshan','hanyang','donghugaoxin','jiangxia']
    page_links = generate_web_link(districts)
    house_datas = house_info_spider(page_links)
    
    
复制代码
    house_datas = house_datas.reset_index(drop=True)
    house_datas.to_csv('lianjia_house.csv',index=False)#数据保存到电脑
    

全部评论 (0)

还没有任何评论哟~