Advertisement

北京二手房信息爬取

阅读量:

对北京二手房信息进行爬取，包括单线程和多线程两种实现方式。

一、单线程

复制代码
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd  # required by pd.DataFrame below; the original never imported it

    # Single-threaded scraper: walks listing pages 1-100 of the Lianjia Beijing
    # second-hand housing site, collects three text fields per page, and writes
    # everything to one CSV file.

    # The request header is loop-invariant, so it is built once.
    header = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    m1 = []  # text of every <div class="positionInfo"> (address information)
    m2 = []  # text of every <div class="houseInfo"> (house information)
    m3 = []  # text of every <div class="totalPrice"> (price information)

    for page in range(1, 101):
        urli = 'https://bj.lianjia.com/ershoufang/pg' + str(page)
        # A timeout keeps one stalled connection from hanging the whole run.
        ri = requests.get(urli, headers=header, timeout=10)
        # Stop on an HTTP error instead of silently parsing an error page.
        ri.raise_for_status()
        # Decode the raw bytes as UTF-8 directly. The original round trip
        # text.encode(ri.encoding).decode() had the same intent but raised
        # TypeError whenever the server sent no charset (ri.encoding is None).
        htmli = ri.content.decode('utf-8', errors='replace')
        soupi = BeautifulSoup(htmli, "lxml")

        # Scrape address information.
        m1 += [tag.text for tag in soupi.find_all('div', attrs={'class': 'positionInfo'})]
        # Scrape house information.
        m2 += [tag.text for tag in soupi.find_all('div', attrs={'class': 'houseInfo'})]
        # Scrape price information.
        m3 += [tag.text for tag in soupi.find_all('div', attrs={'class': 'totalPrice'})]

    # Assemble the table. pandas raises ValueError if the three lists differ in
    # length, i.e. if some page was missing one of the divs for a listing.
    dfs = pd.DataFrame({'houseInfo': m2, 'positionInfo': m1, 'totalPrice': m3})

    # Export the table.
    dfs.to_csv("bj二手房成交.csv")

二、多线程

复制代码
    # Multi-threaded scraping of housing price information.
    import threading
    from time import ctime, sleep
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd


    class myThread(threading.Thread):
        """Thread that scrapes 20 consecutive listing pages and prints the result.

        Parameters:
            name: thread name, stored on the thread's ``name`` attribute.
            counter: number of the first page to fetch; the thread fetches pages
                ``counter`` through ``counter + 19``.
        """

        def __init__(self, name, counter):
            threading.Thread.__init__(self)
            self.name = name
            self.counter = counter

        def run(self):
            """Fetch this thread's pages, build a DataFrame and print it.

            Raises requests.HTTPError (ending this thread) if a page returns an
            HTTP error status, and ValueError if the three collected columns
            end up with different lengths.
            """
            header = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
            m1 = []  # text of every <div class="positionInfo">
            m2 = []  # text of every <div class="houseInfo">
            m3 = []  # text of every <div class="totalPrice">
            for page in range(self.counter, self.counter + 20):
                urli = 'https://bj.lianjia.com/ershoufang/pg' + str(page)
                # A timeout keeps one stalled connection from blocking join() forever.
                ri = requests.get(urli, headers=header, timeout=10)
                ri.raise_for_status()
                # Decode the raw bytes as UTF-8 directly; the original
                # text.encode(ri.encoding).decode() round trip raised TypeError
                # when ri.encoding was None.
                htmli = ri.content.decode('utf-8', errors='replace')
                soupi = BeautifulSoup(htmli, "lxml")
                m1 += [tag.text for tag in soupi.find_all('div', attrs={'class': 'positionInfo'})]
                m2 += [tag.text for tag in soupi.find_all('div', attrs={'class': 'houseInfo'})]
                m3 += [tag.text for tag in soupi.find_all('div', attrs={'class': 'totalPrice'})]
            dfs = pd.DataFrame({'houseInfo': m2, 'positionInfo': m1, 'totalPrice': m3})
            print(dfs, ctime())


    # Four workers, 20 pages each: pages 1-80 in total.
    # NOTE(review): the single-threaded version of this scraper walks pages
    # 1-100; confirm whether a fifth worker starting at page 81 is intended.
    thread1 = myThread("Thread-1", 1)
    thread2 = myThread("Thread-2", 21)
    thread3 = myThread("Thread-3", 41)
    thread4 = myThread("Thread-4", 61)
    workers = (thread1, thread2, thread3, thread4)
    for worker in workers:
        worker.start()
    # Wait for every worker before announcing that the main thread is done.
    for worker in workers:
        worker.join()
    print("退出主线程")

全部评论 (0)

还没有任何评论哟~