北京二手房信息爬取
发布时间
阅读量:
阅读量
本文对链家网的北京二手房信息进行爬取,分别给出单线程和多线程两种实现。
一、单线程
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Single-threaded crawl: fetch listing pages 1-100 of Lianjia's Beijing
# second-hand housing index, collect three text fields per listing and save
# them to a CSV file.
m1 = []  # text of every <div class="positionInfo"> (location)
m2 = []  # text of every <div class="houseInfo"> (house description)
m3 = []  # text of every <div class="totalPrice"> (price)

# The request header is identical for every page, so build it once.
header = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

for page in range(1, 101):
    urli = 'https://bj.lianjia.com/ershoufang/pg' + str(page)
    # Without a timeout a single stalled connection blocks the crawl forever.
    ri = requests.get(urli, headers=header, timeout=10)
    # Decode the raw bytes as UTF-8 directly. The previous
    # text -> encode(ri.encoding) -> decode() round trip produced the same
    # string but raised TypeError whenever ri.encoding was None.
    htmli = ri.content.decode("utf-8", errors="replace")
    soupi = BeautifulSoup(htmli, "lxml")
    # Scrape the location information.
    m1.extend(tag.text for tag in soupi.find_all('div', attrs={'class': 'positionInfo'}))
    # Scrape the house information.
    m2.extend(tag.text for tag in soupi.find_all('div', attrs={'class': 'houseInfo'}))
    # Scrape the price information.
    m3.extend(tag.text for tag in soupi.find_all('div', attrs={'class': 'totalPrice'}))

# Build the table; pandas raises ValueError if the three lists differ in
# length, and the default index is already 0..len-1.
dfs = pd.DataFrame({'houseInfo': m2, 'positionInfo': m1, 'totalPrice': m3})
# Export the table.
dfs.to_csv("bj二手房成交.csv")
二、多线程
#多线程爬取房价信息
import threading
from time import ctime, sleep
import requests
from bs4 import BeautifulSoup
import pandas as pd
class myThread(threading.Thread):
    """Worker thread that scrapes a run of consecutive Lianjia listing pages.

    Each worker fetches ``pages`` pages starting at page ``counter``, collects
    the text of the ``positionInfo``, ``houseInfo`` and ``totalPrice`` divs,
    and prints the resulting table together with the current time.

    Args:
        name: Name assigned to the thread.
        counter: Number of the first listing page to fetch.
        pages: How many consecutive pages to fetch. Defaults to 20, the
            value that used to be hard-coded.
    """

    # Sent with every request; identical for all pages and all workers.
    HEADER = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    def __init__(self, name, counter, pages=20):
        threading.Thread.__init__(self)
        self.name = name
        self.counter = counter
        self.pages = pages

    def run(self):
        """Fetch the assigned pages and print the collected table."""
        m1 = []  # positionInfo texts
        m2 = []  # houseInfo texts
        m3 = []  # totalPrice texts
        for page in range(self.counter, self.counter + self.pages):
            urli = 'https://bj.lianjia.com/ershoufang/pg' + str(page)
            # Without a timeout a stalled connection would block this worker
            # (and the join() waiting on it) indefinitely.
            ri = requests.get(urli, headers=self.HEADER, timeout=10)
            # Decode the raw bytes as UTF-8 directly; the old
            # text.encode(ri.encoding).decode() round trip raised TypeError
            # whenever ri.encoding was None.
            htmli = ri.content.decode("utf-8", errors="replace")
            soupi = BeautifulSoup(htmli, "lxml")
            m1.extend(tag.text for tag in soupi.find_all('div', attrs={'class': 'positionInfo'}))
            m2.extend(tag.text for tag in soupi.find_all('div', attrs={'class': 'houseInfo'}))
            m3.extend(tag.text for tag in soupi.find_all('div', attrs={'class': 'totalPrice'}))
        # pandas raises ValueError if the three lists differ in length; the
        # default index is already 0..len-1.
        dfs = pd.DataFrame({'houseInfo': m2, 'positionInfo': m1, 'totalPrice': m3})
        print(dfs, "%s" % ctime())
# Four workers, each given the first page of its own 20-page slice.
assignments = (
    ("Thread-1", 1),
    ("Thread-2", 21),
    ("Thread-3", 41),
    ("Thread-4", 61),
)
thread1, thread2, thread3, thread4 = [
    myThread(label, first_page) for label, first_page in assignments
]
workers = (thread1, thread2, thread3, thread4)

# Start every worker before waiting on any of them so they run concurrently.
for worker in workers:
    worker.start()

# Block the main thread until all workers have finished.
for worker in workers:
    worker.join()

print("退出主线程")
全部评论 (0)
还没有任何评论哟~
