二手房房价预测
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import statsmodels.api as sm
from numpy import corrcoef,array
from IPython.display import HTML, display
from statsmodels.formula.api import ols
H:\anaconda\lib\site-packages\pandas\tseries\tools.py:5: FutureWarning: The pandas.lib module is deprecated and will be removed in a future version. These are private functions and can be accessed from pandas._libs.lib instead
import pandas.lib as lib
H:\anaconda\lib\site-packages\pandas\tseries\tools.py:6: FutureWarning: The pandas.tslib module is deprecated and will be removed in a future version.
import pandas.tslib as tslib
H:\anaconda\lib\site-packages\statsmodels\compat\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
from pandas.core import datetools
get_ipython().magic('matplotlib inline')
matplotlib.rcParams['axes.unicode_minus']=False #正常显示负号
plt.rcParams['font.sans-serif'] = ['SimHei'] #正常显示中文
1 数据清洗
dat=pd.read_csv("ershoufang.csv",delimiter=",",encoding="gbk") #读入二手房数据
print(dat.dtypes) #查看变量名
dat0=dat.copy() #工作文件命名为dat0
CATE object
bedrooms int64
halls int64
AREA float64
floor object
subway int64
school int64
price int64
LONG float64
LAT float64
NAME object
DISTRICT object
dtype: object
######通过最简单的描述分析,清洗数据,一个一个变量处理
#CATE,各个城区的观测数,分布比较均匀,石景山偏少,不做处理'
print(dat0.CATE.value_counts()) #查看城区的分布
fengtai 3003
haidian 3001
dongcheng 2944
chaoyang 2915
xicheng 2892
shijingshan 2040
Name: CATE, dtype: int64
#卧室数和厅数,查看分布'
print(dat0.bedrooms.value_counts()) #取值为0,6,7,8,9的相对较少,考虑删除
print(dat0.halls.value_counts()) #取值为4和9的只有6个观测,考虑删除
dat0=dat0[(dat0.bedrooms>0)&(dat0.bedrooms<6)] #对卧室数变量做部分删除处理
dat0=dat0[dat0.halls<4] #对厅数变量做部分删除处理
2 8070
3 4301
1 3445
4 746
5 154
6 52
7 13
9 6
8 4
0 4
Name: bedrooms, dtype: int64
1 11267
2 4399
0 988
3 135
4 5
9 1
Name: halls, dtype: int64
dat0.AREA.hist(bins=20,color='lightblue')
<matplotlib.axes._subplots.AxesSubplot at 0x1ac6201ce48>
[外链图片转存

(img-kG0rjUi2-1562725993940)(output_6_1.png)]
#房屋面积,有许多outlier,建议将面积小于30和大于300的做删除处理,依据是1%和99%分位数
print(dat0.AREA.quantile([0.01,0.99])) #查看房屋面积的1%和99%分位数
dat0=dat0[(dat0.AREA>30)&(dat0.AREA<300)] #对AREA变量做部分删除处理
01 30.0911
0.99 286.0000
Name: AREA, dtype: float64
dat0.AREA.hist(bins=20,color='lightblue')
<matplotlib.axes._subplots.AxesSubplot at 0x1ac5e1dd550>
[外链图片转存

(img-opeMW8T8-1562725993941)(output_8_1.png)]
#楼层,basement相对较少,去掉地下室进行建模分析
#低中高层不具有可比性,后续讨论需要注意
print(dat0.floor.value_counts()) #查看楼层数的分布
dat0=dat0[dat0["floor"]!="basement "] #去掉地下室观测
middle 5601
high 5577
low 5094
basement 132
Name: floor, dtype: int64
#是否邻近地铁,是否学区房是0-1变量,计算均值,即地铁房和学区房占比
print(round(dat0.subway.value_counts(ascending=False)/len(dat0.subway),3)) #82.8%的观测是邻近地铁的
print(round(dat0.school.value_counts(ascending=True)/len(dat0.school),3)) #30.2%的观测是学区房
#print(dat0.subway.mean()) #82.8%的观测是邻近地铁的
#print(dat0.school.mean()) #30.2%的观测是学区房
1 0.828
0 0.172
Name: subway, dtype: float64
1 0.302
0 0.698
Name: school, dtype: float64
dat0.price.hist(bins=20,color='lightblue')
<matplotlib.axes._subplots.AxesSubplot at 0x1ac6280c518>
[外链图片转存

(img-yTuNl6SR-1562725993942)(output_11_1.png)]
#单位面积房价,同样有许多outlier,按照0.1%和99.9%分位数,去掉单价小于1.35万和大于15万的
print(dat0.price.quantile([0.001,0.999])) #查看单位面积房价的0.1%和99.9%分位数
dat0=dat0[(dat0.price>13500)&(dat0.price<150000)] #对单位面积房价变量做部分删除处理
001 21576.940
0.999 144944.748
Name: price, dtype: float64
dat0.price.hist(bins=20,color='lightblue')
<matplotlib.axes._subplots.AxesSubplot at 0x1ac62924cc0>
[外链图片转存

(img-TVkIojeu-1562725993942)(output_13_1.png)]
#北京的经纬度取值范围:39.5--41 & 115.5--117.5
dat0=dat0[(dat0.LONG>=115.5)&(dat0.LONG<=117.5)]
dat0=dat0[(dat0.LAT>=39.5)&(dat0.LAT<=41)]
######查看删除的观测条数和比例
diff=dat.shape[0]-dat0.shape[0]
print(round(100*diff/dat.shape[0],2),"%")
48 %
###保存做过清洗的数据集
dat0.to_csv("mydata.csv",index=True,sep=',')
2 描述
dat0=pd.read_csv("mydata.csv",encoding="gbk") #读入清洗过后的数据
dat0.pop("Unnamed: 0") #去掉第一列序号 dat0.drop('Unnamed: 0',axis=1)
n=dat0.shape[0] #样本量
dat0.describe(include="all").T #查看数据基本描述
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| CATE | 16210 | 6 | fengtai | 2947 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| bedrooms | 16210 | NaN | NaN | NaN | 2.16619 | 0.809907 | 1 | 2 | 2 | 3 | 5 |
| halls | 16210 | NaN | NaN | NaN | 1.22141 | 0.532048 | 0 | 1 | 1 | 2 | 3 |
| AREA | 16210 | NaN | NaN | NaN | 91.7466 | 44.0008 | 30.06 | 60 | 78.83 | 110.517 | 299 |
| floor | 16210 | 3 | middle | 5580 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| subway | 16210 | NaN | NaN | NaN | 0.827822 | 0.377546 | 0 | 1 | 1 | 1 | 1 |
| school | 16210 | NaN | NaN | NaN | 0.303085 | 0.459606 | 0 | 0 | 0 | 1 | 1 |
| price | 16210 | NaN | NaN | NaN | 61151.8 | 22293.4 | 18348 | 42812.2 | 57473 | 76099.8 | 149871 |
| LONG | 16210 | NaN | NaN | NaN | 116.378 | 0.088902 | 115.982 | 116.326 | 116.384 | 116.438 | 117.118 |
| LAT | 16210 | NaN | NaN | NaN | 39.9274 | 0.0582735 | 39.6625 | 39.8905 | 39.9216 | 39.96 | 40.4792 |
| NAME | 16210 | 2956 | 远洋山水 | 187 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DISTRICT | 16210 | 173 | 鲁谷 | 918 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
dat0.price=dat0.price/10000 #价格单位转换成万元
#将城区的水平由拼音改成中文,以便作图输出美观
dict1 = {
u'chaoyang' : "朝阳",
u'dongcheng' : "东城",
u'fengtai' : "丰台",
u'haidian' : "海淀",
u'shijingshan' : "石景山",
u'xicheng': "西城"
}
dat0.CATE = dat0.CATE.apply(lambda x : dict1[x]) #dat0.CATE = dat0.CATE.map(dict1)
dat0.head()
| CATE | bedrooms | halls | AREA | floor | subway | school | price | LONG | LAT | NAME | DISTRICT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 朝阳 | 1 | 0 | 46.06 | middle | 1 | 0 | 4.8850 | 116.459653 | 39.928347 | 10AM新坐标 | 方庄 |
| 1 | 朝阳 | 1 | 1 | 59.09 | middle | 1 | 0 | 4.6540 | 116.459653 | 39.928347 | 10AM新坐标 | 方庄 |
| 2 | 海淀 | 5 | 2 | 278.95 | high | 1 | 1 | 7.1662 | 116.303556 | 39.954807 | 10号名邸 | 紫竹桥 |
| 3 | 海淀 | 3 | 2 | 207.00 | high | 1 | 1 | 5.7972 | 116.303556 | 39.954807 | 10号名邸 | 紫竹桥 |
| 4 | 丰台 | 2 | 1 | 53.32 | low | 1 | 1 | 7.1268 | 116.418826 | 39.943813 | 17号旁门 | 蒲黄榆 |
2.1 因变量
price
#因变量直方图
dat0.price.hist(bins=20,color='lightblue')
#dat0.price.plot(kind="hist",color='lightblue')
plt.xlabel("单位面积房价(万元/平方米)")
plt.ylabel("频数")
<matplotlib.text.Text at 0x1ac62b15048>
[外链图片转存

(img-k1HrgKAH-1562725993943)(output_22_1.png)]
print(dat0.price.agg(['mean','median','std'])) #查看price的均值、中位数和标准差等更多信息
print(dat0.price.quantile([0.25,0.5,0.75]))
mean 6.115181
median 5.747300
std 2.229336
Name: price, dtype: float64
0.25 4.281225
0.50 5.747300
0.75 7.609975
Name: price, dtype: float64
#查看房价最高和最低的两条观测
pd.concat([(dat0[dat0.price==min(dat0.price)]),(dat0[dat0.price==max(dat0.price)])])
| CATE | bedrooms | halls | AREA | floor | subway | school | price | LONG | LAT | NAME | DISTRICT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2738 | 丰台 | 2 | 2 | 100.83 | high | 0 | 0 | 1.8348 | 116.217060 | 39.828799 | 东山坡三里 | 丰台其它 |
| 12788 | 西城 | 3 | 1 | 77.40 | low | 1 | 0 | 14.9871 | 116.374996 | 39.909901 | 新文化街 | 金融街 |
2.2 自变量:
CATE+bedrooms+halls+floor+subway+school+AREA
#整体来看
for i in range(7):
if i != 3:
print(dat0.columns.values[i],":")
print(dat0[dat0.columns.values[i]].agg(['value_counts']).T)
print("=======================================================================")
else:
continue
print('AREA:')
print(dat0.AREA.agg(['min','mean','median','max','std']).T)
CATE :
丰台 海淀 朝阳 东城 西城 石景山
value_counts 2947 2919 2864 2783 2750 1947
=======================================================================
bedrooms :
2 3 1 4 5
value_counts 7971 4250 3212 675 102
=======================================================================
halls :
1 2 0 3
value_counts 11082 4231 812 85
=======================================================================
floor :
middle high low
value_counts 5580 5552 5078
=======================================================================
subway :
1 0
value_counts 13419 2791
=======================================================================
school :
0 1
value_counts 11297 4913
=======================================================================
AREA:
min 30.060000
mean 91.746598
median 78.830000
max 299.000000
std 44.000768
Name: AREA, dtype: float64
2.2.1 CATE
#频次统计
dat0.CATE.value_counts().plot(kind = 'pie') #绘制柱形图
dat0.CATE.agg(['value_counts'])
#dat0.CATE.value_counts()
| value_counts | |
|---|---|
| 丰台 | 2947 |
| 海淀 | 2919 |
| 朝阳 | 2864 |
| 东城 | 2783 |
| 西城 | 2750 |
| 石景山 | 1947 |
[外链图片转存

(img-1uaCGuYD-1562725993944)(output_28_1.png)]
dat1=dat0[['CATE','price']]
dat1.CATE=dat1.CATE.astype("category")
dat1.CATE.cat.set_categories(["石景山","丰台","朝阳","海淀","东城","西城"],inplace=True)
dat1.sort_values(by=['CATE'],inplace=True)
sns.boxplot(x='CATE',y='price',data=dat1)
#dat1.boxplot(by='CATE',patch_artist=True)
plt.ylabel("单位面积房价(万元/平方米)")
plt.xlabel("城区")
plt.title("城区对房价的分组箱线图")
H:\anaconda\lib\site-packages\pandas\core\generic.py:3110: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self[name] = value
H:\anaconda\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
after removing the cwd from sys.path.
<matplotlib.text.Text at 0x1ac62c13e80>
[外链图片转存

(img-16su6UQF-1562725993944)(output_29_2.png)]
dat0.price.groupby(dat0.CATE).mean().sort_values(ascending= True).plot(kind = 'barh') #不同城区的单位房价面积均值情况
<matplotlib.axes._subplots.AxesSubplot at 0x1ac62d8bbe0>
[外链图片转存

(img-Z5peLV2V-1562725993944)(output_30_1.png)]
2.2.2 bedrooms
#不同卧室数的单位面积房价差异不大
dat4=dat0[['bedrooms','price']]
dat4.price.groupby(dat4.bedrooms).mean().plot(kind='bar')
#dat4.boxplot(by='bedrooms',patch_artist=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1ac62c304e0>
[外链图片转存

(img-zXihzl0V-1562725993945)(output_32_1.png)]
2.2.3 halls
#客厅数越多,单位面积房价递减
dat5=dat0[['halls','price']]
dat5.price.groupby(dat5.halls).mean().plot(kind='bar')
#dat5.boxplot(by='halls',patch_artist=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1ac62dd86d8>

[外链图片转存失败(img-MiAX7fEL-1562725993945)(output_34_1.png)]
2.2.4 floor
#不同楼层的单位面积房价差异不明显
dat6=dat0[['floor','price']]
dat6.floor=dat6.floor.astype("category")
dat6.floor.cat.set_categories(["low","middle","high"],inplace=True)
dat6.sort_values(by=['floor'],inplace=True)
#dat6.boxplot(by='floor',patch_artist=True)
dat6.price.groupby(dat6.floor).mean().plot(kind='bar')
H:\anaconda\lib\site-packages\pandas\core\generic.py:3110: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self[name] = value
H:\anaconda\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""
<matplotlib.axes._subplots.AxesSubplot at 0x1ac6401a550>

2.2.5 subway+school
print(pd.crosstab(dat0.subway,dat0.school))
sub_sch=pd.crosstab(dat0.subway,dat0.school)
sub_sch = sub_sch.div(sub_sch.sum(1),axis = 0)
sub_sch
school 0 1
subway
0 2378 413
1 8919 4500
| school | 0 | 1 |
|---|---|---|
| subway | ||
| 0 | 0.852024 | 0.147976 |
| 1 | 0.664655 | 0.335345 |
stack2dim(dat0, i="subway", j="school")
[外链图片转存

(img-u6xynXE9-1562725993946)(output_39_0.png)]
def stack2dim(raw, i, j, rotation = 0, location = 'upper left'):
'''
此函数是为了画两个维度标准化的堆积柱状图
要求是目标变量j是二分类的
raw为pandas的DataFrame数据框
i、j为两个分类变量的变量名称,要求带引号,比如"school"
rotation:水平标签旋转角度,默认水平方向,如标签过长,可设置一定角度,比如设置rotation = 40
location:分类标签的位置,如果被主体图形挡住,可更改为'upper left'
'''
import math
data_raw = pd.crosstab(raw[i], raw[j])
data = data_raw.div(data_raw.sum(1), axis=0) # 交叉表转换成比率,为得到标准化堆积柱状图
# 计算x坐标,及bar宽度
createVar = locals()
x = [0] #每个bar的中心x轴坐标
width = [] #bar的宽度
k = 0
for n in range(len(data)):
# 根据频数计算每一列bar的宽度
createVar['width' + str(n)] = data_raw.sum(axis=1)[n] / sum(data_raw.sum(axis=1))
width.append(createVar['width' + str(n)])
if n == 0:
continue
else:
k += createVar['width' + str(n - 1)] / 2 + createVar['width' + str(n)] / 2 + 0.05
x.append(k)
# 以下是通过频率交叉表矩阵生成一列对应堆积图每一块位置数据的数组,再把数组转化为矩阵
y_mat = []
n = 0
for p in range(data.shape[0]):
for q in range(data.shape[1]):
n += 1
y_mat.append(data.iloc[p, q])
if n == data.shape[0] * 2:
break
elif n % 2 == 1:
y_mat.extend([0] * (len(data) - 1))
elif n % 2 == 0:
y_mat.extend([0] * len(data))
y_mat = np.array(y_mat).reshape(len(data) * 2, len(data))
y_mat = pd.DataFrame(y_mat) # bar图中的y变量矩阵,每一行是一个y变量
# 通过x,y_mat中的每一行y,依次绘制每一块堆积图中的每一块图
createVar = locals()
for row in range(len(y_mat)):
createVar['a' + str(row)] = y_mat.iloc[row, :]
if row % 2 == 0:
if math.floor(row / 2) == 0:
label = data.columns.name + ': ' + str(data.columns[row])
plt.bar(x, createVar['a' + str(row)],
width=width[math.floor(row / 2)], label='0', color='#5F9EA0')
else:
plt.bar(x, createVar['a' + str(row)],
width=width[math.floor(row / 2)], color='#5F9EA0')
elif row % 2 == 1:
if math.floor(row / 2) == 0:
label = data.columns.name + ': ' + str(data.columns[row])
plt.bar(x, createVar['a' + str(row)], bottom=createVar['a' + str(row - 1)],
width=width[math.floor(row / 2)], label='1', color='#8FBC8F')
else:
plt.bar(x, createVar['a' + str(row)], bottom=createVar['a' + str(row - 1)],
width=width[math.floor(row / 2)], color='#8FBC8F')
plt.title(j + ' vs ' + i)
group_labels = [data.index.name + ': ' + str(name) for name in data.index]
plt.xticks(x, group_labels, rotation = rotation)
plt.ylabel(j)
plt.legend(shadow=True, loc=location)
plt.show()
#地铁、学区的分组箱线图
dat2=dat0[['subway','price']]
dat3=dat0[['school','price']]
dat2.boxplot(by='subway',patch_artist=True)
dat3.boxplot(by='school',patch_artist=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1ac64377320>
[外链图片转存

(img-iDh0HX3o-1562725993946)(output_41_1.png)]
[外链图片转存

(img-ZQSdopOj-1562725993946)(output_41_2.png)]
2.2.6 AREA
#房屋面积和单位面积房价(取对数后)的散点图
datA=dat0[['AREA','price']]
datA['price_ln'] = np.log(datA['price']) #对price取对数
plt.figure(figsize=(8,8))
plt.ylim(0,3)
plt.scatter(datA.AREA,datA.price_ln,marker='.')
#plt.scatter(dat0.AREA,dat0.price,marker='.')
#dat0.plot.scatter(x='AREA',y='price')
plt.ylabel("单位面积房价(取对数后)")
plt.xlabel("面积(平方米)")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-1-3b5fc552e5cb> in <module>()
1 #房屋面积和单位面积房价(取对数后)的散点图
----> 2 datA=dat0[['AREA','price']]
3 datA['price_ln'] = np.log(datA['price']) #对price取对数
4 plt.figure(figsize=(8,8))
5 plt.ylim(0,500)
NameError: name 'dat0' is not defined
#求AREA和price_ln的相关系数矩阵
data1=array(datA['price_ln'])
data2=array(datA['AREA'])
datB=array([data1,data2])
corrcoef(datB)
array([[ 1. , -0.05811827],
[-0.05811827, 1. ]])
3 建模
###厅数做因子化处理,变成二分变量,使得建模有更好的解读
###将是否有厅bind到已有数据集
dat0['style_new']=dat0.halls
dat0.style_new[dat0.style_new>0]='有厅'
dat0.style_new[dat0.style_new==0]='无厅'
dat0.head()
H:\anaconda\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
after removing the cwd from sys.path.
H:\anaconda\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""
| CATE | bedrooms | halls | AREA | floor | subway | school | price | LONG | LAT | NAME | DISTRICT | style_new | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 朝阳 | 1 | 0 | 46.06 | middle | 1 | 0 | 4.8850 | 116.459653 | 39.928347 | 10AM新坐标 | 方庄 | 无厅 |
| 1 | 朝阳 | 1 | 1 | 59.09 | middle | 1 | 0 | 4.6540 | 116.459653 | 39.928347 | 10AM新坐标 | 方庄 | 有厅 |
| 2 | 海淀 | 5 | 2 | 278.95 | high | 1 | 1 | 7.1662 | 116.303556 | 39.954807 | 10号名邸 | 紫竹桥 | 有厅 |
| 3 | 海淀 | 3 | 2 | 207.00 | high | 1 | 1 | 5.7972 | 116.303556 | 39.954807 | 10号名邸 | 紫竹桥 | 有厅 |
| 4 | 丰台 | 2 | 1 | 53.32 | low | 1 | 1 | 7.1268 | 116.418826 | 39.943813 | 17号旁门 | 蒲黄榆 | 有厅 |
#生成哑变量,并设置基准
data=pd.get_dummies(dat0[['CATE','floor']])
data.head()
| CATE_东城 | CATE_丰台 | CATE_朝阳 | CATE_海淀 | CATE_石景山 | CATE_西城 | floor_high | floor_low | floor_middle | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
data.drop(['CATE_石景山','floor_high'],axis=1,inplace=True)
data.head()
| CATE_东城 | CATE_丰台 | CATE_朝阳 | CATE_海淀 | CATE_西城 | floor_low | floor_middle | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
#生成的哑变量与其他所需变量合并成新的数据框
dat1=pd.concat([data,dat0[['school','subway','style_new','bedrooms','AREA','price']]],axis=1)
dat1.head()
| CATE_东城 | CATE_丰台 | CATE_朝阳 | CATE_海淀 | CATE_西城 | floor_low | floor_middle | school | subway | style_new | bedrooms | AREA | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 无厅 | 1 | 46.06 | 4.8850 |
| 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 有厅 | 1 | 59.09 | 4.6540 |
| 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 有厅 | 5 | 278.95 | 7.1662 |
| 3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 有厅 | 3 | 207.00 | 5.7972 |
| 4 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 有厅 | 2 | 53.32 | 7.1268 |
3.1 线性回归模型
###线性回归模型
lm1 = ols("price ~ CATE_丰台+CATE_朝阳+CATE_东城+CATE_海淀+CATE_西城+school+subway+floor_middle+floor_low+style_new+bedrooms+AREA", data=dat1).fit()
lm1_summary = lm1.summary()
lm1_summary #回归结果展示
HTML(
lm1_summary\
.as_html()\
.replace('<th> Adj. R-squared: </th>', '<th style="background-color:#aec7e8;"> Adj. R-squared: </th>')\
.replace('<th>coef</th>', '<th style="background-color:#ffbb78;">coef</th>')\
.replace('<th>std err</th>', '<th style="background-color:#c7e9c0;">std err</th>')\
.replace('<th>P>|t|</th>', '<th style="background-color:#bcbddc;">P>|t|</th>')\
.replace('<th>[95.0% Conf. Int.]</th>', '<th style="background-color:#ff9896;">[95.0% Conf. Int.]</th>')
)
OLS Regression Results | Dep. Variable:| price| R-squared: | 0.590 |
| --- | --- | --- | --- |
|---|---|---|---|
| Method: | Least Squares | F-statistic: | 1945. |
| Date: | Thu, 16 Nov 2017 | Prob (F-statistic): | 0.00 |
| Time: | 21:43:13 | Log-Likelihood: | -28762. |
| No. Observations: | 16210 | AIC: | 5.755e+04 |
| Df Residuals: | 16197 | BIC: | 5.765e+04 |
| Df Model: | 12 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P> | t | [0.025 | 0.975] | ||
|---|---|---|---|---|---|---|---|---|
| Intercept | 3.3155 | 0.068 | 48.522 | 0.000 | 3.182 | 3.449 | ||
| style_new[T.有厅] | 0.1719 | 0.053 | 3.260 | 0.001 | 0.069 | 0.275 | ||
| CATE_丰台 | 0.1313 | 0.042 | 3.124 | 0.002 | 0.049 | 0.214 | ||
| CATE_朝阳 | 0.8751 | 0.044 | 20.042 | 0.000 | 0.790 | 0.961 | ||
| CATE_东城 | 2.4430 | 0.045 | 54.039 | 0.000 | 2.354 | 2.532 | ||
| CATE_海淀 | 2.1913 | 0.044 | 49.632 | 0.000 | 2.105 | 2.278 | ||
| CATE_西城 | 3.7056 | 0.046 | 81.386 | 0.000 | 3.616 | 3.795 | ||
| school | 1.1831 | 0.028 | 42.607 | 0.000 | 1.129 | 1.238 | ||
| subway | 0.6721 | 0.031 | 21.741 | 0.000 | 0.612 | 0.733 | ||
| floor_middle | 0.1521 | 0.027 | 5.605 | 0.000 | 0.099 | 0.205 | ||
| floor_low | 0.1985 | 0.028 | 7.137 | 0.000 | 0.144 | 0.253 | ||
| bedrooms | 0.1111 | 0.020 | 5.500 | 0.000 | 0.072 | 0.151 | ||
| AREA | -0.0028 | 0.000 | -7.393 | 0.000 | -0.004 | -0.002 |
| Omnibus: | 1279.281 | Durbin-Watson: | 0.508 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 2206.275 |
| Skew: | 0.583 | Prob(JB): | 0.00 |
| Kurtosis: | 4.380 | Cond. No. | 837. |
dat1['pred1']=lm1.predict(dat1)
dat1['resid1']=lm1.resid
dat1.plot('pred1','resid1',kind='scatter') #模型诊断图,存在异方差现象,对因变量取对数
<matplotlib.axes._subplots.AxesSubplot at 0x1ac642da128>
[外链图片转存

(img-zfpGe5i2-1562725993948)(output_52_1.png)]
3.2 对数线性模型
###对数线性模型
dat1['price_ln'] = np.log(dat1['price']) #对price取对数
lm2 = ols("price_ln ~ CATE_丰台+CATE_朝阳+CATE_东城+CATE_海淀+CATE_西城+school+subway+floor_middle+floor_low+style_new+bedrooms+AREA", data=dat1).fit()
lm2_summary = lm2.summary()
lm2_summary #回归结果展示
HTML(
lm2_summary\
.as_html()\
.replace('<th> Adj. R-squared: </th>', '<th style="background-color:#aec7e8;"> Adj. R-squared: </th>')\
.replace('<th>coef</th>', '<th style="background-color:#ffbb78;">coef</th>')\
.replace('<th>std err</th>', '<th style="background-color:#c7e9c0;">std err</th>')\
.replace('<th>P>|t|</th>', '<th style="background-color:#bcbddc;">P>|t|</th>')\
.replace('<th>[95.0% Conf. Int.]</th>', '<th style="background-color:#ff9896;">[95.0% Conf. Int.]</th>')
)
OLS Regression Results | Dep. Variable:| price_ln| R-squared: | 0.608 |
| --- | --- | --- | --- |
|---|---|---|---|
| Method: | Least Squares | F-statistic: | 2095. |
| Date: | Thu, 16 Nov 2017 | Prob (F-statistic): | 0.00 |
| Time: | 21:43:30 | Log-Likelihood: | 977.77 |
| No. Observations: | 16210 | AIC: | -1930. |
| Df Residuals: | 16197 | BIC: | -1830. |
| Df Model: | 12 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P> | t | [0.025 | 0.975] | ||
|---|---|---|---|---|---|---|---|---|
| Intercept | 1.2360 | 0.011 | 113.285 | 0.000 | 1.215 | 1.257 | ||
| style_new[T.有厅] | 0.0275 | 0.008 | 3.266 | 0.001 | 0.011 | 0.044 | ||
| CATE_丰台 | 0.0441 | 0.007 | 6.574 | 0.000 | 0.031 | 0.057 | ||
| CATE_朝阳 | 0.2057 | 0.007 | 29.497 | 0.000 | 0.192 | 0.219 | ||
| CATE_东城 | 0.4577 | 0.007 | 63.408 | 0.000 | 0.444 | 0.472 | ||
| CATE_海淀 | 0.4320 | 0.007 | 61.279 | 0.000 | 0.418 | 0.446 | ||
| CATE_西城 | 0.6270 | 0.007 | 86.244 | 0.000 | 0.613 | 0.641 | ||
| school | 0.1719 | 0.004 | 38.772 | 0.000 | 0.163 | 0.181 | ||
| subway | 0.1282 | 0.005 | 25.963 | 0.000 | 0.118 | 0.138 | ||
| floor_middle | 0.0260 | 0.004 | 5.993 | 0.000 | 0.017 | 0.034 | ||
| floor_low | 0.0339 | 0.004 | 7.630 | 0.000 | 0.025 | 0.043 | ||
| bedrooms | 0.0141 | 0.003 | 4.358 | 0.000 | 0.008 | 0.020 | ||
| AREA | -0.0004 | 6e-05 | -5.842 | 0.000 | -0.000 | -0.000 |
| Omnibus: | 143.569 | Durbin-Watson: | 0.485 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 225.271 |
| Skew: | -0.057 | Prob(JB): | 1.21e-49 |
| Kurtosis: | 3.566 | Cond. No. | 837. |
dat1['pred2']=lm2.predict(dat1)
dat1['resid2']=lm2.resid
dat1.plot('pred2','resid2',kind='scatter') #模型诊断图,异方差现象得到消除
<matplotlib.axes._subplots.AxesSubplot at 0x1ac64658d68>
[外链图片转存

(img-u0smdGtz-1562725993948)(output_56_1.png)]
3.3 有交互项的对数线性模型,城区和学区之间的交互作用
###有交互项的对数线性模型,城区和学区之间的交互作用
lm3 = ols("price_ln ~ (CATE_丰台+CATE_朝阳+CATE_东城+CATE_海淀+CATE_西城)*school+subway+floor_middle+floor_low+style_new+bedrooms+AREA", data=dat1).fit()
lm3_summary = lm3.summary()
lm3_summary #回归结果展示
HTML(
lm3_summary\
.as_html()\
.replace('<th> Adj. R-squared: </th>', '<th style="background-color:#aec7e8;"> Adj. R-squared: </th>')\
.replace('<th>coef</th>', '<th style="background-color:#ffbb78;">coef</th>')\
.replace('<th>std err</th>', '<th style="background-color:#c7e9c0;">std err</th>')\
.replace('<th>P>|t|</th>', '<th style="background-color:#bcbddc;">P>|t|</th>')\
.replace('<th>[95.0% Conf. Int.]</th>', '<th style="background-color:#ff9896;">[95.0% Conf. Int.]</th>')
)
OLS Regression Results | Dep. Variable:| price_ln| R-squared: | 0.611 |
| --- | --- | --- | --- |
|---|---|---|---|
| Method: | Least Squares | F-statistic: | 1497. |
| Date: | Thu, 16 Nov 2017 | Prob (F-statistic): | 0.00 |
| Time: | 21:43:40 | Log-Likelihood: | 1039.1 |
| No. Observations: | 16210 | AIC: | -2042. |
| Df Residuals: | 16192 | BIC: | -1904. |
| Df Model: | 17 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P> | t | [0.025 | 0.975] | ||
|---|---|---|---|---|---|---|---|---|
| Intercept | 1.2406 | 0.011 | 113.933 | 0.000 | 1.219 | 1.262 | ||
| style_new[T.有厅] | 0.0271 | 0.008 | 3.225 | 0.001 | 0.011 | 0.044 | ||
| CATE_丰台 | 0.0430 | 0.007 | 6.374 | 0.000 | 0.030 | 0.056 | ||
| CATE_朝阳 | 0.2184 | 0.007 | 30.132 | 0.000 | 0.204 | 0.233 | ||
| CATE_东城 | 0.4467 | 0.008 | 55.915 | 0.000 | 0.431 | 0.462 | ||
| CATE_海淀 | 0.4121 | 0.008 | 52.760 | 0.000 | 0.397 | 0.427 | ||
| CATE_西城 | 0.6177 | 0.008 | 73.109 | 0.000 | 0.601 | 0.634 | ||
| school | -0.1800 | 0.054 | -3.347 | 0.001 | -0.285 | -0.075 | ||
| CATE_丰台:school | 0.2948 | 0.059 | 5.009 | 0.000 | 0.179 | 0.410 | ||
| CATE_朝阳:school | 0.2780 | 0.055 | 5.073 | 0.000 | 0.171 | 0.385 | ||
| CATE_东城:school | 0.3706 | 0.054 | 6.802 | 0.000 | 0.264 | 0.477 | ||
| CATE_海淀:school | 0.3876 | 0.054 | 7.116 | 0.000 | 0.281 | 0.494 | ||
| CATE_西城:school | 0.3638 | 0.054 | 6.675 | 0.000 | 0.257 | 0.471 | ||
| subway | 0.1257 | 0.005 | 25.475 | 0.000 | 0.116 | 0.135 | ||
| floor_middle | 0.0264 | 0.004 | 6.110 | 0.000 | 0.018 | 0.035 | ||
| floor_low | 0.0343 | 0.004 | 7.757 | 0.000 | 0.026 | 0.043 | ||
| bedrooms | 0.0143 | 0.003 | 4.437 | 0.000 | 0.008 | 0.021 | ||
| AREA | -0.0004 | 5.99e-05 | -5.888 | 0.000 | -0.000 | -0.000 |
| Omnibus: | 120.144 | Durbin-Watson: | 0.493 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 185.301 |
| Skew: | -0.029 | Prob(JB): | 5.79e-41 |
| Kurtosis: | 3.520 | Cond. No. | 7.55e+03 |
#导出lm3的建模数据,用SPSS的示意图来讲解交互作用项
dat1.to_excel("C:/Users/Tim/Desktop/dat1.xlsx")
###交互作用的解释
schools=['丰台','朝阳','东城','海淀','西城']
print('石景山非学区房\t',round(dat0[(dat0['CATE']=='石景山')&(dat0['school']==0)]['price'].mean(),2),'万元/平方米\t',
'石景山学区房\t',round(dat0[(dat0['CATE']=='石景山')&(dat0['school']==1)]['price'].mean(),2),'万元/平方米')
print('-------------------------------------------------------------------------')
for i in schools:
print(i+'非学区房\t',round(dat1[(dat1['CATE_'+i]==1)&(dat1['school']==0)]['price'].mean(),2),'万元/平方米\t',i+'学区房\t',round(dat1[(dat1['CATE_'+i]==1)&(dat1['school']==1)]['price'].mean(),2),'万元/平方米')
石景山非学区房 4.04 万元/平方米 石景山学区房 3.31 万元/平方米
-------------------------------------------------------------------------
丰台非学区房 4.23 万元/平方米 丰台学区房 4.89 万元/平方米
朝阳非学区房 5.16 万元/平方米 朝阳学区房 5.74 万元/平方米
东城非学区房 6.63 万元/平方米 东城学区房 7.85 万元/平方米
海淀非学区房 6.14 万元/平方米 海淀学区房 7.69 万元/平方米
西城非学区房 7.7 万元/平方米 西城学区房 9.25 万元/平方米
###探索石景山学区房价格比较低的原因,是否是样本量的问题?
print('石景山非学区房\t',dat0[(dat0['CATE']=='石景山')&(dat0['school']==0)].shape[0],'\t',
'石景山学区房\t',dat0[(dat0['CATE']=='石景山')&(dat0['school']==1)].shape[0],'\t','石景山学区房仅占石景山所有二手房的0.92%')
石景山非学区房 1929 石景山学区房 18 石景山学区房仅占石景山所有二手房的0.92%
###构造图形揭示不同城区是否学区房的价格问题
df=pd.DataFrame()
CATE=['石景山','丰台','朝阳','东城','海淀','西城']
Noschool=[]
school=[]
for i in CATE:
Noschool.append(dat0[(dat0['CATE']==i)&(dat0['school']==0)]['price'].mean())
school.append(dat0[(dat0['CATE']==i)&(dat0['school']==1)]['price'].mean())
df['CATE']=pd.Series(CATE)
df['Noschool']=pd.Series(Noschool)
df['school']=pd.Series(school)
df
| CATE | Noschool | school | |
|---|---|---|---|
| 0 | 石景山 | 4.035388 | 3.310733 |
| 1 | 丰台 | 4.229100 | 4.887162 |
| 2 | 朝阳 | 5.158851 | 5.740341 |
| 3 | 东城 | 6.627689 | 7.851490 |
| 4 | 海淀 | 6.138580 | 7.691126 |
| 5 | 西城 | 7.698937 | 9.246887 |
df1=df['Noschool'].T.values
df2=df['school'].T.values
plt.figure(figsize=(10,6))
x1=range(0,len(df))
x2=[i+0.3 for i in x1]
plt.bar(x1,df1,color='b',width=0.3,alpha=0.6,label='非学区房')
plt.bar(x2,df2,color='r',width=0.3,alpha=0.6,label='学区房')
plt.xlabel('城区')
plt.ylabel('单位面积价格')
plt.title('分城区、是否学区的房屋价格')
plt.legend(loc='upper left')
plt.xticks(range(0,6),CATE)
plt.show()
[外链图片转存

(img-vCOOKQdr-1562725993949)(output_63_0.png)]
###分城区的学区房分组箱线图
school=['石景山','丰台','朝阳','东城','海淀','西城']
for i in school:
dat0[dat0.CATE==i][['school','price']].boxplot(by='school',patch_artist=True)
plt.xlabel(i+'学区房')
[外链图片转存

(img-qn3N4nok-1562725993949)(output_64_0.png)]
[外链图片转存

(img-HBL7DKy4-1562725993950)(output_64_1.png)]
[外链图片转存

(img-BTs0RkEU-1562725993950)(output_64_2.png)]

[外链图片转存失败(img-pFGgASi7-1562725993950)(output_64_3.png)]
[外链图片转存

(img-5cU80Md9-1562725993951)(output_64_4.png)]
[外链图片转存

(img-34GOGHFL-1562725993951)(output_64_5.png)]
###假想情形,做预测,x_new是新的自变量
x_new=dat1.head(1)
x_new
| CATE_东城 | CATE_丰台 | CATE_朝阳 | CATE_海淀 | CATE_西城 | floor_low | floor_middle | school | subway | style_new | bedrooms | AREA | price | pred1 | resid1 | price_ln | pred2 | resid2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 无厅 | 1 | 46.06 | 4.885 | 4.997947 | -0.112947 | 1.586169 | 1.593629 | -0.007459 |
x_new['CATE_朝阳']=0
x_new['floor_middle']=0
x_new['CATE_西城']=1
x_new['bedrooms']=2
x_new['halls']=1
x_new['AREA']=85
x_new['floor_low']=1
x_new['subway']=1
x_new['school']=1
x_new['style_new']="有厅"
x_new
#预测值
print("单位面积房价:",round(math.exp(lm3.predict(x_new)),2),"万元/平方米")
print("总价:",round(math.exp(lm3.predict(x_new))*85,2),"万元")
H:\anaconda\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""Entry point for launching an IPython kernel.
H:\anaconda\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
H:\anaconda\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
after removing the cwd from sys.path.
H:\anaconda\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""
H:\anaconda\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
H:\anaconda\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
import sys
H:\anaconda\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
H:\anaconda\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
if __name__ == '__main__':
单位面积房价: 9.28 万元/平方米
总价: 788.67 万元
H:\anaconda\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Remove the CWD from sys.path while we load stuff.
H:\anaconda\lib\site-packages\ipykernel_launcher.py:11: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# This is added back by InteractiveShellApp.init_path()
x_new1=dat1.head(1)
x_new1['CATE_朝阳']=0
x_new1['CATE_东城']=1
x_new1['bedrooms']=2
x_new1['halls']=1
x_new1['AREA']=70
x_new1['subway']=1
x_new1['school']=1
x_new1['style_new']="有厅"
#预测值
print("单位面积房价:",round(math.exp(lm3.predict(x_new1)),2),"万元/平方米")
print("总价:",round(math.exp(lm3.predict(x_new1))*70,2),"万元")
H:\anaconda\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
H:\anaconda\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""
H:\anaconda\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
H:\anaconda\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
import sys
H:\anaconda\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
H:\anaconda\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
if __name__ == '__main__':
H:\anaconda\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Remove the CWD from sys.path while we load stuff.
H:\anaconda\lib\site-packages\ipykernel_launcher.py:11: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# This is added back by InteractiveShellApp.init_path()
单位面积房价: 7.85 万元/平方米
总价: 549.69 万元
