数据挖掘:R语言k均值聚类
发布时间
阅读量:
阅读量
文章目录
-
-
-
- k均值聚类(青少年)
-
-
The k-means clustering algorithm is a widely used unsupervised machine learning method for data analysis and pattern recognition.
> #==---K均值聚类-----------------------
>
> data<-read.csv('snsdata.csv')
>
> #查看某一特征的缺失值数量
> table(data$gender,useNA = 'ifany') #或者可以用sum(is.na(data$gender))
F M <NA>
22054 5222 2724
>
> summary(data$age)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.086 16.312 17.287 17.994 18.259 106.927
NA's
5086
> #限制年龄,青少年合理的年龄
> data$age<-ifelse(data$age>=13 & data$age <=20,data$age,NA)
>
> #将性别特征作为一个单独的类别
> data$female<-ifelse(data$gender=='F' & !is.na(data$gender),1,0)
> data$no_gender<-ifelse(is.na(data$gender),1,0)
> #--插补缺失值
>
> mean(data$age,na.rm = T) #这里如果不处理缺失值,均值是无法计算的
[1] 17.25243
> #按照毕业年份进行插补,而不是直接用总的平均
> aggregate(data=data,age~gradyear,mean,na.rm=T) #为数据的子组计算统计量
gradyear age
1 2006 18.65586
2 2007 17.70617
3 2008 16.76770
4 2009 15.81957
> ave_age<-ave(data$age,data$gradyear,FUN = function(x) mean(x,na.rm = T))
> data$age<-ifelse(is.na(data$age),ave_age,data$age)
> #==3、训练模型-----------------------------
>
> #标准化(z-score):scale()将每一列变换为均值为0、标准差为1
> dat<-data[5:40]
> dat<-scale(dat)
> model<-kmeans(dat,5)
> #评估模型性能
> model$size #共分为5个簇,这里给出每个簇包含的样本数量
[1] 3338 1037 801 3236 21588
>
> model$centers #聚类质心的坐标
basketball football soccer softball volleyball
1 0.02555414 0.10780408 0.05043110 -0.02352462 0.04504455
2 0.37840886 0.38329763 0.14612738 0.14970565 0.09127316
3 -0.12432147 0.02638575 -0.08225981 -0.05927577 -0.09100731
4 1.13627903 1.04688788 0.51821200 0.87870851 0.80441228
5 -0.18784172 -0.19298656 -0.08944400 -0.13307117 -0.12855245
swimming cheerleading baseball tennis sports
1 0.31023275 0.65006420 -0.03747727 0.07039886 -0.05164418
2 0.25111447 0.18827933 0.28683097 0.11646463 0.81647105
3 0.04671618 -0.11994087 -0.11019698 0.01460085 -0.11690740
4 0.12079138 0.01377962 0.87924593 0.14489299 0.88234954
5 -0.07987137 -0.10717427 -0.13569190 -0.03874070 -0.15915937
cute sex sexy hot kissed
1 0.882295320 0.002139268 0.294172680 0.73553775 -0.006165031
2 0.465295744 2.046954725 0.541440225 0.29527795 3.015963132
3 -0.047632737 -0.045258590 -0.012056845 -0.06619831 -0.067805344
4 0.009705961 -0.035651981 -0.002880267 0.01181699 -0.089160084
5 -0.158461559 -0.091634750 -0.070615334 -0.12723009 -0.128040614
dance band marching music rock
1 0.72507896 -0.05865963 -0.122635644 0.1925923 0.1092707
2 0.45712266 0.39072771 -0.007061162 1.2118752 1.2404660
3 0.03376371 3.39790133 4.624635235 0.3967476 0.1734602
4 0.01545967 -0.04930562 -0.115585212 0.1538179 0.1598251
5 -0.13764230 -0.12838358 -0.134964743 -0.1257707 -0.1068762
god church jesus bible hair
1 0.05415822 0.18556832 -0.007765453 -0.05154500 0.403405052
2 0.41099396 0.16493613 0.102426287 0.06068004 2.601304288
3 0.07184030 0.03625217 0.039134315 0.02996570 -0.050571595
4 0.51862331 0.62911665 0.487859543 0.52544360 0.007479408
5 -0.10852279 -0.13226446 -0.078300678 -0.07481962 -0.186576530
dress blonde mall shopping clothes
1 0.63830236 0.03728880 0.92399000 1.17231895 0.688550521
2 0.52337603 0.36809233 0.61830967 0.26349949 1.232343283
3 0.03921043 -0.01494127 -0.09716585 -0.06678824 -0.002505384
4 -0.03856507 0.02956906 -0.01414121 0.04210179 0.008796552
5 -0.11951108 -0.02732533 -0.16684616 -0.19775772 -0.166888107
hollister abercrombie die death drunk
1 1.1410179 1.0547064 0.035725307 0.09580716 0.04080774
2 0.2712611 0.3826856 1.703287568 0.93064011 1.84293978
3 -0.1726721 -0.1517681 -0.016920170 0.03000829 -0.08589740
4 -0.1301830 -0.1200203 0.002598997 0.05896191 -0.05161181
5 -0.1635368 -0.1578425 -0.087104761 -0.06946987 -0.08391352
drugs
1 -0.05091813
2 2.72485283
3 -0.08096140
4 -0.06936923
5 -0.10961547
> data$cluster<-model$cluster
> #用aggregate()函数分组,了解每一类不同特征的关系
> aggregate(data=data,age~cluster,mean)
cluster age
1 1 16.99662
2 2 17.09666
3 3 17.37388
4 4 17.09612
5 5 17.29740
全部评论 (0)
还没有任何评论哟~
