Advertisement

数据挖掘:R语言k均值聚类

阅读量:

文章目录

        • k均值聚类(青少年)

k-means clustering algorithm is a widely used unsupervised machine learning method for data analysis and pattern recognition.

复制代码
    > #==---K均值聚类-----------------------
    > 
    > data<-read.csv('snsdata.csv')
    > 
    > #查看某一特征的缺失值数量
    > table(data$gender,useNA = 'ifany')    #或者可以用sum(is.na(data$gender))
    
        F     M  <NA> 
    22054  5222  2724 
    > 
    > summary(data$age)
       Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      3.086  16.312  17.287  17.994  18.259 106.927 
       NA's 
       5086
复制代码
    > #限制年龄,青少年合理的年龄
    > data$age<-ifelse(data$age>=13 & data$age <=20,data$age,NA)
    > 
    > #将性别特征作为一个单独的类别
    > data$female<-ifelse(data$gender=='F' & !is.na(data$gender),1,0)
    > data$no_gender<-ifelse(is.na(data$gender),1,0)
    > #--插补缺失值
    > 
    > mean(data$age,na.rm = T)    #如果不加na.rm = T,只要存在缺失值,mean()就会返回NA
    [1] 17.25243
    > #按照毕业年份进行插补,而不是直接用总的平均
    > aggregate(data=data,age~gradyear,mean,na.rm=T)    #为数据的子组计算统计量
      gradyear      age
    1     2006 18.65586
    2     2007 17.70617
    3     2008 16.76770
    4     2009 15.81957
    > ave_age<-ave(data$age,data$gradyear,FUN = function(x) mean(x,na.rm = T))
    > data$age<-ifelse(is.na(data$age),ave_age,data$age)
    > #==3、训练模型-----------------------------
    > 
    > #标准化(z-score):scale()把每一列变换为均值0、标准差1
    > dat<-data[5:40]
    > dat<-scale(dat)
    > model<-kmeans(dat,5)
    > #评估模型性能
    > model$size    #分为5组,这里是每组包含的样本数量
    [1]  3338  1037   801  3236 21588
    > 
    > model$centers    #聚类质心的坐标
       basketball    football      soccer    softball  volleyball
    1  0.02555414  0.10780408  0.05043110 -0.02352462  0.04504455
    2  0.37840886  0.38329763  0.14612738  0.14970565  0.09127316
    3 -0.12432147  0.02638575 -0.08225981 -0.05927577 -0.09100731
    4  1.13627903  1.04688788  0.51821200  0.87870851  0.80441228
    5 -0.18784172 -0.19298656 -0.08944400 -0.13307117 -0.12855245
     swimming cheerleading    baseball      tennis      sports
    1  0.31023275   0.65006420 -0.03747727  0.07039886 -0.05164418
    2  0.25111447   0.18827933  0.28683097  0.11646463  0.81647105
    3  0.04671618  -0.11994087 -0.11019698  0.01460085 -0.11690740
    4  0.12079138   0.01377962  0.87924593  0.14489299  0.88234954
    5 -0.07987137  -0.10717427 -0.13569190 -0.03874070 -0.15915937
          cute          sex         sexy         hot       kissed
    1  0.882295320  0.002139268  0.294172680  0.73553775 -0.006165031
    2  0.465295744  2.046954725  0.541440225  0.29527795  3.015963132
    3 -0.047632737 -0.045258590 -0.012056845 -0.06619831 -0.067805344
    4  0.009705961 -0.035651981 -0.002880267  0.01181699 -0.089160084
    5 -0.158461559 -0.091634750 -0.070615334 -0.12723009 -0.128040614
        dance        band     marching      music       rock
    1  0.72507896 -0.05865963 -0.122635644  0.1925923  0.1092707
    2  0.45712266  0.39072771 -0.007061162  1.2118752  1.2404660
    3  0.03376371  3.39790133  4.624635235  0.3967476  0.1734602
    4  0.01545967 -0.04930562 -0.115585212  0.1538179  0.1598251
    5 -0.13764230 -0.12838358 -0.134964743 -0.1257707 -0.1068762
          god      church        jesus       bible         hair
    1  0.05415822  0.18556832 -0.007765453 -0.05154500  0.403405052
    2  0.41099396  0.16493613  0.102426287  0.06068004  2.601304288
    3  0.07184030  0.03625217  0.039134315  0.02996570 -0.050571595
    4  0.51862331  0.62911665  0.487859543  0.52544360  0.007479408
    5 -0.10852279 -0.13226446 -0.078300678 -0.07481962 -0.186576530
        dress      blonde        mall    shopping      clothes
    1  0.63830236  0.03728880  0.92399000  1.17231895  0.688550521
    2  0.52337603  0.36809233  0.61830967  0.26349949  1.232343283
    3  0.03921043 -0.01494127 -0.09716585 -0.06678824 -0.002505384
    4 -0.03856507  0.02956906 -0.01414121  0.04210179  0.008796552
    5 -0.11951108 -0.02732533 -0.16684616 -0.19775772 -0.166888107
       hollister abercrombie          die       death       drunk
    1  1.1410179   1.0547064  0.035725307  0.09580716  0.04080774
    2  0.2712611   0.3826856  1.703287568  0.93064011  1.84293978
    3 -0.1726721  -0.1517681 -0.016920170  0.03000829 -0.08589740
    4 -0.1301830  -0.1200203  0.002598997  0.05896191 -0.05161181
    5 -0.1635368  -0.1578425 -0.087104761 -0.06946987 -0.08391352
        drugs
    1 -0.05091813
    2  2.72485283
    3 -0.08096140
    4 -0.06936923
    5 -0.10961547
复制代码
    > data$cluster<-model$cluster
    > #用aggregate()函数分组,了解每一类不同特征的关系
    > aggregate(data=data,age~cluster,mean)
      cluster      age
    1       1 16.99662
    2       2 17.09666
    3       3 17.37388
    4       4 17.09612
    5       5 17.29740

全部评论 (0)

还没有任何评论哟~