export HADOOP_HOME=/usr/local/hadoop-single
———————————————————–cluster—————— ———————————-
bin/hadoop fs -put /opt/synthetic_control.data testdata/synthetic_control.data
bin/hadoop jar /usr/local/mahout/mahout-examples-0.8-job.jar org.apache.mahout.clustering.syntheticcontrol.kmeans.Job
bin/hadoop jar /usr/local/mahout/mahout-examples-0.8-job.jar org.apache.mahout.clustering.syntheticcontrol.kmeans.Job -i /user/demo/synthetic_control.data -o /user/demo/kmeans-output
也可以在eclipse中直接运行测试
将序列文件导出查看：加上 --pointsDir 就会导出所有点的集合；如果不加，就只有分类后的说明
bin/mahout clusterdump --input output/clusters-8-final --pointsDir output/clusteredPoints --output /opt/mahout-kmeans
bin/mahout clusterdump --input output/clusters-1-final --pointsDir output/clusteredPoints --output /opt/mahout-kmeans2
bin/mahout clusterdump --input output/clusters-8-final/part-r-00000 --output /opt/mahout-kmeans2
随便输入数据测试
输入文件
1 1.46
1 1.45
1 1.35
1 1.41
1 1.43
1 1.40
1001 1002
1001 1002.2
1001 1002.1
1001 1002.3
1001.5 1002
k=2
输出 不解释
VL-8{n=6 c=[1.000, 1.417] r=[1:0.036]}
Weight : [props - optional]: Point:
1.0: [1.000, 1.460]
1.0: [1.000, 1.450]
1.0: [1.000, 1.350]
1.0: [1.000, 1.410]
1.0: [1.000, 1.430]
1.0: [1.000, 1.400]
VL-10{n=5 c=[1001.100, 1002.120] r=[0.200, 0.117]}
Weight : [props - optional]: Point:
1.0: [1001.000, 1002.000]
1.0: [1001.000, 1002.200]
1.0: [1001.000, 1002.100]
1.0: [1001.000, 1002.300]
1.0: [1001.500, 1002.000]
———————————————————-cf———————————————————-
bin/hadoop jar /usr/local/mahout/mahout-core-0.8-job.jar org.apache.mahout.cf.taste.hadoop.item.RecommenderJob --input /user/demo/cf-input --output /user/demo/cf-output --usersFile /user/demo/users.txt --similarityClassname SIMILARITY_COOCCURRENCE
输入文件格式
userId,itemId,preference
1,101,5.0
1,102,3.0
1,103,2.5
2,101,2.0
2,102,2.5
2,103,5.0
2,104,2.0
注意 input 的格式：不能有空行，尤其是文件最后。--usersFile 用来指定为哪些用户推荐，默认是所有用户
输出结果
1 [105:3.875,104:3.7222223,106:3.6]
2 [106:2.9285715,105:2.5833333,107:2.0]
3 [106:3.5,102:3.3333333,103:3.3125]
4 [107:4.75,105:4.3333335,102:4.111111]
5 [107:3.8333333]
———————————————————classifier——————————————————
———train data———
每个文件夹代表一类
bin/mahout seqdirectory -i c/20news-bydate-train -o c/20news-bydate-train-seq
bin/mahout seq2sparse -i c/20news-bydate-train-seq -o c/20news-bydate-train-vectors -lnorm -nv -wt tfidf
——–test data——–
bin/mahout seqdirectory -i c/20news-bydate-test -o c/20news-bydate-test-seq
bin/mahout seq2sparse -i c/20news-bydate-test-seq -o c/20news-bydate-test-vectors -lnorm -nv -wt tfidf
———train———–
bin/mahout trainnb -i c/20news-bydate-train-vectors/tfidf-vectors -el -o c/model -li c/labelindex -ow -c
这样测试（直接用训练数据本身做测试）准确率相当高，那是当然
bin/mahout testnb -i c/20news-bydate-train-vectors/tfidf-vectors -m c/model -l c/labelindex -ow -o c/20news-testing -c
=======================================================
Summary
——————————————————-
Correctly Classified Instances : 11182 98.8333%
Incorrectly Classified Instances : 132 1.1667%
Total Classified Instances : 11314
=======================================================
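用测试数据 准确率比较低？原因很可能是训练集和测试集分别执行了 seq2sparse，各自生成了独立的词典和 TF-IDF 统计，同一个词在两份向量中的特征下标不一致，模型自然无法正确分类。应先把全部数据一起向量化，再用 bin/mahout split 切分出训练集和测试集（官方 20newsgroups 示例即如此）。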
bin/mahout testnb -i c/20news-bydate-test-vectors/tfidf-vectors -m c/model -l c/labelindex -ow -o c/20news-testing -c
=======================================================
Summary
——————————————————-
Correctly Classified Instances : 340 4.5141%
Incorrectly Classified Instances : 7192 95.4859%
Total Classified Instances : 7532
=======================================================
——————————————————-fpm—————————————————-
milk,egg,bread,chip
egg,popcorn,chip,beer
egg,bread,chip
milk,egg,bread,popcorn,chip,beer
milk,bread,beer
egg,bread,beer
milk,bread,chip
milk,egg,bread,butter,chip
milk,egg,butter,chip
milk1,egg1,bread1,chip1
egg1,popcorn1,chip1,beer1
egg1,bread1,chip1
milk1,egg1,bread1,popcorn1,chip1,beer1
milk1,bread1,beer1
egg1,bread1,beer1
milk1,bread1,chip1
milk1,egg1,bread1,butter1,chip1
milk1,egg1,butter1,chip1
bin/hadoop fs -put /opt/basket.txt /user/demo/basket.txt
bin/hadoop jar /usr/local/mahout/mahout-core-0.8-job.jar org.apache.mahout.fpm.pfpgrowth.FPGrowthDriver -i /user/demo/basket.txt -o /user/demo/basket-output
bin/hadoop jar /usr/local/mahout/mahout-core-0.8-job.jar org.apache.mahout.fpm.pfpgrowth.FPGrowthDriver -i /user/demo/basket100k.dat -o /user/demo/basket-output-100k -g 2 -s 50