Last active
August 29, 2015 13:57
-
-
Save tuxdna/9808278 to your computer and use it in GitHub Desktop.
Fuzzy / KMeans Clustering on Reuters corpus using Mahout 0.7
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# WORKFLOW: seqdirectory -> seq2sparse -> {kmeans, fkmeans} -> clusterdump
# (a dirichlet run follows the fuzzy k-means section)

# Stop at the first failing step or unset variable: every later stage
# consumes the output of the previous one, so continuing after a failure
# only produces misleading downstream errors.
set -eu

# User name the Hadoop client presents to the cluster.
export HADOOP_USER_NAME=hduser

## Download, unzip, and extract reuters text data from SGML files
WORK_DIR=/tmp/clustering/reuters
mkdir -p "$WORK_DIR"
# -f: fail on HTTP errors instead of saving the error page as the tarball.
# -L: follow redirects.
curl -fL http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz \
  -o "${WORK_DIR}/reuters21578.tar.gz"
mkdir -p "${WORK_DIR}/reuters-sgm"
tar xzf "${WORK_DIR}/reuters21578.tar.gz" -C "${WORK_DIR}/reuters-sgm"

# Root directory for all clustering inputs/outputs on the Hadoop filesystem.
CLUSTERING_DIR=/data/clustering

# Convert the SGML files into one plain-text file per article (local disk).
mahout org.apache.lucene.benchmark.utils.ExtractReuters \
  "file://${WORK_DIR}/reuters-sgm" "file://${WORK_DIR}/reuters-out"

# Upload the extracted text to the Hadoop filesystem.
# NOTE(review): later steps read /data/clustering/reuters-out, which assumes
# $CLUSTERING_DIR already exists so that reuters-out is copied *into* it
# rather than being renamed to it -- confirm on the target cluster.
hadoop fs -copyFromLocal "file://${WORK_DIR}/reuters-out" "$CLUSTERING_DIR"
## Convert the extracted text into TF-IDF vectors
ANALYZER=org.apache.lucene.analysis.WhitespaceAnalyzer
# Distance measure shared by the kmeans and fkmeans runs.
DISTMETRIC=org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure
TXTDIR=/data/clustering/reuters-out
SEQDIR=/data/clustering/reuters-out-seqdir
SPARSEDIR=/data/clustering/reuters-out-sparse
TFIDF_VEC="$SPARSEDIR/tfidf-vectors"
# Kept as a literal pattern: the '*' is meant to be resolved by Mahout/Hadoop
# against the job filesystem, not by the local shell, so always quote "$DICT".
DICT="$SPARSEDIR/dictionary.file-*"

# Pack the text files into SequenceFiles, then vectorize them with TF-IDF
# weighting (-wt tfidf). See `mahout seq2sparse --help` for the other flags.
mahout seqdirectory -c UTF-8 -i "$TXTDIR" -o "$SEQDIR"
mahout seq2sparse -chunk 200 -wt tfidf -s 5 -md 3 -x 90 -ng 2 -ml 50 -seq -n 2 -nr 5 \
  -a "$ANALYZER" -i "$SEQDIR" -o "$SPARSEDIR"
## KMeans clustering
INITCLUSTERS=/data/clustering/reuters-out-kmeans-initialclusters
CLUSTERS=/data/clustering/reuters-out-kmeans-clusters
# Literal pattern for the final-iteration directory; quoted at every use so
# the local shell never expands it before Mahout sees it.
FINAL_CLUST="$CLUSTERS/clusters-*-final"

# Cluster the TF-IDF vectors (-k 20 clusters, at most -x 20 iterations).
mahout kmeans -cd 1.0 -k 20 -x 20 -dm "$DISTMETRIC" \
  -i "$TFIDF_VEC" -c "$INITCLUSTERS" -o "$CLUSTERS"

# Write a readable dump of the final clusters to the local working directory.
mahout clusterdump -b 10 -n 10 -dt sequencefile -d "$DICT" \
  -i "$FINAL_CLUST" -o ./reuters-kmeans-dump
## Fuzzy KMeans clustering
F_INITCLUSTERS=/data/clustering/reuters-out-fkmeans-initialclusters
F_CLUSTERS=/data/clustering/reuters-out-fkmeans-clusters
# Literal pattern for the final-iteration directory; quoted at every use so
# the local shell never expands it before Mahout sees it.
F_FINAL_CLUST="$F_CLUSTERS/clusters-*-final"

# Fuzzy k-means over the same TF-IDF vectors; -ow overwrites earlier output.
mahout fkmeans -cd 1.0 -k 21 -m 2 -ow -x 10 -dm "$DISTMETRIC" \
  -i "$TFIDF_VEC" -c "$F_INITCLUSTERS" -o "$F_CLUSTERS"

# Write a readable dump of the final clusters to the local working directory.
mahout clusterdump -b 10 -n 10 -dt sequencefile -d "$DICT" \
  -i "$F_FINAL_CLUST" -o ./reuters-fkmeans-dump
## Dirichlet clustering
MODEL_DISTRIBUTION=org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution
MODEL_PROTOTYPE=org.apache.mahout.math.SequentialAccessSparseVector
DIRICHLET_CLUSTERS=/data/clustering/reuters-out-dirichlet-clusters

# Dirichlet clustering over the same TF-IDF vectors. No clusterdump step
# follows, so the results stay in $DIRICHLET_CLUSTERS.
mahout dirichlet -k 60 -x 10 -a0 1.0 -md "$MODEL_DISTRIBUTION" -mp "$MODEL_PROTOTYPE" \
  -i "$TFIDF_VEC" -o "$DIRICHLET_CLUSTERS"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment