Classification of news20.binary dataset by LogisticRegressionWithSGD (Spark 1.0 MLlib)

The dataset used in the evaluation

http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#news20.binary

head -1000 news20.binary | sed 's/+1/1/g' | sed 's/-1/0/g' > news20.binary.1000
sort -R news20.binary > news20.random
head -1000 news20.random | sed 's/+1/1/g' | sed 's/-1/0/g' > news20.random.1000

Evaluated code

import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD

//val training = MLUtils.loadLibSVMFile(sc, "hdfs://dm01:8020/dataset/news20-binary/raw/news20.binary.1000",  multiclass=false)
val training = MLUtils.loadLibSVMFile(sc, "hdfs://dm01:8020/dataset/news20-binary/raw/news20.random.1000",  multiclass=false)
// val training = MLUtils.loadLibSVMFile(sc, "hdfs://dm01:8020/dataset/news20-binary/raw/news20.random.1000",  multiclass=false, numFeatures = 1354731 , minPartitions = 32)

val numFeatures = training.take(1)(0).features.size
//numFeatures: Int = 178560 for news20.binary.1000
//numFeatures: Int = 1354731 for news20.random.1000
val model = LogisticRegressionWithSGD.train(training, numIterations=1)
myui commented Jun 17, 2014

I have evaluated LogisticRegressionWithSGD of Spark 1.0 MLlib on Hadoop 0.20.2-cdh3u6, but it does not work for a sparse dataset even though the number of training examples used in the evaluation is just 1,000.

It works fine for news20.binary.1000, which has 178,560 features. However, it does not work for news20.random.1000, where the number of features is much larger (1,354,731), even though the vectors are loaded as sparse vectors through MLUtils.loadLibSVMFile().

The execution does not seem to make progress, yet no error is reported in the spark-shell or in the stdout/stderr of the executors.

We used 32 executors, each with 7 GB of working memory (2 GB of which is for RDD storage).
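For reference, a minimal sketch of how that memory layout could map onto Spark 1.0 configuration properties; the values are inferred from the description above (7 GB per executor, roughly 2/7 of the heap for RDD storage), not the exact settings used, and the application name is made up. The number of executors (32) is determined by the cluster manager and submit options, not by these properties.

import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical configuration matching the setup described above:
// 7 GB executor heaps, with spark.storage.memoryFraction lowered so that
// roughly 2 GB of each heap is used for cached RDDs.
val conf = new SparkConf()
  .setAppName("news20-logreg-eval")             // made-up application name
  .set("spark.executor.memory", "7g")
  .set("spark.storage.memoryFraction", "0.3")   // ~2 GB of a 7 GB heap
val sc = new SparkContext(conf)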

myui commented Jun 17, 2014

aggregate at GradientDescent.scala:178 never finishes.
https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala#L178

We confirmed that GC is not happening during the aggregate.
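For context, the reduction at that line has the following shape: each task folds examples into a dense gradient accumulator of length numFeatures, and the driver then merges one such dense vector per partition. A simplified paraphrase (not the MLlib source; summing dense feature arrays here stands in for summing per-example gradients):

// Illustrative only: the accumulator is a *dense* array of length numFeatures
// (1,354,731 for news20.random.1000), and the driver receives one merged copy
// per partition in the final combine step.
val zeroGradient = new Array[Double](numFeatures)
val gradientSum = training.map(_.features.toArray).aggregate(zeroGradient)(
  seqOp = (acc, x) => { var i = 0; while (i < acc.length) { acc(i) += x(i); i += 1 }; acc },
  combOp = (a, b) => { var i = 0; while (i < a.length) { a(i) += b(i); i += 1 }; a }
)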

myui commented Jun 17, 2014

aggregate at LBFGS.scala:201 also never finishes.
https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala#L201

import org.apache.spark.SparkContext
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.optimization._

val data = MLUtils.loadLibSVMFile(sc, "hdfs://dm01:8020/dataset/news20-binary/raw/news20.random.1000", multiclass=false)
val numFeatures = data.take(1)(0).features.size

val training = data.map(x => (x.label, MLUtils.appendBias(x.features))).cache()

// Run training algorithm to build the model
val numCorrections = 10
val convergenceTol = 1e-4
val maxNumIterations = 20
val regParam = 0.1
val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1))

val (weightsWithIntercept, loss) = LBFGS.runLBFGS(
  training,
  new LogisticGradient(),
  new SquaredL2Updater(),
  numCorrections,
  convergenceTol,
  maxNumIterations,
  regParam,
  initialWeightsWithIntercept)
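
For completeness (the imports above include LogisticRegressionModel and BinaryClassificationMetrics), here is a sketch of how the returned weights could be turned into a model and scored, following the pattern in the MLlib documentation rather than code from this run:

// Split the appended bias term back off the weight vector.
val model = new LogisticRegressionModel(
  Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)),
  weightsWithIntercept(weightsWithIntercept.size - 1))

// Clear the default 0.5 threshold so predict() returns raw scores for the AUC computation.
model.clearThreshold()

// Score the training data; a held-out split would be used for a real evaluation.
val scoreAndLabels = data.map { point =>
  (model.predict(point.features), point.label)
}

val metrics = new BinaryClassificationMetrics(scoreAndLabels)
println("Area under ROC = " + metrics.areaUnderROC())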

myui commented Jun 17, 2014

Posted a question to spark-users.
http://markmail.org/message/tur2d2yu75pk5ay2

myui commented Jun 18, 2014

Dense vectors are used in the aggregation. With 32 partitions, each partition sends a dense vector of 1,354,731 doubles (roughly 10 MB) to the driver, so the driver needs 300 MB+ just to receive them.

treeReduce is reportedly being introduced to cope with this. From the reply on the mailing list:

"treeReduce (treeAggregate) is a feature I'm testing now. It is a compromise between current reduce and butterfly allReduce. The former runs in linear time on the number of partitions, the latter introduces too many dependencies. treeAggregate with depth = 2 should run in O(sqrt(n)) time, where n is the number of partitions. It would be great if someone can help test its scalability."

http://markmail.org/message/tur2d2yu75pk5ay2#query:+page:1+mid:2eimvfcsxtfmi6cl+state:results
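
A minimal sketch of what that would look like for the gradient sum, assuming RDD.treeAggregate as described in the quote (it shipped in later Spark releases); as in the earlier sketch, dense feature arrays stand in for per-example gradients:

// Same reduction shape as the flat aggregate, but partial sums are merged on
// executors in a depth-2 tree, so the driver receives only a few partially
// merged dense vectors instead of one per partition.
val gradientSum = data.map(_.features.toArray)
  .treeAggregate(new Array[Double](numFeatures))(
    seqOp = (acc, x) => { var i = 0; while (i < acc.length) { acc(i) += x(i); i += 1 }; acc },
    combOp = (a, b) => { var i = 0; while (i < a.length) { a(i) += b(i); i += 1 }; a },
    depth = 2)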

myui commented Jun 19, 2014

I figured out what the problem is: "spark.akka.frameSize" was too large. By setting spark.akka.frameSize=10, it worked for the news20 dataset.

However, execution is slow for the much larger KDD Cup 2012, Track 2 dataset (235M+ records with 16.7M+ (2^24) sparse features, 33.6 GB in total) because the dense vectors are aggregated sequentially on a single driver node.

Aggregation took 7.6 minutes per iteration on 33 nodes.
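
For reference, one way the frame-size fix could be applied (a sketch; the property has to be set before the SparkContext is created, and the application name is made up):

import org.apache.spark.{SparkConf, SparkContext}

// spark.akka.frameSize is specified in MB; 10 is the value reported to work for news20.
val conf = new SparkConf()
  .setAppName("news20-logreg")
  .set("spark.akka.frameSize", "10")
val sc = new SparkContext(conf)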

myui commented Jul 12, 2014

14/07/13 14:23:36 INFO scheduler.TaskSetManager: Serialized task 4.0:265 as 25300257 bytes in 814 ms
14/07/13 14:23:36 INFO scheduler.TaskSetManager: Starting task 4.0:205 as TID 1068 on executor 31: dc10.dbgrid.org (NODE_LOCAL)
768.026: [Full GC [PSYoungGen: 447034K->345941K(932096K)] [ParOldGen: 2784818K->2784818K(2796224K)] 3231853K->3130759K(3728320K) [PSPermGen: 71339K->71339K(71936K)], 0.7526850 secs] [Times: user=8.62 sys=0.00, real=0.76 secs]
14/07/13 14:23:37 INFO scheduler.TaskSetManager: Serialized task 4.0:205 as 25300257 bytes in 790 ms
14/07/13 14:23:37 INFO scheduler.TaskSetManager: Starting task 4.0:190 as TID 1069 on executor 9: dc18.dbgrid.org (NODE_LOCAL)
768.805: [Full GC [PSYoungGen: 433599K->365016K(932096K)] [ParOldGen: 2784818K->2784817K(2796224K)] 3218417K->3149834K(3728320K) [PSPermGen: 71339K->71339K(71936K)], 0.6894240 secs] [Times: user=7.82 sys=0.00, real=0.69 secs]
769.519: [Full GC [PSYoungGen: 452911K->370646K(932096K)] [ParOldGen: 2784817K->2784815K(2796224K)] 3237728K->3155462K(3728320K) [PSPermGen: 71339K->71339K(71936K)], 0.8372720 secs] [Times: user=9.11 sys=0.00, real=0.83 secs]
14/07/13 14:23:38 INFO scheduler.TaskSetManager: Serialized task 4.0:190 as 25300257 bytes in 1569 ms
14/07/13 14:23:38 INFO scheduler.TaskSetManager: Starting task 4.0:218 as TID 1070 on executor 21: dc12.dbgrid.org (NODE_LOCAL)
770.374: [Full GC [PSYoungGen: 433610K->389725K(932096K)] [ParOldGen: 2784815K->2784815K(2796224K)] 3218425K->3174540K(3728320K) [PSPermGen: 71339K->71339K(71936K)], 0.7696960 secs] [Times: user=8.14 sys=0.00, real=0.77 secs]
14/07/13 14:23:39 WARN storage.BlockManagerMaster: Error sending message to BlockManagerMaster in 1 attempts
akka.pattern.AskTimeoutException: Recipient[Actor[akka://spark/user/BlockManagerMaster#292206058]] had already been terminated.
        at akka.pattern.AskableActorRef$.ask$extension(AskSupport.scala:134)
        at org.apache.spark.storage.BlockManagerMaster.askDriverWithReply(BlockManagerMaster.scala:236)
        at org.apache.spark.storage.BlockManagerMaster.sendHeartBeat(BlockManagerMaster.scala:51)
        at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$heartBeat(BlockManager.scala:113)
        at org.apache.spark.storage.BlockManager$$anonfun$initialize$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(BlockManager.scala:158)
        at org.apache.spark.util.Utils$.tryOrExit(Utils.scala:790)
        at org.apache.spark.storage.BlockManager$$anonfun$initialize$1.apply$mcV$sp(BlockManager.scala:158)
        at akka.actor.Scheduler$$anon$9.run(Scheduler.scala:80)
        at akka.actor.LightArrayRevolverScheduler$$anon$3$$anon$2.run(Scheduler.scala:241)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:722)
771.159: [Full GC [PSYoungGen: 452738K->395364K(932096K)] [ParOldGen: 2784815K->2784815K(2796224K)] 3237553K->3180180K(3728320K) [PSPermGen: 71342K->71342K(71936K)], 0.8778540 secs] [Times: user=9.45 sys=0.00, real=0.87 secs]
..
14/07/13 14:23:44 ERROR actor.ActorSystemImpl: Uncaught fatal error from thread [spark-akka.actor.default-dispatcher-4] shutting down ActorSystem [spark]
java.lang.OutOfMemoryError: Java heap space
        at java.util.Arrays.copyOf(Arrays.java:2271)
        at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:113)
        at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
        at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:140)
        at java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1833)
        at java.io.ObjectOutputStream.write(ObjectOutputStream.java:686)
        at org.apache.spark.scheduler.ResultTask.writeExternal(ResultTask.scala:128)
        at java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1443)
        at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1414)
        at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1174)
        at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:346)
        at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:42)
        at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:71)
        at org.apache.spark.scheduler.Task$.serializeWithDependencies(Task.scala:132)
        at org.apache.spark.scheduler.TaskSetManager.resourceOffer(TaskSetManager.scala:415)
        at org.apache.spark.scheduler.TaskSchedulerImpl$$anonfun$resourceOffers$3$$anonfun$apply$5$$anonfun$apply$2.apply$mcVI$sp(TaskSchedulerImpl.scala:242)
        at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:141)
        at org.apache.spark.scheduler.TaskSchedulerImpl$$anonfun$resourceOffers$3$$anonfun$apply$5.apply(TaskSchedulerImpl.scala:238)
        at org.apache.spark.scheduler.TaskSchedulerImpl$$anonfun$resourceOffers$3$$anonfun$apply$5.apply(TaskSchedulerImpl.scala:235)
        at scala.collection.Iterator$class.foreach(Iterator.scala:727)
        at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
        at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
        at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
        at org.apache.spark.scheduler.TaskSchedulerImpl$$anonfun$resourceOffers$3.apply(TaskSchedulerImpl.scala:235)
        at org.apache.spark.scheduler.TaskSchedulerImpl$$anonfun$resourceOffers$3.apply(TaskSchedulerImpl.scala:235)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at org.apache.spark.scheduler.TaskSchedulerImpl.resourceOffers(TaskSchedulerImpl.scala:235)
        at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverActor.makeOffers(CoarseGrainedSchedulerBackend.scala:130)
        at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverActor$$anonfun$receive$1.applyOrElse(CoarseGrainedSchedulerBackend.scala:102)
        at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
        at akka.actor.ActorCell.invoke(ActorCell.scala:456)
14/07/13 14:23:44 INFO remote.RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.
14/07/13 14:23:44 INFO remote.RemoteActorRefProvider$RemotingTerminator: Remote daemon shut down; proceeding with flushing remote transports.
14/07/13 14:23:44 INFO Remoting: Remoting shut down
14/07/13 14:23:44 INFO remote.RemoteActorRefProvider$RemotingTerminator: Remoting shut down.
14/07/13 14:23:45 INFO network.ConnectionManager: Key not valid ? sun.nio.ch.SelectionKeyImpl@553d70ce
14/07/13 14:23:45 INFO network.ConnectionManager: Removing ReceivingConnection to ConnectionManagerId(dc08.dbgrid.org,45750)
14/07/13 14:23:45 INFO network.ConnectionManager: Removing SendingConnection to ConnectionManagerId(dc08.dbgrid.org,45750)
14/07/13 14:23:45 INFO network.ConnectionManager: Removing SendingConnection to ConnectionManagerId(dc08.dbgrid.org,45750)
14/07/13 14:23:45 INFO network.ConnectionManager: key already cancelled ? sun.nio.ch.SelectionKeyImpl@553d70ce
java.nio.channels.CancelledKeyException
        at org.apache.spark.network.ConnectionManager.run(ConnectionManager.scala:362)
        at org.apache.spark.network.ConnectionManager$$anon$4.run(ConnectionManager.scala:115)
