eliasah/0.question_svm_with_sgd.md

## 0.question_svm_with_sgd.md

      
    Raw
  

              0.question_svm_with_sgd.md
            
          
    Hello,
I am using a linear SVM to train my model and fit a separating line through my data. However, my model always predicts 1 for every example. Here is my code:
print data_rdd.take(5)
[LabeledPoint(1.0, [1.9643,4.5957]), LabeledPoint(1.0, [2.2753,3.8589]), LabeledPoint(1.0, [2.9781,4.5651]), LabeledPoint(1.0, [2.932,3.5519]), LabeledPoint(1.0, [3.5772,2.856])]

from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from sklearn.svm import SVC

# Each element of x_df is indexed as (features, label): x[1] is passed as the
# label and x[0] as the feature vector of the LabeledPoint.
data_rdd=x_df.map(lambda x:LabeledPoint(x[1],x[0]))
# No `intercept` or `step` argument is passed here, so the library defaults
# apply for both.
model = SVMWithSGD.train(data_rdd, iterations=1000,regParam=1)
X=x_df.map(lambda x:x[0]).collect()
Y=x_df.map(lambda x:x[1]).collect()

# Predict each collected feature vector locally, one at a time.
pred=[]
for i in X:
    pred.append(model.predict(i))
print(pred)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

  
## 1.svm_with_sgd.py
# Train a linear SVM (SGD) on data.csv and predict the label of every row.
#
# code tested with pyspark (the shell provides `sqlContext`)
# pyspark --packages com.databricks:spark-csv_2.10:1.5.0

from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.regression import LabeledPoint

# read data (header row gives the column names X, Y, label; types are inferred)
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('data.csv')

# prepare data: pack the X and Y columns into a single "features" vector column
assembler = VectorAssembler(inputCols=["X", "Y"], outputCol="features")
data = assembler.transform(df)

# create RDD[LabeledPoint]
# Go through `.rdd` explicitly: DataFrame.map only exists in Spark 1.x,
# whereas DataFrame.rdd is available in 1.3+ and later releases.
rdd = data.rdd.map(lambda row: LabeledPoint(row.label, row.features))

# Train SVM With SGD model
# Unlike a bare train(rdd, iterations=..., regParam=...) call, this fits an
# intercept term and sets an explicit step size.
model = SVMWithSGD.train(rdd, iterations=1000,regParam=1.0,intercept=True,step=0.1)

# Create unlabeled data (feature vectors only)
unlabeled_data = data.rdd.map(lambda x : x.features)

# Make prediction on unlabeled data and collect
model.predict(unlabeled_data).collect()
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]

## 2.data.csv

          
            X
            Y
            label

            
              1.9643
               4.5957
                  1

            
              2.2753
               3.8589
                  1

            
              2.9781
               4.5651
                  1

            
              2.932
               3.5519
                  1

            
              3.5772
               2.856
                  1

            
              4.015
               3.1937
                  1

            
              3.3814
               3.4291
                  1

            
              3.9113
               4.1761
                  1

            
              2.7822
               4.0431
                  1

            
              2.5518
               4.6162
                  1

            
              3.3698
               3.9101
                  1

            
              3.1048
               3.0709
                  1

            
              1.9182
               4.0534
                  1

            
              2.2638
               4.3706
                  1

            
              2.6555
               3.5008
                  1

            
              3.1855
               4.2888
                  1

            
              3.6579
               3.8692
                  1

            
              3.9113
               3.4291
                  1

            
              3.6002
               3.1221
                  1

            
              3.0357
               3.3165
                  1

            
              1.5841
               3.3575
                  0

            
              2.0103
               3.2039
                  0

            
              1.9527
               2.7843
                  0

            
              2.2753
               2.7127
                  0

            
              2.3099
               2.9584
                  0

            
              2.8283
               2.6309
                  0

            
              3.0473
               2.2931
                  0

            
              2.4827
               2.0373
                  0

            
              2.5057
               2.3853
                  0

            
              1.8721
               2.0577
                  0

            
              2.0103
               2.3546
                  0

            
              1.2269
               2.3239
                  0

            
              1.8951
               2.9174
                  0

            
              1.561
               3.0709
                  0

            
              1.5495
               2.6923
                  0

            
              1.6878
               2.4057
                  0

            
              1.4919
               2.0271
                  0

            
              0.962
               2.682
                  0

            
              1.1693
               2.9276
                  0

            
              0.8122
               2.9992
                  0

            
              0.9735
               3.3881
                  0

            
              1.25
               3.1937
                  0

            
              1.3191
               3.5109
                  0

            
              2.2292
               2.201
                  0

            
              2.4482
               2.6411
                  0

            
              2.7938
               1.9656
                  0

            
              2.091
               1.6177
                  0

            
              2.5403
               2.8867
                  0

            
              0.9044
               3.0198
                  0

            
              0.76615
               2.5899
                  0

            
              0.086405
               4.1045
                  1
	# Train a linear SVM (SGD) on data.csv and predict the label of every row.
	#
	# code tested with pyspark (the shell provides `sqlContext`)
	# pyspark --packages com.databricks:spark-csv_2.10:1.5.0

	from pyspark.ml.feature import VectorAssembler
	from pyspark.mllib.classification import SVMWithSGD
	from pyspark.mllib.regression import LabeledPoint

	# read data (header row gives the column names X, Y, label; types are inferred)
	df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('data.csv')

	# prepare data: pack the X and Y columns into a single "features" vector column
	assembler = VectorAssembler(inputCols=["X", "Y"], outputCol="features")
	data = assembler.transform(df)

	# create RDD[LabeledPoint]
	# Go through `.rdd` explicitly: DataFrame.map only exists in Spark 1.x,
	# whereas DataFrame.rdd is available in 1.3+ and later releases.
	rdd = data.rdd.map(lambda row: LabeledPoint(row.label, row.features))

	# Train SVM With SGD model
	# Unlike a bare train(rdd, iterations=..., regParam=...) call, this fits an
	# intercept term and sets an explicit step size.
	model = SVMWithSGD.train(rdd, iterations=1000,regParam=1.0,intercept=True,step=0.1)

	# Create unlabeled data (feature vectors only)
	unlabeled_data = data.rdd.map(lambda x : x.features)

	# Make prediction on unlabeled data and collect
	model.predict(unlabeled_data).collect()
	# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
X	Y	label
1.9643	4.5957	1
2.2753	3.8589	1
2.9781	4.5651	1
2.932	3.5519	1
3.5772	2.856	1
4.015	3.1937	1
3.3814	3.4291	1
3.9113	4.1761	1
2.7822	4.0431	1
2.5518	4.6162	1
3.3698	3.9101	1
3.1048	3.0709	1
1.9182	4.0534	1
2.2638	4.3706	1
2.6555	3.5008	1
3.1855	4.2888	1
3.6579	3.8692	1
3.9113	3.4291	1
3.6002	3.1221	1
3.0357	3.3165	1
1.5841	3.3575	0
2.0103	3.2039	0
1.9527	2.7843	0
2.2753	2.7127	0
2.3099	2.9584	0
2.8283	2.6309	0
3.0473	2.2931	0
2.4827	2.0373	0
2.5057	2.3853	0
1.8721	2.0577	0
2.0103	2.3546	0
1.2269	2.3239	0
1.8951	2.9174	0
1.561	3.0709	0
1.5495	2.6923	0
1.6878	2.4057	0
1.4919	2.0271	0
0.962	2.682	0
1.1693	2.9276	0
0.8122	2.9992	0
0.9735	3.3881	0
1.25	3.1937	0
1.3191	3.5109	0
2.2292	2.201	0
2.4482	2.6411	0
2.7938	1.9656	0
2.091	1.6177	0
2.5403	2.8867	0
0.9044	3.0198	0
0.76615	2.5899	0
0.086405	4.1045	1