Skip to content

Instantly share code, notes, and snippets.

@myui
Created August 23, 2019 06:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save myui/aa6e142a95ca8f995cc8e49146dbe2eb to your computer and use it in GitHub Desktop.
Save myui/aa6e142a95ca8f995cc8e49146dbe2eb to your computer and use it in GitHub Desktop.
xgboost_iris.md

XGBoost binary classification example

prepare data

Prepare the Iris data by following https://support.treasuredata.com/hc/en-us/articles/360001260787-Iris-Multiclass-Classification-by-RandomForest

-- Populate the `input` table in the XGBoost input format
-- (see https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html).
INSERT OVERWRITE TABLE input
SELECT
  rowid,
  -- feature vector built by indexed_features from the four measurement columns
  indexed_features(sepal_length, sepal_width, petal_length, petal_width) AS features,
  -- the label needs to be 0 or 1: Iris-setosa maps to 1, everything else to 0
  CASE WHEN class = 'Iris-setosa' THEN 1 ELSE 0 END AS label
FROM
  original;

train

-- Train on the shuffled rows of `input` and overwrite the `model` table
-- with the (model_id, model) rows emitted by train_xgboost_classifier.
INSERT OVERWRITE TABLE model
SELECT
  train_xgboost_classifier(features, label) AS (model_id, model)
FROM (
  SELECT
    features,
    label
  FROM
    input
  CLUSTER BY
    rand(43) -- shuffle the training rows; rand is seeded with 43
) shuffled_input;

predict

-- Apply xgboost_predict to each (model, input) row pair, then average the
-- emitted `predicted` values per rowid.
SELECT
  scored.rowid,
  avg(scored.predicted) AS predicted
FROM (
  SELECT
    xgboost_predict(i.rowid, i.features, m.model_id, m.model) AS (rowid, predicted)
  FROM
    model m
    -- no ON clause: actually a cross join, taking model for the left side
    LEFT OUTER JOIN input i
) scored
GROUP BY
  scored.rowid
@myui
Copy link
Author

myui commented Aug 23, 2019

java.lang.RuntimeException: java.lang.NoClassDefFoundError: scala/Product$class
    at org.apache.hadoop.hive.ql.exec.mr.ExecReducer.reduce(ExecReducer.java:257)
    at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:444)
    at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:392)
    at org.apache.hadoop.mapred.LocalContainerLauncher$EventHandler.runSubtask(LocalContainerLauncher.java:414)
    at org.apache.hadoop.mapred.LocalContainerLauncher$EventHandler.runTask(LocalContainerLauncher.java:301)
    at org.apache.hadoop.mapred.LocalContainerLauncher$EventHandler.access$200(LocalContainerLauncher.java:187)
    at org.apache.hadoop.mapred.LocalContainerLauncher$EventHandler$1.run(LocalContainerLauncher.java:230)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NoClassDefFoundError: scala/Product$class
    at ml.dmlc.xgboost4j.LabeledPoint.<init>(LabeledPoint.scala:35)
    at ml.dmlc.xgboost4j.LabeledPoint.<init>(LabeledPoint.scala:41)
    at hivemall.xgboost.XGBoostUtils.parseFeatures(XGBoostUtils.java:54)
    at hivemall.xgboost.XGBoostUDTF.process(XGBoostUDTF.java:294)
    at org.apache.hadoop.hive.ql.exec.UDTFOperator.process(UDTFOperator.java:116)
    at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:897)
    at org.apache.hadoop.hive.ql.exec.SelectOperator.process(SelectOperator.java:95)
    at org.apache.hadoop.hive.ql.exec.mr.ExecReducer.reduce(ExecReducer.java:236)
    ... 11 more
Caused by: java.lang.ClassNotFoundException: scala.Product$class
    at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:338)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
    ... 19 more

@myui
Copy link
Author

myui commented Aug 23, 2019

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment