Last active
May 25, 2024 10:20
-
-
Save dacr/8c3d5e76223cf6ac67fdcdbcf0472ff5 to your computer and use it in GitHub Desktop.
Playing with smile and california housing dataset / published by https://github.com/dacr/code-examples-manager #a532819f-03fb-429b-90b2-02babad00863/21c5e05805065eab66d318ea8507f3db3635a2ae
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// summary : Playing with smile and california housing dataset | |
// keywords : smile, machine-learning, cal_housing, ai, @testable | |
// publish : gist | |
// authors : David Crosson | |
// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2) | |
// id : a532819f-03fb-429b-90b2-02babad00863 | |
// created-on : 2021-03-05T09:23:01Z | |
// managed-by : https://github.com/dacr/code-examples-manager | |
// execution : scala ammonite script (http://ammonite.io/) - run as follows: 'amm scriptname.sc'
// run-with : scala-cli $file | |
// --------------------- | |
//> using scala "3.4.2" | |
//> using dep "com.github.pathikrit::better-files:3.9.2" | |
//> using dep "com.github.haifengl:smile-scala_2.13:3.0.1" | |
//> using dep "org.bytedeco:javacpp-platform:1.5.8" | |
//> using dep "org.bytedeco:javacpp:1.5.8,classifier=linux-x86_64" | |
//> using dep "org.bytedeco:arpack-ng:3.8.0-1.5.8,classifier=linux-x86_64" | |
//> using dep "org.bytedeco:openblas:0.3.21-1.5.8,classifier=linux-x86_64" | |
//> using dep "org.slf4j:slf4j-nop:2.0.7" | |
//> using dep "com.lihaoyi::requests:0.8.0" | |
// --------------------- | |
import better.files.* | |
import scala.language.postfixOps | |
import smile.* | |
import smile.util.* | |
import smile.math.* | |
import smile.math.MathEx.* | |
import smile.math.distance.* | |
import smile.data.formula.* | |
import smile.regression.OLS | |
//import smile.plot.vega.* // FOR VEGA RENDERING | |
import smile.plot.swing.* // FOR SWING RENDERING | |
import smile.plot.show | |
import smile.plot.Render.* | |
// ===================================================================== | |
// Normalize the input data the same way the scikit-learn (Python) California housing loader does
/* | |
datasets : | |
* http://lib.stat.cmu.edu/datasets/ | |
used dataset : | |
* original data : https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz | |
+ with some adaptations : | |
- https://github.com/scikit-learn/scikit-learn/blob/95d4f0841/sklearn/datasets/_california_housing.py#L51 | |
- CaliforniaHousing/cal_housing.data | |
* https://github.com/crhodes2/Get-Rich/blob/master/tensorflow/Lib/site-packages/sklearn/datasets/descr/california_housing.rst | |
*/ | |
// Local copy of the raw dataset; it is downloaded only when it is not already present.
val infile = file"cal_housing.data"
if (infile.notExists) {
  val url = "https://gist.githubusercontent.com/dacr/4dd9b6cf55154559684f96aeeed33f64/raw/68989aefc9c483601439aeb4e8a0c49b8e2de0db/cal_housing.data"
  // Fetch the payload BEFORE touching the file system: opening the output stream first
  // creates the file, so a failing request would leave an empty cal_housing.data behind
  // and the notExists guard above would then skip the download on every later run.
  val payload = requests.get(url).bytes
  infile.writeByteArray(payload)
}
// CSV derived from the raw dataset: one header line, then one line of derived features per
// input record. It is generated only once; delete the file to force a regeneration.
val outfile = file"cal_housing.csv"
if (outfile.notExists) {
  // Positions of the raw columns, listed in the order they are consumed below.
  val columnsIndex = List(8, 7, 2, 3, 4, 5, 6, 1, 0)
  val featureNames = List("Price", "MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude")
  val rows =
    infile
      .lineIterator
      .filter(_.trim.nonEmpty) // a blank (e.g. trailing) line has no fields to index
      .map(_.split(","))
      .map(fields => columnsIndex.map(i => fields(i).toDouble))
      .map {
        // columnsIndex always yields exactly 9 values, so this pattern always matches.
        case List(price, medInc, houseAge, rooms, bedrooms, population, divisor, latitude, longitude) =>
          List(
            price / 100000d,       // Price
            medInc,                // MedInc
            houseAge,              // HouseAge
            rooms / divisor,       // AveRooms
            bedrooms / divisor,    // AveBedrms
            population,            // Population
            population / divisor,  // AveOccup
            latitude,              // Latitude
            longitude              // Longitude
          )
      }
      .map(_.mkString(","))
  // Everything is parsed before anything is written, and the file is produced by a single
  // write instead of one append call per row: a failure while parsing therefore cannot
  // leave a truncated CSV that the existence guard would later accept as complete.
  val content = (Iterator(featureNames.mkString(",")) ++ rows).mkString("", "\n", "\n")
  outfile.overwrite(content)
}
// ===================================================================== | |
// SMILE PART - using linear regression | |
// Load the generated CSV (its first line holds the column names) into a Smile data frame.
val houses = read.csv("cal_housing.csv")

// Fit an ordinary least squares model with "Price" as the response variable.
// NOTE(review): with only a left-hand side, the formula presumably uses every other
// column as a predictor - confirm against the Smile formula documentation.
val model = OLS.fit("Price" ~, houses)
println(model)

// Quick check: ask the fitted model for a prediction on the first row of the data set.
val firstHouse = houses(0)
val predicted = model.predict(firstHouse)
println(s"predicted : $predicted")
// TODO - to be continued... |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment