Skip to content

Instantly share code, notes, and snippets.

@ranjanan
Created May 16, 2018 12:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ranjanan/a2a9e0bf5a32a303f4961fd581ca0f5d to your computer and use it in GitHub Desktop.
Save ranjanan/a2a9e0bf5a32a303f4961fd581ca0f5d to your computer and use it in GitHub Desktop.
Flights Dataset Comparison
import DecisionTree
using ScikitLearn
@sk_import tree: DecisionTreeClassifier
import PyCall: PyObject
using JuliaDB
"""
    load_data()

Fetch any data file that is not already present in the working directory, then
return the table obtained by calling `loadtable` on `"hflights.csv"`.
"""
function load_data()
    # (local path, remote URL) pairs; each file is downloaded only when missing.
    sources = (
        ("balance.data",
         "https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data"),
        ("hflights.csv",
         "https://raw.githubusercontent.com/piever/JuliaDBTutorial/master/hflights.csv")
    )
    for (path, url) in sources
        if !isfile(path)
            download(url, path)
        end
    end
    return loadtable("hflights.csv")
end
"""
    evaluate_results(truth, pred)

Print the confusion-matrix counts, precision, recall and F1 score of the binary
predictions `pred` against the ground truth `truth`, and return the F1 score.

Labels are compared against the literals `1` (positive) and `0` (negative); pairs
holding any other value are not counted in any cell.

# Throws
- `DimensionMismatch` if `truth` and `pred` differ in length.

# Notes
Precision, recall and F1 are computed with floating-point division, so a zero
denominator (for example, no positive predictions) yields `NaN` rather than an
error.
"""
function evaluate_results(truth, pred)
    # Scoring silently truncated inputs would produce misleading metrics.
    length(truth) == length(pred) || throw(DimensionMismatch(
        "truth has $(length(truth)) elements but pred has $(length(pred))"))

    # `count` over the zipped pairs avoids materialising a temporary Bool array per
    # cell and does not depend on `countnz`, which newer Julia versions removed.
    tally(matches) = count(pair -> matches(pair[1], pair[2]), zip(truth, pred))

    tp = tally(f(1, 1))
    tn = tally(f(0, 0))
    fn = tally(f(1, 0))
    fp = tally(f(0, 1))
    @show tp
    @show fp
    @show fn
    @show tn

    precision = tp / (tp + fp)
    @show precision
    recall = tp / (tp + fn)
    @show recall
    # Harmonic mean of precision and recall.
    f1 = 2 / (1 / recall + 1 / precision)
    @show f1
    return f1
end

"""
    f(a, b)

Return a two-argument predicate `(x, y)` that is `true` exactly when `x == a` and
`y == b`.
"""
f(a, b) = (x, y) -> x == a && y == b
"""
    train_models(features, labels)

Fit two decision-tree classifiers on the same `features` and `labels` and return
them as the tuple `(julia_model, python_model)`.
"""
function train_models(features, labels)
    # Module-qualified name: the DecisionTree package's classifier.
    julia_tree = DecisionTree.DecisionTreeClassifier()
    DecisionTree.fit!(julia_tree, features, labels)

    # Unqualified name: the classifier brought in through `@sk_import tree`.
    sklearn_tree = DecisionTreeClassifier()
    sklearn_tree[:fit](features, labels)

    return julia_tree, sklearn_tree
end
"""
    run_model(m::DecisionTree.DecisionTreeClassifier, feat)

Return the predictions of the DecisionTree model `m` for the features `feat`.
"""
run_model(m::DecisionTree.DecisionTreeClassifier, feat) = DecisionTree.predict(m, feat)
"""
    run_model(m::PyObject, feat)

Return the result of calling the Python object's `predict` method on `feat`.
"""
run_model(m::PyObject, feat) = m[:predict](feat)
"""
    to_matrix(data, features)

Pass every column named in `features` through `smooth_nas`, then horizontally
concatenate those columns of the resulting table with `hcat`.
"""
function to_matrix(data, features)
    cleaned = data
    for name in features
        # `smooth_nas` dispatches on the column's storage type, so the column is
        # re-read from the table produced by the previous iteration.
        col = column(cleaned, name)
        cleaned = smooth_nas(cleaned, col, name)
    end
    return reduce(hcat, columns(cleaned, features))
end
"""
    smooth_nas(data, v::Vector, f)

Return `data` unchanged; this method handles columns stored as a plain `Vector`.
"""
function smooth_nas(data, v::Vector{T}, f) where T
    return data
end
"""
    smooth_nas(data, v::DataValues.DataValueVector{T}, f)

Return a table in which column `f` of `data` has every entry equal to
`JuliaDB.NA` replaced by `zero(T)`, and has then been passed through `dropna`.
"""
function smooth_nas(data, v::DataValues.DataValueVector{T}, f) where T
    # Substitute a wrapped zero of the column's element type for each NA entry.
    fill_na = x -> x == JuliaDB.NA ? JuliaDB.DataValue(zero(T)) : x
    filled = setcol(data, f, f => fill_na)
    return dropna(filled, f)
end
"""
    to_vector(v::DataValues.DataValueVector{T})

Replace every entry of `v` equal to `JuliaDB.NA` with `zero(T)`, pass the result
through `dropna`, and convert each element to `Int`.
"""
function to_vector(v::DataValues.DataValueVector{T}) where T
    fill_na = x -> x == JuliaDB.NA ? JuliaDB.DataValue(zero(T)) : x
    filled = map(fill_na, v)
    return broadcast(Int, dropna(filled))
end
"""
    to_vector(v::Vector)

Return a new vector with every element of `v` converted to `Int`.
"""
function to_vector(v::Vector{T}) where T
    return broadcast(Int, v)
end
"""
    main()

Run the comparison end to end: load the flights table, split it into a training
and a test portion, fit both decision-tree models, print evaluation metrics for
each, and return the tuple `(pred_julia, pred_python)` of test-set predictions.
"""
function main()
    table = load_data()

    # Predictor columns and the target column.
    feature_names = (:Year, :Month, :DayOfWeek, :DayofMonth,
                     :ActualElapsedTime, :AirTime)
    label_name = :Cancelled

    feature_table = select(table, feature_names)
    label_column = column(table, label_name)

    # Rows 1:ntrain are used for training; every later row is used for testing.
    ntrain = 159247
    train_x = to_matrix(feature_table[1:ntrain], feature_names)
    train_y = to_vector(label_column[1:ntrain])
    test_x = to_matrix(feature_table[(ntrain + 1):end], feature_names)
    test_y = to_vector(label_column[(ntrain + 1):end])
    @show "done1"

    julia_model, python_model = train_models(train_x, train_y)
    @show "done2"

    julia_pred = run_model(julia_model, test_x)
    python_pred = run_model(python_model, test_x)
    @show "done3"

    println("Julia version: ")
    evaluate_results(test_y, julia_pred)
    println("Python version: ")
    evaluate_results(test_y, python_pred)

    return julia_pred, python_pred
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment