Created
May 16, 2018 12:27
-
-
Save ranjanan/a2a9e0bf5a32a303f4961fd581ca0f5d to your computer and use it in GitHub Desktop.
Flights Dataset Comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import DecisionTree | |
using ScikitLearn | |
@sk_import tree: DecisionTreeClassifier | |
import PyCall: PyObject | |
using JuliaDB | |
"""
    load_data()

Make sure the data files exist in the current directory, downloading any
that are missing, then load `hflights.csv` with `loadtable` and return the
resulting table.

`balance.data` is fetched as well, although nothing else in this file reads it.
"""
function load_data()
    # (local path, remote URL) pairs, fetched in this order when absent.
    sources = (
        ("balance.data",
         "https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data"),
        ("hflights.csv",
         "https://raw.githubusercontent.com/piever/JuliaDBTutorial/master/hflights.csv"),
    )
    for (path, url) in sources
        if !isfile(path)
            download(url, path)
        end
    end
    return loadtable("hflights.csv")
end
"""
    evaluate_results(truth, pred)

Compare binary predictions `pred` against the reference labels `truth`
(positive class `1`, negative class `0`), print the confusion counts
together with precision, recall and F1 via `@show`, and return the F1 score.

# Arguments
- `truth`: collection of reference labels.
- `pred`: collection of predicted labels, paired element-wise with `truth`.

# Returns
The F1 score as a floating-point number. When a denominator is zero (for
example no positive label and no positive prediction at all) the affected
ratios are `NaN`, and so is the returned score.
"""
function evaluate_results(truth, pred)
    # Number of positions whose (truth, pred) pair equals (t, p).
    # `count` is used instead of `countnz`, which was deprecated in Julia 0.7
    # and removed in 1.0; `count(pred, itr)` is available on both sides.
    tally(t, p) = count(identity, map((x, y) -> x == t && y == p, truth, pred))

    tp = tally(1, 1)
    tn = tally(0, 0)
    fn = tally(1, 0)
    fp = tally(0, 1)
    @show tp
    @show fp
    @show fn
    @show tn

    # `/` on integers yields a Float64, so 0/0 gives NaN rather than throwing.
    precision = tp / (tp + fp)
    @show precision
    recall = tp / (tp + fn)
    @show recall

    # Harmonic mean of precision and recall.
    f1 = 2 / (1 / recall + 1 / precision)
    @show f1
    return f1
end
"""
    f(a, b)

Return a two-argument predicate `(x, y)` that is `true` exactly when
`x == a` and `y == b`.
"""
function f(a, b)
    return function (x, y)
        # The second comparison is only evaluated when the first one holds.
        return x == a ? y == b : false
    end
end
"""
    train_models(features, labels)

Fit two decision-tree classifiers on the same `features` / `labels`: one
built with `DecisionTree.DecisionTreeClassifier` and one built with the
unqualified `DecisionTreeClassifier` (the name brought in by `@sk_import`
at the top of this file).

Returns the tuple `(julia_model, python_model)`.
"""
function train_models(features, labels)
    jl_tree = DecisionTree.DecisionTreeClassifier()
    DecisionTree.fit!(jl_tree, features, labels)

    py_tree = DecisionTreeClassifier()
    # The Python model's `fit` method is looked up by symbol and then called.
    py_fit = py_tree[:fit]
    py_fit(features, labels)

    return (jl_tree, py_tree)
end
# Predict with a DecisionTree.jl classifier: delegates to `DecisionTree.predict`.
run_model(m::DecisionTree.DecisionTreeClassifier, feat) = DecisionTree.predict(m, feat)
# Predict with a model held as a `PyObject`: looks up its `predict` member by
# symbol and calls it on `feat`.
function run_model(m::PyObject, feat)
    predict = m[:predict]
    return predict(feat)
end
"""
    to_matrix(data, features)

Pass every column named in `features` through `smooth_nas`, then
horizontally concatenate those columns of the resulting table with `hcat`
and return the result.
"""
function to_matrix(data, features)
    cleaned = data
    for name in features
        # `smooth_nas` dispatches on the column's container type.
        col = column(cleaned, name)
        cleaned = smooth_nas(cleaned, col, name)
    end
    cols = columns(cleaned, features)
    return reduce(hcat, cols)
end
# Fallback for a plain `Vector` column: the table is returned untouched.
# `v` and `f` are accepted only so the call shape matches the other method.
function smooth_nas(data, v::Vector{T}, f) where T
    return data
end
"""
    smooth_nas(data, v::DataValues.DataValueVector{T}, f) where T

Replace every `JuliaDB.NA` entry of column `f` in `data` with
`JuliaDB.DataValue(zero(T))`, then apply `dropna` on that column and return
the resulting table. `v` is used only for dispatch and to obtain `T`.
"""
function smooth_nas(data, v::DataValues.DataValueVector{T}, f) where T
    fill_na = x -> x == JuliaDB.NA ? JuliaDB.DataValue(zero(T)) : x
    filled = setcol(data, f, f => fill_na)
    # NOTE(review): every NA was just replaced, so `dropna` presumably only
    # unwraps the column here rather than removing rows; confirm.
    return dropna(filled, f)
end
"""
    to_vector(v::DataValues.DataValueVector{T}) where T

Replace every `JuliaDB.NA` entry of `v` with `JuliaDB.DataValue(zero(T))`,
apply `dropna`, and convert each remaining element with `Int`.
"""
function to_vector(v::DataValues.DataValueVector{T}) where T
    filled = map(v) do x
        x == JuliaDB.NA ? JuliaDB.DataValue(zero(T)) : x
    end
    return Int.(dropna(filled))
end
# Plain `Vector` input: convert every element with `Int`.
function to_vector(v::Vector{T}) where T
    return broadcast(Int, v)
end
"""
    main(; n_train = 159247)

Run the whole comparison: load the flights table, build the feature matrix
and the label vector, train the Julia and the Python decision trees on the
first `n_train` rows, predict on the remaining rows, and print the
evaluation of both models.

# Keywords
- `n_train`: number of leading rows used for training; all later rows form
  the test set. The default is the split index this script was written with.

# Returns
The tuple `(pred_julia, pred_python)` of test-set predictions.

# Throws
- `ArgumentError` if `n_train` does not leave at least one row on each side
  of the split.
"""
function main(; n_train::Integer = 159247)
    # Data
    data = load_data()

    # Features
    features = (:Year, :Month, :DayOfWeek, :DayofMonth,
                :ActualElapsedTime, :AirTime)
    label = :Cancelled

    # Features and labels
    feat = select(data, features)
    labels = column(data, label)

    # Fail early with a clear message instead of an indexing error further down.
    n_rows = length(labels)
    1 <= n_train < n_rows ||
        throw(ArgumentError("n_train must be in 1:$(n_rows - 1), got $n_train"))

    # Train and test
    feat_train = to_matrix(feat[1:n_train], features)
    labels_train = to_vector(labels[1:n_train])
    feat_test = to_matrix(feat[(n_train + 1):end], features)
    labels_test = to_vector(labels[(n_train + 1):end])
    @show "done1"

    # Training
    m_julia, m_python = train_models(feat_train, labels_train)
    @show "done2"

    # Get predictions
    pred_julia = run_model(m_julia, feat_test)
    pred_python = run_model(m_python, feat_test)
    @show "done3"

    println("Julia version: ")
    evaluate_results(labels_test, pred_julia)
    println("Python version: ")
    evaluate_results(labels_test, pred_python)

    return pred_julia, pred_python
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment