Last active
August 29, 2015 14:14
-
-
Save bmurzeau/04a057647330aba14224 to your computer and use it in GitHub Desktop.
PredicSis API Script for both Kaggle Give Me Some Credit challenge and KDD Cup 2008 Breast Cancer (PredicSis API vs Google Prediction)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'bundler/setup' | |
require 'benchmark' | |
require 'predicsis_ml_sdk' | |
# SDK logging: write to standard output and keep only warnings and above.
PredicsisMlSdk.configure do |sdk_settings|
  sdk_settings.logger = Logger.new(STDOUT)
  sdk_settings.logger.level = Logger::WARN
end
# Preconditions:

# API token handed to every SDK call; please visit https://developer.predicsis.com
TOKEN = 'MY_TOKEN_HERE'
# Passed as `separator:` when datasets, the dictionary and the scoreset are created.
SEPARATOR = ','
# Passed as `header:` alongside SEPARATOR.
HEADER = false
# Name of the dictionary variable that is switched to categorical and learned.
TARGET_VARIABLE = 'Var1'
# Passed as `main_modality:` when the scoreset is created.
MAIN_MODALITY = 'True'

# Storage keys of the learning and scoring files, selected by the first
# command-line argument ('kdd' or 'kaggle'). Any other value terminates the
# script with exit status 1; the message goes to standard error like every
# other diagnostic this script prints.
TRAIN_KEY, TEST_KEY = case ARGV[0]
                      when 'kdd'
                        ['blog/kdd_2008_breast_cancer_train.txt',
                         'blog/kdd_2008_breast_cancer_score.txt']
                      when 'kaggle'
                        ['blog/kaggle_credit_scoring_train.txt',
                         'blog/kaggle_credit_scoring_score.txt']
                      else
                        abort "Invalid argument: expected 'kdd' or 'kaggle'"
                      end
# Drives the PredicSis workflow: registers the learning and scoring data,
# builds a classifier for TARGET_VARIABLE and scores the scoring dataset.
#
# State is kept in class-level instance variables; the workflow is run
# through the class methods only.
class Workflow
  # Runs the whole workflow and reports timings on $stderr.
  #
  # Data initialisation (see .init) runs before the global timer starts, so
  # the reported total covers model building and prediction generation only.
  # Errors raised by the SDK propagate to the caller unchanged.
  #
  # @raise [RuntimeError] if TARGET_VARIABLE is not found in the dictionary
  # @return [nil]
  def self.perform
    $stderr.puts 'Initialize data'
    init
    global_time = Benchmark.measure do
      timed('Building model') { build_model }
      timed('Generating predictions') { generate_predictions }
    end
    $stderr.puts "Predicsis finished in: #{global_time.real} seconds"
  end

  # Creates the learning and scoring sources and datasets from the
  # 'predicsis-data-demos' bucket, using TRAIN_KEY and TEST_KEY, and waits for
  # each dataset via wait_for_result.
  #
  # Sets @learning_source, @learning_dataset, @scoring_source and
  # @scoring_dataset.
  def self.init
    # Create a learning source
    learning_source_params = { name: 'My Kdd Train Source', bucket: 'predicsis-data-demos', key: TRAIN_KEY }
    @learning_source = PredicsisMlSdk::Source.create(learning_source_params, TOKEN)
    # Create a learning dataset
    learning_dataset_params = { source_ids: [@learning_source.id], header: HEADER,
                                name: 'My Kdd Train Dataset', separator: SEPARATOR }
    @learning_dataset = PredicsisMlSdk::Dataset.create(learning_dataset_params, TOKEN)
    @learning_dataset.wait_for_result
    # Create a scoring source
    scoring_source_params = { name: 'My Kdd Test Source', bucket: 'predicsis-data-demos', key: TEST_KEY }
    @scoring_source = PredicsisMlSdk::Source.create(scoring_source_params, TOKEN)
    # Create a scoring dataset
    scoring_dataset_params = { source_ids: [@scoring_source.id], header: HEADER,
                               separator: SEPARATOR, name: 'My Kdd Test Dataset' }
    @scoring_dataset = PredicsisMlSdk::Dataset.create(scoring_dataset_params, TOKEN)
    @scoring_dataset.wait_for_result
  end

  # Prints +label+ on $stderr, runs the block, then prints how long it took.
  # The label is printed inside the measured section.
  def self.timed(label)
    time = Benchmark.measure do
      $stderr.puts label
      yield
    end
    $stderr.puts "completed in: #{time.real} seconds"
  end

  # Creates the dictionary, the preparation rules set and the classifier,
  # waiting for each one in turn.
  #
  # Sets @dictionary, @variable, @preparation_rules and @classifier.
  def self.build_model
    # Create a dictionary
    dictionary_params = { name: 'My Kdd Dictionary', dataset_id: @learning_dataset.id,
                          header: HEADER, separator: SEPARATOR }
    @dictionary = PredicsisMlSdk::Dictionary.create(dictionary_params, TOKEN)
    @dictionary.wait_for_result
    # Select and update target variable to ensure that it is categorical
    @variable = categorical_target(@dictionary.variables)
    # Create preparation rules
    preparation_rules_params = { name: 'My Kdd Preparation Rules Set',
                                 variable_id: @variable.id,
                                 dataset_id: @learning_dataset.id }
    @preparation_rules = PredicsisMlSdk::PreparationRulesSet.create(
      preparation_rules_params, TOKEN
    )
    @preparation_rules.wait_for_result
    # Create the model
    classifier_params = { type: 'classifier', title: 'My Classifier',
                          preparation_rules_set_id: @preparation_rules.id }
    @classifier = PredicsisMlSdk::Model.create(classifier_params, TOKEN)
    @classifier.wait_for_result
  end

  # Creates the modalities set for the target variable, then the scoreset
  # that applies the classifier to the scoring dataset.
  #
  # Sets @modalities_set and @scoreset.
  def self.generate_predictions
    # Create a modalities set to get all available modalities for the target variable
    modalities_set_params = { name: 'My Kdd Modalities Set',
                              variable_id: @variable.id,
                              dataset_id: @learning_dataset.id }
    @modalities_set = PredicsisMlSdk::ModalitiesSet.create(modalities_set_params, TOKEN)
    @modalities_set.wait_for_result
    # Apply a score
    scoreset_params = { name: 'My Kdd Scoreset', classifier_id: @classifier.id,
                        separator: SEPARATOR, header: HEADER,
                        dataset_id: @scoring_dataset.id,
                        modalities_set_id: @modalities_set.id,
                        main_modality: MAIN_MODALITY,
                        data_file: { filename: 'output.txt' } }
    @scoreset = PredicsisMlSdk::Dataset.create(scoreset_params, TOKEN)
    @scoreset.wait_for_result
  end

  # Finds the variable named TARGET_VARIABLE, switches it to the categorical
  # type and returns it.
  #
  # The lookup does not depend on what `update` returns, so a falsy return
  # value cannot make the target look absent.
  #
  # @raise [RuntimeError] if no variable carries that name
  def self.categorical_target(variables)
    target = variables.find { |variable| variable.name == TARGET_VARIABLE }
    raise "Target variable '#{TARGET_VARIABLE}' not found in dictionary" unless target

    target.update(type: 'categorical')
    target
  end

  private_class_method :timed, :build_model, :generate_predictions, :categorical_target
end
# Script entry point: run the workflow only when this file is executed
# directly. Any StandardError is logged through the SDK logger, followed by
# the exception's `error` value when it exposes one, and the process exits
# with status 1.
if $PROGRAM_NAME == __FILE__
  begin
    Workflow.perform
  rescue StandardError => failure
    detail = failure.respond_to?(:error) ? failure.error : nil
    PredicsisMlSdk.logger.error "#{failure.message}: #{detail}"
    exit 1
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment