Skip to content

Instantly share code, notes, and snippets.

@cwpearson
Created November 8, 2018 18:10
Show Gist options
  • Save cwpearson/f3d6d408d5277074890e0486bc5bd78c to your computer and use it in GitHub Desktop.
Save cwpearson/f3d6d408d5277074890e0486bc5bd78c to your computer and use it in GitHub Desktop.
#! /bin/env python
"""converting airline dataset from GBM-benchmarks to svm"""
import pandas as pd
# Count the input rows up front so the train/test split point is known
# before any row is converted.
with open('airline_14col.data') as f:
    num_lines = sum(1 for _ in f)
print(num_lines)

# The first 95% of the rows (rounded down) form the training set; the
# remaining rows form the test set.
train_lines = int(num_lines * 0.95)
test_lines = num_lines - train_lines

# Mode "w" creates the file or truncates an existing one, so a single open
# per output is enough. Opening each file twice (once to truncate, once to
# append) only left an extra unclosed handle behind. Both handles stay open
# for the conversion loop that follows.
train_f = open("airline_14col.data.train", "w")
test_f = open("airline_14col.data.test", "w")
# Maps each non-integer field value to the integer code it was assigned the
# first time it was seen.
codes = {}


def to_ordinal(v):
    """Return the ordinal encoding of field value ``v`` as a string.

    A value that ``int()`` can parse is normalized to that integer's decimal
    form. Any other value is treated as a categorical: it receives the next
    unused code (0, 1, 2, ...) on first sight and the same code on every
    later call. A message is printed whenever a new categorical is registered.

    Codes are stored in the module-level ``codes`` dict, which is shared by
    every call regardless of which column the value came from, so a code can
    coincide with a genuine integer value.
    """
    # Keep the try body minimal: only the int() conversion can raise here.
    try:
        return str(int(v))
    except ValueError:
        pass
    # No ``global`` statement is needed: ``codes`` is mutated in place and
    # never rebound.
    if v not in codes:
        print("new categorical:", v)
        codes[v] = len(codes)
    return str(codes[v])
# Convert each CSV row into one svm-style line: "label idx:val idx:val ...".
# Rows with index below `train_lines` go to the training file, all later
# rows go to the test file.
try:
    with open('airline_14col.data') as f:
        for li, line in enumerate(f):
            fields = line.split(",")
            # Column 13 is the target: a positive value becomes label "1",
            # anything else label "0".
            label = "1" if int(fields[13]) > 0 else "0"
            # Columns 1..12 are the features (column 0 is not used); feature
            # indices in the output are 0-based. The loop variable is named
            # `field` so it does not shadow the input file handle `f`.
            features = [
                str(i) + ":" + to_ordinal(field)
                for i, field in enumerate(fields[1:13])
            ]
            output = " ".join([label] + features) + "\n"
            if li % 10000 == 0:
                # Progress report: fraction of input rows handled so far.
                print(float(li)/num_lines)
            if li < train_lines:
                train_f.write(output)
            else:
                test_f.write(output)
finally:
    # Close the outputs explicitly so buffered rows are flushed to disk even
    # if a malformed row aborts the conversion.
    train_f.close()
    test_f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment