Skip to content

Instantly share code, notes, and snippets.

@goyder
Created February 12, 2021 14:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save goyder/2738a0f8f2e8a044c2f5c42e2bc8664a to your computer and use it in GitHub Desktop.
Save goyder/2738a0f8f2e8a044c2f5c42e2bc8664a to your computer and use it in GitHub Desktop.
import fileinput
# Config
data_folder = "../data"
input_filename = "iris.data"
output_filename = "processed.iris.data"
classes_to_include = ["Iris-setosa", "Iris-versicolor"]

# Filepaths
input_data_filepath = data_folder + "/" + input_filename
output_data_filepath = data_folder + "/" + output_filename


def parse_rows(lines):
    """Split comma-separated text lines into lists of fields.

    Args:
        lines: Iterable of text lines (for example an open file object).

    Returns:
        A list with one list of string fields per data line. Lines that
        yield a single field after splitting on "," (blank lines, for
        instance) are skipped.
    """
    rows = []
    for line in lines:
        # Strip the whole line terminator, including the "\r" of a CRLF
        # file, so the class name in the last field compares cleanly.
        fields = line.rstrip("\r\n").split(",")
        if len(fields) > 1:
            rows.append(fields)
    return rows


def index_classes(rows):
    """Map each distinct class name to a numeric label.

    The class name is taken from the last field of each row. Labels are
    assigned in order of first appearance, starting at "0", and are stored
    as strings so they can be joined straight back into an output line.

    Args:
        rows: List of field lists as returned by ``parse_rows``.

    Returns:
        A dict mapping class name to its label string.
    """
    classes = {}
    for row in rows:
        class_name = row[-1]
        if class_name not in classes:
            # The next free label is the number of classes seen so far.
            classes[class_name] = str(len(classes))
    return classes


def encode_rows(rows, classes, included):
    """Build the output lines for the rows whose class is in ``included``.

    Args:
        rows: List of field lists; the last field is the class name.
        classes: Dict mapping class name to label string.
        included: Collection of class names to keep.

    Returns:
        A list of newline-terminated, comma-separated lines in which the
        class name has been replaced by its label. ``rows`` is not modified.
    """
    return [
        ",".join(row[:-1] + [classes[row[-1]]]) + "\n"
        for row in rows
        if row[-1] in included
    ]


def main():
    """Read the input dataset, report its classes, write the filtered copy."""
    # Read in dataset
    print("Inputting data from '" + input_data_filepath + "'")
    with open(input_data_filepath) as input_file:
        rows = parse_rows(input_file)

    # Identify all the class names
    classes = index_classes(rows)
    print("Classes identified: ")
    print(classes)

    # Output dataset
    print("Outputting data to '" + output_data_filepath + "'")
    output_lines = encode_rows(rows, classes, classes_to_include)
    # The with-block closes the file even if a write fails.
    with open(output_data_filepath, "w") as output_file:
        output_file.writelines(output_lines)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment