Skip to content

Instantly share code, notes, and snippets.

@ajsaraujo
Created June 18, 2020 14:51
Show Gist options
  • Save ajsaraujo/d3db91f55d21bc9cb59650191560f6d5 to your computer and use it in GitHub Desktop.
Save ajsaraujo/d3db91f55d21bc9cb59650191560f6d5 to your computer and use it in GitHub Desktop.
Script to min-max normalize a dataset in Python
import sys
def read_file(file_path):
    """Load a whitespace-separated numeric table from a text file.

    Every non-blank line becomes one row of floats. Blank or
    whitespace-only lines (for example a trailing empty line at the end
    of the file) are skipped so they never produce empty rows.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of rows in file order, each row being a list of floats.

    Raises:
        ValueError: If a token on a line cannot be converted to float.
        OSError: If the file cannot be opened for reading.
    """
    dataset = []
    with open(file_path, "r") as handle:
        for line in handle:
            # str.split() with no separator already drops every kind of
            # whitespace, including the trailing newline, so the tokens
            # need no further filtering.
            row = [float(token) for token in line.split()]
            if row:
                dataset.append(row)
    return dataset
def treat_negative_entries(dataset):
    """Shift columns containing negative values so their minimum becomes 0.

    For each column whose smallest entry is negative, the absolute value
    of that entry is added to every entry of the column. Columns without
    negative values are left unchanged.

    Args:
        dataset: List of rows (lists of numbers). The number of columns
            is taken from the first row. The list is modified in place.

    Returns:
        The same ``dataset`` object, after the in-place shift. An empty
        dataset is returned unchanged.
    """
    if not dataset:
        # Nothing to inspect; avoids indexing dataset[0] on an empty list.
        return dataset

    for j in range(len(dataset[0])):
        smallest_entry = min(row[j] for row in dataset)
        if smallest_entry < 0:
            offset = -smallest_entry
            for row in dataset:
                row[j] += offset
    return dataset
def normalize_value(value, smallest_value, delta):
    """Min-max scale a single value.

    Args:
        value: The number to scale.
        smallest_value: Minimum of the range the value belongs to.
        delta: Width of that range (maximum minus minimum).

    Returns:
        ``(value - smallest_value) / delta``. When ``delta`` is zero the
        range is degenerate (all values equal), so 0.0 is returned
        instead of raising ZeroDivisionError.
    """
    if delta == 0:
        return 0.0
    return (value - smallest_value) / delta
def normalize_dataset(dataset):
    """Min-max normalize every column of the dataset in place.

    Each column is rescaled with ``normalize_value`` using that column's
    own minimum and range. A constant column (range of zero) has no
    meaningful scale, so all of its entries are set to 0.0 rather than
    dividing by zero.

    Args:
        dataset: List of rows (lists of numbers). The number of columns
            is taken from the first row. The list is modified in place.

    Returns:
        The same ``dataset`` object, after the in-place normalization.
        An empty dataset is returned unchanged.
    """
    if not dataset:
        # Nothing to normalize; avoids indexing dataset[0] on an empty list.
        return dataset

    for j in range(len(dataset[0])):
        column = [row[j] for row in dataset]
        smallest_entry = min(column)
        delta = max(column) - smallest_entry

        if delta == 0:
            # Constant column: skip the division entirely.
            for row in dataset:
                row[j] = 0.0
            continue

        for row in dataset:
            row[j] = normalize_value(row[j], smallest_entry, delta)
    return dataset
def to_string(dataset):
    """Render the dataset as text with four decimal places per value.

    Values within a row are separated by a single space and rows are
    separated by a newline; there is no trailing newline. The input is
    not modified, so the function can safely be called more than once on
    the same dataset.

    Args:
        dataset: List of rows (lists of numbers).

    Returns:
        The formatted table as a single string; an empty string for an
        empty dataset.
    """
    lines = [
        " ".join("%.4f" % value for value in row)
        for row in dataset
    ]
    return "\n".join(lines)
def write_output(dataset_string, file_path):
    """Write ``dataset_string`` to ``file_path``.

    The file is opened in text write mode, so it is created if missing
    and any existing content is replaced.
    """
    with open(file_path, "w") as output_file:
        output_file.write(dataset_string)
if __name__ == "__main__":
    # Guarded so that importing this module does not read sys.argv or
    # touch any files; the pipeline only runs when executed as a script.
    if len(sys.argv) < 3:
        # Fail with a usage message instead of a bare IndexError.
        sys.exit("usage: python %s <input_file> <output_file>" % sys.argv[0])

    input_file_path = sys.argv[1]
    output_file_path = sys.argv[2]

    # Pipeline: read -> shift negative columns -> min-max scale -> write.
    dataset = read_file(input_file_path)
    non_negative = treat_negative_entries(dataset)
    normalized = normalize_dataset(non_negative)
    dataset_string = to_string(normalized)
    write_output(dataset_string, output_file_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment