Skip to content

Instantly share code, notes, and snippets.

@youngsoul
Created October 25, 2023 17:19
Show Gist options
  • Save youngsoul/eb92591ee9128f680e7d08c9bcefc7e7 to your computer and use it in GitHub Desktop.
read libsvm files and create csv files from them
from typing import Tuple
import pandas as pd
def read_libsvm_file(filename) -> Tuple[list[dict], set]:
    """Parse a libsvm-format file into a list of sparse row dicts.

    Each input line looks like ``<target> <col>:<value> <col>:<value> ...``.

    Args:
        filename: Path to the libsvm file to read.

    Returns:
        A tuple ``(data, unique_columns)`` where ``data`` is a list of dicts
        mapping ``'target'`` plus each column key to its string value, and
        ``unique_columns`` is the set of all column keys seen in the file.
    """
    data = []               # one dict per non-blank input line
    unique_columns = set()  # every column key observed across all lines
    with open(filename, "r") as f:
        for line in f:
            # split() (no argument) handles repeated whitespace and drops the
            # trailing newline, unlike split(" ").
            tokens = line.split()
            if not tokens:
                # Skip blank lines instead of emitting a bogus record.
                continue
            json_record = {'target': tokens[0]}
            for token in tokens[1:]:
                if ":" in token:
                    # Split only on the first ':' so values containing ':' survive.
                    k, v = token.split(":", 1)
                    json_record[k] = v.strip()
                    unique_columns.add(k)
            data.append(json_record)
    return data, unique_columns
def main(input_file: str, output_file: str | None = None) -> pd.DataFrame:
    """Expand a sparse libsvm file into a dense DataFrame, optionally writing a CSV.

    Args:
        input_file: Path to the libsvm file to read.
        output_file: If given, the dense frame is also written here as CSV
            (header row, no index column).

    Returns:
        A DataFrame with a 'target' column plus one column per unique feature
        key found in the file; features absent from a row are filled with 0.0.
    """
    print("#" * 30)
    print(f"Process input file: {input_file}")
    # rows - all of the rows in the libsvm, where each row is a json/dict of the hashvalue column, and the value
    # unique_columns - is a set of all of the unique hashvalue columns found
    rows, unique_columns = read_libsvm_file(input_file)

    # Sort so the column order (and therefore the CSV layout) is deterministic
    # across runs; iterating a raw set gives an arbitrary order.
    columns = sorted(unique_columns)

    expanded_data = []  # dense records: every unique column present in every row
    for row in rows:
        # Densify the sparse row: columns it lacks are filled with 0.0.
        # (No need for a per-row set intersection; a membership test suffices.)
        dense = {'target': row['target']}
        for col in columns:
            dense[col] = float(row[col]) if col in row else 0.0
        expanded_data.append(dense)

    df = pd.DataFrame(expanded_data)
    if output_file:
        df.to_csv(output_file, header=True, index=False)
    return df
if __name__ == '__main__':
    base_dir = "./libsvmfiles/timebased"

    # Convert each split, writing a per-split CSV alongside.
    df_test = main(f"{base_dir}/test.libsvm", f"{base_dir}/csv/test.csv")
    df_val = main(f"{base_dir}/validation.libsvm", f"{base_dir}/csv/val.csv")
    df_train = main(f"{base_dir}/train.libsvm", f"{base_dir}/csv/train.csv")

    # concat can introduce NaNs when the splits saw different feature columns.
    # fillna returns a NEW frame (it is not in-place by default), so the result
    # must be kept — the original code discarded it and wrote NaNs to disk.
    df_all = pd.concat([df_train, df_val, df_test], ignore_index=True).fillna(0.0)
    df_all.to_csv(f"{base_dir}/csv/all_data.csv", header=True, index=False)

    df_train_val = pd.concat([df_train, df_val], ignore_index=True).fillna(0.0)
    df_train_val.to_csv(f"{base_dir}/csv/train_val.csv", header=True, index=False)

    print(f"Test DF shape: {df_test.shape}")
    print(f"Validation DF shape: {df_val.shape}")
    print(f"Train DF shape: {df_train.shape}")
    print(f"Train Val DF shape: {df_train_val.shape}")
    print(f"All data shape: {df_all.shape}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment