Skip to content

Instantly share code, notes, and snippets.

@youngsoul
Created October 25, 2023 17:19
Show Gist options
  • Save youngsoul/eb92591ee9128f680e7d08c9bcefc7e7 to your computer and use it in GitHub Desktop.
read libsvm files and create csv files from them
from typing import Tuple
import pandas as pd
def read_libsvm_file(filename) -> Tuple[list[dict], set]:
    """Parse a libsvm-format file into a list of sparse row dicts.

    Each input line looks like ``<target> <col>:<value> <col>:<value> ...``.

    Args:
        filename: Path to the libsvm file to read.

    Returns:
        A tuple ``(data, unique_columns)`` where ``data`` is a list of dicts
        mapping ``'target'`` plus each column key to its string value, and
        ``unique_columns`` is the set of all column keys seen in the file.
    """
    data = []               # one dict per non-blank input line
    unique_columns = set()  # every column key observed across all lines
    with open(filename, "r") as f:
        for line in f:
            # split() (no argument) handles repeated whitespace and drops the
            # trailing newline, unlike split(" ").
            tokens = line.split()
            if not tokens:
                # Skip blank lines instead of emitting a bogus record.
                continue
            json_record = {'target': tokens[0]}
            for token in tokens[1:]:
                if ":" in token:
                    # Split only on the first ':' so values containing ':' survive.
                    k, v = token.split(":", 1)
                    json_record[k] = v.strip()
                    unique_columns.add(k)
            data.append(json_record)
    return data, unique_columns
def main(input_file: str, output_file: str | None = None) -> pd.DataFrame:
    """Expand a sparse libsvm file into a dense DataFrame, optionally writing a CSV.

    Args:
        input_file: Path to the libsvm file to read.
        output_file: If given, the dense frame is also written here as CSV
            (header row, no index column).

    Returns:
        A DataFrame with a 'target' column plus one column per unique feature
        key found in the file; features absent from a row are filled with 0.0.
    """
    print("#" * 30)
    print(f"Process input file: {input_file}")
    # rows - all of the rows in the libsvm, where each row is a json/dict of the hashvalue column, and the value
    # unique_columns - is a set of all of the unique hashvalue columns found
    rows, unique_columns = read_libsvm_file(input_file)

    # Sort so the column order (and therefore the CSV layout) is deterministic
    # across runs; iterating a raw set gives an arbitrary order.
    columns = sorted(unique_columns)

    expanded_data = []  # dense records: every unique column present in every row
    for row in rows:
        # Densify the sparse row: columns it lacks are filled with 0.0.
        # (No need for a per-row set intersection; a membership test suffices.)
        dense = {'target': row['target']}
        for col in columns:
            dense[col] = float(row[col]) if col in row else 0.0
        expanded_data.append(dense)

    df = pd.DataFrame(expanded_data)
    if output_file:
        df.to_csv(output_file, header=True, index=False)
    return df
if __name__ == '__main__':
    base_dir = "./libsvmfiles/timebased"

    # Convert each split, writing a per-split CSV alongside.
    df_test = main(f"{base_dir}/test.libsvm", f"{base_dir}/csv/test.csv")
    df_val = main(f"{base_dir}/validation.libsvm", f"{base_dir}/csv/val.csv")
    df_train = main(f"{base_dir}/train.libsvm", f"{base_dir}/csv/train.csv")

    # concat can introduce NaNs when the splits saw different feature columns.
    # fillna returns a NEW frame (it is not in-place by default), so the result
    # must be kept — the original code discarded it and wrote NaNs to disk.
    df_all = pd.concat([df_train, df_val, df_test], ignore_index=True).fillna(0.0)
    df_all.to_csv(f"{base_dir}/csv/all_data.csv", header=True, index=False)

    df_train_val = pd.concat([df_train, df_val], ignore_index=True).fillna(0.0)
    df_train_val.to_csv(f"{base_dir}/csv/train_val.csv", header=True, index=False)

    print(f"Test DF shape: {df_test.shape}")
    print(f"Validation DF shape: {df_val.shape}")
    print(f"Train DF shape: {df_train.shape}")
    print(f"Train Val DF shape: {df_train_val.shape}")
    print(f"All data shape: {df_all.shape}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment