Skip to content

Instantly share code, notes, and snippets.

@rikturr
Created March 31, 2018 18:17
Show Gist options
  • Save rikturr/9bd22c1b26f39f8931339d83a3adf3f1 to your computer and use it in GitHub Desktop.
Save rikturr/9bd22c1b26f39f8931339d83a3adf3f1 to your computer and use it in GitHub Desktop.
Convert a folder of gzipped libsvm text files (*.txt.gz) into a single sparse scipy matrix saved in .npz format, plus a label array saved in .npy format
from glob import glob
import argparse
import os
import scipy.sparse as sp
import numpy as np
from sklearn.datasets import load_svmlight_file
def parse_args(argv=None):
    """Parse the command-line arguments of the libsvm-to-npz converter.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``path``, ``out``,
        ``n_features`` (int), ``x_name`` and ``y_name``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="Path to libsvm folder")
    parser.add_argument("out", help="Output path")
    parser.add_argument("n_features", type=int, help="Number of features")
    parser.add_argument("--x_name", default='features',
                        help="Base name of the output .npz feature file")
    parser.add_argument("--y_name", default='labels',
                        help="Base name of the output .npy label file")
    return parser.parse_args(argv)
# Script body: load every '*.txt.gz' libsvm file found in args.path, stack the
# feature matrices row-wise, concatenate the label arrays, and write both into
# the args.out directory.
args = parse_args()

# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists.
os.makedirs(args.out, exist_ok=True)

# glob() returns matches in arbitrary order; sort so that the row order of the
# saved features/labels is reproducible between runs.
libsvm_files = sorted(glob(os.path.join(args.path, '*.txt.gz')))
if not libsvm_files:
    # Fail with an explicit message instead of an opaque error from vstack
    # on an empty list.
    raise SystemExit('No *.txt.gz files found in: {}'.format(args.path))

x_list = []
y_list = []
for f in libsvm_files:
    print('Loading file: {}'.format(f))
    # The same n_features is passed for every file so that the per-file
    # matrices have matching column counts and can be stacked below.
    x_part, y_part = load_svmlight_file(f, n_features=args.n_features)
    x_list.append(x_part)
    y_list.append(y_part)

# Rows of x and entries of y stay aligned because both lists were filled in
# the same file order.
x = sp.vstack(x_list)
y = np.concatenate(y_list, axis=0)

x_path = os.path.join(args.out, '{}.npz'.format(args.x_name))
y_path = os.path.join(args.out, '{}.npy'.format(args.y_name))
sp.save_npz(x_path, x)
np.save(y_path, y)
print('Saved numpy files: {}, {}'.format(x_path, y_path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment