Script used for comparing mldata datasets to OpenML datasets.
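
A minimal invocation sketch, assuming the script is saved as
compare_mldata_openml.py (the filename is illustrative); the single argument
is the path to a folder of .arff files:

    python compare_mldata_openml.py /path/to/mldata/arff/
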
import arff
from collections import defaultdict
import logging
import openml
import os
import pandas as pd
import sys

def load_arff(file_path):
    with open(file_path, 'r') as fh:
        return arff.load(fh)
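
# For reference (per the liac-arff documentation), `arff.load` returns a plain
# dict; the keys used below are:
#   data['relation'] -> the dataset name (str)
#   data['data']     -> list of rows, with missing values loaded as None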

def get_metafeatures(data):
    """Extract (name, #instances, #features, #missing values) from arff data."""
    name = data['relation']
    instances = len(data['data'])
    features = len(data['data'][0])
    missing = len([v for row in data['data'] for v in row if v is None])
    return name, instances, features, missing
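
# Illustrative only: for a local copy of iris, one would expect something like
# get_metafeatures(load_arff('iris.arff')) == ('iris', 150, 5, 0).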

def compare(data_features, characteristics):
    """ Compares `characteristics` to `data_features`.

    :param data_features: Tuple. (name, instances, features, missing) as
        returned by `get_metafeatures`.
    :param characteristics: dict. OpenML dataset description.
    :return: Tuple[bool, float, float, float].
        True if dataset name of A is contained in B or vice versa.
        Difference in number of samples.
        Difference in number of features.
        Difference in number of missing values.
    """
    name, instances, features, missing = data_features
    return (name in characteristics['name'] or characteristics['name'] in name,
            abs(characteristics.get('NumberOfInstances', float('nan')) - instances),
            abs(characteristics.get('NumberOfFeatures', float('nan')) - features),
            abs(characteristics.get('NumberOfMissingValues', float('nan')) - missing))
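
# Note: when OpenML does not report a quality (e.g. NumberOfMissingValues),
# the corresponding difference above is NaN; since NaN never compares equal
# to 0, such datasets can only ever be matched by name further down.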

def create_df_matches(dataset_folder=None):
    """Compare each .arff file in `dataset_folder` to every OpenML dataset."""
    oml_datasets = openml.datasets.list_datasets()
    comparisons = []
    arff_files = [filepath for filepath in os.listdir(dataset_folder) if filepath.endswith('.arff')]
    for i, file_path in enumerate(arff_files):
        logging.info("[{:3d}/{:3d}] {}".format(i + 1, len(arff_files), file_path[:-5]))
        new_data = load_arff(os.path.join(dataset_folder, file_path))
        new_data_metafeatures = get_metafeatures(new_data)
        for did, oml_dataset in oml_datasets.items():
            name_match, d_instances, d_features, d_missing = compare(new_data_metafeatures, oml_dataset)
            if name_match or sum([d_instances, d_features, d_missing]) == 0:
                comparisons.append([file_path[:-5], did, name_match, d_instances, d_features, d_missing])
    return pd.DataFrame(comparisons, columns=['name', 'did', 'name_match', 'd_instances', 'd_features', 'd_missing'])
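
# At the time of writing, openml.datasets.list_datasets() returned a dict
# mapping dataset id (did) to a description dict with 'name' and quality keys
# such as 'NumberOfInstances'; newer openml-python releases may return a
# pandas DataFrame instead, in which case the loop above needs adapting.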

def move_bad_files(folder):
    """Move .arff files that liac-arff cannot parse to a `bad/` subfolder."""
    sub_folder = 'bad/'
    arff_files = [filepath for filepath in os.listdir(folder) if filepath.endswith('.arff')]
    if not os.path.exists(os.path.join(folder, sub_folder)):
        os.makedirs(os.path.join(folder, sub_folder))
    for i, file_path in enumerate(arff_files):
        try:
            load_arff(os.path.join(folder, file_path))
        except arff.ArffException as e:
            logging.info("[{:3d}/{:3d}] Moving {}, reason: {}".format(i + 1, len(arff_files), file_path[:-5], str(e)))
            # Use os.path.join(folder, sub_folder) rather than folder + 'bad/',
            # so the destination is correct whether or not `folder` ends in a separator.
            os.rename(os.path.join(folder, file_path), os.path.join(folder, sub_folder, file_path))

def get_matches_per_dataset(df, fn, exclude=()):
    """Group the dataset ids (dids) of rows for which `fn(row)` holds, per dataset name."""
    matches = defaultdict(list)
    for i, row in df.iterrows():
        if row['name'] in exclude:
            continue
        if fn(row):
            matches[row['name']].append(row['did'])
    return matches

def row_print_dict(d):
    max_len = max([len(k) for k in d])
    for k, values in d.items():
        print('{}: {}'.format(k.ljust(max_len), values))
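
# Illustrative only: row_print_dict({'anneal': [2], 'iris': [61]}) prints
#   anneal: [2]
#   iris  : [61]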

if __name__ == '__main__':
    logging.basicConfig()
    logging.getLogger().setLevel(logging.INFO)
    # sys.argv[1] is to be the path to a folder with .arff files.
    logging.info("Checking .arff files for correctness. This can take a while. "
                 "Bad files will be moved to {}bad/".format(sys.argv[1]))
    move_bad_files(sys.argv[1])
    logging.info("Checking for matches against OpenML.")
    df = create_df_matches(sys.argv[1])
    logging.info("Aggregating results...")
    matched_datasets = []

    def perfect_match(row):
        return row['name_match'] and row['d_instances'] == 0 and row['d_features'] == 0 and row['d_missing'] == 0

    perfect_matches = get_matches_per_dataset(df, fn=perfect_match, exclude=matched_datasets)
    matched_datasets += list(perfect_matches)
    print("The following datasets have matching names (A contained in B or B contained in A),"
          " and have the same number of instances, features and missing values:")
    row_print_dict(perfect_matches)

    def close_match(row):
        return row['name_match'] and sum([row['d_instances'] == 0, row['d_features'] == 0, row['d_missing'] == 0]) == 2

    close_matches = get_matches_per_dataset(df, fn=close_match, exclude=matched_datasets)
    matched_datasets += list(close_matches)
    print("The following datasets have matching names, but differ in either instances, features, or missing values:")
    row_print_dict(close_matches)

    def name_match(row):
        return row['name_match'] and sum([row['d_instances'] == 0, row['d_features'] == 0, row['d_missing'] == 0]) < 2

    name_matches = get_matches_per_dataset(df, fn=name_match, exclude=matched_datasets)
    matched_datasets += list(name_matches)
    print("The following datasets have matching names, but differ in more than one way:")
    row_print_dict(name_matches)

    def shape_match(row):
        return (not row['name_match']) and row['d_instances'] == 0 and row['d_features'] == 0 and row['d_missing'] == 0

    shape_matches = get_matches_per_dataset(df, fn=shape_match, exclude=matched_datasets)
    matched_datasets += list(shape_matches)
    print("The following datasets do not have matching names, "
          "but have the same number of instances, features and missing values:")
    row_print_dict(shape_matches)

    all_datasets = df['name'].unique()
    no_matches = [name for name in all_datasets if name not in matched_datasets]
    print("The following datasets do not match any of the above criteria:")
    for no_match in no_matches:
        print(no_match)