Script used for comparing mldata datasets to OpenML datasets.
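
A minimal invocation sketch, assuming the script is saved as
compare_mldata_openml.py (the filename is illustrative); the single argument
is the path to a folder of .arff files:

    python compare_mldata_openml.py /path/to/mldata/arff/
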
import arff
from collections import defaultdict
import logging
import openml
import os
import pandas as pd
import sys

def load_arff(file_path):
    with open(file_path, 'r') as fh:
        return arff.load(fh)
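
# For reference (per the liac-arff documentation), `arff.load` returns a plain
# dict; the keys used below are:
#   data['relation'] -> the dataset name (str)
#   data['data']     -> list of rows, with missing values loaded as None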

def get_metafeatures(data):
    """Extract (name, #instances, #features, #missing values) from arff data."""
    name = data['relation']
    instances = len(data['data'])
    features = len(data['data'][0])
    missing = len([v for row in data['data'] for v in row if v is None])
    return name, instances, features, missing
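
# Illustrative only: for a local copy of iris, one would expect something like
# get_metafeatures(load_arff('iris.arff')) == ('iris', 150, 5, 0).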

def compare(data_features, characteristics):
    """ Compares `characteristics` to `data_features`.

    :param data_features: Tuple. (name, instances, features, missing) as
        returned by `get_metafeatures`.
    :param characteristics: dict. OpenML dataset description.
    :return: Tuple[bool, float, float, float].
        True if dataset name of A is contained in B or vice versa.
        Difference in number of samples.
        Difference in number of features.
        Difference in number of missing values.
    """
    name, instances, features, missing = data_features
    return (name in characteristics['name'] or characteristics['name'] in name,
            abs(characteristics.get('NumberOfInstances', float('nan')) - instances),
            abs(characteristics.get('NumberOfFeatures', float('nan')) - features),
            abs(characteristics.get('NumberOfMissingValues', float('nan')) - missing))
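
# Note: when OpenML does not report a quality (e.g. NumberOfMissingValues),
# the corresponding difference above is NaN; since NaN never compares equal
# to 0, such datasets can only ever be matched by name further down.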

def create_df_matches(dataset_folder=None):
    """Compare each .arff file in `dataset_folder` to every OpenML dataset."""
    oml_datasets = openml.datasets.list_datasets()
    comparisons = []
    arff_files = [filepath for filepath in os.listdir(dataset_folder) if filepath.endswith('.arff')]
    for i, file_path in enumerate(arff_files):
        logging.info("[{:3d}/{:3d}] {}".format(i + 1, len(arff_files), file_path[:-5]))
        new_data = load_arff(os.path.join(dataset_folder, file_path))
        new_data_metafeatures = get_metafeatures(new_data)
        for did, oml_dataset in oml_datasets.items():
            name_match, d_instances, d_features, d_missing = compare(new_data_metafeatures, oml_dataset)
            if name_match or sum([d_instances, d_features, d_missing]) == 0:
                comparisons.append([file_path[:-5], did, name_match, d_instances, d_features, d_missing])
    return pd.DataFrame(comparisons, columns=['name', 'did', 'name_match', 'd_instances', 'd_features', 'd_missing'])
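
# At the time of writing, openml.datasets.list_datasets() returned a dict
# mapping dataset id (did) to a description dict with 'name' and quality keys
# such as 'NumberOfInstances'; newer openml-python releases may return a
# pandas DataFrame instead, in which case the loop above needs adapting.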

def move_bad_files(folder):
    """Move .arff files that liac-arff cannot parse to a `bad/` subfolder."""
    sub_folder = 'bad/'
    arff_files = [filepath for filepath in os.listdir(folder) if filepath.endswith('.arff')]
    if not os.path.exists(os.path.join(folder, sub_folder)):
        os.makedirs(os.path.join(folder, sub_folder))
    for i, file_path in enumerate(arff_files):
        try:
            load_arff(os.path.join(folder, file_path))
        except arff.ArffException as e:
            logging.info("[{:3d}/{:3d}] Moving {}, reason: {}".format(i + 1, len(arff_files), file_path[:-5], str(e)))
            # Use os.path.join(folder, sub_folder) rather than folder + 'bad/',
            # so the destination is correct whether or not `folder` ends in a separator.
            os.rename(os.path.join(folder, file_path), os.path.join(folder, sub_folder, file_path))

def get_matches_per_dataset(df, fn, exclude=()):
    """Group the dataset ids (dids) of rows for which `fn(row)` holds, per dataset name."""
    matches = defaultdict(list)
    for i, row in df.iterrows():
        if row['name'] in exclude:
            continue
        if fn(row):
            matches[row['name']].append(row['did'])
    return matches

def row_print_dict(d):
    max_len = max([len(k) for k in d])
    for k, values in d.items():
        print('{}: {}'.format(k.ljust(max_len), values))
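
# Illustrative only: row_print_dict({'anneal': [2], 'iris': [61]}) prints
#   anneal: [2]
#   iris  : [61]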

if __name__ == '__main__':
    logging.basicConfig()
    logging.getLogger().setLevel(logging.INFO)
    # sys.argv[1] is to be the path to a folder with .arff files.
    logging.info("Checking .arff files for correctness. This can take a while. "
                 "Bad files will be moved to {}bad/".format(sys.argv[1]))
    move_bad_files(sys.argv[1])
    logging.info("Checking for matches against OpenML.")
    df = create_df_matches(sys.argv[1])
    logging.info("Aggregating results...")
    matched_datasets = []

    def perfect_match(row):
        return row['name_match'] and row['d_instances'] == 0 and row['d_features'] == 0 and row['d_missing'] == 0

    perfect_matches = get_matches_per_dataset(df, fn=perfect_match, exclude=matched_datasets)
    matched_datasets += list(perfect_matches)
    print("The following datasets have matching names (A contained in B or B contained in A),"
          " and have the same number of instances, features and missing values:")
    row_print_dict(perfect_matches)

    def close_match(row):
        return row['name_match'] and sum([row['d_instances'] == 0, row['d_features'] == 0, row['d_missing'] == 0]) == 2

    close_matches = get_matches_per_dataset(df, fn=close_match, exclude=matched_datasets)
    matched_datasets += list(close_matches)
    print("The following datasets have matching names, but differ in either instances, features, or missing values:")
    row_print_dict(close_matches)

    def name_match(row):
        return row['name_match'] and sum([row['d_instances'] == 0, row['d_features'] == 0, row['d_missing'] == 0]) < 2

    name_matches = get_matches_per_dataset(df, fn=name_match, exclude=matched_datasets)
    matched_datasets += list(name_matches)
    print("The following datasets have matching names, but differ in more than one way:")
    row_print_dict(name_matches)

    def shape_match(row):
        return (not row['name_match']) and row['d_instances'] == 0 and row['d_features'] == 0 and row['d_missing'] == 0

    shape_matches = get_matches_per_dataset(df, fn=shape_match, exclude=matched_datasets)
    matched_datasets += list(shape_matches)
    print("The following datasets do not have matching names, "
          "but have the same number of instances, features and missing values:")
    row_print_dict(shape_matches)

    all_datasets = df['name'].unique()
    no_matches = [name for name in all_datasets if name not in matched_datasets]
    print("The following datasets do not match any of the above criteria:")
    for no_match in no_matches:
        print(no_match)