belsander/football-data.co.uk_convert.py

## football-data.co.uk_convert.py
#!/usr/bin/python

"""football-data.co.uk_convert.py: Convert xslx files to one or more workable csv file(s)

Copyright (C) 2016  Sander Bel

This program is free software: you can redistribute it and/or modify it under the terms of the
GNU General Public License as published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License along with this program.  If not,
see <http://www.gnu.org/licenses/>.
"""

__author__ = "Sander Bel"
__copyright__ = "Copyright (C) 2016  Sander Bel"
__credits__ = ["Sander Bel"]
__licence__ = "GPLv3"
__version__ = "1.0.0"
__maintainer__ = "Sander Bel"
__email__ = "sander@proapps.be"
__status__ = "Production"


from argparse import ArgumentParser, RawTextHelpFormatter
from glob import glob
from os.path import splitext
from xlrd import open_workbook
from csv import writer, QUOTE_NONE
from zipfile import ZipFile


# GLOBALS
ALL_COMBINED        = False
SORT_HEADERS        = False
EXTRACT_ZIP         = False
TRANSPOSE           = False
EXT_CSV             = ".csv"
EXT_XSL             = ".xls"
EXT_ZIP             = ".zip"
CONVERT_CSV_EXT     = "_converted%s" % (EXT_CSV)
COMMA_SIGN          = ","
DOT_SIGN            = "."
CSV_DELIMITER       = COMMA_SIGN
OUTPUT_DECIMAL_SIGN = DOT_SIGN
NULL_VALUE          = "0"
ENCODING            = "UTF-8"
HEADERS             = ["Div", "Date", "HomeTeam", "AwayTeam", "FTR", "B365A", "B365D", "B365H", "BSA", "BSD", "BSH", \
                       "BWA", "BWD", "BWH", "GBA", "GBD", "GBH", "IWA", "IWD", "IWH", "LBA", "LBD", "LBH", "PSA", "PSD", \
                       "PSH", "SBA", "SBD", "SBH", "SJA", "SJD", "SJH", "SOA", "SOD", "SOH", "SYA", "SYD", "SYH", "VCA", \
                       "VCD", "VCH", "WHA", "WHD", "WHH"]

arg_parser = ArgumentParser(prog=__file__, description="Convert xls files to csv.", epilog="%s  Copyright (C) 2016  Sander Bel\nThis program comes with ABSOLUTELY NO WARRANTY; for details type '%s --license'.\nThis is free software, and you are welcome to redistribute it under certain conditions; type '%s --license' for details." % ((__file__,) * 3), formatter_class=RawTextHelpFormatter)
arg_parser.add_argument("-l", "--license", action="store_true", help="show license information and exit")
arg_parser.add_argument("-v", "--version", action="version", help="show version information and exit", version="%(prog)s v" + __version__)
arg_parser.add_argument("-a", "--all_combined", default=ALL_COMBINED, help="combine all the xls files into one csv file, instead of a seperate csv output file for each xls input file")
arg_parser.add_argument("-s", "--sort_headers", default=SORT_HEADERS, help="sort headers no longer based upon bookmaker, but based upon results (what is the name of A,D,H?)")
arg_parser.add_argument("-t", "--transpose", default=TRANSPOSE, help="transpose columns to rows and visa versa")
arg_parser.add_argument("-e", "--extract_zip", default=EXTRACT_ZIP, help="before importing the xls files, extract all zip files within the current directory (not including subdirectories)")
arg_parser.add_argument("-c", "--csv_delimiter", default=CSV_DELIMITER, help="delimiter used for the output csv file(s) (default: '%s')" % (CSV_DELIMITER))
arg_parser.add_argument("-o", "--output_decimal_sign", default=OUTPUT_DECIMAL_SIGN, help="decimal ('%s' or '%s') sign for the output csv file(s) (default: '%s')" % (COMMA_SIGN, DOT_SIGN, OUTPUT_DECIMAL_SIGN))
arguments = arg_parser.parse_args()

if arguments.license:
    print("For more information regarding the %s license, see <https://blog.b-e-l.be/gnu-gplv3-license>" % (__licence__))
    exit(0)

# Parse arguments and set global variables accordingly
global_vars = list(i for i in globals().iterkeys() if all(w.isupper() for w in i if w.isalpha()))
for global_var in global_vars:
    try:
        exec("if arguments.%s: %s = arguments.%s" % (global_var.lower(), global_var, global_var.lower()))
    except AttributeError:
        pass

# ACTUAL SCRIPT -- LEAVE ALONE
if OUTPUT_DECIMAL_SIGN == ".":
    INPUT_DECIMAL_SIGN = ","
elif OUTPUT_DECIMAL_SIGN == ",":
    INPUT_DECIMAL_SIGN = "."
else:
    arg_parser.print_help()
    exit(1)

if SORT_HEADERS:
    EXTRA, A, D, H = list(), list(), list(), list();
    for header in HEADERS:
        if header.endswith("A"):
            A.append(header)
        elif header.endswith("D"):
            D.append(header)
        elif header.endswith("H"):
            H.append(header)
        else:
            EXTRA.append(header)
    A.sort()
    D.sort()
    H.sort()
    HEADERS = EXTRA + A + D + H

def generate_dict_out():
    return_dict = dict()
    for header in HEADERS:
        return_dict[header] = list()
    return return_dict

def set_decimal_seperator(row):
    return list(str(row_value).replace(INPUT_DECIMAL_SIGN, OUTPUT_DECIMAL_SIGN) if isinstance(row_value, float) else str(row_value.encode(ENCODING)) for row_value in row)

def dump_to_csv(csv_file_name, dict_data):
    with open(csv_file_name, "w") as csv_out:
        for header in HEADERS:
            csv_writer = writer(csv_out, delimiter=CSV_DELIMITER, quoting=QUOTE_NONE)
            csv_writer.writerow([header] + dict_data[header])
    dict_data.clear()


print("ALL EXISING FILES WILL BE AUTOMATICALLY OVERWRITTEN! Are you sure you want to continue?")
print("START")
if EXTRACT_ZIP:
    for zip_file in glob("*%s" % (EXT_ZIP)):
        with ZipFile(zip_file, "r") as zip_file_fd:
            print("Extracting files out of archive (%s): %s" % (zip_file, zip_file_fd.namelist()))
            zip_file_fd.extractall()

dict_out = generate_dict_out()
xls_files = glob("*%s" % (EXT_XSL))
xls_files.sort()

for xls_file in xls_files:
    print("Opening xls file: %s" % (xls_file))

    if not(ALL_COMBINED) and not(dict_out):
        dict_out = generate_dict_out()

    xls_workbook = open_workbook(xls_file)
    for sheet_name in xls_workbook.sheet_names():
        xls_workbook_sheet = xls_workbook.sheet_by_name(sheet_name)

        dict_translate = dict()
        list_not_present = list()

        first_row = True
        for row_num in xrange(xls_workbook_sheet.nrows):
            row_values = set_decimal_seperator(xls_workbook_sheet.row_values(row_num))
            if first_row:
                first_row = False
                for header in HEADERS:
                    if header in row_values:
                        dict_translate[row_values.index(header)] = header
                    else:
                        list_not_present.append(header)
            else:
                for value_column in dict_translate.keys():
                    row_value = row_values[value_column]
                    if row_value.strip() == "":
                        row_value = NULL_VALUE
                    dict_out[dict_translate[value_column]].append(row_value)

                for header_not_present in list_not_present:
                    dict_out[header_not_present].append(NULL_VALUE)

    dict_translate.clear()
    del list_not_present[:]

    if not(ALL_COMBINED):
        xls_file_name, xls_file_ext = splitext(xls_file)
        print("Writing csv data to: %s%s" % (xls_file_name, CONVERT_CSV_EXT))
        dump_to_csv("%s%s" % (xls_file_name, CONVERT_CSV_EXT), dict_out)

if ALL_COMBINED:
    print("Writing all csv data to combined file: %s%s" % ("all-combined", CONVERT_CSV_EXT))
    dump_to_csv("%s%s" % ("all-combined", CONVERT_CSV_EXT), dict_out)

print("DONE")
exit(0)
	#!/usr/bin/python

	"""football-data.co.uk_convert.py: Convert xslx files to one or more workable csv file(s)

	Copyright (C) 2016 Sander Bel

	This program is free software: you can redistribute it and/or modify it under the terms of the
	GNU General Public License as published by the Free Software Foundation, either version 3 of the
	License, or (at your option) any later version.
	This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
	even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	General Public License for more details.
	You should have received a copy of the GNU General Public License along with this program. If not,
	see <http://www.gnu.org/licenses/>.
	"""

	__author__ = "Sander Bel"
	__copyright__ = "Copyright (C) 2016 Sander Bel"
	__credits__ = ["Sander Bel"]
	__licence__ = "GPLv3"
	__version__ = "1.0.0"
	__maintainer__ = "Sander Bel"
	__email__ = "sander@proapps.be"
	__status__ = "Production"


	from argparse import ArgumentParser, RawTextHelpFormatter
	from glob import glob
	from os.path import splitext
	from xlrd import open_workbook
	from csv import writer, QUOTE_NONE
	from zipfile import ZipFile


	# GLOBALS
	ALL_COMBINED = False
	SORT_HEADERS = False
	EXTRACT_ZIP = False
	TRANSPOSE = False
	EXT_CSV = ".csv"
	EXT_XSL = ".xls"
	EXT_ZIP = ".zip"
	CONVERT_CSV_EXT = "_converted%s" % (EXT_CSV)
	COMMA_SIGN = ","
	DOT_SIGN = "."
	CSV_DELIMITER = COMMA_SIGN
	OUTPUT_DECIMAL_SIGN = DOT_SIGN
	NULL_VALUE = "0"
	ENCODING = "UTF-8"
	HEADERS = ["Div", "Date", "HomeTeam", "AwayTeam", "FTR", "B365A", "B365D", "B365H", "BSA", "BSD", "BSH", \
	"BWA", "BWD", "BWH", "GBA", "GBD", "GBH", "IWA", "IWD", "IWH", "LBA", "LBD", "LBH", "PSA", "PSD", \
	"PSH", "SBA", "SBD", "SBH", "SJA", "SJD", "SJH", "SOA", "SOD", "SOH", "SYA", "SYD", "SYH", "VCA", \
	"VCD", "VCH", "WHA", "WHD", "WHH"]

	arg_parser = ArgumentParser(prog=__file__, description="Convert xls files to csv.", epilog="%s Copyright (C) 2016 Sander Bel\nThis program comes with ABSOLUTELY NO WARRANTY; for details type '%s --license'.\nThis is free software, and you are welcome to redistribute it under certain conditions; type '%s --license' for details." % ((__file__,) * 3), formatter_class=RawTextHelpFormatter)
	arg_parser.add_argument("-l", "--license", action="store_true", help="show license information and exit")
	arg_parser.add_argument("-v", "--version", action="version", help="show version information and exit", version="%(prog)s v" + __version__)
	arg_parser.add_argument("-a", "--all_combined", default=ALL_COMBINED, help="combine all the xls files into one csv file, instead of a seperate csv output file for each xls input file")
	arg_parser.add_argument("-s", "--sort_headers", default=SORT_HEADERS, help="sort headers no longer based upon bookmaker, but based upon results (what is the name of A,D,H?)")
	arg_parser.add_argument("-t", "--transpose", default=TRANSPOSE, help="transpose columns to rows and visa versa")
	arg_parser.add_argument("-e", "--extract_zip", default=EXTRACT_ZIP, help="before importing the xls files, extract all zip files within the current directory (not including subdirectories)")
	arg_parser.add_argument("-c", "--csv_delimiter", default=CSV_DELIMITER, help="delimiter used for the output csv file(s) (default: '%s')" % (CSV_DELIMITER))
	arg_parser.add_argument("-o", "--output_decimal_sign", default=OUTPUT_DECIMAL_SIGN, help="decimal ('%s' or '%s') sign for the output csv file(s) (default: '%s')" % (COMMA_SIGN, DOT_SIGN, OUTPUT_DECIMAL_SIGN))
	arguments = arg_parser.parse_args()

	if arguments.license:
	print("For more information regarding the %s license, see <https://blog.b-e-l.be/gnu-gplv3-license>" % (__licence__))
	exit(0)

	# Parse arguments and set global variables accordingly
	global_vars = list(i for i in globals().iterkeys() if all(w.isupper() for w in i if w.isalpha()))
	for global_var in global_vars:
	try:
	exec("if arguments.%s: %s = arguments.%s" % (global_var.lower(), global_var, global_var.lower()))
	except AttributeError:
	pass

	# ACTUAL SCRIPT -- LEAVE ALONE
	if OUTPUT_DECIMAL_SIGN == ".":
	INPUT_DECIMAL_SIGN = ","
	elif OUTPUT_DECIMAL_SIGN == ",":
	INPUT_DECIMAL_SIGN = "."
	else:
	arg_parser.print_help()
	exit(1)

	if SORT_HEADERS:
	EXTRA, A, D, H = list(), list(), list(), list();
	for header in HEADERS:
	if header.endswith("A"):
	A.append(header)
	elif header.endswith("D"):
	D.append(header)
	elif header.endswith("H"):
	H.append(header)
	else:
	EXTRA.append(header)
	A.sort()
	D.sort()
	H.sort()
	HEADERS = EXTRA + A + D + H

	def generate_dict_out():
	return_dict = dict()
	for header in HEADERS:
	return_dict[header] = list()
	return return_dict

	def set_decimal_seperator(row):
	return list(str(row_value).replace(INPUT_DECIMAL_SIGN, OUTPUT_DECIMAL_SIGN) if isinstance(row_value, float) else str(row_value.encode(ENCODING)) for row_value in row)

	def dump_to_csv(csv_file_name, dict_data):
	with open(csv_file_name, "w") as csv_out:
	for header in HEADERS:
	csv_writer = writer(csv_out, delimiter=CSV_DELIMITER, quoting=QUOTE_NONE)
	csv_writer.writerow([header] + dict_data[header])
	dict_data.clear()


	print("ALL EXISING FILES WILL BE AUTOMATICALLY OVERWRITTEN! Are you sure you want to continue?")
	print("START")
	if EXTRACT_ZIP:
	for zip_file in glob("*%s" % (EXT_ZIP)):
	with ZipFile(zip_file, "r") as zip_file_fd:
	print("Extracting files out of archive (%s): %s" % (zip_file, zip_file_fd.namelist()))
	zip_file_fd.extractall()

	dict_out = generate_dict_out()
	xls_files = glob("*%s" % (EXT_XSL))
	xls_files.sort()

	for xls_file in xls_files:
	print("Opening xls file: %s" % (xls_file))

	if not(ALL_COMBINED) and not(dict_out):
	dict_out = generate_dict_out()

	xls_workbook = open_workbook(xls_file)
	for sheet_name in xls_workbook.sheet_names():
	xls_workbook_sheet = xls_workbook.sheet_by_name(sheet_name)

	dict_translate = dict()
	list_not_present = list()

	first_row = True
	for row_num in xrange(xls_workbook_sheet.nrows):
	row_values = set_decimal_seperator(xls_workbook_sheet.row_values(row_num))
	if first_row:
	first_row = False
	for header in HEADERS:
	if header in row_values:
	dict_translate[row_values.index(header)] = header
	else:
	list_not_present.append(header)
	else:
	for value_column in dict_translate.keys():
	row_value = row_values[value_column]
	if row_value.strip() == "":
	row_value = NULL_VALUE
	dict_out[dict_translate[value_column]].append(row_value)

	for header_not_present in list_not_present:
	dict_out[header_not_present].append(NULL_VALUE)

	dict_translate.clear()
	del list_not_present[:]

	if not(ALL_COMBINED):
	xls_file_name, xls_file_ext = splitext(xls_file)
	print("Writing csv data to: %s%s" % (xls_file_name, CONVERT_CSV_EXT))
	dump_to_csv("%s%s" % (xls_file_name, CONVERT_CSV_EXT), dict_out)

	if ALL_COMBINED:
	print("Writing all csv data to combined file: %s%s" % ("all-combined", CONVERT_CSV_EXT))
	dump_to_csv("%s%s" % ("all-combined", CONVERT_CSV_EXT), dict_out)

	print("DONE")
	exit(0)