# albert-yu/imslp_scratchpad.py

## imslp_scratchpad.py
# this script was used to convert instrumentation data from IMSLP
# to a collection of objects, decoupling instrument count and misc.
# notes from the long string blob

import os
import json
import re

# Splits an instrumentation blob on newlines, or on commas that are not
# inside a parenthesised note: the negative lookahead rejects a comma that
# is followed by a ")" with no other parenthesis in between.
split_pattern = r"\n|,\s*(?![^()]*\))"

# Run of digits; matched only at the start of a part (the instrument count).
cnt_prog = re.compile(r"\d+")
# Parenthesised text; greedy, so it spans from the first "(" to the last ")".
notes_prog = re.compile(r"\(.*\)")


def parse_instrumentation(instr):
    """Split an instrumentation string into one dict per instrument.

    Each part of ``instr`` (separated by newlines or by commas outside
    parentheses) becomes a dict with:

    * ``"count"`` -- the integer the part starts with, if it starts with one;
    * ``"notes"`` -- the parenthesised text without the outer parentheses,
      if the part contains any;
    * ``"name"``  -- the text between the count and the notes, stripped.

    Parts whose name comes out empty are dropped.

    Args:
        instr: the raw instrumentation string.

    Returns:
        A list of dicts in the order the parts appear in ``instr``.
    """
    formatted = []
    for part in re.split(split_pattern, instr):
        part = part.strip()
        instr_part = {}

        # indices of instrument name start and end within `part`
        name_start = 0
        name_end = len(part)

        # Only a number at the very start is a count; digits later in the
        # part (e.g. inside the notes) must not be mistaken for one.
        count_m = cnt_prog.match(part)
        if count_m:
            instr_part["count"] = int(count_m.group(0))
            # name starts where the count ends
            name_start = count_m.end()

        notes_m = notes_prog.search(part)
        if notes_m:
            # drop the surrounding parentheses
            instr_part["notes"] = notes_m.group(0)[1:-1]
            # name ends where the notes start
            name_end = notes_m.start()

        instr_name = part[name_start:name_end].strip()
        # ignore parts with no instrument name
        if not instr_name:
            continue
        instr_part["name"] = instr_name
        formatted.append(instr_part)
    return formatted


def convert_file(jsonfile):
    """Add the derived fields to one JSON file, rewriting it in place.

    Reads ``jsonfile``; when it holds an object with a non-empty
    ``"imslp_info"`` entry, copies ``"First Publication."`` to
    ``"first_published"`` and stores the parsed ``"Instrumentation"`` under
    ``"arrangement"`` (each only if the source key is present), then writes
    the object back to the same path.

    Args:
        jsonfile: path of the JSON file to convert.

    Returns:
        True if the file was rewritten, False if it was skipped because it
        has no usable ``"imslp_info"``.
    """
    with open(jsonfile, "r", encoding="utf-8") as f:
        d = json.load(f)

    # A top-level value that is not an object cannot carry "imslp_info".
    imslp_info = d.get("imslp_info") if isinstance(d, dict) else None
    if not imslp_info:
        return False

    if "First Publication." in imslp_info:
        d["first_published"] = imslp_info["First Publication."]

    if "Instrumentation" in imslp_info:
        d["arrangement"] = parse_instrumentation(imslp_info["Instrumentation"])

    with open(jsonfile, "w", encoding="utf-8") as f:
        json.dump(d, f)
    return True


def main():
    """Convert every ``.json`` file in the current directory.

    Prints the name of each file that was rewritten.
    """
    for jsonfile in os.listdir("."):
        if not jsonfile.endswith(".json"):
            continue
        if convert_file(jsonfile):
            print(jsonfile)


if __name__ == "__main__":
    main()