# olepbr/film_parse.py

## film_parse.py
import json
from dataclasses import dataclass

from dataclass_csv import DataclassWriter

@dataclass
class Film:
    """One film record, i.e. one row of the CSV this script writes.

    Field order is significant: it is the order of the dataclass fields and
    therefore of the generated ``__init__`` positional arguments and ``repr``.

    Attributes:
        name: Title of the film.
        distributor: Distributor name(s) as a single string.
        publisher: Publisher name(s) as a single string.
    """

    name: str
    distributor: str
    publisher: str

# Helper functions. NOTE: passing the list of distributors around and mutating
# it directly is not particularly nice, but it'll do for this hacky script.

def find_distributor(term, dist_list, name):
    """Append the name's ``namePart`` to ``dist_list`` if ``term`` says "DIS".

    Args:
        term: A single role-term mapping. Its "@type" key is always read; its
            "#text" key is only read when present.
        dist_list: List collecting distributors; mutated in place on a match.
        name: The name entity that ``term`` belongs to; ``name["namePart"]``
            is what gets appended.

    Returns:
        None. The outcome is visible only through ``dist_list``.
    """
    # Only coded role terms are considered.
    if term["@type"] != "code":
        return
    # A coded term without any text cannot identify a distributor.
    if "#text" not in term:
        return
    if term["#text"] == "DIS":
        dist_list.append(name["namePart"])

def process_name(name, dist_list):
    """Inspect one name entity and collect it if it is a distributor.

    Only entities whose "@type" is "corporate" are examined; every other
    entity is ignored. Each role term of a corporate entity is handed to
    ``find_distributor``, which appends to ``dist_list`` on a match.

    Args:
        name: A name entity mapping with an "@type" key and, for corporate
            entities, a ``["role"]["roleTerm"]`` entry.
        dist_list: List collecting distributors; mutated in place.

    Returns:
        None.
    """
    if name["@type"] != "corporate":
        return

    # The role term is either a single entry or a list of entries; wrap the
    # single case so both go through the same loop.
    role_terms = name["role"]["roleTerm"]
    if not isinstance(role_terms, list):
        role_terms = [role_terms]

    # Only distributors matter here; the publisher comes from its own field.
    for role_term in role_terms:
        find_distributor(role_term, dist_list, name)

# Load the JSON. The encoding is given explicitly so the result does not
# depend on the platform's default locale encoding.
with open("filmografi_split.json", "r", encoding="utf-8") as r_file:
    data = json.load(r_file)

films = []

for film in data:
    title_info = film["titleInfo"]
    if isinstance(title_info, list):
        # This bad boy has multiple titles, let's assume the first one suffices.
        title = title_info[0]['title']
    else:
        title = title_info["title"]
    if "publisher" in film["originInfo"]:
        publisher = film["originInfo"]["publisher"]
    else:
        # There doesn't appear to be a publisher, let's just mark it as unknown. (We could also just leave this blank.)
        publisher = "Unknown"

    # Get the names associated with this film.
    names = film["name"]
    # Start with an empty list of distributors (there might be several).
    distributor = []

    if isinstance(names, list):
        # God, there's more of you.
        for name in names:
            process_name(name, distributor)
    else:
        # There's only one name associated with this film, so `names` is the
        # name entity itself. (Passing the loop variable `name` here would
        # either raise NameError or reuse an entity from an earlier film.)
        process_name(names, distributor)

    if isinstance(publisher, list):
        # The publisher field contained multitudes; let's make it into a string.
        publisher = ", ".join(publisher)

    # Join the distributors, too.
    distributor = ", ".join(distributor)

    # Initialise an instance of the class and add it to the list of films.
    cur_film = Film(name=title, distributor=distributor, publisher=publisher)
    films.append(cur_film)

    # The dataclass creates a pretty nice default string representation of the
    # class; uncommenting the below can be helpful for (extremely professional)
    # debugging.
    #print(cur_film)

# Finally, write the list of Film instances to a CSV. newline="" is what the
# csv module asks for on files it writes to (it does its own line-ending
# handling); the explicit encoding matches the input file.
with open("filmografi.csv", "w", newline="", encoding="utf-8") as f:
    w = DataclassWriter(f, films, Film)
    w.write()
	import json
	from dataclasses import dataclass

	from dataclass_csv import DataclassWriter

	@dataclass
	class Film:
	    """One film record, i.e. one row of the CSV this script writes.

	    Field order is significant: it is the order of the dataclass fields and
	    therefore of the generated ``__init__`` positional arguments and ``repr``.

	    Attributes:
	        name: Title of the film.
	        distributor: Distributor name(s) as a single string.
	        publisher: Publisher name(s) as a single string.
	    """

	    name: str
	    distributor: str
	    publisher: str

	# Helper functions. NOTE: passing the list of distributors around and mutating
	# it directly is not particularly nice, but it'll do for this hacky script.

	def find_distributor(term, dist_list, name):
	    """Append the name's ``namePart`` to ``dist_list`` if ``term`` says "DIS".

	    Args:
	        term: A single role-term mapping. Its "@type" key is always read; its
	            "#text" key is only read when present.
	        dist_list: List collecting distributors; mutated in place on a match.
	        name: The name entity that ``term`` belongs to; ``name["namePart"]``
	            is what gets appended.

	    Returns:
	        None. The outcome is visible only through ``dist_list``.
	    """
	    if term["@type"] == "code" and "#text" in term and term["#text"] == "DIS":
	        dist_list.append(name["namePart"])

	def process_name(name, dist_list):
	    """Inspect one name entity and collect it if it is a distributor.

	    Only entities whose "@type" is "corporate" are examined; every other
	    entity is ignored. Each role term of a corporate entity is handed to
	    ``find_distributor``, which appends to ``dist_list`` on a match.

	    Args:
	        name: A name entity mapping with an "@type" key and, for corporate
	            entities, a ``["role"]["roleTerm"]`` entry.
	        dist_list: List collecting distributors; mutated in place.

	    Returns:
	        None.
	    """
	    if name["@type"] == "corporate":
	        # People, what's that? Corporations forever!
	        role_term = name["role"]["roleTerm"]
	        # We only care about distributors here, since we already got the
	        # publisher from its own field.
	        if isinstance(role_term, list):
	            for term in role_term:
	                find_distributor(term, dist_list, name)
	        else:
	            find_distributor(role_term, dist_list, name)

	# Load the JSON. The encoding is given explicitly so the result does not
	# depend on the platform's default locale encoding.
	with open("filmografi_split.json", "r", encoding="utf-8") as r_file:
	    data = json.load(r_file)

	films = []

	for film in data:
	    title_info = film["titleInfo"]
	    if isinstance(title_info, list):
	        # This bad boy has multiple titles, let's assume the first one suffices.
	        title = title_info[0]['title']
	    else:
	        title = title_info["title"]
	    if "publisher" in film["originInfo"]:
	        publisher = film["originInfo"]["publisher"]
	    else:
	        # There doesn't appear to be a publisher, let's just mark it as unknown. (We could also just leave this blank.)
	        publisher = "Unknown"

	    # Get the names associated with this film.
	    names = film["name"]
	    # Start with an empty list of distributors (there might be several).
	    distributor = []

	    if isinstance(names, list):
	        # God, there's more of you.
	        for name in names:
	            process_name(name, distributor)
	    else:
	        # There's only one name associated with this film, so `names` is the
	        # name entity itself. (Passing the loop variable `name` here would
	        # either raise NameError or reuse an entity from an earlier film.)
	        process_name(names, distributor)

	    if isinstance(publisher, list):
	        # The publisher field contained multitudes; let's make it into a string.
	        publisher = ", ".join(publisher)

	    # Join the distributors, too.
	    distributor = ", ".join(distributor)

	    # Initialise an instance of the class and add it to the list of films.
	    cur_film = Film(name=title, distributor=distributor, publisher=publisher)
	    films.append(cur_film)

	    # The dataclass creates a pretty nice default string representation of the
	    # class; uncommenting the below can be helpful for (extremely professional)
	    # debugging.
	    #print(cur_film)

	# Finally, write the list of Film instances to a CSV. newline="" is what the
	# csv module asks for on files it writes to (it does its own line-ending
	# handling); the explicit encoding matches the input file.
	with open("filmografi.csv", "w", newline="", encoding="utf-8") as f:
	    w = DataclassWriter(f, films, Film)
	    w.write()