Skip to content

Instantly share code, notes, and snippets.

@olepbr
Last active July 4, 2023 12:29
Show Gist options
  • Save olepbr/8966aa7b8676b5e73d6c52ffc8f009c9 to your computer and use it in GitHub Desktop.
Save olepbr/8966aa7b8676b5e73d6c52ffc8f009c9 to your computer and use it in GitHub Desktop.
Quick script to help @mariusbax@snabelen.no get some Norwegian film data. Absolutely not production-grade; this script comes with zero correctness guarantees :-) Mastodon thread: https://snabelen.no/@mariusbax/110644892786488792
import json
from dataclasses import dataclass
from dataclass_csv import DataclassWriter
@dataclass
class Film:
    """Minimal record describing a single film.

    All three fields are plain strings so that an instance maps directly
    onto one row of tabular output.
    """

    # Title of the film.
    name: str
    # Distributor(s) of the film, as a single string.
    distributor: str
    # Publisher(s) of the film, as a single string.
    publisher: str
# Helper functions. NOTE: passing the list of distributors around and mutating
# it directly is not particularly nice, but it'll do for this hacky script.
def find_distributor(term, dist_list, name):
    """Record ``name`` as a distributor if ``term`` is the "DIS" role code.

    Args:
        term: A single role-term mapping. It must have an "@type" key; the
            "#text" key is optional.
        dist_list: List that is extended in place with ``name["namePart"]``
            when the term matches.
        name: The name entity the term belongs to.

    Returns:
        None. The result is communicated by mutating ``dist_list``.
    """
    # Only coded role terms are of interest.
    if term["@type"] != "code":
        return
    # A missing "#text" yields None here, which never equals "DIS".
    if term.get("#text") == "DIS":
        dist_list.append(name["namePart"])
def process_name(name, dist_list):
    """Inspect one name entity and collect it if it is a distributor.

    Only entities whose "@type" is "corporate" are examined; anything else
    is ignored. The entity's ``role.roleTerm`` may be either a single term
    or a list of terms, and each one is handed to ``find_distributor``.

    Args:
        name: A name entity mapping with an "@type" key and, for corporate
            entities, a "role" mapping containing "roleTerm".
        dist_list: List of distributors, extended in place.
    """
    # People, what's that? Corporations forever!
    if name["@type"] != "corporate":
        return

    role_term = name["role"]["roleTerm"]
    # Treat a lone term as a one-element list so both shapes share a loop.
    # We only care about distributors here, since the publisher comes from
    # its own field.
    terms = role_term if isinstance(role_term, list) else [role_term]
    for term in terms:
        find_distributor(term, dist_list, name)
# Load the JSON. The encoding is spelled out because JSON is UTF-8 by
# specification, while open() would otherwise use the platform's locale
# encoding.
with open("filmografi_split.json", "r", encoding="utf-8") as r_file:
    data = json.load(r_file)

films = []
for film in data:
    title_info = film["titleInfo"]
    if isinstance(title_info, list):
        # This bad boy has multiple titles, let's assume the first one suffices.
        title = title_info[0]["title"]
    else:
        title = title_info["title"]

    if "publisher" in film["originInfo"]:
        publisher = film["originInfo"]["publisher"]
    else:
        # There doesn't appear to be a publisher, let's just mark it as
        # unknown. (We could also just leave this blank.)
        publisher = "Unknown"

    # Get the names associated with this film.
    names = film["name"]
    # Start with an empty list of distributors (there might be several).
    distributors = []
    if isinstance(names, list):
        # God, there's more of you.
        for name in names:
            process_name(name, distributors)
    else:
        # There's only one name associated with this film, so `names` is the
        # entity itself. (Passing the loop variable `name` here would raise
        # NameError or reuse an entity left over from a previous film.)
        process_name(names, distributors)

    if isinstance(publisher, list):
        # The publisher field contained multitudes; let's make it into a string.
        publisher = ", ".join(publisher)
    # Join the distributors, too.
    distributor = ", ".join(distributors)

    # Initialise an instance of the class and add it to the list of films.
    cur_film = Film(name=title, distributor=distributor, publisher=publisher)
    films.append(cur_film)
    # The dataclass creates a pretty nice default string representation of the
    # class; printing cur_film here can be helpful for (extremely
    # professional) debugging.

# Finally, write the list of Film instances to a CSV. newline="" lets the csv
# machinery control line endings itself (otherwise extra blank rows can appear
# on Windows), and the explicit encoding keeps non-ASCII titles intact
# regardless of the locale.
with open("filmografi.csv", "w", encoding="utf-8", newline="") as f:
    w = DataclassWriter(f, films, Film)
    w.write()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment