Last active
July 4, 2023 12:29
-
-
Save olepbr/8966aa7b8676b5e73d6c52ffc8f009c9 to your computer and use it in GitHub Desktop.
Quick script to help @mariusbax@snabelen.no get some Norwegian film data. Absolutely not production-grade; this script comes with zero correctness guarantees :-) Mastodon thread: https://snabelen.no/@mariusbax/110644892786488792
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from dataclasses import dataclass | |
from dataclass_csv import DataclassWriter | |
@dataclass
class Film:
    """Minimal record for one film; each instance becomes one CSV row."""

    # Title of the film (the script uses the first title when there are several).
    name: str
    # Distributor names, already joined into a single comma-separated string.
    distributor: str
    # Publisher(s) as a single string; the script stores "Unknown" when absent.
    publisher: str
# Helper functions. NOTE: passing the list of distributors around and mutating | |
# it directly is not particularly nice, but it'll do for this hacky script. | |
def find_distributor(term, dist_list, name):
    """Record ``name`` as a distributor when ``term`` carries the "DIS" code.

    Args:
        term: A single role-term mapping; must contain an "@type" key.
        dist_list: List that is mutated in place; the value of
            ``name["namePart"]`` is appended on a match.
        name: The name entity the role term belongs to.
    """
    # Only coded role terms are of interest; textual ones are skipped.
    if term["@type"] != "code":
        return
    # A missing "#text" yields None here, which never equals "DIS".
    if term.get("#text") != "DIS":
        return
    dist_list.append(name["namePart"])
def process_name(name, dist_list):
    """Process a name entity and collect it if it is a corporate distributor.

    Only entities whose "@type" is "corporate" are considered. Each of their
    role terms is handed to ``find_distributor``, which appends to
    ``dist_list`` when the term marks a distributor.

    Args:
        name: Mapping describing one name entity.
        dist_list: List of distributor names, mutated in place.
    """
    # People, what's that? Corporations forever!
    if name.get("@type") != "corporate":
        return

    # A name without any role cannot be a distributor, so skip it instead of
    # letting a KeyError abort the whole run.
    roles = name.get("role")
    if roles is None:
        return
    # NOTE(review): assumes "role", like the other fields this script reads,
    # may be a single mapping or a list of them -- confirm against the data.
    if not isinstance(roles, list):
        roles = [roles]

    # We only care about distributors here, since the publisher comes from
    # its own field.
    for role in roles:
        role_terms = role.get("roleTerm", [])
        if not isinstance(role_terms, list):
            role_terms = [role_terms]
        for term in role_terms:
            find_distributor(term, dist_list, name)
# Load the JSON. JSON is UTF-8 by specification, so be explicit rather than
# relying on the platform's default encoding.
with open("filmografi_split.json", "r", encoding="utf-8") as r_file:
    data = json.load(r_file)

films = []
for film in data:
    title_info = film["titleInfo"]
    if isinstance(title_info, list):
        # This bad boy has multiple titles, let's assume the first one suffices.
        title = title_info[0]["title"]
    else:
        title = title_info["title"]

    if "publisher" in film["originInfo"]:
        publisher = film["originInfo"]["publisher"]
    else:
        # There doesn't appear to be a publisher, let's just mark it as
        # unknown. (We could also just leave this blank.)
        publisher = "Unknown"

    # Get the names associated with this film.
    names = film["name"]
    # Start with an empty list of distributors (there might be several).
    distributor = []
    if isinstance(names, list):
        # God, there's more of you.
        for name in names:
            process_name(name, distributor)
    else:
        # There's only one name associated with this film, so ``names`` is
        # the entity itself. (Passing the loop variable ``name`` here would
        # either raise NameError or reuse a name from a previous film.)
        process_name(names, distributor)

    if isinstance(publisher, list):
        # The publisher field contained multitudes; let's make it into a string.
        publisher = ", ".join(publisher)
    # Join the distributors, too.
    distributor = ", ".join(distributor)

    # Initialise an instance of the class and add it to the list of films.
    cur_film = Film(name=title, distributor=distributor, publisher=publisher)
    films.append(cur_film)
    # The dataclass creates a pretty nice default string representation of the
    # class; uncommenting the below can be helpful for (extremely professional)
    # debugging.
    # print(cur_film)

# Finally, write the list of Film instances to a CSV. ``newline=""`` is what
# the csv module expects for files it writes to (avoids doubled line endings
# on Windows); UTF-8 keeps non-ASCII titles intact on every platform.
with open("filmografi.csv", "w", encoding="utf-8", newline="") as f:
    w = DataclassWriter(f, films, Film)
    w.write()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment