Created
June 23, 2023 15:58
-
-
Save DeathVenom54/8b9a82b47989ca9970ec59a0f33233e2 to your computer and use it in GitHub Desktop.
Parsing openlibrary works and authors dump to sqlite database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extracts selected fields and saves them in sqlite database | |
# Have a file count.txt in the directory with text "0" | |
import json | |
import sqlite3 | |
import traceback | |
from contextlib import closing | |
global init_count | |
with open('count.txt', 'r') as file: | |
init_count = 0 | |
count = file.read().rstrip() | |
init_count = int(count) | |
with closing(sqlite3.connect('books.db')) as conn: | |
with closing(conn.cursor()) as cursor: | |
cursor.execute('CREATE TABLE IF NOT EXISTS books(work_id TEXT PRIMARY KEY, title TEXT, description TEXT, author_ids TEXT, covers TEXT);') | |
cursor.execute('CREATE TABLE IF NOT EXISTS subjects(id INTEGER PRIMARY KEY AUTOINCREMENT, subject TEXT UNIQUE);') | |
cursor.execute('CREATE TABLE IF NOT EXISTS book_subject(work_id TEXT, subject_id INTEGER, PRIMARY KEY (work_id, subject_id));') | |
with open('<works dump file>') as file: | |
for (i, line) in enumerate(file): | |
try: | |
if i < init_count: | |
continue | |
if i > init_count + 10000000000: | |
break | |
if i % 100 == 0: | |
if i % 1000 == 0: | |
print(f'Count: {i}') | |
with open('count.txt', 'w') as file: | |
file.write(str(i)) | |
conn.commit() | |
lsplit = line.split('\t') | |
if lsplit[0] != '/type/work': | |
continue | |
work_id = title = description = author_ids = covers = subjects = None | |
work_id = lsplit[1][7:] | |
json_data = json.loads(lsplit[4]) | |
if 'title' not in json_data: | |
continue | |
title = json_data['title'] | |
description = json_data['description'] if 'description' in json_data else None | |
if isinstance(description, dict): | |
description = description['value'] | |
author_ids = [] | |
if 'authors' in json_data: | |
for author in json_data['authors']: | |
if 'author' in author: | |
if isinstance(author['author'], str): | |
author_ids.append(author['author'][9:]) | |
else: | |
author_ids.append(author['author']['key'][9:]) | |
covers = json_data['covers'] if 'covers' in json_data else [] | |
subjects = json_data['subjects'] if 'subjects' in json_data else [] | |
# print(title, description, json.dumps(author_ids), json.dumps(covers), subjects) | |
cursor.execute('INSERT INTO books VALUES (?, ?, ?, ?, ?)', (work_id, title, description, json.dumps(author_ids), json.dumps(covers))).fetchone() | |
for subject in subjects: | |
subject_id = cursor.execute('SELECT id FROM subjects WHERE subject = ?', (subject,)).fetchone() | |
if subject_id: | |
cursor.execute('INSERT INTO book_subject VALUES(?,?)', (work_id, subject_id[0])) | |
else: | |
cursor.execute('INSERT INTO subjects (subject) VALUES(?)', (subject,)) | |
subject_id = cursor.execute('SELECT id FROM subjects WHERE subject = ?', (subject,)).fetchone() | |
cursor.execute('INSERT INTO book_subject VALUES(?,?)', (work_id, subject_id[0])) | |
except sqlite3.IntegrityError as e: | |
print(e) | |
pass | |
except Exception: | |
print(traceback.format_exc()) | |
print(title, description, json.dumps(author_ids), json.dumps(covers), subjects) | |
with open('count.txt', 'w') as file: | |
file.write(str(i)) | |
break |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment