Skip to content

Instantly share code, notes, and snippets.

@DeathVenom54
Created June 23, 2023 15:58
Show Gist options
  • Save DeathVenom54/8b9a82b47989ca9970ec59a0f33233e2 to your computer and use it in GitHub Desktop.
Save DeathVenom54/8b9a82b47989ca9970ec59a0f33233e2 to your computer and use it in GitHub Desktop.
Parsing openlibrary works and authors dump to sqlite database
import json
import sqlite3
import traceback
from contextlib import closing
global init_count
STEP_SIZE = 1000
with open('count_a.txt', 'r') as file:
init_count = 0
count = file.read().rstrip()
init_count = int(count)
with closing(sqlite3.connect('books.db')) as conn:
with closing(conn.cursor()) as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS authors(id TEXT PRIMARY KEY, name TEXT NOT NULL, birth_date TEXT, death_date TEXT);')
with open('<authors dump file>') as file:
for (i, line) in enumerate(file):
try:
if i < init_count:
continue
if i > init_count + 1000000000:
break
if i % STEP_SIZE == 0:
print(f'Count: {i}')
with open('count_a.txt', 'w') as file:
file.write(str(i))
conn.commit()
lsplit = line.split('\t')
if lsplit[0] != '/type/author':
continue
id = lsplit[1][9:]
json_data = json.loads(lsplit[4])
birth_date = json_data['birth_date'] if 'birth_date' in json_data else None
death_date = json_data['death_date'] if 'death_date' in json_data else None
name = json_data['name'] if 'name' in json_data else None
if name is None and 'personal_name' in json_data:
name = json_data['personal_name']
if name is None:
continue
cursor.execute('INSERT INTO authors VALUES (?, ?, ?, ?)', (id, name, birth_date, death_date))
except sqlite3.IntegrityError as e:
print(e)
pass
except Exception:
print(traceback.format_exc())
with open('count_a.txt', 'w') as file:
file.write(str(i - STEP_SIZE))
break
# Extracts selected fields and saves them in sqlite database
# Have a file count.txt in the directory with text "0"
import json
import sqlite3
import traceback
from contextlib import closing
global init_count
with open('count.txt', 'r') as file:
init_count = 0
count = file.read().rstrip()
init_count = int(count)
with closing(sqlite3.connect('books.db')) as conn:
with closing(conn.cursor()) as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS books(work_id TEXT PRIMARY KEY, title TEXT, description TEXT, author_ids TEXT, covers TEXT);')
cursor.execute('CREATE TABLE IF NOT EXISTS subjects(id INTEGER PRIMARY KEY AUTOINCREMENT, subject TEXT UNIQUE);')
cursor.execute('CREATE TABLE IF NOT EXISTS book_subject(work_id TEXT, subject_id INTEGER, PRIMARY KEY (work_id, subject_id));')
with open('<works dump file>') as file:
for (i, line) in enumerate(file):
try:
if i < init_count:
continue
if i > init_count + 10000000000:
break
if i % 100 == 0:
if i % 1000 == 0:
print(f'Count: {i}')
with open('count.txt', 'w') as file:
file.write(str(i))
conn.commit()
lsplit = line.split('\t')
if lsplit[0] != '/type/work':
continue
work_id = title = description = author_ids = covers = subjects = None
work_id = lsplit[1][7:]
json_data = json.loads(lsplit[4])
if 'title' not in json_data:
continue
title = json_data['title']
description = json_data['description'] if 'description' in json_data else None
if isinstance(description, dict):
description = description['value']
author_ids = []
if 'authors' in json_data:
for author in json_data['authors']:
if 'author' in author:
if isinstance(author['author'], str):
author_ids.append(author['author'][9:])
else:
author_ids.append(author['author']['key'][9:])
covers = json_data['covers'] if 'covers' in json_data else []
subjects = json_data['subjects'] if 'subjects' in json_data else []
# print(title, description, json.dumps(author_ids), json.dumps(covers), subjects)
cursor.execute('INSERT INTO books VALUES (?, ?, ?, ?, ?)', (work_id, title, description, json.dumps(author_ids), json.dumps(covers))).fetchone()
for subject in subjects:
subject_id = cursor.execute('SELECT id FROM subjects WHERE subject = ?', (subject,)).fetchone()
if subject_id:
cursor.execute('INSERT INTO book_subject VALUES(?,?)', (work_id, subject_id[0]))
else:
cursor.execute('INSERT INTO subjects (subject) VALUES(?)', (subject,))
subject_id = cursor.execute('SELECT id FROM subjects WHERE subject = ?', (subject,)).fetchone()
cursor.execute('INSERT INTO book_subject VALUES(?,?)', (work_id, subject_id[0]))
except sqlite3.IntegrityError as e:
print(e)
pass
except Exception:
print(traceback.format_exc())
print(title, description, json.dumps(author_ids), json.dumps(covers), subjects)
with open('count.txt', 'w') as file:
file.write(str(i))
break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment