Created
April 15, 2022 14:09
-
-
Save Orsucciu/00c4c92eac7a1829084f2dff24d9f8a1 to your computer and use it in GitHub Desktop.
Python script to fetch items from an omekaS db, and turns them into csv import friendly excel files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import openpyxl | |
from openpyxl import Workbook, load_workbook | |
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE | |
from openpyxl.workbook import workbook | |
import mysql.connector | |
mydb = mysql.connector.connect( | |
host="", | |
user="", | |
password="", | |
database="omekas" | |
) | |
mycursor = mydb.cursor() | |
# main query. This gets the resource_ids we will later use | |
# the important parameters are AND value2.value REGEXP (%s) | |
# and AND value1.value LIKE %s | |
# value2.value REGEXP (%s) checks if the value is one of the properties we'll pass later "[1901|1902]" this is used to check that the item was published in a given year | |
# value1.value is the name | |
resource_ids_query = """ | |
SELECT DISTINCT value1.resource_id | |
FROM value value1 | |
JOIN value value2 | |
ON value1.resource_id = value2.resource_id | |
JOIN value value3 | |
ON value3.resource_id = value1.resource_id | |
WHERE | |
value1.property_id = 1 | |
AND value2.value REGEXP (%s) | |
AND value1.value LIKE %s | |
AND value2.property_id = 7 | |
AND value3.property_id = 8; | |
""" | |
# give a list of years, and a name | |
# returns the ids of the issues we look for | |
values_query = """ | |
SELECT * | |
FROM value | |
WHERE value.resource_id = %s | |
ORDER BY value.property_id ASC; | |
""" | |
medias_query = "SELECT * FROM media WHERE item_id = %s" | |
# example, get all books called u nutiziale from years 1901 or 1902 | |
to_extract = [ | |
["U Nutiziale", "[1901|1902]"] | |
] | |
zero_ids = [] | |
for book in to_extract: | |
wrkbk = workbook.Workbook() | |
wb = wrkbk.active | |
# wb.append(["db_id", "Resource_id", "Property", "Value", "Medias", "Storage Id"]) | |
wb.append(["db_id", "Resource_id", "Medias", "Storage Id"]) | |
print("Treating " + book[0]) | |
mycursor.execute(resource_ids_query, (book[1], "%"+book[0]+"%")) | |
ids = mycursor.fetchall() | |
print("Got " + str(len(ids)) + " ids") | |
if len(ids) == 0: | |
zero_ids.append(book[0]) | |
mycursor.reset() | |
columns_dictionnary = {} | |
# where we start from | |
rows = 2 | |
columns = 4 | |
for book_id in ids: | |
print("Treating " + str(book_id[0]) + " From book " + book[0]) | |
mycursor.execute(values_query, (book_id[0],)) | |
values = mycursor.fetchall() | |
mycursor.execute(medias_query, (book_id[0],)) | |
medias = mycursor.fetchall() | |
print("Medias associated : " + str(len(medias))) | |
mycursor.reset() | |
my_row = {"db_id": "", "Resource_id": "", "Medias": "", "Storage Id": ""} | |
for value in values: | |
# more human-readable approach | |
my_row["db_id"] = value[0] #fck memory lol | |
my_row["Resource_id"] = value[1] #how complex is that uh big boy | |
if my_row.get(str(value[2])) is None: | |
# we add the column key to the dictionnary | |
columns_dictionnary[str(value[2])] = columns+1 | |
my_row[str(value[2])] = ILLEGAL_CHARACTERS_RE.sub(r'', str(value[6])) | |
# after we create a column, we increment their count | |
columns += 1 | |
else: | |
my_row[str(value[2])] += ("$" + str(ILLEGAL_CHARACTERS_RE.sub(r'', str(value[6])))) | |
try: | |
wb.append(list(my_row.values())) | |
except openpyxl.utils.exceptions.IllegalCharacterError: | |
wb.append(list(my_row.values())) | |
# we add the medias | |
for media in medias: | |
wb.append([" ", " ", media[5], media[7]]) | |
rows += 1 | |
wb.append([]) | |
rows += 1 | |
wrkbk.save(book[0] + ".xlsx") | |
wrkbk.close() | |
print("Done") | |
mydb.close() | |
print("Closed") | |
print("Zero ids : " + str(zero_ids)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment