Skip to content

Instantly share code, notes, and snippets.

@Orsucciu
Created April 15, 2022 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Orsucciu/00c4c92eac7a1829084f2dff24d9f8a1 to your computer and use it in GitHub Desktop.
Save Orsucciu/00c4c92eac7a1829084f2dff24d9f8a1 to your computer and use it in GitHub Desktop.
Python script to fetch items from an omekaS db, and turns them into csv import friendly excel files
import openpyxl
from openpyxl import Workbook, load_workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from openpyxl.workbook import workbook
import mysql.connector
mydb = mysql.connector.connect(
host="",
user="",
password="",
database="omekas"
)
mycursor = mydb.cursor()
# main query. This gets the resource_ids we will later use
# the important parameters are AND value2.value REGEXP (%s)
# and AND value1.value LIKE %s
# value2.value REGEXP (%s) checks if the value is one of the properties we'll pass later "[1901|1902]" this is used to check that the item was published in a given year
# value1.value is the name
resource_ids_query = """
SELECT DISTINCT value1.resource_id
FROM value value1
JOIN value value2
ON value1.resource_id = value2.resource_id
JOIN value value3
ON value3.resource_id = value1.resource_id
WHERE
value1.property_id = 1
AND value2.value REGEXP (%s)
AND value1.value LIKE %s
AND value2.property_id = 7
AND value3.property_id = 8;
"""
# give a list of years, and a name
# returns the ids of the issues we look for
values_query = """
SELECT *
FROM value
WHERE value.resource_id = %s
ORDER BY value.property_id ASC;
"""
medias_query = "SELECT * FROM media WHERE item_id = %s"
# example, get all books called u nutiziale from years 1901 or 1902
to_extract = [
["U Nutiziale", "[1901|1902]"]
]
zero_ids = []
for book in to_extract:
wrkbk = workbook.Workbook()
wb = wrkbk.active
# wb.append(["db_id", "Resource_id", "Property", "Value", "Medias", "Storage Id"])
wb.append(["db_id", "Resource_id", "Medias", "Storage Id"])
print("Treating " + book[0])
mycursor.execute(resource_ids_query, (book[1], "%"+book[0]+"%"))
ids = mycursor.fetchall()
print("Got " + str(len(ids)) + " ids")
if len(ids) == 0:
zero_ids.append(book[0])
mycursor.reset()
columns_dictionnary = {}
# where we start from
rows = 2
columns = 4
for book_id in ids:
print("Treating " + str(book_id[0]) + " From book " + book[0])
mycursor.execute(values_query, (book_id[0],))
values = mycursor.fetchall()
mycursor.execute(medias_query, (book_id[0],))
medias = mycursor.fetchall()
print("Medias associated : " + str(len(medias)))
mycursor.reset()
my_row = {"db_id": "", "Resource_id": "", "Medias": "", "Storage Id": ""}
for value in values:
# more human-readable approach
my_row["db_id"] = value[0] #fck memory lol
my_row["Resource_id"] = value[1] #how complex is that uh big boy
if my_row.get(str(value[2])) is None:
# we add the column key to the dictionnary
columns_dictionnary[str(value[2])] = columns+1
my_row[str(value[2])] = ILLEGAL_CHARACTERS_RE.sub(r'', str(value[6]))
# after we create a column, we increment their count
columns += 1
else:
my_row[str(value[2])] += ("$" + str(ILLEGAL_CHARACTERS_RE.sub(r'', str(value[6]))))
try:
wb.append(list(my_row.values()))
except openpyxl.utils.exceptions.IllegalCharacterError:
wb.append(list(my_row.values()))
# we add the medias
for media in medias:
wb.append([" ", " ", media[5], media[7]])
rows += 1
wb.append([])
rows += 1
wrkbk.save(book[0] + ".xlsx")
wrkbk.close()
print("Done")
mydb.close()
print("Closed")
print("Zero ids : " + str(zero_ids))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment