Orsucciu/Estrazzione.py

## Estrazzione.py
import openpyxl
from openpyxl import Workbook, load_workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from openpyxl.workbook import workbook
import mysql.connector

mydb = mysql.connector.connect(
    host="",
    user="",
    password="",
    database="omekas"
)

mycursor = mydb.cursor()

# main query. This gets the resource_ids we will later use
# the important parameters are AND value2.value REGEXP (%s)
#                          and AND value1.value LIKE %s
# value2.value REGEXP (%s) checks if the value is one of the properties we'll pass later "[1901|1902]" this is used to check that the item was published in a given year
# value1.value is the name
resource_ids_query = """
SELECT DISTINCT value1.resource_id
FROM value value1
JOIN value value2
	ON value1.resource_id = value2.resource_id
JOIN value value3
    ON value3.resource_id = value1.resource_id
WHERE
	value1.property_id = 1
    AND value2.value REGEXP (%s)
    AND value1.value LIKE %s
    AND value2.property_id = 7
    AND value3.property_id = 8;
"""
# give a list of years, and a name
# returns the ids of the issues we look for

values_query = """
SELECT *
FROM value
WHERE value.resource_id = %s
ORDER BY value.property_id ASC;
"""

medias_query = "SELECT * FROM media WHERE item_id = %s"

# example, get all books called u nutiziale from years 1901 or 1902
to_extract = [
  ["U Nutiziale", "[1901|1902]"]
]

zero_ids = []

for book in to_extract:
    wrkbk = workbook.Workbook()
    wb = wrkbk.active

    # wb.append(["db_id", "Resource_id", "Property", "Value", "Medias", "Storage Id"])
    wb.append(["db_id", "Resource_id", "Medias", "Storage Id"])

    print("Treating " + book[0])
    mycursor.execute(resource_ids_query, (book[1], "%"+book[0]+"%"))
    ids = mycursor.fetchall()
    print("Got " + str(len(ids)) + " ids")
    if len(ids) == 0:
        zero_ids.append(book[0])
    mycursor.reset()
    columns_dictionnary = {}
    # where we start from
    rows = 2
    columns = 4

    for book_id in ids:
        print("Treating " + str(book_id[0]) + " From book " + book[0])
        mycursor.execute(values_query, (book_id[0],))
        values = mycursor.fetchall()
        mycursor.execute(medias_query, (book_id[0],))
        medias = mycursor.fetchall()
        print("Medias associated : " + str(len(medias)))

        mycursor.reset()

        my_row = {"db_id": "", "Resource_id": "", "Medias": "", "Storage Id": ""}

        for value in values:

            # more human-readable approach
            my_row["db_id"] = value[0] #fck memory lol
            my_row["Resource_id"] = value[1] #how complex is that uh big boy

            if my_row.get(str(value[2])) is None:
                # we add the column key to the dictionnary

                columns_dictionnary[str(value[2])] = columns+1
                my_row[str(value[2])] = ILLEGAL_CHARACTERS_RE.sub(r'', str(value[6]))
                # after we create a column, we increment their count
                columns += 1

            else:

                my_row[str(value[2])] += ("$" + str(ILLEGAL_CHARACTERS_RE.sub(r'', str(value[6]))))
       try:
            wb.append(list(my_row.values()))
        except openpyxl.utils.exceptions.IllegalCharacterError:
            wb.append(list(my_row.values()))
        # we add the medias
        for media in medias:
            wb.append([" ", " ", media[5], media[7]])
            rows += 1

        wb.append([])
        rows += 1
    wrkbk.save(book[0] + ".xlsx")
    wrkbk.close()

print("Done")
mydb.close()
print("Closed")
print("Zero ids : " + str(zero_ids))
	import openpyxl
	from openpyxl import Workbook, load_workbook
	from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
	from openpyxl.workbook import workbook
	import mysql.connector

	mydb = mysql.connector.connect(
	host="",
	user="",
	password="",
	database="omekas"
	)

	mycursor = mydb.cursor()

	# main query. This gets the resource_ids we will later use
	# the important parameters are AND value2.value REGEXP (%s)
	# and AND value1.value LIKE %s
	# value2.value REGEXP (%s) checks if the value is one of the properties we'll pass later "[1901\|1902]" this is used to check that the item was published in a given year
	# value1.value is the name
	resource_ids_query = """
	SELECT DISTINCT value1.resource_id
	FROM value value1
	JOIN value value2
	ON value1.resource_id = value2.resource_id
	JOIN value value3
	ON value3.resource_id = value1.resource_id
	WHERE
	value1.property_id = 1
	AND value2.value REGEXP (%s)
	AND value1.value LIKE %s
	AND value2.property_id = 7
	AND value3.property_id = 8;
	"""
	# give a list of years, and a name
	# returns the ids of the issues we look for

	values_query = """
	SELECT *
	FROM value
	WHERE value.resource_id = %s
	ORDER BY value.property_id ASC;
	"""

	medias_query = "SELECT * FROM media WHERE item_id = %s"

	# example, get all books called u nutiziale from years 1901 or 1902
	to_extract = [
	["U Nutiziale", "[1901\|1902]"]
	]

	zero_ids = []

	for book in to_extract:
	wrkbk = workbook.Workbook()
	wb = wrkbk.active

	# wb.append(["db_id", "Resource_id", "Property", "Value", "Medias", "Storage Id"])
	wb.append(["db_id", "Resource_id", "Medias", "Storage Id"])

	print("Treating " + book[0])
	mycursor.execute(resource_ids_query, (book[1], "%"+book[0]+"%"))
	ids = mycursor.fetchall()
	print("Got " + str(len(ids)) + " ids")
	if len(ids) == 0:
	zero_ids.append(book[0])
	mycursor.reset()
	columns_dictionnary = {}
	# where we start from
	rows = 2
	columns = 4

	for book_id in ids:
	print("Treating " + str(book_id[0]) + " From book " + book[0])
	mycursor.execute(values_query, (book_id[0],))
	values = mycursor.fetchall()
	mycursor.execute(medias_query, (book_id[0],))
	medias = mycursor.fetchall()
	print("Medias associated : " + str(len(medias)))

	mycursor.reset()

	my_row = {"db_id": "", "Resource_id": "", "Medias": "", "Storage Id": ""}

	for value in values:

	# more human-readable approach
	my_row["db_id"] = value[0] #fck memory lol
	my_row["Resource_id"] = value[1] #how complex is that uh big boy

	if my_row.get(str(value[2])) is None:
	# we add the column key to the dictionnary

	columns_dictionnary[str(value[2])] = columns+1
	my_row[str(value[2])] = ILLEGAL_CHARACTERS_RE.sub(r'', str(value[6]))
	# after we create a column, we increment their count
	columns += 1

	else:

	my_row[str(value[2])] += ("$" + str(ILLEGAL_CHARACTERS_RE.sub(r'', str(value[6]))))
	try:
	wb.append(list(my_row.values()))
	except openpyxl.utils.exceptions.IllegalCharacterError:
	wb.append(list(my_row.values()))
	# we add the medias
	for media in medias:
	wb.append([" ", " ", media[5], media[7]])
	rows += 1

	wb.append([])
	rows += 1
	wrkbk.save(book[0] + ".xlsx")
	wrkbk.close()

	print("Done")
	mydb.close()
	print("Closed")
	print("Zero ids : " + str(zero_ids))