Created
February 15, 2024 14:54
-
-
Save saattrupdan/91c3fd53ceae252dd54439b45736c2e0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import requests as rq | |
from PyPDF2 import PdfReader | |
import io | |
import re | |
from tqdm.auto import tqdm | |
from datasets import Dataset | |
def new_record(test_type: str, year: int, version: str) -> dict:
    """Create a blank question record tagged with the given test metadata.

    Args:
        test_type: Name of the test the question belongs to.
        year: Year of the test.
        version: Version label of the test within that year.

    Returns:
        A new dict whose question and option fields are ``None``, whose
        ``answer`` is the empty string, and whose ``test_type``, ``year`` and
        ``version`` hold the arguments.
    """
    # Fields that get filled in while the test PDF is parsed start out as None.
    unfilled_fields = ("question_id", "question", "option_a", "option_b", "option_c")
    record = dict.fromkeys(unfilled_fields)
    record["answer"] = ""
    record.update(test_type=test_type, year=year, version=version)
    return record
def record_done_except_answer(record: dict) -> bool:
    """Tell whether a record has its ID, question text and options A and B.

    ``option_c`` and ``answer`` are not inspected.

    Args:
        record: A question record with the keys ``question_id``, ``question``,
            ``option_a`` and ``option_b``.

    Returns:
        ``True`` when none of the four inspected fields is ``None``.
    """
    required_fields = ("question_id", "question", "option_a", "option_b")
    return all(record[field] is not None for field in required_fields)
# Line classifiers for the question PDF; patterns are anchored at the start
# of the (stripped) line because they are used with ``.match``.
_HEADER_LINE = re.compile(r'[0-9]+ ·.*')
_QUESTION_START_LINE = re.compile(r'[0-9]+\..*')
_TEXT_LINE = re.compile(r'[a-zæøåA-ZÆØÅ]{2,}')
_OPTION_LINE = re.compile(r'[ABC]:.*')
# Answer-sheet line: a question number, one or more spaces, then the letter.
_ANSWER_KEY_LINE = re.compile(r'([0-9]+) +([ABC])')


def _download_pdf_pages(url: str) -> list:
    """Download the PDF at ``url`` and return the extracted text of each page."""
    response = rq.get(url, timeout=60)
    # Fail on HTTP errors here rather than feeding an error page to the parser.
    response.raise_for_status()
    reader = PdfReader(io.BytesIO(response.content))
    return [page.extract_text() for page in reader.pages]


def _normalise_question(text: str) -> str:
    """Collapse runs of spaces, drop spaces before '?' and strip the ends."""
    text = re.sub(r' +', ' ', text)
    text = re.sub(r' *\?', '?', text)
    return text.strip()


def _parse_question_records(pages: list, test_type: str, year: int, version: str) -> list:
    """Turn the page texts of a test PDF into a list of question records."""
    records = []
    record = new_record(test_type=test_type, year=year, version=version)

    # The first two pages are skipped (presumably cover/instructions - confirm).
    for page in pages[2:]:
        for raw_line in page.split('\n'):
            line = raw_line.strip()
            if not line or _HEADER_LINE.match(line):
                continue

            if _QUESTION_START_LINE.match(line):
                if record_done_except_answer(record):
                    records.append(record)
                # Always start from a blank record so that no option of a
                # previous (possibly incomplete) question can carry over.
                record = new_record(test_type=test_type, year=year, version=version)
                record["question_id"] = int(re.search(r'^[0-9]+(?=\.)', line).group())
                record['question'] = _normalise_question(re.sub(r'^[0-9]+\.', '', line))

            elif _TEXT_LINE.match(line):
                # Text before the first numbered question has nothing to
                # continue, so it is ignored instead of raising.
                if record['question'] is None:
                    continue
                # Join with a space: both sides are stripped, so plain
                # concatenation would fuse the words around the line break.
                # NOTE(review): a wrapped option line also lands here and is
                # appended to the question text - confirm against the PDFs.
                record['question'] = _normalise_question(record['question'] + ' ' + line)

            elif _OPTION_LINE.match(line):
                option_letter = line[0].lower()
                record[f'option_{option_letter}'] = re.sub(r'^[ABC]+:', '', line).strip()

    # The last question is never followed by another question start.
    if record_done_except_answer(record):
        records.append(record)
    return records


def _iter_answer_key(pages: list):
    """Yield ``(question_id, answer_letter)`` pairs found in answer-sheet pages."""
    for page in pages:
        for raw_line in page.split('\n'):
            found = _ANSWER_KEY_LINE.match(raw_line.strip())
            if found is not None:
                yield int(found.group(1)), found.group(2)


def build_dataset(test_type: str, year: int, version: str, test_url: str, answer_url: str) -> pd.DataFrame:
    """Build a table of questions, options and answers for a single test.

    The test PDF is downloaded and parsed into one record per question, then
    the answer-sheet PDF is downloaded and each answer letter is written into
    the ``answer`` column of the matching question.

    Args:
        test_type: Name of the test, stored in every row.
        year: Year of the test, stored in every row.
        version: Version label of the test, stored in every row.
        test_url: URL of the PDF containing the questions.
        answer_url: URL of the PDF containing the answer sheet.

    Returns:
        A data frame indexed by ``question_id`` with the columns produced by
        ``new_record``. Questions without an answer-sheet entry keep ``""``
        as their answer.

    Raises:
        RuntimeError: If no question could be parsed from the test PDF, or if
            the answer sheet mentions a question ID that was not parsed.
        requests.HTTPError: If either download returns an HTTP error status.
    """
    records = _parse_question_records(
        _download_pdf_pages(test_url),
        test_type=test_type,
        year=year,
        version=version,
    )
    df = pd.DataFrame.from_records(records)
    try:
        df.set_index('question_id', inplace=True)
    except KeyError as err:
        # An empty record list gives a frame without any columns.
        raise RuntimeError(
            f"Could not find 'question_id' column in the {test_type} test "
            f"from year {year}, version {version}"
        ) from err

    for question_id, answer in _iter_answer_key(_download_pdf_pages(answer_url)):
        if question_id not in df.index:
            raise RuntimeError(
                f"Couldn't find the question ID {question_id} in the {test_type} test "
                f"from year {year}, version {version}"
            )
        df.loc[question_id, "answer"] = answer
    return df
# One entry per test to scrape. The keys of every entry are exactly the
# keyword arguments of `build_dataset`, which is called with `**entry`.
tests = [
    {
        "test_type": "indfødsretsprøven",
        "year": 2020,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/9835/indfoedsretsproeven-2020-06.pdf",
        "answer_url": "https://danskogproever.dk/media/9836/indfoedsretsproeven-2020-06-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2020,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/9838/indfoedsretsproeven-2020-11.pdf",
        "answer_url": "https://danskogproever.dk/media/9837/indfoedsretsproeven-2020-11-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2021,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/9830/indfoedsretsproeven-2021-06.pdf",
        "answer_url": "https://danskogproever.dk/media/10033/indfoedsretsproeven-2021-06-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2021,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/9833/indfoedsretsproeven-2021-11.pdf",
        "answer_url": "https://danskogproever.dk/media/9834/indfoedsretsproeven-2021-11-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2022,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/9828/indfoedsretsproeven-2022-06.pdf",
        "answer_url": "https://danskogproever.dk/media/9829/indfoedsretsproeven-2022-06-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2022,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/9831/indfoedsretsproeven-2022-11.pdf",
        "answer_url": "https://danskogproever.dk/media/9832/indfoedsretsproeven-2022-11-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2023,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10032/indfoedsretsproeven-sommer-2023.pdf",
        "answer_url": "https://danskogproever.dk/media/10031/retteark-indfoedsretsproeven-sommer-2023.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2023,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/11356/indfoedsretsproeven-vinter-2023.pdf",
        "answer_url": "https://danskogproever.dk/media/11352/indfoedsretsproeven-vinter-2023-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2016,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10050/medborgerskabsproeven-vinter-2016.pdf",
        "answer_url": "https://danskogproever.dk/media/10048/medborgerskabsproeven-vinter-2016-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2017,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10038/medborgerskabsproeven-sommer-2017.pdf",
        "answer_url": "https://danskogproever.dk/media/10037/medborgerskabsproeven-sommer-2017-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2017,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10051/medborgerskabsproeven-vinter-2017.pdf",
        "answer_url": "https://danskogproever.dk/media/10049/medborgerskabsproeven-vinter-2017-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2018,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10040/medborgerskabsproeven-sommer-2018.pdf",
        "answer_url": "https://danskogproever.dk/media/10035/medborgerskabsproeven-sommer-2018-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2018,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10053/medborgerskabsproeven-vinter-2018.pdf",
        "answer_url": "https://danskogproever.dk/media/10052/medborgerskabsproeven-vinter-2018-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2019,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10042/medborgerskabsproeven-sommer-2019.pdf",
        "answer_url": "https://danskogproever.dk/media/10041/medborgerskabsproeven-sommer-2019-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2019,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10055/medborgerskabsproeven-vinter-2019.pdf",
        "answer_url": "https://danskogproever.dk/media/10054/medborgerskabsproeven-vinter-2019-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2020,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10044/medborgerskabsproeven-sommer-2020.pdf",
        "answer_url": "https://danskogproever.dk/media/10043/medborgerskabsproeven-sommer-2020-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2020,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10057/medborgerskabsproeven-vinter-2020.pdf",
        "answer_url": "https://danskogproever.dk/media/10056/medborgerskabsproeven-vinter-2020-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2021,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10047/medborgerskabsproeven-sommer-2021.pdf",
        "answer_url": "https://danskogproever.dk/media/10045/medborgerskabsproeven-sommer-2021-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2021,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10061/medborgerskabsproeven-vinter-2021.pdf",
        "answer_url": "https://danskogproever.dk/media/10058/medborgerskabsproeven-vinter-2021-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2022,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10039/medborgerskabsproeven-s22.pdf",
        "answer_url": "https://danskogproever.dk/media/10036/medborgerskabsproeven-retteark-s22.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2022,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10060/medborgerskabsproeven-vinter-2022.pdf",
        "answer_url": "https://danskogproever.dk/media/10062/retteark-medborgerskabsproeven-vinter-2022.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2023,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10046/medborgerskabsproeven-sommer-2023.pdf",
        "answer_url": "https://danskogproever.dk/media/11349/retteark-medborgerskabsproeve-sommer-2023.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2023,
        "version": "winter",
        # NOTE(review): "medborgerkskabsproeve" is reproduced exactly as given;
        # verify against the remote file name before "correcting" it.
        "test_url": "https://danskogproever.dk/media/11357/medborgerkskabsproeve-vinter-2023.pdf",
        "answer_url": "https://danskogproever.dk/media/11358/medborgerskabsproeve-vinter-2023-retteark.pdf",
    },
]
# Scrape every configured test and stack the per-test frames into one table
# with a fresh 0..n-1 index; `question_id` is turned back into a column first.
df = pd.concat(
    [build_dataset(**test).reset_index() for test in tqdm(tests)],
    ignore_index=True,
)

# Fix the column order of the published dataset.
df = df.loc[
    :,
    [
        "question",
        "option_a",
        "option_b",
        "option_c",
        "answer",
        "test_type",
        "year",
        "version",
        "question_id",
    ],
]

# Convert to a Hugging Face dataset and upload it as a private repository.
dataset = Dataset.from_pandas(df)
dataset.push_to_hub('alexandrainst/danish-citizen-tests', private=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment