Skip to content

Instantly share code, notes, and snippets.

@saattrupdan
Created February 15, 2024 14:54
Show Gist options
  • Save saattrupdan/91c3fd53ceae252dd54439b45736c2e0 to your computer and use it in GitHub Desktop.
Save saattrupdan/91c3fd53ceae252dd54439b45736c2e0 to your computer and use it in GitHub Desktop.
import pandas as pd
import requests as rq
from PyPDF2 import PdfReader
import io
import re
from tqdm.auto import tqdm
from datasets import Dataset
def new_record(test_type: str, year: int, version: str) -> dict:
    """Create an empty question record carrying the metadata of one test.

    Args:
        test_type: Name of the test the question belongs to.
        year: Year of the test.
        version: Version label of the test within that year.

    Returns:
        A new dict in which ``question_id``, ``question`` and the three
        options are ``None``, ``answer`` is the empty string, and
        ``test_type``, ``year`` and ``version`` are copied from the arguments.
    """
    return dict(
        question_id=None,
        question=None,
        option_a=None,
        option_b=None,
        option_c=None,
        answer="",
        test_type=test_type,
        year=year,
        version=version,
    )
def record_done_except_answer(record: dict) -> bool:
    """Tell whether a record holds everything needed apart from its answer.

    A record qualifies once its question ID, question text, option A and
    option B are all set. Option C and the answer are not inspected here
    (presumably because some questions only offer two options - confirm).

    Args:
        record: A question record as produced by ``new_record``.

    Returns:
        ``True`` if none of the four required fields is ``None``.

    Raises:
        KeyError: If a required field is missing from ``record`` altogether.
    """
    required_fields = ("question_id", "question", "option_a", "option_b")
    return all(record[field] is not None for field in required_fields)
# Seconds to wait on a PDF download before giving up.
_DOWNLOAD_TIMEOUT = 60

# Line classifiers; every pattern is applied with ``.match`` and is therefore
# anchored at the start of the (already stripped) line.
_PAGE_HEADER = re.compile(r'[0-9]+ ·')
_QUESTION_START = re.compile(r'([0-9]+)\.(.*)')
_TEXT_LINE = re.compile(r'[a-zæøåA-ZÆØÅ]{2,}')
_OPTION = re.compile(r'([ABC]):(.*)')
_ANSWER_KEY = re.compile(r'([0-9]+) +([ABC])')


def _fetch_pdf_pages(url: str) -> list:
    """Download the PDF at ``url`` and return the extracted text of each page."""
    response = rq.get(url, timeout=_DOWNLOAD_TIMEOUT)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # PDF parser, which would only produce an obscure parsing failure.
    response.raise_for_status()
    reader = PdfReader(io.BytesIO(response.content))
    return [page.extract_text() or "" for page in reader.pages]


def _clean_question(text: str) -> str:
    """Collapse runs of spaces and remove spaces in front of question marks."""
    text = re.sub(r' +', ' ', text)
    text = re.sub(r' *\?', '?', text)
    return text.strip()


def _parse_question_records(pages: list, test_type: str, year: int, version: str) -> list:
    """Turn the text of the question pages into a list of question records."""
    records = []
    record = new_record(test_type=test_type, year=year, version=version)
    for page in pages:
        for raw_line in page.split('\n'):
            line = raw_line.strip()
            if not line or _PAGE_HEADER.match(line):
                continue

            question_start = _QUESTION_START.match(line)
            if question_start:
                if record_done_except_answer(record):
                    records.append(record)
                # Always start from a clean record so that options of an
                # abandoned, incomplete question cannot leak into this one.
                record = new_record(test_type=test_type, year=year, version=version)
                record["question_id"] = int(question_start.group(1))
                record["question"] = _clean_question(question_start.group(2))
                continue

            if _TEXT_LINE.match(line):
                # Text before the first numbered question has nothing to
                # continue, so it is ignored.
                if record["question"] is None:
                    continue
                # NOTE(review): a text line that follows an option is also
                # appended to the question; if options can wrap over several
                # lines this should target the last option instead - confirm
                # against the PDFs.
                # The stored question is stripped, so the separating space
                # has to be inserted explicitly or the words get fused.
                record["question"] = _clean_question(f'{record["question"]} {line}')
                continue

            option = _OPTION.match(line)
            if option:
                record[f'option_{option.group(1).lower()}'] = option.group(2).strip()

    # The last question is not followed by another one, so flush it here.
    if record_done_except_answer(record):
        records.append(record)
    return records


def _parse_answer_key(pages: list) -> list:
    """Return ``(question_id, answer_letter)`` pairs found on the answer sheet."""
    answers = []
    for page in pages:
        for raw_line in page.split('\n'):
            answer_line = _ANSWER_KEY.match(raw_line.strip())
            if answer_line:
                answers.append((int(answer_line.group(1)), answer_line.group(2)))
    return answers


def build_dataset(test_type: str, year: int, version: str, test_url: str, answer_url: str) -> pd.DataFrame:
    """Download one test and its answer sheet and combine them into a frame.

    Args:
        test_type: Name of the test, stored on every row.
        year: Year of the test, stored on every row.
        version: Version label of the test, stored on every row.
        test_url: URL of the PDF containing the questions and options.
        answer_url: URL of the PDF containing the correct answers.

    Returns:
        A DataFrame indexed by ``question_id`` with the columns ``question``,
        ``option_a``, ``option_b``, ``option_c``, ``answer``, ``test_type``,
        ``year`` and ``version``. Questions for which the answer sheet has no
        matching line keep the empty string as their answer.

    Raises:
        RuntimeError: If no question could be parsed from the test PDF, or if
            the answer sheet mentions a question ID that was not parsed.
        requests.HTTPError: If either download returns an HTTP error status.
    """
    # The first two pages are skipped (presumably cover and instructions -
    # confirm against the PDFs).
    question_pages = _fetch_pdf_pages(test_url)[2:]
    records = _parse_question_records(
        question_pages, test_type=test_type, year=year, version=version
    )
    if not records:
        # Without records the frame would have no 'question_id' column.
        raise RuntimeError(
            f"Could not find 'question_id' column in the {test_type} test "
            f"from year {year}, version {version}"
        )
    df = pd.DataFrame.from_records(records).set_index('question_id')

    for question_id, answer in _parse_answer_key(_fetch_pdf_pages(answer_url)):
        if question_id not in df.index:
            raise RuntimeError(
                f"Couldn't find the question ID {question_id} in the {test_type} test "
                f"from year {year}, version {version}"
            )
        df.loc[question_id, "answer"] = answer
    return df
# One entry per published test: the keys match the keyword arguments of
# ``build_dataset`` (test name, year, version and the URLs of the question PDF
# and of its answer sheet).
tests = [
    {
        "test_type": "indfødsretsprøven",
        "year": 2020,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/9835/indfoedsretsproeven-2020-06.pdf",
        "answer_url": "https://danskogproever.dk/media/9836/indfoedsretsproeven-2020-06-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2020,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/9838/indfoedsretsproeven-2020-11.pdf",
        "answer_url": "https://danskogproever.dk/media/9837/indfoedsretsproeven-2020-11-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2021,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/9830/indfoedsretsproeven-2021-06.pdf",
        "answer_url": "https://danskogproever.dk/media/10033/indfoedsretsproeven-2021-06-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2021,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/9833/indfoedsretsproeven-2021-11.pdf",
        "answer_url": "https://danskogproever.dk/media/9834/indfoedsretsproeven-2021-11-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2022,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/9828/indfoedsretsproeven-2022-06.pdf",
        "answer_url": "https://danskogproever.dk/media/9829/indfoedsretsproeven-2022-06-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2022,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/9831/indfoedsretsproeven-2022-11.pdf",
        "answer_url": "https://danskogproever.dk/media/9832/indfoedsretsproeven-2022-11-retteark.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2023,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10032/indfoedsretsproeven-sommer-2023.pdf",
        "answer_url": "https://danskogproever.dk/media/10031/retteark-indfoedsretsproeven-sommer-2023.pdf",
    },
    {
        "test_type": "indfødsretsprøven",
        "year": 2023,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/11356/indfoedsretsproeven-vinter-2023.pdf",
        "answer_url": "https://danskogproever.dk/media/11352/indfoedsretsproeven-vinter-2023-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2016,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10050/medborgerskabsproeven-vinter-2016.pdf",
        "answer_url": "https://danskogproever.dk/media/10048/medborgerskabsproeven-vinter-2016-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2017,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10038/medborgerskabsproeven-sommer-2017.pdf",
        "answer_url": "https://danskogproever.dk/media/10037/medborgerskabsproeven-sommer-2017-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2017,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10051/medborgerskabsproeven-vinter-2017.pdf",
        "answer_url": "https://danskogproever.dk/media/10049/medborgerskabsproeven-vinter-2017-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2018,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10040/medborgerskabsproeven-sommer-2018.pdf",
        "answer_url": "https://danskogproever.dk/media/10035/medborgerskabsproeven-sommer-2018-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2018,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10053/medborgerskabsproeven-vinter-2018.pdf",
        "answer_url": "https://danskogproever.dk/media/10052/medborgerskabsproeven-vinter-2018-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2019,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10042/medborgerskabsproeven-sommer-2019.pdf",
        "answer_url": "https://danskogproever.dk/media/10041/medborgerskabsproeven-sommer-2019-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2019,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10055/medborgerskabsproeven-vinter-2019.pdf",
        "answer_url": "https://danskogproever.dk/media/10054/medborgerskabsproeven-vinter-2019-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2020,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10044/medborgerskabsproeven-sommer-2020.pdf",
        "answer_url": "https://danskogproever.dk/media/10043/medborgerskabsproeven-sommer-2020-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2020,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10057/medborgerskabsproeven-vinter-2020.pdf",
        "answer_url": "https://danskogproever.dk/media/10056/medborgerskabsproeven-vinter-2020-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2021,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10047/medborgerskabsproeven-sommer-2021.pdf",
        "answer_url": "https://danskogproever.dk/media/10045/medborgerskabsproeven-sommer-2021-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2021,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10061/medborgerskabsproeven-vinter-2021.pdf",
        "answer_url": "https://danskogproever.dk/media/10058/medborgerskabsproeven-vinter-2021-retteark.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2022,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10039/medborgerskabsproeven-s22.pdf",
        "answer_url": "https://danskogproever.dk/media/10036/medborgerskabsproeven-retteark-s22.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2022,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/10060/medborgerskabsproeven-vinter-2022.pdf",
        "answer_url": "https://danskogproever.dk/media/10062/retteark-medborgerskabsproeven-vinter-2022.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2023,
        "version": "summer",
        "test_url": "https://danskogproever.dk/media/10046/medborgerskabsproeven-sommer-2023.pdf",
        "answer_url": "https://danskogproever.dk/media/11349/retteark-medborgerskabsproeve-sommer-2023.pdf",
    },
    {
        "test_type": "medborgerskabsprøven",
        "year": 2023,
        "version": "winter",
        "test_url": "https://danskogproever.dk/media/11357/medborgerkskabsproeve-vinter-2023.pdf",
        "answer_url": "https://danskogproever.dk/media/11358/medborgerskabsproeve-vinter-2023-retteark.pdf",
    },
]
# Build one frame per test (turning the ``question_id`` index back into a
# regular column) and stack them under a fresh 0..n-1 index.
df = pd.concat(
    [build_dataset(**test).reset_index() for test in tqdm(tests)],
    ignore_index=True,
)

# Put the columns in the order they should have in the published dataset.
df = df.loc[
    :,
    [
        "question",
        "option_a",
        "option_b",
        "option_c",
        "answer",
        "test_type",
        "year",
        "version",
        "question_id",
    ],
]

# Convert to a Hugging Face dataset and upload it as a private repository.
dataset = Dataset.from_pandas(df)
dataset.push_to_hub('alexandrainst/danish-citizen-tests', private=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment