GitHub gists by Dan Saattrup Smart (saattrupdan)
@saattrupdan
saattrupdan / install_nvcc_with_cuda128.sh
Last active September 6, 2025 12:51
Install NVCC with CUDA 12.8
#!/bin/bash
# Add NVIDIA's CUDA apt repository via the signed keyring package.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-8
# Put the CUDA binaries and libraries on the PATH for future shells.
echo 'CUDA_VERSION="12.8"' >> ~/.bashrc
echo 'export PATH=/usr/local/cuda-${CUDA_VERSION}/bin${PATH:+:${PATH}}' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-${CUDA_VERSION}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc
source ~/.bashrc
echo "Done! Here's the NVCC version:"
nvcc --version
@saattrupdan
saattrupdan / openai_generation.py
Last active October 29, 2024 12:44
OpenAI usage
"""Example use of generating with OpenAI models.
Installation:
$ pip install openai
"""
from openai import OpenAI
from openai.types.chat import (
ChatCompletionMessageParam,
ChatCompletionSystemMessageParam,
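The gist preview stops at the imports above. The following is a minimal sketch of how such a generation script typically continues, assuming an OPENAI_API_KEY environment variable; the model name ("gpt-4o-mini"), the prompt texts and the extra ChatCompletionUserMessageParam import are illustrative assumptions rather than the gist's actual contents.

import os

from openai import OpenAI
from openai.types.chat import (
    ChatCompletionMessageParam,
    ChatCompletionSystemMessageParam,
    ChatCompletionUserMessageParam,
)

# The client picks up the API key from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Build a typed message list using the imported TypedDict parameter classes.
messages: list[ChatCompletionMessageParam] = [
    ChatCompletionSystemMessageParam(role="system", content="You are a helpful assistant."),
    ChatCompletionUserMessageParam(role="user", content="Say hello in Danish."),
]

# Request a chat completion and print the generated reply.
completion = client.chat.completions.create(model="gpt-4o-mini", messages=messages)
print(completion.choices[0].message.content)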
import pandas as pd
import requests as rq
from PyPDF2 import PdfReader
import io
import re
from tqdm.auto import tqdm
from datasets import Dataset


def new_record(test_type: str, year: int, version: str) -> dict:
    # The preview is cut off here; a minimal completion returns the fields in scope.
    return dict(test_type=test_type, year=year, version=version)
@saattrupdan
saattrupdan / parse_dacoref.py
'''Convert a CONLLU file with coreference data to a JSONL file with clusters.

Usage:
    python parse_dacoref.py <input_file>

Author:
    Dan Saattrup Nielsen (dan.nielsen@alexandra.dk)
'''
import conllu
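Only the docstring and the first import survive in the preview. As an illustration of the conversion the docstring describes, here is a minimal sketch rather than the gist's actual logic: it reads sentences with the conllu package and writes one JSON line per coreference cluster. The MISC key holding the cluster id ("coref" below) and the output filename are assumptions; DaCoref's real annotation layout may differ.

import json
import sys

import conllu


def main(input_file: str) -> None:
    # Collect the surface form of every token that carries a coreference cluster id.
    clusters: dict[str, list[str]] = {}
    with open(input_file, encoding="utf-8") as f:
        for sentence in conllu.parse_incr(f):
            for token in sentence:
                misc = token.get("misc") or {}
                cluster_id = misc.get("coref")  # assumed name of the MISC key
                if cluster_id is not None:
                    clusters.setdefault(str(cluster_id), []).append(token["form"])

    # Write one JSON object per cluster, one object per line.
    with open("clusters.jsonl", "w", encoding="utf-8") as f_out:
        for cluster_id, mentions in clusters.items():
            f_out.write(json.dumps({"cluster": cluster_id, "mentions": mentions}) + "\n")


if __name__ == "__main__":
    main(sys.argv[1])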
@saattrupdan
saattrupdan / download_ftspeech.py
Last active December 12, 2022 16:04
This downloads the FTSpeech corpus on a server where only a terminal is available.
"""Downloads the FTSpeech corpus."""
from selenium import webdriver
from selenium.webdriver.common.by import By
from pathlib import Path
import time
from getpass import getpass
def main() -> None:
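The preview ends at the empty main() definition. Below is a minimal, hedged sketch of the kind of headless-browser flow the description implies, not the gist's actual implementation: the FTSpeech URL, the element ids and the ten-second wait are placeholder assumptions.

import time
from getpass import getpass
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.common.by import By


def main() -> None:
    # Run Chrome headless so the script works on a server without a display.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")

    # Route downloads to a local directory instead of the default ~/Downloads.
    download_dir = Path("data").absolute()
    download_dir.mkdir(exist_ok=True)
    options.add_experimental_option("prefs", {"download.default_directory": str(download_dir)})

    driver = webdriver.Chrome(options=options)
    driver.get("https://ftspeech.github.io/")  # assumed landing page

    # Log in with credentials typed at the terminal; the element ids are assumptions.
    driver.find_element(By.ID, "username").send_keys(input("Username: "))
    driver.find_element(By.ID, "password").send_keys(getpass("Password: "))
    driver.find_element(By.ID, "login-button").click()

    # Give the download time to start before the driver is closed.
    time.sleep(10)
    driver.quit()


if __name__ == "__main__":
    main()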
@saattrupdan
saattrupdan / create_danfever_splits.py
Last active December 1, 2022 07:09
The DanFEVER dataset (https://huggingface.co/datasets/strombergnlp/danfever) only comes with a training split, making evaluations on it non-reproducible. This gist creates validation and test splits in a deterministic fashion.
from datasets import load_dataset
# Load the DanFEVER dataset
dataset = load_dataset("strombergnlp/danfever", split="train")
# Convert the dataset to a Pandas DataFrame
df = dataset.to_pandas()
# Get the unique `evidence_extract` values along with their counts
evidence_extract_counts = df.evidence_extract.value_counts()
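The preview stops after counting the evidence passages. A minimal sketch of one way to finish the job deterministically follows; the 80/10/10 ratio, the seed 4242 and the grouping-by-evidence strategy are assumptions for illustration rather than the gist's actual choices.

import random

from datasets import Dataset, DatasetDict, load_dataset

dataset = load_dataset("strombergnlp/danfever", split="train")
df = dataset.to_pandas()

# Shuffle the unique evidence passages with a fixed seed, so that every claim sharing
# an evidence passage ends up in the same split and the result is reproducible.
evidence_values = sorted(df.evidence_extract.unique())
random.Random(4242).shuffle(evidence_values)

# Reserve roughly 10% of the evidence passages for validation and 10% for testing.
num_val = len(evidence_values) // 10
val_evidence = set(evidence_values[:num_val])
test_evidence = set(evidence_values[num_val : 2 * num_val])

splits = DatasetDict(
    train=Dataset.from_pandas(
        df[~df.evidence_extract.isin(val_evidence | test_evidence)], preserve_index=False
    ),
    val=Dataset.from_pandas(df[df.evidence_extract.isin(val_evidence)], preserve_index=False),
    test=Dataset.from_pandas(df[df.evidence_extract.isin(test_evidence)], preserve_index=False),
)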
@saattrupdan
saattrupdan / create-danish-wit.py
Created November 14, 2022 10:50
Create Danish WIT
"""Unpack the WIT dataset and extract the Danish samples."""
from datasets.arrow_dataset import Example
from datasets.dataset_dict import DatasetDict
from datasets.load import load_dataset
from pathlib import Path
from tqdm.auto import tqdm
import re
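The preview again stops at the imports. A minimal sketch of the extraction step the title describes follows, using streaming so the full WIT download is never materialised; the Hub dataset id "google/wit" and the "language" column name are assumptions, and the original gist works from locally unpacked WIT files instead.

from datasets import load_dataset

# Stream the dataset so only the matching rows ever need to be downloaded.
wit = load_dataset("google/wit", split="train", streaming=True)

# Keep only the rows annotated as Danish; "language" is an assumed column name.
danish_wit = wit.filter(lambda example: example["language"] == "da")

# Inspect a handful of Danish samples.
for example in danish_wit.take(5):
    print(example)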