Kasper Junge (kasperjunge)

@kasperjunge
kasperjunge / pydantic_to_openai_function_calling.py
Created January 25, 2024 20:57
Pydantic to OpenAI Function Calling
from pydantic import BaseModel, Field
import json

class ReceiptDataExtractor(BaseModel):
    date: str = Field(description="The date.")
    amount: float = Field(description="The total amount.")
    supplier: str = Field(description="The supplier.")

    class Config:
        function_meta = {
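            # Assumed completion (the gist preview is truncated here): metadata for
            # an OpenAI function-calling spec; the exact keys are my assumption.
            "name": "extract_receipt_data",
            "description": "Extract the date, total amount and supplier from a receipt.",
        }

# Illustrative usage (an assumption, not the gist's code): merge the metadata with
# the model's JSON schema to form an OpenAI function definition (Pydantic v1 API;
# use model_json_schema() on Pydantic v2).
function_definition = {
    **ReceiptDataExtractor.Config.function_meta,
    "parameters": ReceiptDataExtractor.schema(),
}
print(json.dumps(function_definition, indent=2))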
@kasperjunge
kasperjunge / chunk_df.py
Last active April 18, 2023 15:32
Chunk dataframe generator
from typing import Iterator

import pandas as pd

def chunk_dataframe(df: pd.DataFrame, chunk_size: int) -> Iterator[pd.DataFrame]:
    # Yield consecutive slices of at most chunk_size rows.
    for i in range(0, len(df), chunk_size):
        yield df[i:i + chunk_size]
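A quick usage sketch (the example frame and chunk size are illustrative):

df = pd.DataFrame({"value": range(10)})
for chunk in chunk_dataframe(df, chunk_size=4):
    print(len(chunk))  # 4, 4, 2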
@kasperjunge
kasperjunge / gist:4ec27724272109b9a413e4d1477ef7cd
Created December 8, 2022 07:36
ChatGPT answer to: Implement me a hyper-optimized service for serving predictions by transformer models
from fastapi import FastAPI
import transformers
import torch
app = FastAPI()
# Load the tokenizer and model
tokenizer = transformers.AutoTokenizer.from_pretrained("model_name")
model = transformers.AutoModelForTokenClassification.from_pretrained("model_name")
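The preview stops after the model is loaded. A minimal sketch of a prediction endpoint that could follow, assuming a token-classification model and a plain JSON request body (the route name, request schema, and label mapping are assumptions, not the gist's code):

from pydantic import BaseModel

class PredictionRequest(BaseModel):
    text: str

@app.post("/predict")
def predict(request: PredictionRequest):
    # Tokenize the input and run the model without gradient tracking.
    inputs = tokenizer(request.text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Map each token's highest-scoring class id to its label name.
    predicted_ids = outputs.logits.argmax(dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    labels = [model.config.id2label[i] for i in predicted_ids]
    return {"tokens": tokens, "labels": labels}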
@kasperjunge
kasperjunge / measure_function_speed.py
Created July 29, 2022 14:35
Measure function speed 🏃‍♂️
import time
import statistics
from typing import Callable, Any, Tuple

def time_function(func: Callable, func_input: Any, n_runs: int) -> Tuple[float]:
    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        func(func_input)
        times.append(time.perf_counter() - start)
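    # Assumed completion (the gist preview is truncated here): return summary
    # statistics of the collected timings, matching the statistics import above.
    return statistics.mean(times), statistics.stdev(times)

# Example usage (illustrative): time sorted() on a 1,000-element list over 100 runs.
mean_s, std_s = time_function(sorted, list(range(1_000)), n_runs=100)
print(f"mean={mean_s:.6f}s, std={std_s:.6f}s")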
@kasperjunge
kasperjunge / print_wordpiece_tokens.py
Last active August 21, 2022 14:08
Tokenize text and print encoded + decoded wordpiece tokens.
from transformers import AutoTokenizer
# define sample text
text = "Rødgrød med fløde."
# init tokenizer
model_id = "Maltehb/danish-bert-botxo"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# encode text
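# Assumed continuation (the gist preview is truncated here): encode the text and
# print the token ids next to the decoded wordpiece tokens.
input_ids = tokenizer.encode(text)
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(input_ids)
print(tokens)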
@kasperjunge
kasperjunge / tokenizer_huggingface_dataset.py
Last active July 29, 2022 09:32
Tokenize Hugging Face Dataset or DatasetDict.
from typing import Union

from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

def tokenize_huggingface_dataset(
    ds: Union[Dataset, DatasetDict],
    tokenizer: AutoTokenizer,
    max_length: int = 512,
    truncation: bool = True,
) -> Union[Dataset, DatasetDict]:
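    # Assumed body (the gist preview is truncated here): batch-map the tokenizer
    # over a "text" column; the column name is an assumption.
    return ds.map(
        lambda batch: tokenizer(
            batch["text"], max_length=max_length, truncation=truncation
        ),
        batched=True,
    )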
@kasperjunge
kasperjunge / dataframe_to_huggingface_dataset.py
Created July 29, 2022 08:33
Convert a dataset stored as a pandas dataframe to a Hugging Face DatasetDict.
import pandas as pd
from datasets import Dataset, DatasetDict

def dataframe_to_huggingface_dataset(df: pd.DataFrame) -> DatasetDict:
    """Convert a dataframe with a split column into a Hugging Face DatasetDict.

    Args:
        df (pd.DataFrame): Dataset stored in a pandas dataframe with a
            column named "split" that tells what split a datapoint belongs to.
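    """
    # Assumed body (the gist preview is truncated here): build one Dataset per
    # split value and drop the split column from each.
    return DatasetDict(
        {
            split: Dataset.from_pandas(
                df[df["split"] == split].drop(columns=["split"]),
                preserve_index=False,
            )
            for split in df["split"].unique()
        }
    )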
@kasperjunge
kasperjunge / danish_sota_ner.py
Last active August 16, 2022 06:45
Danish SOTA NER
import pandas as pd
from transformers import pipeline
text = """
Dan Saattrup Nielsen arbejder som AI Specialist hos Alexandra Instituttet
og er han næstformand i Dansk Data Science Community.
"""
ner = pipeline(
    task="ner",
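    # Assumed continuation (the gist preview is truncated here): the model id,
    # aggregation strategy, and result handling below are my assumptions.
    model="saattrupdan/nbailab-base-ner-scandi",
    aggregation_strategy="first",
)
result = ner(text)
print(pd.DataFrame(result))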
@kasperjunge
kasperjunge / flatten_list.py
Last active July 29, 2022 11:04
Flatten list
from typing import List, Any

def flatten_list(list_of_lists: List[List[Any]]) -> List[Any]:
    """Merge/flatten a list of lists into one single list.

    Example: [[1, 2, 3], [4, 5, 6]] --> [1, 2, 3, 4, 5, 6]

    Args:
        list_of_lists (List[list]): List of lists to be merged/flattened.

    Returns:
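        List[Any]: The flattened list.
    """
    # Assumed body (the gist preview is truncated here): a standard nested list
    # comprehension that matches the docstring's example.
    return [item for sublist in list_of_lists for item in sublist]

# Example usage (illustrative):
print(flatten_list([[1, 2, 3], [4, 5, 6]]))  # [1, 2, 3, 4, 5, 6]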
@kasperjunge
kasperjunge / download_n_eb_articles.py
Created July 26, 2022 07:19
Download n Ekstra Bladet news articles from the Danish mC4 dataset.
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

def download_n_eb_articles(n: int) -> pd.DataFrame:
    """Extract n Ekstra Bladet articles from the Danish subset
    of the mC4 dataset.

    Args:
        n (int): Number of articles to extract.

    Returns:
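        pd.DataFrame: The collected articles.
    """
    # Assumed body (the gist preview is truncated here): stream the Danish mC4
    # split and keep documents whose URL points at ekstrabladet.dk; the dataset
    # id, streaming approach, and URL filter are my assumptions.
    ds = load_dataset("mc4", "da", split="train", streaming=True)
    articles = []
    for sample in tqdm(ds):
        if "ekstrabladet.dk" in sample["url"]:
            articles.append(sample)
        if len(articles) == n:
            break
    return pd.DataFrame(articles)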