Dominik Weckmüller (do-me) - GitHub Gists
@do-me
do-me / replace_multiple_whitespaces.py
Last active April 23, 2024 15:27
Replace an arbitrary number of white spaces with just one white space in Python for data cleaning (useful for XML/HTML parsing)
import re

# Default: replaces any run of white space (spaces, tabs, line breaks etc.) with a single space
def replace_multiple_whitespaces(text):
    return re.sub(r'\s+', ' ', text)  # use re.sub(r'[ \t]+', ' ', text) if line breaks should be preserved

# Use this function if you want to keep exactly one line break and collapse the rest as above
def replace_multiple_whitespaces_keep_one_linebreak(text):
    # Collapse any run of line breaks (and surrounding spaces/tabs) into a single newline
    text = re.sub(r'[ \t]*\r?\n[ \t\r\n]*', '\n', text)
    # Replace one or more spaces or tabs with a single space (for the remaining white space)
    text = re.sub(r'[ \t]+', ' ', text)
    return text
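
A quick usage sketch (the input string is illustrative):

text = "Hello   world,\n\n\n  this   is \t messy."
print(replace_multiple_whitespaces(text))                     # "Hello world, this is messy."
print(replace_multiple_whitespaces_keep_one_linebreak(text))  # "Hello world,\nthis is messy."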
@do-me
do-me / tippecanoe.md
Created April 10, 2024 14:15
tippecanoe: convert GeoJSON to PBF vector tiles
  • Export the GeoDataFrame to GeoJSON, e.g. gdf.to_file("temp.json", driver="GeoJSON") (gdf.to_json() only returns a string; tippecanoe unfortunately does not support GeoParquet yet, hence the GeoJSON detour)
  • tippecanoe --no-feature-limit --no-tile-size-limit --no-tile-compression -s EPSG:4326 --output-to-directory tilesDirectory --force temp.json
  • Then host the tiles with npx http-server --cors (--cors is not needed for QGIS, but it is for static sites like Leaflet or similar running on localhost)
  • Use http://localhost:8080/tilesDirectory/{z}/{x}/{y}.pbf in a map framework of your choice (see the Python sketch below for the full pipeline)

Works like a charm for me. (From: geopandas/geopandas#2295 (comment))
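A minimal end-to-end sketch of the steps above in Python (the input file name is hypothetical, the rest mirrors the list; tippecanoe and npx must be installed separately):

import subprocess
import geopandas as gpd

gdf = gpd.read_parquet("data.parquet")      # hypothetical GeoParquet input
gdf.to_file("temp.json", driver="GeoJSON")  # tippecanoe reads GeoJSON, not GeoParquet

subprocess.run(
    ["tippecanoe", "--no-feature-limit", "--no-tile-size-limit", "--no-tile-compression",
     "-s", "EPSG:4326", "--output-to-directory", "tilesDirectory", "--force", "temp.json"],
    check=True,
)
# Afterwards serve the tiles, e.g.: npx http-server --cors
# and point your map framework at http://localhost:8080/tilesDirectory/{z}/{x}/{y}.pbf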

@do-me
do-me / chunk_and_average_embeddings.py
Created March 5, 2024 10:33
Chunk text into chunks of N words and calculate the average embedding over all chunks, iterating over a pandas df
import numpy as np
from tqdm import tqdm

tqdm.pandas()

def chunk_text(text, max_words=100):
    words = text.split()
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
    return chunks

def average_embedding(text, max_words=100):
    # Completion of the truncated preview; `model` is assumed to be a sentence embedding
    # model with an .encode() method (e.g. a SentenceTransformer) defined elsewhere
    chunks = chunk_text(text, max_words)
    # Convert the list of embeddings to a NumPy array and change its dtype to float32
    embeddings_array = np.array(model.encode(chunks)).astype('float32')
    # Average over all chunk embeddings
    return embeddings_array.mean(axis=0)
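
The "iterating over a pandas df" part of the description is cut off in the preview; a minimal sketch of that step (the DataFrame and its "text" column are illustrative):

# df is a pandas DataFrame with a "text" column
df["embedding"] = df["text"].progress_apply(average_embedding)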
@do-me
do-me / export_html_table_to_csv.js
Created February 22, 2024 08:35
Export all HTML tables to CSV (sep=|)
function exportTableToCSV(filename) {
    var csv = [];
    var rows = document.querySelectorAll('table tr');
    for (var i = 0; i < rows.length; i++) {
        var row = [], cols = rows[i].querySelectorAll('td, th');
        for (var j = 0; j < cols.length; j++) {
            var cellText = cols[j].textContent.trim();
            var link = cols[j].querySelector('a');
            row.push(link ? cellText + ' (' + link.href + ')' : cellText); // plausible completion of the truncated preview: keep link targets next to the cell text
        }
        csv.push(row.join('|')); // sep=|
    }
    var blob = new Blob([csv.join('\n')], { type: 'text/csv' }); // download step reconstructed, the preview ends above
    var a = Object.assign(document.createElement('a'), { href: URL.createObjectURL(blob), download: filename });
    a.click();
}
@do-me
do-me / cleanup.py
Created January 18, 2024 16:01
Ubuntu clean up trash & cache
import subprocess

def run_shell_command(command):
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, error = process.communicate()
    if process.returncode != 0:
        print(f"Error executing command: {command}")
        print(f"Error message: {error.decode('utf-8')}")
    else:
        print(output.decode('utf-8'))
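
The actual cleanup commands are cut off in the preview; a usage sketch with typical Ubuntu trash and cache locations (adjust to taste, the sudo commands will prompt for a password):

for cmd in [
    "rm -rf ~/.cache/thumbnails/*",   # thumbnail cache
    "rm -rf ~/.local/share/Trash/*",  # user trash
    "sudo apt-get clean",             # APT package cache
    "sudo apt-get autoremove -y",     # orphaned packages
]:
    run_shell_command(cmd)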
@do-me
do-me / weighted_mean_embedding.py
Last active February 8, 2024 15:45
Weighted average embedding in Python with numpy
import numpy as np

# Takes an arbitrary number of vectors/weights as input
def weighted_mean_embedding(vectors, weights):
    if len(vectors) != len(weights):
        raise ValueError("Number of vectors must be equal to the number of weights")
    # Convert lists to NumPy arrays and cast to float
    vectors = [np.array(vector, dtype=float) for vector in vectors]
    # Weighted average per dimension: sum(w_i * v_i) / sum(w_i)
    return np.average(np.array(vectors), axis=0, weights=weights)
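
A quick sanity check (values are illustrative):

vectors = [[1.0, 0.0], [0.0, 1.0]]
weights = [2, 1]
print(weighted_mean_embedding(vectors, weights))  # approx. [0.667 0.333]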
@do-me
do-me / zip-to-pandas.py
Created January 9, 2024 15:12
Zipfile with .txt documents to pandas df
import zipfile
import pandas as pd
from io import TextIOWrapper

# Specify the path to your zip file
zip_file_path = 'zipfile.zip'

# Create empty lists to store filename and content
filenames = []
contents = []

# Read every .txt file in the archive into the two lists (completion of the truncated preview)
with zipfile.ZipFile(zip_file_path) as z:
    for name in z.namelist():
        if name.endswith('.txt'):
            with z.open(name) as f:
                filenames.append(name)
                contents.append(TextIOWrapper(f, encoding='utf-8').read())

df = pd.DataFrame({'filename': filenames, 'content': contents})
@do-me
do-me / mkdocs_material_miner.py
Created January 1, 2024 15:46
Create one markdown file with all wiki content of mkdocs material
import os

# Assuming you're in the "docs" dir
output_file_path = "mkdocs_material_docs_31_12_2023.md"
max_depth = 10  # Set the maximum depth of subdirectories to include

# Collect all .md files up to max_depth levels below the current directory
md_files = [os.path.join(root, file)
            for root, dirs, files in os.walk(".", topdown=True)
            for file in files
            if file.endswith('.md') and root.count(os.path.sep) - ".".count(os.path.sep) < max_depth]

# Concatenate everything into one markdown file (completion of the truncated preview)
with open(output_file_path, "w", encoding="utf-8") as out:
    for md_file in md_files:
        with open(md_file, encoding="utf-8") as f:
            out.write(f.read() + "\n\n")
@do-me
do-me / photon.py
Created December 27, 2023 15:21
Photon (komoot) helper functions in Python
import requests
from requests.auth import HTTPBasicAuth
base_url = "https://photon.yourserver.de"
# Create an HTTP session with basic authentication
username = "username"
password = "password"
session = requests.Session()
session.auth = HTTPBasicAuth(username, password)
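
The helper functions themselves are cut off in this preview; a minimal forward-geocoding sketch against Photon's /api endpoint could look like this (the function name and the example query are illustrative):

def geocode(query, limit=1):
    # Photon returns GeoJSON; each feature carries the match in "properties" and "geometry"
    r = session.get(f"{base_url}/api", params={"q": query, "limit": limit})
    r.raise_for_status()
    return r.json()["features"]

print(geocode("Berlin")[0]["geometry"]["coordinates"])  # [lon, lat]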
@do-me
do-me / word_counter.py
Created December 19, 2023 15:28
Get the most frequent words in a pandas text column
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import pandas as pd
import string

# Download the NLTK resources needed below
import nltk
nltk.download('stopwords')
nltk.download('punkt')  # needed by word_tokenize (newer NLTK versions may ask for 'punkt_tab' instead)
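
The counting logic itself is cut off in the preview; a minimal sketch of the rest, assuming the texts live in a df["text"] column (the DataFrame and column name are illustrative):

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

counter = Counter()
for text in tqdm(df['text'].fillna('')):
    tokens = word_tokenize(str(text).lower())
    counter.update(t for t in tokens if t not in stop_words and t not in punctuation)

print(counter.most_common(20))  # the 20 most frequent words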