Dominik Weckmüller (do-me) - GitHub Gists
@do-me
do-me / replace_multiple_whitespaces.py
Last active April 23, 2024 15:27
Replace an arbitrary number of white spaces with just one white space in Python for data cleaning (useful for XML/HTML parsing)
import re

# Default: replaces any run of white space (spaces, tabs, line breaks etc.) with a single space
def replace_multiple_whitespaces(text):
    return re.sub(r'\s+', ' ', text)  # use re.sub(r'[ \t]+', ' ', text) if line breaks should be preserved

# Use this function if you want to keep exactly one line break and collapse the rest as above
def replace_multiple_whitespaces_keep_one_linebreak(text):
    # Collapse any run of line breaks (and surrounding spaces/tabs) into a single newline
    text = re.sub(r'[ \t]*\r?\n[ \t\r\n]*', '\n', text)
    # Replace one or more spaces or tabs with a single space (for the remaining white space)
    text = re.sub(r'[ \t]+', ' ', text)
    return text
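
A quick usage sketch (the input string is illustrative):

text = "Hello   world,\n\n\n  this   is \t messy."
print(replace_multiple_whitespaces(text))                     # "Hello world, this is messy."
print(replace_multiple_whitespaces_keep_one_linebreak(text))  # "Hello world,\nthis is messy."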
@do-me
do-me / tippecanoe.md
Created April 10, 2024 14:15
tippecanoe: convert GeoJSON to PBF vector tiles
  • Export the GeoDataFrame to GeoJSON, e.g. gdf.to_file("temp.json", driver="GeoJSON") (gdf.to_json() only returns a string; tippecanoe unfortunately does not support GeoParquet yet, hence the GeoJSON detour)
  • tippecanoe --no-feature-limit --no-tile-size-limit --no-tile-compression -s EPSG:4326 --output-to-directory tilesDirectory --force temp.json
  • Then host the tiles with npx http-server --cors (--cors is not needed for QGIS, but it is for static sites like Leaflet or similar running on localhost)
  • Use http://localhost:8080/tilesDirectory/{z}/{x}/{y}.pbf in a map framework of your choice (see the Python sketch below for the full pipeline)

Works like a charm for me. (From: geopandas/geopandas#2295 (comment))
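A minimal end-to-end sketch of the steps above in Python (the input file name is hypothetical, the rest mirrors the list; tippecanoe and npx must be installed separately):

import subprocess
import geopandas as gpd

gdf = gpd.read_parquet("data.parquet")      # hypothetical GeoParquet input
gdf.to_file("temp.json", driver="GeoJSON")  # tippecanoe reads GeoJSON, not GeoParquet

subprocess.run(
    ["tippecanoe", "--no-feature-limit", "--no-tile-size-limit", "--no-tile-compression",
     "-s", "EPSG:4326", "--output-to-directory", "tilesDirectory", "--force", "temp.json"],
    check=True,
)
# Afterwards serve the tiles, e.g.: npx http-server --cors
# and point your map framework at http://localhost:8080/tilesDirectory/{z}/{x}/{y}.pbf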

@do-me
do-me / chunk_and_average_embeddings.py
Created March 5, 2024 10:33
Chunk text into chunks of N words and calculate the average embedding over all chunks, iterating over a pandas df
import numpy as np
from tqdm import tqdm

tqdm.pandas()

def chunk_text(text, max_words=100):
    words = text.split()
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
    return chunks

def average_embedding(text, max_words=100):
    # Completion of the truncated preview; `model` is assumed to be a sentence embedding
    # model with an .encode() method (e.g. a SentenceTransformer) defined elsewhere
    chunks = chunk_text(text, max_words)
    # Convert the list of embeddings to a NumPy array and change its dtype to float32
    embeddings_array = np.array(model.encode(chunks)).astype('float32')
    # Average over all chunk embeddings
    return embeddings_array.mean(axis=0)
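
The "iterating over a pandas df" part of the description is cut off in the preview; a minimal sketch of that step (the DataFrame and its "text" column are illustrative):

# df is a pandas DataFrame with a "text" column
df["embedding"] = df["text"].progress_apply(average_embedding)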
@do-me
do-me / export_html_table_to_csv.js
Created February 22, 2024 08:35
Export all HTML tables to CSV (sep=|)
function exportTableToCSV(filename) {
    var csv = [];
    var rows = document.querySelectorAll('table tr');
    for (var i = 0; i < rows.length; i++) {
        var row = [], cols = rows[i].querySelectorAll('td, th');
        for (var j = 0; j < cols.length; j++) {
            var cellText = cols[j].textContent.trim();
            var link = cols[j].querySelector('a');
            row.push(link ? cellText + ' (' + link.href + ')' : cellText); // plausible completion of the truncated preview: keep link targets next to the cell text
        }
        csv.push(row.join('|')); // sep=|
    }
    var blob = new Blob([csv.join('\n')], { type: 'text/csv' }); // download step reconstructed, the preview ends above
    var a = Object.assign(document.createElement('a'), { href: URL.createObjectURL(blob), download: filename });
    a.click();
}
@do-me
do-me / cleanup.py
Created January 18, 2024 16:01
Ubuntu clean up trash & cache
import subprocess

def run_shell_command(command):
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, error = process.communicate()
    if process.returncode != 0:
        print(f"Error executing command: {command}")
        print(f"Error message: {error.decode('utf-8')}")
    else:
        print(output.decode('utf-8'))
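
The actual cleanup commands are cut off in the preview; a usage sketch with typical Ubuntu trash and cache locations (adjust to taste, the sudo commands will prompt for a password):

for cmd in [
    "rm -rf ~/.cache/thumbnails/*",   # thumbnail cache
    "rm -rf ~/.local/share/Trash/*",  # user trash
    "sudo apt-get clean",             # APT package cache
    "sudo apt-get autoremove -y",     # orphaned packages
]:
    run_shell_command(cmd)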
@do-me
do-me / weighted_mean_embedding.py
Last active February 8, 2024 15:45
Weighted average embedding in Python with numpy
import numpy as np

# Takes an arbitrary number of vectors/weights as input
def weighted_mean_embedding(vectors, weights):
    if len(vectors) != len(weights):
        raise ValueError("Number of vectors must be equal to the number of weights")
    # Convert lists to NumPy arrays and cast to float
    vectors = [np.array(vector, dtype=float) for vector in vectors]
    # Weighted average per dimension: sum(w_i * v_i) / sum(w_i)
    return np.average(np.array(vectors), axis=0, weights=weights)
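
A quick sanity check (values are illustrative):

vectors = [[1.0, 0.0], [0.0, 1.0]]
weights = [2, 1]
print(weighted_mean_embedding(vectors, weights))  # approx. [0.667 0.333]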
@do-me
do-me / zip-to-pandas.py
Created January 9, 2024 15:12
Zipfile with .txt documents to pandas df
import zipfile
import pandas as pd
from io import TextIOWrapper

# Specify the path to your zip file
zip_file_path = 'zipfile.zip'

# Create empty lists to store filename and content
filenames = []
contents = []

# Read every .txt file in the archive into the two lists (completion of the truncated preview)
with zipfile.ZipFile(zip_file_path) as z:
    for name in z.namelist():
        if name.endswith('.txt'):
            with z.open(name) as f:
                filenames.append(name)
                contents.append(TextIOWrapper(f, encoding='utf-8').read())

df = pd.DataFrame({'filename': filenames, 'content': contents})
@do-me
do-me / mkdocs_material_miner.py
Created January 1, 2024 15:46
Create one markdown file with all wiki content of mkdocs material
import os

# Assuming you're in the "docs" dir
output_file_path = "mkdocs_material_docs_31_12_2023.md"
max_depth = 10  # Set the maximum depth of subdirectories to include

# Collect all .md files up to max_depth levels below the current directory
md_files = [os.path.join(root, file)
            for root, dirs, files in os.walk(".", topdown=True)
            for file in files
            if file.endswith('.md') and root.count(os.path.sep) - ".".count(os.path.sep) < max_depth]

# Concatenate everything into one markdown file (completion of the truncated preview)
with open(output_file_path, "w", encoding="utf-8") as out:
    for md_file in md_files:
        with open(md_file, encoding="utf-8") as f:
            out.write(f.read() + "\n\n")
@do-me
do-me / photon.py
Created December 27, 2023 15:21
Photon (komoot) helper functions in Python
import requests
from requests.auth import HTTPBasicAuth
base_url = "https://photon.yourserver.de"
# Create an HTTP session with basic authentication
username = "username"
password = "password"
session = requests.Session()
session.auth = HTTPBasicAuth(username, password)
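
The helper functions themselves are cut off in this preview; a minimal forward-geocoding sketch against Photon's /api endpoint could look like this (the function name and the example query are illustrative):

def geocode(query, limit=1):
    # Photon returns GeoJSON; each feature carries the match in "properties" and "geometry"
    r = session.get(f"{base_url}/api", params={"q": query, "limit": limit})
    r.raise_for_status()
    return r.json()["features"]

print(geocode("Berlin")[0]["geometry"]["coordinates"])  # [lon, lat]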
@do-me
do-me / word_counter.py
Created December 19, 2023 15:28
Get the most frequent words in a pandas text column
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import pandas as pd
import string

# Download the NLTK resources needed below
import nltk
nltk.download('stopwords')
nltk.download('punkt')  # needed by word_tokenize (newer NLTK versions may ask for 'punkt_tab' instead)
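
The counting logic itself is cut off in the preview; a minimal sketch of the rest, assuming the texts live in a df["text"] column (the DataFrame and column name are illustrative):

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

counter = Counter()
for text in tqdm(df['text'].fillna('')):
    tokens = word_tokenize(str(text).lower())
    counter.update(t for t in tokens if t not in stop_words and t not in punctuation)

print(counter.most_common(20))  # the 20 most frequent words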