Hannes Mühleisen (hannes) — GitHub gists
@hannes
hannes / dlopen.md
Last active December 15, 2023 10:42

Parallel Python within the same process or hacking around the cursed GIL with a hand-rolled library loader

From its obscure beginnings in Amsterdam, the Python programming language has become a fundamental building block of our digital society. It is used literally everywhere and by everyone for a mind-bogglingly wide variety of tasks.

Python is also the lingua franca of Data Science, tying together tools for data loading, wrangling, analysis and AI. There is a massive ecosystem of contributed Python packages, which - for example - allows reading every obscure data format under the sun. This makes Python and its ecosystem extremely valuable for analytical data management systems: users are likely somewhat familiar with Python due to its immense popularity, and the ecosystem provides solutions for most data problems. As a result, Python is being integrated into SQL systems, typically through so-called User-Defined Functions (UDFs). For example, [Apach
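The UDF mechanism described above can be sketched with Python's built-in sqlite3 module, which lets a Python function be called from SQL. This is a minimal illustration of the general idea, not the mechanism of any particular system mentioned here; the function name `rev` is made up:

```python
import sqlite3

# Register a Python function as a SQL scalar UDF and call it from a query.
con = sqlite3.connect(":memory:")
con.create_function("rev", 1, lambda s: s[::-1])  # one-argument UDF
result = con.execute("SELECT rev('hello')").fetchone()[0]
print(result)  # -> olleh
```

The SQL engine calls back into the Python interpreter once per row, which is exactly where the GIL contention discussed in the gist becomes painful.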

@hannes
hannes / torrent.py
Created October 12, 2019 12:33
Create and query a local search engine for PirateBay torrents
#!/usr/bin/env python3
import sqlite3 # yay SQLite!
import base64
import os
import sys
if sys.version_info < (3, 6, 0):
    print("You need Python 3.6 or later to run this script.")
    sys.exit(1)
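The gist above builds a local search engine over torrent metadata in SQLite. A minimal sketch of such a search index using SQLite's FTS5 full-text extension (table and column names here are illustrative, not taken from the gist):

```python
import sqlite3

# Build a tiny full-text index and query it; FTS5 tokenization is
# case-insensitive by default, so 'ubuntu' matches 'Ubuntu'.
con = sqlite3.connect(":memory:")
con.execute("CREATE VIRTUAL TABLE torrents USING fts5(name, infohash)")
con.execute("INSERT INTO torrents VALUES (?, ?)",
            ("Ubuntu 19.04 Desktop ISO", "abc123"))
hit = con.execute(
    "SELECT infohash FROM torrents WHERE torrents MATCH 'ubuntu'").fetchone()
print(hit[0])  # -> abc123
```

This assumes the Python build's SQLite was compiled with FTS5 enabled, which is the case for standard CPython distributions.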
# install like so:
# remotes::install_github("hannesmuehleisen/miniparquet", ref="altrep")
options(tibble.print_max = 10, tibble.print_min = 10)
# parquet file from https://archive.luftdaten.info/parquet/2019-08/sds011/part-00000-54e23417-8f54-4a91-9b6b-b8724706a9a7-c000.snappy.parquet
f <- "pqtest/big_data.snappy.parquet"
system(sprintf("ls -lah %s", f))
# the read_parquet() function only reads metadata and sets up ALTREP
library("DBI")
library("ggplot2")
con <- dbConnect(RSQLite::SQLite(), dbname="~/Library/Mail/V5/MailData/Envelope Index", flags=RSQLite::SQLITE_RO)
# messages per month
msg_per_month <- dbGetQuery(con, "SELECT MIN(DATETIME(date_sent, 'unixepoch')) as start, STRFTIME('%m', DATETIME(date_sent, 'unixepoch')) AS month, STRFTIME('%Y', DATETIME(date_sent, 'unixepoch')) AS year, COUNT(*) AS n FROM messages WHERE mailbox IN (SELECT ROWID FROM mailboxes WHERE url LIKE '%Sent') GROUP BY year, month ORDER BY year, month")
msg_per_month$start <- as.Date(msg_per_month$start)
ggplot(msg_per_month, aes(x=start, y=n, group=1)) + geom_line()
# Pkg.add("CSV")
Pkg.update()
using DataFrames, CSV
wr = true
ne = false
region = CSV.read("region.tbl", delim='|', header=["r_regionkey", "r_name", "r_comment"], weakrefstrings=wr, nullable=ne)
nation = CSV.read("nation.tbl", delim='|', header=["n_nationkey", "n_name", "n_regionkey", "n_comment"], weakrefstrings=wr, nullable=ne)
supplier = CSV.read("supplier.tbl", delim='|', header=["s_suppkey","s_name","s_address","s_nationkey","s_phone","s_acctbal","s_comment"], weakrefstrings=wr, nullable=ne)
customer = CSV.read("customer.tbl", delim='|', header=["c_custkey","c_name","c_address","c_nationkey","c_phone","c_acctbal","c_mktsegment","c_comment"], weakrefstrings=wr, nullable=ne)
import urllib.request, json
import re
import pandas as pd
import spotipy
import itertools
# you will need to create a Spotify app and add the credentials below
# also create a public Spotify playlist and get its ID (the last part of its URI)
# import spotipy.util as util
# token = util.prompt_for_user_token('hfmuehleisen',"playlist-modify-public",client_id='XXX',client_secret='XXX',redirect_uri='http://example.com/callback')
# install.packages("MonetDBLite")
# install.packages("dbplot")
# install.packages("nycflights13")
# install.packages("ggplot2")
# install.packages("dplyr")
library("ggplot2")
library("dplyr")
library("dbplot")
@hannes
hannes / test.html
Last active January 16, 2017 10:19
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<title>Hello, Mark</title>
</head>
<body><h1>fuuu</h1></body></html>
# rm -r /tmp/hmda
install.packages("MonetDBLite")
library(DBI)
dbdir <- "/tmp/hmda"
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
# download at http://homepages.cwi.nl/~hannes/hmda.rds
dd <- readRDS("/tmp/hmda.rds")
library(MonetDB.R)
files <- dir("/Users/hannes/Desktop/PAMAP2_Dataset/Protocol/", pattern="*.dat", full.names=T)
tnames <- paste0("pamap2_", sub("^([^.]*).*", "\\1", basename(files)))
snames <- sub(".*(\\d{3})$", "\\1", tnames)
loadsql <- c("DROP VIEW pamap2", paste0("DROP TABLE ",tnames,"; "), paste0("CREATE TABLE ",tnames," (timestamp float, activityID int, heart_rate int, hand_temp float, hand_acc_16_1 float, hand_acc_16_2 float, hand_acc_16_3 float, hand_acc_6_1 float, hand_acc_6_2 float, hand_acc_6_3 float, hand_gyro_1 float, hand_gyro_2 float, hand_gyro_3 float, hand_mag_1 float, hand_mag_2 float, hand_mag_3 float, hand_or_1 float, hand_or_2 float, hand_or_3 float, hand_or_4 float, chest_temp float, chest_acc_16_1 float, chest_acc_16_2 float, chest_acc_16_3 float, chest_acc_6_1 float, chest_acc_6_2 float, chest_acc_6_3 float, chest_gyro_1 float, chest_gyro_2 float, chest_gyro_3 float, chest_mag_1 float, chest_mag_2 float, chest_mag_3 float, chest_or_1 float, chest_or_2 float, chest_or_3 float, chest