Hannes Mühleisen hannes

## dlopen.md

      
              1 file
            
          
              0 forks
            
          
              2 comments
            
          
              9 stars
            
          
                hannes
                / dlopen.md
            
            
              Last active
              December 15, 2023 10:42
            
          
    Parallel Python within the same process or hacking around the cursed GIL with a hand-rolled library loader

From its obscure beginnings in Amsterdam, the Python programming language has become a fundamental building block of our digital society. It is used literally everywhere and by everyone for a mind-bogglingly wide variety of tasks.
Python is also the lingua franca of Data Science, tying together tools for data loading, wrangling, analysis and AI. There is a massive ecosystem of contributed Python packages, which - for example - allows reading every obscure data format under the sun. This makes Python and its ecosystem extremely valuable for analytical data management systems: Users are likely somewhat familiar with Python due to its immense popularity and the ecosystem provides solutions for most data problems. As a result, Python is being integrated into SQL systems, typically through so-called User-Defined Functions (UDFs). For example, [Apach

  
## torrent.py
#!/usr/bin/env python3
import sqlite3 # yay SQLite!
import base64
import os
import sys

# Abort early when the interpreter is older than Python 3.6.
if sys.version_info < (3, 6, 0):
	print("You need python 3.6 or later to run this script.")
	# Use sys.exit rather than the bare exit() helper: exit() is installed by
	# the site module and is not guaranteed to exist (e.g. under `python -S`).
	sys.exit(-1)

## tpch.jl
# One-time setup: uncomment to install the CSV package.
# Pkg.add("CSV")
Pkg.update()

using DataFrames, CSV

# Flags forwarded to every CSV.read call below.
wr = true   # value passed as the weakrefstrings option
ne = false  # value passed as the nullable option

# Load the pipe-delimited .tbl files; column names are given explicitly
# through `header`.
region = CSV.read(
    "region.tbl",
    delim='|',
    header=["r_regionkey", "r_name", "r_comment"],
    weakrefstrings=wr,
    nullable=ne
)
nation = CSV.read(
    "nation.tbl",
    delim='|',
    header=["n_nationkey", "n_name", "n_regionkey", "n_comment"],
    weakrefstrings=wr,
    nullable=ne
)
supplier = CSV.read(
    "supplier.tbl",
    delim='|',
    header=["s_suppkey", "s_name", "s_address", "s_nationkey", "s_phone", "s_acctbal", "s_comment"],
    weakrefstrings=wr,
    nullable=ne
)
customer = CSV.read(
    "customer.tbl",
    delim='|',
    header=["c_custkey", "c_name", "c_address", "c_nationkey", "c_phone", "c_acctbal", "c_mktsegment", "c_comment"],
    weakrefstrings=wr,
    nullable=ne
)

## unifi.js
var https = require('https');
var fs = require('fs');
var net = require('net');
var WebSocketServer = require('websocket').server;

// Read the TLS private key and certificate from disk (synchronously) into an
// options object; presumably handed to the https server later - confirm.
// This works on Ubuntu if you set chmod 755 on those files and the private dir.
var httpsopts = {};
httpsopts.key = fs.readFileSync('/etc/ssl/private/ssl-cert-snakeoil.key');
httpsopts.cert = fs.readFileSync('/etc/ssl/certs/ssl-cert-snakeoil.pem');

## miniparquet-altrep-demo.R
# install like so:
# remotes::install_github("hannesmuehleisen/miniparquet", ref="altrep")
# Set both tibble print limits (min and max) to ten rows.
options(tibble.print_min = 10, tibble.print_max = 10)

# parquet file from https://archive.luftdaten.info/parquet/2019-08/sds011/part-00000-54e23417-8f54-4a91-9b6b-b8724706a9a7-c000.snappy.parquet

# Local path of the Parquet file used in this demo.
f <- "pqtest/big_data.snappy.parquet"
# List the file through the shell (long format, human-readable size).
system(command = sprintf(fmt = "ls -lah %s", f))

# the read_parquet() function only reads metadata and sets up ALTREP

## ststospotify.py
import urllib.request, json
import re
import pandas as pd
import spotipy
import itertools

# you will need to create a spotify app and add the credentials below
# also create a public spotify playlist and get its ID (the last part of its URI)
# import spotipy.util as util
# token = util.prompt_for_user_token('hfmuehleisen',"playlist-modify-public",client_id='XXX',client_secret='XXX',redirect_uri='http://example.com/callback')

## 60minsdemo-monetdblite-shiny.R
# updated 2018-01-31, hm

library(MonetDBLite)
library(reshape2)
library(shiny)
library(leaflet)
library(ggplot2)
library(ggthemes)
library(DBI)

## tycho.R
# Number of (fractional) days between the given date-time and the reference
# instant 2000-01-01 12:00:00 UTC. Negative for instants before the reference.
#
# year, month, day, hour, min, sec: date-time components, passed on to
#   ISOdatetime() unchanged.
# tz: time zone in which the components are interpreted (default "UTC").
# Returns a plain numeric value (the difftime class is dropped).
day_diff <- function(year, month, day, hour=0, min=0, sec=0, tz="UTC") {
  reference <- ISOdatetime(2000, 1, 1, 12, 0, 0, "UTC")
  moment <- ISOdatetime(year, month, day, hour, min, sec, tz)
  elapsed <- difftime(moment, reference, units = "days")
  as.numeric(elapsed)
}

# gah
# Degrees -> radians. Uses the built-in full-precision `pi` rather than a
# decimal factor truncated to ~10 significant digits.
d2r <- function(x) x * (pi / 180)
# Radians -> degrees (inverse of d2r).
r2d <- function(x) x * (180 / pi)

# Swabian power!
kepler <- function(M, e) {

## applemail.R
library("DBI")
library("ggplot2")

# Open the Apple Mail "Envelope Index" SQLite database read-only.
con <- dbConnect(
  RSQLite::SQLite(),
  dbname = "~/Library/Mail/V5/MailData/Envelope Index",
  flags = RSQLite::SQLITE_RO
)

# messages per month: count messages in mailboxes whose url ends in 'Sent',
# grouped by year and month; `start` is the earliest send time in each group.
msg_per_month <- dbGetQuery(
  con,
  "SELECT MIN(DATETIME(date_sent, 'unixepoch')) as start, STRFTIME('%m', DATETIME(date_sent, 'unixepoch')) AS month, STRFTIME('%Y', DATETIME(date_sent, 'unixepoch')) AS year, COUNT(*) AS n FROM messages WHERE mailbox IN (SELECT ROWID FROM mailboxes WHERE url LIKE '%Sent') GROUP BY year, month ORDER BY year, month"
)
# The query returns `start` as text; convert it to Date for plotting.
msg_per_month[["start"]] <- as.Date(msg_per_month[["start"]])

# Line chart of the monthly message count over time.
ggplot(msg_per_month, aes(x = start, y = n, group = 1)) +
  geom_line()

## dbplot-monetdblite.R
# install.packages("MonetDBLite")
# install.packages("dbplot")
# install.packages("nycflights13")
# install.packages("ggplot2")
# install.packages("dplyr")

library("ggplot2")
library("dplyr")
library("dbplot")
	#!/usr/bin/env python3
	import sqlite3 # yay SQLite!
	import base64
	import os
	import sys

	# Abort early when the interpreter is older than Python 3.6.
	if sys.version_info < (3, 6, 0):
		print("You need python 3.6 or later to run this script.")
		# Use sys.exit rather than the bare exit() helper: exit() is installed by
		# the site module and is not guaranteed to exist (e.g. under `python -S`).
		sys.exit(-1)
	# One-time setup: uncomment to install the CSV package.
	# Pkg.add("CSV")
	Pkg.update()

	using DataFrames, CSV

	# Flags forwarded to every CSV.read call below.
	wr = true   # value passed as the weakrefstrings option
	ne = false  # value passed as the nullable option

	# Load the pipe-delimited .tbl files; column names are given explicitly
	# through `header`. The delimiter is the plain character '|': a backslash
	# before it is not a valid escape sequence in a Julia character literal.
	region = CSV.read("region.tbl", delim='|', header=["r_regionkey", "r_name", "r_comment"], weakrefstrings=wr, nullable=ne)
	nation = CSV.read("nation.tbl", delim='|', header=["n_nationkey", "n_name", "n_regionkey", "n_comment"], weakrefstrings=wr, nullable=ne)
	supplier = CSV.read("supplier.tbl", delim='|', header=["s_suppkey","s_name","s_address","s_nationkey","s_phone","s_acctbal","s_comment"], weakrefstrings=wr, nullable=ne)
	customer = CSV.read("customer.tbl", delim='|', header=["c_custkey","c_name","c_address","c_nationkey","c_phone","c_acctbal","c_mktsegment","c_comment"], weakrefstrings=wr, nullable=ne)
	var https = require('https');
	var fs = require('fs');
	var net = require('net');
	var WebSocketServer = require('websocket').server;

	// Read the TLS private key and certificate from disk (synchronously) into an
	// options object; presumably handed to the https server later - confirm.
	// This works on Ubuntu if you set chmod 755 on those files and the private dir.
	var httpsopts = {};
	httpsopts.key = fs.readFileSync('/etc/ssl/private/ssl-cert-snakeoil.key');
	httpsopts.cert = fs.readFileSync('/etc/ssl/certs/ssl-cert-snakeoil.pem');
	# install like so:
	# remotes::install_github("hannesmuehleisen/miniparquet", ref="altrep")
	# Set both tibble print limits (min and max) to ten rows.
	options(tibble.print_min = 10, tibble.print_max = 10)

	# parquet file from https://archive.luftdaten.info/parquet/2019-08/sds011/part-00000-54e23417-8f54-4a91-9b6b-b8724706a9a7-c000.snappy.parquet

	# Local path of the Parquet file used in this demo.
	f <- "pqtest/big_data.snappy.parquet"
	# List the file through the shell (long format, human-readable size).
	system(command = sprintf(fmt = "ls -lah %s", f))

	# the read_parquet() function only reads metadata and sets up ALTREP
	import urllib.request, json
	import re
	import pandas as pd
	import spotipy
	import itertools

	# you will need to create a spotify app and add the credentials below
	# also create a public spotify playlist and get its ID (the last part of its URI)
	# import spotipy.util as util
	# token = util.prompt_for_user_token('hfmuehleisen',"playlist-modify-public",client_id='XXX',client_secret='XXX',redirect_uri='http://example.com/callback')
	# updated 2018-01-31, hm

	library(MonetDBLite)
	library(reshape2)
	library(shiny)
	library(leaflet)
	library(ggplot2)
	library(ggthemes)
	library(DBI)
	# Number of (fractional) days between the given date-time and the reference
	# instant 2000-01-01 12:00:00 UTC. Negative for instants before the reference.
	#
	# year, month, day, hour, min, sec: date-time components, passed on to
	#   ISOdatetime() unchanged.
	# tz: time zone in which the components are interpreted (default "UTC").
	# Returns a plain numeric value (the difftime class is dropped).
	day_diff <- function(year, month, day, hour=0, min=0, sec=0, tz="UTC") {
	  reference <- ISOdatetime(2000, 1, 1, 12, 0, 0, "UTC")
	  moment <- ISOdatetime(year, month, day, hour, min, sec, tz)
	  elapsed <- difftime(moment, reference, units = "days")
	  as.numeric(elapsed)
	}

	# gah
	# Degrees -> radians. Uses the built-in full-precision `pi` rather than a
	# decimal factor truncated to ~10 significant digits.
	d2r <- function(x) x * (pi / 180)
	# Radians -> degrees (inverse of d2r).
	r2d <- function(x) x * (180 / pi)

	# Swabian power!
	kepler <- function(M, e) {
	library("DBI")
	library("ggplot2")

	# Open the Apple Mail "Envelope Index" SQLite database read-only.
	con <- dbConnect(
	  RSQLite::SQLite(),
	  dbname = "~/Library/Mail/V5/MailData/Envelope Index",
	  flags = RSQLite::SQLITE_RO
	)

	# messages per month: count messages in mailboxes whose url ends in 'Sent',
	# grouped by year and month; `start` is the earliest send time in each group.
	msg_per_month <- dbGetQuery(
	  con,
	  "SELECT MIN(DATETIME(date_sent, 'unixepoch')) as start, STRFTIME('%m', DATETIME(date_sent, 'unixepoch')) AS month, STRFTIME('%Y', DATETIME(date_sent, 'unixepoch')) AS year, COUNT(*) AS n FROM messages WHERE mailbox IN (SELECT ROWID FROM mailboxes WHERE url LIKE '%Sent') GROUP BY year, month ORDER BY year, month"
	)
	# The query returns `start` as text; convert it to Date for plotting.
	msg_per_month[["start"]] <- as.Date(msg_per_month[["start"]])

	# Line chart of the monthly message count over time.
	ggplot(msg_per_month, aes(x = start, y = n, group = 1)) +
	  geom_line()
	# install.packages("MonetDBLite")
	# install.packages("dbplot")
	# install.packages("nycflights13")
	# install.packages("ggplot2")
	# install.packages("dplyr")

	library("ggplot2")
	library("dplyr")
	library("dbplot")