Skip to content

Instantly share code, notes, and snippets.

View mneedham's full-sized avatar

Mark Needham mneedham

View GitHub Profile
@mneedham
mneedham / matches.py
Last active March 19, 2023 17:53
DuckDB Relational API
import duckdb
import pandas as pd
con = duckdb.connect('atp-matches.db')
con.sql("INSTALL httpfs")
con.sql("LOAD httpfs")
csv_files = [
f"https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv"
@mneedham
mneedham / queries.sql
Created November 1, 2022 22:46
On the fly joins on CSV files with DuckDB
CREATE OR REPLACE TABLE players
AS SELECT * FROM read_csv_auto('atp_players.csv', SAMPLE_SIZE=-1);
CREATE OR REPLACE TABLE rankings AS
select *
from 'atp_rankings_*.csv';
SELECT player_id, name_first, name_last
FROM players
LIMIT 5;
@mneedham
mneedham / queries.sql
Created October 28, 2022 12:26
Querying ATP matches using DuckDB
-- Fails because of weird date
CREATE TABLE players AS
select *
from 'atp_players.csv';
-- all varchar
CREATE TABLE players1 AS
select *
from read_csv_auto('atp_players.csv', ALL_VARCHAR=TRUE);
@mneedham
mneedham / duckdb.sql
Created October 21, 2022 13:58
Queries against DuckDB
SELECT count(*)
FROM 'data/*.parquet';
SELECT *
FROM 'data/*.parquet'
LIMIT 10;
DESCRIBE
SELECT *
FROM 'data/yellow_tripdata_2011-07.parquet';
@mneedham
mneedham / parquet-cli.sh
Created October 14, 2022 18:24
An intro to Apache Parquet
# The NYC Taxis Dataset - https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
pip install parquet-cli
parq data/yellow_tripdata_2022-01.parquet
parq data/yellow_tripdata_2022-01.parquet --schema
parq data/yellow_tripdata_2022-01.parquet --head 10
///usr/bin/env jbang "$0" "$@" ; exit $?
//DEPS org.apache.pinot:pinot-jdbc-client:0.8.0
//DEPS com.github.freva:ascii-table:1.2.0
// jbang Query.java "select * from customers limit 5"
// export JDBC_URL="jdbc:pinot://localhost:9000"
import static java.lang.System.*;
import java.sql.*;
import java.util.*;
@mneedham
mneedham / event.txt
Last active November 5, 2021 14:12
event.txt
event: message
id: [{"topic":"eqiad.mediawiki.recentchange","partition":0,"timestamp":1635935095001},{"topic":"codfw.mediawiki.recentchange","partition":0,"offset":-1}]
data: {"$schema":"/mediawiki/recentchange/1.0.0","meta":{"uri":"https://pl.wikipedia.org/wiki/Kategoria:Infoboksy_%E2%80%93_brakuj%C4%85ce_parametry_%E2%80%93_Jednostka_administracyjna_infobox","request_id":"45eb5803-f73b-410c-a640-c76c5d86d91f","id":"5a7ee8b2-fead-4d54-b6fc-4bb433a08e51","dt":"2021-11-03T10:24:55Z","domain":"pl.wikipedia.org","stream":"mediawiki.recentchange","topic":"eqiad.mediawiki.recentchange","partition":0,"offset":3410637559},"id":119002871,"type":"categorize","namespace":14,"title":"Kategoria:Infoboksy – brakujące parametry – Jednostka administracyjna infobox","comment":"usunięto [[:Sukhaura]] z kategorii","timestamp":1635935095,"user":"MalarzBOT","bot":true,"server_url":"https://pl.wikipedia.org","server_name":"pl.wikipedia.org","server_script_path":"/w","wiki":"plwiki","parsedcomment":"usunięto <a href=\"/wiki/Sukh
@mneedham
mneedham / app.py
Last active July 20, 2021 21:18
Streamlit/Pinot - dynamic selection of metric and time range
import streamlit as st
from pinotdb import connect
import pandas as pd
import time
st.title("GitHub Events")
broker_port = 8000
conn = connect(host='localhost', port=broker_port, path='/query/sql', scheme='http')
@mneedham
mneedham / app.py
Last active July 20, 2021 19:55
Pinot queries rendered in Streamlit
import streamlit as st
from pinotdb import connect
import pandas as pd
st.title("GitHub Events")
broker_port = 8000
conn = connect(host='localhost', port=broker_port, path='/query/sql', scheme='http')
query = f"""
@mneedham
mneedham / app.py
Created July 20, 2021 16:40
Streamlit app
import streamlit as st
st.title("GitHub Events")
st.write("Welcome to our first Streamlit app")