Skip to content

Instantly share code, notes, and snippets.

@mneedham
Last active March 19, 2023 17:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mneedham/bd69020cf76dd0baac8282b2cf4265e7 to your computer and use it in GitHub Desktop.
Save mneedham/bd69020cf76dd0baac8282b2cf4265e7 to your computer and use it in GitHub Desktop.
DuckDB Relational API
import duckdb
import pandas as pd
con = duckdb.connect('atp-matches.db')
con.sql("INSTALL httpfs")
con.sql("LOAD httpfs")
csv_files = [
f"https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv"
for year in range(1968,2024)
]
con.execute("""
CREATE OR REPLACE TABLE matches AS
SELECT * FROM read_csv_auto($1, types={'winner_seed': 'VARCHAR', 'loser_seed': 'VARCHAR'})
""", [csv_files])
df = con.execute("""
SELECT *
FROM matches
WHERE (loser_name = $1 AND winner_name = $2) OR
(loser_name = $2 AND winner_name = $1)
ORDER BY tourney_date
""", ["Richard Gasquet", "Rafael Nadal"]).fetchdf()
rel = con.from_df(df)
players = con.from_df(pd.DataFrame({"player": ["Rafael Nadal", "Richard Gasquet"]})).set_alias("players")
surfaces = con.sql("select distinct surface from matches WHERE surface is not null").set_alias('surfaces')
ps = con.sql('select * from players cross join surfaces').set_alias("ps")
winners = rel.aggregate("count(*) AS wins, winner_name, surface").set_alias("winners")
(ps.join(winners,
condition="""ps.player = winners.winner_name AND
ps.surface = winners.surface""",
how="left")
.project("player, ps.surface as surface, coalesce(wins, 0) AS wins")
.aggregate("surface, list(row(player,wins)) AS results")
)
# Make sure to always use the same DuckDB connection when creating relations, or you'll get this erro:
# duckdb.Error: Cannot combine LEFT and RIGHT relations of different connections!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment