Skip to content

Instantly share code, notes, and snippets.

@Cuahchic
Created August 25, 2023 09:59
Show Gist options
  • Save Cuahchic/6703428e0d2e9b1153c988a4b65a88bf to your computer and use it in GitHub Desktop.
Save Cuahchic/6703428e0d2e9b1153c988a4b65a88bf to your computer and use it in GitHub Desktop.
Using SQL with Pandas DataFrames
# %% Imports
import numpy as np
import pandas as pd
import sqlite3
# %% Generate some data
data_people1 = {
'idx': [1, 2, 3, 4],
'age': [6, 39, 69, 41],
'name': ['John', 'Sue', 'James', 'Agatha']
}
data_people2 = {
'idx': [5, 6],
'age': [73, 18],
'name': ['Dorothy', 'Peter']
}
data_people_heights = {
'name': ['John', 'Sue', 'James', 'Agatha'],
'height_m': [0.97, 1.49, 1.58, 1.71]
}
data_people_status = {
'age_min': [0, 18, 65],
'age_max': [18, 65, 100],
'status': ['child', 'working age', 'retired']
}
# Create DataFrames
df_people1 = pd.DataFrame(data_people1)
df_people2 = pd.DataFrame(data_people2)
df_people_heights = pd.DataFrame(data_people_heights)
df_people_status = pd.DataFrame(data_people_status)
# %% Step 1: Add heights to the people DataFrame
# Python way
df_people_python = df_people1.merge(right=df_people_heights, how="left", left_on="name", right_on="name") # Obviously you can modify the "how" to be inner, left, right etc for different types of joins
# SQL way
with sqlite3.connect(':memory:') as conn: # Create context manager for connection
# Write tables into database
df_people1.to_sql(name='people', con=conn, index=False) # See docs here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
df_people_heights.to_sql(name='people_heights', con=conn, index=False)
# Get cursor for doing stuff
cur = conn.cursor()
# Index to speed up join
cur.execute('CREATE INDEX index_people__name ON people(name ASC);') # Not necessary for such a small table in this example but can improve performance on large DataFrames/tables
cur.close()
# Get data back to Python, docs: https://pandas.pydata.org/docs/reference/api/pandas.read_sql_query.html
# NOTE: In the docs the parse_dates parameter can be used to convert timestamps to pd.Timestamp data types
df_people_sql = pd.read_sql_query(
sql='SELECT p.*, ph.height_m FROM people as p LEFT JOIN people_heights as ph ON p.name = ph.name;',
con=conn
)
# Check they are the same
assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"
# %% Step 2: Add some new rows to the people DataFrame
# Python way
df_people_python = df_people1.append(other=df_people2, ignore_index=True)
# SQL way
with sqlite3.connect(':memory:') as conn: # Create context manager for connection
# Write tables into database
df_people1.to_sql(name='people1', con=conn, index=False)
df_people2.to_sql(name='people2', con=conn, index=False)
# Get data back to Python
df_people_sql = pd.read_sql_query(
sql='SELECT * FROM people1 UNION ALL SELECT * FROM people2;',
con=conn
)
# Check they are the same
assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"
# %% Step 3: Add status based on between style logic which is more awkward in Python
# Python way, from: https://stackoverflow.com/questions/44367672/best-way-to-join-merge-by-range-in-pandas
i, j = np.where((df_people1.age.values[:, None] >= df_people_status.age_min.values) & (df_people1.age.values[:, None] < df_people_status.age_max.values))
df_people_python = pd.concat([
df_people1.loc[i, :].reset_index(drop=True),
df_people_status.loc[j, ["status"]].reset_index(drop=True)
], axis=1)
# SQL way
with sqlite3.connect(':memory:') as conn: # Create context manager for connection
# Write tables into database
df_people1.to_sql(name='people', con=conn, index=False)
df_people_status.to_sql(name='people_status', con=conn, index=False)
# Create query to join tables
sql = '''
SELECT p.*, ps.status
FROM people p
LEFT JOIN
people_status ps
ON p.age BETWEEN ps.age_min AND ps.age_max
'''
# Get data back to Python
df_people_sql = pd.read_sql_query(sql=sql, con=conn)
# Check they are the same
assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment