Cuahchic/using_sql_with_pandas.py

## using_sql_with_pandas.py
# %% Imports
import numpy as np
import pandas as pd
import sqlite3

# %% Generate some data
data_people1 = {
        'idx': [1, 2, 3, 4],
        'age': [6, 39, 69, 41],
        'name': ['John', 'Sue', 'James', 'Agatha']
    }

data_people2 = {
        'idx': [5, 6],
        'age': [73, 18],
        'name': ['Dorothy', 'Peter']
    }

data_people_heights = {
        'name': ['John', 'Sue', 'James', 'Agatha'],
        'height_m': [0.97, 1.49, 1.58, 1.71]
    }

data_people_status = {
        'age_min': [0, 18, 65],
        'age_max': [18, 65, 100],
        'status': ['child', 'working age', 'retired']
    }

# Create DataFrames
df_people1 = pd.DataFrame(data_people1)
df_people2 = pd.DataFrame(data_people2)
df_people_heights = pd.DataFrame(data_people_heights)
df_people_status = pd.DataFrame(data_people_status)


# %% Step 1: Add heights to the people DataFrame
# Python way
df_people_python = df_people1.merge(right=df_people_heights, how="left", left_on="name", right_on="name") # Obviously you can modify the "how" to be inner, left, right etc for different types of joins

# SQL way
with sqlite3.connect(':memory:') as conn: # Create context manager for connection
    # Write tables into database
    df_people1.to_sql(name='people', con=conn, index=False) # See docs here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
    df_people_heights.to_sql(name='people_heights', con=conn, index=False)

    # Get cursor for doing stuff
    cur = conn.cursor()

    # Index to speed up join
    cur.execute('CREATE INDEX index_people__name ON people(name ASC);') # Not necessary for such a small table in this example but can improve performance on large DataFrames/tables
    cur.close()

    # Get data back to Python, docs: https://pandas.pydata.org/docs/reference/api/pandas.read_sql_query.html
    # NOTE: In the docs the parse_dates parameter can be used to convert timestamps to pd.Timestamp data types
    df_people_sql = pd.read_sql_query(
            sql='SELECT p.*, ph.height_m FROM people as p LEFT JOIN people_heights as ph ON p.name = ph.name;',
            con=conn
        )

# Check they are the same
assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"

# %% Step 2: Add some new rows to the people DataFrame
# Python way
df_people_python = df_people1.append(other=df_people2, ignore_index=True)

# SQL way
with sqlite3.connect(':memory:') as conn: # Create context manager for connection
    # Write tables into database
    df_people1.to_sql(name='people1', con=conn, index=False)
    df_people2.to_sql(name='people2', con=conn, index=False)

    # Get data back to Python
    df_people_sql = pd.read_sql_query(
            sql='SELECT * FROM people1 UNION ALL SELECT * FROM people2;',
            con=conn
        )

# Check they are the same
assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"

# %% Step 3: Add status based on between style logic which is more awkward in Python
# Python way, from: https://stackoverflow.com/questions/44367672/best-way-to-join-merge-by-range-in-pandas
i, j = np.where((df_people1.age.values[:, None] >= df_people_status.age_min.values) & (df_people1.age.values[:, None] < df_people_status.age_max.values))

df_people_python = pd.concat([
        df_people1.loc[i, :].reset_index(drop=True),
        df_people_status.loc[j, ["status"]].reset_index(drop=True)
    ], axis=1)

# SQL way
with sqlite3.connect(':memory:') as conn: # Create context manager for connection
    # Write tables into database
    df_people1.to_sql(name='people', con=conn, index=False)
    df_people_status.to_sql(name='people_status', con=conn, index=False)

    # Create query to join tables
    sql = '''
        SELECT  p.*, ps.status
        FROM    people p
                LEFT JOIN
                people_status ps
                ON p.age BETWEEN ps.age_min AND ps.age_max
        '''

    # Get data back to Python
    df_people_sql = pd.read_sql_query(sql=sql, con=conn)

# Check they are the same
assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"
	# %% Imports
	import numpy as np
	import pandas as pd
	import sqlite3

	# %% Generate some data
	data_people1 = {
	'idx': [1, 2, 3, 4],
	'age': [6, 39, 69, 41],
	'name': ['John', 'Sue', 'James', 'Agatha']
	}

	data_people2 = {
	'idx': [5, 6],
	'age': [73, 18],
	'name': ['Dorothy', 'Peter']
	}

	data_people_heights = {
	'name': ['John', 'Sue', 'James', 'Agatha'],
	'height_m': [0.97, 1.49, 1.58, 1.71]
	}

	data_people_status = {
	'age_min': [0, 18, 65],
	'age_max': [18, 65, 100],
	'status': ['child', 'working age', 'retired']
	}

	# Create DataFrames
	df_people1 = pd.DataFrame(data_people1)
	df_people2 = pd.DataFrame(data_people2)
	df_people_heights = pd.DataFrame(data_people_heights)
	df_people_status = pd.DataFrame(data_people_status)


	# %% Step 1: Add heights to the people DataFrame
	# Python way
	df_people_python = df_people1.merge(right=df_people_heights, how="left", left_on="name", right_on="name") # Obviously you can modify the "how" to be inner, left, right etc for different types of joins

	# SQL way
	with sqlite3.connect(':memory:') as conn: # Create context manager for connection
	# Write tables into database
	df_people1.to_sql(name='people', con=conn, index=False) # See docs here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
	df_people_heights.to_sql(name='people_heights', con=conn, index=False)

	# Get cursor for doing stuff
	cur = conn.cursor()

	# Index to speed up join
	cur.execute('CREATE INDEX index_people__name ON people(name ASC);') # Not necessary for such a small table in this example but can improve performance on large DataFrames/tables
	cur.close()

	# Get data back to Python, docs: https://pandas.pydata.org/docs/reference/api/pandas.read_sql_query.html
	# NOTE: In the docs the parse_dates parameter can be used to convert timestamps to pd.Timestamp data types
	df_people_sql = pd.read_sql_query(
	sql='SELECT p.*, ph.height_m FROM people as p LEFT JOIN people_heights as ph ON p.name = ph.name;',
	con=conn
	)

	# Check they are the same
	assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"

	# %% Step 2: Add some new rows to the people DataFrame
	# Python way
	df_people_python = df_people1.append(other=df_people2, ignore_index=True)

	# SQL way
	with sqlite3.connect(':memory:') as conn: # Create context manager for connection
	# Write tables into database
	df_people1.to_sql(name='people1', con=conn, index=False)
	df_people2.to_sql(name='people2', con=conn, index=False)

	# Get data back to Python
	df_people_sql = pd.read_sql_query(
	sql='SELECT * FROM people1 UNION ALL SELECT * FROM people2;',
	con=conn
	)

	# Check they are the same
	assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"

	# %% Step 3: Add status based on between style logic which is more awkward in Python
	# Python way, from: https://stackoverflow.com/questions/44367672/best-way-to-join-merge-by-range-in-pandas
	i, j = np.where((df_people1.age.values[:, None] >= df_people_status.age_min.values) & (df_people1.age.values[:, None] < df_people_status.age_max.values))

	df_people_python = pd.concat([
	df_people1.loc[i, :].reset_index(drop=True),
	df_people_status.loc[j, ["status"]].reset_index(drop=True)
	], axis=1)

	# SQL way
	with sqlite3.connect(':memory:') as conn: # Create context manager for connection
	# Write tables into database
	df_people1.to_sql(name='people', con=conn, index=False)
	df_people_status.to_sql(name='people_status', con=conn, index=False)

	# Create query to join tables
	sql = '''
	SELECT p.*, ps.status
	FROM people p
	LEFT JOIN
	people_status ps
	ON p.age BETWEEN ps.age_min AND ps.age_max
	'''

	# Get data back to Python
	df_people_sql = pd.read_sql_query(sql=sql, con=conn)

	# Check they are the same
	assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"