Skip to content

Instantly share code, notes, and snippets.

@gordthompson
Last active October 21, 2023 19:41
Show Gist options
  • Save gordthompson/3e013c5023b9791b40fde9fedd032393 to your computer and use it in GitHub Desktop.
Save gordthompson/3e013c5023b9791b40fde9fedd032393 to your computer and use it in GitHub Desktop.
search DataFrame for null values that could trigger a row scan with fast_executemany
"""
A DataFrame that contains a lot of "null" values (NaN, etc.) can cause a
significant performance penalty with fast_executemany=True. For details, see
https://github.com/mkleehammer/pyodbc/issues/741
This routine scans the specified DataFrame to see how many "row scans" would be
required when using .to_sql() with fast_executemany=True. The first row must
always be scanned, but after that a re-scan will flush the current rows to the
server, increasing network traffic and slowing things down.
version 1.3 - 2023-10-21
"""
from collections import OrderedDict
import pandas as pd
def test_df_for_rescans(df):
# we haven't scanned yet, so flag all columns as null
is_null = {col_name: True for col_name in df.columns}
rescan_count = -1
triggered_by = dict()
first_one = True
for row_idx in df.index:
scan_needed = False
for col_name in df.columns:
if pd.notna(df.at[row_idx, col_name]) and is_null[col_name]:
scan_needed = True
if not first_one:
n = triggered_by.get(col_name, 0)
triggered_by[col_name] = n + 1
first_one = False
break
if scan_needed:
rescan_count += 1
is_null = {
col_name: pd.isna(df.at[row_idx, col_name])
for col_name in df.columns
}
if not any(is_null.values()):
# all False, so no need to scan subsequent rows
break
return {
"total rows": len(df),
"re-scans": rescan_count,
"triggered by": dict(
OrderedDict(
sorted(triggered_by.items(), key=lambda t: t[1], reverse=True)
)
),
}
if __name__ == "__main__":
# example data
my_df = pd.DataFrame(
[(1, None), (None, 2), (3, None), (4, 4), (None, 5)],
columns=["col_1", "col_2"],
)
results = test_df_for_rescans(my_df)
print(results)
"""
{'total rows': 5, 're-scans': 3, 'triggered by': {'col_2': 2, 'col_1': 1}}
"""
"""
Notes:
Scans are triggered if a column value is non-null when its previous state
was recorded as null.
The first row always triggers a scan, so that does not count as a re-scan.
In the above example, the second and third rows trigger re-scans because
the None values traded places, and the fourth row triggers a re-scan
because the col_2 value changed from null to non-null. However, the fifth
row does not trigger a re-scan because the previous row did not contain
any null values.
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment