gordthompson/test_df_for_rescans.py

## test_df_for_rescans.py
"""
A DataFrame that contains a lot of "null" values (NaN, etc.) can cause a
significant performance penalty with fast_executemany=True. For details, see

https://github.com/mkleehammer/pyodbc/issues/741

This routine scans the specified DataFrame to see how many "row scans" would be
required when using .to_sql() with fast_executemany=True. The first row must
always be scanned, but after that a re-scan will flush the current rows to the
server, increasing network traffic and slowing things down.

version 1.3 - 2023-10-21
"""

from collections import OrderedDict

import pandas as pd


def test_df_for_rescans(df):
    # we haven't scanned yet, so flag all columns as null
    is_null = {col_name: True for col_name in df.columns}
    rescan_count = -1
    triggered_by = dict()
    first_one = True
    for row_idx in df.index:
        scan_needed = False
        for col_name in df.columns:
            if pd.notna(df.at[row_idx, col_name]) and is_null[col_name]:
                scan_needed = True
                if not first_one:
                    n = triggered_by.get(col_name, 0)
                    triggered_by[col_name] = n + 1
                first_one = False
                break
        if scan_needed:
            rescan_count += 1
            is_null = {
                col_name: pd.isna(df.at[row_idx, col_name])
                for col_name in df.columns
            }
            if not any(is_null.values()):
                # all False, so no need to scan subsequent rows
                break
    return {
        "total rows": len(df),
        "re-scans": rescan_count,
        "triggered by": dict(
            OrderedDict(
                sorted(triggered_by.items(), key=lambda t: t[1], reverse=True)
            )
        ),
    }


if __name__ == "__main__":
    # example data
    my_df = pd.DataFrame(
        [(1, None), (None, 2), (3, None), (4, 4), (None, 5)],
        columns=["col_1", "col_2"],
    )

    results = test_df_for_rescans(my_df)

    print(results)
    """
    {'total rows': 5, 're-scans': 3, 'triggered by': {'col_2': 2, 'col_1': 1}}
    """

    """
    Notes:

    Scans are triggered if a column value is non-null when its previous state
    was recorded as null.

    The first row always triggers a scan, so that does not count as a re-scan.

    In the above example, the second and third rows trigger re-scans because
    the None values traded places, and the fourth row triggers a re-scan
    because the col_2 value changed from null to non-null. However, the fifth
    row does not trigger a re-scan because the previous row did not contain
    any null values.
    """
	"""
	A DataFrame that contains a lot of "null" values (NaN, etc.) can cause a
	significant performance penalty with fast_executemany=True. For details, see

	https://github.com/mkleehammer/pyodbc/issues/741

	This routine scans the specified DataFrame to see how many "row scans" would be
	required when using .to_sql() with fast_executemany=True. The first row must
	always be scanned, but after that a re-scan will flush the current rows to the
	server, increasing network traffic and slowing things down.

	version 1.3 - 2023-10-21
	"""

	from collections import OrderedDict

	import pandas as pd


	def test_df_for_rescans(df):
	# we haven't scanned yet, so flag all columns as null
	is_null = {col_name: True for col_name in df.columns}
	rescan_count = -1
	triggered_by = dict()
	first_one = True
	for row_idx in df.index:
	scan_needed = False
	for col_name in df.columns:
	if pd.notna(df.at[row_idx, col_name]) and is_null[col_name]:
	scan_needed = True
	if not first_one:
	n = triggered_by.get(col_name, 0)
	triggered_by[col_name] = n + 1
	first_one = False
	break
	if scan_needed:
	rescan_count += 1
	is_null = {
	col_name: pd.isna(df.at[row_idx, col_name])
	for col_name in df.columns
	}
	if not any(is_null.values()):
	# all False, so no need to scan subsequent rows
	break
	return {
	"total rows": len(df),
	"re-scans": rescan_count,
	"triggered by": dict(
	OrderedDict(
	sorted(triggered_by.items(), key=lambda t: t[1], reverse=True)
	)
	),
	}


	if __name__ == "__main__":
	# example data
	my_df = pd.DataFrame(
	[(1, None), (None, 2), (3, None), (4, 4), (None, 5)],
	columns=["col_1", "col_2"],
	)

	results = test_df_for_rescans(my_df)

	print(results)
	"""
	{'total rows': 5, 're-scans': 3, 'triggered by': {'col_2': 2, 'col_1': 1}}
	"""

	"""
	Notes:

	Scans are triggered if a column value is non-null when its previous state
	was recorded as null.

	The first row always triggers a scan, so that does not count as a re-scan.

	In the above example, the second and third rows trigger re-scans because
	the None values traded places, and the fourth row triggers a re-scan
	because the col_2 value changed from null to non-null. However, the fifth
	row does not trigger a re-scan because the previous row did not contain
	any null values.
	"""