Skip to content

Instantly share code, notes, and snippets.

@toriningen
Last active September 30, 2019 07:43
Show Gist options
  • Save toriningen/fb10e1d71b222808ad5b3e26c3e27880 to your computer and use it in GitHub Desktop.
Save toriningen/fb10e1d71b222808ad5b3e26c3e27880 to your computer and use it in GitHub Desktop.
Fast way to merge lots of pandas DataFrames (~4x faster than plain reduce())
import pandas as pd
# Sentinel that distinguishes "no initializer given" from ``initializer=None``.
_missing = object()


def sorting_reduce(function, iterable, key=lambda x: x, initializer=_missing):
    """Reduce *iterable* by repeatedly combining its two smallest items.

    All items (plus *initializer*, if given) are placed in a min-heap ordered
    by ``key(item)``. While at least two entries remain, the two smallest are
    popped, combined with ``function(left, right)``, and the result is pushed
    back with its own freshly computed key.

    Ordering details:
      * Every entry carries a unique sequence number, so ties on ``key`` are
        broken by insertion order and the items themselves are never compared.
      * Within each popped pair, the entry with the lower sequence number is
        passed as ``left``. Intermediate results get sequence numbers higher
        than every original item, so they are treated as "newer" than any
        input that is still waiting in the heap.

    Args:
        function: Two-argument callable used to combine a pair of items.
        iterable: Items to reduce; consumed exactly once.
        key: One-argument callable returning the sort key ("size") of an item
            or of an intermediate result. Defaults to the identity.
        initializer: Optional extra item, treated as if it came before every
            element of *iterable*.

    Returns:
        The single item left after all combinations. If only one item was
        supplied, it is returned unchanged and *function* is never called.

    Raises:
        AssertionError: If *iterable* is empty and no *initializer* was given.
    """
    import heapq

    iterator = iter(iterable)
    if initializer is _missing:
        initializer = next(iterator, _missing)
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; AssertionError is kept for backward compatibility.
    if initializer is _missing:
        raise AssertionError("At least one item or initializer expected")

    # Heap entries are (key, sequence number, item); the initializer gets -1
    # so it sorts ahead of every enumerated item with an equal key.
    heap = [(key(initializer), -1, initializer)]
    heap.extend((key(item), seq, item) for seq, item in enumerate(iterator))
    heapq.heapify(heap)

    # First sequence number not yet used by any entry.
    next_seq = len(heap) - 1
    while len(heap) >= 2:
        _, seq_first, first = heapq.heappop(heap)
        _, seq_second, second = heapq.heappop(heap)
        # The heap pops by key first, so restore insertion order in the pair.
        if seq_first > seq_second:
            first, second = second, first
        combined = function(first, second)
        heapq.heappush(heap, (key(combined), next_seq, combined))
        next_seq += 1

    return heap[0][2]
def merge_all(frames, *args, **kwargs):
    """Merge all of *frames* into one DataFrame using pairwise ``pd.merge``.

    The frames are combined through ``sorting_reduce`` with the number of
    index entries (``len(df.index)``) as the size key, so at each step the two
    frames with the fewest rows are merged first.

    Args:
        frames: Iterable of DataFrames to merge. It is materialized into a
            list, so one-shot iterators and generators are accepted.
        *args: Extra positional arguments forwarded to every ``pd.merge`` call.
        **kwargs: Extra keyword arguments forwarded to every ``pd.merge`` call.

    Returns:
        The merged result; when only one frame is supplied it is returned
        as-is without calling ``pd.merge``.

    Raises:
        AssertionError: If *frames* is empty.
    """
    # Materialize first: a generator is always truthy, so checking it directly
    # would not detect an empty input.
    frames = list(frames)
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; AssertionError is kept for backward compatibility.
    if not frames:
        raise AssertionError("At least one frame expected")

    def merge_pair(left, right):
        return pd.merge(left, right, *args, **kwargs)

    return sorting_reduce(merge_pair, frames, key=lambda df: len(df.index))
# Usage example. df1, df2 and df3 are placeholders for DataFrames that already
# exist; they are not defined in this snippet. Everything after the list of
# frames is forwarded unchanged to each pd.merge call made by merge_all.
merged_df = merge_all([df1, df2, df3], how='outer', left_index=True, right_index=True, copy=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment