Last active
September 30, 2019 07:43
-
-
Save toriningen/fb10e1d71b222808ad5b3e26c3e27880 to your computer and use it in GitHub Desktop.
Fast way to merge lots of pandas DataFrames (~4x faster than plain reduce())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
# Sentinel meaning "no initializer supplied"; lets None be a valid initializer.
_missing = object()


def sorting_reduce(function, iterable, key=lambda x: x, initializer=_missing):
    """Stable sorting reduce.

    Repeatedly removes the two smallest items (as ranked by ``key``),
    combines them with ``function`` and puts the result back, until a single
    item remains. Items of equal size are taken in their original order, and
    each pair is passed to ``function`` with the earlier item on the left.

    Args:
        function: Two-argument callable combining a left and a right item.
        iterable: Items to reduce.
        key: One-argument callable returning a comparable size for an item.
            It is applied to every input and to every intermediate result.
            Defaults to the identity.
        initializer: Optional extra item, ordered before every item of
            ``iterable``. It takes part in the size ordering like any other
            item; it is not necessarily combined first.

    Returns:
        The single remaining item. When only one item is available it is
        returned as is and ``function`` is never called.

    Raises:
        AssertionError: If ``iterable`` is empty and no initializer is given.
    """
    import heapq

    iterable = iter(iterable)
    if initializer is _missing:
        initializer = next(iterable, _missing)
        if initializer is _missing:
            # Explicit raise instead of `assert` so the check survives
            # `python -O`; AssertionError is kept for existing callers.
            raise AssertionError("At least one item or initializer expected")

    # Heap entries are (size, sequence number, item). Sequence numbers are
    # unique, so ties on size are resolved by order and the items themselves
    # are never compared.
    items = [
        (key(initializer), -1, initializer),
        *((key(x), seq, x) for seq, x in enumerate(iterable)),
    ]
    heapq.heapify(items)

    seq = len(items) - 1  # next unused sequence number
    while len(items) >= 2:
        _, seq_left, left = heapq.heappop(items)
        _, seq_right, right = heapq.heappop(items)
        # The smaller item pops first; restore positional order for function.
        if seq_left > seq_right:
            left, right = right, left
        new = function(left, right)
        heapq.heappush(items, (key(new), seq, new))
        seq += 1

    _, _, result = heapq.heappop(items)
    return result
def merge_all(frames, *args, **kwargs):
    """Merge many DataFrames into one, combining the smallest frames first.

    Args:
        frames: Iterable of DataFrames; must contain at least one.
        *args: Positional arguments forwarded to every ``pd.merge`` call.
        **kwargs: Keyword arguments forwarded to every ``pd.merge`` call.

    Returns:
        The result of pairwise-merging all frames, ranked by row count
        (``len(df.index)``) at each step.

    Raises:
        AssertionError: If ``frames`` is empty.
    """
    # Materialize first: a generator is always truthy, so testing it directly
    # would never detect an empty input.
    frames = list(frames)
    if not frames:
        # Explicit raise so the check is not stripped by `python -O`.
        raise AssertionError("At least one frame expected")
    return sorting_reduce(
        lambda left, right: pd.merge(left, right, *args, **kwargs),
        frames,
        key=lambda df: len(df.index),
    )
# Usage example: outer-join several frames on their indexes. Guarded so that
# importing this module does not execute it; the sample frames are only
# illustrative.
if __name__ == "__main__":
    df1 = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
    df2 = pd.DataFrame({"b": [3, 4, 5]}, index=["x", "y", "z"])
    df3 = pd.DataFrame({"c": [6]}, index=["z"])
    merged_df = merge_all(
        [df1, df2, df3],
        how='outer', left_index=True, right_index=True, copy=False,
    )
    print(merged_df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment