Last active
November 10, 2022 17:24
-
-
Save pmbaumgartner/17ea7e236c11df1c8a680fe547a312d7 to your computer and use it in GitHub Desktop.
Django/Postgres Data Load - Performance Comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from contextlib import closing | |
from io import StringIO | |
from time import time | |
import pandas as pd | |
from django.core.management.base import BaseCommand | |
from django.db import transaction | |
from faker import Faker | |
from core.models import Thing | |
fake = Faker() | |
def fake_thing(): | |
return dict( | |
char=fake.name(), | |
text=fake.text(), | |
integer=fake.pyint(), | |
float=fake.pyfloat(), | |
boolean=fake.pybool(), | |
) | |
def in_memory_csv(data): | |
"""Creates an in-memory csv. | |
Assumes `data` is a list of dicts | |
with native python types.""" | |
mem_csv = StringIO() | |
pd.DataFrame(data).to_csv(mem_csv, index=False) | |
mem_csv.seek(0) | |
return mem_csv | |
class Command(BaseCommand): | |
def handle(self, *args, **options): | |
Thing.objects.all().delete() | |
n_things = 500000 | |
things = [fake_thing() for _ in range(n_things)] | |
# 01. NAIVE | |
start_01 = time() | |
for thing in things: | |
t = Thing(**thing) | |
t.save() | |
finish_01 = time() | |
assert Thing.objects.count() == n_things | |
assert Thing.objects.first().char == things[0]["char"] | |
Thing.objects.all().delete() | |
# 02. IN TRANSACTION | |
start_02 = time() | |
with transaction.atomic(): | |
for thing in things: | |
t = Thing(**thing) | |
t.save() | |
finish_02 = time() | |
assert Thing.objects.count() == n_things | |
assert Thing.objects.first().char == things[0]["char"] | |
Thing.objects.all().delete() | |
# 03. BULK CREATE | |
start_03 = time() | |
thing_objects = [] | |
for thing in things: | |
t = Thing(**thing) | |
thing_objects.append(t) | |
Thing.objects.bulk_create(thing_objects) | |
finish_03 = time() | |
assert Thing.objects.count() == n_things | |
assert Thing.objects.first().char == things[0]["char"] | |
Thing.objects.all().delete() | |
# 04. CSV | |
start_04 = time() | |
mem_csv = in_memory_csv(things) | |
with closing(mem_csv) as csv_io: | |
Thing.objects.from_csv(csv_io) | |
finish_04 = time() | |
assert Thing.objects.count() == n_things | |
assert Thing.objects.first().char == things[0]["char"] | |
Thing.objects.all().delete() | |
duration_01 = finish_01 - start_01 | |
duration_02 = finish_02 - start_02 | |
duration_03 = finish_03 - start_03 | |
duration_04 = finish_04 - start_04 | |
self.stdout.write(f"THINGS: {n_things}") | |
self.stdout.write( | |
f"Method 01: {duration_01:.2f}s. Speedup: {duration_01 / duration_01:.2f}" | |
) | |
self.stdout.write( | |
f"Method 02: {duration_02:.2f}s. Speedup: {duration_01 / duration_02:.2f}" | |
) | |
self.stdout.write( | |
f"Method 03: {duration_03:.2f}s. Speedup: {duration_01 / duration_03:.2f}" | |
) | |
self.stdout.write( | |
f"Method 04: {duration_04:.2f}s. Speedup: {duration_01 / duration_04:.2f}" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment