Skip to content

Instantly share code, notes, and snippets.

@pmbaumgartner
Last active November 10, 2022 17:24
Show Gist options
  • Save pmbaumgartner/17ea7e236c11df1c8a680fe547a312d7 to your computer and use it in GitHub Desktop.
Save pmbaumgartner/17ea7e236c11df1c8a680fe547a312d7 to your computer and use it in GitHub Desktop.
Django/Postgres Data Load - Performance Comparison
from contextlib import closing
from io import StringIO
from time import time
import pandas as pd
from django.core.management.base import BaseCommand
from django.db import transaction
from faker import Faker
from core.models import Thing
fake = Faker()
def fake_thing():
return dict(
char=fake.name(),
text=fake.text(),
integer=fake.pyint(),
float=fake.pyfloat(),
boolean=fake.pybool(),
)
def in_memory_csv(data):
"""Creates an in-memory csv.
Assumes `data` is a list of dicts
with native python types."""
mem_csv = StringIO()
pd.DataFrame(data).to_csv(mem_csv, index=False)
mem_csv.seek(0)
return mem_csv
class Command(BaseCommand):
def handle(self, *args, **options):
Thing.objects.all().delete()
n_things = 500000
things = [fake_thing() for _ in range(n_things)]
# 01. NAIVE
start_01 = time()
for thing in things:
t = Thing(**thing)
t.save()
finish_01 = time()
assert Thing.objects.count() == n_things
assert Thing.objects.first().char == things[0]["char"]
Thing.objects.all().delete()
# 02. IN TRANSACTION
start_02 = time()
with transaction.atomic():
for thing in things:
t = Thing(**thing)
t.save()
finish_02 = time()
assert Thing.objects.count() == n_things
assert Thing.objects.first().char == things[0]["char"]
Thing.objects.all().delete()
# 03. BULK CREATE
start_03 = time()
thing_objects = []
for thing in things:
t = Thing(**thing)
thing_objects.append(t)
Thing.objects.bulk_create(thing_objects)
finish_03 = time()
assert Thing.objects.count() == n_things
assert Thing.objects.first().char == things[0]["char"]
Thing.objects.all().delete()
# 04. CSV
start_04 = time()
mem_csv = in_memory_csv(things)
with closing(mem_csv) as csv_io:
Thing.objects.from_csv(csv_io)
finish_04 = time()
assert Thing.objects.count() == n_things
assert Thing.objects.first().char == things[0]["char"]
Thing.objects.all().delete()
duration_01 = finish_01 - start_01
duration_02 = finish_02 - start_02
duration_03 = finish_03 - start_03
duration_04 = finish_04 - start_04
self.stdout.write(f"THINGS: {n_things}")
self.stdout.write(
f"Method 01: {duration_01:.2f}s. Speedup: {duration_01 / duration_01:.2f}"
)
self.stdout.write(
f"Method 02: {duration_02:.2f}s. Speedup: {duration_01 / duration_02:.2f}"
)
self.stdout.write(
f"Method 03: {duration_03:.2f}s. Speedup: {duration_01 / duration_03:.2f}"
)
self.stdout.write(
f"Method 04: {duration_04:.2f}s. Speedup: {duration_01 / duration_04:.2f}"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment