Created
August 6, 2019 13:33
-
-
Save jamesdunham/adb6909837a40540e5be010f030a2e45 to your computer and use it in GitHub Desktop.
Load a DataFrame into Postgres
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from io import StringIO | |
import psycopg2 | |
from tqdm import tqdm | |
def copy_from(df: pd.DataFrame, | |
table: str, | |
connection: psycopg2.extensions.connection, | |
chunk_size: int = 10000): | |
"""Load a DataFrame into Postgres. | |
Using COPY FROM via psycopg2 seems to be faster than the to_sql() method. | |
- http://initd.org/psycopg/docs/cursor.html#cursor.copy_from | |
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html | |
- https://www.postgresql.org/docs/current/sql-copy.html | |
:param df: DataFrame to load. | |
:param table: Name of a table. | |
:param connection: Database connection. | |
:param chunk_size: Number of rows to load at once. | |
:return: None | |
""" | |
cursor = connection.cursor() | |
df = df.copy() | |
# Per COPY FROM documentation: | |
# "Backslash characters can be used in the COPY data to quote data characters that might otherwise be taken as row | |
# or column delimiters. In particular, the following characters must be preceded by a backslash if they appear as | |
# part of a column value: backslash itself, newline, carriage return, and the current delimiter character." | |
escaped = {'\\': '\\\\', '\n': r'\n', '\r': r'\r', '\t': r'\t'} | |
for col in df.columns: | |
if df.dtypes[col] == 'object': | |
for v, e in escaped.items(): | |
df[col] = df[col].str.replace(v, e) | |
try: | |
for i in tqdm(range(0, df.shape[0], chunk_size)): | |
f = StringIO() | |
chunk = df.iloc[i:(i + chunk_size)] | |
# The default separator is a tab, and NULLs are indicated by the two character-string '\N' | |
chunk.to_csv(f, index=False, header=False, sep='\t', na_rep='\\N', quoting=None) | |
f.seek(0) | |
cursor.copy_from(f, table, columns=list(df.columns)) | |
connection.commit() | |
except psycopg2.Error: | |
connection.rollback() | |
cursor.close() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Today, I've seen your code for StringIO and I'm thinking of adding it to my Python code. Thank you.