Skip to content

Instantly share code, notes, and snippets.

@abhishekmishragithub
Created April 22, 2023 07:08
Show Gist options
  • Select an option

  • Save abhishekmishragithub/0cc2fd2bf9e6c862758f0dc93ad74213 to your computer and use it in GitHub Desktop.

Select an option

Save abhishekmishragithub/0cc2fd2bf9e6c862758f0dc93ad74213 to your computer and use it in GitHub Desktop.
import os
import argparse
from time import time
import pandas as pd
from sqlalchemy import create_engine
def ingest_data(user, password, host, port, db, table_name, url):
# the backup files are gzipped, and it's important to keep the correct extension
# for pandas to be able to open the file
if url.endswith('.csv.gz'):
csv_name = 'yellow_tripdata_2021-01.csv.gz'
else:
csv_name = 'output.csv'
os.system(f"wget {url} -O {csv_name}")
postgres_url = f'postgresql://{user}:{password}@{host}:{port}/{db}'
engine = create_engine(postgres_url)
df_iter = pd.read_csv(csv_name, iterator=True, chunksize=100000)
df = next(df_iter)
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
df.head(n=0).to_sql(name=table_name, con=engine, if_exists='replace')
df.to_sql(name=table_name, con=engine, if_exists='append')
while True:
try:
t_start = time()
df = next(df_iter)
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
df.to_sql(name=table_name, con=engine, if_exists='append')
t_end = time()
print('inserted another chunk, took %.3f second' % (t_end - t_start))
except StopIteration:
print("Finished ingesting data into the postgres database")
break
if __name__ == '__main__':
user = "postgres"
password = "admin"
host = "localhost"
port = "5433"
db = "ny_taxi"
table_name = "yellow_taxi_trips"
csv_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2021-01.csv.gz"
ingest_data(user, password, host, port, db, table_name, csv_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment