Skip to content

Instantly share code, notes, and snippets.

@sahuk
Created February 22, 2016 10:20
Show Gist options
  • Save sahuk/7a6ccd24708be8294c76 to your computer and use it in GitHub Desktop.
Save sahuk/7a6ccd24708be8294c76 to your computer and use it in GitHub Desktop.
"""
Converts a large CSV into SQL, can process some of the smaller chunks
Based on https://plot.ly/python/big-data-analytics-with-pandas-and-sqlite/
Original code probably from https://github.com/chriddyp
"""
import pandas
from sqlalchemy import create_engine
import tqdm
CSV1 = 'movies.csv'
CSV2 = 'countries.csv'
SQL = 'db.sqlite3'
engine = create_engine('sqlite:///{}'.format(SQL)):w
def to_sqlite(csvfile, disk_engine, tablename):
    """Stream *csvfile* into table *tablename* via *disk_engine*.

    Reads the CSV in 50,000-row chunks so the whole file is never held
    in memory at once; each chunk is appended to the destination table
    (``if_exists='append'``), with a tqdm progress bar over the chunks.
    """
    chunksize = 50000
    # The file does `import tqdm`, which binds the *module*; the progress
    # bar callable is tqdm.tqdm (the original called the module itself,
    # which raises TypeError). Body indentation restored -- the gist
    # paste had flattened it to column 0.
    reader = pandas.read_csv(csvfile, chunksize=chunksize,
                             iterator=True, encoding='utf-8')
    for df in tqdm.tqdm(reader):
        df.to_sql(tablename, disk_engine, if_exists='append')
# some sample SQL statements
# it's important to filter the dataset with a 'where' clause, or use the 'limit' statement
# otherwise we might end up loading the entire dataset into memory, which we want to avoid
first5 = "select * from movies limit 5"  # Everything we know about the first 5 entries
names_of_top10 = "select title, score from movies order by score DESC limit 10"  # the first 10 entries, by DESCending score

# This is what I type in IPython. Every defined variable will tab complete,
# so I just type the first letters and hit the TAB key to get a suggestion of what I was trying to type.
# %load csv2sql.py   <- IPython magic, not valid Python; kept only as a comment.
to_sqlite(CSV1, engine, 'movies')
to_sqlite(CSV2, engine, 'countries')
# NOTE: the original passed `disk_engine` here, but that name is only a
# parameter inside to_sqlite and is undefined at module level (NameError);
# the module-level engine is `engine`.
df = pandas.read_sql_query(first5, engine)
# display the dataframe etc
df = pandas.read_sql_query(names_of_top10, engine)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment