Skip to content

Instantly share code, notes, and snippets.

@sahuk
Created February 22, 2016 10:20
Show Gist options
  • Save sahuk/7a6ccd24708be8294c76 to your computer and use it in GitHub Desktop.
Save sahuk/7a6ccd24708be8294c76 to your computer and use it in GitHub Desktop.
"""
Converts a large CSV into SQL, can process some of the smaller chunks
Based on https://plot.ly/python/big-data-analytics-with-pandas-and-sqlite/
Original code probably from https://github.com/chriddyp
"""
import pandas
from sqlalchemy import create_engine
import tqdm
CSV1 = 'movies.csv'
CSV2 = 'countries.csv'
SQL = 'db.sqlite3'
engine = create_engine('sqlite:///{}'.format(SQL)):w
def to_sqlite(csvfile, disk_engine, tablename):
    """Stream *csvfile* into table *tablename* via *disk_engine*.

    Reads the CSV in 50,000-row chunks so the whole file is never held
    in memory at once; each chunk is appended to the destination table
    (``if_exists='append'``), with a tqdm progress bar over the chunks.
    """
    chunksize = 50000
    # The file does `import tqdm`, which binds the *module*; the progress
    # bar callable is tqdm.tqdm (the original called the module itself,
    # which raises TypeError). Body indentation restored -- the gist
    # paste had flattened it to column 0.
    reader = pandas.read_csv(csvfile, chunksize=chunksize,
                             iterator=True, encoding='utf-8')
    for df in tqdm.tqdm(reader):
        df.to_sql(tablename, disk_engine, if_exists='append')
# some sample SQL statements
# it's important to filter the dataset with a 'where' clause, or use the 'limit' statement
# otherwise we might end up loading the entire dataset into memory, which we want to avoid
first5 = "select * from movies limit 5"  # Everything we know about the first 5 entries
names_of_top10 = "select title, score from movies order by score DESC limit 10"  # the first 10 entries, by DESCending score

# This is what I type in IPython. Every defined variable will tab complete,
# so I just type the first letters and hit the TAB key to get a suggestion of what I was trying to type.
# %load csv2sql.py   <- IPython magic, not valid Python; kept only as a comment.
to_sqlite(CSV1, engine, 'movies')
to_sqlite(CSV2, engine, 'countries')
# NOTE: the original passed `disk_engine` here, but that name is only a
# parameter inside to_sqlite and is undefined at module level (NameError);
# the module-level engine is `engine`.
df = pandas.read_sql_query(first5, engine)
# display the dataframe etc
df = pandas.read_sql_query(names_of_top10, engine)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment