# JeffSpies/persist_to_disk.py

## persist_to_disk.py
# Write after all requests have been made
# - If it crashes, you lose all those time-consuming requests.
# - Doing it again is not respectful to the server you're making requests to.

# Build the complete result list in memory first, then persist it in a single
# call once every request has finished.
data = [parse_response(make_request(thing)) for thing in my_list_of_things]
write_to_disk(data)  # could be pickle, json, csv, sqlite, leveldb

# Write after each request
# - Writing to disk is slow; will further slow things down.

# Persist every item as soon as it has been parsed, so a crash costs at most
# the request that was in flight.
for i in my_list_of_things:
    response = make_request(i)
    item = parse_response(response)
    # Write the item that was just parsed. This loop never builds a `data`
    # list, so passing `data` here would not persist the new result. The item
    # is wrapped in a list so write_to_disk receives the same kind of argument
    # as in the batch-oriented strategies.
    write_to_disk([item])  # could be pickle, json, csv, sqlite, leveldb

# Write after X number of requests
# - You risk losing only X requests if something crashes
# + You can write some more code to start where you left off

# Buffer parsed items and flush them to disk in batches of X.
# X is expected to be a positive integer batch size.
data = []
for i in my_list_of_things:
    response = make_request(i)
    item = parse_response(response)
    data.append(item)
    # Flush only once the buffer is full. Checking after the append (rather
    # than testing the loop counter before the request) avoids writing an
    # empty batch on the very first iteration.
    if len(data) >= X:
        write_to_disk(data)
        # Rebind instead of clearing in place, in case write_to_disk keeps a
        # reference to the list it was handed.
        data = []
# Persist the final partial batch; skip the write when nothing is left over.
if data:
    write_to_disk(data)  # could be pickle, json, csv, sqlite, leveldb

# On write_to_disk:
# - Can you just query the resulting data in memory?
#   If so, a pandas data frame is quite powerful. (http://pandas.pydata.org/)
#       In that case, I might suggest
#           - Writing to one CSV file
#           - Writing to one or many pickle file(s)
#           - Writing to many JSON files
#   If not, look at
#       - sqlite (libraries: sqlite3, peewee, sqlalchemy)
#           - Do you know/like SQL? Do you need querying against complex relationships?
#           - Either way, look at
#               - http://docs.peewee-orm.com/en/latest/
#               - https://www.sqlalchemy.org/
#       - tinydb (https://github.com/msiemens/tinydb)
#           - Stores in JSON
#               - Make it faster: https://tinydb.readthedocs.io/en/latest/usage.html#alternative-json-library
#           - Has an indexing extension if you are querying from this a lot
#       - leveldb (a bunch of interfaces; not sure what is best)
	# Write after all requests have been made
	# - If it crashes, you lose all those time-consuming requests.
	# - Doing it again is not respectful to the server you're making requests to.

	# Collect every parsed item in memory, then persist the whole list once
	# all requests have finished.
	data = []
	for i in my_list_of_things:
		# The loop body must be nested under the `for`; without the extra
		# indent level the snippet is not valid Python.
		response = make_request(i)
		item = parse_response(response)
		data.append(item)
	write_to_disk(data)  # could be pickle, json, csv, sqlite, leveldb

	# Write after each request
	# - Writing to disk is slow; will further slow things down.

	# Persist every item as soon as it has been parsed, so a crash costs at
	# most the request that was in flight.
	for i in my_list_of_things:
		response = make_request(i)
		item = parse_response(response)
		# Write the item that was just parsed. This loop never builds a `data`
		# list, so passing `data` here would not persist the new result. The
		# item is wrapped in a list so write_to_disk receives the same kind of
		# argument as in the batch-oriented strategies.
		write_to_disk([item])  # could be pickle, json, csv, sqlite, leveldb

	# Write after X number of requests
	# - You risk losing only X requests if something crashes
	# + You can write some more code to start where you left off

	# Buffer parsed items and flush them to disk in batches of X.
	# X is expected to be a positive integer batch size.
	data = []
	for i in my_list_of_things:
		response = make_request(i)
		item = parse_response(response)
		data.append(item)
		# Flush only once the buffer is full. Checking after the append
		# (rather than testing the loop counter before the request) avoids
		# writing an empty batch on the very first iteration.
		if len(data) >= X:
			write_to_disk(data)
			# Rebind instead of clearing in place, in case write_to_disk
			# keeps a reference to the list it was handed.
			data = []
	# Persist the final partial batch; skip the write when nothing is left.
	if data:
		write_to_disk(data)  # could be pickle, json, csv, sqlite, leveldb

	# On write_to_disk:
	# - Can you just query the resulting data in memory?
	#   If so, a pandas data frame is quite powerful. (http://pandas.pydata.org/)
	#       In that case, I might suggest
	#           - Writing to one CSV file
	#           - Writing to one or many pickle file(s)
	#           - Writing to many JSON files
	#   If not, look at
	#       - sqlite (libraries: sqlite3, peewee, sqlalchemy)
	#           - Do you know/like SQL? Do you need querying against complex relationships?
	#           - Either way, look at
	#               - http://docs.peewee-orm.com/en/latest/
	#               - https://www.sqlalchemy.org/
	#       - tinydb (https://github.com/msiemens/tinydb)
	#           - Stores in JSON
	#               - Make it faster: https://tinydb.readthedocs.io/en/latest/usage.html#alternative-json-library
	#           - Has an indexing extension if you are querying from this a lot
	#       - leveldb (a bunch of interfaces; not sure what is best)