Last active
July 27, 2016 16:54
-
-
Save enewe101/e1da9b9a5681f90df36354d61dad79f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import shutil | |
from t4k import ( | |
PersistentOrderedDict as POD, ProgressTracker as PT, | |
DuplicateKeyException | |
) | |
# The POD (PersistentOrderedDict) is a general-purpose data store that syncs
# to disk. The PT (ProgressTracker) is a subclass that's a bit more
# convenient for tracking long-running processes.
#
# Here's a typical example.
#
# Let me just define a couple of things first to use in the example...
# Cities that the example tracker loop iterates over.
CITIES = [
    'Montreal', 'Toronto', 'Vancouver', 'Ottawa', 'New York', 'London',
    'Tokyo',
]

# Per-city retry budget consulted by the example tracker loop.
MAX_TRIES = 3
class CityError(Exception):
    """Raised by process_city to simulate a transient processing failure."""
def process_city(city):
    """Simulate processing *city* against an unreliable resource.

    Before doing any work, the call fails at random (when
    ``random.random() < 0.3``) so that callers can exercise retry logic.
    Otherwise a message about the city is printed and returned.

    Args:
        city: Name of the city to process.

    Returns:
        The message that was printed for the city.

    Raises:
        CityError: When the random draw falls below 0.3.
    """
    # Fail before producing any result, mimicking a flaky backend.
    if random.random() < 0.3:
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print('CityError! Try again later!')
        raise CityError()

    if city in {'Montreal', 'Toronto', 'Ottawa'}:
        msg = 'Edward lived there once.'
    elif city == 'Tokyo':
        msg = 'Tokyo is a cool city!'
    else:
        msg = "%s is a crap city!" % city

    print(msg)
    return msg
# OK, here's the example. To simulate dealing with an unreliable resource,
# the call to process_city will randomly throw an error.
# You'll probably need to run the example a few times before it successfully
# processes all the cities. Try it out!
# Progress is persisted under 'city-tracker', so re-running the script
# resumes where the previous run left off.
tracker = PT('city-tracker')
for city in CITIES:
    # check_or_add returns True only for entries already marked done; an
    # unknown city is added and reported as not done.
    if tracker.check_or_add(city):
        print('%s is already done, skiping!' % city)
        continue

    # Give up once the retry budget is spent. The comparison must be >=
    # because the counter is incremented *after* this check; using > would
    # allow MAX_TRIES + 1 attempts.
    if tracker[city]['_tries'] >= MAX_TRIES:
        continue
    tracker.increment_tries(city)

    try:
        result = process_city(city)
    except CityError:
        # Leave the city not-done so that a later run retries it.
        continue

    tracker.set(city, 'result', result)
    tracker.mark_done(city)
######## More details below!
#
# Store arbitrary data: (key, subkey, value)
#
# When making a PT or POD, give a path -- a directory will be made there
# that will store the POD files.
pod = PT('example-pod')

# The walkthrough runs inside try/finally so that the 'example-pod'
# directory is removed even if one of the steps raises; otherwise a leftover
# directory would make the next run fail on the first pod.add() call.
try:
    # Creates a new entry for something you want to track.
    # For example, maybe you want to track which urls have been crawled,
    # which files have been written, etc.
    pod.add('some-key')

    # Trying to add the same key twice is an error!
    try:
        pod.add('some-key')
    except DuplicateKeyException:
        print('Trying to add the same key twice is an error!')

    # You can set arbitrary data on the entry.
    # By default, this will be synced to disk immediately.
    pod.set('some-key', 'sub-key', 42)
    pod['some-key']['sub-key']  # this will return 42.

    #
    # Keep track of progress
    #
    # Check if an entry is "done". It will return False this time
    pod.check('some-key')

    # Mark an entry done
    pod.mark_done('some-key')

    # Now this returns True...
    pod.check('some-key')

    # Checking for a non-existent key is fine. It returns False
    pod.check('non-existent-key')

    # A common pattern is to check if a key is done, and also, if it doesn't
    # exist, to add it. This will return False if the entry is not done or
    # doesn't exist, and if it doesn't exist the key will be added.
    pod.check_or_add('some-key')

    # Keep track of the number of times you try to process an entry
    pod.increment_tries('some-key')

    # The entry's number of tries is in a subkey '_tries'.
    # This would return 1 right now, because we called increment_tries above
    pod['some-key']['_tries']

    # Reset tries back to zero
    pod.reset_tries('some-key')

    # Now this returns 0
    pod['some-key']['_tries']

    #
    # Control syncing to disk
    #
    # If you're making many changes, then syncing them to disk becomes
    # too much of a performance hit. Call hold, and any changes you make
    # will be held in memory, but won't be synced to disk
    pod.hold()  # After calling pod.hold, any changes are reflected in memory

    # Then, calling unhold() later will cause all of those changes to be
    # synced
    pod.unhold()

    # A common pattern is:
    # 1. call hold()
    # 2. make several updates
    # 3. call unhold()

    # You can manually mark some entry as needing to be synced. Usually
    # you don't need this unless you're doing some hacky stuff. Calling this
    # will either cause the entry to be synced immediately, or if hold() has
    # been called, the entry will be synced as soon as unhold() is called
    pod.update('some-key')
finally:
    # Remove the example-pod so that the example can be run multiple times
    # without raising unintended DuplicateKey exceptions
    shutil.rmtree('example-pod')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment