Created
March 30, 2009 20:16
-
-
Save chrisgemignani/87867 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Fetch people for Twitalyzer's mechanical turk. Will fetch and submit | |
Twitalyzer turk people forever with DELAY seconds between each pull. | |
Hit Ctrl-C (OSX, Linux) or Ctrl-D (Windows) to stop. | |
To use, just run: | |
> python twitalizer_turk.py | |
""" | |
import urllib, re, pprint, time, random | |
# time to wait between fetches | |
DELAY = 10.0 | |
# regex to find people in Twitalyzer's Mechanical Turk page | |
people_re = re.compile('href=\"/twitalyzer/r.asp\?u=.*\&uri=turk.asp\">') | |
# url to load a person's data | |
engine_url = 'http://twitalyzer.com/twitalyzer/engine.asp?u=%s&lang=&uri=turk.asp' | |
# turk | |
turk_url = 'http://twitalyzer.com/twitalyzer/turk.asp' | |
while True: | |
try: | |
# get the content of the turk page | |
turk_content = urllib.urlopen(turk_url).read() | |
# grab all the people | |
turk_people = [] | |
for tp in people_re.findall(turk_content, re.MULTILINE): | |
name = tp[26:-15] | |
if name: | |
turk_people.append(name) | |
if turk_people: | |
# pick a person at random | |
person = random.choice(turk_people) | |
print "%d people in queue" % len(turk_people) | |
print "\tfetching", person | |
starttime = time.time() | |
f = urllib.urlopen(engine_url % person).read() | |
print "\tprocessed in %.2f seconds" % (time.time() - starttime) | |
except Exception, e: | |
print "Ooops! Hit a problem:", e | |
print "\tsleeping for %.1f seconds" % DELAY | |
time.sleep(DELAY) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment