Last active
May 1, 2023 16:36
-
-
Save ghl3/4556336 to your computer and use it in GitHub Desktop.
Using Python threading to download random Wikipedia articles.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import threading
import time
import urllib2
from Queue import Empty
from Queue import Queue
from random import choice
def get_random_article(namespace=None):
    """Download a random Wikipedia article.

    Args:
        namespace: Optional namespace name. When given, it is appended to
            the ``Special:Random`` URL as ``/<namespace>``; ``None``
            requests the plain ``Special:Random`` URL.

    Returns:
        The response body as a list of lines, as produced by
        ``readlines()`` on the opened URL.

    Raises:
        urllib2.HTTPError, urllib2.URLError: Re-raised unchanged after a
            failure message has been printed.
    """
    url = 'http://en.wikipedia.org/wiki/Special:Random'
    if namespace is not None:
        url += '/' + namespace

    # An explicit User-Agent header is sent with the request.
    # NOTE(review): presumably the default urllib2 agent is rejected by
    # the server -- confirm before changing the value.
    req = urllib2.Request(url, None, {'User-Agent': 'x'})

    try:
        response = urllib2.urlopen(req)
        try:
            return response.readlines()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
    except (urllib2.HTTPError, urllib2.URLError):
        print("Failed to get article")
        raise
class DocumentDownloader(threading.Thread):
    """Worker thread that downloads one random article per queue item.

    Each worker takes namespace labels from a shared queue. For every
    label it receives it calls ``get_random_article()`` and stores the
    result in its own ``articles`` list. The worker keeps polling the
    queue until ``stop()`` is called.
    """

    # Seconds to wait for a queue item before re-checking the stop flag.
    _POLL_INTERVAL = 0.1

    def __init__(self, queue):
        """Create a worker that consumes items from *queue*."""
        threading.Thread.__init__(self)
        # Deliberately not called ``_stop``: Python 3's Thread defines a
        # private ``_stop`` method, and shadowing it breaks ``join()``.
        self._stop_event = threading.Event()
        self.articles = []
        self.queue = queue

    def stop(self):
        """Ask the worker to exit once its current item is finished."""
        self._stop_event.set()

    def stopped(self):
        """Return True once ``stop()`` has been called."""
        return self._stop_event.is_set()

    def get_articles(self):
        """Return the list of articles this worker has downloaded."""
        return self.articles

    def run(self):
        """Consume queue items until the stop flag is set."""
        while not self.stopped():
            # A timed get replaces the former ``empty()`` check followed by
            # a blocking ``get()``. With several workers, another thread
            # could take the last item in between, leaving this one blocked
            # forever and unable to notice ``stop()``.
            try:
                namespace = self.queue.get(timeout=self._POLL_INTERVAL)
            except Empty:
                continue

            try:
                # The namespace is only used as a label in the messages;
                # the article itself is fetched without a namespace.
                article = get_random_article()
            except Exception:
                print("Failed to process namespace:  %s" % namespace)
                continue

            self.articles.append(article)
            # Emitted as a single string so the two halves of the message
            # are not written by separate print calls.
            print("Successfully processed namespace:  %s  by thread:  %s"
                  % (namespace, self.ident))
            # No need for a 'queue.task_done' since nobody joins the queue.
def get_random_documents(num_documents, num_threads=4):
    """Download ``num_documents`` random documents using worker threads.

    ``num_threads`` ``DocumentDownloader`` workers are started on a shared
    queue. One randomly chosen namespace label is queued per requested
    document, and the function waits until the queue has been drained
    before stopping and joining the workers.

    Args:
        num_documents: Number of items to queue for download.
        num_threads: Number of worker threads to start (default 4).

    Returns:
        A list holding every article collected by the workers, gathered
        worker by worker.

    Raises:
        ValueError: If documents are requested but ``num_threads`` is less
            than 1 (nothing would ever drain the queue).
    """
    # Without any worker the wait loop below would never terminate.
    if num_documents > 0 and num_threads < 1:
        raise ValueError(
            "num_threads must be at least 1 to download documents")

    wiki_namespaces = [
        "Main",
        "User",
        "Wikipedia",
        "File",
        "MediaWiki",
        "Template",
        "Help",
        "Category",
        "Portal",
        "Book",
    ]

    q = Queue()
    threads = []

    try:
        # Create the workers and start them; from here on they are
        # listening on the queue, waiting to consume.
        for _ in range(num_threads):
            thread = DocumentDownloader(q)
            thread.daemon = True
            thread.start()
            threads.append(thread)

        # Queue one randomly chosen namespace per requested document.
        for _ in range(num_documents):
            q.put(choice(wiki_namespaces))

        # Poll with a sleep instead of queue.join() so the main thread
        # wakes up regularly while waiting for the queue to drain.
        while not q.empty():
            time.sleep(1.0)
    except BaseException:
        print("Main thread hit exception")
        raise
    finally:
        # Single shutdown path for success and failure alike: signal every
        # worker first, then wait for each of them to finish.
        for thread in threads:
            thread.stop()
        for thread in threads:
            thread.join()

    # Collect all downloaded documents from the workers.
    documents = []
    for thread in threads:
        documents.extend(thread.get_articles())
    return documents
if __name__ == "__main__":
    # Fetch ten random documents and show the first ten lines of each.
    documents = get_random_documents(10)
    for article_lines in documents:
        preview = article_lines[:10]
        print(preview)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I used this script to download Wikipedia articles, but it didn't work. The error looks like this: "Failed to process namespace Main User Wikipedia File MediaWiki Template Help Category Portal Book". How can I resolve this error? I need your help!
Thanks.