Skip to content

Instantly share code, notes, and snippets.

@baditaflorin
Last active December 22, 2016 12:05
Show Gist options
  • Save baditaflorin/fb6dcec9388989d54c9d307dbfe7e261 to your computer and use it in GitHub Desktop.
Save baditaflorin/fb6dcec9388989d54c9d307dbfe7e261 to your computer and use it in GitHub Desktop.
#Source http://www.craigaddyman.com/python-queues-and-multi-threading/
# standard libraries
from datetime import datetime
import Queue
from threading import Thread
# third party libraries
from bs4 import BeautifulSoup
import requests
# Record the start time so the total run time can be printed at the end.
startTime = datetime.now()
# Shared work queue read by the worker threads. LifoQueue is last-in,
# first-out, so the most recently queued URL is handed out first.
q = Queue.LifoQueue()
# Sitemap that lists the pages to fetch.
url = "http://www.telegraph.co.uk/wrestling/sitemap.xml"
# Download the sitemap (timeout=5 is passed to requests) and parse it.
r = requests.get(url, timeout=5)
data = r.text
# The decoded text is re-encoded to UTF-8 bytes before being parsed.
# NOTE(review): no parser is named here, so BeautifulSoup chooses one
# itself - confirm which parser is intended for this XML sitemap.
soup = BeautifulSoup(data.encode('utf-8'))
def sitemap_parser(soup):
    """Queue the text of every ``loc`` element found in *soup*.

    soup: a parsed document exposing ``findAll`` (the script passes the
        BeautifulSoup object built from the sitemap).

    Each element's text is put on the module-level queue ``q`` unchanged;
    surrounding whitespace is stripped later by the consumer.
    """
    for loc_tag in soup.findAll("loc"):
        # One queue entry per sitemap location.
        q.put(loc_tag.text)
# Fill the queue with the sitemap's URLs before any worker thread is started.
sitemap_parser(soup)
def grab_data_from_queue():
    """Worker loop: fetch URLs from the shared queue ``q`` until it is empty.

    For every URL taken from the queue the function requests it and prints
    the response status code with the final URL, followed by the length of
    the response body. A request that fails is reported on one ``ERROR``
    line instead of terminating the worker.

    Returns None once the queue has no more items. Every item that was
    taken is always marked done, so ``q.join()`` in the main thread cannot
    be left waiting on an item whose request failed.
    """
    while True:
        try:
            # Non-blocking get. Checking q.empty() and then calling a
            # blocking q.get() is a race: another worker can take the last
            # item in between, leaving this thread blocked forever.
            url = q.get_nowait()
        except Queue.Empty:
            return
        try:
            # Same 5 second timeout as the sitemap request, so one stalled
            # server cannot hang a worker indefinitely.
            r = requests.get(url.strip(), timeout=5)
        except requests.RequestException as exc:
            # One unreachable page must not kill the worker thread.
            print("ERROR %s %r" % (url.strip(), exc))
        else:
            # Response code and destination URL, then the body size.
            print("%s %s" % (r.status_code, r.url))
            print(len(r.content))
        finally:
            # Always balance the get(); otherwise q.join() never returns.
            q.task_done()
# Start 80 worker threads; each one runs grab_data_from_queue.
for _ in range(80):
    Thread(target=grab_data_from_queue).start()

# Wait until every queued URL has been marked as done.
q.join()

# Print the elapsed time: current time minus the start time.
print(datetime.now() - startTime)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment