# baditaflorin/awesome_web_page_scraper.py

## awesome_web_page_scraper.py
#Source http://www.craigaddyman.com/python-queues-and-multi-threading/
# standard libraries
from datetime import datetime
import Queue
from threading import Thread

# third party libraries
from bs4 import BeautifulSoup
import requests

# capture current time so the total run time can be reported at the end
startTime = datetime.now()

# LIFO queue that holds the urls waiting to be fetched
q = Queue.LifoQueue()

# specify sitemap to get all site links
url = "http://www.telegraph.co.uk/wrestling/sitemap.xml"

# request the sitemap; fail loudly on an HTTP error instead of parsing an
# error page and silently ending up with an empty queue
r = requests.get(url, timeout=5)
r.raise_for_status()
data = r.text

# make the 'soup': hand over the already-decoded text (re-encoding it to
# UTF-8 bytes makes BeautifulSoup guess the encoding again, which can clash
# with the encoding declared in the XML prolog) and name the parser
# explicitly so the result does not depend on which optional parsers happen
# to be installed
soup = BeautifulSoup(data, "html.parser")


def sitemap_parser(soup):
    """Queue the text of every <loc> element found in the parsed sitemap.

    soup -- parsed sitemap document whose <loc> elements are read.

    Each element's text is put on the module-level queue ``q``; nothing is
    returned.
    """
    loc_tags = soup.findAll("loc")
    for loc_tag in loc_tags:
        location = loc_tag.text
        q.put(location)  # one queue entry per sitemap link

# fill the queue with every sitemap link (runs before the worker threads
# further down are created)
sitemap_parser(soup)


def grab_data_from_queue():
    """Worker loop: fetch every url waiting in the module-level queue ``q``.

    Takes urls off the queue until none are left, requests each one and
    prints the response status code, the final url and the size of the
    body.  A url that cannot be fetched is reported and skipped, so one bad
    link does not kill the worker.  Returns nothing.
    """
    while True:
        # A separate ``q.empty()`` check followed by a blocking ``q.get()``
        # is racy: another thread can take the last item in between, and
        # this thread would then block forever.  Ask without blocking.
        try:
            url = q.get(block=False)
        except Queue.Empty:
            return  # nothing left to do

        try:
            # same 5 second timeout as the sitemap request, so a stalled
            # server cannot hang the worker
            r = requests.get(url.strip(), timeout=5)

            # print the response code and destination url, then body size
            print("%s %s" % (r.status_code, r.url))
            print(len(r.content))
        except requests.RequestException as exc:
            print("failed %s: %s" % (url.strip(), exc))
        finally:
            # always mark the item as done, otherwise q.join() never returns
            q.task_done()


for i in range(80):  # aka number of threads
    t1 = Thread(target=grab_data_from_queue)  # target is the above function
    # daemon threads cannot keep the interpreter alive: a worker that is
    # still blocked waiting on the queue must not stop the script from
    # exiting once q.join() below has returned
    t1.daemon = True
    t1.start()  # start the thread

# block until task_done() has been called for every queued url
q.join()

# print current time minus the start time
print(datetime.now() - startTime)
	#Source http://www.craigaddyman.com/python-queues-and-multi-threading/
	# standard libraries
	from datetime import datetime
	import Queue
	from threading import Thread

	# third party libraries
	from bs4 import BeautifulSoup
	import requests

	# capture current time so the total run time can be reported at the end
	startTime = datetime.now()

	# LIFO queue that holds the urls waiting to be fetched
	q = Queue.LifoQueue()

	# specify sitemap to get all site links
	url = "http://www.telegraph.co.uk/wrestling/sitemap.xml"

	# request the sitemap; fail loudly on an HTTP error instead of parsing an
	# error page and silently ending up with an empty queue
	r = requests.get(url, timeout=5)
	r.raise_for_status()
	data = r.text

	# make the 'soup': hand over the already-decoded text (re-encoding it to
	# UTF-8 bytes makes BeautifulSoup guess the encoding again, which can
	# clash with the encoding declared in the XML prolog) and name the parser
	# explicitly so the result does not depend on which optional parsers
	# happen to be installed
	soup = BeautifulSoup(data, "html.parser")


	def sitemap_parser(soup):
		"""Queue the text of every <loc> element found in the parsed sitemap.

		soup -- parsed sitemap document whose <loc> elements are read.

		Each element's text is put on the queue ``q``; nothing is returned.
		"""
		# parse sitemap for all links
		for url in soup.findAll("loc"):
			q.put(url.text)  # add each url to the queue for processing

	# fill the queue with every sitemap link (runs before the worker threads
	# further down are created)
	sitemap_parser(soup)


	def grab_data_from_queue():
		"""Worker loop: fetch every url waiting in the queue ``q``.

		Takes urls off the queue until none are left, requests each one and
		prints the response status code, the final url and the size of the
		body.  A url that cannot be fetched is reported and skipped, so one
		bad link does not kill the worker.  Returns nothing.
		"""
		while True:
			# A separate ``q.empty()`` check followed by a blocking
			# ``q.get()`` is racy: another thread can take the last item in
			# between, and this thread would then block forever.
			try:
				url = q.get(block=False)
			except Queue.Empty:
				return  # nothing left to do

			try:
				# same 5 second timeout as the sitemap request, so a stalled
				# server cannot hang the worker
				r = requests.get(url.strip(), timeout=5)

				# print the response code and destination url, then body size
				print("%s %s" % (r.status_code, r.url))
				print(len(r.content))
			except requests.RequestException as exc:
				print("failed %s: %s" % (url.strip(), exc))
			finally:
				# always mark the item as done, otherwise q.join() never
				# returns
				q.task_done()


	for i in range(80):  # aka number of threads
		t1 = Thread(target=grab_data_from_queue)  # target is the above function
		# daemon threads cannot keep the interpreter alive: a worker that is
		# still blocked waiting on the queue must not stop the script from
		# exiting once q.join() below has returned
		t1.daemon = True
		t1.start()  # start the thread

	# block until task_done() has been called for every queued url
	q.join()

	# print current time minus the start time
	print(datetime.now() - startTime)