@tomplex
Created April 12, 2017 00:15
How to use Python multiprocessing to get and process web pages asynchronously
__author__ = 'tom caruso'

from multiprocessing import Pool
import requests
import time
import sys

base_url = 'http://investorshub.advfn.com/boards/read_msg.aspx?message_id={id}'
start_message = 130084355


def get_page(address):
    """
    Get the page at the given address. If the request succeeds
    (r.ok is True for any status code below 400), do stuff with it.

    :param address: A valid URL.
    :return: None
    """
    r = requests.get(address)
    if r.ok:
        # do stuff with the downloaded page here
        pass  # and remove this
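

# A sketch (not part of the original gist) of what "do stuff" might look
# like: pull the page title out of the downloaded HTML. The helper and its
# name, extract_title, are illustrative assumptions.
def extract_title(html_text):
    """Illustrative only: return the contents of the page's <title> tag."""
    import re
    match = re.search(r'<title>(.*?)</title>', html_text,
                      re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else None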


def link_generator(start, stop):
    """
    A Python generator that yields one URL per message ID in [start, stop).

    :param start: message ID to start at
    :param stop: message ID to stop at (exclusive)
    :return: yields a URL for each ID in the range
    """
    for n in range(start, stop):
        yield base_url.format(id=n)


def main():
    try:
        num_procs, num_pages = int(sys.argv[1]), int(sys.argv[2])
    except (IndexError, ValueError):
        num_procs, num_pages = 10, 1000

    start_time = time.time()

    # create a pool of workers
    print('creating pool with {} workers'.format(num_procs))
    pool = Pool(processes=num_procs)

    # create our generator
    linkgen = link_generator(start_message, start_message + num_pages)

    # map the pool's processes to get_page over our link generator.
    # note that pool.map consumes the whole iterable up front, so the
    # generator saves us from building the URL list by hand, but it does
    # not stream links lazily; see the imap_unordered sketch below for that.
    pool.map(get_page, linkgen)
    pool.close()
    pool.join()

    print('got {} pages in {} seconds'.format(num_pages, time.time() - start_time))


if __name__ == '__main__':
    main()
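
pool.map pulls every link out of the generator before it dispatches any work,
so the generator mainly spares you from building the URL list by hand. If you
want links consumed lazily and results handled as workers finish, here is a
minimal sketch using Pool.imap_unordered (not part of the original gist; the
chunksize of 50 is an arbitrary choice):

def main_lazy(num_procs=10, num_pages=1000):
    linkgen = link_generator(start_message, start_message + num_pages)
    with Pool(processes=num_procs) as pool:
        # imap_unordered pulls links from the generator on demand and yields
        # results as workers finish, in whatever order they complete
        for _ in pool.imap_unordered(get_page, linkgen, chunksize=50):
            pass

To run the script with, for example, 10 worker processes over 1000 pages
(the filename get_pages.py is an assumption; the gist does not name the file):

    python get_pages.py 10 1000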