@tomplex
Created April 12, 2017 00:15
How to use Python multiprocessing to get and process web pages asynchronously
__author__ = 'tom caruso'

from multiprocessing import Pool
import requests
import time
import sys

base_url = 'http://investorshub.advfn.com/boards/read_msg.aspx?message_id={id}'
start_message = 130084355


def get_page(address):
    """
    Get the page at the given address. If the request succeeds
    (r.ok is True for any status code below 400), do stuff with it.

    :param address: A valid URL.
    :return: None
    """
    r = requests.get(address)
    if r.ok:
        # do stuff with the downloaded page here
        pass  # and remove this
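

# A sketch (not part of the original gist) of what "do stuff" might look
# like: pull the page title out of the downloaded HTML. The helper and its
# name, extract_title, are illustrative assumptions.
def extract_title(html_text):
    """Illustrative only: return the contents of the page's <title> tag."""
    import re
    match = re.search(r'<title>(.*?)</title>', html_text,
                      re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else None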


def link_generator(start, stop):
    """
    A Python generator that yields one URL per message ID in [start, stop).

    :param start: message ID to start at
    :param stop: message ID to stop at (exclusive)
    :return: yields a URL for each ID in the range
    """
    for n in range(start, stop):
        yield base_url.format(id=n)


def main():
    try:
        num_procs, num_pages = int(sys.argv[1]), int(sys.argv[2])
    except (IndexError, ValueError):
        num_procs, num_pages = 10, 1000

    start_time = time.time()

    # create a pool of workers
    print('creating pool with {} workers'.format(num_procs))
    pool = Pool(processes=num_procs)

    # create our generator
    linkgen = link_generator(start_message, start_message + num_pages)

    # map the pool's processes to get_page over our link generator.
    # note that pool.map consumes the whole iterable up front, so the
    # generator saves us from building the URL list by hand, but it does
    # not stream links lazily; see the imap_unordered sketch below for that.
    pool.map(get_page, linkgen)
    pool.close()
    pool.join()

    print('got {} pages in {} seconds'.format(num_pages, time.time() - start_time))


if __name__ == '__main__':
    main()
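
pool.map pulls every link out of the generator before it dispatches any work,
so the generator mainly spares you from building the URL list by hand. If you
want links consumed lazily and results handled as workers finish, here is a
minimal sketch using Pool.imap_unordered (not part of the original gist; the
chunksize of 50 is an arbitrary choice):

def main_lazy(num_procs=10, num_pages=1000):
    linkgen = link_generator(start_message, start_message + num_pages)
    with Pool(processes=num_procs) as pool:
        # imap_unordered pulls links from the generator on demand and yields
        # results as workers finish, in whatever order they complete
        for _ in pool.imap_unordered(get_page, linkgen, chunksize=50):
            pass

To run the script with, for example, 10 worker processes over 1000 pages
(the filename get_pages.py is an assumption; the gist does not name the file):

    python get_pages.py 10 1000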