Created
April 12, 2017 00:15
-
-
Save tomplex/1b7d4d6b20ff2cbe5200c08566c49bf3 to your computer and use it in GitHub Desktop.
How to use Python multiprocessing to get and process web pages asynchronously
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'tom caruso'

import sys
import time
from multiprocessing import Pool

import requests
# URL template for a single message page; ``{id}`` is replaced with the
# numeric message id.
base_url = 'http://investorshub.advfn.com/boards/read_msg.aspx?message_id={id}'
# Id of the first message to download; main() fetches ids counting up from here.
start_message = 130084355
def get_page(address, timeout=30):
    """
    Get the page at the given address. If the link is good (status code
    below 400) then do stuff.

    A request that fails outright (connection error, timeout, malformed URL)
    is reported on stderr and skipped rather than raised, so that a single
    bad link does not abort a whole batch of downloads.

    :param address: A valid URL.
    :param timeout: Seconds to wait for the server before giving up. Without
        a timeout, requests waits indefinitely and a stalled server would
        block the calling worker forever.
    :return: None
    """
    try:
        r = requests.get(address, timeout=timeout)
    except requests.RequestException as err:
        sys.stderr.write('failed to get {}: {}\n'.format(address, err))
        return
    if r.ok:
        # do stuff with the downloaded page here
        pass  # and remove this
def link_generator(start, stop, url_template=None):
    """
    Generate one message-page URL for every id in ``range(start, stop)``.

    :param start: first message id (inclusive)
    :param stop: message id to stop at (exclusive)
    :param url_template: format string containing an ``{id}`` placeholder;
        defaults to the module-level ``base_url``
    :return: a generator yielding one URL string per id, in increasing order
    """
    if url_template is None:
        url_template = base_url
    for n in range(start, stop):
        yield url_template.format(id=n)
def main():
    """
    Download a range of message pages in parallel and report the elapsed time.

    Command line: ``<script> [num_procs num_pages]``. If the two arguments
    are missing or are not integers, 10 worker processes and 1000 pages are
    used instead.

    :return: None
    """
    try:
        num_procs, num_pages = int(sys.argv[1]), int(sys.argv[2])
    except (IndexError, ValueError):
        # arguments missing or not integers: fall back to the defaults
        num_procs, num_pages = 10, 1000

    start_time = time.time()

    # create a pool of workers
    print('creating pool with {} workers'.format(num_procs))
    pool = Pool(processes=num_procs)
    try:
        # create our generator
        linkgen = link_generator(start_message, start_message + num_pages)

        # Hand the links to the workers with imap_unordered rather than map:
        # map() first copies the whole iterable into a list, whereas
        # imap_unordered() draws links from the generator as it feeds the
        # workers. The results (always None) are discarded, so their order
        # does not matter; the loop only waits for every page to finish.
        for _ in pool.imap_unordered(get_page, linkgen):
            pass
    finally:
        # On the success path every task has completed by now, so terminating
        # only stops idle workers; on an error it stops outstanding work.
        pool.terminate()
        pool.join()

    print('got {} pages in {} seconds'.format(num_pages, time.time() - start_time))
# Run only when executed as a script. The guard also lets worker processes
# started by multiprocessing import this module without re-running main().
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment