Created
February 11, 2018 22:23
-
-
Save Rotzke/eff29c5a7d1838962512a09bba6c3561 to your computer and use it in GitHub Desktop.
Multiprocessing example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import csv
import urllib2
from multiprocessing import cpu_count
from multiprocessing.dummy import Pool  # thread-based Pool (I/O-bound work)

# Third-party
from bs4 import BeautifulSoup
def crawlToCSV(URLrecord):
    """Fetch one URL and return the text of every 'result-value' cell.

    URLrecord is one line read straight from the URL file, so it still
    carries its trailing newline -- strip it before opening, otherwise
    urllib2 receives a malformed URL.

    Returns a (possibly empty) list of cell strings; an entry may be
    None when a matching <td> has nested markup (bs4 .string semantics).
    """
    URLrecord = URLrecord.strip()  # BUG fix: lines from the file end in '\n'
    response = urllib2.urlopen(URLrecord)
    try:
        Soup_SomeSite = BeautifulSoup(response, "lxml")
    finally:
        # Close the connection even if parsing raises.
        response.close()

    placeHolder = []
    tbodyTags = Soup_SomeSite.find("tbody")
    if tbodyTags is None:
        # Page has no results table at all -- report no values
        # instead of raising AttributeError on .find_all.
        return placeHolder

    # NOTE: the trailing space in "result-item " matches the original
    # class filter exactly -- presumably the site emits it that way.
    for trTag in tbodyTags.find_all("tr", class_="result-item "):
        tdTags = trTag.find("td", class_="result-value")
        if tdTags is not None:  # skip rows missing the value cell
            placeHolder.append(tdTags.string)
    return placeHolder
if __name__ == "__main__":
    fileName = "SomeSiteValidURLs.csv"
    # Thread-based pool: the work is network-bound, so oversubscribing
    # relative to cpu_count is deliberate.
    pool = Pool(cpu_count() * 2)
    try:
        # BUG fix: the original opened `FileName` (capital F), which is
        # undefined -- NameError at runtime. The variable is `fileName`.
        with open(fileName, "rb") as f:
            # Each line of the file is handed to crawlToCSV; results is
            # a list of the per-URL value lists, in input order.
            results = pool.map(crawlToCSV, f)
    finally:
        # Shut the pool down cleanly even if crawling fails.
        pool.close()
        pool.join()

    # "ab": append in binary mode -- required by the Python 2 csv module.
    with open("Output.csv", "ab") as f:
        csv.writer(f).writerows(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment