Created
August 25, 2014 17:30
-
-
Save EricIO/225bf93e0ee216578e4b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
def get_url(tag_object):
    """Return the ``href`` of the first ``<a>`` element inside *tag_object*.

    Args:
        tag_object: An element exposing ``find_elements_by_tag_name``
            (presumably a Selenium WebElement -- confirm against callers).

    Returns:
        Whatever ``get_attribute('href')`` yields for the first anchor found.

    Raises:
        IndexError: If *tag_object* contains no ``<a>`` element.
    """
    # NOTE(review): the find_elements_by_* helpers were removed in
    # Selenium 4.3; pin an older Selenium or migrate to
    # find_elements(By.TAG_NAME, 'a') when upgrading.
    anchors = tag_object.find_elements_by_tag_name('a')
    first_anchor = anchors[0]
    return first_anchor.get_attribute('href')
def write_results(f, url_list):
    """Write the link of every result element to *f*, one URL per line.

    Args:
        f: A writable text file object.
        url_list: Iterable of result elements; each one is passed to
            ``get_url`` to extract its link. An empty list writes nothing.
    """
    # No explicit emptiness check is needed: iterating an empty list is
    # already a no-op.
    for page in url_list:
        f.write(get_url(page) + '\n')
if __name__ == '__main__':
    # Entry point: for every search URL listed in the '01urls' file, load the
    # page in Firefox, keep following the element with id 'pnnext' until it
    # is absent, and save the links of all elements with class 'r' to a file
    # under 'urls/'.

    # Read the whole query list up front so the file is closed immediately.
    with open('01urls', 'r') as query_file:
        queries = query_file.readlines()

    # Start the browser
    browser = webdriver.Firefox()
    try:
        for query in queries:
            url = query.strip()
            if not url:
                # Blank lines carry no URL to visit.
                continue

            # The output name is cut from fixed character positions of the
            # URL. NOTE(review): presumably these offsets match the layout
            # of the URLs in '01urls' -- confirm against that file.
            file_name = url[36:51] + '-' + url[53:56]
            print('Scraping results for: ' + file_name)

            with open('urls/' + file_name, 'w') as f:
                browser.get(url)
                results = browser.find_elements_by_class_name('r')
                write_results(f, results)

                while True:
                    try:
                        # Random 25-59 second pause before each follow-up
                        # page request.
                        time.sleep(random.randrange(25, 60))
                        next_link = browser.find_element_by_id('pnnext')
                        browser.get(next_link.get_attribute('href'))
                        results = browser.find_elements_by_class_name('r')
                        write_results(f, results)
                        print('\t Grabbing next result page')
                    except NoSuchElementException:
                        # No 'pnnext' element: this was the last page.
                        print('No more results')
                        break
                    except Exception as exc:
                        # Stay best-effort: report the failure and move on
                        # to the next query. KeyboardInterrupt/SystemExit
                        # are not caught here, so Ctrl-C still stops the run.
                        print('Stopping this query after error: ' + repr(exc))
                        break

            print('Done')
            # Random pause between queries.
            time.sleep(random.randrange(25, 60))
    finally:
        # Always shut the browser down, even on Ctrl-C or an unexpected error.
        browser.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment