Skip to content

Instantly share code, notes, and snippets.

@EricIO
Created August 25, 2014 17:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EricIO/225bf93e0ee216578e4b to your computer and use it in GitHub Desktop.
from selenium import webdriver
import random
import time
def get_url(tag_object):
    """Return the href of the first <a> element nested inside *tag_object*.

    tag_object -- a selenium WebElement (a Google result container).
    """
    anchors = tag_object.find_elements_by_tag_name('a')
    first_anchor = anchors[0]
    return first_anchor.get_attribute('href')
def write_results(f, url_list):
    """Write the href of each result element in *url_list* to *f*, one per line.

    f        -- a writable text-mode file object.
    url_list -- iterable of selenium WebElements, each containing an <a>
                tag (extracted via get_url).
    """
    # No explicit empty-list guard needed: iterating an empty iterable
    # is a no-op, so the original `if len(url_list) == 0: return` was
    # redundant (and non-idiomatic — truthiness is preferred anyway).
    for page in url_list:
        f.write(get_url(page) + '\n')
if __name__ == '__main__':
    # One Google-search URL per line; each line drives one scrape session.
    # (Original leaked this file handle by never closing it.)
    with open('01urls', 'r') as query_file:
        queries = query_file.readlines()

    # Start the browser once and reuse it for every query.
    browser = webdriver.Firefox()
    try:
        for query in queries:
            # Output name is sliced from fixed positions in the query URL
            # — assumes every line in '01urls' shares the same prefix
            # layout; TODO confirm against the actual file contents.
            file_name = query[36:51] + '-' + query[53:56]
            print('Scraping results for: ' + file_name)
            with open('urls/' + file_name, 'w') as f:
                browser.get(query.strip('\n'))
                write_results(f, browser.find_elements_by_class_name('r'))
                # Follow the "next page" link until it disappears.
                while True:
                    try:
                        # Random delay so requests look less bot-like.
                        time.sleep(random.randrange(25, 60))
                        browser.get(browser.find_element_by_id('pnnext').
                                    get_attribute('href'))
                        write_results(f, browser.find_elements_by_class_name('r'))
                        print('\t Grabbing next result page')
                    except Exception:
                        # Narrowed from a bare `except:` (which also ate
                        # KeyboardInterrupt). find_element_by_id raises
                        # when no 'pnnext' link exists — last page reached.
                        print('No more results')
                        break
            print('Done')
            time.sleep(random.randrange(25, 60))
    finally:
        # Always shut the browser down, even if a query blows up
        # (original left the Firefox process running on any error).
        browser.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment