Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Created March 30, 2022 18:31
Show Gist options
  • Save thisismattmiller/9849181e08e068a59013d25e5bba9403 to your computer and use it in GitHub Desktop.
Save thisismattmiller/9849181e08e068a59013d25e5bba9403 to your computer and use it in GitHub Desktop.
Code for the video "Web scraping with Selenium" (https://youtu.be/pwnIcJ9p2C4)
import glob
from bs4 import BeautifulSoup
import json
# Prefix shared by every element id inside a search-result card.
RESULT_ID_PREFIX = 'MainContentPlaceHolder_SearchResultsList_SearchResultControl'
# Base URL prepended to the href found on each taxon link.
SITE_ROOT = 'https://www.missouribotanicalgarden.org'
# Optional detail rows; a card may lack any of them.
DETAIL_FIELDS = ['CommonNameRow', 'TypeRow', 'ZoneRow', 'HeightRow', 'SynonymRow']


def extract_plants(soup):
    """Return one dict per plant card found in a parsed results page.

    The page is probed at 12 row indices, each with a 'Left' and a 'Right'
    card (24 slots in total). A slot whose taxon link is absent is skipped,
    so a page holding fewer than 24 results yields a shorter list rather
    than raising.

    Args:
        soup: A parsed document exposing ``find(tag_name, attrs)`` that
            returns the matching element or ``None``.

    Returns:
        A list of dicts. Each dict carries ``plant_link`` and
        ``taxon_name`` plus whichever of ``DETAIL_FIELDS`` were present.
    """
    plants = []
    for row in range(0, 12):
        for left_or_right in ['Left', 'Right']:
            slot_id = f"{RESULT_ID_PREFIX}{left_or_right}_{row}"

            # The taxon link is the one mandatory piece of a card; without
            # it the slot is empty and there is nothing to record.
            anchor = soup.find('a', {'id': f"{slot_id}_TaxonHTMLName_{row}"})
            if anchor is None:
                continue

            data = {}
            for field in DETAIL_FIELDS:
                element = soup.find('div', {'id': f"{slot_id}_{field}_{row}"})
                if element is not None:
                    data[field] = element.get_text().strip()

            data['plant_link'] = SITE_ROOT + anchor['href']
            data['taxon_name'] = anchor.get_text().strip()
            plants.append(data)
    return plants


# Parse every saved results page and collect all plants into one list.
all_files = list(glob.glob('html/*.html'))
all_data = []
for file_name in all_files:
    with open(file_name) as infile:
        html = infile.read()
    print(file_name)
    soup = BeautifulSoup(html, 'html.parser')
    all_data.extend(extract_plants(soup))

# Use a context manager so the output is flushed and closed deterministically.
with open('all_data.json', 'w') as outfile:
    json.dump(all_data, outfile, indent=2)
from selenium import webdriver
import glob
import os

# Snapshot of the pages already saved by an earlier run. Paths are
# normalised because glob returns them with the OS separator, while the
# filenames built below always use '/'.
all_existing_html_files = {
    os.path.normpath(path) for path in glob.glob('html/*.html')
}
# Make sure the output directory exists before the first page is written.
os.makedirs('html', exist_ok=True)

# NOTE(review): the `executable_path` keyword is deprecated in Selenium 4 and
# reportedly removed in later releases; if the installed version rejects it,
# pass a selenium.webdriver.chrome.service.Service instead - confirm.
driver = webdriver.Chrome(executable_path="./chromedriver")
try:
    driver.get('https://www.missouribotanicalgarden.org/PlantFinder/PlantFinderProfileResults.aspx')

    # Set the records-per-page control to 24 and fire its change event.
    js_command = """
    document.getElementById('MainContentPlaceHolder_pagingUserControlTop_RecordsPerPage').value = 24;
    document.getElementById('MainContentPlaceHolder_pagingUserControlTop_RecordsPerPage').dispatchEvent( new Event('change') );
    """
    driver.execute_script(js_command)

    for page in range(1, 357):
        filename = f'html/source_{page}.html'
        if os.path.normpath(filename) in all_existing_html_files:
            print('Skipping page:', page)
            continue
        print('Doing page', page)

        # Type the page number into the pager text box and simulate Enter.
        js_command = f"""
        document.getElementById('MainContentPlaceHolder_pagingUserControlTop_pageTextBox').value = {page};
        document.getElementById('MainContentPlaceHolder_pagingUserControlTop_pageTextBox').dispatchEvent( new KeyboardEvent('keypress', {{ key: 'Enter', code: 'Enter', keyCode: 13 }} ) )
        """
        driver.execute_script(js_command)

        # NOTE(review): the source is read immediately after the keypress is
        # dispatched, with no explicit wait; this presumably relies on the
        # driver blocking until the resulting navigation finishes - confirm.
        html_source_code = driver.page_source
        with open(filename, 'w') as outfile:
            outfile.write(html_source_code)
finally:
    # Always shut the browser down, even if a page fails part-way through.
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment