A weather data scraper I made for Wunderground.com, using XPaths and dealing with asynchronously loading page elements.
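The asynchronous loading is handled with Selenium's explicit waits: rather than sleeping for a fixed interval, the script blocks until the element it needs is actually attached to the DOM. A minimal sketch of that pattern, assuming chromedriver is on your PATH (the date in the URL is only an example):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get("https://www.wunderground.com/history/daily/ZSSS/date/2018-01-01")
# Block (up to 60 s) until at least one <table> exists in the DOM,
# instead of guessing how long the page's JavaScript will take.
table = WebDriverWait(driver, 60).until(
    EC.presence_of_element_located((By.TAG_NAME, "table"))
)
driver.quit()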
import csv
import datetime
import os

import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
BASE_URL = "https://www.wunderground.com/history/daily/ZSSS/date/"
CITY = "Shanghai"
options = Options()
options.add_argument("--window-size=3200,1800")
options.add_argument("--disable-gpu")
options.add_argument("--start-maximized")
options.add_argument("--headless")
# path_to_chromedriver is a placeholder; point this at your local chromedriver binary
chrome_path = os.path.join(os.getcwd(), "path_to_chromedriver\\chromedriver.exe")
header = [
    'Date',
    'Time',
    'Temperature',
    'Dew Point',
    'Humidity',
    'Wind',
    'Wind Speed',
    'Wind Gust',
    'Pressure',
    'Precip.',
    'Precip Accum',
    'Condition',
]
def daterange(start_date, end_date):
    '''Yield each date from start_date up to, but not including, end_date.'''
    for n in range(int((end_date - start_date).days)):
        yield start_date + datetime.timedelta(n)
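# Example: daterange(datetime.date(2019, 1, 1), datetime.date(2019, 1, 4))
# yields Jan 1, Jan 2, and Jan 3 (the end date is exclusive).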
def scrape_weather_data(csv_writer):
    '''Walks the global START..END date range, fetches the daily observation
    table for each date not already on disk, and writes one CSV row per
    observation via csv_writer.'''
    table_xpath = r'//*[@id="inner-content"]/div[2]/div[3]/div/div[1]/div/div/city-history-observation'
    timeout = 60
    # Skip dates already scraped; assumes Shanghai_weather.csv exists with a header row.
    df = pd.read_csv('Shanghai_weather.csv', usecols=['Date'])
    dates_acquired = pd.to_datetime(df.Date).dt.date.unique()
    del df  # not needed after a quick data check
    for d in daterange(START, END):
        if d in dates_acquired:
            continue
        d = d.strftime('%Y-%m-%d')
        city_url = BASE_URL + f"{d}/req_city={CITY}&req_statename=China"
        try:
            driver.get(city_url)
            # The observation table loads asynchronously, so wait for a <table>
            # element to be attached to the DOM before touching it.
            element_present = EC.presence_of_element_located((By.TAG_NAME, 'table'))
            WebDriverWait(driver, timeout).until(element_present)
            table = driver.find_element_by_xpath(table_xpath)
            # next_col_link = driver.find_element_by_xpath(next_column_xpath)
            for i, row in enumerate(table.find_elements_by_tag_name("tr")):
                if i == 0:  # skip the header row
                    continue
                observations = {'Date': d}
                for j, datum in enumerate(row.find_elements_by_tag_name("td")):
                    observations[header[j + 1]] = datum.text
                csv_writer.writerow(observations)
        except TimeoutException:
            print(f"Timed out waiting for page to load on day {d}")
            missed_dates.append(d)
with open('Shanghai_weather.csv', 'a', newline='') as f:
    driver = webdriver.Chrome(executable_path=chrome_path, options=options)
    driver.maximize_window()
    START = datetime.date(2017, 1, 1)
    END = datetime.date(2018, 12, 31)
    # missed_dates is appended to by scrape_weather_data; it is declared here,
    # just in time for the function call
    missed_dates = []
    writer = csv.DictWriter(f, fieldnames=header)
    scrape_weather_data(writer)
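    # --- Hypothetical retry pass; not part of the original gist ---
    # Days that timed out were never written to the CSV, so flushing the file
    # and calling scrape_weather_data again picks them up. This assumes the
    # timeouts were transient; a day that always fails lands back in missed_dates.
    if missed_dates:
        print(f"Retrying {len(missed_dates)} missed day(s)")
        missed_dates = []
        f.flush()  # make appended rows visible to the pd.read_csv check
        scrape_weather_data(writer)
    driver.quit()  # shut down the headless browser when done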