A weather data scraper I made for Wunderground.com, using XPaths and dealing with asynchronously loading page elements.
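The asynchronous loading is handled with Selenium's explicit waits: rather than sleeping for a fixed interval, the script blocks until the element it needs is actually attached to the DOM. A minimal sketch of that pattern, assuming chromedriver is on your PATH (the date in the URL is only an example):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get("https://www.wunderground.com/history/daily/ZSSS/date/2018-01-01")
# Block (up to 60 s) until at least one <table> exists in the DOM,
# instead of guessing how long the page's JavaScript will take.
table = WebDriverWait(driver, 60).until(
    EC.presence_of_element_located((By.TAG_NAME, "table"))
)
driver.quit()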
import csv
import datetime
import os

import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
BASE_URL = "https://www.wunderground.com/history/daily/ZSSS/date/"
CITY = "Shanghai"
options = Options()
options.add_argument("--window-size=3200,1800")
options.add_argument("--disable-gpu")
options.add_argument("--start-maximized")
options.add_argument("--headless")
# path_to_chromedriver is a placeholder; point this at your local chromedriver binary
chrome_path = os.path.join(os.getcwd(), "path_to_chromedriver\\chromedriver.exe")
header = [
    'Date',
    'Time',
    'Temperature',
    'Dew Point',
    'Humidity',
    'Wind',
    'Wind Speed',
    'Wind Gust',
    'Pressure',
    'Precip.',
    'Precip Accum',
    'Condition',
]
def daterange(start_date, end_date):
    '''Yield each date from start_date up to, but not including, end_date.'''
    for n in range(int((end_date - start_date).days)):
        yield start_date + datetime.timedelta(n)
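# Example: daterange(datetime.date(2019, 1, 1), datetime.date(2019, 1, 4))
# yields Jan 1, Jan 2, and Jan 3 (the end date is exclusive).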
def scrape_weather_data(csv_writer):
    '''Walks the global START..END date range, fetches the daily observation
    table for each date not already on disk, and writes one CSV row per
    observation via csv_writer.'''
    table_xpath = r'//*[@id="inner-content"]/div[2]/div[3]/div/div[1]/div/div/city-history-observation'
    timeout = 60
    # Skip dates already scraped; assumes Shanghai_weather.csv exists with a header row.
    df = pd.read_csv('Shanghai_weather.csv', usecols=['Date'])
    dates_acquired = pd.to_datetime(df.Date).dt.date.unique()
    del df  # not needed after a quick data check
    for d in daterange(START, END):
        if d in dates_acquired:
            continue
        d = d.strftime('%Y-%m-%d')
        city_url = BASE_URL + f"{d}/req_city={CITY}&req_statename=China"
        try:
            driver.get(city_url)
            # The observation table loads asynchronously, so wait for a <table>
            # element to be attached to the DOM before touching it.
            element_present = EC.presence_of_element_located((By.TAG_NAME, 'table'))
            WebDriverWait(driver, timeout).until(element_present)
            table = driver.find_element_by_xpath(table_xpath)
            # next_col_link = driver.find_element_by_xpath(next_column_xpath)
            for i, row in enumerate(table.find_elements_by_tag_name("tr")):
                if i == 0:  # skip the header row
                    continue
                observations = {'Date': d}
                for j, datum in enumerate(row.find_elements_by_tag_name("td")):
                    observations[header[j + 1]] = datum.text
                csv_writer.writerow(observations)
        except TimeoutException:
            print(f"Timed out waiting for page to load on day {d}")
            missed_dates.append(d)
with open('Shanghai_weather.csv', 'a', newline='') as f:
    driver = webdriver.Chrome(executable_path=chrome_path, options=options)
    driver.maximize_window()
    START = datetime.date(2017, 1, 1)
    END = datetime.date(2018, 12, 31)
    # missed_dates is appended to by scrape_weather_data; it is declared here,
    # just in time for the function call
    missed_dates = []
    writer = csv.DictWriter(f, fieldnames=header)
    scrape_weather_data(writer)
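    # --- Hypothetical retry pass; not part of the original gist ---
    # Days that timed out were never written to the CSV, so flushing the file
    # and calling scrape_weather_data again picks them up. This assumes the
    # timeouts were transient; a day that always fails lands back in missed_dates.
    if missed_dates:
        print(f"Retrying {len(missed_dates)} missed day(s)")
        missed_dates = []
        f.flush()  # make appended rows visible to the pd.read_csv check
        scrape_weather_data(writer)
    driver.quit()  # shut down the headless browser when done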