Created
July 26, 2023 15:38
-
-
Save tomrockdsouza/407660987e135e6dee0426c13e48778d to your computer and use it in GitHub Desktop.
Write a Python script to download the historical climate data of all locations into a CSV file. http://www.weather.gov.sg/climate-historical-daily/ Output as a CSV file in the format shown below, with all the stations in one file per month. Please share with us how to run your script.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
1) Write a python script to download the historical climate data of all locations into a csv file. | |
http://www.weather.gov.sg/climate-historical-daily/ | |
Output as a csv file in below format with all the stations in a file per month for 2022 Sep, 2022 Oct and 2022 Nov. | |
CSV format: | |
Station,Year,Month,Day,Daily Rainfall Total (mm),Highest 30 min Rainfall (mm),Highest 60 min Rainfall (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h) | |
Changi,2023,1,1, 0.0, 0.0, 0.0, 0.0, 27.2, 30.6, 25.3, 9.8, 31.5 | |
... | |
Admiralty,2023,1,1, 0.0, 0.0, 0.0, 0.0, 26.8, 30.1, 24.8, 10.7, 27.8 | |
... | |
Please share with us how to run your script. | |
Python libraries Required to run this script: | |
duckdb | |
requests | |
bs4 | |
''' | |
from bs4 import BeautifulSoup | |
import requests | |
import duckdb | |
import concurrent.futures | |
from time import time | |
import os,shutil,json,sys,codecs | |
# Raise the recursion limit — presumably for BeautifulSoup parsing of
# deeply nested HTML; TODO confirm the default is actually exceeded.
sys.setrecursionlimit(10000)

timeout_parameter = 120   # per-request timeout in seconds
parallel_config_1 = 6     # thread pool size for per-year fan-out
parallel_config_2 = 5     # thread pool size for per-month fan-out
# found that these parameters work best
# speed of download depends on availability of server

# UTF-8 byte-order mark as a text character, stripped from downloaded CSVs.
hidden_gem = '\ufeff'

# Map full English month names to two-digit month numbers ('01'..'12').
month_mapping = {
    name: f'{number:02d}'
    for number, name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        start=1,
    )
}
def data_cleaning(save_path, response_content):
    """Normalise a downloaded CSV payload and write it to *save_path*.

    Fixes three quirks of the weather.gov.sg daily-data CSVs so that all
    monthly files share identical headers for the duckdb merge:
    - '(C)' is rewritten to '(°C)',
    - ' Min ' is lower-cased to ' min ',
    - any UTF-8 byte-order mark is stripped.

    Accepts either bytes or str content; the file is always written as
    UTF-8 bytes. Returns a 4-element status list:
    ['success', None, None, None] on success, or
    ['get_saving', save_path, None, None] if the file cannot be written.
    """
    try:
        if isinstance(response_content, bytes):
            cleaned = (response_content
                       .replace(b'(C)', b'(\xc2\xb0C)')
                       .replace(b' Min ', b' min ')
                       .replace(b'\xef\xbb\xbf', b''))
        else:
            cleaned = (response_content
                       .replace('(C)', '(\u00b0C)')
                       .replace(' Min ', ' min ')
                       .replace('\ufeff', '')).encode()
        with open(save_path, 'wb') as file:
            file.write(cleaned)
    except OSError as exc:
        # BUG FIX: the original handler referenced the undefined names
        # `response` and `url` here, which itself raised NameError.
        # Report the path we failed to save instead.
        print('Hell Broke Loose', save_path, exc)
        return ['get_saving', save_path, None, None]
    # Round-trip through UTF-8 with errors='ignore' to drop any stray
    # undecodable bytes so downstream READ_CSV sees valid UTF-8.
    with open(save_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()
    with open(save_path, 'w', encoding='utf-8') as file:
        file.write(content)
    return ['success', None, None, None]
def download_csv(url, save_path):
    """Fetch *url* with up to 3 attempts and save the cleaned CSV.

    Returns data_cleaning's status list on success, or
    ['get_downloading', url, None, None] when all attempts fail or the
    server replies with a non-200 status.
    """
    print(url)
    response = None
    for attempt in range(3):
        try:
            response = requests.get(url, timeout=timeout_parameter)
            break
        except requests.RequestException:
            # Narrowed from a bare except: only network/HTTP errors are
            # expected here; anything else should surface.
            if attempt == 2:
                return ['get_downloading', url, None, None]
    if response.status_code != 200:
        return ['get_downloading', url, None, None]
    return data_cleaning(save_path, response.content)
def get_csv_link(dictx):
    """Download the daily-data CSV for one station/year/month job dict."""
    station = dictx["station"]
    year = dictx["year"]
    month = dictx["month"]
    print('get_csv_link', station, year, month)
    return download_csv(
        f'http://www.weather.gov.sg/files/dailydata/DAILYDATA_{station}_{year}{month_mapping[month]}.csv',
        f'weather_files/DAILYDATA_{station}_{year}{month_mapping[month]}.csv'
    )
def get_months_then_csv_link(dictx):
    """For one station/year, discover the available months and download
    each month's CSV concurrently.

    dictx carries 'station' (code), 'cityname', and 'year'.
    Returns a list of per-month status lists from get_csv_link, or a
    single flat ['get_months', cityname, year, None] status when the
    month lookup itself fails.
    """
    station, cityname, year = dictx["station"], dictx["cityname"], dictx["year"]
    print('get_months_then_csv_link', station, cityname, year)
    months_response = None
    for attempt in range(3):
        try:
            months_response = requests.post(
                url='http://www.weather.gov.sg/wp-content/themes/wiptheme/page-functions/functions-climate-historical-daily-months.php',
                data={
                    'year': year,
                    'cityname': cityname
                },
                timeout=timeout_parameter
            )
            break
        except requests.RequestException:
            # Narrowed from a bare except: retry only on network errors.
            if attempt == 2:
                return ['get_months', cityname, year, None]
    if months_response.status_code != 200:
        return ['get_months', cityname, year, None]
    months_soup = BeautifulSoup(months_response.content, 'html.parser')
    # Each <li><a> href starts with a one-character prefix (presumably
    # '#') followed by the month name — strip it with [1:].
    all_months = [li.a['href'][1:] for li in months_soup.find('ul').find_all('li')]
    jobs = [{'station': station, 'year': year, 'month': month} for month in all_months]
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_config_2) as executor:
        futures = [executor.submit(get_csv_link, job) for job in jobs]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    return results
def get_years_then_months_then_csv_link(dictx):
    """For one (station_code, city_name) pair, discover the available
    years and fan out month/CSV downloads per year with a thread pool.

    dictx is a (station_code, city_name) 2-tuple (an item from the
    data_dict built in __main__). Returns a nested list of status lists,
    or a single flat ['get_years', cityname, None, None] status when the
    year lookup itself fails.
    """
    station, cityname = dictx[0], dictx[1]
    print('get_years_then_months_then_csv_link', cityname, station)
    years_response = None
    for attempt in range(3):
        try:
            years_response = requests.post(
                url='http://www.weather.gov.sg/wp-content/themes/wiptheme/page-functions/functions-climate-historical-daily-year.php',
                data={
                    'stationCode': station
                },
                timeout=timeout_parameter
            )
            break
        except requests.RequestException:
            # Narrowed from a bare except: retry only on network errors.
            if attempt == 2:
                return ['get_years', cityname, None, None]
    if years_response.status_code != 200:
        return ['get_years', cityname, None, None]
    years_soup = BeautifulSoup(years_response.content, 'html.parser')
    # Each <li><a> href starts with a one-character prefix (presumably
    # '#') followed by the year — strip it with [1:].
    all_years = [li.a['href'][1:] for li in years_soup.find('ul').find_all('li')]
    jobs = [{'station': station, 'cityname': cityname, 'year': year} for year in all_years]
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_config_1) as executor:
        futures = [executor.submit(get_months_then_csv_link, job) for job in jobs]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    return results
if __name__ == '__main__':
    start = time()
    # Start from an empty download directory on every run.
    if os.path.isdir('weather_files'):
        shutil.rmtree('weather_files')
    os.mkdir('weather_files')
    # Scrape the station picker from the landing page. The element with
    # id "cityname" is followed by a <ul> whose <li> entries carry an
    # onclick handler; slicing [9:-2] extracts the handler's argument —
    # presumably the station code, e.g. "getData('S104')" -> "S104"
    # (TODO confirm against the live page markup).
    url = "http://www.weather.gov.sg/climate-historical-daily/"
    response = requests.get(url, timeout=timeout_parameter)
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')
    element = soup.find(id="cityname")
    sibling = element.find_next_sibling('ul')
    data_dict = {}
    for x in sibling.find_all('li'):
        y = x.find(onclick=True)
        # Map station code -> display city name (the anchor text).
        data_dict[y['onclick'][9:-2]] = x.a.string
    results = []
    # One worker process per CPU; each process fans out further with its
    # own thread pools (see parallel_config_1 / parallel_config_2).
    with concurrent.futures.ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = [executor.submit(get_years_then_months_then_csv_link, d) for d in data_dict.items()]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    # Best-effort status log of every download attempt; a failure to
    # write the log is deliberately ignored.
    try:
        with open('scraper_log.txt', 'w') as w:
            w.write(json.dumps(results, indent=1))
    except:
        pass
    print(f'Downloaded Data in {time() - start} s')
    # gives mileage of 32.9MB in 180 seconds
    start2= time()
    conn = duckdb.connect(':memory:')
    # Merge and sort the CSV files using a single SQL statement.
    # Numeric columns are kept as VARCHAR because the source files use
    # non-numeric markers (the replace() calls only strip spaces).
    conn.execute("""
    create table weather_sg as
    SELECT Station,
    CAST(Year as INTEGER) as Year,
    CAST(Month as INTEGER) as Month,
    CAST(Day as INTEGER) as Day,
    replace("Daily Rainfall Total (mm)",' ','') as "Daily Rainfall Total (mm)",
    replace("Highest 30 min Rainfall (mm)",' ','') as "Highest 30 min Rainfall (mm)",
    replace("Highest 60 min Rainfall (mm)",' ','') as "Highest 60 min Rainfall (mm)",
    replace("Highest 120 min Rainfall (mm)",' ','') as "Highest 120 min Rainfall (mm)",
    replace("Mean Temperature (°C)",' ','') as "Mean Temperature (°C)",
    replace("Maximum Temperature (°C)",' ','') as "Maximum Temperature (°C)",
    replace("Minimum Temperature (°C)",' ','') as "Minimum Temperature (°C)",
    replace("Mean Wind Speed (km/h)",' ','') as "Mean Wind Speed (km/h)",
    replace("Max Wind Speed (km/h)",' ','') as "Max Wind Speed (km/h)"
    FROM
    READ_CSV(
    'weather_files/*.csv',
    skip=1,
    header=False,
    sep=',',
    QUOTE='"',
    escape='\\',
    columns={
    'Station':'VARCHAR',
    'Year':'INTEGER',
    'Month':'INTEGER',
    'Day':'INTEGER',
    'Daily Rainfall Total (mm)':'VARCHAR',
    'Highest 30 min Rainfall (mm)':'VARCHAR',
    'Highest 60 min Rainfall (mm)':'VARCHAR',
    'Highest 120 min Rainfall (mm)':'VARCHAR',
    'Mean Temperature (°C)':'VARCHAR',
    'Maximum Temperature (°C)':'VARCHAR',
    'Minimum Temperature (°C)':'VARCHAR',
    'Mean Wind Speed (km/h)':'VARCHAR',
    'Max Wind Speed (km/h)':'VARCHAR'
    }
    )
    order by Station asc,Year desc, Month desc, Day desc
    ;
    """)
    # Export the merged, sorted table as a single CSV with a header row.
    conn.execute("""COPY weather_sg TO 'output.csv' WITH (HEADER 1);""")
    conn.close()
    print(f'Created CSV in Data in {time() - start2} s')
    print('end')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment