Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tomrockdsouza/407660987e135e6dee0426c13e48778d to your computer and use it in GitHub Desktop.
Write a python script to download the historical climate data of all locations into a csv file. http://www.weather.gov.sg/climate-historical-daily/ Output as a csv file in below format with all the stations in a file per month. Please share with us how to run your script.
'''
1) Write a python script to download the historical climate data of all locations into a csv file.
http://www.weather.gov.sg/climate-historical-daily/
Output as a csv file in below format with all the stations in a file per month for 2022 Sep, 2022 Oct and 2022 Nov.
CSV format:
Station,Year,Month,Day,Daily Rainfall Total (mm),Highest 30 min Rainfall (mm),Highest 60 min Rainfall (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
Changi,2023,1,1, 0.0, 0.0, 0.0, 0.0, 27.2, 30.6, 25.3, 9.8, 31.5
...
Admiralty,2023,1,1, 0.0, 0.0, 0.0, 0.0, 26.8, 30.1, 24.8, 10.7, 27.8
...
Please share with us how to run your script.
Python libraries Required to run this script:
duckdb
requests
bs4
'''
from bs4 import BeautifulSoup
import requests
import duckdb
import concurrent.futures
from time import time
import os,shutil,json,sys,codecs
# BeautifulSoup's html.parser can recurse deeply on large documents;
# raise the limit so parsing does not hit RecursionError.
sys.setrecursionlimit(10000)
# Seconds before an HTTP request attempt is abandoned.
timeout_parameter = 120
# Thread-pool width for the per-year fan-out (get_years_then_months_then_csv_link).
parallel_config_1 = 6
# Thread-pool width for the per-month fan-out (get_months_then_csv_link).
parallel_config_2 = 5
# found that these parameters work best
# speed of download depends on availability of server
# UTF-8 byte-order mark decoded to str ('\ufeff'); stripped from downloaded CSVs.
hidden_gem=b'\xef\xbb\xbf'.decode()
# Maps the month names used in the site's <a href="#..."> anchors to the
# two-digit month numbers used in the DAILYDATA_* CSV file names.
month_mapping = {
    'January': '01',
    'February': '02',
    'March': '03',
    'April': '04',
    'May': '05',
    'June': '06',
    'July': '07',
    'August': '08',
    'September': '09',
    'October': '10',
    'November': '11',
    'December': '12',
}
def data_cleaning(save_path, response_content):
    """Normalise a downloaded CSV payload and write it to *save_path*.

    Fixes the dropped degree symbol ("(C)" -> "(°C)"), lower-cases " Min "
    to " min " so every monthly file shares one header spelling, and strips
    the UTF-8 byte-order mark.  The file is then round-tripped through
    UTF-8 with errors='ignore' to drop any remaining undecodable bytes.

    :param save_path: destination path for the cleaned CSV.
    :param response_content: raw payload, either bytes or str.
    :return: ['success', None, None, None] on success, or
             ['get_saving', save_path, None, None] if writing fails.
    """
    try:
        if isinstance(response_content, bytes):
            cleaned = (
                response_content
                .replace(b'(C)', b'(\xc2\xb0C)')
                .replace(b' Min ', b' min ')
                .replace(b'\xef\xbb\xbf', b'')
            )
        else:
            cleaned = (
                response_content
                .replace('(C)', '(°C)')
                .replace(' Min ', ' min ')
                .replace('\ufeff', '')  # BOM as a str
                .encode()
            )
        with open(save_path, 'wb') as file:
            file.write(cleaned)
    except Exception as exc:
        # The original handler printed `response.content` and returned `url`,
        # neither of which exists in this scope -- any failure here raised a
        # NameError that hid the real error.  Report the failing path instead.
        print('Hell Broke Loose', save_path, exc)
        return ['get_saving', save_path, None, None]
    # Re-read/re-write through UTF-8 with errors='ignore' so bytes that are
    # still not valid UTF-8 after the replacements above are dropped.
    with codecs.open(save_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()
    with codecs.open(save_path, 'w', encoding='utf-8') as file:
        file.write(content)
    return ['success', None, None, None]
def download_csv(url, save_path):
    """Fetch *url* (up to 3 attempts) and store the cleaned CSV at *save_path*.

    :param url: direct link to a DAILYDATA_*.csv file.
    :param save_path: local destination path.
    :return: data_cleaning()'s status list on success, or
             ['get_downloading', url, None, None] when every attempt fails
             or the server answers with a non-200 status.
    """
    print(url)
    response = None
    for attempt in range(3):
        try:
            response = requests.get(url, timeout=timeout_parameter)
            break
        except requests.RequestException:
            # Only network-level errors should trigger a retry; the original
            # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
            if attempt == 2:
                return ['get_downloading', url, None, None]
    if response.status_code != 200:
        return ['get_downloading', url, None, None]
    return data_cleaning(save_path, response.content)
def get_csv_link(dictx):
    """Build the daily-data CSV URL for one station/year/month and download it.

    *dictx* must carry 'station', 'year' and 'month' keys; the month is the
    English month name, translated via month_mapping.
    """
    station = dictx["station"]
    year = dictx["year"]
    month = dictx["month"]
    print('get_csv_link', station, year, month)
    filename = f'DAILYDATA_{station}_{year}{month_mapping[month]}.csv'
    return download_csv(
        f'http://www.weather.gov.sg/files/dailydata/{filename}',
        f'weather_files/{filename}'
    )
def get_months_then_csv_link(dictx):
    """List the months available for one station/year, then download each
    month's CSV in a thread pool.

    *dictx* must carry 'station', 'cityname' and 'year' keys.

    :return: a list of per-month status lists, or the single error marker
             ['get_months', cityname, year, None] when the month listing
             cannot be fetched.
    """
    station, cityname, year = dictx["station"], dictx["cityname"], dictx["year"]
    print('get_months_then_csv_link', station, cityname, year)
    months_response = None
    for attempt in range(3):
        try:
            months_response = requests.post(
                url='http://www.weather.gov.sg/wp-content/themes/wiptheme/page-functions/functions-climate-historical-daily-months.php',
                data={
                    'year': year,
                    'cityname': cityname
                },
                timeout=timeout_parameter
            )
            break
        except requests.RequestException:
            # Retry only on network errors; the original bare `except:` also
            # swallowed KeyboardInterrupt/SystemExit.
            if attempt == 2:
                return ['get_months', cityname, year, None]
    if months_response.status_code != 200:
        return ['get_months', cityname, year, None]
    # The endpoint answers with a <ul> whose <li><a href="#September"> anchors
    # name the available months; drop the leading '#'.
    months_soup = BeautifulSoup(months_response.content, 'html.parser')
    all_months = [li.a['href'][1:] for li in months_soup.find('ul').find_all('li')]
    jobs = [{'station': station, 'year': year, 'month': month} for month in all_months]
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_config_2) as executor:
        futures = [executor.submit(get_csv_link, job) for job in jobs]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    return results
def get_years_then_months_then_csv_link(dictx):
    """For one station, list the available years, then fan out per year into
    get_months_then_csv_link via a thread pool.

    *dictx* is a (station_code, cityname) pair as produced by
    data_dict.items() in the main block.

    :return: a nested list of status lists, or the single error marker
             ['get_years', cityname, None, None] when the year listing
             cannot be fetched.
    """
    station, cityname = dictx[0], dictx[1]
    print('get_years_then_months_then_csv_link', cityname, station)
    years_response = None
    for attempt in range(3):
        try:
            years_response = requests.post(
                url='http://www.weather.gov.sg/wp-content/themes/wiptheme/page-functions/functions-climate-historical-daily-year.php',
                data={
                    'stationCode': station
                },
                timeout=timeout_parameter
            )
            break
        except requests.RequestException:
            # Retry only on network errors; the original bare `except:` also
            # swallowed KeyboardInterrupt/SystemExit.
            if attempt == 2:
                return ['get_years', cityname, None, None]
    if years_response.status_code != 200:
        return ['get_years', cityname, None, None]
    # The endpoint answers with a <ul> whose <li><a href="#2022"> anchors
    # name the available years; drop the leading '#'.  (The original loop
    # variable was misleadingly named `month_html`.)
    years_soup = BeautifulSoup(years_response.content, 'html.parser')
    all_years = [li.a['href'][1:] for li in years_soup.find('ul').find_all('li')]
    jobs = [{'station': station, 'cityname': cityname, 'year': year} for year in all_years]
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_config_1) as executor:
        futures = [executor.submit(get_months_then_csv_link, job) for job in jobs]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    return results
if __name__ == '__main__':
    start = time()
    # Start from a clean download directory on every run.
    if os.path.isdir('weather_files'):
        shutil.rmtree('weather_files')
    os.mkdir('weather_files')
    # The landing page embeds the station list next to the element with
    # id="cityname"; each <li> carries the code in its onclick attribute.
    url = "http://www.weather.gov.sg/climate-historical-daily/"
    response = requests.get(url, timeout=timeout_parameter)
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')
    element = soup.find(id="cityname")
    sibling = element.find_next_sibling('ul')
    data_dict = {}
    for x in sibling.find_all('li'):
        y = x.find(onclick=True)
        # Slice [9:-2] extracts the station code from the onclick JS call.
        # NOTE(review): the offsets are tied to the exact onclick text format
        # the site emits -- verify if the page markup changes.
        data_dict[y['onclick'][9:-2]] = x.a.string
    results = []
    # One process per station; each process fans out further with threads
    # (see get_years_then_months_then_csv_link).
    with concurrent.futures.ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = [executor.submit(get_years_then_months_then_csv_link, d) for d in data_dict.items()]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    # Best-effort log of every per-download status list; failure to write
    # the log must not abort the run.
    try:
        with open('scraper_log.txt', 'w') as w:
            w.write(json.dumps(results, indent=1))
    except:
        pass
    print(f'Downloaded Data in {time() - start} s')
    # gives mileage of 32.9MB in 180 seconds
    start2= time()
    conn = duckdb.connect(':memory:')
    # Merge and sort the CSV files using a single SQL statement.
    # The files were normalised by data_cleaning, so every header matches
    # the column names declared below; skip=1 drops each file's header row.
    # Numeric-looking columns are kept as VARCHAR (values may be '-' for
    # missing data) with embedded spaces stripped.
    conn.execute("""
create table weather_sg as
SELECT Station,
CAST(Year as INTEGER) as Year,
CAST(Month as INTEGER) as Month,
CAST(Day as INTEGER) as Day,
replace("Daily Rainfall Total (mm)",' ','') as "Daily Rainfall Total (mm)",
replace("Highest 30 min Rainfall (mm)",' ','') as "Highest 30 min Rainfall (mm)",
replace("Highest 60 min Rainfall (mm)",' ','') as "Highest 60 min Rainfall (mm)",
replace("Highest 120 min Rainfall (mm)",' ','') as "Highest 120 min Rainfall (mm)",
replace("Mean Temperature (°C)",' ','') as "Mean Temperature (°C)",
replace("Maximum Temperature (°C)",' ','') as "Maximum Temperature (°C)",
replace("Minimum Temperature (°C)",' ','') as "Minimum Temperature (°C)",
replace("Mean Wind Speed (km/h)",' ','') as "Mean Wind Speed (km/h)",
replace("Max Wind Speed (km/h)",' ','') as "Max Wind Speed (km/h)"
FROM
READ_CSV(
'weather_files/*.csv',
skip=1,
header=False,
sep=',',
QUOTE='"',
escape='\\',
columns={
'Station':'VARCHAR',
'Year':'INTEGER',
'Month':'INTEGER',
'Day':'INTEGER',
'Daily Rainfall Total (mm)':'VARCHAR',
'Highest 30 min Rainfall (mm)':'VARCHAR',
'Highest 60 min Rainfall (mm)':'VARCHAR',
'Highest 120 min Rainfall (mm)':'VARCHAR',
'Mean Temperature (°C)':'VARCHAR',
'Maximum Temperature (°C)':'VARCHAR',
'Minimum Temperature (°C)':'VARCHAR',
'Mean Wind Speed (km/h)':'VARCHAR',
'Max Wind Speed (km/h)':'VARCHAR'
}
)
order by Station asc,Year desc, Month desc, Day desc
;
""")
    conn.execute("""COPY weather_sg TO 'output.csv' WITH (HEADER 1);""")
    conn.close()
    print(f'Created CSV in Data in {time() - start2} s')
    print('end')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment