Created
July 26, 2023 15:38
-
-
Save tomrockdsouza/407660987e135e6dee0426c13e48778d to your computer and use it in GitHub Desktop.
Write a Python script to download the historical climate data of all locations into a CSV file. http://www.weather.gov.sg/climate-historical-daily/ Output as a CSV file in the format shown below, with all the stations in one file per month. Please share with us how to run your script.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
1) Write a python script to download the historical climate data of all locations into a csv file. | |
http://www.weather.gov.sg/climate-historical-daily/ | |
Output as a csv file in below format with all the stations in a file per month for 2022 Sep, 2022 Oct and 2022 Nov. | |
CSV format: | |
Station,Year,Month,Day,Daily Rainfall Total (mm),Highest 30 min Rainfall (mm),Highest 60 min Rainfall (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h) | |
Changi,2023,1,1, 0.0, 0.0, 0.0, 0.0, 27.2, 30.6, 25.3, 9.8, 31.5 | |
... | |
Admiralty,2023,1,1, 0.0, 0.0, 0.0, 0.0, 26.8, 30.1, 24.8, 10.7, 27.8 | |
... | |
Please share with us how to run your script. | |
Python libraries Required to run this script: | |
duckdb | |
requests | |
bs4 | |
''' | |
from bs4 import BeautifulSoup | |
import requests | |
import duckdb | |
import concurrent.futures | |
from time import time | |
import os,shutil,json,sys,codecs | |
# Raise the recursion limit — presumably for BeautifulSoup parsing of
# deeply nested HTML; TODO confirm the default is actually exceeded.
sys.setrecursionlimit(10000)

timeout_parameter = 120   # per-request timeout in seconds
parallel_config_1 = 6     # thread pool size for per-year fan-out
parallel_config_2 = 5     # thread pool size for per-month fan-out
# found that these parameters work best
# speed of download depends on availability of server

# UTF-8 byte-order mark as a text character, stripped from downloaded CSVs.
hidden_gem = '\ufeff'

# Map full English month names to two-digit month numbers ('01'..'12').
month_mapping = {
    name: f'{number:02d}'
    for number, name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        start=1,
    )
}
def data_cleaning(save_path, response_content):
    """Normalise a downloaded CSV payload and write it to *save_path*.

    Fixes three quirks of the weather.gov.sg daily-data CSVs so that all
    monthly files share identical headers for the duckdb merge:
    - '(C)' is rewritten to '(°C)',
    - ' Min ' is lower-cased to ' min ',
    - any UTF-8 byte-order mark is stripped.

    Accepts either bytes or str content; the file is always written as
    UTF-8 bytes. Returns a 4-element status list:
    ['success', None, None, None] on success, or
    ['get_saving', save_path, None, None] if the file cannot be written.
    """
    try:
        if isinstance(response_content, bytes):
            cleaned = (response_content
                       .replace(b'(C)', b'(\xc2\xb0C)')
                       .replace(b' Min ', b' min ')
                       .replace(b'\xef\xbb\xbf', b''))
        else:
            cleaned = (response_content
                       .replace('(C)', '(\u00b0C)')
                       .replace(' Min ', ' min ')
                       .replace('\ufeff', '')).encode()
        with open(save_path, 'wb') as file:
            file.write(cleaned)
    except OSError as exc:
        # BUG FIX: the original handler referenced the undefined names
        # `response` and `url` here, which itself raised NameError.
        # Report the path we failed to save instead.
        print('Hell Broke Loose', save_path, exc)
        return ['get_saving', save_path, None, None]
    # Round-trip through UTF-8 with errors='ignore' to drop any stray
    # undecodable bytes so downstream READ_CSV sees valid UTF-8.
    with open(save_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()
    with open(save_path, 'w', encoding='utf-8') as file:
        file.write(content)
    return ['success', None, None, None]
def download_csv(url, save_path):
    """Fetch *url* with up to 3 attempts and save the cleaned CSV.

    Returns data_cleaning's status list on success, or
    ['get_downloading', url, None, None] when all attempts fail or the
    server replies with a non-200 status.
    """
    print(url)
    response = None
    for attempt in range(3):
        try:
            response = requests.get(url, timeout=timeout_parameter)
            break
        except requests.RequestException:
            # Narrowed from a bare except: only network/HTTP errors are
            # expected here; anything else should surface.
            if attempt == 2:
                return ['get_downloading', url, None, None]
    if response.status_code != 200:
        return ['get_downloading', url, None, None]
    return data_cleaning(save_path, response.content)
def get_csv_link(dictx):
    """Download the daily-data CSV for one station/year/month job dict."""
    station = dictx["station"]
    year = dictx["year"]
    month = dictx["month"]
    print('get_csv_link', station, year, month)
    return download_csv(
        f'http://www.weather.gov.sg/files/dailydata/DAILYDATA_{station}_{year}{month_mapping[month]}.csv',
        f'weather_files/DAILYDATA_{station}_{year}{month_mapping[month]}.csv'
    )
def get_months_then_csv_link(dictx):
    """For one station/year, discover the available months and download
    each month's CSV concurrently.

    dictx carries 'station' (code), 'cityname', and 'year'.
    Returns a list of per-month status lists from get_csv_link, or a
    single flat ['get_months', cityname, year, None] status when the
    month lookup itself fails.
    """
    station, cityname, year = dictx["station"], dictx["cityname"], dictx["year"]
    print('get_months_then_csv_link', station, cityname, year)
    months_response = None
    for attempt in range(3):
        try:
            months_response = requests.post(
                url='http://www.weather.gov.sg/wp-content/themes/wiptheme/page-functions/functions-climate-historical-daily-months.php',
                data={
                    'year': year,
                    'cityname': cityname
                },
                timeout=timeout_parameter
            )
            break
        except requests.RequestException:
            # Narrowed from a bare except: retry only on network errors.
            if attempt == 2:
                return ['get_months', cityname, year, None]
    if months_response.status_code != 200:
        return ['get_months', cityname, year, None]
    months_soup = BeautifulSoup(months_response.content, 'html.parser')
    # Each <li><a> href starts with a one-character prefix (presumably
    # '#') followed by the month name — strip it with [1:].
    all_months = [li.a['href'][1:] for li in months_soup.find('ul').find_all('li')]
    jobs = [{'station': station, 'year': year, 'month': month} for month in all_months]
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_config_2) as executor:
        futures = [executor.submit(get_csv_link, job) for job in jobs]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    return results
def get_years_then_months_then_csv_link(dictx):
    """For one (station_code, city_name) pair, discover the available
    years and fan out month/CSV downloads per year with a thread pool.

    dictx is a (station_code, city_name) 2-tuple (an item from the
    data_dict built in __main__). Returns a nested list of status lists,
    or a single flat ['get_years', cityname, None, None] status when the
    year lookup itself fails.
    """
    station, cityname = dictx[0], dictx[1]
    print('get_years_then_months_then_csv_link', cityname, station)
    years_response = None
    for attempt in range(3):
        try:
            years_response = requests.post(
                url='http://www.weather.gov.sg/wp-content/themes/wiptheme/page-functions/functions-climate-historical-daily-year.php',
                data={
                    'stationCode': station
                },
                timeout=timeout_parameter
            )
            break
        except requests.RequestException:
            # Narrowed from a bare except: retry only on network errors.
            if attempt == 2:
                return ['get_years', cityname, None, None]
    if years_response.status_code != 200:
        return ['get_years', cityname, None, None]
    years_soup = BeautifulSoup(years_response.content, 'html.parser')
    # Each <li><a> href starts with a one-character prefix (presumably
    # '#') followed by the year — strip it with [1:].
    all_years = [li.a['href'][1:] for li in years_soup.find('ul').find_all('li')]
    jobs = [{'station': station, 'cityname': cityname, 'year': year} for year in all_years]
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_config_1) as executor:
        futures = [executor.submit(get_months_then_csv_link, job) for job in jobs]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    return results
if __name__ == '__main__':
    start = time()
    # Start from an empty download directory on every run.
    if os.path.isdir('weather_files'):
        shutil.rmtree('weather_files')
    os.mkdir('weather_files')
    # Scrape the station picker from the landing page. The element with
    # id "cityname" is followed by a <ul> whose <li> entries carry an
    # onclick handler; slicing [9:-2] extracts the handler's argument —
    # presumably the station code, e.g. "getData('S104')" -> "S104"
    # (TODO confirm against the live page markup).
    url = "http://www.weather.gov.sg/climate-historical-daily/"
    response = requests.get(url, timeout=timeout_parameter)
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')
    element = soup.find(id="cityname")
    sibling = element.find_next_sibling('ul')
    data_dict = {}
    for x in sibling.find_all('li'):
        y = x.find(onclick=True)
        # Map station code -> display city name (the anchor text).
        data_dict[y['onclick'][9:-2]] = x.a.string
    results = []
    # One worker process per CPU; each process fans out further with its
    # own thread pools (see parallel_config_1 / parallel_config_2).
    with concurrent.futures.ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = [executor.submit(get_years_then_months_then_csv_link, d) for d in data_dict.items()]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    # Best-effort status log of every download attempt; a failure to
    # write the log is deliberately ignored.
    try:
        with open('scraper_log.txt', 'w') as w:
            w.write(json.dumps(results, indent=1))
    except:
        pass
    print(f'Downloaded Data in {time() - start} s')
    # gives mileage of 32.9MB in 180 seconds
    start2= time()
    conn = duckdb.connect(':memory:')
    # Merge and sort the CSV files using a single SQL statement.
    # Numeric columns are kept as VARCHAR because the source files use
    # non-numeric markers (the replace() calls only strip spaces).
    conn.execute("""
    create table weather_sg as
    SELECT Station,
    CAST(Year as INTEGER) as Year,
    CAST(Month as INTEGER) as Month,
    CAST(Day as INTEGER) as Day,
    replace("Daily Rainfall Total (mm)",' ','') as "Daily Rainfall Total (mm)",
    replace("Highest 30 min Rainfall (mm)",' ','') as "Highest 30 min Rainfall (mm)",
    replace("Highest 60 min Rainfall (mm)",' ','') as "Highest 60 min Rainfall (mm)",
    replace("Highest 120 min Rainfall (mm)",' ','') as "Highest 120 min Rainfall (mm)",
    replace("Mean Temperature (°C)",' ','') as "Mean Temperature (°C)",
    replace("Maximum Temperature (°C)",' ','') as "Maximum Temperature (°C)",
    replace("Minimum Temperature (°C)",' ','') as "Minimum Temperature (°C)",
    replace("Mean Wind Speed (km/h)",' ','') as "Mean Wind Speed (km/h)",
    replace("Max Wind Speed (km/h)",' ','') as "Max Wind Speed (km/h)"
    FROM
    READ_CSV(
    'weather_files/*.csv',
    skip=1,
    header=False,
    sep=',',
    QUOTE='"',
    escape='\\',
    columns={
    'Station':'VARCHAR',
    'Year':'INTEGER',
    'Month':'INTEGER',
    'Day':'INTEGER',
    'Daily Rainfall Total (mm)':'VARCHAR',
    'Highest 30 min Rainfall (mm)':'VARCHAR',
    'Highest 60 min Rainfall (mm)':'VARCHAR',
    'Highest 120 min Rainfall (mm)':'VARCHAR',
    'Mean Temperature (°C)':'VARCHAR',
    'Maximum Temperature (°C)':'VARCHAR',
    'Minimum Temperature (°C)':'VARCHAR',
    'Mean Wind Speed (km/h)':'VARCHAR',
    'Max Wind Speed (km/h)':'VARCHAR'
    }
    )
    order by Station asc,Year desc, Month desc, Day desc
    ;
    """)
    # Export the merged, sorted table as a single CSV with a header row.
    conn.execute("""COPY weather_sg TO 'output.csv' WITH (HEADER 1);""")
    conn.close()
    print(f'Created CSV in Data in {time() - start2} s')
    print('end')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment