Skip to content

Instantly share code, notes, and snippets.

@breyten
Created August 9, 2016 09:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save breyten/4dfbd1d95fa23faeb88b71dd4cbc1563 to your computer and use it in GitHub Desktop.
Save breyten/4dfbd1d95fa23faeb88b71dd4cbc1563 to your computer and use it in GitHub Desktop.
Gets climate data from Wikipedia country pages
#!/usr/bin/env python
import os
import sys
import re
from pprint import pprint
import json
from time import sleep
import requests
from BeautifulSoup import BeautifulSoup
def get_countries():
    """Scrape Wikipedia's list of sovereign states for country page links.

    Only the first table carrying the ``wikitable`` class is inspected, and
    from each of its rows only the first anchor whose ``href`` begins with
    ``/wiki/`` is used. Rows without such an anchor are ignored.

    Returns:
        dict: maps each anchor's relative ``href`` to the anchor's text.
    """
    list_url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states'
    page = BeautifulSoup(requests.get(list_url).content)
    wiki_href = re.compile(r'^\/wiki\/.*')
    first_links = (
        row.find('a', href=wiki_href)
        for row in page.find('table', 'wikitable').findAll('tr')
    )
    # A later row with the same href overwrites the earlier entry.
    return dict(
        (anchor['href'], anchor.text)
        for anchor in first_links
        if anchor is not None
    )
def get_climate_table(soup):
    """Find the first ``wikitable`` whose leading header cell starts with "Climate".

    For each candidate table only its first ``tr`` and the first ``th``
    inside that row are examined; tables lacking either are skipped.

    Args:
        soup: parsed page exposing ``findAll``.

    Returns:
        The matching table element, or ``None`` when no table qualifies.
    """
    for candidate in soup.findAll('table', 'wikitable'):
        first_row = candidate.find('tr')
        heading = first_row.find('th') if first_row is not None else None
        if heading is not None and heading.text.startswith('Climate'):
            return candidate
    return None
def get_climate_info(table):
    """Convert a climate table into a dict keyed by category, then by column.

    The second row of the table supplies the column names: the text of every
    ``th`` in it except the first. Each later row that contains a ``th``
    becomes one category, named by the text nodes of that ``th`` joined with
    single spaces; the texts of the row's ``td`` cells are paired
    positionally with the column names.

    Args:
        table: table element, e.g. the result of ``get_climate_table``.

    Returns:
        dict: ``{category: {column name: cell text}}``. Empty when the table
        has fewer than two rows, since there is then no row to read the
        column names from.
    """
    rows = table.findAll('tr')
    if len(rows) < 2:
        # Indexing rows[1] would raise IndexError; report "no data" instead.
        return {}

    months = [th.text for th in rows[1].findAll('th')[1:]]
    climate = {}
    for row in rows[2:]:
        th = row.find('th')
        if th is None:
            # Rows without a header cell carry no category name.
            continue
        cat_name = u' '.join(th.findAll(text=True))
        data = [td.text for td in row.findAll('td')]
        # zip() stops at the shorter sequence, so surplus cells or column
        # names are dropped; a repeated category name overwrites the earlier one.
        climate[cat_name] = dict(zip(months, data))
    return climate
def get_country_climate(country_relative_link):
    """Fetch one Wikipedia page and extract its climate data.

    Args:
        country_relative_link: site-relative path of the page (for example a
            key of the dict returned by ``get_countries``); it is appended
            directly to the ``en.wikipedia.org`` host name.

    Returns:
        dict: the result of ``get_climate_info`` for the page's climate
        table, or an empty dict when ``get_climate_table`` finds none.
    """
    # Use HTTPS, as get_countries does, rather than plain HTTP.
    resp = requests.get('https://en.wikipedia.org%s' % (country_relative_link,))
    soup = BeautifulSoup(resp.content)
    climate_table = get_climate_table(soup)
    if climate_table is None:
        return {}
    return get_climate_info(climate_table)
def main(argv=None):
    """Collect climate data for every listed country and emit it as JSON.

    One country name per line is written to stderr as each page is
    processed; the complete result is written to stdout as a single JSON
    document mapping country name to ``{'url': ..., 'climate': ...}``.

    Args:
        argv: argument list, defaulting to ``sys.argv``. It is not consulted
            beyond that default.

    Returns:
        int: always 0, suitable for passing to ``sys.exit``.
    """
    if argv is None:
        argv = sys.argv

    climates = {}
    # dict.items() and stream .write() behave the same on Python 2 and 3,
    # unlike iteritems() and the print statement.
    for country_link, country_name in get_countries().items():
        climates[country_name] = {
            'url': country_link,
            'climate': get_country_climate(country_link),
        }
        sys.stderr.write(country_name + '\n')
        sleep(1)  # wait a second before requesting the next page

    sys.stdout.write(json.dumps(climates) + '\n')
    return 0
# Run the crawl only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment