Skip to content

Instantly share code, notes, and snippets.

@FashionableNonsense
Created September 14, 2017 03:09
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FashionableNonsense/940db2f0b18944b362cfdc6f9c85d977 to your computer and use it in GitHub Desktop.
Save FashionableNonsense/940db2f0b18944b362cfdc6f9c85d977 to your computer and use it in GitHub Desktop.
Scrape Wikipedia dumps for data on Brazilian cities.
from collections import _itemgetter
from operator import itemgetter

import cartopy
import cartopy.crs as ccrs
import cartopy.io.shapereader as shpr
import matplotlib.pyplot as plt
class Parser:
    """Helpers that pull a name, a founding year and coordinates out of wikitext.

    All methods are stateless; the instance only serves as a namespace.
    """

    # Characters that are turned into the '+' separator before a date is split.
    _DATE_SEPARATORS = str.maketrans(dict.fromkeys("{}[]| '", "+"))
    # Same idea for coordinates, which additionally split on '=' and '-'.
    _GEO_SEPARATORS = str.maketrans(dict.fromkeys("{}[]| '=-", "+"))

    # Single-letter hemisphere markers recognised by geoParser.
    _DIRECTIONS = frozenset("SNnsOLWEolwe")
    # Markers for which dmsToDec negates the decimal value.
    _NEGATIVE_DIRECTIONS = frozenset("SWOswo")

    # Tokens that announce the degrees / minutes / seconds value.
    _DEGREE_LABELS = frozenset(("latG", "latg", "lonG", "long"))
    _MINUTE_LABELS = frozenset(("latM", "latm", "lonM", "lonm"))
    _SECOND_LABELS = frozenset(("latS", "lats", "lonS", "lons"))

    def __init__(self):
        pass

    def getInfoboxFromText(self, string):
        """Return the infobox fields found in a page as a ``{key: value}`` dict.

        Lines are scanned in order until one starts with ``}}`` (optionally
        preceded by one space). A line is treated as a field when it starts
        with ``|`` (optionally preceded by one space), or when it starts with
        ``<!`` and contains ``nome``. The key is the text between the first
        ``|`` and the first ``=``; the value is everything after that ``=``.
        Both are stripped of surrounding spaces, and the key ``name`` is
        stored as ``nome``.

        A field line without ``=`` is split on the first space after the pipe
        instead; a field line with neither separator is skipped.
        """
        infobox = {}
        for line in string.splitlines():
            if line.startswith(("}}", " }}")):
                break
            is_field = line.startswith(("|", " |"))
            is_named_comment = line.startswith("<!") and "nome" in line
            if not (is_field or is_named_comment):
                continue

            after_pipe = line.find("|") + 1  # 0 when the line has no pipe
            equals = line.find("=")
            if equals != -1:
                key = line[after_pipe:equals]
                value = line[equals + 1:]
            else:
                # No '=' on the line: fall back to "key value" separated by
                # the first space.
                key, separator, value = (
                    line[after_pipe:].strip(" ").partition(" ")
                )
                if not separator:
                    continue

            key = key.strip(" ")
            if key == "name":
                key = "nome"
            infobox[key] = value.strip(" ")
        return infobox

    def parseDate(self, string):
        """Return the last year found in ``string``, or ``""`` if there is none.

        Markup characters, spaces and apostrophes are treated as separators.
        A token counts as a year when it parses as an integer strictly
        greater than 1500 and strictly less than 2017. The year is returned
        as an ``int``.
        """
        year = ""
        for token in string.translate(Parser._DATE_SEPARATORS).split("+"):
            try:
                number = int(token)
            except ValueError:
                continue
            if 1500 < number < 2017:
                year = number
        return year

    def geoParser(self, string):
        """Return a coordinate as ``"Direction-Degrees-Minutes-Seconds"``.

        Returns ``"no_geo"`` when ``string`` is three characters or shorter,
        equals ``"False"``, or lacks any of the degrees, minutes or seconds
        values. The direction part may be empty when no hemisphere letter is
        present.
        """
        if len(string) <= 3 or string == "False":
            return "no_geo"

        # Split on every separator character and drop the empty pieces.
        tokens = [
            token
            for token in string.translate(Parser._GEO_SEPARATORS).split("+")
            if token
        ]

        direction = degrees = minutes = seconds = ""
        for token in tokens:
            if token in Parser._DIRECTIONS:
                direction = token
            # The value is the token following the FIRST occurrence of the
            # label; a label sitting at the very end has no value to read.
            successor = tokens.index(token) + 1
            if successor >= len(tokens):
                continue
            if token in Parser._DEGREE_LABELS:
                degrees = tokens[successor]
            if token in Parser._MINUTE_LABELS:
                minutes = tokens[successor]
            if token in Parser._SECOND_LABELS:
                seconds = tokens[successor]

        if not (degrees and minutes and seconds):
            return "no_geo"
        return "-".join((direction, degrees, minutes, seconds))

    def dmsToDec(self, str):
        """Convert a ``"Direction-Degrees-Minutes-Seconds"`` string to a float.

        Only the first two characters of the seconds field are used. The
        result is negated for the directions S, W and O (either case).
        Returns ``'no_geo'`` when a field is missing or is not a number.
        """
        parts = str.split('-')
        try:
            degrees = float(parts[1])
            minutes = float(parts[2])
            seconds = float(parts[3][:2])
        except (IndexError, ValueError):
            return 'no_geo'

        result = degrees + minutes / 60 + seconds / 3600
        if parts[0] in Parser._NEGATIVE_DIRECTIONS:
            result = -result
        return result
# Mapping script: read the pages collected in Pages.txt, extract each city's
# name, founding year and coordinates, then save one map image per year with
# every city founded up to that year plotted on it.
parser = Parser()

# Infobox fields to extract. The name is only used to recognise pages that
# appear more than once in the source file.
values_to_extract = ['nome', 'fundação', 'latP', 'lonP']
values_passed = set()   # names already handled, for duplicate detection
final_list = []         # [name, year, latitude, longitude] per accepted city

page_lines = []
flag = False            # True while inside a NEW PAGE ... END PAGE section

# The file is only needed while parsing, so it is closed before rendering.
with open("Pages.txt", 'r', encoding='utf8') as r_file:
    for line in r_file:
        if line.find("### NEW PAGE###") != -1:
            flag = True
        if flag:
            page_lines.append(line)
        if line.find("### END PAGE ###") == -1:
            continue

        # End of a page: parse what was collected and reset the collector.
        dictionary = parser.getInfoboxFromText("".join(page_lines))
        page_lines = []
        flag = False

        # Missing fields come back as False.
        name, date, lat, lon = (
            dictionary.get(field, False) for field in values_to_extract
        )
        # Reduce the date to a year and the coordinates to D-G-M-S form.
        date = parser.parseDate(str(date))
        lat = parser.geoParser(str(lat))
        lon = parser.geoParser(str(lon))

        # If there is any missing data, discard the whole city.
        if not name or date == "" or lat == "no_geo" or lon == "no_geo":
            continue
        # Discard repeated pages.
        if name in values_passed:
            continue
        values_passed.add(name)

        # Convert the coordinates to floats; discard cities where that fails.
        lat = parser.dmsToDec(lat)
        lon = parser.dmsToDec(lon)
        if lat == "no_geo" or lon == "no_geo":
            continue
        final_list.append([name, date, lat, lon])

# Sort the cities by year (item 1 of each entry).
final_list_sorted = sorted(final_list, key=itemgetter(1))

# Now is the mapping part of the program.
# Start a plot in the PlateCarree projection.
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cartopy.feature.OCEAN)
ax.set_extent([-31,-75,7,-34])

# This downloads the NaturalEarth data for country borders.
filename = shpr.natural_earth(resolution='110m', category='cultural', name='admin_0_countries')
reader = shpr.Reader(filename)
for country in reader.records():
    # Brazil is painted green, every other country grey.
    if country.attributes['admin'] == 'Brazil':
        facecolor = "#2ab23a"
    else:
        facecolor = "#97a39a"
    ax.add_geometries(country.geometry, ccrs.PlateCarree(), facecolor=facecolor,
                      label=country.attributes['adm0_a3'])

# Placeholder text so the loop below always has a previous label to remove.
text = ax.text(0,0,1)
# Loop through the years 1500 to 2017.
for year in range(1500, 2018):
    # Replace the previous year label with the current year.
    text.remove()
    text = ax.text(-72, -32, year, color='white', fontsize=18, fontweight='bold',
                   bbox={'facecolor': '#040911', 'alpha': 0.7, 'pad': 10})
    # Cities founded in the current year are added to the map; markers from
    # earlier years stay on the axes.
    for city in final_list_sorted:
        if int(city[1]) == year:
            plt.plot(city[3], city[2], marker='.', markersize='2', color='#46125e', alpha=0.8)
    plt.draw()
    # Save one image file per year, named after the year.
    # NOTE(review): recent Matplotlib releases may reject `frameon` here -
    # confirm against the installed version.
    plt.savefig(str(year), frameon=False, bbox_inches='tight', pad_inches=0, dpi=300)
# Scraping script: copy every page that contains a Brazilian-municipality
# infobox from the four dump files into Pages.txt, wrapped in
# "### NEW PAGE###" / "### END PAGE ###" markers.

# Counter for the number of cities.
counter = 0

# Pages are appended to the output file. Both files are managed by `with`, so
# they are closed even if reading or writing fails part-way through.
with open("Pages.txt", 'a', encoding='utf8') as w_file:
    # There are 4 dump files, so loop through four times.
    for file_num in range(1, 5):
        dump_name = "ptwiki-20170901-pages-meta-current" + str(file_num) + ".txt"
        with open(dump_name, 'r', encoding='utf8') as r_file:
            # True while the current line belongs to a page being copied.
            flag = False
            for line in r_file:
                # Every infobox for Brazilian cities starts with the header
                # "{{Info/Município do Brasil"; from that line on, everything
                # is copied until a line containing </text> is reached.
                if line.find("{{Info/Município do Brasil") != -1:
                    counter += 1
                    w_file.write('\n\n### NEW PAGE###' + '\n\n' + str(counter) + '\n')
                    flag = True
                if flag:
                    w_file.write(line)
                if flag and line.find("</text>") != -1:
                    w_file.write("### END PAGE ###")
                    flag = False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment