Created
September 14, 2017 03:09
-
-
Save FashionableNonsense/940db2f0b18944b362cfdc6f9c85d977 to your computer and use it in GitHub Desktop.
Scrape Wikipedia dumps for data on Brazilian cities.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import _itemgetter | |
import cartopy.crs as ccrs | |
import cartopy | |
import matplotlib.pyplot as plt | |
import cartopy.io.shapereader as shpr | |
class Parser:
    """Text helpers for pulling a city's infobox fields, founding year and coordinates
    out of the raw wikitext of a page."""

    # Characters treated as token separators; each one is mapped to '+' so that a single
    # split('+') tokenizes the value (a literal '+' in the input therefore also separates).
    _DATE_SEPARATORS = str.maketrans(dict.fromkeys("{}[]| '", "+"))
    _GEO_SEPARATORS = str.maketrans(dict.fromkeys("{}[]| '=-", "+"))

    # Single-letter tokens accepted as a direction by geoParser.
    _DIRECTIONS = frozenset("SNnsOLWEolwe")
    # Directions that make dmsToDec return a negative value.
    _NEGATIVE_DIRECTIONS = frozenset("SWOswo")

    # Labels whose following token is taken as degrees, minutes and seconds respectively.
    _DEGREE_LABELS = frozenset(("latG", "latg", "lonG", "long"))
    _MINUTE_LABELS = frozenset(("latM", "latm", "lonM", "lonm"))
    _SECOND_LABELS = frozenset(("latS", "lats", "lonS", "lons"))

    def __init__(self):
        pass

    def getInfoboxFromText(self, string):
        """Return a dict with the infobox fields found in ``string``.

        ``string`` is scanned line by line. A line is treated as a field when it starts
        with '|' or ' |', or when it starts with '<!' and contains 'nome'. Scanning stops
        at the first line starting with '}}' or ' }}'.

        The key is the text between the first '|' and the first '=', the value is
        everything after that '='; both are stripped of surrounding spaces. The key
        'name' is stored as 'nome'. Later occurrences of a key overwrite earlier ones.
        """
        new_dict = {}
        for line in string.splitlines():
            if line.startswith(("}}", " }}")):
                break
            is_field = line.startswith(("|", " |"))
            is_commented_name = line.startswith("<!") and "nome" in line
            if not (is_field or is_commented_name):
                continue

            # find() returns -1 when there is no '|', which makes the key start at column 0.
            start = line.find('|') + 1
            if '=' not in line:
                # Field written without '=': use the first space after the field name as
                # the separator. str.replace returns a new string, so it must be
                # re-assigned, and the spaces before the name are skipped so that the
                # leading ' ' of a ' |key value' line is not the one replaced.
                rest = line[start:].lstrip(' ')
                line = line[:start] + rest.replace(' ', '=', 1)

            separator = line.find('=')
            if separator == -1:
                # Still no separator: the line holds a key without any value.
                key, value = line[start:], ''
            else:
                key, value = line[start:separator], line[separator + 1:]

            # In the text file there is a bunch of whitespace around the '='.
            key = key.strip(' ')
            if key == "name":
                key = "nome"
            new_dict[key] = value.strip(' ')
        return new_dict

    def parseDate(self, string, min_year=1500, max_year=2017):
        """Return the year found in a free-form date string.

        Every date has a different format, so the markup characters are all replaced by
        '+' and the string is split on it. Tokens that are not integers are discarded.

        Returns the last integer token within ``min_year``..``max_year`` (both inclusive)
        as an int, or the empty string when there is none.
        """
        year = ""
        for token in string.translate(self._DATE_SEPARATORS).split('+'):
            try:
                number = int(token)
            except ValueError:
                continue
            if min_year <= number <= max_year:
                year = number
        return year

    def geoParser(self, string):
        """Return a geoposition as 'Direction-Degrees-Minutes-Seconds', or 'no_geo'.

        The value is tokenized with the same "replace by '+' and split" approach used for
        dates. The direction is the last single-letter token among N/S/E/W/O/L (either
        case); degrees, minutes and seconds are the tokens that follow a latG/lonG,
        latM/lonM and latS/lonS label. 'no_geo' is returned when the input has three
        characters or fewer, equals "False", or any of the three numbers is missing.
        """
        if len(string) <= 3 or string == "False":
            return "no_geo"

        # Drop the empty tokens produced by consecutive separators.
        tokens = [tok for tok in string.translate(self._GEO_SEPARATORS).split('+') if tok != '']

        direct = g = m = s = ""
        for item in tokens:
            if item in self._DIRECTIONS:
                direct = item
                continue
            following = tokens.index(item) + 1
            if following >= len(tokens):
                # A label at the very end has no value after it; leave the field unset
                # instead of indexing past the end of the token list.
                continue
            if item in self._DEGREE_LABELS:
                g = tokens[following]
            elif item in self._MINUTE_LABELS:
                m = tokens[following]
            elif item in self._SECOND_LABELS:
                s = tokens[following]

        if g == '' or m == '' or s == '':
            return "no_geo"
        return direct + "-" + g + "-" + m + "-" + s

    def dmsToDec(self, str):
        """Convert a 'Direction-Degrees-Minutes-Seconds' string to decimal degrees.

        Returns a float, negated when the direction is S, W or O (either case), or the
        string 'no_geo' when a component is missing or is not a number.

        Only the first two characters of the seconds component are used.
        NOTE(review): presumably this is meant to drop trailing junk after the seconds;
        it also drops any fractional part beyond two characters - confirm.

        The parameter name shadows the builtin ``str``; it is kept for compatibility with
        keyword callers.
        """
        parts = str.split('-')
        try:
            degrees = float(parts[1])
            minutes = float(parts[2])
            seconds = float(parts[3][:2])
        except (IndexError, ValueError):
            return 'no_geo'

        result = degrees + minutes / 60 + seconds / 3600
        if parts[0] in self._NEGATIVE_DIRECTIONS:
            result = -result
        return result
parser = Parser()

# These are the values to extract from the infobox. The name is extracted to evaluate if an
# eventual copy of the page was already evaluated (there were some).
values_to_extract = ['nome', 'fundação', 'latP', 'lonP']
# Names already seen (a set, so the duplicate check does not rescan a list for every page).
values_passed = set()
# One [name, year, latitude, longitude] entry per accepted city.
final_list = []

# Open source file created with ScrapeWiki. The context manager closes it as soon as the
# parsing is done, even if a page raises, instead of keeping it open during the plotting.
with open("Pages.txt", 'r', encoding='utf8') as r_file:
    # Lines of the page being accumulated, and a flag telling whether we are inside a page.
    page_lines = []
    flag = False
    # Loops through all lines on r_file, and separates the individual pages based on the
    # ### NEW PAGE### and ### END PAGE ### markers.
    for line in r_file:
        if "### NEW PAGE###" in line:
            flag = True
        if flag:
            page_lines.append(line)
        if "### END PAGE ###" not in line:
            continue

        # End of the page reached: parse what was accumulated and reset the state before
        # any of the early exits below, so that every page starts from a clean slate.
        dictionary = parser.getInfoboxFromText("".join(page_lines))
        page_lines = []
        flag = False

        # Gets the individual values of interest; missing ones come back as False.
        name, date, lat, lon = (dictionary.get(field, False) for field in values_to_extract)

        # Parse the date to extract the year, and latitude and longitude in P-D-M-S form.
        date = parser.parseDate(str(date))
        lat = parser.geoParser(str(lat))
        lon = parser.geoParser(str(lon))

        # If there is any missing data, discard the whole city. A missing name is the
        # boolean False (not the string "False"), so it is tested by truthiness.
        if not name or date == "" or lat == "no_geo" or lon == "no_geo":
            continue

        # Discards repeated pages.
        if name in values_passed:
            continue
        values_passed.add(name)

        # Parse the geolocation into float.
        lat = parser.dmsToDec(lat)
        lon = parser.dmsToDec(lon)
        # Discard cities with missing geolocation.
        if lat == "no_geo" or lon == "no_geo":
            continue
        final_list.append([name, date, lat, lon])

# Sort the list by year (item 1 of each entry). A plain key function is used so the script
# does not depend on a private alias of the collections module.
final_list_sorted = sorted(final_list, key=lambda entry: entry[1])

# Group the cities by founding year once, so that the plotting loop does not have to scan
# the whole list for each of the ~500 years. Order inside a year follows the sorted list.
cities_by_year = {}
for city in final_list_sorted:
    cities_by_year.setdefault(int(city[1]), []).append(city)

# Now is the mapping part of the program.
# Start a plot, in the PlateCarree projection.
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cartopy.feature.OCEAN)
ax.set_extent([-31,-75,7,-34])

# This downloads the NaturalEarth data for country borders.
filename = shpr.natural_earth(resolution='110m', category='cultural', name='admin_0_countries')
reader = shpr.Reader(filename)
countries = reader.records()

# Loop through the countries: Brazil is painted green, the rest of the countries grey.
for country in countries:
    if country.attributes['admin'] == 'Brazil':
        facecolor = "#2ab23a"
    else:
        facecolor = "#97a39a"
    # add_geometries takes a collection of geometries, so the single geometry is wrapped.
    ax.add_geometries([country.geometry], ccrs.PlateCarree(), facecolor=facecolor,
                      label=country.attributes['adm0_a3'])

# Initiates a text for the year box and stores it on a variable.
text = ax.text(0, 0, "1")

# Loop through the years 1500 to 2017.
for year in range(1500, 2018):
    # Remove the previous text, and add another with the current year.
    text.remove()
    text = ax.text(-72, -32, str(year), color='white', fontsize=18, fontweight='bold',
                   bbox={'facecolor': '#040911', 'alpha': 0.7, 'pad': 10})
    # Put the cities founded on the current year on the map. Points are never removed, so
    # each image also shows the cities from all the previous years.
    for city in cities_by_year.get(year, ()):
        plt.plot(city[3], city[2], marker='.', markersize='2', color='#46125e', alpha=0.8)
    plt.draw()
    # Save image file, named after the year. A fully transparent facecolor is the
    # documented replacement for the removed frameon=False savefig argument.
    plt.savefig(str(year), facecolor='none', bbox_inches='tight', pad_inches=0, dpi=300)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Counter for the number of cities.
counter = 0

# File to write the pages to. It is opened in append mode, so running the script again adds
# the pages a second time. Both files are handled by context managers so that they are
# closed even if reading a dump fails halfway.
with open("Pages.txt", 'a', encoding='utf8') as w_file:
    # There are 4 dump files, so I loop through four times.
    for file_num in range(1, 5):
        dump_name = "ptwiki-20170901-pages-meta-current" + str(file_num) + ".txt"
        # Open the dump file.
        with open(dump_name, 'r', encoding='utf8') as r_file:
            # Flag used in the loop: True while the lines belong to a page being copied.
            flag = False
            # Loop through all lines in file.
            for line in r_file:
                # Every infobox for brazilian cities starts with the header
                # "{{Info/Município do Brasil", so that marks the start of a page. From there
                # on every line is written to w_file, until the line containing </text>.
                if "{{Info/Município do Brasil" in line:
                    counter += 1
                    w_file.write('\n\n### NEW PAGE###' + '\n\n' + str(counter) + '\n')
                    flag = True
                if flag:
                    w_file.write(line)
                if flag and "</text>" in line:
                    w_file.write("### END PAGE ###")
                    flag = False
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment