# FashionableNonsense/BrCities.py

## BrCities.py
from collections import _itemgetter
import cartopy.crs as ccrs
import cartopy
import matplotlib.pyplot as plt
import cartopy.io.shapereader as shpr


class Parser:
    """Parse fields of a Portuguese-Wikipedia municipality infobox.

    The methods keep no state: each takes a raw text fragment and returns a
    plain value. Geographic data that is missing or cannot be parsed is
    reported with the sentinel string ``"no_geo"``.
    """

    # Characters treated as token separators. Each one is mapped to '+' so
    # that a single split('+') tokenises a value whatever its wiki markup.
    _DATE_SEPARATORS = str.maketrans(dict.fromkeys("{}[]| '", "+"))
    _GEO_SEPARATORS = str.maketrans(dict.fromkeys("{}[]| '=-", "+"))

    # Single-letter compass directions accepted by geoParser.
    _DIRECTIONS = frozenset("SNnsOLWEolwe")
    # Directions for which dmsToDec returns a negative coordinate.
    _NEGATIVE_DIRECTIONS = frozenset("SWOswo")
    # Infobox labels whose following token is degrees / minutes / seconds.
    _DEGREE_LABELS = frozenset(("latG", "latg", "lonG", "long"))
    _MINUTE_LABELS = frozenset(("latM", "latm", "lonM", "lonm"))
    _SECOND_LABELS = frozenset(("latS", "lats", "lonS", "lons"))

    def getInfoboxFromText(self, string):
        """Return the infobox found in ``string`` as a ``{key: value}`` dict.

        ``string`` is the text of a whole page. Lines are scanned in order and
        the scan stops at the first line starting with ``}}`` (optionally
        preceded by one space). A line is read as a field when it starts with
        ``|`` (after at most two spaces), or when it starts with a comment
        opener (``&lt;!`` or ``<!``) and contains ``nome``. The key is the
        text between the first ``|`` and the first ``=``; the value is
        everything after that ``=``. Both are stripped of spaces, the key
        ``name`` is stored as ``nome``, and a repeated key overwrites the
        earlier value. Field lines with no separator at all are skipped.
        """
        infobox = {}

        for line in string.splitlines():
            if line.startswith(("}}", " }}")):
                break

            is_field = line.startswith(("|", " |", "  |"))
            is_named_comment = (line.startswith(("&lt;!", "<!"))
                                and "nome" in line)
            if not (is_field or is_named_comment):
                continue

            # Some lines separate key and value with a run of spaces instead
            # of '='. str.replace returns a new string, so the result must be
            # rebound for the substitution to have any effect.
            if "=" not in line and "   " in line:
                line = line.replace("   ", "=", 1)

            separator = line.find('=')
            if separator == -1:
                # No key/value separator: slicing with -1 would only produce
                # a truncated, meaningless key.
                continue

            # When the line has no '|', find() gives -1 and the key simply
            # starts at the beginning of the line.
            key = line[line.find('|') + 1:separator].strip(' ')
            if key == "name":
                key = "nome"
            infobox[key] = line[separator + 1:].strip(' ')

        return infobox

    def parseDate(self, string):
        """Return the year found in ``string`` as an int, or ``""`` if none.

        Dates come in many formats, so markup characters and spaces are all
        turned into '+' and the text is split on it. A token counts as a year
        when it is an integer strictly between 1500 and 2017; if several
        tokens qualify, the last one is returned.
        """
        year = ""

        for token in string.translate(self._DATE_SEPARATORS).split('+'):
            try:
                number = int(token)
            except ValueError:
                continue
            if 1500 < number < 2017:
                year = number

        return year

    def geoParser(self, string):
        """Return a position as ``"Direction-Degrees-Minutes-Seconds"``.

        ``string`` is tokenised on markup characters, spaces, ``=`` and ``-``.
        The direction is the last single-letter compass token; degrees,
        minutes and seconds are the tokens that follow their respective
        labels (``latG``/``lonG``, ``latM``/``lonM``, ``latS``/``lonS`` and
        lower-case variants). Returns ``"no_geo"`` when ``string`` has three
        characters or fewer, equals ``"False"``, or lacks any of the three
        numeric parts. The direction may be empty in the returned string.
        """
        if len(string) <= 3 or string == "False":
            return "no_geo"

        tokens = [token
                  for token in string.translate(self._GEO_SEPARATORS).split('+')
                  if token]

        direct = g = m = s = ""
        for position, token in enumerate(tokens):
            if token in self._DIRECTIONS:
                direct = token
                continue

            # A label is only useful when a value follows it; a label that is
            # the last token is ignored instead of raising IndexError.
            if position + 1 >= len(tokens):
                continue
            following = tokens[position + 1]

            if token in self._DEGREE_LABELS:
                g = following
            elif token in self._MINUTE_LABELS:
                m = following
            elif token in self._SECOND_LABELS:
                s = following

        if not (g and m and s):
            return "no_geo"

        return "-".join((direct, g, m, s))

    def dmsToDec(self, str):
        """Convert a ``"Direction-Degrees-Minutes-Seconds"`` string to a float.

        Returns the decimal position, negated for the directions S, W and O
        (either case). Returns ``"no_geo"`` when the string has fewer than
        four ``-``-separated parts or a numeric part is not a valid number.
        """
        parts = str.split('-')
        try:
            degrees = float(parts[1])
            minutes = float(parts[2])
            # Only the first two characters of the seconds field are read, so
            # anything after them (decimals, stray text) is dropped.
            seconds = float(parts[3][:2])
        except (IndexError, ValueError):
            return 'no_geo'

        result = degrees + minutes / 60 + seconds / 3600
        if parts[0] in self._NEGATIVE_DIRECTIONS:
            result = -result

        return result

parser = Parser()

# Infobox keys to extract, in the order: name, founding date, latitude,
# longitude. The name is used to recognise pages that appear more than once
# in the source file (there were some).
values_to_extract = ['nome', 'fundação', 'latP', 'lonP']
# Names already accepted; a set keeps the duplicate check cheap.
values_passed = set()
final_list = []

# Read the file created with ScrapeWiki and split it into pages using the
# "### NEW PAGE###" and "### END PAGE ###" markers.
page_lines = []
flag = False
with open("Pages.txt", 'r', encoding='utf8') as r_file:
    for line in r_file:
        if "### NEW PAGE###" in line:
            flag = True

        if flag:
            page_lines.append(line)

        if "### END PAGE ###" not in line:
            continue

        # End of page: parse what was collected and reset the per-page state
        # immediately, so that skipping a city below can never leave the flag
        # set and leak text into the next page.
        dictionary = parser.getInfoboxFromText("".join(page_lines))
        page_lines = []
        flag = False

        # A missing key yields False.
        name, date, lat, lon = (dictionary.get(key, False)
                                for key in values_to_extract)

        # Extract the year, and latitude/longitude in Direction-D-M-S form.
        date = parser.parseDate(str(date))
        lat = parser.geoParser(str(lat))
        lon = parser.geoParser(str(lon))

        # If there is any missing data, discard the whole city. `not name`
        # covers both an empty name and the False stored for a missing one.
        if not name or name == "False" or date == "" \
                or lat == "no_geo" or lon == "no_geo":
            continue

        # Discard repeated pages.
        if name in values_passed:
            continue
        values_passed.add(name)

        # Convert the geolocation to decimal degrees; discard the city if
        # either coordinate cannot be converted.
        lat = parser.dmsToDec(lat)
        lon = parser.dmsToDec(lon)
        if lat == "no_geo" or lon == "no_geo":
            continue

        final_list.append([name, date, lat, lon])

# Sort the list by year (item 1 of each entry).
final_list_sorted = sorted(final_list, key=lambda city: city[1])

# Group the cities by founding year once, so that each frame below only looks
# at its own cities instead of scanning the whole list. The order within a
# year is the order of the sorted list.
cities_by_year = {}
for city in final_list_sorted:
    cities_by_year.setdefault(city[1], []).append(city)

# Now the mapping part of the program.
# Start a plot in the PlateCarree projection.
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cartopy.feature.OCEAN)
ax.set_extent([-31, -75, 7, -34])

# This downloads the NaturalEarth data for country borders.
filename = shpr.natural_earth(resolution='110m', category='cultural', name='admin_0_countries')
reader = shpr.Reader(filename)
countries = reader.records()

# Draw every country: Brazil in green, all the others in grey.
for country in countries:
    # NOTE(review): the attribute keys are lower-case here; some Natural Earth
    # releases use upper-case field names ('ADMIN', 'ADM0_A3') — confirm
    # against the downloaded shapefile.
    facecolor = "#2ab23a" if country.attributes['admin'] == 'Brazil' else "#97a39a"
    # add_geometries takes a collection of geometries, hence the list.
    ax.add_geometries([country.geometry], ccrs.PlateCarree(), facecolor=facecolor,
                      label=country.attributes['adm0_a3'])

# Year box: created once, its content is updated for every frame.
text = ax.text(-72, -32, "", color='white', fontsize=18, fontweight='bold',
               bbox={'facecolor': '#040911', 'alpha': 0.7, 'pad': 10})

# Loop through the years 1500 to 2017, saving one image per year. Points are
# never removed, so each image shows every city founded up to that year.
for year in range(1500, 2018):
    text.set_text(str(year))

    # Put the cities founded in the current year on the map.
    for city in cities_by_year.get(year, ()):
        plt.plot(city[3], city[2], marker='.', markersize=2, color='#46125e', alpha=0.8)

    # Save the image file. facecolor='none' is the documented replacement for
    # the deprecated frameon=False (transparent figure background).
    plt.savefig(str(year), facecolor='none', bbox_inches='tight', pad_inches=0, dpi=300)

## ScrapeWiki.py

# Counter for the number of cities.
counter = 0

# Pages are appended to Pages.txt. Both the output and the dump files are
# opened with context managers so they are closed even if reading a dump
# fails part-way through.
with open("Pages.txt", 'a', encoding='utf8') as w_file:

    # There are 4 dump files, numbered 1 to 4.
    for file_num in range(1, 5):
        dump_name = "ptwiki-20170901-pages-meta-current" + str(file_num) + ".txt"

        with open(dump_name, 'r', encoding='utf8') as r_file:
            # True while the lines being read belong to a page being copied.
            flag = False

            for line in r_file:
                # Every infobox for Brazilian cities starts with the header
                # "{{Info/Município do Brasil". From that line on, everything
                # is copied to w_file up to and including the line that
                # contains "</text>".
                if "{{Info/Município do Brasil" in line:
                    counter += 1
                    w_file.write('\n\n### NEW PAGE###' + '\n\n' + str(counter) + '\n')
                    flag = True

                if flag:
                    w_file.write(line)

                    if "</text>" in line:
                        w_file.write("### END PAGE ###")
                        flag = False
	from collections import _itemgetter
	import cartopy.crs as ccrs
	import cartopy
	import matplotlib.pyplot as plt
	import cartopy.io.shapereader as shpr


	class Parser:
	    """Parse fields of a Portuguese-Wikipedia municipality infobox.

	    The methods keep no state: each takes a raw text fragment and returns a
	    plain value. Geographic data that is missing or cannot be parsed is
	    reported with the sentinel string ``"no_geo"``.
	    """

	    # Characters treated as token separators. Each one is mapped to '+' so
	    # that a single split('+') tokenises a value whatever its wiki markup.
	    _DATE_SEPARATORS = str.maketrans(dict.fromkeys("{}[]| '", "+"))
	    _GEO_SEPARATORS = str.maketrans(dict.fromkeys("{}[]| '=-", "+"))

	    # Single-letter compass directions accepted by geoParser.
	    _DIRECTIONS = frozenset("SNnsOLWEolwe")
	    # Directions for which dmsToDec returns a negative coordinate.
	    _NEGATIVE_DIRECTIONS = frozenset("SWOswo")
	    # Infobox labels whose following token is degrees / minutes / seconds.
	    _DEGREE_LABELS = frozenset(("latG", "latg", "lonG", "long"))
	    _MINUTE_LABELS = frozenset(("latM", "latm", "lonM", "lonm"))
	    _SECOND_LABELS = frozenset(("latS", "lats", "lonS", "lons"))

	    def getInfoboxFromText(self, string):
	        """Return the infobox found in ``string`` as a ``{key: value}`` dict.

	        ``string`` is the text of a whole page. Lines are scanned in order and
	        the scan stops at the first line starting with ``}}`` (optionally
	        preceded by one space). A line is read as a field when it starts with
	        ``|`` (after at most two spaces), or when it starts with a comment
	        opener (``&lt;!`` or ``<!``) and contains ``nome``. The key is the
	        text between the first ``|`` and the first ``=``; the value is
	        everything after that ``=``. Both are stripped of spaces, the key
	        ``name`` is stored as ``nome``, and a repeated key overwrites the
	        earlier value. Field lines with no separator at all are skipped.
	        """
	        infobox = {}

	        for line in string.splitlines():
	            if line.startswith(("}}", " }}")):
	                break

	            is_field = line.startswith(("|", " |", "  |"))
	            is_named_comment = (line.startswith(("&lt;!", "<!"))
	                                and "nome" in line)
	            if not (is_field or is_named_comment):
	                continue

	            # Some lines separate key and value with a run of spaces instead
	            # of '='. str.replace returns a new string, so the result must be
	            # rebound for the substitution to have any effect.
	            if "=" not in line and "   " in line:
	                line = line.replace("   ", "=", 1)

	            separator = line.find('=')
	            if separator == -1:
	                # No key/value separator: slicing with -1 would only produce
	                # a truncated, meaningless key.
	                continue

	            # When the line has no '|', find() gives -1 and the key simply
	            # starts at the beginning of the line.
	            key = line[line.find('|') + 1:separator].strip(' ')
	            if key == "name":
	                key = "nome"
	            infobox[key] = line[separator + 1:].strip(' ')

	        return infobox

	    def parseDate(self, string):
	        """Return the year found in ``string`` as an int, or ``""`` if none.

	        Dates come in many formats, so markup characters and spaces are all
	        turned into '+' and the text is split on it. A token counts as a year
	        when it is an integer strictly between 1500 and 2017; if several
	        tokens qualify, the last one is returned.
	        """
	        year = ""

	        for token in string.translate(self._DATE_SEPARATORS).split('+'):
	            try:
	                number = int(token)
	            except ValueError:
	                continue
	            if 1500 < number < 2017:
	                year = number

	        return year

	    def geoParser(self, string):
	        """Return a position as ``"Direction-Degrees-Minutes-Seconds"``.

	        ``string`` is tokenised on markup characters, spaces, ``=`` and ``-``.
	        The direction is the last single-letter compass token; degrees,
	        minutes and seconds are the tokens that follow their respective
	        labels (``latG``/``lonG``, ``latM``/``lonM``, ``latS``/``lonS`` and
	        lower-case variants). Returns ``"no_geo"`` when ``string`` has three
	        characters or fewer, equals ``"False"``, or lacks any of the three
	        numeric parts. The direction may be empty in the returned string.
	        """
	        if len(string) <= 3 or string == "False":
	            return "no_geo"

	        tokens = [token
	                  for token in string.translate(self._GEO_SEPARATORS).split('+')
	                  if token]

	        direct = g = m = s = ""
	        for position, token in enumerate(tokens):
	            if token in self._DIRECTIONS:
	                direct = token
	                continue

	            # A label is only useful when a value follows it; a label that is
	            # the last token is ignored instead of raising IndexError.
	            if position + 1 >= len(tokens):
	                continue
	            following = tokens[position + 1]

	            if token in self._DEGREE_LABELS:
	                g = following
	            elif token in self._MINUTE_LABELS:
	                m = following
	            elif token in self._SECOND_LABELS:
	                s = following

	        if not (g and m and s):
	            return "no_geo"

	        return "-".join((direct, g, m, s))

	    def dmsToDec(self, str):
	        """Convert a ``"Direction-Degrees-Minutes-Seconds"`` string to a float.

	        Returns the decimal position, negated for the directions S, W and O
	        (either case). Returns ``"no_geo"`` when the string has fewer than
	        four ``-``-separated parts or a numeric part is not a valid number.
	        """
	        parts = str.split('-')
	        try:
	            degrees = float(parts[1])
	            minutes = float(parts[2])
	            # Only the first two characters of the seconds field are read, so
	            # anything after them (decimals, stray text) is dropped.
	            seconds = float(parts[3][:2])
	        except (IndexError, ValueError):
	            return 'no_geo'

	        result = degrees + minutes / 60 + seconds / 3600
	        if parts[0] in self._NEGATIVE_DIRECTIONS:
	            result = -result

	        return result

	parser = Parser()

	# Infobox keys to extract, in the order: name, founding date, latitude,
	# longitude. The name is used to recognise pages that appear more than once
	# in the source file (there were some).
	values_to_extract = ['nome', 'fundação', 'latP', 'lonP']
	# Names already accepted; a set keeps the duplicate check cheap.
	values_passed = set()
	final_list = []

	# Read the file created with ScrapeWiki and split it into pages using the
	# "### NEW PAGE###" and "### END PAGE ###" markers.
	page_lines = []
	flag = False
	with open("Pages.txt", 'r', encoding='utf8') as r_file:
	    for line in r_file:
	        if "### NEW PAGE###" in line:
	            flag = True

	        if flag:
	            page_lines.append(line)

	        if "### END PAGE ###" not in line:
	            continue

	        # End of page: parse what was collected and reset the per-page state
	        # immediately, so that skipping a city below can never leave the flag
	        # set and leak text into the next page.
	        dictionary = parser.getInfoboxFromText("".join(page_lines))
	        page_lines = []
	        flag = False

	        # A missing key yields False.
	        name, date, lat, lon = (dictionary.get(key, False)
	                                for key in values_to_extract)

	        # Extract the year, and latitude/longitude in Direction-D-M-S form.
	        date = parser.parseDate(str(date))
	        lat = parser.geoParser(str(lat))
	        lon = parser.geoParser(str(lon))

	        # If there is any missing data, discard the whole city. `not name`
	        # covers both an empty name and the False stored for a missing one.
	        if not name or name == "False" or date == "" \
	                or lat == "no_geo" or lon == "no_geo":
	            continue

	        # Discard repeated pages.
	        if name in values_passed:
	            continue
	        values_passed.add(name)

	        # Convert the geolocation to decimal degrees; discard the city if
	        # either coordinate cannot be converted.
	        lat = parser.dmsToDec(lat)
	        lon = parser.dmsToDec(lon)
	        if lat == "no_geo" or lon == "no_geo":
	            continue

	        final_list.append([name, date, lat, lon])

	# Sort the list by year (item 1 of each entry).
	final_list_sorted = sorted(final_list, key=lambda city: city[1])

	# Group the cities by founding year once, so that each frame below only looks
	# at its own cities instead of scanning the whole list. The order within a
	# year is the order of the sorted list.
	cities_by_year = {}
	for city in final_list_sorted:
	    cities_by_year.setdefault(city[1], []).append(city)

	# Now the mapping part of the program.
	# Start a plot in the PlateCarree projection.
	ax = plt.axes(projection=ccrs.PlateCarree())
	ax.add_feature(cartopy.feature.OCEAN)
	ax.set_extent([-31, -75, 7, -34])

	# This downloads the NaturalEarth data for country borders.
	filename = shpr.natural_earth(resolution='110m', category='cultural', name='admin_0_countries')
	reader = shpr.Reader(filename)
	countries = reader.records()

	# Draw every country: Brazil in green, all the others in grey.
	for country in countries:
	    # NOTE(review): the attribute keys are lower-case here; some Natural Earth
	    # releases use upper-case field names ('ADMIN', 'ADM0_A3') — confirm
	    # against the downloaded shapefile.
	    facecolor = "#2ab23a" if country.attributes['admin'] == 'Brazil' else "#97a39a"
	    # add_geometries takes a collection of geometries, hence the list.
	    ax.add_geometries([country.geometry], ccrs.PlateCarree(), facecolor=facecolor,
	                      label=country.attributes['adm0_a3'])

	# Year box: created once, its content is updated for every frame.
	text = ax.text(-72, -32, "", color='white', fontsize=18, fontweight='bold',
	               bbox={'facecolor': '#040911', 'alpha': 0.7, 'pad': 10})

	# Loop through the years 1500 to 2017, saving one image per year. Points are
	# never removed, so each image shows every city founded up to that year.
	for year in range(1500, 2018):
	    text.set_text(str(year))

	    # Put the cities founded in the current year on the map.
	    for city in cities_by_year.get(year, ()):
	        plt.plot(city[3], city[2], marker='.', markersize=2, color='#46125e', alpha=0.8)

	    # Save the image file. facecolor='none' is the documented replacement for
	    # the deprecated frameon=False (transparent figure background).
	    plt.savefig(str(year), facecolor='none', bbox_inches='tight', pad_inches=0, dpi=300)

	# Counter for the number of cities.
	counter = 0

	# Pages are appended to Pages.txt. Both the output and the dump files are
	# opened with context managers so they are closed even if reading a dump
	# fails part-way through.
	with open("Pages.txt", 'a', encoding='utf8') as w_file:

	    # There are 4 dump files, numbered 1 to 4.
	    for file_num in range(1, 5):
	        dump_name = "ptwiki-20170901-pages-meta-current" + str(file_num) + ".txt"

	        with open(dump_name, 'r', encoding='utf8') as r_file:
	            # True while the lines being read belong to a page being copied.
	            flag = False

	            for line in r_file:
	                # Every infobox for Brazilian cities starts with the header
	                # "{{Info/Município do Brasil". From that line on, everything
	                # is copied to w_file up to and including the line that
	                # contains "</text>".
	                if "{{Info/Município do Brasil" in line:
	                    counter += 1
	                    w_file.write('\n\n### NEW PAGE###' + '\n\n' + str(counter) + '\n')
	                    flag = True

	                if flag:
	                    w_file.write(line)

	                    if "</text>" in line:
	                        w_file.write("### END PAGE ###")
	                        flag = False