Mór Kapronczay (morkapronczay)
@morkapronczay
morkapronczay / overpass_query.py
Created April 23, 2019 12:23
Overpass query for I. kerület in Budapest
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = """
[out:json];
area["ISO3166-1"="HU"][admin_level=2];
(
  rel["name"="I. kerület"](area);
);
out center;
"""
def json_to_geojson(data, districts):
    # create a geojson from a list of dictionaries
    # containing coordinates with the name of the polygon
    # in our case a polygon is a district
    assert type(data) == list, "The parameter data should be a list of coordinates with a name argument!"
    geojson = {
        "type": "FeatureCollection",
        "features": [
            {
                # the body below this point is reconstructed from the comments above
                "type": "Feature",
                "properties": {"name": d["name"]},
                "geometry": {"type": "Polygon", "coordinates": [d["coordinates"]]},
            }
            for d in data
        ],
    }
    return geojson
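A hypothetical call, assuming each entry carries a name and a list of [lon, lat] pairs (both keys and the coordinates are illustrative):

sample = [{"name": "I. kerület",
           "coordinates": [[19.03, 47.49], [19.04, 47.49], [19.04, 47.50], [19.03, 47.49]]}]
geojson = json_to_geojson(sample, districts=["I. kerület"])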
import folium
import numpy as np

# create a map centred on the median district centroid
m = folium.Map(location=[np.median(gdf_income_tax['centroid_lat'].tolist()),
                         np.median(gdf_income_tax['centroid_lon'].tolist())],
               tiles='Stamen Toner', zoom_start=12)
## add choropleth layer
m.choropleth(
    geo_data=geojson,
    name='Income Tax Per Capita',
    data=gdf_income_tax,
    # the arguments below the truncation point are reconstructed; the column names are assumptions
    columns=['name', 'income_tax_per_capita'],
    key_on='feature.properties.name',
    fill_color='YlGn',
)
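To round the map off, a layer toggle and an HTML export can follow; the file name here is illustrative:

# add a layer control and write the map to a standalone HTML file
folium.LayerControl().add_to(m)
m.save('income_tax_map.html')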
import wikipedia as wp

def extract_content_pages(files, page_list, languages=languages):
    # iterate over languages
    for lang in languages:
        print(lang)
        wp.set_lang(lang)
        try:
            files[lang]
        except KeyError:
            # the body below the truncation point is reconstructed:
            # start an empty per-language dict, then fetch each page's content
            files[lang] = {}
        for page in page_list:
            files[lang][page] = wp.page(page).content
    return files
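A hypothetical call (the page title is illustrative):

files = extract_content_pages({}, ['Himnusz'], languages=['en', 'hu'])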
from nltk.tokenize import RegexpTokenizer
# tokenized text - remove punctuation
tokenizer = RegexpTokenizer(r'\w+')
texts_split = {lan: {key: tokenizer.tokenize(text) for key, text in texts[lan].items()} for lan in languages}
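For reference, the \w+ pattern keeps runs of word characters and drops punctuation:

from nltk.tokenize import RegexpTokenizer

RegexpTokenizer(r'\w+').tokenize("Isten, áldd meg a magyart!")
# -> ['Isten', 'áldd', 'meg', 'a', 'magyart']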
from nltk.corpus import stopwords
# create one big list per language for easier handling
text_bylang = {lan: sum([val for key, val in texts_split[lan].items()], []) for lan in languages}
# long format of languages for stopword identification
languages_long = {'en': 'english', 'de': 'german', 'hu': 'hungarian', 'ro': 'romanian'}
# create dict of stopwords by language
stopwords_bylang = {lan: set(stopwords.words(languages_long[lan])) for lan in languages}
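The stopword lists ship as a separate NLTK corpus, so stopwords.words needs a one-time download first:

import nltk

# one-time download of the stopword corpus used above
nltk.download('stopwords')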
from nltk.stem import SnowballStemmer

# define stemmer objects by language
stemmers = {lan: SnowballStemmer(languages_long[lan]) for lan in languages}
# stem text
text_bylang_stemmed = {lan: [stemmers[lan].stem(word) for word in text_bylang[lan]] for lan in languages}
# remove stopwords without stemming (definition implied by the line below)
text_bylang_stop = {lan: [word for word in text_bylang[lan] if word not in stopwords_bylang[lan]] for lan in languages}
# stem and remove stopwords
text_bylang_stop_stemmed = {lan: [stemmers[lan].stem(word) for word in text_bylang_stop[lan]] for lan in languages}
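A quick sanity check on one of the stemmers:

from nltk.stem import SnowballStemmer

SnowballStemmer('english').stem('running')  # -> 'run'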
from gensim import corpora

# create stemmed, stopword-removed corpus
# by language by doc (wiki page)
texts_bylang_byhuman = {lan:
                        {key:
                         [stemmers[lan].stem(word)
                          for word in val if word not in stopwords_bylang[lan]]
                         for key, val in texts_split[lan].items()}
                        for lan in languages}
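The corpora import suggests a bag-of-words corpus comes next; a minimal sketch for one language, assuming the structure above:

# build a gensim dictionary and bag-of-words corpus from the English documents
dictionary = corpora.Dictionary(texts_bylang_byhuman['en'].values())
bow_corpus = [dictionary.doc2bow(doc) for doc in texts_bylang_byhuman['en'].values()]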
# for sending http get requests
import requests
# for parsing the response in searchable format
from bs4 import BeautifulSoup
# send the request
r = requests.get('https://en.wikipedia.org/wiki/List_of_national_anthems')
# parse into searchable object
soup = BeautifulSoup(r.content, 'html5lib')
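From here the anthem table can be located in the parsed tree; a minimal sketch, assuming the list is rendered as a standard wikitable (the class name is an assumption about the page's markup):

# find the first wikitable on the page and collect the text of each row
table = soup.find('table', {'class': 'wikitable'})
rows = [row.get_text(strip=True) for row in table.find_all('tr')]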
import gender_guesser.detector as gender
import pandas as pd

# instantiate the detector
d = gender.Detector()

# this function adds a gender column for a specific column
def guess_col_gender(col, suff='_gender', df=df, d=d):
    # extract first names by splitting by ' ' and choosing the first element
    first_names = [f.split(' ')[0] for f in df[col].tolist()]
    # below the truncation point is reconstructed: guess a gender per first name
    df[col + suff] = [d.get_gender(name) for name in first_names]
    return df
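A hypothetical usage, assuming a DataFrame with a composer column (the column name and data are illustrative):

df = pd.DataFrame({'composer': ['Ferenc Erkel', 'Joseph Haydn']})
df = guess_col_gender('composer', df=df, d=gender.Detector())
# df['composer_gender'] holds labels such as 'male', 'female' or 'unknown'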