This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
# Toy ground-truth vector (1..5) and a slightly-off prediction vector
# used to demonstrate the error metric below.
actual = np.arange(1, 6)
predicted = np.array([1.1, 1.9, 2.7, 4.5, 6.0])
def msle(actual: np.ndarray, predicted: np.ndarray) -> float:
    """Return the mean squared logarithmic error between two arrays.

    MSLE = mean((log(1 + actual) - log(1 + predicted)) ** 2)

    Uses ``np.log1p`` instead of ``np.log(1 + x)`` for better numerical
    accuracy when values are close to zero. Inputs must be > -1
    element-wise, as with the original formulation.

    :param actual: ground-truth values
    :param predicted: predicted values, same shape as ``actual``
    :return: the MSLE as a plain Python float
    """
    log_differences = np.log1p(actual) - np.log1p(predicted)
    # mean of the squared log-differences, converted to match the
    # declared ``float`` return type (np.mean returns np.float64)
    return float(np.mean(np.square(log_differences)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
# Example inputs for the metric: true labels 1..5 versus imperfect
# model outputs.
actual = np.arange(1, 6)
predicted = np.array([1.1, 1.9, 2.7, 4.5, 6.0])
def mse(actual: np.ndarray, predicted: np.ndarray) -> float:
    """Return the mean squared error between two arrays.

    MSE = mean((actual - predicted) ** 2)

    :param actual: ground-truth values
    :param predicted: predicted values, same shape as ``actual``
    :return: the mean of the element-wise squared differences
    """
    # operator forms are equivalent to np.subtract / np.square here
    return np.mean((actual - predicted) ** 2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import geopandas as gpd | |
import folium | |
import matplotlib.pyplot as plt | |
# read data: neighbourhood (BAIRRO) features from the Belo Horizonte
# geoserver, requested as GeoJSON over WFS in CRS EPSG:31983
url = 'https://geoservicos.pbh.gov.br/geoserver/wfs?service=WFS&version=1.0.0&request=GetFeature&typeName=ide_bhgeo:BAIRRO&srsName=EPSG:31983&outputFormat=application%2Fjson'
gdf = gpd.read_file(url)
# check if data is right: set up a figure for a quick visual sanity plot
# (the actual gdf.plot(...) call presumably follows — outside this chunk)
fig, ax = plt.subplots(figsize=(10,10))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gender_guesser.detector as gender | |
import pandas as pd | |
# instantiate the gender detector (gender_guesser.detector.Detector)
d = gender.Detector()
# this function adds a gender column for a specific column
def guess_col_gender(col, suff='_gender', df=df, d=d): | |
# extract first names by splitting by ' ' and choosing the first element | |
first_names = [f.split(' ')[0] for f in df[col].tolist()] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# for sending http get requests | |
import requests | |
# for parsing the response in searchable format | |
from bs4 import BeautifulSoup | |
# send a GET request for the Wikipedia list of national anthems
r = requests.get('https://en.wikipedia.org/wiki/List_of_national_anthems')
# parse the raw HTML bytes into a searchable BeautifulSoup tree
# NOTE(review): the 'html5lib' backend requires the html5lib package to
# be installed alongside bs4 — confirm, or fall back to 'html.parser'
soup = BeautifulSoup(r.content, 'html5lib')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim import corpora | |
# Build a stemmed, stopword-filtered corpus, grouped by language and
# then by document (one wiki page per key). Relies on `languages`,
# `texts_split`, `stemmers` and `stopwords_bylang` defined elsewhere.
texts_bylang_byhuman = {}
for lan in languages:
    # hoist the per-language stemmer and stopword set out of the loop
    stem = stemmers[lan].stem
    stop = stopwords_bylang[lan]
    per_doc = {}
    for key, val in texts_split[lan].items():
        per_doc[key] = [stem(word) for word in val if not word in stop]
    texts_bylang_byhuman[lan] = per_doc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.stem import SnowballStemmer | |
# One SnowballStemmer per language code (languages_long maps the short
# code to the full language name SnowballStemmer expects).
stemmers = {code: SnowballStemmer(languages_long[code]) for code in languages}
# stem every token of the full per-language text
text_bylang_stemmed = {
    code: [stemmers[code].stem(token) for token in text_bylang[code]]
    for code in languages
}
# same, but over the stopword-removed token lists
text_bylang_stop_stemmed = {
    code: [stemmers[code].stem(token) for token in text_bylang_stop[code]]
    for code in languages
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords | |
# create one big token list per language for easier handling
# (nested comprehension flattens in linear time; the original
# sum(lists, []) idiom is quadratic in the total number of tokens,
# and only the dict values are needed — not the keys)
text_bylang = {lan: [tok for toks in texts_split[lan].values() for tok in toks]
               for lan in languages}
# long format of languages for stopword identification
languages_long = {'en': 'english', 'de': 'german', 'hu': 'hungarian', 'ro': 'romanian'}
# create dict of stopword sets by language (set gives O(1) membership tests)
stopwords_bylang = {lan: set(stopwords.words(languages_long[lan])) for lan in languages}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.tokenize import RegexpTokenizer | |
# Tokenize every document with a \w+ regex, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
# per language, per page: raw text -> list of word tokens
texts_split = {
    lang: {page: tokenizer.tokenize(raw) for page, raw in texts[lang].items()}
    for lang in languages
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import wikipedia as wp | |
def extract_content_pages(files, page_list, languages=languages): | |
# iterate over languages | |
for lang in languages: | |
print(lang) | |
wp.set_lang(lang) | |
try: | |
files[lang] | |
except KeyError: |
NewerOlder