
Mór Kapronczay morkapronczay

@morkapronczay
morkapronczay / calculate-msle.py
Created March 21, 2023 08:31
A snippet on how to calculate mean squared logarithmic error (MSLE) from scratch.
import numpy as np

actual = np.array([1, 2, 3, 4, 5])
predicted = np.array([1.1, 1.9, 2.7, 4.5, 6])

def msle(actual: np.ndarray, predicted: np.ndarray) -> float:
    log_differences = np.subtract(np.log(1 + actual), np.log(1 + predicted))
    squared_log_differences = np.square(log_differences)
    return np.mean(squared_log_differences)
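As a sanity check, MSLE is simply MSE computed on log1p-transformed values, so the two snippets in this list can validate each other:

# MSLE equals MSE applied to log1p-transformed inputs
assert np.isclose(msle(actual, predicted),
                  np.mean(np.square(np.log1p(actual) - np.log1p(predicted))))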
@morkapronczay
morkapronczay / calculate-mse.py
Last active March 21, 2023 08:30
A snippet on how to calculate mean squared error (MSE) from scratch.
import numpy as np

actual = np.array([1, 2, 3, 4, 5])
predicted = np.array([1.1, 1.9, 2.7, 4.5, 6])

def mse(actual: np.ndarray, predicted: np.ndarray) -> float:
    differences = np.subtract(actual, predicted)
    squared_differences = np.square(differences)
    return np.mean(squared_differences)
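To double-check the from-scratch version, it can be compared against scikit-learn's implementation (assuming scikit-learn is installed):

from sklearn.metrics import mean_squared_error

# the two results should agree up to floating-point tolerance
assert np.isclose(mse(actual, predicted), mean_squared_error(actual, predicted))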
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
# read the data
url = 'https://geoservicos.pbh.gov.br/geoserver/wfs?service=WFS&version=1.0.0&request=GetFeature&typeName=ide_bhgeo:BAIRRO&srsName=EPSG:31983&outputFormat=application%2Fjson'
gdf = gpd.read_file(url)
# plot to check that the data looks right
fig, ax = plt.subplots(figsize=(10, 10))
gdf.plot(ax=ax)  # assumed completion: render the polygons to verify the download
plt.show()
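The folium import suggests an interactive map was intended as well; a minimal sketch, assuming the goal is to show the Belo Horizonte neighbourhoods on a web map (folium expects WGS84 coordinates, so the layer is reprojected first):

# reproject from EPSG:31983 to WGS84, which folium expects
gdf_wgs84 = gdf.to_crs(epsg=4326)
# map center is an assumption: approximate coordinates of Belo Horizonte
m = folium.Map(location=[-19.92, -43.94], zoom_start=11)
folium.GeoJson(gdf_wgs84).add_to(m)
m.save('bairros.html')  # hypothetical output file name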
import gender_guesser.detector as gender
import pandas as pd
# instantiate the detector
d = gender.Detector()

# this function adds a gender column for a specific column
def guess_col_gender(col, df, suff='_gender', d=d):
    # extract first names by splitting on ' ' and taking the first element
    first_names = [f.split(' ')[0] for f in df[col].tolist()]
    # assumed completion of the truncated snippet: look up each first name
    # and store the result in a new column
    df[col + suff] = [d.get_gender(name) for name in first_names]
    return df
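A hypothetical usage example (the DataFrame and its 'name' column are made up for illustration):

df = pd.DataFrame({'name': ['Anna Kovacs', 'Peter Nagy']})
df = guess_col_gender('name', df)
# the new name_gender column holds labels such as 'female', 'male' or 'unknown'
print(df)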
# for sending HTTP GET requests
import requests
# for parsing the response into a searchable format
from bs4 import BeautifulSoup

# send the request
r = requests.get('https://en.wikipedia.org/wiki/List_of_national_anthems')
# parse into a searchable object
soup = BeautifulSoup(r.content, 'html5lib')
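A sketch of how the parsed page might then be queried; the 'wikitable' class is an assumption about the page's markup:

# find the tables on the page (class name assumed from typical Wikipedia markup)
tables = soup.find_all('table', class_='wikitable')
# pull the text of every link in the first table, e.g. country and anthem names
links = [a.get_text() for a in tables[0].find_all('a')]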
from gensim import corpora

# create a stemmed, stopword-removed corpus
# by language, by doc (wiki page)
texts_bylang_byhuman = {
    lan: {
        key: [stemmers[lan].stem(word)
              for word in val if word not in stopwords_bylang[lan]]
        for key, val in texts_split[lan].items()
    }
    for lan in languages
}
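The corpora import hints at the next step; a minimal sketch, assuming one gensim dictionary and bag-of-words corpus is built per language:

# assumed next step: build a dictionary and bag-of-words corpus per language
dictionaries = {lan: corpora.Dictionary(texts_bylang_byhuman[lan].values())
                for lan in languages}
bow_corpora = {lan: [dictionaries[lan].doc2bow(doc)
                     for doc in texts_bylang_byhuman[lan].values()]
               for lan in languages}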
from nltk.stem import SnowballStemmer

# define stemmer objects by language
stemmers = {lan: SnowballStemmer(languages_long[lan]) for lan in languages}

# stem text
text_bylang_stemmed = {lan: [stemmers[lan].stem(word) for word in text_bylang[lan]]
                       for lan in languages}

# stem and remove stopwords
text_bylang_stop_stemmed = {lan: [stemmers[lan].stem(word) for word in text_bylang_stop[lan]]
                            for lan in languages}
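For intuition, a single stemmer call on a made-up example word:

# the Snowball stemmer reduces inflected forms to a common stem
print(SnowballStemmer('english').stem('running'))  # -> 'run'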
from nltk.corpus import stopwords

# create one big list per language for easier handling
text_bylang = {lan: sum([val for key, val in texts_split[lan].items()], [])
               for lan in languages}

# long format of languages for stopword identification
languages_long = {'en': 'english', 'de': 'german', 'hu': 'hungarian', 'ro': 'romanian'}

# create a dict of stopwords by language
stopwords_bylang = {lan: set(stopwords.words(languages_long[lan])) for lan in languages}
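A quick check that the stopword lookup works, assuming 'en' is among the languages:

# 'the' is an English stopword, so this should print True
print('the' in stopwords_bylang['en'])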
from nltk.tokenize import RegexpTokenizer

# tokenize the text, removing punctuation
tokenizer = RegexpTokenizer(r'\w+')
texts_split = {lan: {key: tokenizer.tokenize(text) for key, text in texts[lan].items()}
               for lan in languages}
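A quick illustration of what the tokenizer does, on a made-up sentence:

# punctuation is dropped because \w+ only matches word characters
print(tokenizer.tokenize('God Save the King!'))  # ['God', 'Save', 'the', 'King']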
import wikipedia as wp

def extract_content_pages(files, page_list, languages=languages):
    # iterate over languages
    for lang in languages:
        print(lang)
        wp.set_lang(lang)
        # initialize the per-language dict if it does not exist yet
        try:
            files[lang]
        except KeyError:
            files[lang] = {}
        # assumed completion of the truncated gist: download the content
        # of each page that has not been fetched yet
        for page in page_list:
            if page not in files[lang]:
                files[lang][page] = wp.page(page).content
    return files
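A hypothetical call, assuming page_titles holds the anthem page names collected earlier:

# page_titles is a hypothetical list of Wikipedia page titles
files = extract_content_pages({}, page_titles)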