Skip to content

Instantly share code, notes, and snippets.

@morrisalp
morrisalp / wiktionary_category.py
Last active June 24, 2023 21:05
Get all page names in a given Wiktionary category (e.g. "English lemmas") using the MediaWiki Action API.
import requests
def pages_in_wiktionary_category(category_name, language = 'en'):
    """Yield the title of every page in a Wiktionary category.

    Queries the MediaWiki Action API (``list=categorymembers``) in batches of
    500 and keeps requesting until the response no longer carries a
    ``continue`` object.

    Args:
        category_name: Category name without the ``Category:`` prefix,
            e.g. ``"English lemmas"``.
        language: Subdomain of wiktionary.org to query (default ``'en'``).

    Yields:
        The ``title`` of each category member, in the order the API
        returns them.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        KeyError: If the JSON payload has no ``query``/``categorymembers``
            entry (for example when the API reports an error).
    """
    url = f'https://{language}.wiktionary.org/w/api.php'
    base_params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': f'Category:{category_name}',
        'cmlimit': 500,
        'format': 'json',
    }
    # Passing parameters as a dict lets requests URL-encode them, so category
    # names containing spaces, '&', '#', etc. and continuation tokens
    # containing '|' are transmitted intact.
    continuation = {}
    while True:
        response = requests.get(url, params={**base_params, **continuation})
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        obj = response.json()
        for member in obj['query']['categorymembers']:
            yield member['title']
        if 'continue' not in obj:
            break
        # Send back every key of the 'continue' object (not only
        # 'cmcontinue') when asking for the next batch.
        continuation = obj['continue']
@morrisalp
morrisalp / .vimrc
Last active October 5, 2019 07:08
my personal .vimrc
" Use UTF-8 as Vim's internal character encoding.
set encoding=utf-8
" Start new lines with the indentation of the current line.
set autoindent
" Insert spaces when <Tab> is pressed.
set expandtab
" Display a tab character as 4 columns.
set tabstop=4
" Use 4 columns for each step of (auto)indent, e.g. with >> and <<.
set shiftwidth=4
" Show line numbers.
set number
" Highlight all search matches and show matches while the pattern is typed.
set hlsearch incsearch
" Enhanced command-line completion.
set wildmenu
" Show the partially typed command in the last line of the screen.
set showcmd
" Turn on syntax highlighting.
syntax on
@morrisalp
morrisalp / load_conll2003.py
Created November 19, 2019 17:22
load CONLL2003 dataset using Pandas
import pandas as pd
def read_conll(filename):
    """Load a space-separated, four-column CoNLL file into a DataFrame.

    Each non-blank input line becomes one row with the columns ``TOKEN``,
    ``POS``, ``CHUNK`` and ``NE``. Blank lines are kept while parsing so they
    can act as sentence separators: a ``SENTENCE`` column holds the running
    count of blank lines seen so far, and the blank rows themselves are
    dropped from the result.

    Args:
        filename: Path (or anything ``pandas.read_csv`` accepts) of the file.

    Returns:
        DataFrame with columns ``TOKEN``, ``POS``, ``CHUNK``, ``NE`` and
        ``SENTENCE``, restricted to rows whose token is non-empty.
    """
    column_names = ['TOKEN', 'POS', 'CHUNK', 'NE']
    frame = pd.read_csv(
        filename,
        sep=' ',
        header=None,
        names=column_names,
        keep_default_na=False,   # keep literal strings such as "NA" as text
        quoting=3,               # 3 == csv.QUOTE_NONE: quotes are ordinary characters
        skip_blank_lines=False,  # blank lines must survive to mark boundaries
    )
    # Rows produced by blank lines have an empty token.
    is_separator = frame['TOKEN'] == ''
    frame['SENTENCE'] = is_separator.cumsum()
    return frame[~is_separator]
@morrisalp
morrisalp / transformer.py
Last active February 2, 2021 09:12
minimal TF 2.0 (+ Keras) example of a transformer, based on the Peter Bloem article "Transformers from Scratch" (http://www.peterbloem.nl/blog/transformers)
from tensorflow.keras.layers import Input, Dense, Lambda, Reshape, Activation, Layer, LayerNormalization, Add
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
import tensorflow as tf
# Custom Keras layer (subclass of Layer). Only the constructor is visible in
# this excerpt; the rest of the layer is not shown here.
class SelfAttention(Layer):
# heads: stored unchanged on the instance; defaults to 8
# (presumably the number of attention heads — confirm against the full file).
def __init__(self, heads = 8):
# Initialise the base Layer before setting attributes on the instance.
super().__init__()
self.heads = heads
@morrisalp
morrisalp / spacy_newline.py
Last active February 12, 2020 11:50
spaCy English model with sentence segmentation on newlines
import spacy
# Load the pipeline registered under the 'en' shortcut name.
# NOTE(review): shortcut names such as 'en' were removed in spaCy v3 — confirm
# the installed version; newer releases need a full package name such as
# 'en_core_web_sm'.
nlp = spacy.load('en')
def set_custom_boundaries(doc):
    """Mark the token after every lone newline token as a sentence start.

    Walks over all tokens except the last one; whenever a token's text is
    exactly ``"\n"``, the token at the next index gets
    ``is_sent_start = True``. The document is modified in place.

    Args:
        doc: Indexable, sliceable sequence of tokens exposing ``text``, ``i``
            and a writable ``is_sent_start``.

    Returns:
        The same ``doc`` object that was passed in.
    """
    # The final token is skipped because nothing follows it.
    for current in doc[:-1]:
        if current.text != "\n":
            continue
        following = doc[current.i + 1]
        following.is_sent_start = True
    return doc
@morrisalp
morrisalp / grequests_tqdm.py
Last active September 18, 2023 12:19
send async HTTP requests using grequests with tqdm progress bar
from tqdm import tqdm
import requests, grequests
# Couples a list of URLs with a tqdm progress bar sized to that list.
# Only __init__ and update are visible in this excerpt.
class ProgressSession():
# urls: sized collection; its length becomes the progress bar's total.
def __init__(self, urls):
# One progress-bar step is expected per entry in urls.
self.pbar = tqdm(total = len(urls), desc = 'Making async requests')
self.urls = urls
# r must expose .is_redirect; extra positional/keyword arguments are accepted
# and ignored (presumably so this can serve as a response hook — confirm
# against where it is registered).
def update(self, r, *args, **kwargs):
# Redirect responses do not advance the bar.
if not r.is_redirect:
self.pbar.update()
@morrisalp
morrisalp / html2text.py
Created April 7, 2020 11:45
sane text extraction given html string, using BeautifulSoup
from bs4 import BeautifulSoup as bs
def html2text(html):
    """Extract the visible text from an HTML string.

    ``<script>`` and ``<style>`` elements are removed entirely, every
    ``<br>`` is replaced by a newline character, and the remaining text
    fragments are joined with a single space.

    Args:
        html: HTML markup to parse (parsed with the ``lxml`` parser).

    Returns:
        The extracted text with leading and trailing whitespace stripped.
    """
    tree = bs(html, features='lxml')

    # Drop elements whose content is code or styling rather than page text.
    for node in tree.find_all(["script", "style"]):
        node.decompose()

    # Turn explicit line breaks into newline characters.
    for line_break in tree.find_all("br"):
        line_break.replace_with("\n")

    text = tree.get_text(separator=' ')
    return text.strip()
@morrisalp
morrisalp / simple_bert.py
Last active June 17, 2020 00:53
minimal example of getting BERT embeddings for sentence, using TF 2.0 + Tensorflow Hub + HuggingFace tokenizers library
import tensorflow as tf
import tensorflow_hub as hub
from tokenizers import BertWordPieceTokenizer
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
import numpy as np
# Only the class header and its first constant are visible in this excerpt.
class BERTPreprocessor:
# Literal text of the separator token
# (presumably appended to tokenized input — confirm against the full file).
SEP_TOKEN = '[SEP]'
@morrisalp
morrisalp / top_k_categorical_accuracy.py
Created July 24, 2020 07:46
top K categorical accuracy for numpy arrays (sklearn predict_proba outputs)
import numpy as np
def top_k_categorical_accuracy(y_true, y_pred_proba, k=1):
    """Return the fraction of samples whose true label is among the top-k scores.

    Args:
        y_true: 1-D array-like of integer class indices, one per sample.
        y_pred_proba: 2-D array-like of per-class scores, one row per sample
            and one column per class (column index == class index).
        k: How many of the highest-scoring classes count as a hit.
            Must be at least 1.

    Returns:
        Mean over samples of "true label is in the k best-scoring columns".

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # Guard against k == 0: the slice [:, -0:] would select *every* column
    # and report a perfect score.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Accept plain lists/tuples as well as arrays.
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    # argsort is ascending, so the last k columns hold the k largest scores.
    top_k_labels = np.argsort(y_pred_proba, axis=1)[:, -k:]
    hits = np.equal(top_k_labels, y_true[:, None]).any(axis=1)
    return hits.mean()
@morrisalp
morrisalp / flask_caching_demo.py
Created August 23, 2020 17:59
demo of flask_caching - calculating pi with ζ(2)
from flask import Flask
from flask_caching import Cache
# Create the Flask application object for this module.
app = Flask(__name__)
# Select the cache backend through the "CACHE_TYPE" setting ("simple" here).
# NOTE(review): newer Flask-Caching releases appear to prefer the name
# "SimpleCache" over the "simple" alias — confirm against the installed version.
app.config.from_mapping({"CACHE_TYPE": "simple"})
# Create the Cache object for this app.
cache = Cache(app)
def approximate_pi(n):
output = 0
for i in range(1, n):