Skip to content

Instantly share code, notes, and snippets.

View mayhewsw's full-sized avatar

Stephen Mayhew mayhewsw

View GitHub Profile
@mayhewsw
mayhewsw / get_size.sh
Created October 11, 2020 00:52
Get Download Sizes of All Wikipedias
# Print "<lang> <bytes>" for the pages-articles dump of every Wikipedia
# language code listed (one per line) in langs.txt.
DUMP_DATE="20201001"
# -r keeps backslashes literal; the `|| [ -n ... ]` clause still processes a
# final line that has no trailing newline.
while read -r lng || [ -n "$lng" ]; do
    # Skip blank lines so we never request a malformed URL.
    [ -z "$lng" ] && continue
    URL="https://dumps.wikimedia.org/${lng}wiki/${DUMP_DATE}/${lng}wiki-${DUMP_DATE}-pages-articles.xml.bz2"
    # HEAD request only (-I): read the size from the response headers without
    # downloading the dump. Header lines end in CRLF, so strip the trailing
    # carriage return from the value before printing it.
    curl -sI "$URL" | awk -v l="$lng" 'tolower($1) == "content-length:" { sub(/\r$/, "", $2); print l " " $2 }'
done < langs.txt
@mayhewsw
mayhewsw / flair_char_indexer.py
Created July 15, 2019 19:20
Flair in Allennlp (quick and dirty)
from typing import Dict, List, Tuple, Any
from flair.data import Dictionary
from flair.embeddings import FlairEmbeddings
from overrides import overrides
from allennlp.common.checks import ConfigurationError
from allennlp.common.util import pad_sequence_to_length
from allennlp.data.tokenizers.token import Token
from allennlp.data.token_indexers.token_indexer import TokenIndexer
@mayhewsw
mayhewsw / combine.py
Created June 27, 2019 14:23
Combine two word vector text files (esp from different languages)
#!/usr/bin/python
import sys
def combine(f1, f2, outf, limit=-1, dim=64):
words1 = {}
words2 = {}
with open(f1) as f:
for i,line in enumerate(f):
if i > limit > -1:
@mayhewsw
mayhewsw / display_pytorch_matrics.py
Created May 2, 2019 17:15
Display pytorch matrices using matplotlib.
import torch
import matplotlib.pyplot as plt
import numpy as np
# I learned this one in allennlp, hence the name.
# Path to the checkpoint file to inspect.
p = "path/to/model/best.th"
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# machine without CUDA. NOTE: torch.load unpickles the file, so only open
# checkpoints that come from a trusted source.
w = torch.load(p, map_location="cpu")
# Print the name of every entry stored in the checkpoint
# (assumes the checkpoint is a dict-like object such as a state dict).
for k in w:
    print(k)
@mayhewsw
mayhewsw / reader.java
Created April 24, 2019 19:54
Reader for LLF dictionaries, probably.
import edu.illinois.cs.cogcomp.core.algorithms.LevensteinDistance;
import edu.illinois.cs.cogcomp.core.datastructures.Pair;
import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotationUtilities;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View;
import edu.illinois.cs.cogcomp.core.io.LineIO;
import edu.illinois.cs.cogcomp.core.utilities.SerializationHelper;
import org.apache.commons.io.FilenameUtils;
@mayhewsw
mayhewsw / allennlp_vocab_test.ipynb
Last active April 5, 2019 22:40
Allennlp Vocabulary Tests.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@mayhewsw
mayhewsw / vector_select.sh
Created March 20, 2019 19:41
Assuming a text-vector file with header line, this will help you select the number of vectors you want, and clean a little.
# Number of vector lines you want
N=50000
# Input path (first argument); abort with a usage message when it is missing
# instead of letting `head` silently wait on stdin.
IN=${1:?usage: $0 IN OUT}
# Output path (second argument).
OUT=$2
# Get the dimension from the header: the second space-separated field of the
# first line. The path is quoted so names with spaces or glob characters work.
DIM=$(head -n 1 "$IN" | cut -d' ' -f2)
# Actually take the top...
@mayhewsw
mayhewsw / ner_allennlp_test.py
Last active December 22, 2018 23:11
Use Allennlp for NER programmatically, and test the runtime.
from allennlp.predictors.predictor import Predictor
import time
# URL of the NER model archive (.tar.gz) used by this script.
# NOTE(review): presumably handed to Predictor further down — confirm.
model = "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.04.26.tar.gz"
# Progress message; the load itself is not visible in this excerpt.
print("Loading model...")
@mayhewsw
mayhewsw / codenames.py
Created January 2, 2018 20:27
Codenames clue giver
# coding: utf-8
from gensim.models import KeyedVectors
# Downloaded from fasttext: https://fasttext.cc/docs/en/english-vectors.html
# Converted to word2vec binary format for faster loading (see convert.py)
# binary=True matches that converted format; the vectors are loaded once at
# import time and kept in the module-level name `vec`.
vec = KeyedVectors.load_word2vec_format("~/data/wiki-news-300d-1M.vec.bin", binary=True)
from itertools import combinations
from nltk.stem import WordNetLemmatizer,PorterStemmer
# Module-level NLTK helpers, created once and shared.
# NOTE(review): presumably used to normalize words before comparing them — confirm.
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()
@mayhewsw
mayhewsw / translate.py
Created August 10, 2016 22:04
Google API Word Translation
#!/usr/bin/python
from googleapiclient.discovery import build
import codecs
import HTMLParser
import shelve
# As of Aug 1 2016
# Placeholder value: replace it with your own API key before running.
# NOTE(review): presumably passed to googleapiclient's build() — confirm.
API_KEY = "YOUR_API_KEY_HERE"