Skip to content

Instantly share code, notes, and snippets.

View AbeHandler's full-sized avatar

Abe Handler AbeHandler

View GitHub Profile
# To make this file, I started with the NLTK PTB tokenizer and then did the following:
# - I commented out the lines of code that call parts of the NLTK API and copied in the code that those lines load from other files.
# - The result is a single Python file that runs the PTB tokenizer using the NLTK implementation.
# - I made this file on Oct 14, 2020 and do not update it to track any changes made to NLTK since then.
######################################################################################################
# Natural Language Toolkit: Tokenizers
#
@AbeHandler
AbeHandler / jsonl2tsv.py
Created January 16, 2020 17:28
convert jsonl file to tsv
import csv
import json
import argparse
# Command-line interface for the script: one optional argument.
parser = argparse.ArgumentParser(description='parser')
# The option is spelled with a single leading dash ("-jsonl_file"); argparse
# stores its value on args.jsonl_file, which is None when the option is omitted.
# Presumably this is the path of the JSONL input file -- confirm against its use.
parser.add_argument("-jsonl_file")
# Runs at module top level, so sys.argv is parsed as soon as the file executes.
args = parser.parse_args()
import random
def prob_a_to_b():
    """Draw one uniform sample on [0, 1] and return 1 if it is strictly positive, else 0."""
    draw = random.uniform(0, 1)
    return 1 if draw > 0 else 0


def prob_b_to_a():
    """Draw one uniform sample on [0, 1] and return 1 if it exceeds 0.5, else 0."""
    draw = random.uniform(0, 1)
    return 1 if draw > .5 else 0
current_state, samples = 0, []
for i in range(10000):
samples.append(current_state)
# http://www.locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']
def find_ngrams(input_list, n):
    """Return an iterator over the consecutive n-grams of ``input_list``.

    Args:
        input_list: A sliceable sequence (e.g. a list of tokens).
        n: The size of each n-gram.

    Returns:
        A ``zip`` iterator yielding tuples of ``n`` adjacent items, in order.
        It yields nothing when ``input_list`` has fewer than ``n`` items or
        when ``n`` is not positive.
    """
    # Build n copies of the sequence, the i-th shifted left by i positions;
    # zipping them pairs each item with its n - 1 successors and stops at the
    # shortest (most shifted) copy, so no partial n-grams are produced.
    return zip(*[input_list[i:] for i in range(n)])
# jq: JSONL ↔︎ JSON conversion h/t https://gist.github.com/sloanlance/c3bf746b6396f60d321f5535e1ced892
@tsv => make tsv file
@csv => make csv file
-c flag => compact instead of pretty printed
1. ## JSONL → JSON
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
# http://cs.jhu.edu/~jason/tutorials/loglin/formulas.pdf
def phi(n_i, e, z):
'''One feature needs to be the PMI of n_i'''
f1 = not n_i[0].isupper() and z == 0
f2 = not n_i[0].lower() in e.lower() and z == 0
@AbeHandler
AbeHandler / mixture_of_multinomials.ipynb
Last active August 9, 2019 19:11
mixture_of_multinomials w/ EM
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
'''
Approximate permutation test (i.e. don't actually do all permutations), as described in Wasserman's All of Statistics
If you spot any bugs, email me!
'''
import itertools
import numpy as np
# converts spacy tags (as of Jun 21, https://spacy.io/api/annotation) to ark tags
# http://www.cs.cmu.edu/~ark/TweetNLP/gimpel+etal.acl11.pdf
# If you spot any mistakes in the conversion, email Abe
# Lookup table: spaCy tag -> ARK tag.
spacy2ark = {
    "ADJ": "A",
    "ADP": "P",
    "ADV": "R",
}
%norm f D # delete all chars after a space in vim
Vx # fast deletion of line w/o copying deleted to register
a (append) to insert text to the right of the cursor
A to add text to the end of a line
D # Delete the characters under the cursor until the end of the line
d f [char] # delete from cursor up to and including char, e.g. d f w deletes from cursor to 1st w