This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To make this file, I started with the NLTK PTB tokenizer and then did the following:
# - I commented out lines of code that call parts of the NLTK API and copied in the code that is loaded from other files
# - This process makes a file that runs the PTB tokenizer as a single Python file, using the NLTK implementation
# - I made this file on Oct 14, 2020 and do not update it to track any changes in NLTK since then
###################################################################################################### | |
# Natural Language Toolkit: Tokenizers | |
# |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import json | |
import argparse | |
parser = argparse.ArgumentParser(description='parser') | |
parser.add_argument("-jsonl_file") | |
args = parser.parse_args() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
def prob_a_to_b():
    """Draw once from ``random.uniform(0, 1)`` and return 1 if the draw exceeds 0, else 0.

    Because the threshold is 0, the result is 1 for every draw except an
    exact 0.0.

    Returns:
        int: 1 when the draw is greater than 0, otherwise 0.
    """
    return int(random.uniform(0, 1) > 0)
def prob_b_to_a():
    """Draw once from ``random.uniform(0, 1)`` and return 1 if the draw exceeds 0.5, else 0.

    Returns:
        int: 1 when the draw is greater than 0.5, otherwise 0.
    """
    return int(random.uniform(0, 1) > 0.5)
current_state, samples = 0, [] | |
for i in range(10000): | |
samples.append(current_state) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://www.locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/ | |
# Sample token sequence for the n-gram example.
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']
def find_ngrams(input_list, n):
    """Return the consecutive n-grams of ``input_list`` as a ``zip`` of n-tuples.

    Args:
        input_list: A sliceable sequence (e.g. a list of tokens).
        n: Size of each n-gram.

    Returns:
        The ``zip`` of the n suffixes ``input_list[0:]`` ... ``input_list[n-1:]``.
        It yields ``len(input_list) - n + 1`` tuples, and none at all when the
        input is shorter than ``n`` or when ``n`` is 0.
    """
    # zip stops at the shortest suffix, which drops the incomplete trailing windows.
    return zip(*[input_list[i:] for i in range(n)])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# jq: JSONL ↔︎ JSON conversion h/t https://gist.github.com/sloanlance/c3bf746b6396f60d321f5535e1ced892 | |
@tsv => make tsv file | |
@csv => make csv file | |
-c flag => compact instead of pretty printed | |
1. ## JSONL → JSON |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy.optimize import fmin_l_bfgs_b | |
# http://cs.jhu.edu/~jason/tutorials/loglin/formulas.pdf | |
def phi(n_i, e, z): | |
'''One feature needs to be the PMI of n_i''' | |
f1 = not n_i[0].isupper() and z == 0 | |
f2 = not n_i[0].lower() in e.lower() and z == 0 | |
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Approximate permutation test (i.e. don't actually do all permutations), as described in Wasserman's All of Statistics | |
If you spot any bugs, email me! | |
''' | |
import itertools | |
import numpy as np | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# converts spacy tags (as of Jun 21, https://spacy.io/api/annotation) to ark tags | |
# http://www.cs.cmu.edu/~ark/TweetNLP/gimpel+etal.acl11.pdf | |
# If you spot any mistakes in the conversion, email Abe | |
spacy2ark = {} | |
spacy2ark["ADJ"] = "A" | |
spacy2ark["ADP"] = "P" | |
spacy2ark["ADV"] = "R" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%norm f D # delete all chars after a space in vim | |
Vx # fast deletion of line w/o copying deleted to register | |
a (append) to insert text to the right of the cursor | |
A to add text to the end of a line | |
D # Delete the characters under the cursor until the end of the line | |
d f [char] # delete from cursor up to and including char, e.g. d f w deletes from cursor to 1st w |
NewerOlder