Abe Handler AbeHandler

## NLTK's PTB Tokenizer As Standalone File
# To make this file, I started with the NLTK PTB tokenizer and then did the following:
#      - I commented out lines of code that call parts of the NLTK API and copied in the code that is loaded from other files
#      - This process makes a file that runs the PTB tokenizer as a single Python file, using the NLTK implementation
#      - I made this file on Oct 14, 2020 and don't update it based on any changes in NLTK since then


######################################################################################################

# Natural Language Toolkit: Tokenizers
#

## jsonl2tsv.py
import csv
import json

import argparse

# Command-line interface: a single optional flag, -jsonl_file.
parser = argparse.ArgumentParser(description='parser')

# The leading dash makes this an optional argument; argparse stores its value as args.jsonl_file.
parser.add_argument("-jsonl_file")

# Called with no arguments, so the command line (sys.argv) is parsed as soon as this line runs.
args = parser.parse_args()

## markov_sample.py
import random

def prob_a_to_b():
    """Return 1 if a uniform draw from [0, 1] is strictly greater than 0, else 0."""
    draw = random.uniform(0, 1)
    return 1 if draw > 0 else 0

def prob_b_to_a():
    """Return 1 if a uniform draw from [0, 1] is strictly greater than 0.5, else 0."""
    draw = random.uniform(0, 1)
    if draw > .5:
        return 1
    return 0

# The state starts at 0; `samples` collects one entry per loop iteration.
current_state = 0
samples = []

# Record the current state 10000 times.
for i in range(10000):
    samples += [current_state]

## python ngrams
# http://www.locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/

input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

def find_ngrams(input_list, n):
    """Zip together n progressively shifted slices of input_list.

    Each tuple produced holds n consecutive items, so the result walks over
    every n-gram of the sequence in order. Nothing is produced when n is 0
    or when n exceeds the length of input_list.
    """
    shifted_views = (input_list[offset:] for offset in range(n))
    return zip(*shifted_views)

## jq cheatsheet
# jq: JSONL ↔︎ JSON conversion  h/t https://gist.github.com/sloanlance/c3bf746b6396f60d321f5535e1ced892


@tsv => make tsv file

@csv => make csv file

-c flag => compact instead of pretty printed

1. ## JSONL → JSON

## loglin gradient example
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
# http://cs.jhu.edu/~jason/tutorials/loglin/formulas.pdf

def phi(n_i, e, z):
    '''One feature needs to be the PMI of n_i

    Computes truth-valued features from n_i, e and z.

    NOTE(review): only f1 and f2 are visible here and nothing is returned;
    this looks like an excerpt of a longer function -- confirm against the
    full source before relying on it.
    '''

    # f1 holds when n_i[0].isupper() is false and z equals 0.
    f1 = not n_i[0].isupper() and z == 0
    # f2 holds when lowercased n_i[0] does not occur in lowercased e and z equals 0.
    f2 = not n_i[0].lower() in e.lower() and z == 0


## mixture_of_multinomials.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                AbeHandler
                / mixture_of_multinomials.ipynb
            
            
              Last active
              August 9, 2019 19:11
            
              
                mixture_of_multinomials w/ EM
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## permutation test
'''
Approximate permutation test (i.e. don't actually do all permutations), as described in Wasserman's All of Statistics

If you spot any bugs, email me!
'''

import itertools
import numpy as np


## spacy2ark tag conversion
# converts spacy tags (as of Jun 21, https://spacy.io/api/annotation) to ark tags
# http://www.cs.cmu.edu/~ark/TweetNLP/gimpel+etal.acl11.pdf

# If you spot any mistakes in the conversion, email Abe

# Lookup table: spaCy tag name (key) -> ark tag letter (value).
spacy2ark = {
    "ADJ": "A",
    "ADP": "P",
    "ADV": "R",
}

## vim commands
 %norm f D # on every line, delete from the first space (found by `f `) to the end of the line in vim

Vx # fast deletion of line w/o copying deleted to register

a (append) to insert text to the right of the cursor
A to add text to the end of a line

D	# Delete the characters under the cursor until the end of the line

d f [char] # delete from cursor up to and including char, e.g. d f w deletes from cursor to 1st w
	# To make this file, I started with the NLTK PTB tokenizer and then did the following:
	# - I commented out lines of code that call parts of the NLTK API and copied in the code that is loaded from other files
	# - This process makes a file that runs the PTB tokenizer as a single Python file, using the NLTK implementation
	# - I made this file on Oct 14, 2020 and don't update it based on any changes in NLTK since then


	######################################################################################################

	# Natural Language Toolkit: Tokenizers
	#
	import csv
	import json

	import argparse

	# Command-line interface: a single optional flag, -jsonl_file.
	parser = argparse.ArgumentParser(description='parser')

	# The leading dash makes this an optional argument; argparse stores its value as args.jsonl_file.
	parser.add_argument("-jsonl_file")

	# Called with no arguments, so the command line (sys.argv) is parsed as soon as this line runs.
	args = parser.parse_args()
	import random

	def prob_a_to_b():
		"""Return 1 if a uniform draw from [0, 1] is strictly greater than 0, else 0."""
		draw = random.uniform(0, 1)
		return 1 if draw > 0 else 0

	def prob_b_to_a():
		"""Return 1 if a uniform draw from [0, 1] is strictly greater than 0.5, else 0."""
		draw = random.uniform(0, 1)
		if draw > .5:
			return 1
		return 0

	# The state starts at 0; `samples` collects one entry per loop iteration.
	current_state, samples = 0, []

	for i in range(10000):
		# Must be nested under the loop header: at the header's own indent
		# level the `for` statement has no body and the file fails to parse.
		samples.append(current_state)
	# http://www.locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/

	input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

	def find_ngrams(input_list, n):
		"""Zip together n progressively shifted slices of input_list.

		Each tuple produced holds n consecutive items, so the result walks
		over every n-gram of the sequence in order. Nothing is produced when
		n is 0 or when n exceeds the length of input_list.
		"""
		# The body has to be indented under `def`; left at the same level as
		# the header, the definition is a syntax error.
		return zip(*[input_list[i:] for i in range(n)])
	# jq: JSONL ↔︎ JSON conversion h/t https://gist.github.com/sloanlance/c3bf746b6396f60d321f5535e1ced892


	@tsv => make tsv file

	@csv => make csv file

	-c flag => compact instead of pretty printed

	1. ## JSONL → JSON
	import numpy as np
	from scipy.optimize import fmin_l_bfgs_b
	# http://cs.jhu.edu/~jason/tutorials/loglin/formulas.pdf

	def phi(n_i, e, z):
		'''One feature needs to be the PMI of n_i

		Computes truth-valued features from n_i, e and z.

		NOTE(review): only f1 and f2 are visible here and nothing is returned;
		this looks like an excerpt of a longer function -- confirm against the
		full source before relying on it.
		'''

		# The docstring and body are nested under `def`; at the header's own
		# indent level the definition does not parse.
		# f1 holds when n_i[0].isupper() is false and z equals 0.
		f1 = not n_i[0].isupper() and z == 0
		# f2 holds when lowercased n_i[0] does not occur in lowercased e and z equals 0.
		f2 = n_i[0].lower() not in e.lower() and z == 0
	'''
	Approximate permutation test (i.e. don't actually do all permutations), as described in Wasserman's All of Statistics

	If you spot any bugs, email me!
	'''

	import itertools
	import numpy as np
	# converts spacy tags (as of Jun 21, https://spacy.io/api/annotation) to ark tags
	# http://www.cs.cmu.edu/~ark/TweetNLP/gimpel+etal.acl11.pdf

	# If you spot any mistakes in the conversion, email Abe

	# Lookup table: spaCy tag name (key) -> ark tag letter (value).
	spacy2ark = {
		"ADJ": "A",
		"ADP": "P",
		"ADV": "R",
	}
	%norm f D # on every line, delete from the first space (found by `f `) to the end of the line in vim

	Vx # fast deletion of line w/o copying deleted to register

	a (append) to insert text to the right of the cursor
	A to add text to the end of a line

	D # Delete the characters under the cursor until the end of the line

	d f [char] # delete from cursor up to and including char, e.g. d f w deletes from cursor to 1st w