Reid Pryzant rpryzant

## process.sh
#! /usr/bin/env bash
# Processes the v2 subtitles/aspec corpora into one combined corpus
# Requirements
# - Processed subtitlesv2 corpus
# - ASPEC
#
# Positional arguments:
#   $1 -> CORPUS1
#   $2 -> CORPUS2
#   $3 -> TARGET (target language code, see below)
# NOTE(review): which corpus (subtitles vs. ASPEC) belongs in $1 vs. $2 is
# not visible in this fragment -- confirm against the rest of the script.

# Inputs are taken verbatim from the command line; nothing is validated here.
CORPUS1=$1
CORPUS2=$2
TARGET=$3    # target language = [ja, zh, fr]
# Hard-coded absolute directory; presumably the corpus root -- TODO confirm.
LOC="/scr/rpryzant/chinese_english_corpora/"

## imdb.py
# working version of https://gist.github.com/jayrambhia/1678382


import urllib2
import json

def search(query):
    """Look up a keyword on the movie_db search endpoint.

    query: keyword substituted as-is into the request URL (no URL-escaping
        is applied here, so callers must pass an already URL-safe value).

    Returns the object produced by json.loads on the UTF-8 decoded
    response body.
    """
    endpoint = 'http://theapache64.xyz:8080/movie_db/search?keyword=%s'
    connection = urllib2.urlopen(endpoint % query)
    raw_body = connection.read()
    return json.loads(raw_body.decode('utf-8'))

## attention
"""
Usage (for our feedforward context):
make sure you initialize the layer with

score_fn='bahdanau'

and then when you use the module in your forward()
method, you can feed it a vector of zeros for your query:

query = torch.zeros(rnn_outputs[:, 0, :].shape)

## gist:561cc1b4d372cce7479fd14290eacbc3

def rm_refs(x):
    """Remove <ref ...>...</ref> spans from a string.

    Every matched span is replaced by a single space. Unbalanced tags are
    handled by temporarily adding the missing counterpart: a dangling
    closing tag is paired with an opener placed at the start of the string,
    and a dangling opener is paired with a closer placed at the end.

    x: input string.

    Returns the cleaned string.
    """
    # Raw string: identical pattern text, but avoids the invalid-escape
    # warnings that '\w' and '\/' trigger in a normal string literal.
    REF_RE = r'<ref([-\w=" <>]+)?>.*?<([ ]+)?\/([ ]+)?ref>'
    x = re.sub(REF_RE, ' ', x)
    # leading </ref>: a closer whose opener is missing; treat everything
    # from the start of the string up to it as the reference body
    if '</ref>' in x:
        x = re.sub(REF_RE, ' ', '<ref>' + x)
    # trailing <ref>: an opener that is never closed; treat the rest of
    # the string as the reference body
    if '<ref' in x:
        x = re.sub(REF_RE, ' ', x + '</ref>')
    # The cleaned text must be handed back; without this the function
    # always evaluated to None.
    return x

## pc_utils.py
from sklearn.decomposition import TruncatedSVD


def compute_pc(X,npc=1):
    """
    Set up the truncated SVD used for principal-component extraction.
    X: numpy array [data, features]
    npc: num principal components
    NOTE(review): the code visible here only constructs the estimator; it
    neither fits it on X nor returns anything -- confirm against the full file.
    """
    svd_settings = {'n_components': npc, 'n_iter': 7, 'random_state': 0}
    svd = TruncatedSVD(**svd_settings)

## gist:a2324dd608c63f1637b1e36a1ffce46d
"""
USAGE

model = build_model()
attributor = Attributor(model, target_class=1, tokenizer=tokenizer)

...
# viz = interactive visualization that you can dump into a file and look at in a web browser
# t2a = map of token to its attribution score
viz, t2a, attrs, y_prob, y_hat = attributor.attr_and_visualize(
	#! /usr/bin/env bash
	# Processes the v2 subtitles/aspec corpora into one combined corpus
	# Requirements
	# - Processed subtitlesv2 corpus
	# - ASPEC
	#
	# Positional arguments:
	#   $1 -> CORPUS1
	#   $2 -> CORPUS2
	#   $3 -> TARGET (target language code, see below)
	# NOTE(review): which corpus (subtitles vs. ASPEC) belongs in $1 vs. $2 is
	# not visible in this fragment -- confirm against the rest of the script.

	# Inputs are taken verbatim from the command line; nothing is validated here.
	CORPUS1=$1
	CORPUS2=$2
	TARGET=$3 # target language = [ja, zh, fr]
	# Hard-coded absolute directory; presumably the corpus root -- TODO confirm.
	LOC="/scr/rpryzant/chinese_english_corpora/"
	# working version of https://gist.github.com/jayrambhia/1678382


	import urllib2
	import json

	def search(query):
		"""Look up a keyword on the movie_db search endpoint.

		query: keyword substituted as-is into the request URL (no URL-escaping
			is applied here, so callers must pass an already URL-safe value).

		Returns the object produced by json.loads on the UTF-8 decoded
		response body.
		"""
		# Body re-indented under the def; the flattened original was not
		# valid Python.
		get_url = 'http://theapache64.xyz:8080/movie_db/search?keyword=%s' % query
		response = urllib2.urlopen(get_url).read().decode('utf-8')
		return json.loads(response)
	"""
	Usage (for our feedforward context):
	make sure you initialize the layer with

	score_fn='bahdanau'

	and then when you use the module in your forward()
	method, you can feed it a vector of zeros for your query:

	query = torch.zeros(rnn_outputs[:, 0, :].shape)

	def rm_refs(x):
		"""Remove <ref ...>...</ref> spans from a string.

		Every matched span is replaced by a single space. Unbalanced tags are
		handled by temporarily adding the missing counterpart: a dangling
		closing tag is paired with an opener placed at the start of the string,
		and a dangling opener is paired with a closer placed at the end.

		x: input string.

		Returns the cleaned string.
		"""
		# Raw string: identical pattern text, but avoids the invalid-escape
		# warnings that '\w' and '\/' trigger in a normal string literal.
		REF_RE = r'<ref([-\w=" <>]+)?>.*?<([ ]+)?\/([ ]+)?ref>'
		x = re.sub(REF_RE, ' ', x)
		# leading </ref>: a closer whose opener is missing
		if '</ref>' in x:
			x = re.sub(REF_RE, ' ', '<ref>' + x)
		# trailing <ref>: an opener that is never closed
		if '<ref' in x:
			x = re.sub(REF_RE, ' ', x + '</ref>')
		# Hand the cleaned text back; the flattened original had no return
		# and its body was not indented under the def.
		return x
	from sklearn.decomposition import TruncatedSVD


	def compute_pc(X,npc=1):
		"""
		Set up the truncated SVD used for principal-component extraction.
		X: numpy array [data, features]
		npc: num principal components
		NOTE(review): the code visible here only constructs the estimator; it
		neither fits it on X nor returns anything -- confirm against the full file.
		"""
		# Body re-indented under the def; the flattened original was not
		# valid Python.
		svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
	"""
	USAGE

	model = build_model()
	attributor = Attributor(model, target_class=1, tokenizer=tokenizer)

	...
	# viz = interactive visualization that you can dump into a file and look at in a web browser
	# t2a = map of token to its attribution score
	viz, t2a, attrs, y_prob, y_hat = attributor.attr_and_visualize(