Motoki Wu tokestermw

## index.html
<html>
  <head>
    <script src="http://d3js.org/d3.v3.min.js"></script>
    <!-- <script src="http://www.jstat.org/jstat-1.0.0.min.js"></script> -->
    <!-- The source above is down; we need to build our own jStat (https://github.com/jstat/jstat). Meanwhile, use an unreliable mirror: -->
    <script src="https://raw.githubusercontent.com/jstat/jstat/master/dist/jstat.min.js"></script>
  </head>
  <body>
<div id="d3"></div>
<script type="text/javascript">

## DownloadPitchFX.R
# DownloadPitchFX.R
# downloads the massive MLB Gameday data.
# Version 0.5
# Version History
# 0.5 ~ grab player data, both pitchers and batters, ability to pick team
# 0.4 ~ get team data, and ability to grab team info, checks to see if regular season
# 0.3 ~ updated so 2010 works, fixed some bugs, and saves as tab delimited file
# 0.2 ~ inputs are start and end dates
# 0.1 ~ grab Pitch f/x data from MLB Gameday, specify date ranges (takes half a minute for a day's worth of data on my 2.5 GHz machine)

## .emacs
;; Revo64
;(setenv "PATH" (concat (getenv "PATH") ":/sw/bin"))
;(setq exec-path (append '("/opt/REvolution/Revo-3.2/Revo64/R.framework/Resources/")))
;    (setq exec-path (append exec-path '("/sw/bin")))


; comint buffers: leave the prompt editable, and follow the bottom of the
; buffer on both input and output while code is running
(setq comint-prompt-read-only nil
      comint-scroll-to-bottom-on-input t
      comint-scroll-to-bottom-on-output t)

## GetData.jl
module GetData

typealias RaggedMatrix{T} Array{Array{T,1},1}

function parse(filename)
    file = readlines(open(filename))

    x = convert(RaggedMatrix{String},
            [apply(vcat, [rstrip(term) for term in split(line, '\t')[2:]])
             for line in file])

## imbalancedrandomforests.py
from collections import Counter

import numpy as np

from sklearn.metrics import precision_score, recall_score, classification_report, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import balance_weights

## visualizing_topic_models.py
import json
import urlparse
from itertools import chain
flatten = chain.from_iterable

from nltk import word_tokenize

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.tfidfmodel import TfidfModel

## preprocess-twitter.py
"""
preprocess-twitter.py

python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"

Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu

Translation of Ruby script to create features for GloVe vectors for Twitter data.

## moar-data.R

library("lme4")
library("ggplot2")
library("plyr")
library(reshape2)

head(Dyestuff)
#   Batch Yield
# 1     A  1545
# 2     A  1440

## ptb_lm_model.py
B = self.igor.batch_size
R = self.igor.rnn_size
S = self.igor.max_sequence_len
V = self.igor.vocab_size
E = self.igor.embedding_size
### loaded from glove
emb_W = self.igor.embeddings.astype(theano.config.floatX)

## dropout parameters
p_emb = self.igor.p_emb_dropout

## sampling_with_model.py
#!/usr/bin/env python
# coding: utf-8
"""Sampling Sequence Data from model"""

import numpy as np
import tensorflow as tf
import json
import cPickle as pickle
import itertools as it
from rnnlib import PTBModel
	<html>
	<head>
	<script src="http://d3js.org/d3.v3.min.js"></script>
	<!-- <script src="http://www.jstat.org/jstat-1.0.0.min.js"></script> -->
	<!-- The source above is down; we need to build our own jStat (https://github.com/jstat/jstat). Meanwhile, use an unreliable mirror: -->
	<script src="https://raw.githubusercontent.com/jstat/jstat/master/dist/jstat.min.js"></script>
	</head>
	<body>
	<div id="d3"></div>
	<script type="text/javascript">
	# DownloadPitchFX.R
	# downloads the massive MLB Gameday data.
	# Version 0.5
	# Version History
	# 0.5 ~ grab player data, both pitchers and batters, ability to pick team
	# 0.4 ~ get team data, and ability to grab team info, checks to see if regular season
	# 0.3 ~ updated so 2010 works, fixed some bugs, and saves as tab delimited file
	# 0.2 ~ inputs are start and end dates
	# 0.1 ~ grab Pitch f/x data from MLB Gameday, specify date ranges (takes half a minute for a day's worth of data on my 2.5 GHz machine)
	;; Revo64
	;(setenv "PATH" (concat (getenv "PATH") ":/sw/bin"))
	;(setq exec-path (append '("/opt/REvolution/Revo-3.2/Revo64/R.framework/Resources/")))
	; (setq exec-path (append exec-path '("/sw/bin")))


	; comint buffers: leave the prompt editable, and follow the bottom of the
	; buffer on both input and output while code is running
	(setq comint-prompt-read-only nil
	      comint-scroll-to-bottom-on-input t
	      comint-scroll-to-bottom-on-output t)
	module GetData

	typealias RaggedMatrix{T} Array{Array{T,1},1}

	function parse(filename)
	file = readlines(open(filename))

	x = convert(RaggedMatrix{String},
	[apply(vcat, [rstrip(term) for term in split(line, '\t')[2:]])
	for line in file])
	from collections import Counter

	import numpy as np

	from sklearn.metrics import precision_score, recall_score, classification_report, roc_curve
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.datasets import make_classification
	from sklearn.cross_validation import train_test_split
	from sklearn.preprocessing import balance_weights
	import json
	import urlparse
	from itertools import chain
	flatten = chain.from_iterable

	from nltk import word_tokenize

	from gensim.corpora import Dictionary
	from gensim.models.ldamodel import LdaModel
	from gensim.models.tfidfmodel import TfidfModel
	"""
	preprocess-twitter.py

	python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"

	Script for preprocessing tweets by Romain Paulus
	with small modifications by Jeffrey Pennington
	with translation to Python by Motoki Wu

	Translation of Ruby script to create features for GloVe vectors for Twitter data.

	library("lme4")
	library("ggplot2")
	library("plyr")
	library(reshape2)

	head(Dyestuff)
	# Batch Yield
	# 1 A 1545
	# 2 A 1440
	B = self.igor.batch_size
	R = self.igor.rnn_size
	S = self.igor.max_sequence_len
	V = self.igor.vocab_size
	E = self.igor.embedding_size
	### loaded from glove
	emb_W = self.igor.embeddings.astype(theano.config.floatX)

	## dropout parameters
	p_emb = self.igor.p_emb_dropout
	#!/usr/bin/env python
	# coding: utf-8
	"""Sampling Sequence Data from model"""

	import numpy as np
	import tensorflow as tf
	import json
	import cPickle as pickle
	import itertools as it
	from rnnlib import PTBModel