neil kodner (neilkod), public gists
goal: a data frame (all_team_summary) with team name, mean time in minutes for men, and mean time in minutes for women.
instead of doing this (a tidier route is sketched after the code):
raw_data = read.csv('/Users/nkodner/Dropbox/development/python/2012_mb_corporate_run/data/results_2012.tsv',header=FALSE, sep='\t',stringsAsFactors=FALSE)
names(raw_data) <- c('overall_position','gender_position','bib','name','time','seconds','minutes','gender','team')
male_runners <- raw_data[raw_data$gender == "M",]
female_runners <- raw_data[raw_data$gender == "F",]
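One base-R route to the goal is aggregate plus reshape; a minimal sketch, assuming mean time in minutes lives in the minutes column:

mean_by_gender <- aggregate(minutes ~ team + gender, data = raw_data, FUN = mean)
# pivot gender into columns: one row per team, with minutes.F and minutes.M
all_team_summary <- reshape(mean_by_gender, idvar = "team", timevar = "gender", direction = "wide")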
nkodner@hadoop4 fope$ grep -i josh /usr/share/dict/words
Josh
josh
josher
joshi
Joshua
nkodner@hadoop4 fope$ grep -i bradley !$
grep -i bradley /usr/share/dict/words
Bradley
nkodner@hadoop4 fope$ grep -i jake !$
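(!$ is bash history expansion: it expands to the last argument of the previous command, so each grep reuses /usr/share/dict/words without retyping it.)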
@neilkod
neilkod / gist:2166367
Created March 23, 2012 02:54
alliteration.py
#!/usr/bin/env python
import json, random
from collections import defaultdict

# bucket players whose first and last names start with the same letter, by position
positions = defaultdict(list)
f = open('players.json', 'r').read()
data = json.loads(f)
players = data['body']['players']
for player in players:
    if player['firstname'][0] == player['lastname'][0]:
        positions[player['position']].append(player)
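random is imported but unused in the preview; presumably the truncated remainder of the gist picks from the grouped players. A hedged guess at that step:

# hypothetical follow-up: print one random alliterative player per position
for position, group in positions.items():
    pick = random.choice(group)
    print position, pick['firstname'], pick['lastname']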
@neilkod
neilkod / gist:2159054
Created March 22, 2012 15:33
baseball stats that are 'bad'
API endpoint: http://developer.cbssports.com/documentation/api/files/stats/categories
Out of all of the statistics available through the baseball API, these are the ones flagged as 'bad':
>>> [(x['name'],x['formula'],x['abbr']) for x in data['body']['stats_categories'] if x['is_bad']!=0]
[(u'Walks per Nine', u'BBI * 9 / INN', u'BBd9'), (u'Singles Allowed', '', u'1BA'), (u'Doubles Allowed', '', u'2BA'), (u'Hit Batsmen', '', u'HB'), (u'Strikeouts (Batter)', '', u'KO'), (u'Men on Base/ 9 Innings', u'(HA + BBI + HB) * 9 / INN', u'Rd9'), (u'Wild Pitches', '', u'WP'), (u'Runs Allowed', '', u'RA'), (u'Batting Average Against', u'HA / ABA', u'BAA'), (u'Home Runs Allowed', '', u'HRA'), (u'Balks', '', u'B'), (u'Relief Losses', '', u'RL'), (u'Inherited Runners Scored', '', u'IRS'), (u'Intentional Walks', '', u'IBBI'), (u'Total Bases Allowed', '', u'TBA'), (u'Earned Run Average', u'ER * 9 / INN', u'ERA'), (u'Caught Stealing', '', u'CS'), (u'Hits per Nine', u'HA * 9 / INN', u'Hd9'), (u'Ground Into Double P
# set -e
# if set -e were actually set, then the first time the -f test
# returned non-zero, the entire program would exit with the return
# code from the ssh command. DO NOT WANT
while true
do
    # test remotely for the done-flag; its exit status comes back through ssh
    ssh user@some.server.com "[[ -f $src_dir/nz_${DOMAIN}_done.flag ]]"
    retval=$?
    if [[ $retval -ne 0 ]]
    then
        sleep 60   # flag not there yet; poll again (the gist preview cuts off here, so the interval is a guess)
    else
        break      # flag file exists, carry on
    fi
done
@neilkod
neilkod / gist:1917507
Created February 26, 2012 16:31
bulbs error
using example at http://bulbflow.com/api/
nkodner@hadoop4 gremlin$ sudo easy_install bulbs
Password:
Searching for bulbs
Best match: bulbs 0.2.2
Processing bulbs-0.2.2-py2.7.egg
bulbs 0.2.2 is already the active version in easy-install.pth
Using /Library/Python/2.7/site-packages/bulbs-0.2.2-py2.7.egg
goal:
to copy daily directories and their .dat files from a remote server.
the daily directories don't exist locally.
can rsync create the missing directories for a range of days? (one approach is sketched after the command below)
rsync -xtv -e ssh nkodner@server:/var/opt/dw/data/flurry_analytics_daily/2012-01-1[1-5]/*.dat $DW_DATA_HOME/flurry_analytics_daily
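rsync can create the missing per-day directories if it is told to preserve path structure: --relative (-R) replicates the source path from a /./ anchor onward. A sketch, assuming the same layout as above:

rsync -xtvR -e ssh nkodner@server:/var/opt/dw/data/./flurry_analytics_daily/2012-01-1[1-5]/*.dat $DW_DATA_HOME/

The /./ marks where the preserved path begins, so each matching 2012-01-1x directory is created under $DW_DATA_HOME/flurry_analytics_daily as needed.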
@neilkod
neilkod / gist:1588596
Created January 10, 2012 11:40
trouble with watch_znode_for_changes.py
ZooKeeper shell output appears below, after the Python output
------------------------------------------------
nkodner@hadoop4 zookeeper$ python -v watch_znode_for_changes.py
# installing zipimport hook
import zipimport # builtin
# installed zipimport hook
# /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site.pyc matches /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site.py
import site # precompiled from /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site.pyc
@neilkod
neilkod / gist:1529672
Created December 28, 2011 20:51
write unicode to a file
import codecs

def writeToLogUnicode(logFile, text):
    # append-mode handle that encodes to UTF-8 on write
    fileHandle = codecs.open(logFile, 'a', 'utf-8')
    fileHandle.write(text + '\n')
    fileHandle.close()

writeToLogUnicode('foo.log', u'unicode text goes here')
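A tidier variant of the same idea, assuming Python 2.6+, uses io.open as a context manager so the handle closes even on error:

import io

def write_to_log_unicode(log_file, text):
    # io.open encodes to UTF-8 on write; the with-block closes the handle
    with io.open(log_file, 'a', encoding='utf-8') as fh:
        fh.write(text + u'\n')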
@neilkod
neilkod / parse_fastload_log.py
Created December 15, 2011 22:47
super-hacky script to get the good stuff out of a teradata fastload log
cat parse_fastload_log.py
import re
logfile="/var/opt/sports_dw/dev/td_sports/td_sports_log/pt_email_dim.log"
values = {}
items = ['Total Records Read',
'Total Error Table 1',
'Total Error Table 2',
'Total Inserts Applied',
'Total Duplicate Rows']
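The gist preview ends at the items list; a minimal sketch of the scraping loop it presumably leads into, assuming FastLoad logs these counters as 'label = number' lines:

for line in open(logfile):
    for item in items:
        # hypothetical pattern: the counter label, '=', then an integer count
        match = re.search(re.escape(item) + r'\s*=\s*(\d+)', line)
        if match:
            values[item] = int(match.group(1))
print values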