César de Pablo zdepablo

## dynet-tagger.py
"""
DyNet implementation of a sequence labeler (POS tagger).
This is a translation of this tagger in PyTorch: https://gist.github.com/hal3/8c170c4400576eb8d0a8bd94ab231232

Basic architecture:
 - take words
 - run through bidirectional GRU
 - predict labels one word at a time (left to right), using a recurrent neural network "decoder"
The decoder updates hidden state based on:
 - most recent word

## install_submodules.R
# Install an R package from a git repository, including its submodules.
#
# Clones `x` recursively into a fresh temporary directory and installs that
# checkout with devtools::install().
#
# Arguments:
#   x    URL (or path) of the git repository to clone.
#   ...  Further arguments passed on to devtools::install().
# Returns the value of devtools::install(); stops with an error when the
# clone does not succeed.
install_submodule_git <- function(x, ...) {
  install_dir <- tempfile()
  # Remove the temporary checkout once installation has finished or failed.
  on.exit(unlink(install_dir, recursive = TRUE), add = TRUE)
  # shQuote() protects URLs/paths containing spaces or shell metacharacters.
  status <- system2(
    "git",
    c("clone", "--recursive", shQuote(x), shQuote(install_dir))
  )
  # Stop early: installing from a missing or partial checkout would only
  # produce a confusing downstream error.
  if (status != 0) {
    stop("git clone failed with status ", status, " for ", x)
  }
  devtools::install(install_dir, ...)
}
install_submodule_git("https://github.com/jonkeane/mocapGrip")

## brand-sentiment.py
import smaclient
from TwitterAPI import TwitterAPI
import matplotlib.pyplot as plt


# Go to http://dev.twitter.com and create an app.
# The consumer key and secret will be generated for you after
# Replace the placeholder strings below with your own credentials; they are
# quoted so that the file still parses before they are filled in.
consumer_key = "<consumer-key>"
consumer_secret = "<consumer-secret>"

## gist:daf71447c82391c1b4311ffcceec2ebe

# java -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=12605 Main # Name of .class program

# Append the CDH Hadoop native-library directory to the library search path
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/pr/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/lib/native

# Run TestSnappy from the fat jar with the JDWP debug agent listening on port 12611
java -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=12611 -cp target/da_record_linkage-0.0.1-SNAPSHOT-jar-with-dependencies.jar da_record_linkage.TestSnappy

# The pattern is quoted so the shell cannot glob-expand it before grep sees it
netstat -plten | grep LISTEN | grep ':120*' # See if there is any open port

## split_strat_scale.r
library(caret)

## Pick the training row indices (80% of the data) while preserving the
## class distribution of yclass
in.train <- createDataPartition(yclass, p = 0.8, list = FALSE)

## Print the class counts: full data first, then training part, then test part
summary(factor(yclass))
ytra <- yclass[in.train]
summary(factor(ytra))
ytst <- yclass[-in.train]
summary(factor(ytst))

## Standardize the training features. The original note says the training
## scaling parameters are meant for the test part; that step is not visible
## here (presumably it follows below -- TODO confirm).
Xtra <- scale(X[in.train, ])

## hive-receipts
-- Overwrite a non-partitioned table with its own contents
-- NOTE(review): xx_COPY is created but not used by the statements shown here.
CREATE TABLE xx_COPY LIKE xx;

INSERT OVERWRITE TABLE xx
SELECT * FROM xx;

-- Overwrite a partitioned table with its own contents
CREATE TABLE xx_COPY LIKE xx;

SHOW PARTITIONS ABC;

## hadoop-fs-receipts
# Reference: http://hadoop.apache.org/docs/r2.7.0/hadoop-project-dist/hadoop-common/FileSystemShell.html
# Glob patterns are quoted so that the local shell passes them through
# unchanged instead of trying to expand them against the local file system.

# Show disk usage in human-readable format
hadoop fs -du -s -h '/user/hive/warehouse/da_cdepablo*'

# Show permissions (ACLs)
hadoop fs -getfacl '/user/hive/warehouse/da_cdepablo*'

# Change permissions: recursively modify the ACL entry for "other" to rwx
hadoop fs -setfacl -R -m other::rwx /user/hive/warehouse/da_cdepablo

## gist:3587a6755b080b85136c
#Number of active users per service - with a cutoff

SELECT  `service`, COUNT(*) num_users
FROM
(
SELECT  `service`, `hash_key`, COUNT(*) num_requests
FROM `log`
WHERE `date_operation` > '2014-12-01'
GROUP BY `service`, `hash_key`
ORDER BY num_requests DESC

## 0_reuse_code.js
// Use Gists to store code you would like to remember later on.
// Example snippet: a single statement that logs `window`.
console.log(window); // log the "window" object to the console

## extractranks.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

from lxml import html,etree
import requests
import unicodecsv

def group(iterator, count):
	itr = iter(iterator)
	while True:
	"""
	DyNet implementation of a sequence labeler (POS tagger).
	This is a translation of this tagger in PyTorch: https://gist.github.com/hal3/8c170c4400576eb8d0a8bd94ab231232

	Basic architecture:
	- take words
	- run through bidirectional GRU
	- predict labels one word at a time (left to right), using a recurrent neural network "decoder"
	The decoder updates hidden state based on:
	- most recent word
	install_submodule_git <- function(x, ...) {
	install_dir <- tempfile()
	system(paste("git clone --recursive", shQuote(x), shQuote(install_dir)))
	devtools::install(install_dir, ...)
	}
	install_submodule_git("https://github.com/jonkeane/mocapGrip")
	import smaclient
	from TwitterAPI import TwitterAPI
	import matplotlib.pyplot as plt


	# Go to http://dev.twitter.com and create an app.
	# The consumer key and secret will be generated for you after
	consumer_key = <consumer-key>
	consumer_secret = <consumer-secret>

	# java -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=12605 Main # Name of .class program

	export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/pr/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/lib/native

	java -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=12611 -cp ta rget/da_record_linkage-0.0.1-SNAPSHOT-jar-with-dependencies.jar da_record_linkage.TestSnappy

	netstat -plten \| grep LISTEN \| grep :120* # See if there is any open port
	library(caret)

	## select training indices preserving class distribution
	in.train <- createDataPartition(yclass, p=0.8, list=FALSE)
	summary(factor(yclass))
	ytra <- yclass[in.train]; summary(factor(ytra))
	ytst <- yclass[-in.train]; summary(factor(ytst))

	## standardize features: training parameters of scaling for test-part
	Xtra <- scale(X[in.train,])
	# Overwrite non-partitioned table with their own contents
	CREATE table xx_COPY LIKE xx;

	INSERT OVERWRITE TABLE xx
	SELECT * FROM xx

	# Overwrite partitioned table with their own contents
	CREATE table xx_COPY LIKE xx;

	SHOW PARTITIONS ABC;
	# Reference: http://hadoop.apache.org/docs/r2.7.0/hadoop-project-dist/hadoop-common/FileSystemShell.html

	# Show disk usage in human format
	hadoop fs -du -s -h /user/hive/warehouse/da_cdepablo*

	# Show permissions
	hadoop fs -getfacl /user/hive/warehouse/da_cdepablo*

	# Change permissions
	hadoop fs -setfacl -R -m other::rwx /user/hive/warehouse/da_cdepablo
	#Number of active users per service - with a cutoff

	SELECT `service`, COUNT(*) num_users
	FROM
	(
	SELECT `service`, `hash_key`, COUNT(*) num_requests
	FROM `log`
	WHERE `date_operation` > '2014-12-01'
	GROUP BY `service`, `hash_key`
	ORDER BY num_requests DESC
	// Use Gists to store code you would like to remember later on
	console.log(window); // log the "window" object to the console
	#!/usr/bin/python
	# -- coding: utf-8 --

	from lxml import html,etree
	import requests
	import unicodecsv

	def group(iterator, count):
	itr = iter(iterator)
	while True: