mr1azl

## Big Query Google Analytics sessions export including site speed metrics.sql
SELECT
  first_sessions.sid                                                               AS sessionId,
  visitorId,
  first_transactions.transactionId                                                 AS transactionId,
  timestamp,
  deviceCategory,
  landingPage,
  pageviews,
  timeOnSite,
  channel,

## faster_toPandas.py
import pandas as pd

def _map_to_pandas(rdds):
    """ Needs to be here due to pickling issues """
    return [pd.DataFrame(list(rdds))]

def toPandas(df, n_partitions=None):
    """
    Returns the contents of `df` as a local `pandas.DataFrame` in a speedy fashion. The DataFrame is
    repartitioned if `n_partitions` is passed.

## ElementWithCount.scala
import org.apache.spark.sql.types.SQLUserDefinedType

@SQLUserDefinedType(udt = classOf[ElementWithCountUDT])
case class ElementWithCount(element:String, count:Int) extends Serializable {

  override def toString: String = {

      Seq(
        element,
        count

## spark-thred-safe.scala
object ServerSparkContext {

  private[this] lazy val _sqlContext = {
    val conf = new SparkConf()
      .setAppName("....")
    val sc = new SparkContext(conf)

    // TODO: Bug in Spark: http://stackoverflow.com/questions/30323212
    val ctx = new HiveContext(sc)
    ctx.setConf("spark.sql.hive.convertMetastoreParquet", "false")

## ubuntu14.04-command-line-install-android-sdk
# install openjdk
sudo apt-get install openjdk-7-jdk

# download android sdk
wget http://dl.google.com/android/android-sdk_r24.2-linux.tgz

tar -xvf android-sdk_r24.2-linux.tgz
cd android-sdk-linux/tools

# install all sdk packages

## df.py
# Load the dataset stored at the given path through the SQL context.
data = sqlContext.load("/home/rxin/ints.parquet")

# Group rows by column "a", aggregate column "a" together with avg("num"),
# and collect the result (the collected value is not bound to a name here).
(
    data.groupBy("a")
    .agg(col("a"), avg("num"))
    .collect()
)

## python
from apscheduler.jobstores.base import JobLookupError
from apscheduler.schedulers.background import BackgroundScheduler
import time


def hello():
    """Print the seconds field (`tm_sec`) of the current local time."""
    now = time.localtime()
    print(now.tm_sec)


def kill_hello(scheduler):

## gist:6e3dbb232bafec0792ba
import scala.language.experimental.macros
import scala.reflect.macros.blackbox.Context

/** Conversion contract between a value of type `T` and a `Map[String, Any]`.
  *
  * NOTE(review): the map keys are presumably the field names of `T`; confirm
  * against the implementation provided by the companion object.
  *
  * @tparam T the type that is converted to and from a map
  */
trait Mappable[T] {
  /** Converts `t` into a `Map[String, Any]`. */
  def toMap(t: T): Map[String, Any]
  /** Builds a `T` from the entries of `map`. */
  def fromMap(map: Map[String, Any]): T
}

object Mappable {


## gist:1972797
$ time find /opt/local/share/emacs/23.4/lisp/ -type f -name \*.gz -exec zgrep --color=yes -H -n -e "conf-mode" {} \;
real 0m12.239s
user 0m7.327s
sys  0m6.804s

$ time find /opt/local/share/emacs/23.4/lisp/ -type f -name \*.gz -exec zgrep --color=yes -H -n -e "conf-mode" {} +
real 0m8.574s
user 0m4.950s
sys  0m5.995s

## gcd_and_lcm.py
# Greatest common divisor of 1 or more numbers.
from functools import reduce


def gcd(*numbers):
    """
    Return the greatest common divisor of 1 or more integers

    Examples
    --------
	SELECT
	first_sessions.sid AS sessionId,
	visitorId,
	first_transactions.transactionId AS transactionId,
	timestamp,
	deviceCategory,
	landingPage,
	pageviews,
	timeOnSite,
	channel,
	import pandas as pd

	def _map_to_pandas(rdds):
	""" Needs to be here due to pickling issues """
	return [pd.DataFrame(list(rdds))]

	def toPandas(df, n_partitions=None):
	"""
	Returns the contents of `df` as a local `pandas.DataFrame` in a speedy fashion. The DataFrame is
	repartitioned if `n_partitions` is passed.
	import org.apache.spark.sql.types.SQLUserDefinedType

	@SQLUserDefinedType(udt = classOf[ElementWithCountUDT])
	case class ElementWithCount(element:String, count:Int) extends Serializable {

	override def toString: String = {

	Seq(
	element,
	count
	object ServerSparkContext {

	private[this] lazy val _sqlContext = {
	val conf = new SparkConf()
	.setAppName("....")
	val sc = new SparkContext(conf)

	// TODO: Bug in Spark: http://stackoverflow.com/questions/30323212
	val ctx = new HiveContext(sc)
	ctx.setConf("spark.sql.hive.convertMetastoreParquet", "false")
	# install openjdk
	sudo apt-get install openjdk-7-jdk

	# download android sdk
	wget http://dl.google.com/android/android-sdk_r24.2-linux.tgz

	tar -xvf android-sdk_r24.2-linux.tgz
	cd android-sdk-linux/tools

	# install all sdk packages
	data = sqlContext.load("/home/rxin/ints.parquet")
	data.groupBy("a").agg(col("a"), avg("num")).collect()
	from apscheduler.jobstores.base import JobLookupError
	from apscheduler.schedulers.background import BackgroundScheduler
	import time


	def hello():
	print(time.localtime().tm_sec)


	def kill_hello(scheduler):
	import scala.language.experimental.macros
	import scala.reflect.macros.blackbox.Context

	trait Mappable[T] {
	def toMap(t: T): Map[String, Any]
	def fromMap(map: Map[String, Any]): T
	}

	object Mappable {
	$ time find /opt/local/share/emacs/23.4/lisp/ -type f -name \*.gz -exec zgrep --color=yes -H -n -e "conf-mode" {} \;
	real 0m12.239s
	user 0m7.327s
	sys 0m6.804s

	$ time find /opt/local/share/emacs/23.4/lisp/ -type f -name \*.gz -exec zgrep --color=yes -H -n -e "conf-mode" {} +
	real 0m8.574s
	user 0m4.950s
	sys 0m5.995s
	# Greatest common divisor of 1 or more numbers.
	from functools import reduce


	def gcd(*numbers):
	"""
	Return the greatest common divisor of 1 or more integers

	Examples
	--------