Mike Sukmanowsky (msukmanowsky)
@msukmanowsky
msukmanowsky / custom_code_bolt.py
Last active August 29, 2015 14:05
A custom code execution bolt, not yet tested.
import logging
from streamparse.bolt import Bolt
log = logging.getLogger("custom_code_bolt")
class CustomCodeBolt(Bolt):
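    # --- The preview cuts off here; below is a hedged sketch of what the
    # --- process() body might look like, not the original gist code.
    def process(self, tup):
        # Assumes each tuple carries a string of Python source to execute.
        code = tup.values[0]
        scope = {}
        try:
            exec(code, scope)                    # run the user-supplied code
            self.emit([scope.get("result")])     # emit whatever it produced
        except Exception:
            log.exception("Failed to execute custom code")
            self.fail(tup)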
@msukmanowsky
msukmanowsky / storm_version.py
Last active August 29, 2015 14:07
Parse Apache Storm versions in Python and do easy comparisons on them. You could probably even import something from https://github.com/pypa/pip/blob/19e29fc2e8e57a671e584726655bbb42c6e15eee/pip/_vendor/distlib/version.py and it'd work just fine, but I haven't tested that.
import re
class InvalidVersionException(Exception): pass
class StormVersion(object):
VERSION_RE = re.compile(r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)"
"(?P<older_patch>\.\d+)?(?P<other>.*)")
RC_RE = re.compile(r"-rc(?P<release_candidate>\d+)", re.IGNORECASE)
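    # --- The preview cuts off here; the rest is a hedged sketch of how
    # --- parsing and comparison might work, not the original gist code.
    def __init__(self, version_string):
        match = self.VERSION_RE.match(version_string)
        if not match:
            raise InvalidVersionException(version_string)
        self.major = int(match.group("major"))
        self.minor = int(match.group("minor"))
        self.patch = int(match.group("patch"))

    def __lt__(self, other):
        # Plain tuple comparison gives the usual major/minor/patch ordering.
        return ((self.major, self.minor, self.patch) <
                (other.major, other.minor, other.patch))

# e.g. StormVersion("0.9.2") < StormVersion("0.10.0")  ->  True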
@msukmanowsky
msukmanowsky / CassandraConverters.scala
Last active August 29, 2015 14:08
Custom version of CassandraConverters.scala from spark/examples/src/main/scala/org/apache/spark/examples/pythonconverters/. Provides better (though not perfect) serialization of keys and values for CqlOutputFormat.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
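The preview only shows the license header. For context, converters like these are what a PySpark job names when writing to Cassandra through CqlOutputFormat; a rough sketch of that calling side is below (the Hadoop conf is abbreviated and the converter class names are assumed to follow the stock Spark example package):

from pyspark import SparkContext

sc = SparkContext(appName="cassandra_write")

# The real conf also needs the Cassandra host/port, keyspace and output CQL;
# only the output format class is spelled out here.
conf = {"mapreduce.outputformat.class":
        "org.apache.cassandra.hadoop.cql3.CqlOutputFormat"}

# CqlOutputFormat expects (key map, bound-values list) pairs.
rows = sc.parallelize([({"user_id": 1}, ["Mike"])])
rows.saveAsNewAPIHadoopDataset(
    conf=conf,
    keyConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLKeyConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLValueConverter")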
@msukmanowsky
msukmanowsky / pyspark_cassandra.py
Last active August 29, 2015 14:08
Work in progress ideas for a PySpark binding to the DataStax Cassandra-Spark Connector.
from pyspark.context import SparkContext
from pyspark.serializers import BatchedSerializer, PickleSerializer
from pyspark.rdd import RDD
from py4j.java_gateway import java_import
class CassandraSparkContext(SparkContext):
    def _do_init(self, *args, **kwargs):
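        # --- The preview cuts off here; the rest is a hedged sketch of the
        # --- work-in-progress idea, not the original gist code.
        super(CassandraSparkContext, self)._do_init(*args, **kwargs)
        # Make the DataStax connector's classes visible to Py4J.
        java_import(self._jvm, "com.datastax.spark.connector.*")

    def cassandraTable(self, keyspace, table):
        # Hypothetical helper: the method name and the Java-side call are
        # illustrative; the connector's Scala API would be reached via Py4J.
        jrdd = self._jsc.cassandraTable(keyspace, table)
        return RDD(jrdd, self, BatchedSerializer(PickleSerializer()))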
@msukmanowsky
msukmanowsky / spark_gzip.py
Created November 14, 2014 01:32
Example of how to save Spark RDDs to disk using GZip compression in response to https://twitter.com/rjurney/status/533061960128929793.
from pyspark import SparkContext
def main():
sc = SparkContext(appName="Test Compression")
# RDD has to be key, value pairs
data = sc.parallelize([
("key1", "value1"),
("key2", "value2"),
("key3", "value3"),
install.packages("jsonlite", dependencies = TRUE)
install.packages("RCurl", dependencies = TRUE)
library("jsonlite")
library("RCurl")
base_url <- "https://api.parsely.com/v2"
apikey <- "computerworld.com"
api_secret <- "YOUR SECRET KEY"
#!/usr/bin/env bash
# Hitting CTRL-C kills the Django server as well as all tunnels that were created
TUNNEL_PIDS=()
function kill_tunnels() {
    for tunnel_pid in "${TUNNEL_PIDS[@]}"
    do
        kill $tunnel_pid
    done
}
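# Hedged sketch of how the rest of the script likely fits together: trap
# CTRL-C so the tunnels get cleaned up, then run the dev server in the
# foreground (host and ports below are placeholders, not from the gist).
trap 'kill_tunnels; exit' INT TERM

ssh -N -L 5432:localhost:5432 user@remote-host &
TUNNEL_PIDS+=("$!")

python manage.py runserver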

Basics

Sort the output of a command

By 3rd column (1-indexed) in reverse order

sort -k3 -r
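
For example, to list processes by CPU usage (column 3 of ps aux), adding -n so the comparison is numeric rather than lexicographic:

ps aux | sort -k3 -rn | head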

Spark / PySpark aggregateByKey Example

The existing examples for this are good, but they miss a pretty critical observation: the number of partitions and how it affects the result.

Assume we have the following script, aggregate_by_key.py:

import pprint
from pyspark.context import SparkContext
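
The preview stops after the imports, so here is a hedged sketch of the kind of script being described: the same aggregateByKey call run against different partition counts, which is where the zero value's once-per-partition behaviour shows up (the data, zero value, and partition counts are illustrative):

def seq_op(acc, value):
    # Runs inside a partition; acc starts from the zero value once per partition.
    return acc + value

def comb_op(left, right):
    # Merges the per-partition accumulators.
    return left + right

sc = SparkContext(appName="aggregate_by_key")
pairs = [("a", 1), ("a", 2), ("b", 3)]

for num_partitions in (1, 2, 4):
    rdd = sc.parallelize(pairs, num_partitions)
    # With a non-identity zero value (10) the result changes with the number
    # of partitions, because the zero value is folded in once per partition.
    result = rdd.aggregateByKey(10, seq_op, comb_op).collectAsMap()
    pprint.pprint((num_partitions, result))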
import datetime as dt
import pprint
import pytz
print(pytz.__version__)
# '2015.4'
timezone = pytz.timezone('Europe/London')
tmsp = dt.datetime(2015, 3, 29, 1, tzinfo=pytz.utc)
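
The snippet stops right at the interesting moment: 2015-03-29 01:00 UTC is exactly when Europe/London switches to BST. A hedged guess at how it continues:

# Converting the UTC timestamp into Europe/London crosses the DST switch:
# 01:00 UTC on that date is 02:00 BST (+01:00).
print(tmsp.astimezone(timezone))
# 2015-03-29 02:00:00+01:00

# For local (non-UTC) times, pytz expects localize() rather than tzinfo=...,
# which would silently attach the zone's historic LMT offset.
print(timezone.localize(dt.datetime(2015, 3, 29, 3)))
# 2015-03-29 03:00:00+01:00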