epishkin / .gitconfig
Last active October 7, 2015 16:08
My git aliases
[alias]
    co = checkout
    ci = commit
    st = status -sb
    # drop local refs to remote branches that have been deleted
    cln = remote prune origin
    br = branch
    hist = log --pretty=format:\"%h %ad | %s%d [%an]\" --date=short
    hist-graph = log --pretty=format:\"%h %ad | %s%d [%an]\" --graph --date=short
    # commits unique to each of two refs: git lr <ref1> <ref2>
    lr = "!f() { git log $1...$2 --left-right --oneline; }; f"
    # print the object type (blob, tree, commit, tag) of a sha
    type = cat-file -t
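For example, git lr master topic expands to git log master...topic --left-right --oneline, listing the commits reachable from only one of the two refs, each prefixed with < or > to show which side it belongs to.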
epishkin / upload.md
Last active December 19, 2015 12:08
script to upload oozie workflow / coordinator to hdfs

project structure:

.
├── oozie
│   ├── upload.sh
│   ├── combined_queries
│   │   ├── ...
│   └── simple_reports
│       ├── lib
│       │   ├── avro-1.7.4.jar
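The script itself is not shown here; with this layout, upload.sh would typically wrap hadoop fs -put (or -copyFromLocal) to push each workflow directory, including its lib jars, to the corresponding path on HDFS. The exact flags and target paths are assumptions, since only the directory tree is visible.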
epishkin / readme.md
Last active December 27, 2015 16:49
Hadoop Howto
epishkin / gist:9844553
Last active August 29, 2015 13:57 — forked from johnynek/gist:6632488
upgrade code to scalding 0.9 using sed
find . -type f -print0 | xargs -0 gsed -i 's/\.sum(/.sum[Double](/g'
find . -type f -print0 | xargs -0 gsed -i 's/\.plus\[/.sum[/g'
find . -type f -print0 | xargs -0 gsed -i 's/import com.twitter.scalding.DateOps.richDateToCalendar/import com.twitter.scalding.RichDate.toCalendar/'
find . -type f -print0 | xargs -0 gsed -i 's/ RichDate("\([^"]\+\)")(\([^)]\+\))/ com.twitter.scalding.DateParser.default.parse("\1")(\2).get/g'
find . -type f -print0 | xargs -0 gsed -i 's/\.then\([^(Do)]\)/.thenDo\1/g'
find . -type f -print0 | xargs -0 gsed -i 's/Mode\.mode/mode/g'
find . -type f -print0 | xargs -0 gsed -i 's/new RichDate/RichDate/g'
find . -type f -print0 | xargs -0 gsed -i 's/import scalding.avro/import com.twitter.scalding.avro/'
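Taken together, these substitutions mechanically apply the scalding 0.9 renames. A rough before/after sketch, reconstructed from the sed patterns above rather than from the scalding changelog (tz is assumed to be a java.util.TimeZone in scope):

// scalding 0.8
pipe.groupBy('key) { _.sum('clicks).plus[Double]('a -> 'total) }
val day = RichDate("2014-03-28")(tz)

// scalding 0.9
pipe.groupBy('key) { _.sum[Double]('clicks).sum[Double]('a -> 'total) }
val day = com.twitter.scalding.DateParser.default.parse("2014-03-28")(tz).get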
epishkin / git-branches.sh
Created November 14, 2014 16:42
show remote branches of a git repo sorted by date
# ALL_BRANCHES is assumed to be set from the output of `git branch -r`;
# this filters out the "origin/HEAD -> origin/master" pointer entry.
remove_head() {
    for BRANCH in $ALL_BRANCHES; do
        if [ "$BRANCH" = "->" ] || [ "$BRANCH" = "origin/HEAD" ]; then
            continue
        fi
        echo "$BRANCH"
    done | sort -u
}
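The rest of the script presumably feeds git branch -r through remove_head and sorts the result by commit date; current git can do the whole thing in one command: git for-each-ref --sort=-committerdate refs/remotes --format='%(committerdate:short) %(refname:short)'.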
epishkin / count_uniques.scala
Last active August 29, 2015 14:11
Examples for Optimize Scalding Jobs
// 2 m/r jobs :-(
.unique('item_id_from, 'item_id_to, 'user_id)             // 1st m/r
.groupBy('item_id_from, 'item_id_to) { _.size('count) }   // 2nd m/r

// 1 m/r job but more code
.map('user_id -> 'user_id) { id: String => Set(id) }
.groupBy('item_id_from, 'item_id_to) {
  _.sum[Set[String]]('user_id)
}
.map('user_id -> 'count) { ids: Set[String] => ids.size }
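The single-job version works because sum uses the Set monoid: partial user-id sets can be combined on the map side before the shuffle, so the unique + count pair collapses into one reduce. The catch is that each group materializes a full Set[String], so this only pays off while the distinct user count per (item_id_from, item_id_to) pair stays small.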
epishkin / SaveCountersToHdfs.scala
Last active December 4, 2017 13:39
write all counters of a scalding job to hdfs
import java.io.PrintWriter
import cascading.stats.CascadingStats
import com.twitter.scalding._
/**
 * Writes all custom counters into a tsv file args("counters-file") if this property is set.
 *
 * Output format:
 * counter_name value
 */
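The class body is cut off above. As a minimal sketch of the core loop, assuming stats is the finished job's CascadingStats and writer is a PrintWriter opened on the output file (both names are placeholders, not the gist's own):

import scala.collection.JavaConverters._

def writeCounters(stats: CascadingStats, writer: PrintWriter): Unit = {
  // one tab-separated "counter_name value" line per counter
  for {
    group   <- stats.getCounterGroups.asScala
    counter <- stats.getCountersFor(group).asScala
  } writer.println(s"$counter\t${stats.getCounterValue(group, counter)}")
  writer.close()
}
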
import turtle
t = turtle.Pen()

def line(count, size, alpha, beta):
    if count == 0:
        return
    else:
        t.forward(size)
        t.right(180 - beta)