Kris softberries

## sites.txt
3dprinting.meta.stackexchange.com
3dprinting.stackexchange.com
academia.meta.stackexchange.com
academia.stackexchange.com
ai.meta.stackexchange.com
ai.stackexchange.com
android.meta.stackexchange.com
android.stackexchange.com
anime.meta.stackexchange.com
anime.stackexchange.com

## prepare-data.sh
#!/usr/bin/env bash

S3_DESTINATION_FOLDER="grajo001log/users"
SITES_FILE=sites.txt

IFS=$'\n'
for site in `cat $SITES_FILE`
do
    echo "$site"
    wget https://archive.org/download/stackexchange/$site.7z

## README.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                softberries
                / README.md
            
            
              Created
              October 23, 2017 12:10
            
              
                Command to get the size of the bucket
              
          
    aws s3api list-objects --bucket YOUR_BUCKET_NAME --output json --query "[sum(Contents[].Size), length(Contents[])]" | awk 'NR!=2 {print $0;next} NR==2 {print $0/1024/1024/1024" GB"}'

  
## AgeCounterJob.scala
class AgeCounterJob(args: Args) extends Job(args) {

  val lines: TypedPipe[String] = TypedPipe.from(TextLine(args("input")))

  val tokens: TypedPipe[Token] = lines.flatMap(f => xmlToToken(f))

  val byAge = tokens.groupBy(_.age)
  byAge.size
    .write(TypedTsv[(Int, Long)](args("output")))

## README.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                softberries
                / README.md
            
            
              Created
              October 23, 2017 12:14
            
              
                run the job locally

              
    hadoop jar target/scala-2.11/emr-scalding-tutorial-assembly-0.1.jar com.softwaremill.AgeCounterJob --hdfs --input "data/*" --output data-output

  
## README.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                softberries
                / README.md
            
            
              Created
              October 23, 2017 12:15
            
              
                run the job locally ver2
              
          
    hadoop jar target/scala-2.11/emr-scalding-tutorial-assembly-0.1.jar com.softwaremill.AgeCounterJob --local --input "data/hello.txt" --output data-output.txt

  
## README.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                softberries
                / README.md
            
            
              Created
              October 23, 2017 12:18
            
              
                copy the jar file to s3 location
              
          
    aws s3 cp target/scala-2.11/emr-scalding-tutorial-assembly-0.1.jar s3://grajo001log/emr-scalding-tutorial-assembly-0.1.jar

  
## README.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                softberries
                / README.md
            
            
              Created
              October 23, 2017 12:18
            
              
                start up emr cluster with the task attached
              
          
    aws emr create-cluster --name "Scalding test cluster" --ami-version 3.9.0 --use-default-roles --instance-type m1.medium --instance-count 3 --log-uri s3://grajo001out/2 --steps Type=CUSTOM_JAR,Name="EMR Tutorial",ActionOnFailure=TERMINATE_CLUSTER,Jar=s3://grajo001log/emr-scalding-tutorial-assembly-0.1.jar,Args=["com.softwaremill.AgeCounterJob","--hdfs","--input","s3n://grajo001log/users/*","--output","s3n://grajo001log/output"] --auto-terminate

  
## task.json
[
  {
    "Name": "emr-scalding-tutorial",
    "Args": [
      "com.softwaremill.AgeCounterJob",
      "--input",
      "s3://grajo001log/users/*",
      "--output",
      "s3://grajo001out/scaldingoutput",
      "--hdfs"

## README.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                softberries
                / README.md
            
            
              Created
              October 23, 2017 12:27
            
              
                push task to emr
              
          
    aws emr add-steps --cluster-id=YOUR-CLUSTER_ID --steps=file://./task.json
	3dprinting.meta.stackexchange.com
	3dprinting.stackexchange.com
	academia.meta.stackexchange.com
	academia.stackexchange.com
	ai.meta.stackexchange.com
	ai.stackexchange.com
	android.meta.stackexchange.com
	android.stackexchange.com
	anime.meta.stackexchange.com
	anime.stackexchange.com
	#!/usr/bin/env bash

	S3_DESTINATION_FOLDER="grajo001log/users"
	SITES_FILE=sites.txt

	IFS=$'\n'
	for site in `cat $SITES_FILE`
	do
	echo "$site"
	wget https://archive.org/download/stackexchange/$site.7z
	class AgeCounterJob(args: Args) extends Job(args) {

	val lines: TypedPipe[String] = TypedPipe.from(TextLine(args("input")))

	val tokens: TypedPipe[Token] = lines.flatMap(f => xmlToToken(f))

	val byAge = tokens.groupBy(_.age)
	byAge.size
	.write(TypedTsv[(Int, Long)](args("output")))
	[
	{
	"Name": "emr-scalding-tutorial",
	"Args": [
	"com.softwaremill.AgeCounterJob",
	"--input",
	"s3://grajo001log/users/*",
	"--output",
	"s3://grajo001out/scaldingoutput",
	"--hdfs"