# Print a bucket's summed object size and its object count.
# The query yields a two-element JSON array; awk rewrites the second output
# line (the summed size) as GB and passes every other line through unchanged.
aws s3api list-objects \
  --bucket YOUR_BUCKET_NAME \
  --output json \
  --query "[sum(Contents[].Size), length(Contents[])]" |
  awk 'NR == 2 { print $0/1024/1024/1024" GB"; next } { print $0 }'
3dprinting.meta.stackexchange.com
3dprinting.stackexchange.com
academia.meta.stackexchange.com
academia.stackexchange.com
ai.meta.stackexchange.com
ai.stackexchange.com
android.meta.stackexchange.com
android.stackexchange.com
anime.meta.stackexchange.com
anime.stackexchange.com
#!/usr/bin/env bash | |
S3_DESTINATION_FOLDER="grajo001log/users" | |
SITES_FILE=sites.txt | |
IFS=$'\n' | |
for site in `cat $SITES_FILE` | |
do | |
echo "$site" | |
wget https://archive.org/download/stackexchange/$site.7z |
class AgeCounterJob(args: Args) extends Job(args) { | |
val lines: TypedPipe[String] = TypedPipe.from(TextLine(args("input"))) | |
val tokens: TypedPipe[Token] = lines.flatMap(f => xmlToToken(f)) | |
val byAge = tokens.groupBy(_.age) | |
byAge.size | |
.write(TypedTsv[(Int, Long)](args("output"))) |
# Run AgeCounterJob from the assembly jar in HDFS mode over everything under data/.
# Flags must use plain ASCII "--" and straight quotes; the glob stays quoted so
# the shell passes it to the job unexpanded.
hadoop jar target/scala-2.11/emr-scalding-tutorial-assembly-0.1.jar com.softwaremill.AgeCounterJob --hdfs --input "data/*" --output data-output
# Run AgeCounterJob from the assembly jar in local mode against a single input file.
# Flags must use plain ASCII "--" and straight quotes.
hadoop jar target/scala-2.11/emr-scalding-tutorial-assembly-0.1.jar com.softwaremill.AgeCounterJob --local --input "data/hello.txt" --output data-output.txt
# Upload the assembled job jar to S3 so an EMR step can reference it.
aws s3 cp \
  target/scala-2.11/emr-scalding-tutorial-assembly-0.1.jar \
  s3://grajo001log/emr-scalding-tutorial-assembly-0.1.jar
# Start a 3-node EMR cluster that runs AgeCounterJob as a custom-JAR step and
# terminates itself afterwards (or on step failure).
# The whole --steps value must be ONE shell word: no whitespace is allowed
# between its comma-separated fields, otherwise Args is split off from the step.
aws emr create-cluster \
  --name "Scalding test cluster" \
  --ami-version 3.9.0 \
  --use-default-roles \
  --instance-type m1.medium \
  --instance-count 3 \
  --log-uri s3://grajo001out/2 \
  --steps Type=CUSTOM_JAR,Name="EMR Tutorial",ActionOnFailure=TERMINATE_CLUSTER,Jar=s3://grajo001log/emr-scalding-tutorial-assembly-0.1.jar,Args=["com.softwaremill.AgeCounterJob","--hdfs","--input","s3n://grajo001log/users/*","--output","s3n://grajo001log/output"] \
  --auto-terminate
[ | |
{ | |
"Name": "emr-scalding-tutorial", | |
"Args": [ | |
"com.softwaremill.AgeCounterJob", | |
"--input", | |
"s3://grajo001log/users/*", | |
"--output", | |
"s3://grajo001out/scaldingoutput", | |
"--hdfs" |
# Submit the step(s) described in ./task.json to an already-running cluster.
aws emr add-steps \
  --cluster-id YOUR-CLUSTER_ID \
  --steps file://./task.json