- あらかじめ aws cli の設定をすませておいてください
- あとは以下のスクリプトを順番に実行してください

    ./emr_create_spark_cluster.sh
    ./emr_pyspark_wc.sh

- emr_create_spark_cluster.sh の上の変数群は適宜自分の環境に合わせて置き換えてください
- クラスタのインスタンスタイプや台数等は適当なので,必要に応じて書き換えてください
- その他の cli コマンドについては,以下を参照してください

    ./emr_create_spark_cluster.sh
    ./emr_pyspark_wc.sh
- emr_create_spark_cluster.sh の上の変数群は適宜自分の環境に合わせて置き換えてください

bootstrap.sh:

#!/bin/bash -xe
# EMR bootstrap action: install pip via setuptools, then SimpleCV, on every node.
# NOTE(review): apt-get assumes a Debian-based image; stock EMR AMIs run
# Amazon Linux (yum) — confirm the target AMI before relying on this.
sudo apt-get install -y python-setuptools   # -y: bootstrap actions run non-interactively and must not prompt
sudo easy_install pip
sudo pip install -U SimpleCV
#!/bin/bash
# emr_create_spark_cluster.sh — launch an EMR 5.2.0 cluster running
# Hadoop/Hive/Spark/Tez with one m3.xlarge master and five r3.2xlarge
# core nodes, running bootstrap.sh on each node.
# Usage: ./emr_create_spark_cluster.sh <s3-bootstrap-dir>
# Replace the placeholder variables below to match your environment.
set -eu

BOOTSTRAP_PATH=$1                     # e.g. s3://my_bucket/scripts/
BOOTSTRAP_SCRIPT="bootstrap.sh"       # fixed: was misspelled BOOTSTRUP_SCRIPT, so the
                                      # --bootstrap-action path expanded to the bare directory
KEY_NAME="XXXXX.pem"                  # NOTE(review): EMR expects the key *pair name*, not a .pem filename — confirm
SUBNET_ID="subnet-XXXXXXXX"           # e.g. subnet-fa16b1d7
MASTER_SECURITY_GROUP="sg-XXXXXXXX"   # e.g. sg-256fe358 (fixed: space after '=' was a syntax error)
SLAVE_SECURITY_GROUP="sg-XXXXXXXX"    # e.g. sg-256fe357 (same fix)

# The --ec2-attributes JSON is double-quoted so the shell expands the variables,
# and each value carries its own escaped quotes so the payload is valid JSON.
# The original single-quoted form sent the literal text ${KEY_NAME} etc.
aws emr create-cluster --applications Name=Hadoop Name=Hive Name=Spark Name=Tez \
  --ec2-attributes "{
    \"KeyName\":\"${KEY_NAME}\",
    \"InstanceProfile\":\"EMR_EC2_DefaultRole\",
    \"SubnetId\":\"${SUBNET_ID}\",
    \"EmrManagedSlaveSecurityGroup\":\"${SLAVE_SECURITY_GROUP}\",
    \"EmrManagedMasterSecurityGroup\":\"${MASTER_SECURITY_GROUP}\"
  }" \
  --service-role EMR_DefaultRole --release-label emr-5.2.0 \
  --name 'Spark Cluster' --instance-groups '[
    {
      "InstanceCount":1,
      "InstanceGroupType":"MASTER",
      "InstanceType":"m3.xlarge",
      "Name":"Master instance group - 1"
    },
    {
      "InstanceCount":5,
      "EbsConfiguration":{
        "EbsBlockDeviceConfigs":[
          {
            "VolumeSpecification":{
              "SizeInGB":840,
              "VolumeType":"gp2"
            },
            "VolumesPerInstance":1
          }
        ],
        "EbsOptimized":false
      },
      "InstanceGroupType":"CORE",
      "InstanceType":"r3.2xlarge",
      "Name":"Core instance group - 2"
    }
  ]' \
  --bootstrap-action Path="${BOOTSTRAP_PATH}${BOOTSTRAP_SCRIPT}"
#!/bin/bash
# emr_pyspark_wc.sh — upload the word-count job to S3 and submit it to a
# running EMR cluster as a Spark step.
# Usage: ./emr_pyspark_wc.sh <cluster-id> <s3-in-path> <s3-out-path> <s3-script-dir>
set -eu

CLUSTER_ID=$1    # obtainable via: aws emr list-clusters
IN_PATH=$2       # e.g. s3://my_bucket/in_path/
OUT_PATH=$3     # e.g. s3://my_bucket/out_path/
PYSPARK_PATH=$4  # e.g. s3://my_bucket/scripts/
PYSPARK_SCRIPT="wc.py"

aws s3 cp spark/${PYSPARK_SCRIPT} "${PYSPARK_PATH}"

# fixed: the input argument referenced the undefined ${IN_PTH} (typo for
# ${IN_PATH}), so the step was submitted with an empty input path.
aws emr add-steps --cluster-id "${CLUSTER_ID}" \
  --steps Type=spark,Name=SparkWordCountApp,Args=[--deploy-mode,cluster,--master,yarn,--conf,spark.yarn.submit.waitAppCompletion=true,--num-executors,1,--executor-cores,1,--executor-memory,1g,${PYSPARK_PATH}${PYSPARK_SCRIPT},${IN_PATH},${OUT_PATH}],ActionOnFailure=CONTINUE
| from __future__ import print_function | |
| from pyspark import SparkContext | |
| import sys | |
| if __name__ == "__main__": | |
| if len(sys.argv) != 3: | |
| print("Usage: wordcount ", file=sys.stderr) | |
| exit(-1) | |
| sc = SparkContext(appName="WordCount") | |
| text_file = sc.textFile(sys.argv[1]) | |
| counts = text_file.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b) | |
| counts.saveAsTextFile(sys.argv[2]) | |
| sc.stop() |