Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
# Properties for the Coordinator flow.
# Should contain settings that:
# a) personalize your deployment, or
# b) settings to connect to the correct Hadoop cluster
# Your username (Kerberos!). Needs to be done twice (user.name here and
# again in applicationPath below); didn't find a way around that.
user.name=joldenbeuving
# HDFS directory the deploy script uploads the application to.
# WARNING: the deploy script DELETES and re-creates this location,
# so please be careful which path you point it at!
applicationPath=hdfs:///user/joldenbeuving/oozie-test2/
# Pinpoint the coordinator definition inside the application directory.
oozie.coord.application.path=${applicationPath}/jilles-coordinator.xml
# Cluster endpoints: JobTracker and HDFS NameNode (HA nameservice).
jobTracker=hadoop-dn:8021
nameNode=nameservice1
# Regex filter applied to each raw JSON log line by the Pig script. Examples:
# - Just an IP address (this will match anywhere in the input JSON)
# - loggedInUserId\":\"1586651\" matches any request where that user was logged in
# Backslashes are doubled because the value passes through property expansion
# before reaching the Pig regex.
# More info: http://pig.apache.org/docs/r0.8.1/piglatin_ref2.html#REGEX_EXTRACT_ALL
regexFilter=92\\.109\\.217\\.222
<coordinator-app name="jilles-test-coordinator"
                 frequency="${coord:days(1)}"
                 start="2014-03-20T18:56Z" end="2015-06-05T18:56Z" timezone="Europe/Amsterdam"
                 xmlns="uri:oozie:coordinator:0.2">

  <!-- Run one instance at a time, oldest first, at most 5 queued.
       See http://stackoverflow.com/a/21818132 -->
  <controls>
    <concurrency>1</concurrency>
    <execution>FIFO</execution>
    <throttle>5</throttle>
  </controls>

  <datasets>
    <!-- Naming convention used here:
         [e]dinfra
         - 'din'/'dout': Data INput or OUTput
         - 'dc1'/'dc2':  data center 1 or 2, etc
         - leading 'e':  Event (as opposed to dataset) -->
    <dataset name="dindc1" frequency="${coord:days(1)}"
             initial-instance="2014-03-20T04:00Z" timezone="Europe/Amsterdam">
      <uri-template>hdfs:///user/app/dc1/${YEAR}/${MONTH}/${DAY}/</uri-template>
      <!-- Empty done-flag: the day's data counts as available
           as soon as the directory itself exists. -->
      <done-flag></done-flag>
    </dataset>
    <dataset name="dindc2" frequency="${coord:days(1)}"
             initial-instance="2014-03-20T04:00Z" timezone="Europe/Amsterdam">
      <uri-template>hdfs:///user/app/dc2/ams01/${YEAR}/${MONTH}/${DAY}/</uri-template>
      <done-flag></done-flag>
    </dataset>
    <dataset name="dout" frequency="${coord:days(1)}"
             initial-instance="2014-03-20T18:56Z" timezone="Europe/Amsterdam">
      <uri-template>hdfs:///user/app/oozie-test-output-data/${YEAR}/${MONTH}/${DAY}/</uri-template>
      <done-flag></done-flag>
    </dataset>
  </datasets>

  <!-- Select the data (in our case the day) that we want to process.
       For more info on this, see: http://tinyurl.com/q74oom7 -->
  <input-events>
    <data-in name="eindc1" dataset="dindc1">
      <instance>${coord:current(0)}</instance>
    </data-in>
    <data-in name="eindc2" dataset="dindc2">
      <instance>${coord:current(0)}</instance>
    </data-in>
  </input-events>
  <output-events>
    <data-out name="eout" dataset="dout">
      <instance>${coord:current(0)}</instance>
    </data-out>
  </output-events>

  <!-- Kick off the actual workflow, telling it where we found new
       data ('inputPath') and where we require the workflow to store
       the results ('outputPath'). -->
  <action>
    <workflow>
      <app-path>${applicationPath}</app-path>
      <configuration>
        <property>
          <name>inputPath</name>
          <!-- List both DC1 and DC2 events; Pig handles a comma-separated list -->
          <value>${coord:dataIn('eindc1')},${coord:dataIn('eindc2')}</value>
        </property>
        <property>
          <name>outputPath</name>
          <value>${coord:dataOut('eout')}</value>
        </property>
        <property>
          <name>customJobName</name>
          <value>'${coord:user()}: Applying filter on incoming application data. Code here: https://linktogitrepo. Storing data in ${coord:dataOut('eout')}'</value>
        </property>
        <property>
          <!-- Pull Pig/sharelib jars from the Oozie system libpath -->
          <name>oozie.use.system.libpath</name>
          <value>true</value>
        </property>
        <property>
          <name>regexFilter</name>
          <value>${regexFilter}</value>
        </property>
      </configuration>
    </workflow>
  </action>
</coordinator-app>
#!/bin/bash
# Deploy the Oozie coordinator application to HDFS and submit it.
# Expects coordinator.properties, the coordinator/workflow XML files,
# the Pig script, and ./lib/*.jar in the current directory.
set -u

export OOZIE_URL="http://hadoop-dn:11000/oozie/"

# Take application path from the properties file (DRY)
APP_PATH=$(grep "^applicationPath" coordinator.properties | grep -o "hdfs.*")

# Guard: if the grep above failed, APP_PATH is empty and the recursive
# delete below would be pointed at nothing (or the wrong place). Bail out.
if [ -z "$APP_PATH" ]; then
    echo "ERROR: could not read applicationPath from coordinator.properties" >&2
    exit 1
fi

# Renew the Kerberos ticket
kinit -R

# do dryrun, and exit if problems are found
#(oozie job -dryrun -config coordinator.properties) || exit

echo Copying files to HDFS
# Re-create the application directory from scratch; -p so mkdir also
# succeeds when parent directories are missing after the delete.
hdfs dfs -rm -f -R "$APP_PATH"
hdfs dfs -mkdir -p "$APP_PATH"
hdfs dfs -mkdir "$APP_PATH/data"
hdfs dfs -mkdir "$APP_PATH/lib"
hdfs dfs -put *.{xml,pig} "$APP_PATH"
hdfs dfs -put ./lib/*.jar "$APP_PATH/lib/"
hdfs dfs -ls -R "$APP_PATH"

# Submit the coordinator; Oozie prints the new job ID on success.
oozie job -run -config coordinator.properties
echo " ^---- Note this is the job ID (if everything went alright)"
echo
echo "Ways to get more info on the coordinator you just submitted:"
echo " https://hue-domain/oozie/list_oozie_coordinators/"
echo " $ oozie job --jobtype coord"
echo " $ oozie job -info 0000004-091209145813488-oozie-dani-C"
-- Filter raw JSON request-log lines on a regex and store the matching
-- lines ordered by their 'date' field.
-- Parameters (supplied by the Oozie workflow action):
--   $customJobName : display name for the Hadoop job
--   $inputPath     : comma-separated list of input directories
--   $outputPath    : directory to store the filtered result in
--   $filter        : regex fragment matched anywhere in a log line
SET job.name '$customJobName';
REGISTER json-simple-1.1.1.jar;
DEFINE JSON2MAP com.twitter.elephantbird.pig.piggybank.JsonStringToMap();
-- Load raw lines; no schema, so $0 is the whole line.
A = LOAD '$inputPath';
-- Capture the full line when it matches the filter regex.
B = FOREACH A GENERATE
REGEX_EXTRACT_ALL($0, '(.*$filter.*)');
-- REGEX_EXTRACT_ALL yields a tuple of captured groups; SIZE < 3 appears
-- intended to keep only single-group matches.
-- NOTE(review): presumably this drops non-matching lines too — confirm
-- how null/non-matching tuples behave here before restructuring.
C = FILTER B BY (SIZE($0) < 3);
-- Unwrap the capture tuple back to a plain line.
D = FOREACH C GENERATE FLATTEN($0);
-- Parse the JSON, keeping the original line alongside the parsed map.
E = FOREACH D GENERATE JSON2MAP($0) AS json, $0 AS fullline:chararray;
-- Pull out the request timestamp (epoch millis in the sample data).
F = FOREACH E GENERATE FLATTEN(json#'date') AS requestdate, fullline;
G = FOREACH F GENERATE (long) requestdate, fullline;
-- Sort chronologically before storing.
H = ORDER G BY $0 ASC;
STORE H INTO '$outputPath';
{"requestIp":"22.249.73.204","url":"http://www.example.com/zlkasdfj/url.extension.html","date":1366788978906,"userAgent":"Mozilla/5.0","requestTimeMillis":209,"dispatchTime":209,"ssl":false,"responseCode":200,"responseSize":22443,"method":"GET","sessionId":"1f66d92b-66dd-727675d1bab7","loggedInUserId":"","uniqueRequestId":"9df01305-ae4b-6dee17b2069b"}
{"requestIp":"22.209.91.3","url":"http://www.example.com/moreurls.galore.html","date":1366788979022,"userAgent":"Mozilla/4.0","requestTimeMillis":96,"dispatchTime":96,"ssl":false,"responseCode":200,"responseSize":17180,"referrer":"http://www.example.com/previous.page.galore.html","method":"GET","sessionId":"b1e11781-3c2b-82b2-c761478e262a","loggedInUserId":"","uniqueRequestId":"12c7445a-5e-fa8-81d4-19222a421ba6","gaCookie__utmb":"161234094.18.14.1366"}
<workflow-app name="Jilles Test Workflow" xmlns="uri:oozie:workflow:0.4">

  <start to="filter-for-particular-ip"/>

  <!-- Run the Pig script that filters the incoming data on the
       configured regex and writes the matches to outputPath. -->
  <action name="filter-for-particular-ip">
    <pig>
      <job-tracker>${jobTracker}</job-tracker>
      <name-node>${nameNode}</name-node>
      <script>${applicationPath}/filter-ip.pig</script>
      <param>filter=${regexFilter}</param>
      <param>inputPath=${inputPath}</param>
      <param>outputPath=${outputPath}</param>
      <param>customJobName=${customJobName}</param>
      <!-- NOTE(review): the original author found that <file> cannot use
           e.g. ${applicationPath} here, hence the hard-coded HDFS path. -->
      <file>/user/joldenbeuving/oozie-test2/lib/json-simple-1.1.1.jar#json-simple-1.1.1.jar</file>
    </pig>
    <ok to="end"/>
    <error to="kill"/>
  </action>

  <kill name="kill">
    <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
  </kill>

  <end name="end"/>
</workflow-app>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment