public
Last active

Pig script for analyzing mail sending times

  • Download Gist
mail_sendtime.pig
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
-- Setup: register the piggybank jar and give short aliases to the UDFs used below.
-- Required parameters (no default is declared in this script): INPUT and OUTPUT.
-- Optional parameter: YEAR (see the default at the end of this section).
REGISTER file:/home/hadoop/lib/pig/piggybank.jar;
 
-- EXTRACT is applied to each log line with a regular expression and its result
-- is flattened into one field per capture group.
DEFINE EXTRACT org.apache.pig.piggybank.evaluation.string.EXTRACT();
 
-- NOTE(review): CustomFormatToISO is not referenced anywhere in the visible script.
DEFINE CustomFormatToISO org.apache.pig.piggybank.evaluation.datetime.convert.CustomFormatToISO();
-- ISOToUnix is used to turn the parsed datetime into an epoch value.
DEFINE ISOToUnix org.apache.pig.piggybank.evaluation.datetime.convert.ISOToUnix();
 
-- DATE_TIME is called with a timestamp string and a date pattern.
DEFINE DATE_TIME org.apache.pig.piggybank.evaluation.datetime.DATE_TIME();
-- NOTE(review): FORMAT_DT is not referenced anywhere in the visible script.
DEFINE FORMAT_DT org.apache.pig.piggybank.evaluation.datetime.FORMAT_DT();
 
-- FORMAT builds the timestamp string from the year, month, day and time fields.
DEFINE FORMAT org.apache.pig.piggybank.evaluation.string.FORMAT();
 
-- The patterns below capture no year from the log line, so the year comes from
-- this parameter. It defaults to the current year as printed by the date command.
-- NOTE(review): logs written in a previous year would need YEAR passed explicitly.
%default YEAR `date +%Y`;
 
-- Load every log line as a single chararray. All parsing is done by regex below.
RAW_LOGS = LOAD '$INPUT' as (line:chararray);
 
-- Sender side: parse the sendmail "from=" records.
-- One capture group per field, in the order listed in the AS clause.
SRC = FOREACH RAW_LOGS GENERATE
FLATTEN(
EXTRACT(line, '(\\S+)\\s+(\\d+)\\s+(\\S+)\\s+(\\S+)\\s+sendmail\\[(\\d+)\\]:\\s+(\\w+):\\s+from=<([^>]+)>,\\s+size=(\\d+),\\s+class=(\\d+),\\s+nrcpts=(\\d+),\\s+msgid=<([^>]+)>.*relay=(\\S+)')
)
AS (
month: chararray,
day: chararray,
time: chararray,
mailserver: chararray,
pid: chararray,
sendmailid: chararray,
src: chararray,
size: chararray,
classnumber: chararray,
nrcpts: chararray,
msgid: chararray,
relay: chararray
);
 
-- The pattern captures no year, so the YEAR parameter is prepended to build a
-- full "year-month-day time" string.
T1 = FOREACH SRC GENERATE sendmailid, FORMAT('%s-%s-%s %s', $YEAR, month, day, time) as timestamp;
-- Keep only rows that have a sendmail queue id.
-- NOTE(review): presumably lines that do not match the pattern come back with
-- null fields, which is what this filter removes. Confirm against the EXTRACT UDF.
FILTER_T1 = FILTER T1 BY sendmailid IS NOT NULL;
 
-- Intermediate relations are deliberately not DUMPed here. Every DUMP forces an
-- extra run of the pipeline and disables multi-query execution in batch mode.
-- When debugging, DUMP FILTER_T1, R1 or toEpoch1 interactively from grunt instead.
 
-- Parse the timestamp string into a datetime value.
R1 = FOREACH FILTER_T1 GENERATE sendmailid, DATE_TIME(timestamp, 'yyyy-MMM-d HH:mm:ss') as dt;
 
-- ISOToUnix returns milliseconds, so we divide by 1000 to get seconds
-- Field order (sendmailid, dt, epoch) must stay stable for the later join.
toEpoch1 = FOREACH R1 GENERATE sendmailid, dt, ISOToUnix(dt) / 1000 as epoch:long;
 
-- Delivery side: parse the sendmail "to=" records, one field per capture group.
DELIVERY_FIELDS = FOREACH RAW_LOGS GENERATE
    FLATTEN(
        EXTRACT(line, '(\\S+)\\s+(\\d+)\\s+(\\S+)\\s+(\\S+)\\s+sendmail\\[(\\d+)\\]:\\s+(\\w+):\\s+to=<([^>]+)>,\\s+delay=([^,]+),\\s+xdelay=([^,]+),.*relay=(\\S+)\\s+\\[\\S+\\],\\s+dsn=\\S+,\\s+stat=(.*)')
    )
    AS (
        month: chararray,
        day: chararray,
        time: chararray,
        mailserver: chararray,
        pid: chararray,
        sendmailid: chararray,
        dest: chararray,
        delay: chararray,
        xdelay: chararray,
        relay: chararray,
        stat: chararray
    );

-- Discard rows without a sendmail queue id before doing any further work.
DELIVERY_MATCHED = FILTER DELIVERY_FIELDS BY sendmailid IS NOT NULL;

-- Build the "year-month-day time" string from the YEAR parameter and the
-- captured fields, then parse it straight into a datetime value.
DELIVERY_TIMED = FOREACH DELIVERY_MATCHED GENERATE
    sendmailid,
    DATE_TIME(FORMAT('%s-%s-%s %s', $YEAR, month, day, time), 'yyyy-MMM-d HH:mm:ss') AS dt,
    dest,
    stat;

-- ISOToUnix yields milliseconds. Dividing by 1000 gives whole seconds.
-- Output field order is sendmailid, dt, epoch, dest, stat.
toEpoch2 = FOREACH DELIVERY_TIMED GENERATE
    sendmailid,
    dt,
    ISOToUnix(dt) / 1000 AS epoch:long,
    dest,
    stat;
 
-- Pair each sender record with its delivery records through the sendmail queue id.
JOINED = JOIN toEpoch1 BY sendmailid, toEpoch2 BY sendmailid;

-- Per delivery: queue id, seconds between the sender record and the delivery
-- record, recipient address and final status text.
DELAYS = FOREACH JOINED GENERATE
    toEpoch1::sendmailid,
    (toEpoch2::epoch - toEpoch1::epoch) AS elapsed_secs,
    toEpoch2::dest,
    toEpoch2::stat;

-- Longest elapsed time first.
SLOWEST_FIRST = ORDER DELAYS BY elapsed_secs DESC;
STORE SLOWEST_FIRST INTO '$OUTPUT';

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.