public
Last active

Pig script for Nginx access log date/time conversions

  • Download Gist
nginx_time_conversions.pig
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
REGISTER file:/home/hadoop/lib/pig/piggybank.jar;
 
DEFINE EXTRACT org.apache.pig.piggybank.evaluation.string.EXTRACT();
DEFINE CustomFormatToISO org.apache.pig.piggybank.evaluation.datetime.convert.CustomFormatToISO();
DEFINE ISOToUnix org.apache.pig.piggybank.evaluation.datetime.convert.ISOToUnix();
DEFINE DATE_TIME org.apache.pig.piggybank.evaluation.datetime.DATE_TIME();
DEFINE FORMAT_DT org.apache.pig.piggybank.evaluation.datetime.FORMAT_DT();
DEFINE FORMAT org.apache.pig.piggybank.evaluation.string.FORMAT();
 
RAW_LOGS = LOAD '$INPUT' as (line:chararray);
LOGS_BASE = FOREACH RAW_LOGS GENERATE
 
FLATTEN(
EXTRACT(line, '(\\S+) - - \\[([^\\[]+)\\]\\s+"([^"]+)"\\s+(\\d+)\\s+(\\d+)\\s+"([^"]+)"\\s+"([^"]+)"\\s+"([^"]+)"\\s+(\\S+)')
)
AS (
ip: chararray,
timestamp: chararray,
url: chararray,
status: chararray,
bytes: chararray,
referrer: chararray,
useragent: chararray,
xfwd: chararray,
reqtime: chararray
);
DATE_URL = FOREACH LOGS_BASE GENERATE timestamp;
F = FILTER DATE_URL BY NOT timestamp IS NULL;
 
R1 = FOREACH F GENERATE timestamp, DATE_TIME(timestamp, 'dd/MMM/yyyy:HH:mm:ss Z') as dt;
DUMP R1;
 
-- ISOToUnix returns milliseconds, so we divide by 1000 to get seconds
-- toEpoch1 = FOREACH R1 GENERATE dt, ISOToUnix(dt) / 1000 as epoch:long;
DUMP toEpoch1;
 
FD = FOREACH R1 GENERATE FORMAT_DT('yyyy-MM-dd HH:mm:ss Z', dt) as fdt;
DUMP FD;
 
toISO = FOREACH FD GENERATE fdt, CustomFormatToISO(fdt, 'yyyy-MM-dd HH:mm:ss Z');
DUMP toISO;

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.