Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Hive query to create table for parsing Nginx logs
-- Table for raw NGINX logs
CREATE EXTERNAL TABLE IF NOT EXISTS `raw_nginx_logs` (
`remote_addr` string,
`remote_user` string,
`time_local` string,
`http_verb` string,
`url` string,
`http_ver` string,
`status` int,
`body_bytes_sent` int,
`http_referer` string,
`http_user_agent` string
)
-- Use RegEx to parse each line
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH serdeproperties(
"input.regex" = "([0-9\\.]+) - ([^ ]*) \\[([^\\]]*)\\] \"([^ ]*) ([^ ]*) ([^ ]*)\" ([0-9]*) ([0-9]*) \"(.*)\" \"(.*)\",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s"
)
-- This also supports gzip, just have to have a .gz extension
STORED AS TEXTFILE
-- Be explicit where we are going to store this table
LOCATION 'hdfs://nameservice/user/hive/warehouse/raw_nginx_logs';
@nimboya

This comment has been minimized.

Copy link

@nimboya nimboya commented Sep 24, 2018

Tried it and I received this error

FAILED: ParseException line 17:3 mismatched input 'output' expecting ) near '"([0-9\\.]+) - ([^ ]*) \\[([^\\]]*)\\] \"([^ ]*) ([^ ]*) ([^ ]*)\" ([0-9]*) ([0-9]*) \"(.*)\" \"(.*)\",

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.