Hive query to create table for parsing Nginx logs
-- Table for raw NGINX logs | |
CREATE EXTERNAL TABLE IF NOT EXISTS `raw_nginx_logs` ( | |
`remote_addr` string, | |
`remote_user` string, | |
`time_local` string, | |
`http_verb` string, | |
`url` string, | |
`http_ver` string, | |
`status` int, | |
`body_bytes_sent` int, | |
`http_referer` string, | |
`http_user_agent` string | |
) | |
-- Use RegEx to parse each line | |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' | |
WITH serdeproperties( | |
"input.regex" = "([0-9\\.]+) - ([^ ]*) \\[([^\\]]*)\\] \"([^ ]*) ([^ ]*) ([^ ]*)\" ([0-9]*) ([0-9]*) \"(.*)\" \"(.*)\", | |
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s" | |
) | |
-- This also supports gzip, just have to have a .gz extension | |
STORED AS TEXTFILE | |
-- Be explicit where we are going to store this table | |
LOCATION 'hdfs://nameservice/user/hive/warehouse/raw_nginx_logs'; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
Tried it and I received this error
FAILED: ParseException line 17:3 mismatched input 'output' expecting ) near '"([0-9\\.]+) - ([^ ]*) \\[([^\\]]*)\\] \"([^ ]*) ([^ ]*) ([^ ]*)\" ([0-9]*) ([0-9]*) \"(.*)\" \"(.*)\",