@emk
Created January 3, 2012 18:01
Apache log analysis with Hadoop, Hive and HBase
-- This is a Hive program. Hive is an SQL-like language that compiles
-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
-- Facebook, because it allows them to query enormous Hadoop data
-- stores using a language much like SQL.
-- Our logs are stored on the Hadoop Distributed File System, in the
-- directory /logs/randomhacks.net/access. They're ordinary Apache
-- logs in *.gz format.
--
-- We want to pretend that these gzipped log files are a database table,
-- and use a regular expression to split lines into database columns.
CREATE EXTERNAL TABLE access(
  host STRING,
  identity STRING,
  user STRING,
  time STRING,
  request STRING,
  status STRING,
  size STRING,
  referer STRING,
  agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE
LOCATION '/logs/randomhacks.net/access';
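-- A quick sanity check (an illustrative addition, not part of the original
-- gist): once the external table is declared, Hive queries the gzipped log
-- files in place, with no separate import step.
SELECT host, time, request, status
FROM access
LIMIT 10;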
-- We want to store our logs in HBase, which is designed to hold tables
-- with billions of rows and millions of columns. HBase stores rows
-- sorted by primary key, so you can efficiently read all records within
-- a given range of keys.
--
-- Here, our key is a Unix time stamp, and we assume that it always has
-- the same number of digits (hey, this was a dodgy late night hack).
-- So we could easily grab all the records from a specific time period.
--
-- We store our data in 3 column families: "m" for metadata, "r" for
-- referrer data, and "a" for user-agent data. This allows us to only
-- load a subset of columns for a given query.
CREATE EXTERNAL TABLE access_hbase(
  key STRING,           -- Unix time + ":" + unique identifier.
  host STRING,          -- The IP address of the host making the request.
  identity STRING,      -- ??? (raw log data)
  user STRING,          -- ??? (raw log data)
  time BIGINT,          -- Unix time, UTC.
  method STRING,        -- "GET", etc.
  path STRING,          -- "/logo.png", etc.
  protocol STRING,      -- "HTTP/1.1", etc.
  status SMALLINT,      -- 200, 404, etc.
  size BIGINT,          -- Response size, in bytes.
  referer_host STRING,  -- "www.google.com", etc.
  referer STRING,       -- Full referrer string.
  agent STRING)         -- Full agent string.
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
  "hbase.columns.mapping" = ":key,m:host,m:identity,m:user,m:time,m:method,m:path,m:protocol,m:status,m:size,r:referer_host,r:referer,a:agent"
)
TBLPROPERTIES ("hbase.table.name" = "randomhacks_access");
-- Copy our data from raw Apache log files to HBase, cleaning it up as we go. This is basically
-- a pseudo-SQL query which calls a few Java helpers.
--
-- Note the "TABLESAMPLE" clause, which says to pick one of every 20 records at random.
INSERT OVERWRITE TABLE access_hbase
SELECT concat(cast(unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") AS STRING), ":", guid()) AS key,
       host,
       unquote_apache(identity),
       unquote_apache(user),
       unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]"),
       re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 1) AS method,
       re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 2) AS path,
       re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 3) AS protocol,
       cast(status AS SMALLINT) AS status,
       cast(size AS BIGINT) AS size,
       re_extract(unquote_apache(referer), "[^:]+:?/+([^/]*).*", 1) AS referer_host,
       unquote_apache(referer) AS referer,
       unquote_apache(agent)
FROM access TABLESAMPLE(BUCKET 1 OUT OF 20 ON rand())
WHERE unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") IS NOT NULL;
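-- As noted above, the HBase row key starts with a fixed-width Unix time
-- stamp, so a time range maps directly onto a key range. A sketch (an
-- illustrative addition, not part of the original gist; the literal
-- timestamps are placeholders, and whether the filter is pushed down to an
-- HBase range scan depends on the Hive/HBase handler version):
SELECT count(*) AS hits
FROM access_hbase
WHERE key >= '1325376000' AND key < '1325462400';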
-- Find the 50 most popular pages on the site.
SELECT path, count(*) AS cnt
FROM access_hbase GROUP BY path
ORDER BY cnt DESC LIMIT 50;
-- Categorize our articles by year and count how many hits each year received.
SELECT pubyear, count(*)
FROM (SELECT re_extract(path, "/articles/([0-9]+)/.*", 1) AS pubyear
      FROM access_hbase) access
WHERE pubyear IS NOT NULL
GROUP BY pubyear;
@janmechtel

Great starting point for our web access log analysis. I'm puzzled by the guid() function, though; I can't find any reference for it. Could you point me in the right direction?

@janmechtel

Now that I've dug deeper, I believe you have a couple of helper UDFs. Please share them if you can: guid(), unquote_apache(), re_extract()

@focabr

focabr commented Apr 6, 2014

Hi, please share guid(), unquote_apache(), and re_extract()!
