mattbornski/apache-logs-hive.sql

## apache-logs-hive.sql
-- This is a Hive program. Hive is an SQL-like language that compiles
-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
-- Facebook, because it allows them to query enormous Hadoop data
-- stores using a language much like SQL.

-- Our logs are stored on the Hadoop Distributed File System, in the
-- directory /logs/randomhacks.net/access.  They're ordinary Apache
-- logs in *.gz format.
--
-- We want to pretend that these gzipped log files are a database table,
-- and use a regular expression to split lines into database columns.
CREATE EXTERNAL TABLE access(
  host STRING,
  identity STRING,
  user STRING,
  time STRING,
  request STRING,
  status STRING,
  size STRING,
  referer STRING,
  agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE
LOCATION '/logs/randomhacks.net/access';

-- We want to store our logs in HBase, which is designed to hold tables
-- with billions of rows and millions of columns. HBase stores rows
-- sorted by primary key, so you can efficiently read all records within
-- a given range of keys.
--
-- Here, our key is a Unix time stamp, and we assume that it always has
-- the same number of digits (hey, this was a dodgy late night hack).
-- So we could easily grab all the records from a specific time period.
--
-- We store our data in 3 column families: "m" for metadata, "r" for
-- referrer data, and "a" for user-agent data. This allows us to only
-- load a subset of columns for a given query.
CREATE EXTERNAL TABLE access_hbase(
  key STRING,          -- Unix time + ":" + unique identifier.
  host STRING,         -- The IP address of the host making the request.
  identity STRING,     -- ??? (raw log data)
  user STRING,         -- ??? (raw log data)
  time BIGINT,         -- Unix time, UTC.
  method STRING,       -- "GET", etc.
  path STRING,         -- "/logo.png", etc.
  protocol STRING,     -- "HTTP/1.1", etc.
  status SMALLINT,     -- 200, 404, etc.
  size BIGINT,         -- Response size, in bytes.
  referer_host STRING, -- "www.google.com", etc.
  referer STRING,      -- Full referrer string.
  agent STRING)        -- Full agent string.
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
  "hbase.columns.mapping" = ":key,m:host,m:identity,m:user,m:time,m:method,m:path,m:protocol,m:status,m:size,r:referer_host,r:referer,a:agent"
)
TBLPROPERTIES ("hbase.table.name" = "randomhacks_access");

-- Copy our data from raw Apache log files to HBase, cleaning it up as we go.  This is basically
-- a pseudo-SQL query which calls a few Java helpers.
--
-- Note the "TABLESAMPLE" clause, which says to pick one of every 20 records at random.
INSERT OVERWRITE TABLE access_hbase
  SELECT concat(cast(unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") AS STRING), ":", guid()) AS key,
         host,
         unquote_apache(identity),
         unquote_apache(user),
         unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]"),
         re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 1) AS method,
         re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 2) AS path,
         re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 3) AS protocol,
         cast(status AS SMALLINT) AS status,
         cast(size AS BIGINT) AS size,
         re_extract(unquote_apache(referer), "[^:]+:?/+([^/]*).*", 1) AS referer_host,
         unquote_apache(referer) AS referer,
         unquote_apache(agent)
    FROM access TABLESAMPLE(BUCKET 1 OUT OF 20 ON rand())
    WHERE unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") IS NOT NULL;

-- Find the 50 most popular pages on the site.
SELECT path, count(*) AS cnt
  FROM access_hbase GROUP BY path
  ORDER BY cnt DESC LIMIT 50;

-- Categorize our articles by year and count how many hits each year received.
SELECT pubyear, count(*)
  FROM (SELECT re_extract(path, "/articles/([0-9]+)/.*", 1) AS pubyear
          FROM access_hbase) access
  WHERE pubyear IS NOT NULL
  GROUP BY pubyear;
	-- This is a Hive program. Hive is an SQL-like language that compiles
	-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
	-- Facebook, because it allows them to query enormous Hadoop data
	-- stores using a language much like SQL.

	-- Our logs are stored on the Hadoop Distributed File System, in the
	-- directory /logs/randomhacks.net/access. They're ordinary Apache
	-- logs in *.gz format.
	--
	-- We want to pretend that these gzipped log files are a database table,
	-- and use a regular expression to split lines into database columns.
	CREATE EXTERNAL TABLE access(
	host STRING,
	identity STRING,
	user STRING,
	time STRING,
	request STRING,
	status STRING,
	size STRING,
	referer STRING,
	agent STRING)
	ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
	WITH SERDEPROPERTIES (
	"input.regex" = "([^ ]) ([^ ]) ([^ ]) (-\|\\[[^\\]]\\]) ([^ \"]\|\"[^\"]\") (-\|[0-9]) (-\|[0-9])(?: ([^ \"]\|\"[^\"]\") ([^ \"]\|\"[^\"]\"))?",
	"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
	)
	STORED AS TEXTFILE
	LOCATION '/logs/randomhacks.net/access';

	-- We want to store our logs in HBase, which is designed to hold tables
	-- with billions of rows and millions of columns. HBase stores rows
	-- sorted by primary key, so you can efficiently read all records within
	-- a given range of keys.
	--
	-- Here, our key is a Unix time stamp, and we assume that it always has
	-- the same number of digits (hey, this was a dodgy late night hack).
	-- So we could easily grab all the records from a specific time period.
	--
	-- We store our data in 3 column families: "m" for metadata, "r" for
	-- referrer data, and "a" for user-agent data. This allows us to only
	-- load a subset of columns for a given query.
	CREATE EXTERNAL TABLE access_hbase(
	key STRING, -- Unix time + ":" + unique identifier.
	host STRING, -- The IP address of the host making the request.
	identity STRING, -- ??? (raw log data)
	user STRING, -- ??? (raw log data)
	time BIGINT, -- Unix time, UTC.
	method STRING, -- "GET", etc.
	path STRING, -- "/logo.png", etc.
	protocol STRING, -- "HTTP/1.1", etc.
	status SMALLINT, -- 200, 404, etc.
	size BIGINT, -- Response size, in bytes.
	referer_host STRING, -- "www.google.com", etc.
	referer STRING, -- Full referrer string.
	agent STRING) -- Full agent string.
	STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
	WITH SERDEPROPERTIES (
	"hbase.columns.mapping" = ":key,m:host,m:identity,m:user,m:time,m:method,m:path,m:protocol,m:status,m:size,r:referer_host,r:referer,a:agent"
	)
	TBLPROPERTIES ("hbase.table.name" = "randomhacks_access");

	-- Copy our data from raw Apache log files to HBase, cleaning it up as we go. This is basically
	-- a pseudo-SQL query which calls a few Java helpers.
	--
	-- Note the "TABLESAMPLE" clause, which says to pick one of every 20 records at random.
	INSERT OVERWRITE TABLE access_hbase
	SELECT concat(cast(unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") AS STRING), ":", guid()) AS key,
	host,
	unquote_apache(identity),
	unquote_apache(user),
	unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]"),
	re_extract(unquote_apache(request), "([^ ]) ([^ ]) ([^\"]*)", 1) AS method,
	re_extract(unquote_apache(request), "([^ ]) ([^ ]) ([^\"]*)", 2) AS path,
	re_extract(unquote_apache(request), "([^ ]) ([^ ]) ([^\"]*)", 3) AS protocol,
	cast(status AS SMALLINT) AS status,
	cast(size AS BIGINT) AS size,
	re_extract(unquote_apache(referer), "[^:]+:?/+([^/]).", 1) AS referer_host,
	unquote_apache(referer) AS referer,
	unquote_apache(agent)
	FROM access TABLESAMPLE(BUCKET 1 OUT OF 20 ON rand())
	WHERE unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") IS NOT NULL;

	-- Find the 50 most popular pages on the site.
	SELECT path, count(*) AS cnt
	FROM access_hbase GROUP BY path
	ORDER BY cnt DESC LIMIT 50;

	-- Categorize our articles by year and count how many hits each year received.
	SELECT pubyear, count(*)
	FROM (SELECT re_extract(path, "/articles/([0-9]+)/.*", 1) AS pubyear
	FROM access_hbase) access
	WHERE pubyear IS NOT NULL
	GROUP BY pubyear;