Skip to content

Instantly share code, notes, and snippets.

View alexanderdean's full-sized avatar

Alexander Dean alexanderdean

View GitHub Profile
/**
* Loader for Thrift SnowplowRawEvent objects which
* are inbound as a simple Byte Array.
*/
object ThriftByteArrayLoader extends CollectorLoader[Array[Byte]] {
private val thriftDeserializer = new TDeserializer
/**
* Converts the source string into a MaybeCanonicalInput.
@alexanderdean
alexanderdean / video_played.json
Last active August 29, 2015 14:02
Example JSON Schema for a video_played event
{
"$schema": "http://json-schema.org/schema#",
"description": "Schema for a video_played event",
"type": "object",
"properties": {
"length": {
"type": "number"
},
"id": {
"type": "string"
{
"schema": "iglu:com.channel2.vod/video_played/jsonschema/1-0-0",
"data": {
"length": 213,
"id": "hY7gQrO"
}
}
{
"$schema": "http://iglucentral.com/schemas/com.snowplowanalytics.self-desc/schema/jsonschema/1-0-0#",
"description": "Schema for a video_played event",
"self": {
"vendor": "com.channel2.vod",
"name": "video_played",
"format": "jsonschema",
"version": "1-0-0"
},
"type": "object",
input_lines = LOAD '$INPUT' AS (line:chararray);
-- Extract words from each line and put them into a pig bag
-- datatype, then flatten the bag to get one word on each row
words = FOREACH input_lines GENERATE FLATTEN(TOKENIZE(line)) AS word;
-- keep only tokens made up entirely of word characters (drops empty, whitespace and punctuation tokens)
filtered_words = FILTER words BY word MATCHES '\\w+';
-- create a group for each word
// Request body expected to validate against this JSON Schema
private val PayloadDataSchema =
SchemaCriterion("com.snowplowanalytics.snowplow", "payload_data", "jsonschema", 1, 0)
// Check JSON is a payload_data version 1-0-*, and verify it against the schema
val body: ValidatedNel[JsonNode] = bodyNode.verifySchemaAndValidate(schemaCriterion)
@alexanderdean
alexanderdean / gist:1427099
Created December 3, 2011 13:17
For DictShield issue #45
#!/usr/bin/env python
from dictshield.document import Document, EmbeddedDocument
from dictshield.base import UUIDField, ShieldException
from dictshield.fields import (StringField,
DateTimeField,
IntField,
ListField,
FloatField,
EmbeddedDocumentField)
@alexanderdean
alexanderdean / gist:4001236
Created November 2, 2012 12:55
Intermittent error running SnowPlow clojure-collector via `lein ring server-headless`
╭─alex@nasqueron ~/Development/SnowPlow/snowplow/2-collectors/clojure-collector ‹feature/clj-collector›
╰─$ lein ring server-headless
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
2012-11-02 12:53:35.833:INFO:oejs.Server:jetty-7.6.1.v20120215Started server on port 3000
2012-11-02 12:53:35.912:INFO:oejs.AbstractConnector:Started SelectChannelConnector@0.0.0.0:3000
^C% ╭─alex@nasqueron ~/Development/SnowPlow/snowplow/2-collectors/clojure-collector ‹feature/clj-collector›
╰─$ lein ring server-headless
@alexanderdean
alexanderdean / fetch_and_combine.py
Created November 15, 2012 08:31 — forked from larsyencken/fetch_and_combine.py
Aggregating CloudFront logs
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# fetch_and_combine.py
#
"""
Scans CloudFront logs in an S3 bucket for any that are new. Combines log files
into a single local file per hour. If logs for multiple CloudFront
distributions are present, combines them all.
@alexanderdean
alexanderdean / gist:4205020
Created December 4, 2012 15:12
ICE table defn with fewer lookups & fatter fields
CREATE TABLE IF NOT EXISTS events_fewer_lookups (
-- App
`app_id` varchar(255) comment 'lookup', -- 'lookup' is a varchar optimisation for Infobright
`platform` varchar(50) comment 'lookup',
-- Date/time
`dt` date,
`tm` time,
-- Event
`event` varchar(255) comment 'lookup', -- Renamed in 0.0.3
`event_id` varchar(38), -- Added in 0.0.3