Skip to content

Instantly share code, notes, and snippets.

@mhawksey
Created January 29, 2012 09:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mhawksey/1698071 to your computer and use it in GitHub Desktop.
Save mhawksey/1698071 to your computer and use it in GitHub Desktop.
Google Refine CaPReT data to add location
// What this does - need to replace YOUR_KEY_HERE with one from http://ipinfodb.com/
// Create column domain at index 5 based on column TEXT SOURCE using expression grel:split(value,"/")[1]
// Reorder rows
// Create column dummy at index 6 based on column domain using expression grel:value
// Blank down cells in column dummy
// Create column source_location at index 6 by fetching URLs based on column domain using expression grel:"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_KEY_HERE&ip="+value
// Fill down cells in column source_location
// Remove column dummy
// Create column target_location at index 2 by fetching URLs based on column IP using expression grel:"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_KEY_HERE&ip="+value
// Create column text_decoded at index 9 based on column TEXT COPIED using expression grel:unescape(value,"url")
// Create column timestamp at index 12 based on column TEXT COPIED ON using expression jython:import time return int(time.mktime(time.strptime(value, '%a %b %d %Y %H:%M:%S')))
// Reorder rows
// Create column target_long at index 3 based on column target_location using expression grel:split(value,";")[-2]
// Create column target_lat at index 3 based on column target_location using expression grel:split(value,";")[-3]
// Create column source_long at index 10 based on column source_location using expression grel:split(value,";")[-2]
// Create column source_lat at index 10 based on column source_location using expression grel:split(value,";")[-3]
[
{
"op": "core/column-addition",
"description": "Create column domain at index 5 based on column TEXT SOURCE using expression grel:split(value,\"/\")[1]",
"engineConfig": {
"facets": [
{
"invert": false,
"expression": "isBlank(value)",
"selectError": false,
"omitError": false,
"selectBlank": false,
"name": "TEXT SOURCE",
"omitBlank": false,
"columnName": "TEXT SOURCE",
"type": "list",
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
]
},
{
"invert": false,
"expression": "isBlank(value)",
"selectError": false,
"omitError": false,
"selectBlank": false,
"name": "dummy",
"omitBlank": false,
"columnName": "dummy",
"type": "list",
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
]
}
],
"mode": "row-based"
},
"newColumnName": "domain",
"columnInsertIndex": 5,
"baseColumnName": "TEXT SOURCE",
"expression": "grel:split(value,\"/\")[1]",
"onError": "set-to-blank"
},
{
"op": "core/row-reorder",
"description": "Reorder rows",
"mode": "record-based",
"sorting": {
"criteria": [
{
"reverse": false,
"caseSensitive": false,
"column": "domain",
"valueType": "string",
"blankPosition": 2,
"errorPosition": 1
}
]
}
},
{
"op": "core/column-addition",
"description": "Create column dummy at index 6 based on column domain using expression grel:value",
"engineConfig": {
"facets": [
{
"invert": false,
"expression": "isBlank(value)",
"selectError": false,
"omitError": false,
"selectBlank": false,
"name": "TEXT SOURCE",
"omitBlank": false,
"columnName": "TEXT SOURCE",
"type": "list",
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
]
},
{
"invert": false,
"expression": "isBlank(value)",
"selectError": false,
"omitError": false,
"selectBlank": false,
"name": "dummy",
"omitBlank": false,
"columnName": "dummy",
"type": "list",
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
]
}
],
"mode": "row-based"
},
"newColumnName": "dummy",
"columnInsertIndex": 6,
"baseColumnName": "domain",
"expression": "grel:value",
"onError": "set-to-blank"
},
{
"op": "core/blank-down",
"description": "Blank down cells in column dummy",
"engineConfig": {
"facets": [
{
"invert": false,
"expression": "isBlank(value)",
"selectError": false,
"omitError": false,
"selectBlank": false,
"name": "TEXT SOURCE",
"omitBlank": false,
"columnName": "TEXT SOURCE",
"type": "list",
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
]
},
{
"invert": false,
"expression": "isBlank(value)",
"selectError": false,
"omitError": false,
"selectBlank": false,
"name": "dummy",
"omitBlank": false,
"columnName": "dummy",
"type": "list",
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
]
}
],
"mode": "row-based"
},
"columnName": "dummy"
},
{
"op": "core/column-addition-by-fetching-urls",
"description": "Create column source_location at index 6 by fetching URLs based on column domain using expression grel:\"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_KEY_HERE&ip=\"+value",
"engineConfig": {
"facets": [
{
"invert": false,
"expression": "isBlank(value)",
"selectError": false,
"omitError": false,
"selectBlank": false,
"name": "dummy",
"omitBlank": false,
"columnName": "dummy",
"type": "list",
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
]
}
],
"mode": "row-based"
},
"newColumnName": "source_location",
"columnInsertIndex": 6,
"baseColumnName": "domain",
"urlExpression": "grel:\"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_KEY_HERE&ip=\"+value",
"onError": "set-to-blank",
"delay": 2000
},
{
"op": "core/fill-down",
"description": "Fill down cells in column source_location",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "source_location"
},
{
"op": "core/column-removal",
"description": "Remove column dummy",
"columnName": "dummy"
},
{
"op": "core/column-addition-by-fetching-urls",
"description": "Create column target_location at index 2 by fetching URLs based on column IP using expression grel:\"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_API_KEYip=\"+value",
"engineConfig": {
"facets": [
{
"invert": false,
"expression": "isBlank(value)",
"selectError": false,
"omitError": false,
"selectBlank": false,
"name": "TEXT SOURCE",
"omitBlank": false,
"columnName": "TEXT SOURCE",
"type": "list",
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
]
}
],
"mode": "row-based"
},
"newColumnName": "target_location",
"columnInsertIndex": 2,
"baseColumnName": "IP",
"urlExpression": "grel:\"http://api.ipinfodb.com/v3/ip-city/?key=YOU_API_KEY&ip=\"+value",
"onError": "set-to-blank",
"delay": 1000
},
{
"op": "core/column-addition",
"description": "Create column text_decoded at index 9 based on column TEXT COPIED using expression grel:unescape(value,\"url\")",
"engineConfig": {
"facets": [
{
"invert": false,
"expression": "isBlank(value)",
"selectError": false,
"omitError": false,
"selectBlank": false,
"name": "TEXT SOURCE",
"omitBlank": false,
"columnName": "TEXT SOURCE",
"type": "list",
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
]
}
],
"mode": "row-based"
},
"newColumnName": "text_decoded",
"columnInsertIndex": 9,
"baseColumnName": "TEXT COPIED",
"expression": "grel:unescape(value,\"url\")",
"onError": "set-to-blank"
},
{
"op": "core/column-addition",
"description": "Create column timestamp at index 12 based on column TEXT COPIED ON using expression jython:import time \nreturn int(time.mktime(time.strptime(value, '%a %b %d %Y %H:%M:%S')))",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"newColumnName": "timestamp",
"columnInsertIndex": 12,
"baseColumnName": "TEXT COPIED ON",
"expression": "jython:import time \nreturn int(time.mktime(time.strptime(value, '%a %b %d %Y %H:%M:%S')))",
"onError": "set-to-blank"
},
{
"op": "core/row-reorder",
"description": "Reorder rows",
"mode": "record-based",
"sorting": {
"criteria": [
{
"reverse": false,
"column": "timestamp",
"valueType": "number",
"blankPosition": 2,
"errorPosition": 1
}
]
}
},
{
"op": "core/column-addition",
"description": "Create column target_long at index 3 based on column target_location using expression grel:split(value,\";\")[-2]",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"newColumnName": "target_long",
"columnInsertIndex": 3,
"baseColumnName": "target_location",
"expression": "grel:split(value,\";\")[-2]",
"onError": "set-to-blank"
},
{
"op": "core/column-addition",
"description": "Create column target_lat at index 3 based on column target_location using expression grel:split(value,\";\")[-3]",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"newColumnName": "target_lat",
"columnInsertIndex": 3,
"baseColumnName": "target_location",
"expression": "grel:split(value,\";\")[-3]",
"onError": "set-to-blank"
},
{
"op": "core/column-addition",
"description": "Create column source_long at index 10 based on column source_location using expression grel:split(value,\";\")[-2]",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"newColumnName": "source_long",
"columnInsertIndex": 10,
"baseColumnName": "source_location",
"expression": "grel:split(value,\";\")[-2]",
"onError": "set-to-blank"
},
{
"op": "core/column-addition",
"description": "Create column source_lat at index 10 based on column source_location using expression grel:split(value,\";\")[-2]",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"newColumnName": "source_lat",
"columnInsertIndex": 10,
"baseColumnName": "source_location",
"expression": "grel:split(value,\";\")[-3]",
"onError": "set-to-blank"
}
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment