Created
January 29, 2012 09:48
-
-
Save mhawksey/1698071 to your computer and use it in GitHub Desktop.
Google Refine CaPReT data to add location
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// What this does - need to replace YOUR_KEY_HERE with one from http://ipinfodb.com/ | |
// Create column domain at index 5 based on column TEXT SOURCE using expression grel:split(value,"/")[1] | |
// Reorder rows | |
// Create column dummy at index 6 based on column domain using expression grel:value | |
// Blank down cells in column dummy | |
// Create column source_location at index 6 by fetching URLs based on column domain using expression grel:"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_KEY_HERE&ip="+value | |
// Fill down cells in column source_location | |
// Remove column dummy | |
// Create column target_location at index 2 by fetching URLs based on column IP using expression grel:"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_KEY_HERE&ip="+value | |
// Create column text_decoded at index 9 based on column TEXT COPIED using expression grel:unescape(value,"url") | |
// Create column timestamp at index 12 based on column TEXT COPIED ON using expression jython:import time return int(time.mktime(time.strptime(value, '%a %b %d %Y %H:%M:%S'))) | |
// Reorder rows | |
// Create column target_long at index 3 based on column target_location using expression grel:split(value,";")[-2] | |
// Create column target_lat at index 3 based on column target_location using expression grel:split(value,";")[-3] | |
// Create column source_long at index 10 based on column source_location using expression grel:split(value,";")[-2] | |
// Create column source_lat at index 10 based on column source_location using expression grel:split(value,";")[-3] | |
[ | |
{ | |
"op": "core/column-addition", | |
"description": "Create column domain at index 5 based on column TEXT SOURCE using expression grel:split(value,\"/\")[1]", | |
"engineConfig": { | |
"facets": [ | |
{ | |
"invert": false, | |
"expression": "isBlank(value)", | |
"selectError": false, | |
"omitError": false, | |
"selectBlank": false, | |
"name": "TEXT SOURCE", | |
"omitBlank": false, | |
"columnName": "TEXT SOURCE", | |
"type": "list", | |
"selection": [ | |
{ | |
"v": { | |
"v": false, | |
"l": "false" | |
} | |
} | |
] | |
}, | |
{ | |
"invert": false, | |
"expression": "isBlank(value)", | |
"selectError": false, | |
"omitError": false, | |
"selectBlank": false, | |
"name": "dummy", | |
"omitBlank": false, | |
"columnName": "dummy", | |
"type": "list", | |
"selection": [ | |
{ | |
"v": { | |
"v": false, | |
"l": "false" | |
} | |
} | |
] | |
} | |
], | |
"mode": "row-based" | |
}, | |
"newColumnName": "domain", | |
"columnInsertIndex": 5, | |
"baseColumnName": "TEXT SOURCE", | |
"expression": "grel:split(value,\"/\")[1]", | |
"onError": "set-to-blank" | |
}, | |
{ | |
"op": "core/row-reorder", | |
"description": "Reorder rows", | |
"mode": "record-based", | |
"sorting": { | |
"criteria": [ | |
{ | |
"reverse": false, | |
"caseSensitive": false, | |
"column": "domain", | |
"valueType": "string", | |
"blankPosition": 2, | |
"errorPosition": 1 | |
} | |
] | |
} | |
}, | |
{ | |
"op": "core/column-addition", | |
"description": "Create column dummy at index 6 based on column domain using expression grel:value", | |
"engineConfig": { | |
"facets": [ | |
{ | |
"invert": false, | |
"expression": "isBlank(value)", | |
"selectError": false, | |
"omitError": false, | |
"selectBlank": false, | |
"name": "TEXT SOURCE", | |
"omitBlank": false, | |
"columnName": "TEXT SOURCE", | |
"type": "list", | |
"selection": [ | |
{ | |
"v": { | |
"v": false, | |
"l": "false" | |
} | |
} | |
] | |
}, | |
{ | |
"invert": false, | |
"expression": "isBlank(value)", | |
"selectError": false, | |
"omitError": false, | |
"selectBlank": false, | |
"name": "dummy", | |
"omitBlank": false, | |
"columnName": "dummy", | |
"type": "list", | |
"selection": [ | |
{ | |
"v": { | |
"v": false, | |
"l": "false" | |
} | |
} | |
] | |
} | |
], | |
"mode": "row-based" | |
}, | |
"newColumnName": "dummy", | |
"columnInsertIndex": 6, | |
"baseColumnName": "domain", | |
"expression": "grel:value", | |
"onError": "set-to-blank" | |
}, | |
{ | |
"op": "core/blank-down", | |
"description": "Blank down cells in column dummy", | |
"engineConfig": { | |
"facets": [ | |
{ | |
"invert": false, | |
"expression": "isBlank(value)", | |
"selectError": false, | |
"omitError": false, | |
"selectBlank": false, | |
"name": "TEXT SOURCE", | |
"omitBlank": false, | |
"columnName": "TEXT SOURCE", | |
"type": "list", | |
"selection": [ | |
{ | |
"v": { | |
"v": false, | |
"l": "false" | |
} | |
} | |
] | |
}, | |
{ | |
"invert": false, | |
"expression": "isBlank(value)", | |
"selectError": false, | |
"omitError": false, | |
"selectBlank": false, | |
"name": "dummy", | |
"omitBlank": false, | |
"columnName": "dummy", | |
"type": "list", | |
"selection": [ | |
{ | |
"v": { | |
"v": false, | |
"l": "false" | |
} | |
} | |
] | |
} | |
], | |
"mode": "row-based" | |
}, | |
"columnName": "dummy" | |
}, | |
{ | |
"op": "core/column-addition-by-fetching-urls", | |
"description": "Create column source_location at index 6 by fetching URLs based on column domain using expression grel:\"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_KEY_HERE&ip=\"+value", | |
"engineConfig": { | |
"facets": [ | |
{ | |
"invert": false, | |
"expression": "isBlank(value)", | |
"selectError": false, | |
"omitError": false, | |
"selectBlank": false, | |
"name": "dummy", | |
"omitBlank": false, | |
"columnName": "dummy", | |
"type": "list", | |
"selection": [ | |
{ | |
"v": { | |
"v": false, | |
"l": "false" | |
} | |
} | |
] | |
} | |
], | |
"mode": "row-based" | |
}, | |
"newColumnName": "source_location", | |
"columnInsertIndex": 6, | |
"baseColumnName": "domain", | |
"urlExpression": "grel:\"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_KEY_HERE&ip=\"+value", | |
"onError": "set-to-blank", | |
"delay": 2000 | |
}, | |
{ | |
"op": "core/fill-down", | |
"description": "Fill down cells in column source_location", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"columnName": "source_location" | |
}, | |
{ | |
"op": "core/column-removal", | |
"description": "Remove column dummy", | |
"columnName": "dummy" | |
}, | |
{ | |
"op": "core/column-addition-by-fetching-urls", | |
"description": "Create column target_location at index 2 by fetching URLs based on column IP using expression grel:\"http://api.ipinfodb.com/v3/ip-city/?key=YOUR_API_KEYip=\"+value", | |
"engineConfig": { | |
"facets": [ | |
{ | |
"invert": false, | |
"expression": "isBlank(value)", | |
"selectError": false, | |
"omitError": false, | |
"selectBlank": false, | |
"name": "TEXT SOURCE", | |
"omitBlank": false, | |
"columnName": "TEXT SOURCE", | |
"type": "list", | |
"selection": [ | |
{ | |
"v": { | |
"v": false, | |
"l": "false" | |
} | |
} | |
] | |
} | |
], | |
"mode": "row-based" | |
}, | |
"newColumnName": "target_location", | |
"columnInsertIndex": 2, | |
"baseColumnName": "IP", | |
"urlExpression": "grel:\"http://api.ipinfodb.com/v3/ip-city/?key=YOU_API_KEY&ip=\"+value", | |
"onError": "set-to-blank", | |
"delay": 1000 | |
}, | |
{ | |
"op": "core/column-addition", | |
"description": "Create column text_decoded at index 9 based on column TEXT COPIED using expression grel:unescape(value,\"url\")", | |
"engineConfig": { | |
"facets": [ | |
{ | |
"invert": false, | |
"expression": "isBlank(value)", | |
"selectError": false, | |
"omitError": false, | |
"selectBlank": false, | |
"name": "TEXT SOURCE", | |
"omitBlank": false, | |
"columnName": "TEXT SOURCE", | |
"type": "list", | |
"selection": [ | |
{ | |
"v": { | |
"v": false, | |
"l": "false" | |
} | |
} | |
] | |
} | |
], | |
"mode": "row-based" | |
}, | |
"newColumnName": "text_decoded", | |
"columnInsertIndex": 9, | |
"baseColumnName": "TEXT COPIED", | |
"expression": "grel:unescape(value,\"url\")", | |
"onError": "set-to-blank" | |
}, | |
{ | |
"op": "core/column-addition", | |
"description": "Create column timestamp at index 12 based on column TEXT COPIED ON using expression jython:import time \nreturn int(time.mktime(time.strptime(value, '%a %b %d %Y %H:%M:%S')))", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"newColumnName": "timestamp", | |
"columnInsertIndex": 12, | |
"baseColumnName": "TEXT COPIED ON", | |
"expression": "jython:import time \nreturn int(time.mktime(time.strptime(value, '%a %b %d %Y %H:%M:%S')))", | |
"onError": "set-to-blank" | |
}, | |
{ | |
"op": "core/row-reorder", | |
"description": "Reorder rows", | |
"mode": "record-based", | |
"sorting": { | |
"criteria": [ | |
{ | |
"reverse": false, | |
"column": "timestamp", | |
"valueType": "number", | |
"blankPosition": 2, | |
"errorPosition": 1 | |
} | |
] | |
} | |
}, | |
{ | |
"op": "core/column-addition", | |
"description": "Create column target_long at index 3 based on column target_location using expression grel:split(value,\";\")[-2]", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"newColumnName": "target_long", | |
"columnInsertIndex": 3, | |
"baseColumnName": "target_location", | |
"expression": "grel:split(value,\";\")[-2]", | |
"onError": "set-to-blank" | |
}, | |
{ | |
"op": "core/column-addition", | |
"description": "Create column target_lat at index 3 based on column target_location using expression grel:split(value,\";\")[-3]", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"newColumnName": "target_lat", | |
"columnInsertIndex": 3, | |
"baseColumnName": "target_location", | |
"expression": "grel:split(value,\";\")[-3]", | |
"onError": "set-to-blank" | |
}, | |
{ | |
"op": "core/column-addition", | |
"description": "Create column source_long at index 10 based on column source_location using expression grel:split(value,\";\")[-2]", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"newColumnName": "source_long", | |
"columnInsertIndex": 10, | |
"baseColumnName": "source_location", | |
"expression": "grel:split(value,\";\")[-2]", | |
"onError": "set-to-blank" | |
}, | |
{ | |
"op": "core/column-addition", | |
"description": "Create column source_lat at index 10 based on column source_location using expression grel:split(value,\";\")[-2]", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"newColumnName": "source_lat", | |
"columnInsertIndex": 10, | |
"baseColumnName": "source_location", | |
"expression": "grel:split(value,\";\")[-3]", | |
"onError": "set-to-blank" | |
} | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment