Created
February 20, 2020 20:10
-
-
Save felixlohmeier/a5a893190e4aa8e26091664908d04e20 to your computer and use it in GitHub Desktop.
jython-cross.py: OpenRefine text transform to join multi-valued cells with multi-valued cells from another project (Python / Jython)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
gift | recipient | |
---|---|---|
lamp | mary | |
clock | noël,sørina | |
sweets | mary,noël | |
wine | sørina,anonymous | |
cheese | peter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# jython-cross.py, Felix Lohmeier, v0.1, 2020-02-20 | |
# OpenRefine text transform to join multi-valued cells with multi-valued cells from another project | |
# script language: Python / Jython | |
# supported versions: OpenRefine 2.7 - 3.2 | |
# warning: facets and filters will break the script (it depends on rowIndex 0) | |
# config | |
host = u"localhost" | |
port = u"3333" | |
foreign_project = u"my address book" | |
foreign_column_match = u"friend" | |
foreign_column_value = u"address" | |
foreign_separator_match = u";" | |
foreign_separator_value = u"|" | |
foreign_separator_new = u"; " | |
origin_separator = u"," | |
origin_separator_new = u"\u241F" | |
placeholder = u"\u2400" | |
if rowIndex == 0: | |
import sys | |
import os | |
import subprocess | |
# download openrefine-client | |
ver = sys.platform.lower() | |
if ver.startswith('java'): | |
import java.lang | |
ver = java.lang.System.getProperty("os.name").lower() | |
if ver.startswith('win32'): | |
client = "openrefine-client_0-3-8_windows.exe" | |
if (not os.path.isfile(client)): | |
import urllib | |
urllib.urlretrieve("https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.8/openrefine-client_0-3-8_windows.exe", client) | |
elif ver.startswith('darwin'): | |
client = "./openrefine-client_0-3-8_mac" | |
if (not os.path.isfile(client)): | |
import urllib | |
import subprocess | |
urllib.urlretrieve("https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.8/openrefine-client_0-3-8_mac", client) | |
subprocess.call(["chmod", "+x", client]) | |
elif ver.startswith('linux'): | |
client = "./openrefine-client_0-3-8_linux" | |
if (not os.path.isfile(client)): | |
import urllib | |
import subprocess | |
urllib.urlretrieve("https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.8/openrefine-client_0-3-8_linux", client) | |
subprocess.call(["chmod", "+x", client]) | |
# export foreign project | |
import csv | |
template = u"{{if(and(isNonBlank(cells['" + foreign_column_match + u"'].value), isNonBlank(cells['" + foreign_column_value + u"'].value)), forEach(cells['" + foreign_column_match + u"'].value.split('" + foreign_separator_match + "'), v, forEach(cells['" + foreign_column_value + u"'].value.split('" + foreign_separator_value + "'), x, v.escape('csv') + ',' + x.escape('csv')).join('\n')).join('\n') + '\n', '')}}" | |
row_separator = u"" | |
export = csv.reader(subprocess.check_output([client, u"-H", host, u"-P", port, foreign_project, u"--template", template, u"--rowSeparator", row_separator]).splitlines()) | |
# create lookup table from export | |
from collections import defaultdict | |
d = defaultdict(list) | |
[d[row[0].decode("utf8")].append(row[1].decode("utf8")) for row in export] | |
global lookup_table | |
lookup_table = {k: foreign_separator_new.join(v) for k,v in d.iteritems()} | |
# return value from lookup table | |
return origin_separator_new.join([lookup_table[x] if x in lookup_table else placeholder for x in value.split(origin_separator)]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
friend | address | |
---|---|---|
noël | 120 Main St.|20 nuclear plant (☢) | |
mary | 0 peace location ☮|1 Avenue des Champs-Élysées | |
mary;noël | mary's and noël's second address | |
sørina | 17 Morning Crescent|69 Yin Yang (☯) | |
sørina | sørina's second address | |
peter |
Author
felixlohmeier
commented
Feb 20, 2020
@felixlohmeier an iter-generator for the win ! Great job here !
Cross function in OpenRefine 3.4 beta (cf. OpenRefine/OpenRefine#2456) now supports values (instead of cells only). Splitting the values in the base column is now possible with GREL:
forEach(
value.split(","),
v,
forEach(
forNonBlank(
v.cross(
"my-address-book",
"friend"
),
x,
x,
[null]
),
r,
forNonBlank(
r.cells["address"].value,
x,
x,
"␀"
)
).join("; ")
).join("␟")
The jython script in this gist has only two additional features left:
- splitting values in the target column
- replace separator from target column
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment