Skip to content

Instantly share code, notes, and snippets.

@mdoering
Created February 17, 2015 15:18
Show Gist options
  • Save mdoering/a42ab1f9b76e70309633 to your computer and use it in GitHub Desktop.
Save mdoering/a42ab1f9b76e70309633 to your computer and use it in GitHub Desktop.
package org.gbif.markus.udf;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.base.Strings;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
 * A simple UDF for Hive that classifies an identifier string by its syntactic form and
 * returns the name of the matching {@link IdentifierTypeEnum} constant (for generic URNs
 * the value {@code "URN:<namespace>"} is returned instead).
 *
 * <p>Checks run in a fixed order and the first match wins: number, UUID, DOI, LSID,
 * URL/URN, triplet. Anything else is reported as {@code OTHER}; null or empty input is
 * reported as {@code NONE}.
 */
@Description(
  name = "identifierType",
  value = "_FUNC_(field)")
public class IdentifierTypeUDF extends UDF {

  /** Output holder; every call to {@link #evaluate(Text)} overwrites and returns this same instance. */
  private final Text text = new Text();

  /**
   * The identifier categories. {@code URN}, {@code HANDLER} and {@code ARK} are declared
   * but are not produced by any check in this class.
   */
  public enum IdentifierTypeEnum {
    URL, URN,
    INTEGER, DOUBLE,
    UUID, UUID_URN, UUID_PREFIX,
    DOI, DOI_NAME, DOI_URN, DOI_HTTP,
    LSID, LSID_URN, LSID_HTTP,
    TRIPLET,
    HANDLER, ARK,
    OTHER, NONE
  }

  // Optionally signed run of ASCII digits of any length, so values beyond the int range
  // are still recognised as integers.
  private static final Pattern INTEGER_PATTERN = Pattern.compile("[+-]?\\d+");

  // Plain decimal literal with optional fraction and exponent. A pattern is used rather than
  // Double.valueOf because the latter also accepts "NaN", "Infinity", hexadecimal floats,
  // type suffixes such as "1f" and surrounding whitespace.
  private static final Pattern DECIMAL_PATTERN =
      Pattern.compile("[+-]?(?:\\d+(?:\\.\\d*)?|\\.\\d+)(?:[eE][+-]?\\d+)?");

  // Strict 8-4-4-4-12 hexadecimal layout. UUID.fromString is not used because it also
  // accepts shortened groups such as "1-2-3-4-5".
  private static final Pattern UUID_PATTERN = Pattern.compile(
      "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", Pattern.CASE_INSENSITIVE);
  private static final Pattern UUID_URN_PATTERN =
      Pattern.compile("^urn:uuid:[a-f0-9-]+$", Pattern.CASE_INSENSITIVE);
  private static final Pattern UUID_PREFIX_PATTERN =
      Pattern.compile("^uuid:[a-f0-9-]+$", Pattern.CASE_INSENSITIVE);

  // Shared tail of all DOI patterns: optional leading spaces, "10.<digits>[.<digits>...]/<suffix>".
  private static final String DOI_TAIL = " *10(\\.[0-9]+)+/.+$";
  private static final Pattern DOI_PATTERN =
      Pattern.compile("^doi:" + DOI_TAIL, Pattern.CASE_INSENSITIVE);
  private static final Pattern DOI_NAME_PATTERN =
      Pattern.compile("^" + DOI_TAIL, Pattern.CASE_INSENSITIVE);
  private static final Pattern DOI_URN_PATTERN =
      Pattern.compile("^urn:doi:" + DOI_TAIL, Pattern.CASE_INSENSITIVE);
  private static final Pattern DOI_HTTP_PATTERN = Pattern.compile(
      "^https?://(dx\\.)?doi\\.org/" + "(urn:)?(doi:)?" + DOI_TAIL, Pattern.CASE_INSENSITIVE);

  private static final Pattern LSID_PATTERN =
      Pattern.compile("^lsid:.+$", Pattern.CASE_INSENSITIVE);
  private static final Pattern LSID_URN_PATTERN =
      Pattern.compile("^urn:lsid:.+$", Pattern.CASE_INSENSITIVE);
  // Host dots are escaped so only the literal host matches; https is accepted like for DOIs.
  private static final Pattern LSID_HTTP_PATTERN = Pattern.compile(
      "^https?://lsid\\.tdwg\\.org/(summary/)?" + "(urn:)?lsid:.+$", Pattern.CASE_INSENSITIVE);

  // Requires a non-empty path after the host part.
  private static final Pattern URL_PATTERN =
      Pattern.compile("^http(s?)://.+/.+$", Pattern.CASE_INSENSITIVE);
  // Group 1 captures the URN namespace.
  private static final Pattern URN_PATTERN =
      Pattern.compile("^urn:([a-z]+):.+$", Pattern.CASE_INSENSITIVE);
  // Three parts separated by a space, colon, dot or hyphen; the last part may contain anything.
  private static final Pattern TRIPLET_PATTERN =
      Pattern.compile("^(\\w+)[ :.-](\\w+)[ :.-](.+)$", Pattern.CASE_INSENSITIVE);

  /**
   * Classifies the given identifier.
   *
   * @param field the identifier to classify, may be null
   * @return the shared {@link Text} instance holding the classification; {@code NONE} for
   *         null or empty input, {@code OTHER} when no check matches
   */
  public Text evaluate(Text field) {
    final String val = field == null ? null : field.toString();
    if (Strings.isNullOrEmpty(val)) {
      set(IdentifierTypeEnum.NONE);
      return text;
    }

    // Order matters: the specific URN/HTTP forms of UUID, DOI and LSID must be tested
    // before the generic URL/URN check, and the permissive triplet check comes last.
    final boolean classified = tryNumber(val)
        || tryUUID(val)
        || tryDOI(val)
        || tryLSID(val)
        || tryUrlOrUrn(val)
        || tryTriplet(val);
    if (!classified) {
      set(IdentifierTypeEnum.OTHER);
    }
    return text;
  }

  /** Stores the name of the given type in the shared output holder. */
  private void set(IdentifierTypeEnum type) {
    text.set(type.name());
  }

  /**
   * Recognises http(s) URLs and generic URNs. For a URN the output is
   * {@code "URN:" + namespace} with the namespace exactly as written in the input.
   *
   * @return true if the value was classified
   */
  private boolean tryUrlOrUrn(String val) {
    if (URL_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.URL);
      return true;
    }
    final Matcher urnMatcher = URN_PATTERN.matcher(val);
    if (urnMatcher.matches()) {
      text.set("URN:" + urnMatcher.group(1));
      return true;
    }
    return false;
  }

  /**
   * Recognises LSIDs in URN, HTTP resolver and bare {@code lsid:} form.
   *
   * @return true if the value was classified
   */
  private boolean tryLSID(String val) {
    if (LSID_URN_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.LSID_URN);
    } else if (LSID_HTTP_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.LSID_HTTP);
    } else if (LSID_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.LSID);
    } else {
      return false;
    }
    return true;
  }

  /**
   * Recognises three-part identifiers separated by space, colon, dot or hyphen.
   *
   * @return true if the value was classified
   */
  private boolean tryTriplet(String val) {
    if (TRIPLET_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.TRIPLET);
      return true;
    }
    return false;
  }

  /**
   * Recognises canonical UUIDs as well as {@code urn:uuid:} and {@code uuid:} prefixed values.
   *
   * @return true if the value was classified
   */
  private boolean tryUUID(String val) {
    if (UUID_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.UUID);
    } else if (UUID_URN_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.UUID_URN);
    } else if (UUID_PREFIX_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.UUID_PREFIX);
    } else {
      return false;
    }
    return true;
  }

  /**
   * Recognises DOIs in URN, HTTP resolver, bare name and {@code doi:} prefixed form.
   *
   * @return true if the value was classified
   */
  private boolean tryDOI(String val) {
    if (DOI_URN_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.DOI_URN);
    } else if (DOI_HTTP_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.DOI_HTTP);
    } else if (DOI_NAME_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.DOI_NAME);
    } else if (DOI_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.DOI);
    } else {
      return false;
    }
    return true;
  }

  /**
   * Recognises integers of any length and plain decimal literals. The integer test runs
   * first so that a pure digit string is never reported as {@code DOUBLE}.
   *
   * @return true if the value was classified
   */
  private boolean tryNumber(String val) {
    if (INTEGER_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.INTEGER);
    } else if (DECIMAL_PATTERN.matcher(val).matches()) {
      set(IdentifierTypeEnum.DOUBLE);
    } else {
      return false;
    }
    return true;
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment