Created
April 9, 2018 00:02
-
-
Save asdacap/49e14b2ff1954c2c7c2c7100a1535a58 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.github.jsonldjava.core.RDFDataset | |
import com.github.jsonldjava.core.RDFDatasetUtils | |
import org.apache.commons.codec.binary.Hex | |
import org.apache.commons.codec.digest.DigestUtils | |
import org.apache.commons.collections4.iterators.PermutationIterator | |
/**
 * Normalizes a jsonld-java [RDFDataset] following the URDNA2015 specification.
 *
 * The original implementation was reported by its author to pass the json-gold tests.
 *
 * An instance keeps its working state (blank node index, hash buckets, issued
 * identifiers, hash cache) in the public mutable properties below; all of them are
 * populated by [normalize].
 *
 * @param inputDataset the dataset whose quads are to be normalized; it is only read.
 */
class URDNA2015(inputDataset: RDFDataset) {
    /** Every quad of every graph of the input dataset, flattened into a single list. */
    val inputQuads = inputDataset.graphNames().flatMap { inputDataset.getQuads(it) }

    /** Blank node identifier -> the quads mentioning it (one entry per mentioning component). */
    val blankNodeToQuad: MutableMap<String, MutableList<RDFDataset.Quad>> = mutableMapOf()

    /** First-degree hash -> blank node identifiers that currently share that hash. */
    val hashToNodes: MutableMap<String, MutableList<String>> = mutableMapOf()

    /** Issuer of the final identifiers (default prefix `_:c14n`). */
    val cannonicalIdentifierIssuer = BlankNodeIdentifierIssuer()

    /** Memoized results of [hashFirstDegree], keyed by blank node identifier. */
    val hashFirstDegreeCache: MutableMap<String, String> = mutableMapOf()

    /**
     * Runs the normalization and returns a new dataset in which every blank node
     * label has been replaced by the identifier issued by [cannonicalIdentifierIssuer].
     *
     * @return a freshly built [RDFDataset]; the input dataset is not modified.
     */
    fun normalize(): RDFDataset {
        // Index each quad under every blank node identifier that appears in it.
        for (quad in inputQuads) {
            for (node in quad.values) {
                if (node is RDFDataset.BlankNode) {
                    blankNodeToQuad.getOrPut(node.value) { mutableListOf() }.add(quad)
                }
            }
        }

        // 3. Identifiers that have not received a canonical identifier yet.
        val nonNormalized = blankNodeToQuad.keys.toMutableList()

        // 4. / 5. Keep naming blank nodes whose first-degree hash is unique until a
        // full pass issues nothing new.
        var simple = true
        while (simple) {
            simple = false
            hashToNodes.clear()
            for (id in nonNormalized) {
                hashToNodes.getOrPut(hashFirstDegree(id)) { mutableListOf() }.add(id)
            }
            // sorted() returns a copy, so removing buckets while iterating is safe.
            for (hash in hashToNodes.keys.sorted()) {
                val nodes = hashToNodes.getValue(hash)
                if (nodes.size != 1) continue
                val id = nodes[0]
                cannonicalIdentifierIssuer.getId(id)
                nonNormalized.remove(id)
                hashToNodes.remove(hash)
                simple = true
            }
        }

        // 6. The remaining buckets hold nodes with colliding first-degree hashes;
        // disambiguate them with the N-degree hash, in hash order.
        for (hash in hashToNodes.keys.sorted()) {
            val hashPathList = mutableListOf<Pair<String, BlankNodeIdentifierIssuer>>()
            for (id in hashToNodes.getValue(hash)) {
                if (cannonicalIdentifierIssuer.hasId(id)) continue
                val temporaryIssuer = BlankNodeIdentifierIssuer("_:b")
                temporaryIssuer.getId(id)
                hashPathList.add(hashNDegreeQuad(id, temporaryIssuer))
            }
            for ((_, issuer) in hashPathList.sortedBy { it.first }) {
                // Canonical identifiers are issued in the order the temporary issuer used.
                issuer.issuedIdentifier.forEach { cannonicalIdentifierIssuer.getId(it) }
            }
        }

        // Output: copy every input quad, relabelling its blank nodes.
        val normalized = RDFDataset()
        for (quad in inputQuads) {
            val subject = canonicalValue(quad.subject)
            val predicate: String = quad.predicate.value
            val graph: String? = quad.graph?.let { canonicalValue(it) }
            val obj = quad.`object`
            if (obj is RDFDataset.Literal) {
                normalized.addQuad(subject, predicate, obj.value, obj.datatype, obj.language, graph)
            } else {
                normalized.addQuad(subject, predicate, canonicalValue(obj), graph)
            }
        }
        return normalized
    }

    /**
     * Computes the N-degree hash of [identifier] by hashing, for every group of related
     * blank nodes, the lexicographically smallest path found among the permutations of
     * that group.
     *
     * @param identifier blank node identifier; must be a key of [blankNodeToQuad].
     * @param nodeIdentifierIssuer issuer to start from; it is copied, never mutated here.
     * @return the hex SHA-256 hash paired with the issuer that produced the chosen paths.
     * @throws NullPointerException if no quads are recorded for [identifier].
     * @throws Exception if a blank node sits in a quad component other than subject,
     * object or graph name.
     */
    fun hashNDegreeQuad(
        identifier: String,
        nodeIdentifierIssuer: BlankNodeIdentifierIssuer
    ): Pair<String, BlankNodeIdentifierIssuer> {
        val hashToRelatedBNodesMap: MutableMap<String, MutableList<String>> = mutableMapOf()
        for (quad in quadsOf(identifier)) {
            for ((component, node) in quad) {
                if (node is RDFDataset.BlankNode && node.value != identifier) {
                    val position = when (component) {
                        "subject" -> "s"
                        "object" -> "o"
                        // NOTE(review): jsonld-java's Quad appears to store the graph name
                        // under the "name" key (see RDFDataset.Quad.getGraph) - confirm for
                        // the library version in use. "graph" is still accepted.
                        "graph", "name" -> "g"
                        else -> throw Exception("Unknown node position")
                    }
                    val relatedHash = hashRelateBlankNode(node.value, quad, nodeIdentifierIssuer, position)
                    hashToRelatedBNodesMap.getOrPut(relatedHash) { mutableListOf() }.add(node.value)
                }
            }
        }

        val dataToHash = StringBuilder()
        var issuer = nodeIdentifierIssuer
        for ((relatedHash, relatedNodes) in hashToRelatedBNodesMap.entries.sortedBy { it.key }) {
            dataToHash.append(relatedHash)
            var chosenPath = ""
            var chosenIssuer = BlankNodeIdentifierIssuer()
            // Permutations are consumed lazily; there can be n! of them.
            permutations@ for (permutation in PermutationIterator(relatedNodes)) {
                var issuerCopy = issuer.copy()
                var path = ""
                val recursionList = mutableListOf<String>()
                for (related in permutation) {
                    if (cannonicalIdentifierIssuer.hasId(related)) {
                        path += cannonicalIdentifierIssuer.getId(related)
                    } else {
                        // Nodes first seen in this permutation are hashed recursively below.
                        if (!issuerCopy.hasId(related)) {
                            recursionList.add(related)
                        }
                        path += issuerCopy.getId(related)
                    }
                    if (cannotBeatChosen(path, chosenPath)) continue@permutations
                }
                for (related in recursionList) {
                    val (recursiveHash, recursiveIssuer) = hashNDegreeQuad(related, issuerCopy)
                    path += issuerCopy.getId(related)
                    path += "<$recursiveHash>"
                    issuerCopy = recursiveIssuer
                    if (cannotBeatChosen(path, chosenPath)) continue@permutations
                }
                if (chosenPath.isEmpty() || path < chosenPath) {
                    chosenPath = path
                    chosenIssuer = issuerCopy
                }
            }
            dataToHash.append(chosenPath)
            issuer = chosenIssuer
        }
        return Pair(sha256Hex(dataToHash.toString()), issuer)
    }

    /**
     * Hashes the relation between the node currently being processed and [related].
     *
     * The identifier used for [related] is, in order of preference: its canonical
     * identifier, the identifier issued by [identityIssuer], or its first-degree hash.
     *
     * @param related identifier of the related blank node.
     * @param quad quad in which the relation occurs; only its predicate is read.
     * @param identityIssuer temporary issuer consulted when no canonical identifier exists.
     * @param position `"s"`, `"o"` or `"g"`; the predicate is left out of the hash for `"g"`.
     * @return the hex SHA-256 hash of the assembled input.
     */
    fun hashRelateBlankNode(
        related: String,
        quad: RDFDataset.Quad,
        identityIssuer: BlankNodeIdentifierIssuer,
        position: String
    ): String {
        val identifier: String = when {
            cannonicalIdentifierIssuer.hasId(related) -> cannonicalIdentifierIssuer.getId(related)
            identityIssuer.hasId(related) -> identityIssuer.getId(related)
            else -> hashFirstDegree(related)
        }
        val input = if (position != "g") {
            position + "<" + quad.predicate.value + ">" + identifier
        } else {
            position + identifier
        }
        return sha256Hex(input)
    }

    /**
     * Computes (and caches in [hashFirstDegreeCache]) the first-degree hash of
     * [identifier]: the hex SHA-256 of its sorted, concatenated N-Quads, where the node
     * itself is relabelled `_:a` and every other blank node `_:z`.
     *
     * @throws NullPointerException if no quads are recorded for [identifier].
     */
    fun hashFirstDegree(identifier: String): String =
        hashFirstDegreeCache.getOrPut(identifier) {
            val serialized = quadsOf(identifier).map { quad ->
                // Work on a shallow copy so the indexed quad keeps its original labels.
                val masked = quad.clone() as RDFDataset.Quad
                // Iterate over a snapshot of the keys: entries are reassigned in the loop.
                for (key in masked.keys.toList()) {
                    val node = masked[key]
                    if (node is RDFDataset.BlankNode) {
                        masked[key] = RDFDataset.BlankNode(if (node.value == identifier) "_:a" else "_:z")
                    }
                }
                toNQuadSingle(masked)
            }.sorted()
            sha256Hex(serialized.joinToString(""))
        }

    /**
     * Serializes a single quad as N-Quads.
     *
     * RDFDatasetUtils does not expose its single-quad serializer, so the quad is placed
     * in a throwaway [RDFDataset] which is then serialized as a whole.
     */
    fun toNQuadSingle(quad: RDFDataset.Quad): String {
        val single = RDFDataset()
        val subject = quad.subject.value
        val predicate: String = quad.predicate.value
        val graph = quad.graph?.value
        val obj = quad.`object`
        if (obj is RDFDataset.Literal) {
            single.addQuad(subject, predicate, obj.value, obj.datatype, obj.language, graph)
        } else {
            single.addQuad(subject, predicate, obj.value, graph)
        }
        return RDFDatasetUtils.toNQuads(single)
    }

    /** Quads indexed for [identifier]; the index is filled by [normalize]. */
    private fun quadsOf(identifier: String): List<RDFDataset.Quad> =
        blankNodeToQuad[identifier]
            ?: throw NullPointerException("No quads recorded for blank node '$identifier'; run normalize() first")

    /** Canonical identifier for a blank node, or the node's own value for anything else. */
    private fun canonicalValue(node: RDFDataset.Node): String =
        if (node is RDFDataset.BlankNode) cannonicalIdentifierIssuer.getId(node.value) else node.value

    /** True when [path] is already no shorter than, and sorts after, a non-empty [chosen] path. */
    private fun cannotBeatChosen(path: String, chosen: String): Boolean =
        chosen.isNotEmpty() && path.length >= chosen.length && path > chosen

    /** Hex-encoded SHA-256 digest of [text]. */
    private fun sha256Hex(text: String): String = Hex.encodeHexString(DigestUtils.sha256(text))

    /**
     * Hands out sequential identifiers of the form `prefix + counter` and remembers
     * both the mapping and the order in which existing identifiers were first seen.
     *
     * @property prefix text prepended to the counter for every issued identifier.
     * @property counter number used for the next issued identifier.
     * @property issuedIdentifier existing identifiers, in the order they were issued.
     * @property issuedIdentifierMap existing identifier -> issued identifier.
     */
    class BlankNodeIdentifierIssuer(
        val prefix: String = "_:c14n",
        var counter: Int = 0,
        val issuedIdentifier: MutableList<String> = mutableListOf(),
        val issuedIdentifierMap: MutableMap<String, String> = mutableMapOf()
    ) {
        /** Returns the identifier issued for [existingId], issuing a new one on first use. */
        fun getId(existingId: String): String =
            issuedIdentifierMap.getOrPut(existingId) {
                val newIdentifier = prefix + counter
                counter += 1
                issuedIdentifier.add(existingId)
                newIdentifier
            }

        /** Whether an identifier has already been issued for [existingId]. */
        fun hasId(existingId: String) = issuedIdentifierMap.containsKey(existingId)

        /** Independent copy: same prefix and counter, with copied list and map. */
        fun copy() = BlankNodeIdentifierIssuer(
            prefix,
            counter,
            issuedIdentifier.toMutableList(),
            issuedIdentifierMap.toMutableMap()
        )
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment