Skip to content

Instantly share code, notes, and snippets.

@asdacap
Created April 9, 2018 00:02
Show Gist options
  • Save asdacap/49e14b2ff1954c2c7c2c7100a1535a58 to your computer and use it in GitHub Desktop.
Save asdacap/49e14b2ff1954c2c7c2c7100a1535a58 to your computer and use it in GitHub Desktop.
import com.github.jsonldjava.core.RDFDataset
import com.github.jsonldjava.core.RDFDatasetUtils
import org.apache.commons.codec.binary.Hex
import org.apache.commons.codec.digest.DigestUtils
import org.apache.commons.collections4.iterators.PermutationIterator
/**
 * Normalizes a jsonld-java [RDFDataset] following the URDNA2015 RDF dataset normalization algorithm.
 *
 * Usage: construct with the dataset to normalize and call [normalize] once. The instance keeps its
 * working state (blank-node index, hash caches, issued identifiers) in public mutable properties.
 * Nothing in this class synchronizes access to that state.
 *
 * The original implementation was reported by its author to pass the json-gold tests.
 */
class URDNA2015(inputDataset: RDFDataset) {
    /** Every quad of every graph of the input dataset, flattened into a single list. */
    val inputQuads = inputDataset.graphNames().flatMap { inputDataset.getQuads(it) }

    /** Blank node identifier -> the quads that mention that blank node. */
    val blankNodeToQuad: MutableMap<String, MutableList<RDFDataset.Quad>> = mutableMapOf()

    /** First-degree hash -> blank node identifiers that produced that hash. */
    val hashToNodes: MutableMap<String, MutableList<String>> = mutableMapOf()

    /** Issuer of the final (`_:c14n…`) identifiers. The spelling is kept because the property is public. */
    val cannonicalIdentifierIssuer = BlankNodeIdentifierIssuer()

    /** Memoized results of [hashFirstDegree], keyed by blank node identifier. */
    val hashFirstDegreeCache: MutableMap<String, String> = mutableMapOf()

    /**
     * Runs the normalization and returns a new dataset in which every blank node label has been
     * replaced by the identifier issued by [cannonicalIdentifierIssuer].
     *
     * @return a freshly built [RDFDataset]; the input quads are not modified.
     */
    fun normalize(): RDFDataset {
        indexBlankNodes()
        issueIdentifiersForUniqueHashes()
        issueIdentifiersForSharedHashes()
        return buildCanonicalDataset()
    }

    /** Fills [blankNodeToQuad] with the quads that reference each blank node. */
    private fun indexBlankNodes() {
        for (quad in inputQuads) {
            // A quad is registered once per distinct blank node it mentions, so a quad such as
            // `_:a <p> _:a` does not show up twice in the list for `_:a` (which would duplicate
            // its line in the first-degree hash input).
            val blankNodeIds = quad.values
                .filterIsInstance<RDFDataset.BlankNode>()
                .map { it.value }
                .toSet()
            for (id in blankNodeIds) {
                blankNodeToQuad.getOrPut(id) { mutableListOf() }.add(quad)
            }
        }
    }

    /**
     * Steps 3-5: repeatedly groups the not-yet-labelled blank nodes by first-degree hash and issues
     * a canonical identifier, in hash order, to every node whose hash is unique.
     *
     * On return [hashToNodes] holds only the hashes shared by several nodes.
     */
    private fun issueIdentifiersForUniqueHashes() {
        // 3.
        val pendingNodes = blankNodeToQuad.keys.toMutableList()
        // 4.
        var issuedAny = true
        // 5.
        while (issuedAny) {
            issuedAny = false
            hashToNodes.clear()
            for (id in pendingNodes) {
                hashToNodes.getOrPut(hashFirstDegree(id)) { mutableListOf() }.add(id)
            }
            // sorted() returns a copy, so removing entries from the map while looping is safe.
            for (hash in hashToNodes.keys.sorted()) {
                val nodes = hashToNodes[hash] ?: continue
                if (nodes.size == 1) {
                    val id = nodes[0]
                    cannonicalIdentifierIssuer.getId(id)
                    pendingNodes.remove(id)
                    hashToNodes.remove(hash)
                    issuedAny = true
                }
            }
        }
    }

    /**
     * Step 6: for every remaining hash (in sorted order) runs [hashNDegreeQuad] for each node that
     * still lacks a canonical identifier, then issues canonical identifiers following the
     * resulting hashes in ascending order.
     */
    private fun issueIdentifiersForSharedHashes() {
        // 6.
        for (hash in hashToNodes.keys.sorted()) {
            val hashPathList = mutableListOf<Pair<String, BlankNodeIdentifierIssuer>>()
            for (id in hashToNodes[hash].orEmpty()) {
                if (!cannonicalIdentifierIssuer.hasId(id)) {
                    val temporaryIssuer = BlankNodeIdentifierIssuer("_:b")
                    temporaryIssuer.getId(id)
                    hashPathList.add(hashNDegreeQuad(id, temporaryIssuer))
                }
            }
            for ((_, issuer) in hashPathList.sortedBy { it.first }) {
                // Canonical identifiers are handed out in the order the temporary issuer saw the nodes.
                for (existingId in issuer.issuedIdentifier) {
                    cannonicalIdentifierIssuer.getId(existingId)
                }
            }
        }
    }

    /** Copies [inputQuads] into a new dataset, swapping blank node labels for canonical identifiers. */
    private fun buildCanonicalDataset(): RDFDataset {
        val canonicalDataset = RDFDataset()
        for (quad in inputQuads) {
            val subject = canonicalValue(quad.subject)
            val predicate: String = quad.predicate.value
            // A null graph is passed through unchanged, exactly as the quad reported it.
            val graph: String? = quad.graph?.let { canonicalValue(it) }
            val objectNode = quad.`object`
            if (objectNode is RDFDataset.Literal) {
                canonicalDataset.addQuad(
                    subject, predicate, objectNode.value, objectNode.datatype, objectNode.language, graph
                )
            } else {
                canonicalDataset.addQuad(subject, predicate, canonicalValue(objectNode), graph)
            }
        }
        return canonicalDataset
    }

    /** Returns the canonical identifier for a blank node, or the node's own value for anything else. */
    private fun canonicalValue(node: RDFDataset.Node): String =
        if (node is RDFDataset.BlankNode) cannonicalIdentifierIssuer.getId(node.value) else node.value

    /**
     * Computes the N-degree hash of [identifier].
     *
     * Related blank nodes (every other blank node sharing a quad with [identifier]) are grouped by
     * [hashRelateBlankNode]; for each group, in hash order, the lexicographically smallest path over
     * all permutations of the group is appended to the data being hashed.
     *
     * @param identifier blank node identifier; must be a key of [blankNodeToQuad].
     * @param nodeIdentifierIssuer issuer of temporary identifiers; it is copied before any new
     *   identifier is issued, so the instance passed in is not mutated by this method.
     * @return the hex SHA-256 hash and the issuer that produced the chosen paths (the very same
     *   [nodeIdentifierIssuer] instance when [identifier] has no related blank nodes).
     * @throws IllegalStateException if [identifier] is not present in [blankNodeToQuad].
     * @throws Exception if a blank node is found in a quad position other than subject, object or graph.
     */
    fun hashNDegreeQuad(identifier: String, nodeIdentifierIssuer: BlankNodeIdentifierIssuer): Pair<String, BlankNodeIdentifierIssuer> {
        val quads = checkNotNull(blankNodeToQuad[identifier]) { "No quads recorded for blank node $identifier" }
        val hashToRelatedBNodes = mutableMapOf<String, MutableList<String>>()
        for (quad in quads) {
            for ((key, node) in quad) {
                if (node !is RDFDataset.BlankNode || node.value == identifier) continue
                val position = when (key) {
                    "subject" -> "s"
                    "object" -> "o"
                    // NOTE(review): jsonld-java appears to store the graph name under "name" (the key
                    // Quad.getGraph() reads) — confirm against the library version in use. "graph" is
                    // still accepted so behaviour is unchanged for quads that use that key.
                    "graph", "name" -> "g"
                    else -> throw Exception("Unknown node position")
                }
                val relatedHash = hashRelateBlankNode(node.value, quad, nodeIdentifierIssuer, position)
                hashToRelatedBNodes.getOrPut(relatedHash) { mutableListOf() }.add(node.value)
            }
        }

        val dataToHash = StringBuilder()
        var currentIssuer = nodeIdentifierIssuer
        for ((relatedHash, relatedNodes) in hashToRelatedBNodes.entries.sortedBy { it.key }) {
            dataToHash.append(relatedHash)
            var chosenPath = ""
            var chosenIssuer = BlankNodeIdentifierIssuer()
            // Permutations are consumed lazily; materialising all n! of them up front is unnecessary.
            for (permutation in PermutationIterator(relatedNodes)) {
                val (path, issuerCopy) = buildPermutationPath(permutation, currentIssuer, chosenPath) ?: continue
                if (chosenPath.isEmpty() || path < chosenPath) {
                    chosenPath = path
                    chosenIssuer = issuerCopy
                }
            }
            dataToHash.append(chosenPath)
            currentIssuer = chosenIssuer
        }
        return Pair(sha256Hex(dataToHash.toString()), currentIssuer)
    }

    /**
     * Builds the path for one permutation of related blank nodes, working on a copy of [baseIssuer].
     *
     * @return the path and the issuer copy that produced it, or `null` as soon as the partial path
     *   can no longer beat [chosenPath] (see [isPruned]).
     */
    private fun buildPermutationPath(
        permutation: List<String>,
        baseIssuer: BlankNodeIdentifierIssuer,
        chosenPath: String
    ): Pair<String, BlankNodeIdentifierIssuer>? {
        var issuerCopy = baseIssuer.copy()
        var path = ""
        // Nodes that received a temporary identifier for the first time in this permutation.
        val recursionList = mutableListOf<String>()
        for (related in permutation) {
            if (cannonicalIdentifierIssuer.hasId(related)) {
                path += cannonicalIdentifierIssuer.getId(related)
            } else {
                if (!issuerCopy.hasId(related)) {
                    recursionList.add(related)
                }
                path += issuerCopy.getId(related)
            }
            if (isPruned(path, chosenPath)) return null
        }
        for (related in recursionList) {
            val (relatedHash, resultIssuer) = hashNDegreeQuad(related, issuerCopy)
            // The identifier is read from the issuer used for the recursion, before it is replaced.
            path += issuerCopy.getId(related)
            path += "<$relatedHash>"
            issuerCopy = resultIssuer
            if (isPruned(path, chosenPath)) return null
        }
        return Pair(path, issuerCopy)
    }

    /** True when a path has already been chosen and [path] is at least as long and sorts after it. */
    private fun isPruned(path: String, chosenPath: String): Boolean =
        chosenPath.isNotEmpty() && path.length >= chosenPath.length && path > chosenPath

    /**
     * Hashes the relation between the blank node being processed and [related].
     *
     * The identifier used for [related] is, in order of preference: its canonical identifier, the
     * identifier issued by [identityIssuer], or its first-degree hash.
     *
     * @param related identifier of the related blank node.
     * @param quad the quad in which both blank nodes occur; only its predicate is read.
     * @param identityIssuer issuer of temporary identifiers; only queried, never asked for a new identifier.
     * @param position `"s"`, `"o"` or `"g"`; for `"g"` the predicate is left out of the hash input.
     * @return the hex-encoded SHA-256 hash.
     */
    fun hashRelateBlankNode(related: String, quad: RDFDataset.Quad, identityIssuer: BlankNodeIdentifierIssuer, position: String): String {
        val identifier: String = when {
            cannonicalIdentifierIssuer.hasId(related) -> cannonicalIdentifierIssuer.getId(related)
            identityIssuer.hasId(related) -> identityIssuer.getId(related)
            else -> hashFirstDegree(related)
        }
        val input = if (position != "g") {
            position + "<" + quad.predicate.value + ">" + identifier
        } else {
            position + identifier
        }
        return sha256Hex(input)
    }

    /**
     * Computes (and caches in [hashFirstDegreeCache]) the first-degree hash of [identifier].
     *
     * Each quad mentioning the node is serialized with [toNQuadSingle] after relabelling the node
     * itself as `_:a` and every other blank node as `_:z`; the serializations are sorted, joined
     * and hashed with SHA-256.
     *
     * @throws IllegalStateException if [identifier] is not present in [blankNodeToQuad].
     */
    fun hashFirstDegree(identifier: String): String =
        hashFirstDegreeCache.getOrPut(identifier) {
            val quads = checkNotNull(blankNodeToQuad[identifier]) { "No quads recorded for blank node $identifier" }
            val serializedQuads = quads.map { quad ->
                // Work on a shallow copy so the input quad keeps its original labels.
                val relabelled = quad.clone() as RDFDataset.Quad
                for (key in relabelled.keys.toList()) {
                    val node = relabelled[key]
                    if (node is RDFDataset.BlankNode) {
                        relabelled[key] = RDFDataset.BlankNode(if (node.value == identifier) "_:a" else "_:z")
                    }
                }
                toNQuadSingle(relabelled)
            }.sorted()
            sha256Hex(serializedQuads.joinToString(""))
        }

    /**
     * Serializes a single quad as N-Quads text.
     *
     * The quad is wrapped in a throw-away [RDFDataset] because, per the original author's note, the
     * single-quad serializer of [RDFDatasetUtils] is not public.
     */
    fun toNQuadSingle(quad: RDFDataset.Quad): String {
        val singleQuadDataset = RDFDataset()
        val subject = quad.subject.value
        val predicate: String = quad.predicate.value
        val graph = quad.graph?.value
        val objectNode = quad.`object`
        if (objectNode is RDFDataset.Literal) {
            singleQuadDataset.addQuad(
                subject, predicate, objectNode.value, objectNode.datatype, objectNode.language, graph
            )
        } else {
            singleQuadDataset.addQuad(subject, predicate, objectNode.value, graph)
        }
        return RDFDatasetUtils.toNQuads(singleQuadDataset)
    }

    /** Hex-encoded SHA-256 digest of [text]. */
    private fun sha256Hex(text: String): String = Hex.encodeHexString(DigestUtils.sha256(text))

    /**
     * Hands out sequential identifiers (`prefix` + counter) for existing blank node identifiers and
     * remembers both the mapping and the order in which identifiers were first requested.
     *
     * @property prefix text placed in front of the counter value.
     * @property counter number used for the next identifier to be issued.
     * @property issuedIdentifier existing identifiers, in the order they were first issued.
     * @property issuedIdentifierMap existing identifier -> issued identifier.
     */
    class BlankNodeIdentifierIssuer(
        val prefix: String = "_:c14n",
        var counter: Int = 0,
        val issuedIdentifier: MutableList<String> = mutableListOf(),
        val issuedIdentifierMap: MutableMap<String, String> = mutableMapOf()
    ) {
        /** Returns the identifier already issued for [existingId], issuing the next one if there is none. */
        fun getId(existingId: String): String =
            issuedIdentifierMap.getOrPut(existingId) {
                val newIdentifier = prefix + counter
                counter += 1
                issuedIdentifier.add(existingId)
                newIdentifier
            }

        /** Whether an identifier has already been issued for [existingId]. */
        fun hasId(existingId: String) = issuedIdentifierMap.containsKey(existingId)

        /** Returns an independent issuer with the same prefix, counter and issued identifiers. */
        fun copy() = BlankNodeIdentifierIssuer(
            prefix,
            counter,
            issuedIdentifier.toMutableList(),
            issuedIdentifierMap.toMutableMap()
        )
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment