Last active
November 28, 2018 10:17
-
-
Save egonw/ca4c348b9a2d1116efcdb55fa85dd158 to your computer and use it in GitHub Desktop.
Bioclipse script to create "main subject" QuickStatements for articles in Wikidata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright (C) 2018 Egon Willighagen | |
// 2018 Denise Slenter | |
// License: MIT | |
// Usage: | |
// | |
// change the values of the following two variables. The first is the text string you | |
// wish to find in titles of articles in Wikidata, and the second is the actual Wikidata | |
// item of the concept. | |
// | |
// The output of this script is a set of QuickStatements that can be uploaded here: | |
// | |
// https://tools.wmflabs.org/quickstatements/ | |
concept = "aflatoxin B1" | |
conceptQ = "Q4689278" | |
// the next is a list of false positives (all lower case) | |
blacklist = [ | |
] | |
// the code (don't change) | |
// | |
// Changelog: | |
// | |
// 2018-09-01 First upload to MyExperiment.org | |
concept = concept.toLowerCase() | |
// totalArticleCount = 17500000 | |
totalArticleCount = 750000 | |
batchSize = 250000 | |
def renewFile(file) { | |
if (ui.fileExists(file)) ui.remove(file) | |
ui.newFile(file) | |
return file | |
} | |
qsFile = "/Wikicite/output." + concept.replace(" ", "_") + ".quickstatements" | |
renewFile(qsFile) | |
rounds = (int)Math.ceil(totalArticleCount / batchSize) | |
1.upto(rounds) { counter -> | |
print "batch ${counter}/${rounds}: " | |
offset = (counter-1)*batchSize | |
sparql = """ | |
SELECT ?art ?artLabel | |
WITH { | |
SELECT ?art WHERE { | |
?art wdt:P31 wd:Q13442814 | |
} LIMIT $batchSize OFFSET $offset | |
} AS %RESULTS { | |
INCLUDE %RESULTS | |
?art wdt:P1476 ?artLabel . | |
MINUS { ?art wdt:P921 wd:$conceptQ } | |
FILTER (contains(lcase(str(?artLabel)), "$concept")) | |
} | |
""" | |
if (bioclipse.isOnline()) { | |
try { | |
rawResults = bioclipse.sparqlRemote( | |
"https://query.wikidata.org/sparql", sparql | |
) | |
results = rdf.processSPARQLXML(rawResults, sparql) | |
missing = results.rowCount == 0 | |
if (!missing) { | |
println "found ${results.rowCount} article(s)!" | |
printlnOutput = "" | |
fileOutput = "" | |
1.upto(results.rowCount) { artCounter -> | |
artTitle = results.get(artCounter, "artLabel") | |
blacklisted = false | |
blacklist.each { badWord -> | |
if (artTitle.toLowerCase().contains(badWord.toLowerCase())) { | |
blacklisted = true | |
} | |
} | |
if (!blacklisted) { | |
artIRI = results.get(artCounter, "art") | |
artQ = artIRI.substring(31) | |
printlnOutput += "${artQ}\t" + artTitle + "\n" | |
fileOutput += "${artQ}\tP921\t${conceptQ}\n" | |
} | |
} | |
print(printlnOutput) | |
ui.append(qsFile, fileOutput) | |
} else { | |
println "no hits" | |
} | |
} catch (Exception exception) { | |
println "Error while retrieving this batch: " + exception.message | |
} | |
} else { | |
println "no online access" | |
} | |
} | |
ui.open(qsFile) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment