Skip to content

Instantly share code, notes, and snippets.

Last active November 28, 2018 10:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save egonw/ca4c348b9a2d1116efcdb55fa85dd158 to your computer and use it in GitHub Desktop.
Save egonw/ca4c348b9a2d1116efcdb55fa85dd158 to your computer and use it in GitHub Desktop.
Bioclipse script to create "main subject" QuickStatements for articles in Wikidata
// Copyright (C) 2018 Egon Willighagen
// 2018 Denise Slenter
// License: MIT
// Usage:
// change the values of the following two variables. The first is the text string you
// wish to find in titles of articles in Wikidata, and the second is the actual Wikidata
// item of the concept.
// The output of this script is a set of QuickStatements that can be uploaded
// with the Wikidata QuickStatements tool.
// Text to look for in article titles; it is lower-cased before use, so the
// match is case-insensitive.
concept = "aflatoxin B1"
// Wikidata item that matching articles receive as their P921 value.
conceptQ = "Q4689278"
// the next is a list of false positives (all lower case)
// Titles containing any of these substrings are skipped. No entries were
// visible in this copy of the script, so the list starts out empty.
blacklist = []
// the code (don't change)
// Changelog:
// 2018-09-01 First upload
// Normalise the search text; the SPARQL filter compares it against
// lower-cased article titles.
concept = concept.toLowerCase()
// Number of article items to page through. The larger value is kept
// commented out as an alternative setting.
// totalArticleCount = 17500000
totalArticleCount = 750000
// Page size: how many articles each SPARQL round covers (used as LIMIT, and
// multiplied by the round index to get OFFSET).
batchSize = 250000
/**
 * Deletes the given file if it already exists.
 *
 * @param file path of the file to reset
 * @return the same path that was passed in
 */
def renewFile(file) {
  if (ui.fileExists(file)) {
    ui.remove(file)
  }
  return file
}
// Output file for the generated QuickStatements, named after the concept
// (spaces replaced by underscores).
// NOTE(review): renewFile() is not invoked anywhere in the visible script, so
// statements are appended to whatever this file already contains - confirm.
qsFile = "/Wikicite/output." + concept.replace(" ", "_") + ".quickstatements"

// Groovy's "/" on two integers yields a decimal, so ceil() rounds a partial
// last batch up to a full round.
rounds = (int)Math.ceil(totalArticleCount / batchSize)

1.upto(rounds) { counter ->
  print "batch ${counter}/${rounds}: "
  offset = (counter-1)*batchSize

  // The inner SELECT pages through the articles (LIMIT/OFFSET); the outer
  // pattern keeps those whose title contains the concept and that do not yet
  // have the concept as their P921 value.
  sparql = """
    SELECT ?art ?artLabel WHERE {
      {
        SELECT ?art WHERE {
          ?art wdt:P31 wd:Q13442814
        } LIMIT $batchSize OFFSET $offset
      }
      ?art wdt:P1476 ?artLabel .
      MINUS { ?art wdt:P921 wd:$conceptQ }
      FILTER (contains(lcase(str(?artLabel)), "$concept"))
    }
  """

  if (bioclipse.isOnline()) {
    try {
      // NOTE(review): the endpoint literal was empty in this copy of the
      // script; the Wikidata Query Service is assumed because the query uses
      // wd:/wdt: terms - confirm before running.
      rawResults = bioclipse.sparqlRemote(
        "https://query.wikidata.org/sparql", sparql
      )
      results = rdf.processSPARQLXML(rawResults, sparql)
      missing = results.rowCount == 0
      if (!missing) {
        println "found ${results.rowCount} article(s)!"
        printlnOutput = ""
        fileOutput = ""
        // Rows are addressed from 1 up to rowCount.
        1.upto(results.rowCount) { artCounter ->
          artTitle = results.get(artCounter, "artLabel")
          def lowerTitle = artTitle.toLowerCase()
          // Skip titles that contain any known false-positive substring.
          blacklisted = blacklist.any { badWord ->
            lowerTitle.contains(badWord.toLowerCase())
          }
          if (!blacklisted) {
            artIRI = results.get(artCounter, "art")
            // Drops the first 31 characters of the IRI; presumably the
            // "http://www.wikidata.org/entity/" prefix, leaving the Q-id.
            artQ = artIRI.substring(31)
            printlnOutput += "${artQ}\t" + artTitle + "\n"
            fileOutput += "${artQ}\tP921\t${conceptQ}\n"
          }
        }
        // NOTE(review): printlnOutput was accumulated but never emitted in
        // this copy of the script; echoing it here shows the accepted titles.
        print printlnOutput
        ui.append(qsFile, fileOutput)
      } else {
        println "no hits"
      }
    } catch (Exception exception) {
      // Best effort: report the failed batch and continue with the next one.
      println "Error while retrieving this batch: " + exception.message
    }
  } else {
    println "no online access"
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment