Skip to content

Instantly share code, notes, and snippets.

@markharwood
Last active December 18, 2020 19:54
Show Gist options
  • Star 12 You must be signed in to star a gist
  • Fork 9 You must be signed in to fork a gist
  • Save markharwood/21c723039425b4b3e4277b2bffa5c54c to your computer and use it in GitHub Desktop.
Save markharwood/21c723039425b4b3e4277b2bffa5c54c to your computer and use it in GitHub Desktop.
Index DBpedia link data
# Index DBpedia inter-article link data into Elasticsearch (Python 2 script).
import bz2, os, re
from urllib2 import unquote
from elasticsearch import helpers
from elasticsearch import Elasticsearch
import sys
# HACK (Python 2 only): force UTF-8 as the default codec so implicit
# str<->unicode conversions of article titles don't raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
# Download of data http://downloads.dbpedia.org/2016-04/core-i18n/en/page_links_en.ttl.bz2
filename = "/Users/Mark/Documents/work/irdata/dbpedia/page_links_en.ttl.bz2"
indexName = "dbpedialinks"
docTypeName = "article"
# Each TTL line is "<subject URI> <predicate URI> <object URI> ." -
# group(1) captures the source article name, group(2) the linked article name.
linePattern = re.compile(r'<http://dbpedia.org/resource/([^>]*)> <[^>]*> <http://dbpedia.org/resource/([^>]*)>.*',
                         re.MULTILINE | re.DOTALL)
es = Elasticsearch()
print "Wiping any existing index..."
# ignore 400/404 so a fresh run doesn't fail when the index doesn't exist yet.
es.indices.delete(index=indexName, ignore=[400, 404])
# Single-shard, no-replica index (bulk-load friendly). Each link field is a
# keyword (exact-value aggregations) with a ".text" sub-field for full-text
# search over the same values.
indexSettings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        docTypeName: {
            "properties": {
                # The article's own title.
                "subject": {
                    "type": "keyword",
                    "fields": {
                        "text": {
                            "type": "text"
                        }
                    }
                },
                # Outbound links to "Category:" pages (prefix stripped).
                "linked_categories": {
                    "type": "keyword",
                    "fields": {
                        "text": {
                            "type": "text"
                        }
                    }
                },
                # Outbound links to ordinary articles.
                "linked_subjects": {
                    "type": "keyword",
                    "fields": {
                        "text": {
                            "type": "text"
                        }
                    }
                }
            }
        }
    }
}
es.indices.create(index=indexName, body=indexSettings)
# Bulk-indexing queue and per-article accumulation state for the main loop.
actions = []        # pending bulk actions, flushed every 5000 docs
rowNum = 0          # total link rows matched so far
lastSubject = ""    # subject of the article currently being accumulated
article = {}        # document under construction
numLinks = 0        # links recorded for the current article (after the 500 cap)
numOrigLinks = 0    # links seen for the current article in the raw dump
def addLink(article, subject):
    """Record one outbound link on `article`.

    Links to "Category:" pages are stored (prefix stripped) in the separate
    `linked_categories` field; everything else goes to `linked_subjects`.
    """
    prefix = "Category:"
    if subject.startswith(prefix):
        target = article["linked_categories"]
        value = subject[len(prefix):]
    else:
        target = article["linked_subjects"]
        value = subject
    target.append(value)
def newArticle(subject):
    """Create a fresh document for `subject`.

    The subject itself is also registered as a link (via addLink) so every
    article starts with one entry in its own link set.
    """
    doc = {
        "subject": subject,
        "linked_subjects": [],
        "linked_categories": [],
    }
    addLink(doc, subject)
    return doc
with os.popen('bzip2 -cd ' + filename) as file:
for line in file:
m = linePattern.match(unicode(line))
if m:
# Lines consist of [from_article_name] [to_article_name]
# and are sorted by from_article_name so all related items
# to from_article_name appear in contiguous lines.
subject = unquote(m.group(1)).replace('_', ' ')
linkedSubject = unquote(m.group(2)).replace('_', ' ')
if rowNum == 0:
article = newArticle(subject)
lastSubject = subject
numLinks = 0
numOrigLinks = 0
if subject != lastSubject:
if len(article["linked_subjects"]) > 1:
article["numOrigLinks"] = numOrigLinks
article["numLinks"] = numLinks
action = {
"_index": indexName,
'_op_type': 'index',
"_type": docTypeName,
"_source": article
}
actions.append(action)
# Flush bulk indexing action if necessary
if len(actions) >= 5000:
helpers.bulk(es, actions)
## TO check for failures and take appropriate action
del actions[0:len(actions)]
# Set up a new doc
article = newArticle(subject)
lastSubject = subject
numLinks = 0
numOrigLinks = 0
# Don't want too many outbound links in a single article - slows down things like
# signif terms and links become tenuous so truncate to max 500
if len(article["linked_subjects"]) < 500:
addLink(article, linkedSubject)
numLinks += 1
numOrigLinks += 1
rowNum += 1
if rowNum % 100000 == 0:
print rowNum, subject, linkedSubject
@xiaofengleo
Copy link

Hi Mark, I wonder where this script should be located in order to run it against a local Elasticsearch instance.
Also, can this script be run against a cloud instance of Elasticsearch?

Thank you.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment