Skip to content

Instantly share code, notes, and snippets.

@eiriks
Created March 3, 2015 15:55
Show Gist options
  • Save eiriks/a27b7fb848718503bb3e to your computer and use it in GitHub Desktop.
Basic python script to load NewsML documents as training set for Apache Stanbol
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Basic python script to load NewsML documents as training set
Need Python 2.7 and lxml.
TODO: port to Python 3 as well if not working by default.
"""
from __future__ import print_function
import os
from time import time
from lxml import html
from lxml import etree
from urllib import quote
import urllib2
from hashlib import sha1
# Base URI for IPTC subject codes; a FormalName attribute value is appended
# to this prefix to form the full subject-concept URI (see find_text_and_subjects).
IPTC_SUBJECT_PREFIX = "http://cv.iptc.org/newscodes/subjectcode/"
def find_text_and_subjects(newsml_content,
                           subject_tags=('SubjectMatter', 'SubjectDetail'),
                           text_tags=('HeadLine',),
                           html_tags=('body.content',)):
    """Extract plain text and IPTC subject URIs from a NewsML document.

    Parameters
    ----------
    newsml_content : str / bytes
        The raw NewsML markup of one document.
    subject_tags : tuple of str
        Element names whose 'FormalName' attribute holds an IPTC
        subject code.
    text_tags : tuple of str
        Element names whose direct text is collected verbatim.
    html_tags : tuple of str
        Element names whose mixed (HTML-like) content is flattened
        via lxml's text_content().

    Returns
    -------
    (text, subjects) : tuple
        ``text`` is the collected fragments joined by blank lines;
        ``subjects`` is a list of full IPTC subject-code URIs.
    """
    # First parse of the document as XML for the structured attributes.
    xtree = etree.ElementTree(etree.fromstring(newsml_content))
    # Skip elements with no text at all: e.text is None for empty tags,
    # and None.strip() would raise AttributeError.
    text_items = [e.text.strip()
                  for tag in text_tags
                  for e in xtree.findall('//' + tag)
                  if e.text is not None]
    # Likewise skip subject elements that lack a FormalName attribute,
    # which would otherwise make the string concatenation raise TypeError.
    subjects = [IPTC_SUBJECT_PREFIX + e.get('FormalName')
                for tag in subject_tags
                for e in xtree.findall('//' + tag)
                if e.get('FormalName') is not None]
    # Then re-parse with the HTML parser for the parts that look like HTML,
    # so we can leverage the text_content() method to flatten mixed content.
    htree = etree.ElementTree(html.document_fromstring(newsml_content))
    text_items += [e.text_content().strip()
                   for tag in html_tags
                   for e in htree.findall('//' + tag)]
    text = "\n\n".join(text_items)
    return text, subjects
def register_newsml_document(text, codes, url):
    """POST one training example to the topic-classification endpoint.

    The document body is sent as UTF-8 plain text.  The example id is
    the SHA1 digest of the text, so the same content always maps to the
    same id, and each IPTC code is passed as a repeated 'concept' query
    parameter.

    Parameters
    ----------
    text : unicode/str
        The extracted document text (request body).
    codes : list of str
        Full IPTC subject-code URIs for this document.
    url : str
        Base URL of the training endpoint (query string is appended).
    """
    # Renamed from 'id' to avoid shadowing the builtin of the same name.
    doc_id = sha1(text.encode('utf-8')).hexdigest()
    # Assemble the query string in one pass instead of repeated +=.
    params = ["example_id=%s" % doc_id]
    params.extend("concept=%s" % quote(code) for code in codes)
    url += "?" + "&".join(params)
    request = urllib2.Request(url, data=text.encode('utf-8'))
    request.add_header('Content-Type', 'text/plain')
    opener = urllib2.build_opener()
    # Read (and discard) the response so the request fully completes;
    # HTTP failures surface as urllib2.HTTPError / URLError.
    opener.open(request).read()
def print_newsml_summary(text, codes, server_url=None):
    """Print a short human-readable summary of one news document.

    Shows only the first text fragment (everything before the first
    blank-line separator), then one 'code: ...' line per subject code,
    then an empty line.  ``server_url`` is accepted but unused so this
    function has the same signature as register_newsml_document and can
    be swapped in as a debugging handler.
    """
    headline = text.partition('\n\n')[0]
    lines = [headline]
    lines.extend('code: ' + c for c in codes)
    lines.append('')
    print('\n'.join(lines))
if __name__ == "__main__":
    import sys
    # TODO: use argparse and a debug switch to select print_newsml_summary
    # instead of the default handler.
    #
    # Usage: script.py <topfolder> <max_documents> <server_url>
    topfolder = sys.argv[1]
    # Renamed from 'max' to avoid shadowing the builtin.
    max_count = int(sys.argv[2])
    server_url = sys.argv[3]
    handle_news = register_newsml_document
    count = 0
    previous = time()
    for dirpath, dirnames, filenames in os.walk(topfolder):
        if count >= max_count:
            break
        if '.svn' in dirnames:
            # Prune Subversion metadata folders from the walk in place.
            dirnames.remove('.svn')
        for filename in filenames:
            if count >= max_count:
                break
            if not filename.endswith('.xml'):
                continue
            # os.walk already yields dirpath rooted at topfolder, so
            # joining topfolder again would double the prefix for
            # relative paths; join dirpath + filename only.
            full_path = os.path.join(dirpath, filename)
            # Close the file deterministically instead of leaking the handle.
            with open(full_path, 'rb') as f:
                newsml_content = f.read()
            text, codes = find_text_and_subjects(newsml_content)
            if not codes:
                # Ignore documents without subject info.
                continue
            handle_news(text, codes, server_url)
            count += 1
            if count % 100 == 0:
                # Report throughput every 100 processed documents.
                delta, previous = time() - previous, time()
                print("Processed news %03d/%03d in %06.3fs"
                      % (count, max_count, delta))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment