Created
March 3, 2015 15:55
-
-
Save eiriks/a27b7fb848718503bb3e to your computer and use it in GitHub Desktop.
Basic python script to load NewsML documents as training set for Apache Stanbol
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Licensed to the Apache Software Foundation (ASF) under one or more | |
# contributor license agreements. See the NOTICE file distributed with | |
# this work for additional information regarding copyright ownership. | |
# The ASF licenses this file to You under the Apache License, Version 2.0 | |
# (the "License"); you may not use this file except in compliance with | |
# the License. You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""Basic python script to load NewsML documents as training set | |
Requires Python 2.7 and lxml.
TODO: port to Python 3 if it does not already work there.
""" | |
from __future__ import print_function | |
import os | |
from time import time | |
from lxml import html | |
from lxml import etree | |
from urllib import quote | |
import urllib2 | |
from hashlib import sha1 | |
IPTC_SUBJECT_PREFIX = "http://cv.iptc.org/newscodes/subjectcode/" | |
def find_text_and_subjects(newsml_content,
                           subject_tags=('SubjectMatter', 'SubjectDetail'),
                           text_tags=('HeadLine',),
                           html_tags=('body.content',)):
    """Extract the text and the IPTC subject URIs of a NewsML document.

    The document is parsed twice: once as XML to read the structured
    elements, then with the HTML parser so that ``text_content()`` can be
    used to flatten the markup found under ``html_tags``.

    :param newsml_content: the serialized NewsML document.
    :param subject_tags: names of the elements whose ``FormalName``
        attribute holds a subject code.
    :param text_tags: names of the XML elements whose direct text is kept.
    :param html_tags: names of the elements whose full text content is kept.
    :returns: a ``(text, subjects)`` tuple; ``text`` is the collected text
        items joined by blank lines and ``subjects`` is the list of
        ``FormalName`` values prefixed with ``IPTC_SUBJECT_PREFIX``.

    Parse errors raised by lxml are not caught here and propagate to the
    caller.
    """
    # First parse of the document as XML for the structured attributes.
    xml_root = etree.fromstring(newsml_content)

    # './/tag' on the root matches every descendant, which is what the
    # deprecated ElementTree '//tag' form used to do.
    # An empty element has ``text`` set to None: treat it as an empty string
    # instead of failing on ``None.strip()``.
    text_items = [(element.text or '').strip()
                  for tag in text_tags
                  for element in xml_root.findall('.//' + tag)]

    subjects = []
    for tag in subject_tags:
        for element in xml_root.findall('.//' + tag):
            formal_name = element.get('FormalName')
            # Skip subject elements without a usable FormalName rather than
            # failing on ``str + None`` or emitting the bare prefix.
            if formal_name:
                subjects.append(IPTC_SUBJECT_PREFIX + formal_name)

    # Then use the HTML parser to find the part that looks like HTML, hence
    # can leverage the text_content method.
    html_root = html.document_fromstring(newsml_content)
    text_items += [element.text_content().strip()
                   for tag in html_tags
                   for element in html_root.findall('.//' + tag)]

    text = "\n\n".join(text_items)
    return text, subjects
def register_newsml_document(text, codes, url):
    """POST ``text`` to ``url`` as an example labelled with ``codes``.

    The example id is the SHA-1 hex digest of the UTF-8 encoded text and
    every code is sent as a URL-quoted ``concept`` query parameter. The
    request body is the UTF-8 encoded text with a ``text/plain`` content
    type.

    :param text: text of the document (must support ``.encode('utf-8')``).
    :param codes: iterable of subject codes attached to the example.
    :param url: URL of the endpoint receiving the example.

    Errors raised by urllib2 while opening the URL propagate to the caller.
    """
    # Encode once: the same bytes are hashed and sent as the body.
    payload = text.encode('utf-8')
    example_id = sha1(payload).hexdigest()

    query = ["example_id=%s" % example_id]
    query.extend("concept=%s" % quote(code) for code in codes)
    # Extend an existing query string instead of adding a second '?'.
    separator = '&' if '?' in url else '?'
    target = url + separator + "&".join(query)

    request = urllib2.Request(target, data=payload)
    request.add_header('Content-Type', 'text/plain')
    response = urllib2.build_opener().open(request)
    try:
        # The body is read and discarded; only completion matters here.
        response.read()
    finally:
        # Release the connection explicitly rather than relying on GC.
        response.close()
def print_newsml_summary(text, codes, server_url=None):
    """Print the first paragraph of ``text`` followed by its codes.

    Shares the ``(text, codes, server_url)`` signature of
    ``register_newsml_document`` so it can be used as a drop-in handler;
    ``server_url`` is accepted but ignored.

    :param text: document text whose paragraphs are separated by blank lines.
    :param codes: iterable of subject code strings, printed one per line.
    :param server_url: unused.
    """
    # Everything before the first blank line is the leading paragraph.
    first_paragraph, _, _ = text.partition('\n\n')
    print(first_paragraph)
    for subject_code in codes:
        print('code: ' + subject_code)
    # Trailing empty line separates consecutive summaries.
    print()
if __name__ == "__main__":
    # Command line: <topfolder> <max_documents> <server_url>
    # Walks ``topfolder`` for ``.xml`` files, extracts text and subject codes
    # from each and hands them to ``handle_news`` until ``max_documents``
    # documents with at least one subject code have been processed.
    import sys

    # TODO: use argparse and a debug switch to select print_newsml_summary
    # instead of the default handler
    if len(sys.argv) < 4:
        sys.exit("usage: %s <topfolder> <max_documents> <server_url>"
                 % sys.argv[0])
    topfolder = sys.argv[1]
    max_documents = int(sys.argv[2])
    server_url = sys.argv[3]

    handle_news = register_newsml_document
    count = 0
    previous = time()
    for dirpath, dirnames, filenames in os.walk(topfolder):
        if count >= max_documents:
            break
        # Pruning dirnames in place stops os.walk from descending into .svn.
        if '.svn' in dirnames:
            dirnames.remove('.svn')
        for filename in filenames:
            if count >= max_documents:
                break
            if not filename.endswith('.xml'):
                continue
            # dirpath already starts with topfolder; joining topfolder again
            # would duplicate it whenever a relative path is given.
            full_path = os.path.join(dirpath, filename)
            with open(full_path, 'rb') as newsml_file:
                newsml_content = newsml_file.read()
            text, codes = find_text_and_subjects(newsml_content)
            if not codes:
                # ignore document without subject info
                continue
            handle_news(text, codes, server_url)
            count += 1
            if count % 100 == 0:
                # Sample the clock once so no time is lost between the
                # measurement and the reset of the reference point.
                now = time()
                delta, previous = now - previous, now
                print("Processed news %03d/%03d in %06.3fs"
                      % (count, max_documents, delta))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment