Skip to content

Instantly share code, notes, and snippets.

@eiriks
Created March 3, 2015 15:55
Show Gist options
  • Save eiriks/a27b7fb848718503bb3e to your computer and use it in GitHub Desktop.
Basic python script to load NewsML documents as training set for Apache Stanbol
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Basic python script to load NewsML documents as training set
Need Python 2.7 and lxml.
TODO: port to Python 3 as well if not working by default.
"""
from __future__ import print_function
import os
from time import time
from lxml import html
from lxml import etree
from urllib import quote
import urllib2
from hashlib import sha1
# Base URI for IPTC subject codes; a FormalName attribute value is appended
# to this prefix to form the full subject-concept URI (see find_text_and_subjects).
IPTC_SUBJECT_PREFIX = "http://cv.iptc.org/newscodes/subjectcode/"
def find_text_and_subjects(newsml_content,
                           subject_tags=('SubjectMatter', 'SubjectDetail'),
                           text_tags=('HeadLine',),
                           html_tags=('body.content',)):
    """Extract plain text and IPTC subject URIs from a NewsML document.

    Parameters
    ----------
    newsml_content : str / bytes
        The raw NewsML markup of one document.
    subject_tags : tuple of str
        Element names whose 'FormalName' attribute holds an IPTC
        subject code.
    text_tags : tuple of str
        Element names whose direct text is collected verbatim.
    html_tags : tuple of str
        Element names whose mixed (HTML-like) content is flattened
        via lxml's text_content().

    Returns
    -------
    (text, subjects) : tuple
        ``text`` is the collected fragments joined by blank lines;
        ``subjects`` is a list of full IPTC subject-code URIs.
    """
    # First parse of the document as XML for the structured attributes.
    xtree = etree.ElementTree(etree.fromstring(newsml_content))
    # Skip elements with no text at all: e.text is None for empty tags,
    # and None.strip() would raise AttributeError.
    text_items = [e.text.strip()
                  for tag in text_tags
                  for e in xtree.findall('//' + tag)
                  if e.text is not None]
    # Likewise skip subject elements that lack a FormalName attribute,
    # which would otherwise make the string concatenation raise TypeError.
    subjects = [IPTC_SUBJECT_PREFIX + e.get('FormalName')
                for tag in subject_tags
                for e in xtree.findall('//' + tag)
                if e.get('FormalName') is not None]
    # Then re-parse with the HTML parser for the parts that look like HTML,
    # so we can leverage the text_content() method to flatten mixed content.
    htree = etree.ElementTree(html.document_fromstring(newsml_content))
    text_items += [e.text_content().strip()
                   for tag in html_tags
                   for e in htree.findall('//' + tag)]
    text = "\n\n".join(text_items)
    return text, subjects
def register_newsml_document(text, codes, url):
    """POST one training example to the topic-classification endpoint.

    The document body is sent as UTF-8 plain text.  The example id is
    the SHA1 digest of the text, so the same content always maps to the
    same id, and each IPTC code is passed as a repeated 'concept' query
    parameter.

    Parameters
    ----------
    text : unicode/str
        The extracted document text (request body).
    codes : list of str
        Full IPTC subject-code URIs for this document.
    url : str
        Base URL of the training endpoint (query string is appended).
    """
    # Renamed from 'id' to avoid shadowing the builtin of the same name.
    doc_id = sha1(text.encode('utf-8')).hexdigest()
    # Assemble the query string in one pass instead of repeated +=.
    params = ["example_id=%s" % doc_id]
    params.extend("concept=%s" % quote(code) for code in codes)
    url += "?" + "&".join(params)
    request = urllib2.Request(url, data=text.encode('utf-8'))
    request.add_header('Content-Type', 'text/plain')
    opener = urllib2.build_opener()
    # Read (and discard) the response so the request fully completes;
    # HTTP failures surface as urllib2.HTTPError / URLError.
    opener.open(request).read()
def print_newsml_summary(text, codes, server_url=None):
    """Print a short human-readable summary of one news document.

    Shows only the first text fragment (everything before the first
    blank-line separator), then one 'code: ...' line per subject code,
    then an empty line.  ``server_url`` is accepted but unused so this
    function has the same signature as register_newsml_document and can
    be swapped in as a debugging handler.
    """
    headline = text.partition('\n\n')[0]
    lines = [headline]
    lines.extend('code: ' + c for c in codes)
    lines.append('')
    print('\n'.join(lines))
if __name__ == "__main__":
    import sys
    # TODO: use argparse and a debug switch to select print_newsml_summary
    # instead of the default handler.
    #
    # Usage: script.py <topfolder> <max_documents> <server_url>
    topfolder = sys.argv[1]
    # Renamed from 'max' to avoid shadowing the builtin.
    max_count = int(sys.argv[2])
    server_url = sys.argv[3]
    handle_news = register_newsml_document
    count = 0
    previous = time()
    for dirpath, dirnames, filenames in os.walk(topfolder):
        if count >= max_count:
            break
        if '.svn' in dirnames:
            # Prune Subversion metadata folders from the walk in place.
            dirnames.remove('.svn')
        for filename in filenames:
            if count >= max_count:
                break
            if not filename.endswith('.xml'):
                continue
            # os.walk already yields dirpath rooted at topfolder, so
            # joining topfolder again would double the prefix for
            # relative paths; join dirpath + filename only.
            full_path = os.path.join(dirpath, filename)
            # Close the file deterministically instead of leaking the handle.
            with open(full_path, 'rb') as f:
                newsml_content = f.read()
            text, codes = find_text_and_subjects(newsml_content)
            if not codes:
                # Ignore documents without subject info.
                continue
            handle_news(text, codes, server_url)
            count += 1
            if count % 100 == 0:
                # Report throughput every 100 processed documents.
                delta, previous = time() - previous, time()
                print("Processed news %03d/%03d in %06.3fs"
                      % (count, max_count, delta))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment