# amon-ra/newsml-import.py

## newsml-import.py
import xml.etree.ElementTree as ET
import os
import re
import sys
import traceback
from datetime import datetime

# Creates a csv to import with wp-all-import and redirect (last two columns)
# CSV_FORMAT is the header line written first to the output file:
# semicolon-separated column names, in the same order as the lists returned
# by parse_article() and parse_opinion().
CSV_FORMAT = "id;owner;title;subtitle;abstract;content;date;image;tags;category;slug;link;link2"
# Path of the CSV file this script writes. It is opened in 'w' mode, so an
# existing file with this name is overwritten.
FNAME = "data.csv"

def _quote(text):
    """Wrap *text* in double quotes for use as a CSV field.

    Straight double quotes inside the text are replaced by a typographic ”
    so they cannot terminate the quoted field early.
    """
    return '"' + text.replace('"', '”') + '"'


def _parse_item(item, cat=None):
    """Extract one CSV row from a parsed NewsML document.

    Shared implementation behind parse_article() and parse_opinion(); the two
    only differ in where the category comes from.

    Args:
        item: Parsed NewsML document; anything exposing an ElementTree-style
            ``find()`` rooted at the document element (an ``ElementTree`` or
            its root ``Element``).
        cat: Fixed category name. When ``None`` the category is read from the
            ``Value`` attribute of the first ``DescriptiveMetadata/Property``.

    Returns:
        A list of 13 strings: id, owner, title, subtitle, abstract, content,
        date, image, tags, category, slug, link, link2. Title, subtitle,
        abstract and content are already wrapped in double quotes.

    Raises:
        AttributeError: The main component, the category property, the
            DateLineDate element or the ContentItem element is missing.
        KeyError: A required attribute (``Euid``, ``Value``, ``Href``) is
            missing.
        TypeError: DateLineDate is present but empty.
        ValueError: The first 8 characters of the date are not ``YYYYMMDD``.
        IndexError: The link has fewer than three ``/``-separated parts.

    NOTE(review): only the four text fields are quoted; a ';' inside any of
    the other fields would shift columns once the caller joins the row.
    """
    c = item.find('./NewsItem/NewsComponent/NewsComponent[1]/NewsComponent')
    if cat is None:
        cat = c.find('./DescriptiveMetadata/Property[1]').attrib['Value']
    post_id = c.attrib['Euid']

    # Only the leading YYYYMMDD part is parsed; the raw value goes in the row.
    date = c.find('./DescriptiveMetadata/DateLineDate').text
    dt = datetime.strptime(date[:8], "%Y%m%d")

    # findtext() yields None for a missing element and '' for an empty one,
    # so "or <default>" covers both cases without raising.
    title = c.findtext('./NewsLines/HeadLine') or ''
    subtitle = c.findtext('./NewsLines/SubHeadLine') or ''
    abstract = c.findtext('./ContentItem/DataContent/nitf/body/body.head/abstract/p') or ''
    content = c.findtext('./ContentItem/DataContent/nitf/body/body.content') or ''
    owner = c.findtext('./ContentItem/DataContent/nitf/body/body.head/rights/rights.owner') or 'periodicoclm'
    if 'periodicoclm' in owner.lower():
        owner = 'periodicoclm'

    # Always produce a string: a keyword element without a "key" attribute
    # (or no keyword element at all) must not leak a dict into the row.
    keyword = c.find('./ContentItem/DataContent/nitf/head/docdata/key-list/keyword')
    tags = keyword.get('key', '') if keyword is not None else ''

    link = c.find('./ContentItem').attrib['Href'].replace('http://www.periodicoclm.es/', '')
    slug = link.split('/')[2]
    link2 = '/' + dt.strftime('%Y/%m/%d') + '/' + slug + '/'

    # The image is optional: an absent component or Href gives an empty field.
    image_item = item.find('./NewsItem/NewsComponent/NewsComponent[2]/NewsComponent/NewsComponent/ContentItem')
    image = image_item.get('Href', '') if image_item is not None else ''

    return [post_id, owner, _quote(title), _quote(subtitle), _quote(abstract),
            _quote(content), date, image, tags, cat, slug, link, link2]


def parse_article(item):
    """Build the CSV row for an article document.

    The category is taken from the document itself (``Value`` attribute of
    the first ``DescriptiveMetadata/Property``). See _parse_item() for the
    row layout and the exceptions raised on malformed input.
    """
    return _parse_item(item)


def parse_opinion(item):
    """Build the CSV row for an opinion document.

    Identical to parse_article() except that the category is always the
    literal ``'opinion'``. See _parse_item() for the row layout and the
    exceptions raised on malformed input.
    """
    return _parse_item(item, cat='opinion')


# Driver: write FNAME as a header line (CSV_FORMAT) followed by one
# ';'-joined row per NewsML file found in periodicoclm/.
#
# The output is opened with an explicit UTF-8 encoding because the rows
# contain the non-ASCII ” character, and inside a "with" block so the file
# is closed even if listing the directory or writing fails.
with open(FNAME, 'w', encoding='utf-8') as f:
    print(CSV_FORMAT, file=f)
    for xml_file in os.listdir("periodicoclm/"):
        if xml_file == "index.xml":
            continue
        # Progress output: one line per file examined.
        print(xml_file)
        try:
            tree = ET.parse(os.path.join("periodicoclm", xml_file))
            # The file name prefix selects the parser; any other file is
            # parsed as XML but produces no row.
            if xml_file.startswith('article'):
                line = parse_article(tree)
            elif xml_file.startswith('opinion'):
                line = parse_opinion(tree)
            else:
                line = []
            if line:
                print(';'.join(line), file=f)
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(e)
            traceback.print_exc()
	import xml.etree.ElementTree as ET
	import os
	import re
	import sys
	import traceback
	from datetime import datetime

	# Creates a csv to import with wp-all-import and redirect (last two columns)
	# CSV_FORMAT is the header line written first to the output file:
	# semicolon-separated column names, in the same order as the lists returned
	# by parse_article() and parse_opinion().
	CSV_FORMAT = "id;owner;title;subtitle;abstract;content;date;image;tags;category;slug;link;link2"
	# Path of the CSV file this script writes. It is opened in 'w' mode, so an
	# existing file with this name is overwritten.
	FNAME = "data.csv"

	def _quote(text):
		"""Wrap *text* in double quotes for use as a CSV field.

		Straight double quotes inside the text are replaced by a typographic ”
		so they cannot terminate the quoted field early.
		"""
		return '"' + text.replace('"', '”') + '"'


	def _parse_item(item, cat=None):
		"""Extract one CSV row from a parsed NewsML document.

		Shared implementation behind parse_article() and parse_opinion(); the
		two only differ in where the category comes from.

		Args:
			item: Parsed NewsML document; anything exposing an
				ElementTree-style ``find()`` rooted at the document element
				(an ``ElementTree`` or its root ``Element``).
			cat: Fixed category name. When ``None`` the category is read from
				the ``Value`` attribute of the first
				``DescriptiveMetadata/Property``.

		Returns:
			A list of 13 strings: id, owner, title, subtitle, abstract,
			content, date, image, tags, category, slug, link, link2. Title,
			subtitle, abstract and content are already wrapped in double
			quotes.

		Raises:
			AttributeError: The main component, the category property, the
				DateLineDate element or the ContentItem element is missing.
			KeyError: A required attribute (``Euid``, ``Value``, ``Href``) is
				missing.
			TypeError: DateLineDate is present but empty.
			ValueError: The first 8 characters of the date are not
				``YYYYMMDD``.
			IndexError: The link has fewer than three ``/``-separated parts.

		NOTE(review): only the four text fields are quoted; a ';' inside any
		of the other fields would shift columns once the caller joins the row.
		"""
		c = item.find('./NewsItem/NewsComponent/NewsComponent[1]/NewsComponent')
		if cat is None:
			cat = c.find('./DescriptiveMetadata/Property[1]').attrib['Value']
		post_id = c.attrib['Euid']

		# Only the leading YYYYMMDD part is parsed; the raw value goes in the row.
		date = c.find('./DescriptiveMetadata/DateLineDate').text
		dt = datetime.strptime(date[:8], "%Y%m%d")

		# findtext() yields None for a missing element and '' for an empty one,
		# so "or <default>" covers both cases without raising.
		title = c.findtext('./NewsLines/HeadLine') or ''
		subtitle = c.findtext('./NewsLines/SubHeadLine') or ''
		abstract = c.findtext('./ContentItem/DataContent/nitf/body/body.head/abstract/p') or ''
		content = c.findtext('./ContentItem/DataContent/nitf/body/body.content') or ''
		owner = c.findtext('./ContentItem/DataContent/nitf/body/body.head/rights/rights.owner') or 'periodicoclm'
		if 'periodicoclm' in owner.lower():
			owner = 'periodicoclm'

		# Always produce a string: a keyword element without a "key" attribute
		# (or no keyword element at all) must not leak a dict into the row.
		keyword = c.find('./ContentItem/DataContent/nitf/head/docdata/key-list/keyword')
		tags = keyword.get('key', '') if keyword is not None else ''

		link = c.find('./ContentItem').attrib['Href'].replace('http://www.periodicoclm.es/', '')
		slug = link.split('/')[2]
		link2 = '/' + dt.strftime('%Y/%m/%d') + '/' + slug + '/'

		# The image is optional: an absent component or Href gives an empty field.
		image_item = item.find('./NewsItem/NewsComponent/NewsComponent[2]/NewsComponent/NewsComponent/ContentItem')
		image = image_item.get('Href', '') if image_item is not None else ''

		return [post_id, owner, _quote(title), _quote(subtitle), _quote(abstract),
				_quote(content), date, image, tags, cat, slug, link, link2]


	def parse_article(item):
		"""Build the CSV row for an article document.

		The category is taken from the document itself (``Value`` attribute of
		the first ``DescriptiveMetadata/Property``). See _parse_item() for the
		row layout and the exceptions raised on malformed input.
		"""
		return _parse_item(item)


	def parse_opinion(item):
		"""Build the CSV row for an opinion document.

		Identical to parse_article() except that the category is always the
		literal ``'opinion'``. See _parse_item() for the row layout and the
		exceptions raised on malformed input.
		"""
		return _parse_item(item, cat='opinion')


	# Driver: write FNAME as a header line (CSV_FORMAT) followed by one
	# ';'-joined row per NewsML file found in periodicoclm/.
	#
	# The output is opened with an explicit UTF-8 encoding because the rows
	# contain the non-ASCII ” character, and inside a "with" block so the file
	# is closed even if listing the directory or writing fails.
	with open(FNAME, 'w', encoding='utf-8') as f:
		print(CSV_FORMAT, file=f)
		for xml_file in os.listdir("periodicoclm/"):
			if xml_file == "index.xml":
				continue
			# Progress output: one line per file examined.
			print(xml_file)
			try:
				tree = ET.parse(os.path.join("periodicoclm", xml_file))
				# The file name prefix selects the parser; any other file is
				# parsed as XML but produces no row.
				if xml_file.startswith('article'):
					line = parse_article(tree)
				elif xml_file.startswith('opinion'):
					line = parse_opinion(tree)
				else:
					line = []
				if line:
					print(';'.join(line), file=f)
			except Exception as e:
				# Best effort: report the failing file and carry on with the rest.
				print(e)
				traceback.print_exc()