Skip to content

Instantly share code, notes, and snippets.

@amon-ra
Last active February 18, 2021 16:51
Show Gist options
  • Save amon-ra/a2996b0e6bc2b3810fa57f2b66ba61e9 to your computer and use it in GitHub Desktop.
Save amon-ra/a2996b0e6bc2b3810fa57f2b66ba61e9 to your computer and use it in GitHub Desktop.
NewsML to CSV
import xml.etree.ElementTree as ET
import os
import re
import sys
import traceback
from datetime import datetime
# Header row of the generated CSV. The CSV is meant to be imported with
# wp-all-import; columns are separated by ';' and the last two columns
# (link, link2) are there for setting up redirects.
CSV_FORMAT = "id;owner;title;subtitle;abstract;content;date;image;tags;category;slug;link;link2"
# Name of the CSV file the script writes.
FNAME = "data.csv"
def _quote(text):
    """Return *text* wrapped in double quotes for the ';'-separated CSV.

    Inner '"' characters are replaced with '”' so they cannot terminate
    the quoted field early.
    """
    return '"' + text.replace('"', '”') + '"'


def _text(node, path, default=''):
    """Return the text of the first element matching *path* under *node*.

    Falls back to *default* when the element is missing or has no text.
    """
    found = node.find(path)
    if found is None:
        return default
    return found.text or default


def _parse_item(item, cat=None):
    """Build one CSV row from a parsed NewsML document.

    Args:
        item: an ``ElementTree`` (or root ``Element``) of a NewsML file.
        cat: category to store in the row. When ``None`` it is read from
            the ``Value`` attribute of the first ``Property`` under
            ``DescriptiveMetadata``.

    Returns:
        A list of 13 strings in the column order of ``CSV_FORMAT``:
        id, owner, title, subtitle, abstract, content, date, image, tags,
        category, slug, link, link2. The four free-text fields (title,
        subtitle, abstract, content) are already double-quoted.

    Raises:
        ValueError: the main ``NewsComponent`` is not present.
        AttributeError, KeyError, IndexError, TypeError: a required part
            (``Euid``, ``DateLineDate``, category ``Property``,
            ``ContentItem``/``Href``, or a link with fewer than three
            path segments) is missing.
    """
    c = item.find('./NewsItem/NewsComponent/NewsComponent[1]/NewsComponent')
    if c is None:
        raise ValueError('main NewsComponent not found in document')

    if cat is None:
        cat = c.find('./DescriptiveMetadata/Property[1]').attrib['Value']
    post_id = c.attrib['Euid']

    # Only the leading YYYYMMDD part of the date line is used for link2;
    # the raw value is exported unchanged in the "date" column.
    date = c.find('./DescriptiveMetadata/DateLineDate').text
    dt = datetime.strptime(date[:8], "%Y%m%d")

    # Free-text fields are optional: a missing or empty element yields ''.
    title = _text(c, './NewsLines/HeadLine')
    subtitle = _text(c, './NewsLines/SubHeadLine')
    abstract = _text(c, './ContentItem/DataContent/nitf/body/body.head/abstract/p')
    content = _text(c, './ContentItem/DataContent/nitf/body/body.content')

    owner = _text(
        c,
        './ContentItem/DataContent/nitf/body/body.head/rights/rights.owner',
        'periodicoclm',
    )
    # Collapse every spelling that mentions the site name to one owner id.
    if 'periodicoclm' in owner.lower():
        owner = 'periodicoclm'

    # Always produce a string: an attribute-less <keyword> used to leave an
    # empty dict in the row, which broke ';'.join() in the caller.
    keyword = c.find('./ContentItem/DataContent/nitf/head/docdata/key-list/keyword')
    tags = keyword.get('key', '') if keyword is not None else ''

    link = c.find('./ContentItem').attrib['Href']
    link = link.replace('http://www.periodicoclm.es/', '')
    # The slug is the third path segment of the site-relative link.
    slug = link.split('/')[2]
    link2 = '/' + dt.strftime('%Y/%m/%d') + '/' + slug + '/'

    # The image is optional; a missing element or missing Href gives ''
    # instead of relying on a swallowed exception.
    image_item = item.find(
        './NewsItem/NewsComponent/NewsComponent[2]/NewsComponent/NewsComponent/ContentItem'
    )
    image = image_item.get('Href', '') if image_item is not None else ''

    return [
        post_id,
        owner,
        _quote(title),
        _quote(subtitle),
        _quote(abstract),
        _quote(content),
        date,
        image,
        tags,
        cat,
        slug,
        link,
        link2,
    ]


def parse_article(item):
    """Return the CSV row for an article file.

    The category is taken from the document itself (``Value`` of the first
    ``Property`` under ``DescriptiveMetadata``). See ``_parse_item`` for
    the row layout and the exceptions raised on malformed documents.
    """
    return _parse_item(item)


def parse_opinion(item):
    """Return the CSV row for an opinion file.

    Identical to ``parse_article`` except that the category column is
    always the fixed string ``'opinion'``.
    """
    return _parse_item(item, cat='opinion')
# Directory the NewsML XML files are read from.
SRC_DIR = "periodicoclm/"

# Explicit UTF-8: rows contain non-ASCII text (e.g. the '”' that replaces
# double quotes), so the output must not depend on the platform's default
# encoding. The `with` block closes the file even if the loop is interrupted.
with open(FNAME, 'w', encoding='utf-8') as f:
    # First line of the CSV is the header row.
    print(CSV_FORMAT, file=f)
    # Sorted so the row order is reproducible between runs.
    for xml_file in sorted(os.listdir(SRC_DIR)):
        if xml_file == "index.xml":
            continue
        # Progress output: name of the file being processed.
        print(xml_file)
        try:
            tree = ET.parse(os.path.join(SRC_DIR, xml_file))
            # The file name prefix selects the parser; other files are
            # parsed but produce no row.
            if xml_file.startswith('article'):
                line = parse_article(tree)
            elif xml_file.startswith('opinion'):
                line = parse_opinion(tree)
            else:
                line = []
            if line:
                print(';'.join(line), file=f)
        except Exception as e:
            # Best effort: report the broken file and carry on with the rest.
            print(e)
            traceback.print_exc()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment