jartigag/exportOPMLWithTags.py

## exportOPMLWithTags.py
#!/usr/bin/env python3

# this script exports the urls file to OPML, including tags. for that, all feeds must have only one tag

#usage: ./exportOPMLWithTags.py urls > urls.opml

#requirements (only needed to get the title from an rss feed if it isn't cached in newsboat):
# pip install feedparser

#input-output example:
#
# $ cat urls
# https://xkcd.com/rss.xml "must"
# http://www.commitstrip.com/en/feed/? "must"
# https://jartigag.xyz/feed.xml "personal"
#
# $ cat urls.opml
# <?xml version="1.0" ?>
# <opml version="2.0">
#   <head/>
#   <body>
#     <outline title="must" type="rss">
#       <outline title="xkcd.com" type="rss" xmlUrl="https://xkcd.com/rss.xml"/>
#       <outline title="CommitStrip" type="rss" xmlUrl="http://www.commitstrip.com/en/feed/?"/>
#     </outline>
#     <outline title="personal" type="rss">
#       <outline title="jartigag" type="rss" xmlUrl="https://jartigag.xyz/feed.xml"/>
#     </outline>
#   </body>
# </opml>

from xml.etree import ElementTree as ET
from xml.dom import minidom
import sys, os
import csv
import sqlite3

def read_feed_lines(path):
    """Return the rows of a newsboat ``urls`` file.

    Each row is the list of space-separated fields of one line, with
    double-quoted fields (e.g. ``"my tag"``) kept together by ``csv``.
    """
    # newline="" lets the csv module do its own line-ending handling.
    with open(path, newline="") as f:
        return list(csv.reader(f, delimiter=" "))


def load_cached_titles(db_path=None):
    """Return a ``{rssurl: title}`` dict read from newsboat's cache.

    db_path -- path to the sqlite cache; defaults to ~/.newsboat/cache.db

    The cache is only a convenience, so a missing or unreadable database
    yields an empty dict instead of aborting the export.
    """
    if db_path is None:
        db_path = os.path.join(os.path.expanduser("~"), ".newsboat", "cache.db")
    if not os.path.isfile(db_path):
        # sqlite3.connect() would create an empty database file here
        return {}
    try:
        conn = sqlite3.connect(db_path)
        try:
            rows = conn.execute("select rssurl,title from rss_feed").fetchall()
        finally:
            # `with conn:` only manages the transaction, it never closes
            conn.close()
    except sqlite3.Error:
        return {}
    # a NULL title can't be written as an XML attribute: treat it as not cached
    return {url: title for url, title in rows if title is not None}


def fetch_feed_title(url):
    """Return the title announced by the feed at ``url``, or "" if unknown.

    Needs the optional ``feedparser`` package; without it the title is
    left blank.
    """
    try:
        import feedparser
    except ImportError:
        return ""
    print(f"getting title from {url}", file=sys.stderr)
    try:
        return feedparser.parse(url)['feed']['title']
    except KeyError:
        # can't get title neither from cache.db nor the xml of the feed,
        # so left title blank
        return ""


def build_opml(lines, cached_titles=None, fetch_title=None):
    """Build the OPML element tree for the given urls rows.

    lines         -- rows as returned by read_feed_lines(): [url, tag, ...]
    cached_titles -- {url: title} mapping looked up before fetching
    fetch_title   -- callable(url) -> title used when the url isn't cached
                     (defaults to fetch_feed_title)

    Returns the root ``<opml>`` element: one ``<outline>`` per tag, in order
    of first appearance, holding one ``<outline>`` per feed.
    """
    if cached_titles is None:
        cached_titles = {}
    if fetch_title is None:
        fetch_title = fetch_feed_title

    root = ET.Element('opml', version='2.0')
    ET.SubElement(root, 'head')
    body = ET.SubElement(root, 'body')

    groups = {}  # tag name -> its <outline> element
    for line in lines:
        if not line or line[0].startswith("#"):
            # blank line or comment in the urls file
            continue
        if len(line) < 2:
            # lines must be `url "tag"`, so ignore this line
            print(f"ignoring this line:\n{' '.join(line)}", file=sys.stderr)
            continue

        url, tag_name = line[0], line[1]
        group = groups.get(tag_name)
        if group is None:
            # that is, this tag doesn't exist yet
            group = ET.SubElement(body, 'outline', type='rss', title=tag_name)
            groups[tag_name] = group

        title = cached_titles.get(url)
        if title is None:
            # that is, this feed's title isn't in ~/.newsboat/cache.db
            title = fetch_title(url)
        ET.SubElement(group, 'outline', type='rss', xmlUrl=url, title=title)

    return root


def render_opml(root):
    """Return the tree rooted at ``root`` as pretty-printed XML text."""
    return minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")


def main(argv=None):
    """Export the urls file named in ``argv[1]`` to OPML on stdout.

    Returns the process exit status: 0 on success, 1 on error. Errors are
    reported on stderr so they never end up inside the redirected OPML.
    """
    if argv is None:
        argv = sys.argv
    try:
        if len(argv) < 2:
            raise Exception("input file needed")
        inputfilename = argv[1]
        if not os.path.isfile(inputfilename):
            raise Exception(f"{inputfilename} not found")

        lines = read_feed_lines(inputfilename)
        root = build_opml(lines, load_cached_titles())
        print(render_opml(root))
    except Exception as e:
        print(e, file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
	#!/usr/bin/env python3

	# this script exports the urls file to OPML, including tags. for that, all feeds must have only one tag

	#usage: ./exportOPMLWithTags.py urls > urls.opml

	#requirements (only needed to get the title from an rss feed if it isn't cached in newsboat):
	# pip install feedparser

	#input-output example:
	#
	# $ cat urls
	# https://xkcd.com/rss.xml "must"
	# http://www.commitstrip.com/en/feed/? "must"
	# https://jartigag.xyz/feed.xml "personal"
	#
	# $ cat urls.opml
	# <?xml version="1.0" ?>
	# <opml version="2.0">
	#   <head/>
	#   <body>
	#     <outline title="must" type="rss">
	#       <outline title="xkcd.com" type="rss" xmlUrl="https://xkcd.com/rss.xml"/>
	#       <outline title="CommitStrip" type="rss" xmlUrl="http://www.commitstrip.com/en/feed/?"/>
	#     </outline>
	#     <outline title="personal" type="rss">
	#       <outline title="jartigag" type="rss" xmlUrl="https://jartigag.xyz/feed.xml"/>
	#     </outline>
	#   </body>
	# </opml>

	from xml.etree import ElementTree as ET
	from xml.dom import minidom
	import sys, os
	import csv
	import sqlite3

	def read_feed_lines(path):
	    """Return the rows of a newsboat ``urls`` file.

	    Each row is the list of space-separated fields of one line, with
	    double-quoted fields (e.g. ``"my tag"``) kept together by ``csv``.
	    """
	    # newline="" lets the csv module do its own line-ending handling.
	    with open(path, newline="") as f:
	        return list(csv.reader(f, delimiter=" "))


	def load_cached_titles(db_path=None):
	    """Return a ``{rssurl: title}`` dict read from newsboat's cache.

	    db_path -- path to the sqlite cache; defaults to ~/.newsboat/cache.db

	    The cache is only a convenience, so a missing or unreadable database
	    yields an empty dict instead of aborting the export.
	    """
	    if db_path is None:
	        db_path = os.path.join(os.path.expanduser("~"), ".newsboat", "cache.db")
	    if not os.path.isfile(db_path):
	        # sqlite3.connect() would create an empty database file here
	        return {}
	    try:
	        conn = sqlite3.connect(db_path)
	        try:
	            rows = conn.execute("select rssurl,title from rss_feed").fetchall()
	        finally:
	            # `with conn:` only manages the transaction, it never closes
	            conn.close()
	    except sqlite3.Error:
	        return {}
	    # a NULL title can't be written as an XML attribute: treat it as not cached
	    return {url: title for url, title in rows if title is not None}


	def fetch_feed_title(url):
	    """Return the title announced by the feed at ``url``, or "" if unknown.

	    Needs the optional ``feedparser`` package; without it the title is
	    left blank.
	    """
	    try:
	        import feedparser
	    except ImportError:
	        return ""
	    print(f"getting title from {url}", file=sys.stderr)
	    try:
	        return feedparser.parse(url)['feed']['title']
	    except KeyError:
	        # can't get title neither from cache.db nor the xml of the feed,
	        # so left title blank
	        return ""


	def build_opml(lines, cached_titles=None, fetch_title=None):
	    """Build the OPML element tree for the given urls rows.

	    lines         -- rows as returned by read_feed_lines(): [url, tag, ...]
	    cached_titles -- {url: title} mapping looked up before fetching
	    fetch_title   -- callable(url) -> title used when the url isn't cached
	                     (defaults to fetch_feed_title)

	    Returns the root ``<opml>`` element: one ``<outline>`` per tag, in order
	    of first appearance, holding one ``<outline>`` per feed.
	    """
	    if cached_titles is None:
	        cached_titles = {}
	    if fetch_title is None:
	        fetch_title = fetch_feed_title

	    root = ET.Element('opml', version='2.0')
	    ET.SubElement(root, 'head')
	    body = ET.SubElement(root, 'body')

	    groups = {}  # tag name -> its <outline> element
	    for line in lines:
	        if not line or line[0].startswith("#"):
	            # blank line or comment in the urls file
	            continue
	        if len(line) < 2:
	            # lines must be `url "tag"`, so ignore this line
	            print(f"ignoring this line:\n{' '.join(line)}", file=sys.stderr)
	            continue

	        url, tag_name = line[0], line[1]
	        group = groups.get(tag_name)
	        if group is None:
	            # that is, this tag doesn't exist yet
	            group = ET.SubElement(body, 'outline', type='rss', title=tag_name)
	            groups[tag_name] = group

	        title = cached_titles.get(url)
	        if title is None:
	            # that is, this feed's title isn't in ~/.newsboat/cache.db
	            title = fetch_title(url)
	        ET.SubElement(group, 'outline', type='rss', xmlUrl=url, title=title)

	    return root


	def render_opml(root):
	    """Return the tree rooted at ``root`` as pretty-printed XML text."""
	    return minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")


	def main(argv=None):
	    """Export the urls file named in ``argv[1]`` to OPML on stdout.

	    Returns the process exit status: 0 on success, 1 on error. Errors are
	    reported on stderr so they never end up inside the redirected OPML.
	    """
	    if argv is None:
	        argv = sys.argv
	    try:
	        if len(argv) < 2:
	            raise Exception("input file needed")
	        inputfilename = argv[1]
	        if not os.path.isfile(inputfilename):
	            raise Exception(f"{inputfilename} not found")

	        lines = read_feed_lines(inputfilename)
	        root = build_opml(lines, load_cached_titles())
	        print(render_opml(root))
	    except Exception as e:
	        print(e, file=sys.stderr)
	        return 1
	    return 0


	if __name__ == "__main__":
	    sys.exit(main())