@lstroud
Created April 25, 2014 19:29
This script will convert an OPML feed list between file formats (XML, JSON, and CSV), scrub the list for feeds that are still active, filter the list by expression, and de-duplicate the feeds in the list. I built it because I needed to reorganize my feeds. Moving the feeds to a CSV file, where the tag column is the folder(s), made it very quick …
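A typical invocation (hypothetical file names; the options are defined in the CliBuilder block below) might look like:

groovy scrub_opml.groovy -a -d -s -i XML -t CSV -o feeds.csv feeds.opml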
/*
Copyright 2014 Les Stroud
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
@Grab(group='org.codehaus.gpars', module='gpars', version='1.1.0')
@Grab(group='org.codehaus.groovy.modules.http-builder', module='http-builder', version='0.7' )
@Grab(group='net.sourceforge.nekohtml', module='nekohtml', version='1.9.20')
@Grab(group='net.sf.opencsv', module='opencsv', version='2.3')
import groovyx.net.http.HTTPBuilder
import groovyx.net.http.ContentType
import groovyx.net.http.Method
import groovyx.net.http.Status
import groovyx.gpars.GParsPool
import org.cyberneko.html.parsers.SAXParser
import groovy.xml.XmlUtil
import org.xml.sax.SAXException
import groovy.xml.MarkupBuilder
import groovy.json.JsonBuilder
import groovy.json.JsonSlurper
import au.com.bytecode.opencsv.CSVReader
import au.com.bytecode.opencsv.CSVWriter
import groovy.json.JsonOutput
rootFolder = null;
currentFolder = null;
count = [
active_count:0,
inactive_count:0,
total_count:0
]
feed_count = 0;
active_summary = [];
def cli = new CliBuilder(usage: 'groovy scrub_opml.groovy [-h] [-ads] [-f <expression closure>] [-i <inputformat>] [-t <outputformat>] [-o <outputfile_path>] <inputfile_path>')
cli.with {
h longOpt: 'help', 'Show usage information'
a longOpt: 'active', 'Filter for active feeds'
i longOpt: 'informat', args:1, argName:'in-format', 'Input file format. Valid values (XML, JSON, CSV)'
t longOpt: 'outformat', args:1, argName:'out-format','Output file format. Valid values (XML, JSON, CSV)'
o longOpt: 'outfile', args:1, argName:'out-path', 'Output file path.'
d longOpt: 'dedup', 'Deduplicate the feeds.'
s longOpt: 'stats', 'Print stats.'
f longOpt: 'filter', args:1, argName:'filter-exp', 'Filter feeds by closure (closure must be quoted and evaluatable) [ -f "{feed -> return feed.name.startsWith(\'CSS\')}" ]'
}
filter_active = false
dedup = false
informat = null
outformat = null
input_file = null
output_file = null
print_stats = false
filter = null
if(!args){
cli.usage()
System.exit(1)
}
def options = cli.parse(args)
if (options.h) {
cli.usage()
System.exit(0)
}
if(options.a)
filter_active = true
if(options.d)
dedup = true
if(options.s)
print_stats = true
if(options.f){
filter = Eval.me(options.f)
}
if(options.i){
informat = OPMLModel.FORMAT.valueOf(options.i)
} else {
informat = OPMLModel.FORMAT.XML
}
if(options.t){
outformat = OPMLModel.FORMAT.valueOf(options.t)
} else {
outformat = informat
}
if(options.o){
try{
output_file = new File(options.o)
} catch (t){
t.printStackTrace()
cli.usage()
System.exit(1)
}
}
def extraArguments = options.arguments()
if(extraArguments){
try{
input_file = new File(extraArguments[0])
} catch (t){
t.printStackTrace()
cli.usage()
System.exit(1)
}
}
def main(){
def opmlModel = OPMLModel.deserialize(informat, input_file)
if(print_stats)
println JsonOutput.prettyPrint(JsonOutput.toJson(opmlModel.stats))
if(filter_active){
// set the progress bar denominator before the (slow) per-feed network check
feed_count = opmlModel.stats.feed_count
opmlModel = opmlModel.filterBy(isActive)
println "After Active Filter"
println JsonOutput.prettyPrint(JsonOutput.toJson(opmlModel.stats))
}
if(filter){
opmlModel = opmlModel.filterBy(filter)
println "After Filter ${filter}"
println JsonOutput.prettyPrint(JsonOutput.toJson(opmlModel.stats))
}
if(dedup){
opmlModel = opmlModel.deDup()
println "After DeDuplication"
println JsonOutput.prettyPrint(JsonOutput.toJson(opmlModel.stats))
}
def output = opmlModel.serialize(outformat)
if(output_file){
output_file.text = output
} else {
println output
}
}
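// isActive: fetch a feed over HTTP (retrying once on transient failures) and treat it
// as active if any <pubDate> in the response falls within the last year.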
year_ago = new Date() - 365;
isActive = {feed ->
//def content = feed.url.toURL().text
def content = ""
def active = false
def tryagain = false
def error_msg = null
def _pubdates = []
try {
//def http = new HTTPBuilder(feed.url)
//def rss = http.get([contentType: groovyx.net.http.ContentType.XML])
//def content = feed.url.toURL().getText([connectTimeout:5000, readTimeout:20000])
def resp = new HTTPBuilder().request(feed.url, Method.GET, ContentType.TEXT) { req ->
headers.Accept = 'application/rss+xml, application/rdf+xml, application/xml, text/xml'
response.success = { r, reader ->
content = reader.text
return r
}
def unrecoverable = { r ->
throw new RuntimeException(r.statusLine as String)
}
response.'404' = unrecoverable
response.'500' = unrecoverable
response.'403' = unrecoverable
response.failure = { r ->
tryagain = true;
return r
}
}
if(tryagain){
tryagain = false
resp = new HTTPBuilder().request(feed.url, Method.GET, ContentType.TEXT) { req ->
headers.Accept = 'application/rss+xml, application/rdf+xml, application/xml, text/xml'
response.success = { r, reader ->
content = reader.text
return r
}
response.failure = { r -> throw new RuntimeException(r.statusLine as String)}
}
}
if(Status.SUCCESS.matches(resp.status)){
//feeds were too inconsistent and I didn't really need it to be parsed
//def rss = new XmlSlurper().parseText(content)
//println XmlUtil.serialize(rss)
//active = (rss.channel.item.pubDate.find{(new Date(it.text())) > year_ago})
def m = content =~ /pubDate>(.*)<\/pubDate/
m.each{match ->
def _pubdate
try{
_pubdate = new Date(match[1])
_pubdates << _pubdate
} catch(t){;;}
if(_pubdate && _pubdate > year_ago)
active = true
}
}
} catch (IOException ioe){
error_msg = "ERROR checking: ${feed.name} ${feed.url}"
def sw = new StringWriter()
def pw = new PrintWriter(sw)
ioe.printStackTrace(pw)
error_msg += "\n" + sw.toString()
active = false
} catch (SAXException se){
error_msg = "ERROR checking: ${feed.url} - [Parse Error - assuming active]"
active = true
} catch (t){
error_msg = "ERROR checking: ${feed.name} ${feed.url} - [${t.message}]"
active = false
}
count.total_count++
if(active)
count.active_count++
else
count.inactive_count++
printProgBar((count.total_count/feed_count)*100 as int)
active_summary << "${feed.name} [${(active)?'Active':'Inactive'}] - Latest pub date: ${_pubdates.max() as String}"
return active
}
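// In-memory model of the feed list: a root Folder whose children are Feeds and nested Folders.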
class OPMLModel {
static enum FORMAT {
XML, JSON, CSV
}
def root;
OPMLModel(){
root = new Folder(name:'Root')
}
static OPMLModel deserialize(FORMAT format, File file){
switch(format){
case FORMAT.XML:
return XMLConverter.deserialize(file);
case FORMAT.JSON:
return JSONConverter.deserialize(file);
case FORMAT.CSV:
return CSVConverter.deserialize(file);
}
}
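// De-duplicate feeds, keyed by URL. The first copy of a feed keeps its folder;
// a later copy only contributes its folder when the first one had none.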
def deDup(){
def feed_map = [:]
deDupFeeds(this.root.children, null, feed_map)
//println JsonOutput.prettyPrint(JsonOutput.toJson(feed_map))
def folder_hash = [:]
OPMLModel model = new OPMLModel()
feed_map.each{ k,v ->
if(v.folders){
v.folders.each { t ->
def _t = t.trim()
if(!folder_hash[_t])
folder_hash[_t] = new Folder(name:_t)
folder_hash[_t].children << v.feed
}
} else {
model.root.children << v.feed
}
}
folder_hash.each{ k,v ->
model.root.children << v
}
return model
}
private deDupFeeds(nodes, parent, feed_map){
nodes.each{ node ->
if(node instanceof Feed){
if(!feed_map[node.url]){
def _folders = []
if(parent)
_folders = [parent.name]
feed_map[node.url] = [feed: node, folders: _folders]
}
else{
if(feed_map[node.url].folders){
//println "Feed: ${node.name} already exists in folder(s) ${feed_map[node.url].folders}. Dropping this copy ${node.toString()} in folder ${parent.toString()}"
;;
} else {
if(parent){
feed_map[node.url].folders << parent.name
}
}
}
} else if(node instanceof Folder) {
deDupFeeds(node.children, node, feed_map)
}
}
}
def getStats(){
def stats = [feed_count: 0, folder_count: 0, feeds_in_folders: 0]
getOPMLStats(this.root.children, null, stats)
return stats
}
private getOPMLStats(nodes, parent, stats){
nodes.each{ node ->
if(node instanceof Feed){
stats.feed_count++;
if(parent)
stats.feeds_in_folders++;
} else if(node instanceof Folder) {
stats.folder_count++;
getOPMLStats(node.children, node, stats)
}
}
}
def OPMLModel filterBy(predicate){
def model = new OPMLModel();
model.root = this.root.filterBy(predicate)
return model
}
def String serialize(FORMAT format){
switch(format){
case FORMAT.XML:
return XMLConverter.serialize(this);
case FORMAT.JSON:
return JSONConverter.serialize(this);
case FORMAT.CSV:
return CSVConverter.serialize(this);
}
}
}
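// OPML XML <-> model. An <outline> element without an xmlUrl attribute is treated as a folder; otherwise it is a feed.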
class XMLConverter {
static OPMLModel deserialize(File file){
def model = new OPMLModel();
def records = new XmlSlurper().parseText(file.text)
model.root.children = buildFolder(records.body.outline)
return model
}
static String serialize(OPMLModel model){
def writer = new StringWriter()
def xml = new MarkupBuilder(writer)
xml.opml(version:'1.0'){
head{
title 'RSS Feeds'
}
body{
for(f in model.root.children){
if(f instanceof Feed){
outline(text: f.name, title:f.title, url:f.url, xmlUrl:f.url)
} else if(f instanceof Folder) {
serializeFolder(xml, f)
}
}
}
}
return writer.toString()
}
private static serializeFolder(builder, folder){
builder.outline(text: folder.name){
for(f in folder.children){
if(f instanceof Feed){
outline(text: f.name, title:f.title, url:f.url, xmlUrl:f.url)
} else if(f instanceof Folder) {
serializeFolder(builder, f)
}
}
}
}
private static buildFolder(elements){
def feed_list = [];
//println "building elements ${elements.size()}"
elements.each{ element ->
if(!element.'@xmlUrl'.text()){ //is a folder
//println "Found folder ${element.'@text'} with ${element.outline.size()} children."
def folder = new Folder(name:element.'@text'.text())
folder.children = buildFolder(element.outline);
feed_list << folder
} else {
feed_list << new Feed(name:element.'@text'.text(), title:element.'@title'.text(), url:element.'@xmlUrl'.text() )
}
}
return feed_list
}
}
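// JSON <-> model. An element without a url is treated as a folder with children; otherwise it is a feed.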
class JSONConverter {
static OPMLModel deserialize(File file){
def model = new OPMLModel();
def json = new JsonSlurper().parseText(file.text)
model.root.children = buildFolder(json)
return model
}
static String serialize(OPMLModel model){
return JsonOutput.prettyPrint(JsonOutput.toJson(model.root.children))
}
private static buildFolder(elements){
def feed_list = [];
//println "building elements ${elements.size()}"
elements.each{ element ->
if(!element.url){ //is a folder
def folder = new Folder(name:element.name)
folder.children = buildFolder(element.children);
feed_list << folder
} else {
feed_list << new Feed(name:element.name, title:element.title, url:element.url ) // JSON from serialize() uses 'name', not 'text'
}
}
return feed_list
}
}
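// CSV <-> model. Columns are Name, Title, Url, Tags; Tags holds comma-separated folder names, and a feed is placed in each listed folder on import.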
class CSVConverter {
static OPMLModel deserialize(File file){
StringReader sr = new StringReader(file.text)
CSVReader reader = new CSVReader(sr, ',' as char, '"' as char, 1)
def folder_hash = [:]
OPMLModel model = new OPMLModel()
reader.readAll().each{ row ->
def _name, _title, _url, _tags
_name = row[0]
_title = row[1]
_url = row[2]
_tags = row[3]
if(_tags){
_tags.split(',').each { t ->
def _t = t.trim()
if(!folder_hash[_t])
folder_hash[_t] = new Folder(name:_t)
folder_hash[_t].children << new Feed(name:_name, title:_title, url:_url)
}
} else {
model.root.children << new Feed(name:_name, title:_title, url:_url)
}
}
folder_hash.each{ k,v ->
model.root.children << v
}
return model
}
static String serialize(OPMLModel model){
def model_list = []
String[] header = ["Name", "Title", "Url", "Tags"]
model_list << header
flatten(model.root.children, null, model_list)
StringWriter sw = new StringWriter()
CSVWriter writer = new CSVWriter(sw, ',' as char, '"' as char)
model_list.each{ String[] row ->
writer.writeNext(row)
}
return sw.toString()
}
private static List flatten(nodes, parent, node_list){
nodes.each{ node ->
if(node instanceof Feed){
def node_tags = ""
if(parent)
node_tags = parent.name
String[] flatnode = [node.name, node.title, node.url, node_tags]
node_list << flatnode
} else if(node instanceof Folder) {
flatten(node.children, node, node_list)
}
}
}
}
class Folder {
String name
def children = []
def filterBy(predicate){
def filtered = []
//GParsPool.withPool(8){
children.each{c ->
def result = c.filterBy(predicate)
if(result)
filtered << result
}
//}
if(filtered){
def _filteredFolder = new Folder(name:name);
_filteredFolder.children = filtered;
return _filteredFolder;
} else {
return null;
}
}
def String toString(){
return "Folder: ${name} Child Count: ${children.size()}"
}
}
class Feed {
String name
String title
String url
//int article_count = 0
def filterBy(predicate){
if(predicate(this))
return this
else
return null
}
def String toString(){
return "Feed Name: ${name} Title: ${title} Url:${url}"
}
}
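// Render a 50-character console progress bar with running active|inactive counts.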
def void printProgBar(int percent){
StringBuilder bar = new StringBuilder("[");
for(int i = 0; i < 50; i++){
if( i < (percent/2)){
bar.append("=");
}else if( i == (percent/2)){
bar.append(">");
}else{
bar.append(" ");
}
}
bar.append("] " + percent + "% ");
bar.append(" ${count.active_count}|${count.inactive_count} ")
System.out.print("\r" + bar.toString());
}
main()