Skip to content

Instantly share code, notes, and snippets.

@Segerberg
Created October 2, 2018 11:16
Show Gist options
  • Save Segerberg/52b9aa160ccddf9de8aff3035102d5ef to your computer and use it in GitHub Desktop.
Save Segerberg/52b9aa160ccddf9de8aff3035102d5ef to your computer and use it in GitHub Desktop.
import feedparser
import sqlite3
import os
from datetime import datetime
import json
"""
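Small script that collects links from RSS feeds, skips those already seen
(deduplicated via a local SQLite database) and prints a Squidwarc crawl
configuration for archiving the new ones.

Usage:
$ python rss_getter.py > conf.json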
"""
class Dedup:
    """Persistent key/value store in SQLite used to remember seen links.

    Every method opens its own short-lived connection to ``self.file`` and
    closes it again before returning, including when the statement raises.
    """

    def __init__(self, db_path='./dedup.db'):
        """Remember which database file to use.

        Args:
            db_path: Path of the SQLite file. Defaults to ``./dedup.db``,
                i.e. a file in the current working directory.
        """
        self.file = db_path

    def start(self):
        """Create the ``dedup`` table if it does not exist yet."""
        conn = sqlite3.connect(self.file)
        try:
            conn.execute(
                'create table if not exists dedup ('
                '  key varchar(300) primary key,'
                '  value varchar(4000)'
                ');')
            conn.commit()
        finally:
            # Release the file handle even if the statement failed.
            conn.close()

    def save(self, digest_key, url):
        """Store ``url`` under ``digest_key``, replacing any existing row.

        Args:
            digest_key: Primary key of the row.
            url: Value stored in the ``value`` column.
        """
        conn = sqlite3.connect(self.file)
        try:
            conn.execute(
                'insert or replace into dedup (key, value) values (?, ?)',
                (digest_key, url))
            conn.commit()
        finally:
            conn.close()

    def lookup(self, digest_key, url=None):
        """Tell whether ``digest_key`` is already present in the table.

        Args:
            digest_key: Key to look for.
            url: Accepted for interface compatibility; not used.

        Returns:
            True if a row with this key exists, otherwise False.
        """
        conn = sqlite3.connect(self.file)
        try:
            cursor = conn.execute(
                'select value from dedup where key = ?', (digest_key,))
            return cursor.fetchone() is not None
        finally:
            conn.close()
def main():
    """Print a Squidwarc crawl config seeded with feed links not seen before.

    Each configured feed is parsed with feedparser. Every entry link that is
    not yet recorded in the dedup database is recorded there, together with
    the current local time, and appended to the config's ``seeds`` list.
    The config is written to stdout as JSON only when at least one new link
    was found, so a run with nothing new prints nothing.
    """
    data = {
        "use": "puppeteer",
        "headless": True,
        "mode": "page-only",
        "depth": 1,
        "seeds": [],
        "warc": {
            "naming": "url"
        },
        "connect": {
            "launch": True,
            "host": "localhost",
            "port": 9222
        },
        "crawlControl": {
            "globalWait": 20000,
            "inflightIdle": 1000,
            "numInflight": 2,
            "navWait": 8000
        }
    }
    feeds = [
        'https://www5.goteborg.se/prod/fastighetskontoret/etjanst/planobygg.nsf/rss',
        'https://goteborg.se/wps/wcm/connect/Portal%20Site/Aktuellt/?srv=cmpnt&source=library&cmpntname=goteborg3.0/RSS_Aktuellt',
        'http://www.vartgoteborg.se/prod/sk/vargotnu.nsf/lopsedel.xml'
    ]

    # One store serves every feed; there is no need to rebuild it and re-run
    # the "create table" statement on each iteration.
    d = Dedup()
    d.start()

    for feed in feeds:
        parsed = feedparser.parse(feed)
        for entry in parsed.entries:
            # An entry without a link cannot be used as a seed; skip it
            # instead of letting the attribute lookup abort the whole run.
            url = getattr(entry, "link", None)
            if not url or d.lookup(url):
                continue
            # Store the timestamp as text. Passing a datetime object relies
            # on sqlite3's default adapter, deprecated since Python 3.12;
            # isoformat(" ") is the same text that adapter produced.
            d.save(url, datetime.now().isoformat(" "))
            data["seeds"].append(url)

    if data["seeds"]:
        print(json.dumps(data))
# Run the feed harvest only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment