Skip to content

Instantly share code, notes, and snippets.

@it-is-wednesday
Last active April 21, 2024 15:38
Show Gist options
  • Save it-is-wednesday/973c7c8d7d700c6238b7697be111cc06 to your computer and use it in GitHub Desktop.
Save it-is-wednesday/973c7c8d7d700c6238b7697be111cc06 to your computer and use it in GitHub Desktop.
#!/usr/bin/env pip-run
# /// script
# dependencies = [
# 'feedparser',
# 'readability-lxml',
# 'lxml_html_clean',
# 'requests',
# ]
# ///
import fileinput
import json
import logging
from calendar import timegm
from datetime import datetime
from functools import partial
from os import getenv
from pathlib import Path
from string import Template
from sys import stderr
from time import mktime, struct_time
from typing import Iterable, TypedDict

import feedparser
import readability
import requests
# Status code that upload_file compares the Dropbox response against;
# any other code is logged as a failed upload.
HTTP_OK = 200
# File holding the ISO-format timestamp written at the end of the previous
# run; main() expands the leading "~" and creates the parent directory.
PATH_LAST_SCAN = "~/.local/state/news-to-pocketbook/last-scan-timestamp"
# I HATE THE ANTISCRAPE WEB
# Browser-like User-Agent string.
# NOTE(review): nothing in the visible code references this constant --
# confirm whether it is still needed or should be passed to the fetchers.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0"
# string.Template skeleton of the uploaded document; html_from_entry fills
# the $title, $style and $content placeholders.
DOC_TEMPLATE = """
<html>
<head>
<title>$title</title>
<style>$style</style>
</head>
<body>$content</body>
</html>
"""
# URL that upload_file POSTs the rendered document to.
DROPBOX_ENDPOINT = "https://content.dropboxapi.com/2/files/upload"
class Entry(TypedDict):
    """Fields of a parsed feed entry that this script reads.

    The functions in this file access ``content`` and ``style`` through
    ``.get()``, so those two keys are treated as possibly absent at runtime
    even though they are declared here.
    """

    title: str
    # A list of content parts, not a plain string: html_from_entry reads
    # ``content[0]["value"]``.
    content: list[dict[str, str]]
    style: str
    link: str
    summary: str
    # Publication time as a struct_time; consumed by is_after_date.
    published_parsed: struct_time
def fetch_entries(feed_url: str) -> Iterable[Entry]:
    """Lazily yield every entry of the feed located at *feed_url*.

    This is a generator: the feed is only parsed once the first item is
    requested.
    """
    parsed = feedparser.parse(feed_url)
    for item in parsed["entries"]:
        yield item
def is_after_date(entry: Entry, date: datetime) -> bool:
    """Return True when *entry* was published strictly after *date*.

    feedparser normalizes ``published_parsed`` to UTC, so the struct_time is
    converted with ``calendar.timegm`` (which reads its argument as UTC).
    ``time.mktime`` would read it as local time and skew the result by the
    machine's UTC offset.

    Args:
        entry: Feed entry carrying a ``published_parsed`` struct_time.
        date: Naive local-time datetime to compare against, matching the
            ``datetime.now()`` / ``datetime.min`` values that main() uses.

    Returns:
        Whether the entry's publication moment is later than ``date``.
    """
    # timegm gives a true POSIX timestamp; fromtimestamp then yields a naive
    # local datetime, the same flavour as the value stored by main().
    published = datetime.fromtimestamp(timegm(entry["published_parsed"]))
    return published > date
def html_from_entry(entry: Entry, template: Template) -> str:
    """Render *entry* as an HTML document using *template*.

    The body is taken from the first part of ``entry["content"]`` when that
    list is present and non-empty; otherwise the plain-string
    ``entry["summary"]`` is used as-is.

    Args:
        entry: Feed entry; must provide every placeholder of *template*
            other than ``style`` (defaulted to "") and ``content``.
        template: Template with ``$title``, ``$style`` and ``$content``
            placeholders.

    Returns:
        The substituted document text.

    Raises:
        KeyError: If the entry has neither content nor a summary, or lacks
            a key that the template references.
    """
    parts = entry.get("content")
    if parts:
        # Content is a list of parts, each a dict with a "value" key.
        body = parts[0]["value"]
    else:
        # The summary is already a string; indexing it like the content
        # list (as the previous code did) raised TypeError.
        body = entry["summary"]
    body = body.replace("\\n", "").replace("\\r", "")
    mapping = {
        **entry,
        "style": entry.get("style", ""),
        "content": body,
    }
    return template.substitute(mapping)
def upload_file(
    content: str,
    title: str,
    token: str,
    *,
    timeout: float = 30.0,
) -> None:
    """Upload an HTML document to the Dropbox PocketBook app folder.

    A non-200 response is logged rather than raised. Network errors and
    timeouts from ``requests`` propagate to the caller.

    Args:
        content: Document text; sent UTF-8 encoded.
        title: Entry title, used as the file name (with ".html" appended).
        token: Dropbox OAuth bearer token.
        timeout: Seconds to wait for Dropbox before giving up, so a stalled
            connection cannot hang the whole run.
    """
    # A "/" or "\" in the title would be read as part of the path rather
    # than the file name, so both are replaced.
    filename = title.replace("/", "-").replace("\\", "-")
    args = {
        "autorename": False,
        "mode": "add",
        "mute": False,
        "path": f"/Apps/Dropbox PocketBook/{filename}.html",
        "strict_conflict": False,
    }
    r = requests.post(
        DROPBOX_ENDPOINT,
        # Encode explicitly: letting the HTTP stack encode a str body can
        # fail on characters outside Latin-1.
        data=content.encode("utf-8"),
        headers={
            "Authorization": f"Bearer {token}",
            "Dropbox-API-Arg": json.dumps(args),
            "Content-Type": "application/octet-stream",
        },
        timeout=timeout,
    )
    if r.status_code != HTTP_OK:
        # Some Dropbox error bodies are plain text, so r.json() could raise
        # here; r.text is safe for both plain-text and JSON bodies.
        logging.error("Could not upload '%s' to Dropbox: %s", title, r.text)
def main() -> None:
    """Upload every feed entry published since the previous run.

    Feed URLs are read one per line via ``fileinput`` (files named on the
    command line, or standard input). The Dropbox token comes from the
    ``POCKET_NEWS__DROPBOX_TOKEN`` environment variable; when it is unset a
    message is printed to stderr and nothing else happens.
    """
    dropbox_token = getenv("POCKET_NEWS__DROPBOX_TOKEN")
    if not dropbox_token:
        print("Please pass POCKET_NEWS__DROPBOX_TOKEN", file=stderr)
        return

    statefile = Path(PATH_LAST_SCAN).expanduser().absolute()
    statefile.parent.mkdir(parents=True, exist_ok=True)
    if statefile.exists():
        date_last_scan = datetime.fromisoformat(statefile.read_text().strip())
    else:
        # First run: every entry counts as new.
        date_last_scan = datetime.min

    # Take the timestamp before fetching. Stamping after the uploads would
    # silently skip, on the next run, anything published while this run was
    # still in progress.
    scan_started = datetime.now()

    template = Template(DOC_TEMPLATE)
    is_new = partial(is_after_date, date=date_last_scan)

    with fileinput.input() as feed_lines:
        for line in feed_lines:
            feed_url = line.strip()
            if not feed_url:
                # Blank lines in the feed list are not URLs.
                continue
            for entry in filter(is_new, fetch_entries(feed_url)):
                html = html_from_entry(entry, template)
                print(f"Uploading {entry['title']}")
                upload_file(html, entry["title"], dropbox_token)

    statefile.write_text(scan_started.isoformat())
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment