@RhetTbull
Last active March 12, 2023 01:36
Extract your "Saved Stories" articles from the Apple News app on macOS (thanks to @eecue, who wrote much of this)
"""Get your "Saved Stories" articles from Apple News
Thanks to Dave Bullock (https://github.com/eecue) who's idea this was and who wrote the extract_info_from_apple_news function
This script requires the following modules be pip installed:
* bs4
* requests
Save this script to a file called news.py and run it with Python 3.9 or later
For a more robust implementation of this, see: https://github.com/RhetTbull/apple-news-to-sqlite
"""

from __future__ import annotations

import io
import pathlib
import plistlib

import requests
from bs4 import BeautifulSoup


def get_reading_list_bplist() -> bytes | None:
    """Get saved articles info from Apple News

    Returns:
        bytes: The saved articles binary plist as a bytes object
        None: If the saved articles are not found
    """
    # The saved articles are stored in a binary file called reading-list
    # in the Apple News container (~/Library/Containers/com.apple.news).
    # The file contains at least two binary plist (bplist) files
    # embedded in it, the second of which contains the saved article IDs.
    # (The first is a binary NSKeyedArchiver archive.)
    # This function finds the second bplist file and returns it as a bytes object
    # or None if it is not found.
    news_container = (
        "~/Library/Containers/com.apple.news/"
        + "Data/Library/Application Support/com.apple.news/"
        + "com.apple.news.public-com.apple.news.private-production"
    )
    reading_list_file = (
        pathlib.Path(news_container, "reading-list").expanduser().absolute()
    )
    if not reading_list_file.exists():
        return None
    bplist_marker = b"\x62\x70\x6C\x69\x73\x74\x30\x30"  # bplist00
    with open(reading_list_file, "rb") as reading_list:
        length = reading_list.seek(0, io.SEEK_END)
        reading_list.seek(0)  # rewind to the start of the file
        found = 0
        # Scan byte by byte for the bplist00 magic number; peek() returns the
        # buffered bytes without advancing the file position.
        while window := reading_list.peek(1):
            if len(window) >= 8 and window[:8] == bplist_marker:
                found += 1
                if found == 2:
                    # Return everything from the second marker to the end of the file
                    return reading_list.read(length - reading_list.tell())
            reading_list.read(1)
    return None


def get_article_info(reading_list: bytes | None) -> dict[str, dict[str, str]] | None:
    """Decode the saved article information from the binary plist"""
    if reading_list is None:
        return None
    return plistlib.loads(reading_list, fmt=plistlib.FMT_BINARY)
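
# A quick interactive sketch of using the two functions above (assumes the decoded
# plist has the shape the __main__ block below relies on: a dict whose values are
# dicts that each include an "articleID" key):
#
#   >>> articles = get_article_info(get_reading_list_bplist())
#   >>> [a["articleID"] for a in articles.values()]  # the saved story IDs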


def extract_info_from_apple_news(news_id: str) -> dict[str, str | None]:
    """Extract the article URL, title, description, image, and author from Apple News"""
    # Construct the Apple News URL from the ID
    apple_news_url = f"https://apple.news/{news_id}"

    # Send a GET request to the Apple News URL and get the response HTML
    response = requests.get(apple_news_url, timeout=30)
    html = response.content.decode("utf-8")

    # Use BeautifulSoup to extract the URL from the redirectToUrlAfterTimeout function
    soup = BeautifulSoup(html, "html.parser")
    if script_tag := soup.find(
        "script", string=lambda t: t and "redirectToUrlAfterTimeout" in t
    ):
        url_start_index = script_tag.text.index('"https://') + 1
        url_end_index = script_tag.text.index('"', url_start_index)
        url = script_tag.text[url_start_index:url_end_index]
    else:
        url = None

    # Extract the og:title, og:description, og:image, and author meta tags
    if title_tag := soup.find("meta", property="og:title"):
        title = title_tag["content"]
    else:
        title = None
    if description_tag := soup.find("meta", property="og:description"):
        description = description_tag["content"]
    else:
        description = None
    if image_tag := soup.find("meta", property="og:image"):
        image = image_tag["content"]
    else:
        image = None
    if author_tag := soup.find("meta", {"name": "Author"}):
        author = author_tag["content"]
    else:
        author = None

    # Return the extracted information as a dictionary
    return {
        "url": url,
        "title": title,
        "description": description,
        "image": image,
        "author": author,
    }


if __name__ == "__main__":
    reading_list = get_reading_list_bplist()
    articles = get_article_info(reading_list)
    if articles is None:
        print("No saved articles found")
        exit(0)
    for article in articles.values():
        print(extract_info_from_apple_news(article["articleID"]))
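
# A sketch (not part of the original gist) of collecting the results as JSON
# instead of printing each dict, assuming the same articles dict as above:
#
#   import json
#   results = [extract_info_from_apple_news(a["articleID"]) for a in articles.values()]
#   print(json.dumps(results, indent=2))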