Last active
March 12, 2023 01:36
-
-
Save RhetTbull/06617e33fe8645f75260311ab582fb6d to your computer and use it in GitHub Desktop.
Extract your "Saved Stories" articles from the Apple News app on macOS (thanks to @eecue who wrote much of this)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Get your "Saved Stories" articles from Apple News | |
Thanks to Dave Bullock (https://github.com/eecue) whose idea this was and who wrote the extract_info_from_apple_news function
This script requires the following modules be pip installed: | |
* bs4 | |
* requests | |
Save this script to a file called news.py and run it with Python 3.9 or later | |
For a more robust implementation of this, see: https://github.com/RhetTbull/apple-news-to-sqlite | |
""" | |
from __future__ import annotations | |
import io | |
import pathlib | |
import plistlib | |
import requests | |
from bs4 import BeautifulSoup | |
def get_reading_list_bplist() -> bytes | None: | |
"""Get saved articles info from Apple News | |
Returns: | |
bytes: The saved articles binary plist as a bytes object | |
None: If the saved articles are not found | |
""" | |
# The saved articles are stored in a binary file called reading-list | |
# in the Apple News container (~/Library/Containers/com.apple.news) | |
# The file contains at least two binary plist (bplist) files | |
# embedded in it, the second of which contains the saved article IDs | |
# (The first is a binary NSKeyedArchiver archive) | |
# This function finds the second bplist file and returns it as a bytes object | |
# or None if the file is not found | |
news_container = ( | |
"~/Library/Containers/com.apple.news/" | |
+ "Data/Library/Application Support/com.apple.news/" | |
+ "com.apple.news.public-com.apple.news.private-production" | |
) | |
reading_list_file = ( | |
pathlib.Path(news_container, "reading-list").expanduser().absolute() | |
) | |
bplist_marker = b"\x62\x70\x6C\x69\x73\x74\x30\x30" # bplist00 | |
reading_list = open(reading_list_file, "rb") | |
length = reading_list.seek(0, io.SEEK_END) | |
reading_list.seek(io.SEEK_SET) | |
found = 0 | |
while window := reading_list.peek(1): | |
if len(window) >= 8 and window[:8] == bplist_marker: | |
found += 1 | |
if found == 2: | |
return reading_list.read(length - reading_list.tell()) | |
reading_list.read(1) | |
return None | |
def get_article_info(reading_list: bytes) -> dict[str, dict[str, str]] | None: | |
"""Decode the saved article information from the binary plist""" | |
return plistlib.loads(reading_list, fmt=plistlib.FMT_BINARY) | |
def extract_info_from_apple_news(news_id: str) -> dict[str, str]:
    """Extract the article URL, title, description, image, and author from Apple News.

    Args:
        news_id: The Apple News article ID (the path component of an
            https://apple.news/ URL).

    Returns:
        dict: Keys "url", "title", "description", "image", "author"; any
            value is None when the corresponding tag is absent.
    """
    # Construct the Apple News URL from the ID
    apple_news_url = f"https://apple.news/{news_id}"
    # Send a GET request to the Apple News URL and get the response HTML.
    # timeout added so a stalled connection cannot hang the script forever.
    response = requests.get(apple_news_url, timeout=30)
    html = response.content.decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    # The page redirects to the real article via a redirectToUrlAfterTimeout
    # script; pull the first quoted https:// URL out of that script's text.
    # The `t is not None` guard is required: BeautifulSoup passes None for
    # tags without a string, and `"..." in None` raises TypeError.
    url = None
    if script_tag := soup.find(
        "script", string=lambda t: t is not None and "redirectToUrlAfterTimeout" in t
    ):
        url_start_index = script_tag.text.index('"https://') + 1
        url_end_index = script_tag.text.index('"', url_start_index)
        url = script_tag.text[url_start_index:url_end_index]

    def _meta_content(attrs: dict[str, str]) -> str | None:
        """Return the content attribute of the first matching <meta>, or None."""
        tag = soup.find("meta", attrs=attrs)
        return tag["content"] if tag else None

    # Extract the og:title, og:description, og:image, and author meta tags
    # and return the collected information as a dictionary.
    return {
        "url": url,
        "title": _meta_content({"property": "og:title"}),
        "description": _meta_content({"property": "og:description"}),
        "image": _meta_content({"property": "og:image"}),
        "author": _meta_content({"name": "Author"}),
    }
if __name__ == "__main__":
    reading_list = get_reading_list_bplist()
    # Bail out before decoding: the original passed None straight into
    # plistlib.loads, which raised TypeError instead of this message.
    if reading_list is None:
        print("No saved articles found")
        exit(0)
    articles = get_article_info(reading_list)
    if articles is None:
        print("No saved articles found")
        exit(0)
    # Each record's articleID is the path component of its apple.news URL.
    for article in articles.values():
        print(extract_info_from_apple_news(article["articleID"]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment