Scrapes urls from your mastodon account and adds them to shiori
#!/usr/bin/env python3
# Copyright (C) 2021 Andrew Ekstedt (magical)
# Borrows a small amount of code from mastodon-backup,
# Copyright (C) 2017-2018 Alex Schroeder <>
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <>.
import sys
import os.path
import logging
import argparse
import subprocess
# dependencies:
from mastodon import Mastodon, MastodonError
from bs4 import BeautifulSoup, SoupStrainer
# Path to the shiori bookmark-manager binary; expanded with
# os.path.expanduser at each call site before use.
SHIORI = "~/go/bin/shiori"
# Module-level logger shared by every function in this script.
log = logging.getLogger('url_scraper')
def login(user, scopes = ['read']):
    """Login to your Mastodon account.

    user is "username@domain".  On first run this registers the app and
    walks through the OAuth authorization flow, caching the secrets in
    files in the current directory; later runs reuse the cached secrets.

    Raises ValueError if user does not contain '@'.
    Returns a logged-in Mastodon client.
    """
    if '@' not in user:
        raise ValueError(user)
    (username, domain) = user.split("@", 1)
    url = 'https://' + domain
    # secret files live in the cwd (main() chdirs to the data dir first)
    client_secret = domain + '.client.secret'
    user_secret = domain + '.user.' + username + '.secret'
    mastodon = None
    if not os.path.isfile(client_secret):
        print("Registering app")
        # NOTE(review): the app-registration call was truncated in this
        # copy of the file; reconstructed as the standard Mastodon.py call.
        Mastodon.create_app(
            'url_scraper',
            scopes = scopes,
            api_base_url = url,
            to_file = client_secret)
    if not os.path.isfile(user_secret):
        print("Log in")
        mastodon = Mastodon(
            client_id = client_secret,
            api_base_url = url)
        # use a separate name so the api base url is not clobbered --
        # it is needed again for the final Mastodon() construction below
        auth_url = mastodon.auth_request_url(
            client_id = client_secret,
            scopes = scopes)
        print("Visit the following URL and authorize the app:")
        print(auth_url)
        print("Then paste the access token here:")
        token = sys.stdin.readline().rstrip()
        # on the very first login, pace has no effect
        mastodon.log_in(
            username = username,
            code = token,
            to_file = user_secret,
            scopes = scopes)
    # in case the user kept running into a General API problem
    mastodon = Mastodon(
        client_id = client_secret,
        access_token = user_secret,
        api_base_url = url,
        ratelimit_method = 'pace')
    return mastodon
def scrape(user, dry_run=False):
    """Fetch new statuses for user and save every linked url to shiori.

    Resumes from the last status id recorded in a per-account state file;
    after a successful (non-dry) run, the state file is updated so the
    next invocation picks up where this one left off.
    """
    try:
        mastodon = login(user)
    except MastodonError as e:
        log.warning("error logging in as %s: %s", user, e)
        return
    # pick up where we left off
    last_id = None
    last_id_file = get_last_id_filename(user)
    try:
        with open(last_id_file, 'r') as f:
            last_id = int(f.read())
    except (OSError, IOError, ValueError):
        # no state file yet (or it is unreadable/corrupt): scan from the start
        pass
    # NOTE(review): this statement was truncated in this copy of the file;
    # reconstructed as Mastodon.py's me() (account_verify_credentials alias)
    me = mastodon.me()
    try:
        p = mastodon.account_statuses(me, since_id=last_id, limit=100)
    except MastodonError as e:
        log.error("error getting statuses, user = %s", user, exc_info=e)
        return
    done = set()  # urls already saved this run, to avoid duplicate work
    while p:
        # process oldest-first so last_id only ever moves forward
        for toot in reversed(p):
            # prefer the server-generated preview card url when present
            if toot.card:
                log.info("url: %s", toot.card.url)
                saved = save_url(toot.card.url, dry_run=dry_run)
                if saved:
                    done.add(toot.card.url)
            for url in extract_urls_from_string(toot.content):
                if url in done:
                    continue
                log.info("url: %s", url)
                saved = save_url(url, dry_run=dry_run)
                if saved:
                    done.add(url)
            last_id = toot.id
        try:
            # NOTE(review): fetch_previous follows the pagination links of
            # the since_id query -- confirm direction against Mastodon.py docs
            p = mastodon.fetch_previous(p)
        except MastodonError as e:
            log.error("error getting statuses, user = %s", user, exc_info=e)
            break
    # remember where we stopped so the next run can resume
    if last_id and not dry_run:
        with open(last_id_file, 'w') as f:
            print(last_id, file=f)
def extract_urls_from_string(content):
    """Scan an html string for <a href> links and yield the urls.

    Skips anchors whose class list contains 'mention' or 'hashtag'
    (mastodon marks @-mentions and #tags this way); only http(s) urls
    are yielded.

    >>> list(extract_urls_from_string('<p><a href="https://example.com/x">x</a></p>'))
    ['https://example.com/x']
    """
    if not content:
        return
    if not isinstance(content, str):
        raise TypeError(content)
    # only parse anchor tags that actually carry an href
    # (renamed from `filter` to avoid shadowing the builtin)
    link_filter = SoupStrainer('a', href=True)
    soup = BeautifulSoup(content, 'html.parser', parse_only=link_filter)
    for tag in soup.find_all('a'):
        href = tag.get('href')
        if href and href.startswith(("http://", "https://")):
            cls = tag.get('class')
            if cls and 'mention' in cls:
                continue
            if cls and 'hashtag' in cls:
                # TODO: extract tags
                continue
            log.info('found link: %s', tag)
            yield href
def get_last_id_filename(user):
    """Return the per-account state filename holding the last-seen status id.

    user is "username@domain"; raises ValueError if '@' is absent.
    """
    if '@' not in user:
        raise ValueError(user)
    username, domain = user.split('@', 1)
    return f'{domain}.user.{username}.lastid'
def save_url(url, dry_run=False):
    """add a url to shiori.

    returns True if successful (or the url has already been added)
    returns False on error and logs a message.
    """
    if dry_run:
        return True
    try:
        output = subprocess.check_output(
            [os.path.expanduser(SHIORI), 'add', '-t', 'from-mastodon', url],
            stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        log.error("shiori failed, url = %s", url, exc_info=e)
        return False
    # if shiori already has a bookmark,
    # it outputs "UNIQUE constraint failed: bookmark.url"
    # on success, it outputs something like:
    #   97. is a computer meant for sharing. (21-31 minutes)
    #       > <the url>
    #       + Wed 02 Dec 2020 07:52:10 PM UTC
    #       # from-mastodon
    if not output:
        log.error("shiori failed with no output, url = %s", url)
        return False
    if b"UNIQUE constraint failed: bookmark.url" in output:
        log.info("url already added, url = %s", url)
        return True
    # successful output is exactly 6 lines (5 newlines); anything else
    # probably means shiori's output format changed or something went wrong
    if output.count(b'\n') != 5:
        log.error("shiori output doesn't look right, url = %s, output = %s", url, output)
        return False
    return True
def main():
    """Entry point: parse args, set up the data dir and logging, scrape."""
    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--log', help='path to log file (default: stderr)')
    parser.add_argument('-n', '--dry-run', action='store_true', help="don't actually save the urls")
    parser.add_argument('user', help="user@domain for the mastodon account to scan")
    args = parser.parse_args()
    # sanity check: bail out early if the shiori binary isn't installed
    if not os.path.exists(os.path.expanduser(SHIORI)):
        log.error("command not found: %s", SHIORI)
        sys.exit(1)
    # get data dir
    data_var = 'XDG_DATA_HOME'
    if data_var in os.environ and os.environ[data_var] and os.path.isabs(os.environ[data_var]):
        data_dir = os.environ[data_var]
    else:
        # NOTE(review): the XDG Base Directory spec default for
        # $XDG_DATA_HOME is ~/.local/share, not ~/.config/share --
        # kept as-is so existing state files are not orphaned; confirm.
        data_dir = os.path.expanduser("~/.config/share")
    # create data dir if necessary
    os.makedirs(data_dir, exist_ok=True)
    data_dir = os.path.join(data_dir, "url_scraper")
    os.makedirs(data_dir, mode=0o700, exist_ok=True)
    # set up logging
    # TODO: log errors to a different file?
    if args.log:
        if os.path.isabs(args.log):
            log_file = args.log
        else:
            log_file = os.path.join(data_dir, args.log)
        logging.basicConfig(filename=log_file, level=logging.INFO)
    else:
        logging.basicConfig(level=logging.INFO)
    # cd to data dir so secret and state files are created there
    os.chdir(data_dir)
    # scrape the thing
    try:
        scrape(args.user, dry_run=args.dry_run)
    except Exception as e:
        log.error("unhandled exception", exc_info=e)
if __name__ == '__main__':
    main()