Skip to content

Instantly share code, notes, and snippets.

@FlyMyPG
Last active November 26, 2022 08:50
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FlyMyPG/2e9d4532453182ada0da78e74980193b to your computer and use it in GitHub Desktop.
Save FlyMyPG/2e9d4532453182ada0da78e74980193b to your computer and use it in GitHub Desktop.
Read all toots ANYONE has written (excluding Boosts).
# -*- coding: utf-8 -*-
"""
MastoUserScrape.py -
Download toots (not Boosts) from 'https://mastodon.instance/@UserName'
and save in chronological order (oldest first).
Author: BobC - https://mastodon.hasameli.com/@BobC
License: "CC BY-NC-SA 4.0" https://creativecommons.org/licenses/by-nc-sa/4.0/
Usage: python MastoUserScrape.py User@Instance ...
Parameters: User@Instance.tld - The Mastodon user(s).
Multiple users are supported.
Output: Status and progress messages go to stdout.
Toots go to file: "User@Instance.tld.txt"
This ***SHOULD*** work on ALL major Python 3.x platforms (Win, Lin, Mac).
Check the "TODO" comments for what's missing.
"""
DEBUG = False # Set to True for so much fun!
import sys
if __name__ != "__main__":
print("This program is NOT a library and must NOT be imported!", file=sys.stderr)
sys.exit()
# And now we will continue with our regularly scheduled program...
import os
import textwrap
import re
import code
import html
import requests as rq
from lxml import html as lxhtml
if DEBUG is True:
import pprint ####DEBUG
## Local functions:
if DEBUG is True:
def pdict(d, indent=1, spaces=3):
    """ Print a dict that may contain other dicts.

    Every key is printed on its own line as "key = value", prefixed by
    (spaces * indent) blanks.  A value that is itself a dict is shown as
    "key = {dict}" and its items are printed one indent level deeper.

    Parameters: d      - The dict to print.
                indent - Current nesting level (number of indent steps).
                spaces - Number of blanks per indent step.
    Returns:    None.  All output goes to stdout.
    """
    tabs = ' ' * spaces * indent
    for k, v in d.items():
        if isinstance(v, dict):
            print("%s%s = {dict}" % (tabs, k))
            # Hand 'spaces' down so nested levels keep the caller's step size
            # instead of silently falling back to the default.
            pdict(v, indent=indent + 1, spaces=spaces)
            continue
        try:
            print("%s%s = " % (tabs, k), v)
        except Exception:
            # Converting or writing the value failed; note it and carry on.
            # (Exception, not a bare except, so Ctrl-C still gets through.)
            print("%s%s = [unprintable]" % (tabs, k))
def witext(s, indent=1, width=80, spaces=3):
    """ Return a wrapped and indented string.

    The text is split on newlines and each resulting line is wrapped
    separately, so existing line breaks and blank lines are kept.  Every
    output line starts with (spaces * indent) blanks.

    Parameters: s      - The text to wrap.
                indent - Number of indent steps.
                width  - Nominal width; the wrap width handed to textwrap
                         is this value minus the length of the indent.
                spaces - Number of blanks per indent step.
    Returns:    The wrapped text as a single newline-joined string.
    """
    prefix = ' ' * spaces * indent
    wrapper = textwrap.TextWrapper(width=width - len(prefix),
                                   initial_indent=prefix,
                                   subsequent_indent=prefix)
    wrapped = []
    for paragraph in s.split("\n"):  # Paragraphs and line breaks
        # An empty line wraps to nothing and so stays an empty line
        wrapped.append("\n".join(wrapper.wrap(paragraph)))
    return "\n".join(wrapped)
def striphtml(data):
    """ Remove or replace HTML tags and replace symbols.

    <br> tags and bare <p> / </p> tags become newlines, every other tag is
    dropped, and finally HTML entities (&amp;, &lt;, &#8230;, ...) are
    replaced by the characters they stand for.

    Parameters: data - A string of HTML markup.
    Returns:    The plain-text version of the markup.
    """
    data = re.sub(r'<[/ ]*br[/ ]*>', '\n', data) # Preserve line breaks
    data = re.sub(r'</*p>', '\n', data) # Preserve paragraphs
    data = re.sub(r'<[^<]+?>', '', data) # Strip all other tags
    # Replace symbols LAST: if entities were decoded before the tags were
    # stripped, escaped text such as "&lt;b&gt;" or "1 &lt; 2 &gt; 0" would
    # turn into something that looks like a tag and be deleted.
    return html.unescape(data)
## Main code:
# For every "User@Instance.tld" argument: walk the user's public profile
# pages (following each page's "next" link), collect the timestamp and HTML
# body of every toot found, and write them oldest-first to
# "User@Instance.tld.txt".  Status and progress messages go to stdout.
if DEBUG is True:
    pp = pprint.PrettyPrinter() ####DEBUG
    p = pp.pprint ####DEBUG
appName = os.path.splitext(os.path.basename(__file__))[0]
appArgNames = "User@Instance"
numArgs = len(sys.argv) - 1
numAcct = 0
httpTimeout = 30 # Seconds; without a timeout a stalled server hangs us forever
if numArgs == 0:
    print("Error: Argument[s] missing.")
    print("Usage: %s %s ..."%(appName, appArgNames))
    sys.exit()
# Repeat for each user:
for user in sys.argv[1:]:
    # Parameter validation:
    # - Tolerate "@user@instance.tld"
    #   (startswith() also copes with an empty argument, user[0] does not)
    if user.startswith('@'):
        user = user[1:] # Strip leading '@'
    # - Require "user@instance.tld": exactly one '@' (otherwise the split
    #   below cannot be unpacked into two parts), not at either end, and a
    #   '.' somewhere in the instance part.
    if not ((user.count('@') == 1)
            and ('@' in user[1:-1])
            and ('.' in user[user.find('@')+1:-1])):
        print("\nSkipping bad parameter: ", user)
        continue
    outFile = "%s.txt"%(user)
    usr, inst = user.split('@')
    url = "https://%s/@%s"%(inst, usr)
    print("\nUser: %s"%(user))
    toots = []
    times = []
    bad_url = False
    # Repeat for each page of toots:
    while True:
        print("Processing: " + url)
        try:
            page = rq.get(url, timeout=httpTimeout).content
        except rq.exceptions.RequestException:
            # Only network/URL problems mean "bad url"; a bare except would
            # also swallow Ctrl-C and genuine programming errors.
            print("Skipping bad url: ", url)
            bad_url = True
            break
        tree = lxhtml.fromstring(page)
        base = '//div[@class="entry h-entry"]' # All toot info is in this div
        #TODO: Extract more info than time and toot text, e.g. toot ID
        times += tree.xpath(base + '//data[@class="dt-published"]/@value')
        # This would be too wide as a list comprehension
        for toot in tree.xpath(base + '//div[@class="e-content"]'):
            toots.append(lxhtml.tostring(toot))
        nextLinks = tree.xpath('//a[@class="next"]/@href') # Next page of toots
        if not nextLinks:
            break # No more URLs!
        url = nextLinks[0]
    if bad_url is True: # I wish Python would let me escape 2 loops without this
        continue
    if len(times) != len(toots): # Sanity check for toot data
        print("ERROR: # dates (%d) != # toots (%d)"%(len(times), len(toots)))
        continue
    # Send to file:
    print("Processing: %d toots"%(len(toots)))
    # Explicit UTF-8: toots routinely contain emoji and non-Latin text that
    # the platform's default encoding (e.g. cp1252 on Windows) cannot write.
    with open(outFile, 'w', encoding='utf-8') as f:
        t = list(zip(times, toots)) # Merge into tuples
        for toot in reversed(t): # Process from oldest to newest
            print("\n%s\n"%(toot[0]), witext(striphtml(toot[1].decode())), file=f)
    print("Toots written to: %s\n"%(outFile))
if DEBUG is True:
    # Go interactive only when stdout not redirected ####DEBUG
    if os.fstat(0) == os.fstat(1): ####DEBUG
        print("\nEntering Interactive Mode: ^Z or ^D when done.") ####DEBUG
        code.interact(local=dict(globals(), **locals())) ####DEBUG
print("\nExiting!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment