Last active
November 26, 2022 08:50
-
-
Save FlyMyPG/2e9d4532453182ada0da78e74980193b to your computer and use it in GitHub Desktop.
Read all toots ANYONE has written (excluding Boosts).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
MastoUserScrape.py - | |
Download toots (not Boosts) from 'https://mastodon.instance/@UserName' | |
and save in chronological order (oldest first). | |
Author: BobC - https://mastodon.hasameli.com/@BobC | |
License: "CC BY-NC-SA 4.0" https://creativecommons.org/licenses/by-nc-sa/4.0/ | |
Usage: python MastoUserScrape.py User@Instance ... | |
Parameters: User@Instance.tld - The Mastodon user(s). | |
Multiple users are supported. | |
Output: Status and progress messages go to stdout. | |
Toots go to file: "User@Instance.tld.txt" | |
This ***SHOULD*** work on ALL major Python 3.x platforms (Win, Lin, Mac). | |
Check the "TODO" comments for what's missing. | |
""" | |
DEBUG = False # Set to True for so much fun! | |
import sys | |
# Script-only guard: when this file is imported rather than executed,
# complain on stderr and terminate the importing interpreter.
if __name__ != "__main__":
    sys.stderr.write("This program is NOT a library and must NOT be imported!\n")
    sys.exit()
# And now we will continue with our regularly scheduled program... | |
import os | |
import textwrap | |
import re | |
import code | |
import html | |
import requests as rq | |
from lxml import html as lxhtml | |
if DEBUG is True: | |
import pprint ####DEBUG | |
## Local functions: | |
if DEBUG is True:
    def pdict(d, indent=1, spaces=3):
        """Print a dict that may contain other dicts.

        Debug helper; only defined when DEBUG is True.

        Parameters:
            d      - The dict to print.
            indent - Current nesting level (1 for the outermost dict).
            spaces - Number of spaces per nesting level.

        Each key is printed as "key = value"; a nested dict is printed as
        "key = {dict}" followed by its own items one level deeper.
        """
        tabs = ' ' * spaces * indent
        for k, v in d.items():
            if isinstance(v, dict):
                print("%s%s = {dict}" % (tabs, k))
                # Pass 'spaces' down so nested levels keep the caller's width
                pdict(v, indent=indent + 1, spaces=spaces)
                continue
            try:
                print("%s%s = " % (tabs, k), v)
            except Exception:
                # e.g. a value whose str()/encoding fails; keep going
                print("%s%s = [unprintable]" % (tabs, k))
def witext(s, indent=1, width=80, spaces=3):
    """Return a wrapped and indented string.

    Parameters:
        s      - Text to format; "\\n" separates paragraphs / line breaks.
        indent - Number of indent levels to prefix to every output line.
        width  - Maximum length of each output line, indent included.
        spaces - Number of spaces per indent level.

    Each input line is wrapped independently, so existing line breaks are
    kept; an empty input line yields an empty output line.
    """
    tabs = ' ' * spaces * indent
    # textwrap already counts initial_indent/subsequent_indent toward the
    # line width, so pass 'width' as-is rather than subtracting the indent
    # a second time.
    wrapped = (
        textwrap.wrap(line, width, initial_indent=tabs, subsequent_indent=tabs)
        for line in s.split("\n")  # Paragraphs and line breaks
    )
    return "\n".join("\n".join(chunk) for chunk in wrapped)
def striphtml(data):
    """Remove or replace HTML tags and replace symbols.

    Parameters:
        data - A string of HTML markup.

    Returns the text with <br> and <p>/</p> tags turned into newlines, all
    other tags removed, and HTML character references (e.g. "&amp;",
    "&#128512;") replaced by the characters they stand for.
    """
    data = re.sub(r'<[/ ]*br[/ ]*>', '\n', data)  # Preserve line breaks
    data = re.sub(r'</*p>', '\n', data)           # Preserve paragraphs
    data = re.sub(r'<[^<]+?>', '', data)          # Strip all other tags
    # Unescape LAST: doing it before the tag stripping would turn escaped
    # text such as "&lt;b&gt;" into "<b>", which would then be deleted as
    # if it were real markup.
    return html.unescape(data)                    # Replace symbols
## Main code:
# For every "User@Instance" argument: walk the user's public pages at
# https://Instance/@User, collect each toot's timestamp and HTML body,
# and write them oldest-first to "User@Instance.txt".
if DEBUG is True:
    pp = pprint.PrettyPrinter()  ####DEBUG
    p = pp.pprint  ####DEBUG

appName = os.path.splitext(os.path.basename(__file__))[0]
appArgNames = "User@Instance"
numArgs = len(sys.argv) - 1

if numArgs == 0:
    print("Error: Argument[s] missing.")
    print("Usage: %s %s ..." % (appName, appArgNames))
    sys.exit(1)  # Non-zero exit status so callers can detect the failure

# Repeat for each user:
for user in sys.argv[1:]:
    # Parameter validation:
    # - Tolerate "@user@instance.tld"
    #   (startswith() is safe on an empty argument, unlike user[0])
    if user.startswith('@'):
        user = user[1:]  # Strip leading '@'
    # - Require "user@instance.tld" with exactly one '@', so the split
    #   below always yields two parts
    if not (user.count('@') == 1
            and '@' in user[1:-1]
            and '.' in user[user.find('@') + 1:-1]):
        print("\nSkipping bad parameter: ", user)
        continue

    outFile = "%s.txt" % (user)
    usr, inst = user.split('@')
    url = "https://%s/@%s" % (inst, usr)
    print("\nUser: %s" % (user))

    toots = []
    times = []
    bad_url = False

    # Repeat for each page of toots:
    while True:
        print("Processing: " + url)
        try:
            # The timeout keeps a stalled server from hanging the script;
            # raise_for_status() turns HTTP errors (404, 5xx, ...) into
            # exceptions instead of parsing an error page.
            resp = rq.get(url, timeout=30)
            resp.raise_for_status()
        except rq.exceptions.RequestException:
            print("Skipping bad url: ", url)
            bad_url = True
            break

        tree = lxhtml.fromstring(resp.content)
        base = '//div[@class="entry h-entry"]'  # All toot info is in this div
        #TODO: Extract more info than time and toot text, e.g. toot ID
        times += tree.xpath(base + '//data[@class="dt-published"]/@value')
        toots.extend(lxhtml.tostring(toot)
                     for toot in tree.xpath(base + '//div[@class="e-content"]'))

        next_links = tree.xpath('//a[@class="next"]/@href')  # Next page of toots
        if not next_links:
            break  # No more URLs!
        url = next_links[0]

    if bad_url is True:  # A failed fetch abandons this user entirely
        continue

    if len(times) != len(toots):  # Sanity check for toot data
        print("ERROR: # dates (%d) != # toots (%d)" % (len(times), len(toots)))
        continue

    # Send to file:
    print("Processing: %d toots" % (len(toots)))
    # Explicit UTF-8: toots routinely contain emoji and non-Latin text that
    # a platform-default encoding may be unable to represent.
    with open(outFile, 'w', encoding='utf-8') as f:
        # Pair each timestamp with its toot; process from oldest to newest
        for stamp, raw in reversed(list(zip(times, toots))):
            print("\n%s\n" % (stamp), witext(striphtml(raw.decode())), file=f)
    print("Toots written to: %s\n" % (outFile))

if DEBUG is True:
    # Go interactive only when stdout not redirected ####DEBUG
    if os.fstat(0) == os.fstat(1):  ####DEBUG
        print("\nEntering Interactive Mode: ^Z or ^D when done.")  ####DEBUG
        code.interact(local=dict(globals(), **locals()))  ####DEBUG

print("\nExiting!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment