# FlyMyPG/MastoUserScrape.py

## MastoUserScrape.py
# -*- coding: utf-8 -*-
"""
    MastoUserScrape.py -
        Download toots (not Boosts) from 'https://mastodon.instance/@UserName'
        and save in chronological order (oldest first).

    Author:  BobC - https://mastodon.hasameli.com/@BobC
    License: "CC BY-NC-SA 4.0" https://creativecommons.org/licenses/by-nc-sa/4.0/

    Usage: python MastoUserScrape.py User@Instance ...

    Parameters: User@Instance.tld  - The Mastodon user(s).

    Multiple users are supported.

    Output: Status and progress messages go to stdout.
            Toots go to file: "User@Instance.tld.txt"

    This ***SHOULD*** work on ALL major Python 3.x platforms (Win, Lin, Mac).

    Check the "TODO" comments for what's missing.
"""
DEBUG = False   # Set to True for so much fun!

import sys

# This file is a script only: refuse to continue when it is imported.
if __name__ != "__main__":
    print("This program is NOT a library and must NOT be imported!", file=sys.stderr)
    # Exit with a non-zero status so the refusal is reported as a failure
    # (a bare sys.exit() would signal success).
    sys.exit(1)

# And now we will continue with our regularly scheduled program...
import os
import textwrap
import re
import code
import html
import requests as rq
from lxml import html as lxhtml

if DEBUG is True:
    import pprint   ####DEBUG


## Local functions:

if DEBUG is True:
    def pdict(d, indent=1, spaces=3):
        """ Print a dict that may contain other dicts.

        Each key is printed on its own line, indented by spaces*indent blanks.
        A value that is itself a dict is shown as "{dict}" and its contents
        are printed one indent level deeper.

        d      - The dict to print.
        indent - Current nesting level.
        spaces - Number of blanks per nesting level.
        """
        tabs = ' ' * (spaces * indent)
        for k, v in d.items():
            if isinstance(v, dict):
                print("%s%s = {dict}"%(tabs, k))
                # Pass 'spaces' along so nested levels keep the caller's width
                pdict(v, indent=indent+1, spaces=spaces)
                continue
            try:
                print("%s%s = "%(tabs, k), v)
            except Exception:   # e.g. value can't be converted or encoded
                print("%s%s = [unprintable]"%(tabs, k))


def witext(s, indent=1, width=80, spaces=3):
    """ Return a wrapped and indented string.

    Each "\n"-separated line of 's' is wrapped on its own, so existing line
    breaks survive and an empty line stays an empty line.

    s      - The text to wrap.
    indent - Number of indent levels to prefix to every output line.
    width  - Maximum length of an output line, indent included.
    spaces - Number of blanks per indent level.
    """
    tabs = ' ' * (spaces * indent)
    # textwrap counts initial_indent/subsequent_indent as part of 'width', so
    # 'width' is passed unchanged; subtracting the indent here as well would
    # make every line wrap len(tabs) columns too early.
    wrapped = (textwrap.wrap(line, width,
                             initial_indent=tabs, subsequent_indent=tabs)
               for line in s.split("\n"))   # Paragraphs and line breaks
    return "\n".join("\n".join(chunk) for chunk in wrapped)


def striphtml(data):
    """ Remove or replace HTML tags and replace symbols.

    <br> variants and <p>/</p> become newlines, every other tag is dropped,
    and character references (&amp;, &lt;, &#NNN; ...) are decoded.

    data - HTML markup as a str.
    Returns the plain-text str.
    """
    data = re.sub(r'<[/ ]*br[/ ]*>', '\n', data)    # Preserve line breaks
    data = re.sub(r'</*p>', '\n', data)             # Preserve paragraphs
    data = re.sub(r'<[^<]+?>', '', data)            # Strip all other tags
    # Decode symbols LAST: doing it before the tag removal turns escaped text
    # such as "&lt; ... &gt;" into something that looks like a tag, and the
    # pattern above would then delete it from the toot.
    return html.unescape(data)


## Main code:

if DEBUG is True:
    pp = pprint.PrettyPrinter() ####DEBUG
    p = pp.pprint               ####DEBUG

# Script name without directory or extension, used in the usage message
appName = os.path.splitext(os.path.basename(__file__))[0]
appArgNames = "User@Instance"

numArgs = len(sys.argv) - 1     # sys.argv[0] is the script itself
numAcct = 0                     # NOTE: assigned here but not used in this part of the file

if numArgs == 0:
    print("Error: Argument[s] missing.")
    print("Usage: %s %s ..."%(appName, appArgNames))
    # Non-zero status: a usage error must not look like a successful run
    sys.exit(1)

# Repeat for each user:
for user in sys.argv[1:]:

    # Parameter validation:
    # - Tolerate "@user@instance.tld".  startswith() is also safe for an empty
    #   argument, where indexing user[0] would raise IndexError.
    if user.startswith('@'):
        user = user[1:]     # Strip leading '@'
    # - Require "user@instance.tld": a non-empty name, exactly one '@', and a
    #   '.' inside the host part (not as its final character).  A second '@'
    #   is rejected here instead of crashing a two-way split further down.
    usr, at, inst = user.partition('@')
    if not (usr and at and '@' not in inst and '.' in inst[:-1]):
        print("\nSkipping bad parameter: ", user)
        continue

    outFile = "%s.txt"%(user)
    url = "https://%s/@%s"%(inst, usr)

    print("\nUser: %s"%(user))

    toots = []
    times = []
    bad_url = False
    # Repeat for each page of toots:
    while True:
        print("Processing: " + url)
        try:
            # Without a timeout a stalled server would hang the script
            response = rq.get(url, timeout=30)
            # Treat HTTP errors (404, 500, ...) as a bad URL instead of
            # parsing the error page as if it held toots
            response.raise_for_status()
        except rq.RequestException:     # Not bare: let Ctrl-C through
            print("Skipping bad url: ", url)
            bad_url = True
            break

        tree = lxhtml.fromstring(response.content)
        base = '//div[@class="entry h-entry"]'  # All toot info is in this div

        #TODO: Extract more info than time and toot text, e.g. toot ID

        times += tree.xpath(base + '//data[@class="dt-published"]/@value')
        toots.extend(lxhtml.tostring(toot)
                     for toot in tree.xpath(base + '//div[@class="e-content"]'))

        next_links = tree.xpath('//a[@class="next"]/@href')  # Next page of toots
        if not next_links:
            break   # No more URLs!
        url = next_links[0]

    if bad_url is True: # A failed page abandons this user entirely
        continue

    if len(times) != len(toots):    # Sanity check for toot data
        print("ERROR: # dates (%d) != # toots (%d)"%(len(times), len(toots)))
        continue

    # Send to file:
    print("Processing: %d toots"%(len(toots)))
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot represent emoji and other characters decoded from the toots.
    with open(outFile, 'w', encoding='utf-8') as f:
        # zip() merges into (time, toot) tuples; reversed() needs a sequence
        for when, toot in reversed(list(zip(times, toots))):    # Oldest first
            # sep='' stops print() from inserting a stray blank in front of
            # the first (already indented) line of the toot text
            print("\n%s\n"%(when), witext(striphtml(toot.decode())),
                  sep='', file=f)

    print("Toots written to: %s\n"%(outFile))


# Debug builds only: drop into an interactive console, and only when file
# descriptors 0 and 1 report the same stat result (taken as the sign that
# stdout has not been redirected).                                  ####DEBUG
if DEBUG is True and os.fstat(0) == os.fstat(1):                    ####DEBUG
    print("\nEntering Interactive Mode: ^Z or ^D when done.")       ####DEBUG
    code.interact(local={**globals(), **locals()})                  ####DEBUG

print("\nExiting!")
	# -*- coding: utf-8 -*-
	"""
	MastoUserScrape.py -
	Download toots (not Boosts) from 'https://mastodon.instance/@UserName'
	and save in chronological order (oldest first).

	Author: BobC - https://mastodon.hasameli.com/@BobC
	License: "CC BY-NC-SA 4.0" https://creativecommons.org/licenses/by-nc-sa/4.0/

	Usage: python MastoUserScrape.py User@Instance ...

	Parameters: User@Instance.tld - The Mastodon user(s).

	Multiple users are supported.

	Output: Status and progress messages go to stdout.
	Toots go to file: "User@Instance.tld.txt"

	This ***SHOULD*** work on ALL major Python 3.x platforms (Win, Lin, Mac).

	Check the "TODO" comments for what's missing.
	"""
	DEBUG = False # Set to True for so much fun!

	import sys

	# This file is a script only: refuse to continue when it is imported.
	if __name__ != "__main__":
	    print("This program is NOT a library and must NOT be imported!", file=sys.stderr)
	    # Exit with a non-zero status so the refusal is reported as a failure
	    # (a bare sys.exit() would signal success).
	    sys.exit(1)

	# And now we will continue with our regularly scheduled program...
	import os
	import textwrap
	import re
	import code
	import html
	import requests as rq
	from lxml import html as lxhtml

	# pprint is only needed by the debugging helpers
	if DEBUG is True:
	    import pprint   ####DEBUG


	## Local functions:

	if DEBUG is True:
	    def pdict(d, indent=1, spaces=3):
	        """ Print a dict that may contain other dicts.

	        Each key is printed on its own line, indented by spaces*indent
	        blanks.  A value that is itself a dict is shown as "{dict}" and its
	        contents are printed one indent level deeper.

	        d      - The dict to print.
	        indent - Current nesting level.
	        spaces - Number of blanks per nesting level.
	        """
	        tabs = ' ' * (spaces * indent)
	        for k, v in d.items():
	            if isinstance(v, dict):
	                print("%s%s = {dict}"%(tabs, k))
	                # Pass 'spaces' along so nested levels keep the caller's width
	                pdict(v, indent=indent+1, spaces=spaces)
	                continue
	            try:
	                print("%s%s = "%(tabs, k), v)
	            except Exception:   # e.g. value can't be converted or encoded
	                print("%s%s = [unprintable]"%(tabs, k))


	def witext(s, indent=1, width=80, spaces=3):
	    """ Return a wrapped and indented string.

	    Each "\n"-separated line of 's' is wrapped on its own, so existing line
	    breaks survive and an empty line stays an empty line.

	    s      - The text to wrap.
	    indent - Number of indent levels to prefix to every output line.
	    width  - Maximum length of an output line, indent included.
	    spaces - Number of blanks per indent level.
	    """
	    tabs = ' ' * (spaces * indent)
	    # textwrap counts initial_indent/subsequent_indent as part of 'width',
	    # so 'width' is passed unchanged; subtracting the indent here as well
	    # would make every line wrap len(tabs) columns too early.
	    wrapped = (textwrap.wrap(line, width,
	                             initial_indent=tabs, subsequent_indent=tabs)
	               for line in s.split("\n"))   # Paragraphs and line breaks
	    return "\n".join("\n".join(chunk) for chunk in wrapped)


	def striphtml(data):
	    """ Remove or replace HTML tags and replace symbols.

	    <br> variants and <p>/</p> become newlines, every other tag is dropped,
	    and character references (&amp;, &lt;, &#NNN; ...) are decoded.

	    data - HTML markup as a str.
	    Returns the plain-text str.
	    """
	    data = re.sub(r'<[/ ]*br[/ ]*>', '\n', data)    # Preserve line breaks
	    data = re.sub(r'</*p>', '\n', data)             # Preserve paragraphs
	    data = re.sub(r'<[^<]+?>', '', data)            # Strip all other tags
	    # Decode symbols LAST: doing it before the tag removal turns escaped
	    # text such as "&lt; ... &gt;" into something that looks like a tag,
	    # and the pattern above would then delete it from the toot.
	    return html.unescape(data)


	## Main code:

	if DEBUG is True:
	    pp = pprint.PrettyPrinter() ####DEBUG
	    p = pp.pprint               ####DEBUG

	# Script name without directory or extension, used in the usage message
	appName = os.path.splitext(os.path.basename(__file__))[0]
	appArgNames = "User@Instance"

	numArgs = len(sys.argv) - 1     # sys.argv[0] is the script itself
	numAcct = 0                     # NOTE: assigned here but not used in this part of the file

	if numArgs == 0:
	    print("Error: Argument[s] missing.")
	    print("Usage: %s %s ..."%(appName, appArgNames))
	    # Non-zero status: a usage error must not look like a successful run
	    sys.exit(1)

	# Repeat for each user:
	for user in sys.argv[1:]:

	    # Parameter validation:
	    # - Tolerate "@user@instance.tld".  startswith() is also safe for an
	    #   empty argument, where indexing user[0] would raise IndexError.
	    if user.startswith('@'):
	        user = user[1:]     # Strip leading '@'
	    # - Require "user@instance.tld": a non-empty name, exactly one '@', and
	    #   a '.' inside the host part (not as its final character).  A second
	    #   '@' is rejected here instead of crashing a two-way split below.
	    usr, at, inst = user.partition('@')
	    if not (usr and at and '@' not in inst and '.' in inst[:-1]):
	        print("\nSkipping bad parameter: ", user)
	        continue

	    outFile = "%s.txt"%(user)
	    url = "https://%s/@%s"%(inst, usr)

	    print("\nUser: %s"%(user))

	    toots = []
	    times = []
	    bad_url = False
	    # Repeat for each page of toots:
	    while True:
	        print("Processing: " + url)
	        try:
	            # Without a timeout a stalled server would hang the script
	            response = rq.get(url, timeout=30)
	            # Treat HTTP errors (404, 500, ...) as a bad URL instead of
	            # parsing the error page as if it held toots
	            response.raise_for_status()
	        except rq.RequestException:     # Not bare: let Ctrl-C through
	            print("Skipping bad url: ", url)
	            bad_url = True
	            break

	        tree = lxhtml.fromstring(response.content)
	        base = '//div[@class="entry h-entry"]'  # All toot info is in this div

	        #TODO: Extract more info than time and toot text, e.g. toot ID

	        times += tree.xpath(base + '//data[@class="dt-published"]/@value')
	        toots.extend(lxhtml.tostring(toot)
	                     for toot in tree.xpath(base + '//div[@class="e-content"]'))

	        next_links = tree.xpath('//a[@class="next"]/@href')  # Next page of toots
	        if not next_links:
	            break   # No more URLs!
	        url = next_links[0]

	    if bad_url is True: # A failed page abandons this user entirely
	        continue

	    if len(times) != len(toots):    # Sanity check for toot data
	        print("ERROR: # dates (%d) != # toots (%d)"%(len(times), len(toots)))
	        continue

	    # Send to file:
	    print("Processing: %d toots"%(len(toots)))
	    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
	    # cannot represent emoji and other characters decoded from the toots.
	    with open(outFile, 'w', encoding='utf-8') as f:
	        # zip() merges into (time, toot) tuples; reversed() needs a sequence
	        for when, toot in reversed(list(zip(times, toots))):    # Oldest first
	            # sep='' stops print() from inserting a stray blank in front of
	            # the first (already indented) line of the toot text
	            print("\n%s\n"%(when), witext(striphtml(toot.decode())),
	                  sep='', file=f)

	    print("Toots written to: %s\n"%(outFile))


	if DEBUG is True:
	    # Go interactive only when stdout not redirected                ####DEBUG
	    if os.fstat(0) == os.fstat(1):                                  ####DEBUG
	        print("\nEntering Interactive Mode: ^Z or ^D when done.")   ####DEBUG
	        code.interact(local=dict(globals(), **locals()))            ####DEBUG

	print("\nExiting!")