Skip to content

Instantly share code, notes, and snippets.

@deliciouslytyped
Last active November 7, 2021 13:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deliciouslytyped/5cd53011147a5634d64db59ec956320d to your computer and use it in GitHub Desktop.
Save deliciouslytyped/5cd53011147a5634d64db59ec956320d to your computer and use it in GitHub Desktop.
Download the text messages of a (public) Gitter room without logging in, and view them as semi-readable rendered output
#! /usr/bin/env bash
# Strict mode: errexit, nounset, pipefail.
# NOTE on errexit: main is started at the bottom of this file as `... && main "$@" || true`.
# Bash ignores -e for every command that runs on the left-hand side of `||`, and that
# includes the whole body of a function called there and everything it calls in turn.
# In practice -e is therefore inactive for the entire run; nounset and pipefail still apply.
set -euo pipefail #NOTE I use a lot of functions and apparently -e is useless with functions?
#TODO simplify this back down by separating the render functionality
#TODO port to tcl?
# =========== utilities =========== #
# (root, glow_args...) -> ()
# Render every markdown page under "$root/md" with glow, concatenate the result into
# "$root/out.bin", show it in less and then exit the script (this function never returns).
#   root       directory that holds the md/ pages; out.bin is written into it as well
#   glow_args  all further arguments are handed to glow unchanged
function render() {
  local root=$1
  shift
  local glow page
  local -a pages=()
  if command -v nix-build &> /dev/null; then
    glow=$(nix-build '<nixpkgs>' -I nixpkgs=channel:nixos-unstable -A glow --no-out-link)/bin/glow
  else
    glow=glow
  fi
  # Pages are numbered in fetch order and each fetch goes further back in time, so
  # reverse numeric order puts the oldest page first.
  mapfile -t pages < <(find "$root/md/" -type f -printf '%P\n' | sort -n -r)
  # Start from an empty scratch file: tee appends, so leftovers from an aborted
  # earlier run would otherwise end up in the new output.
  : > "$root/out.bin.new"
  # glow doesnt seem to handle large files well, so it is run once per page
  for page in "${pages[@]}"; do
    echo "$root/md/$page"
    # Original author's note: glow doesnt obey width when redirecting. The pipe through
    # tee is kept from the original; stdin is /dev/null like it was under xargs.
    "$glow" -s dark "$@" "$root/md/$page" < /dev/null | tee -a "$root/out.bin.new" > /dev/null
  done
  # Scratch file, final file and the file shown all live under $root.
  mv "$root/out.bin.new" "$root/out.bin"
  less -R "$root/out.bin"
  exit
}
# =========== utilities =========== #
# (url) -> (g CHANID, g TOKEN, g BEFOREID)
# Download the public room page at url and pull out what the chat API calls need:
#   CHANID    .troupe.id of the troupeContext object embedded in the page
#   TOKEN     .accessToken of the same object
#   BEFOREID  model id of the last chat item contained in the page
function getmagic() {
  local url=$1
  local page jsonvals
  page=$(curl --silent --fail "$url")
  # The fields sit in a json object that a script tag assigns to window.troupeContext.
  #TODO maybe the workings of the accessToken is in the public source somewhere?
  jsonvals=$(
    gawk 'match($0, /window.troupeContext = (.*);<\/script>/, m) { print m[1] }' <<< "$page" |
      jq -r ".troupe.id, .accessToken"
  )
  # jq printed one value per line: split on newlines, first line -> CHANID, rest -> TOKEN.
  # read hits end of input before its NUL delimiter and reports failure, hence the || true.
  # https://www.etalabs.net/sh_tricks.html via https://stackoverflow.com/a/6779351
  IFS=$'\n' read -r -d '' CHANID TOKEN <<< "$jsonvals" || true
  BEFOREID=$(
    # narrow the elements and return them as json
    pup '[class~="chat-item"] json{}' <<< "$page" |
      # get the ids out of the class
      jq -r '.[] .class | capture("model-id-(?<id>[0-9a-f]{24})") .id' |
      tail -n 1
  )
}
# (beforeid, chanid, token, counter, root) -> (g last_id, g retVal)
# Fetch one page (limit=100) of the room's messages before beforeid and store the raw
# response as "$root/json/$counter.json".
#   last_id  id of the first message of the fetched page; the caller passes it as the
#            next beforeid. Empty when retVal is false.
#   retVal   "true" when a page containing at least one message id was fetched,
#            "false" when the request failed, the response could not be parsed, or
#            the page was empty
# Always returns 0; the outcome is reported through retVal only.
function getpage() {
  local beforeid=$1 chanid=$2 token=$3 counter=$4 root=$5
  local page="$root/json/$counter.json"
  last_id=
  retVal=false
  # I just used "copy as curl" but https://gitlab.com/gitterHQ/webapp/-/blob/master/server/api/v1/rooms/chat-messages.js
  #NOTE using -o as opposed to IO redirection means we dont write a file on failure?
  if ! curl --silent --fail "https://gitter.im/api/v1/rooms/$chanid/chatMessages?lookups%5B%5D=user&includeThreads=false&beforeId=$beforeid&limit=100" \
    -H "x-access-token: $token" \
    -o "$page"; then
    # Do not go on to parse $page: a file of that name may be left over from an earlier run.
    echo "getpage: request for page $counter failed" >&2
    return 0
  fi
  # the next page is fetched by passing the next "beforeId", taken from the first item.
  # An empty item list yields no id at all; reporting that as success would hand an empty
  # beforeid to the next iteration, which then starts over from the newest page.
  if last_id=$(jq -r '.items[0].id // empty' < "$page") && [[ -n "$last_id" ]]; then
    retVal=true
  fi
}
# (counter, root) -> ()
# Convert the raw API page "$root/json/$counter.json" into "$root/md/$counter.md".
# For every message the output holds a blank separator, a "username timestamp" line
# and then the message text.
function extractMarkdown() {
  local counter=$1 root=$2
  local prog
  # The jq program lives in a single-quoted string, so it must not contain single quotes.
  prog='
    .lookups as $lookups |   # we need to use lookups later to convert sender ids to usernames
    .items[] |               # convert the list of messages to a stream of messages
    . as $item |             # needed for concatenating to .username, because the scope is different at the usage site
    "\n",                    # separate messages in the markdown
    ( .fromUser as $uid | $lookups.users | .[] | select(.id == $uid)   # look up the user entry from the uid in the message
      | .username + " " + $item.sent ),                                # return the username with the date
    # Fenced sections have to start at the beginning of the line, as required by markdown,
    # so a message that opens with a fence is emitted unchanged; every other message is
    # indented a bit for readability. startswith only yields true or false, so it has to
    # select a branch: feeding it to the alternative operator printed the boolean instead
    # of the text. A missing text is treated as an empty message.
    ( (.text // "") | if startswith("```") then . else "&nbsp;&nbsp;&nbsp;&nbsp;" + . end )
  '
  jq -r "$prog" < "$root/json/$counter.json" > "$root/md/$counter.md"
}
#If we're on NixOS. We use this strat instead of nix-shell shebang because we need selfexec to be fast.
#(url) -> ()
# Replace the current process with a nix-shell that provides pup, glow and jq and runs
# this script again with the same url and IN_MY_SHELL=1 set. Does not return.
function selfsetup() {
  local url=$1
  local self run_cmd
  self=$(realpath "$0")
  # --run takes ONE command string that a shell parses again, so both the script path and
  # the url are shell-quoted; spaces or quote characters in either cannot change the command.
  printf -v run_cmd 'IN_MY_SHELL=1 %q %q' "$self" "$url"
  echo Entering nix-shell.
  exec nix-shell -I nixpkgs=channel:nixos-unstable -p pup glow jq --run "$run_cmd"
}
# (root) -> ()
# One-time setup: print the warning banner, export STARTED=1 so that re-executed
# copies of the script can see the run has begun, and create the root directory.
function init() {
  local outdir=$1
  printf '%s\n' 'This program may fail silently.'
  STARTED=1
  export STARTED
  mkdir -p "$outdir"
}
# dependencies: pup, glow, jq, gawk, bash, xargs, find, curl, less (nix-shell is optional)
# (url) -> ()
# Entry point. Each process fetches ONE page of history and then re-executes the script
# for the next page, handing the state over in environment variables:
#   TOKEN, CHANID  values obtained by getmagic
#   BEFOREID       message id the next request continues before
#   COUNTER        number of the page being fetched; also used as its file name
#   STARTED        exported by init, so init only runs in the first process
#   IN_MY_SHELL    set by selfsetup, so the nix-shell hop only happens once
# Once getpage sets retVal to false, render takes over and ends the script.
function main() {
  if (( $# < 1 )); then
    echo -e "USAGE: $0 gitter_url\nrender and extractMarkdown can be called via: ( . $0; somefunc somearg )\nThere is information about API longevity at https://gitter.im/gitter/api?at=5f74bee5cfe2f9049a14ae3e" >&2
    exit 1
  fi
  local url=$1

  # The output directory is named after the last two path components of the url
  # (".../org/room" -> "org-room"). Trailing slashes are dropped first, otherwise the
  # last component is empty and the directory would be called "room-".
  local trimmed=$url
  while [[ "$trimmed" == */ ]]; do trimmed=${trimmed%/}; done
  local root=$trimmed
  if [[ "$trimmed" == */* ]]; then
    local parent=${trimmed%/*}
    root="${parent##*/}-${trimmed##*/}"
  fi

  # Only hop into nix-shell where it exists; on other systems the tools are expected to
  # be on PATH already (exec-ing a missing nix-shell would just kill the script).
  if [[ -z "${IN_MY_SHELL+x}" ]] && command -v nix-shell > /dev/null 2>&1; then
    selfsetup "$url"
  fi
  if [[ -z "${STARTED+x}" ]]; then
    init "$root"
  fi

  # COUNTER, BEFOREID, TOKEN and CHANID are passed via env vars during tail recursion
  COUNTER=${COUNTER:-0}; BEFOREID=${BEFOREID:-}; TOKEN=${TOKEN:-}; CHANID=${CHANID:-}

  # need to set the initial value the first time around
  if [[ -z "$BEFOREID" || -z "$CHANID" || -z "$TOKEN" ]]; then
    getmagic "$url"
    if [[ -z "$BEFOREID" || -z "$CHANID" || -z "$TOKEN" ]]; then
      echo "error: could not extract room id, access token and first message id from $url" >&2
      exit 1
    fi
  fi

  mkdir -p "$root/json" "$root/md"
  echo "Fetching $BEFOREID at counter $COUNTER"
  sleep 1 # naive rate limit, probably unnecessary
  getpage "$BEFOREID" "$CHANID" "$TOKEN" "$COUNTER" "$root"
  if [[ "${retVal:-false}" != true ]]; then
    # recursion stops here: render shows what was collected and exits the script
    render "$root"
  fi
  extractMarkdown "$COUNTER" "$root"

  # tail recursion :P
  exec /usr/bin/env TOKEN="$TOKEN" CHANID="$CHANID" BEFOREID="$last_id" COUNTER=$((COUNTER + 1)) "$0" "$url"
}
# Run main only when this file is executed directly, not when it is sourced; the shell
# counterpart of python's if __name__ == "__main__" ; https://stackoverflow.com/a/23009039
# The statement as a whole always ends with status 0, so sourcing it never trips the
# caller's errexit. Because main runs on the left of `||`, bash ignores errexit for
# everything main executes.
if [ "$0" = "$BASH_SOURCE" ]; then
  main "$@" || true
fi
@deliciouslytyped
Copy link
Author

deliciouslytyped commented Jan 14, 2021

Edit: all of this might be unnecessary? https://matrix.org/blog/2020/09/30/welcoming-gitter-to-matrix/

TODO:

  • something is broken with the literal block fix
  • Might be integrable with https://github.com/kbob/gitter-scraping-instructions/blob/master/fmt-gitter (this seems to just need a json in the appropriate format)
  • Updating could involve storing the range in the file names and making the indexes increase over time, instead of the latest being 0.
  • readability of the rendering output could be significantly improved

@Profpatsch
Copy link

Cool, can we also push it to IRC? :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment