Skip to content

Instantly share code, notes, and snippets.

@alexshpilkin
Last active April 3, 2019 17:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexshpilkin/73f301ed620993665561f7d404cefc0b to your computer and use it in GitHub Desktop.
Save alexshpilkin/73f301ed620993665561f7d404cefc0b to your computer and use it in GitHub Desktop.
Scrape rutracker.org forums
#!/bin/sh
# Scrape a rutracker.org forum: save every listing page of the forum and
# run the topic scraper on each topic linked from it.
#
# Strict mode is set here rather than in the shebang so that it also applies
# when the script is started as `sh script URL`.
set -eu
# pipefail is not available in every /bin/sh; probe for it in a subshell so
# that a shell lacking it carries on instead of aborting on the `set` itself.
if (set -o pipefail) 2>/dev/null; then set -o pipefail; fi
# TRACE is the status-line prefix; it is exported so that child scripts
# extend the prefix they inherit instead of starting from scratch.
TRACE=${TRACE-}; export TRACE
# tracef FORMAT [ARG...]
# Draw a transient status line on stderr: erase to the end of the line, print
# the $TRACE prefix followed by ARGs formatted with the printf FORMAT, then
# return the cursor to column 0 so the next message overwrites this one.
# Prints nothing when stderr is not a terminal.
tracef() {
	fmt=$1
	shift
	[ -t 2 ] || return 0
	printf '\033[0K%s'"$fmt"'\r' "$TRACE" "$@" >&2
}
# On exit, reset the status line to just the $TRACE prefix and leave the
# cursor at column 0.
trap 'tracef ""' EXIT

# Forum id: the digits that follow the last "f=" in the URL. Anything after
# the digits (e.g. "&start=50") is cut off so the numeric printf below and
# the output file names stay valid.
id=${1##*f=}
id=${id%%[!0-9]*}
if [ -z "$id" ]; then
	printf '%s: no forum id (f=N) found in URL: %s\n' "${0##*/}" "$1" >&2
	exit 2
fi
TRACE=$TRACE$(printf '%05d ' "$id")
tracef '%s' "$1"

# forum$id.url records every listing page fetched in this run; it is what
# the loop consults to detect that pagination has wrapped around.
rm -f "forum$id.url"
page=$1; i=1; trace=$TRACE

# Do-while idiom: the loop body lives in the `until` condition list and the
# exit status of its last command decides whether to stop. Note that the
# shell does not apply `set -e` inside a condition list, so a failing
# download or topic scrape does not abort the run.
until
	TRACE=$trace$(printf '%02d ' "$i")
	tracef '%s' "$page"
	printf '%s\n' "$page" >>"forum$id.url"

	# Save the page re-encoded from cp1251. The <meta charset=...> tag is
	# commented out because it no longer describes the converted file.
	curl -sSL -o - "$page" |
		sed -Ee 's/<(meta charset=[^>]+)>/<!-- \1 -->/g' |
		iconv -f cp1251 >"forum$id.$i.html"

	# Scrape every topic linked from this page. hrefs are taken relative to
	# the directory of the current page URL. rutopic is expected next to this
	# script; its stdin is detached so it cannot swallow the URL list.
	pup -p a.torTopic 'attr{href}' <"forum$id.$i.html" |
		while IFS= read -r url; do
			url=${page%/*}/$url
			"$(dirname "$0")/rutopic" "$url" </dev/null
		done

	# The last pagination link is taken as the next page. Stop when there is
	# no such link or when it leads to a page already fetched in this run.
	next=$(pup -p a.pg 'attr{href}' <"forum$id.$i.html" | tail -n 1)
	[ -z "$next" ] || {
		next=${page%/*}/$next
		grep -qxFe "$next" "forum$id.url"
	}
do
	page=$next; i=$((i+1))
done
#!/bin/sh
# Scrape a rutracker.org topic: save every page of the topic, then fetch the
# torrent's file listing and store it as JSON and as TSV.
#
# Strict mode is set here rather than in the shebang so that it also applies
# when the script is started as `sh script URL`.
set -eu
# pipefail is not available in every /bin/sh; probe for it in a subshell so
# that a shell lacking it carries on instead of aborting on the `set` itself.
if (set -o pipefail) 2>/dev/null; then set -o pipefail; fi
# TRACE is the status-line prefix; when a parent script exported one, it is
# kept and extended, otherwise it starts out empty.
TRACE=${TRACE-}; export TRACE
# tracef FORMAT [ARG...]
# Show a one-line progress message on stderr. The line is erased from the
# cursor onward, $TRACE and the printf-formatted ARGs are written, and a
# carriage return puts the cursor back at column 0 so that the following
# message replaces this one. Silent when stderr is not a terminal.
tracef() {
	fmt=$1
	shift
	[ -t 2 ] || return 0
	printf '\033[0K%s'"$fmt"'\r' "$TRACE" "$@" >&2
}
# On exit, reset the status line to just the $TRACE prefix and leave the
# cursor at column 0.
trap 'tracef ""' EXIT

# Topic id: the digits of the "t" query parameter. Matching "?t=" / "&t="
# first keeps other parameters that merely end in "t=" (such as "start=30")
# from being mistaken for the id; the last arm keeps accepting any argument
# that just ends in "t=N".
case $1 in
	*'?t='*) id=${1#*'?t='} ;;
	*'&t='*) id=${1#*'&t='} ;;
	*) id=${1##*t=} ;;
esac
id=${id%%[!0-9]*}
if [ -z "$id" ]; then
	printf '%s: no topic id (t=N) found in URL: %s\n' "${0##*/}" "$1" >&2
	exit 2
fi
TRACE=$TRACE$(printf '%07d ' "$id")
tracef '%s' "$1"

# topic$id.url records every page fetched in this run; it is what the loop
# consults to detect that pagination has wrapped around.
rm -f "topic$id.url"
page=$1; i=1; trace=$TRACE

# Do-while idiom: the loop body lives in the `until` condition list and the
# exit status of its last command decides whether to stop. Note that the
# shell does not apply `set -e` inside a condition list, so a failing
# download does not abort the script at this point.
until
	TRACE=$trace$(printf '%02d ' "$i")
	tracef '%s' "$page"
	printf '%s\n' "$page" >>"topic$id.url"

	# Save the page re-encoded from cp1251. The <meta charset=...> tags are
	# commented out because they no longer describe the converted file.
	curl -sSL -o - "$page" |
		sed -Ee 's/<(meta charset=[^>]+)>/<!-- \1 -->/g' |
		iconv -f cp1251 >"topic$id.$i.html"

	# The last pagination link is taken as the next page. Stop when there is
	# no such link or when it leads to a page already fetched in this run.
	next=$(pup -p a.pg 'attr{href}' <"topic$id.$i.html" | tail -n 1)
	[ -z "$next" ] || {
		next=${page%/*}/$next
		grep -qxFe "$next" "topic$id.url"
	}
do
	page=$next; i=$((i+1))
done
# Fetch the torrent's file tree and store it as JSON in topic$id.json.
#
# The listing is requested by POSTing "t=$id" to viewtorrent.php in the same
# directory as the topic URL, sending the cookies stored in ./cookies.txt.
# The response is decoded from UTF-8 (iconv without -t targets the current
# locale's encoding), and pup turns each top-level <li> of the .ftree list
# into a JSON element tree.
#
# The jq program then rewrites every such tree with walk, which visits
# children before their parents:
#   - a <div> becomes {"name", "size"}: name is the joined text of its <b>
#     children, size the joined text of its <i> children converted to a
#     number; the size key is dropped when that text is empty;
#   - an <li class="dir"> becomes {"name", "children"}, taking the name from
#     its (already rewritten) first child and the children from its second
#     child;
#   - any other <li> is replaced by its (already rewritten) first child.
# Finally a leading "./" is stripped from the name of each top-level entry.
curl -sSL -o - -b cookies.txt -d "t=$id" "${1%/*}/viewtorrent.php" | \
iconv -f utf-8 | pup -p '.ftree > li json{}' | jq '
.[] |
walk(if type == "object" and .tag == "div"
then {"name": .children | map(select(.tag=="b") | .text) | join(""),
"size": .children | map(select(.tag=="i") | .text) | join("")} |
(if .size == "" then del(.size) else .size |= tonumber end)
elif type == "object" and .tag == "li" and .class == "dir"
then {"name": .children[0].name,
"children": .children[1].children}
elif type == "object" and .tag == "li"
then .children[0]
else . end) |
.name |= sub("^\\./"; "")
' >"topic$id.json"
# Flatten the tree in topic$id.json into topic$id.tsv, one "SIZE<TAB>PATH"
# line per entry.
#
# walk visits children before parents, so by the time a directory object is
# rewritten each of its children is already a flat array of entries (hence
# the double .children[][]). A directory turns into an array holding an
# entry for itself followed by all entries below it, every name prefixed
# with "<directory name>/"; the directory's own entry therefore reads
# "<directory name>/" and has no size. An object without children (a file)
# turns into a one-element array. Entries that have no size get an empty
# SIZE column.
jq -r '
walk(if type == "object" and .children
then .name as $name |
[{"name": ""}, .children[][] |
(.name |= $name + "/" + .)]
elif type == "object"
then [.]
else . end) |
.[] | "\(.size // "")\t\(.name)"
' <"topic$id.json" >"topic$id.tsv"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment