Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Converter for pirate/bookmark-archiver that converts a plain-text list of urls into something it can understand.
#!/bin/bash
#set -o errexit
#set -o nounset
##############
# This program converts a plain-text list of urls to the
# bookmark-archiver HTML format.
#
# Requirements: curl, wget, xidel
# Usage:
# ./plaintext-convert.sh [{filename}]
#
# The filename is optional. If given, that file is read; otherwise
# stdin is used instead.
# The script also does some error testing, follows redirects, and
# has better title extraction.
#
# Examples:
# ./plaintext-convert.sh <path/to/file >list.html
# ./plaintext-convert.sh urls.txt >urls.html
#
##############
# Main body: reads text from the file named by $1 (or stdin), extracts every
# http(s) URL, probes it, resolves one level of redirect, fetches the page
# title and prints one bookmark line per reachable URL on stdout:
#   <dt><a href="URL" add_date="EPOCH">TITLE</a></dt>
# All progress/diagnostic messages go to stderr so stdout stays clean HTML.

# html_escape TEXT
# Prints TEXT with the characters that are special in HTML text and quoted
# attribute values replaced by entities. '&' is handled first so the
# entities produced by the later rules are not escaped a second time.
html_escape() {
  printf '%s' "$1" | sed \
    -e 's/&/\&amp;/g' \
    -e 's/</\&lt;/g' \
    -e 's/>/\&gt;/g' \
    -e 's/"/\&quot;/g' \
    -e "s/'/\&#39;/g"
}

# href_escape URL
# Prints URL with the characters that would break out of a double-quoted
# href attribute percent-encoded. '&' is deliberately left untouched so a
# consumer that reads the attribute verbatim still sees the exact URL.
href_escape() {
  printf '%s' "$1" | sed -e 's/"/%22/g' -e 's/</%3C/g' -e 's/>/%3E/g'
}

# http_status HEADERS
# Prints the second field of the first line of HEADERS (the status code of
# the first response in a raw header dump). Carriage returns are removed
# first so a CRLF line ending can never stick to the code. Prints an empty
# line when HEADERS is empty.
http_status() {
  printf '%s\n' "$1" | tr -d '\r' | awk 'NR == 1 { print $2 }'
}

# flatten_title
# Filter: reads text on stdin, joins its lines with spaces and trims
# leading/trailing whitespace, so an empty or blank <title> yields "".
flatten_title() {
  tr '\r\n' '  ' | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

# Seed for the add_date attribute; it is bumped by one for every URL so each
# bookmark gets a distinct, increasing timestamp.
add_date=$(date +%s)

# A URL runs from http:// or https:// up to the next whitespace character,
# double quote, backslash or asterisk.
grep -E --only-matching 'https?://[^[:space:]"\*]*' <"${1:-/dev/stdin}" |
  while IFS= read -r pageurl; do
    add_date=$((add_date + 1))
    url=$pageurl
    echo "[info] Processing $url" >&2

    # HEAD request; with --fail curl reports HTTP errors through its exit
    # status. An empty status code is treated as "page unreachable".
    headers=$(curl --fail --connect-timeout 20 --location -sS --head "$url")
    code=$(http_status "$headers")
    if [[ -z "$code" ]]; then
      echo "[info] Web page is gone!" >&2
      continue
    fi

    if [[ $code == 3[0-9][0-9] ]]; then
      echo "[info] HTTP $code redirect" >&2
      # Take the last line starting with "Location" from wget's log output.
      # The page body is discarded (-O /dev/null) so that page content can
      # never be mistaken for a Location line.
      resulturl=$(wget -O /dev/null --server-response "$url" 2>&1 |
        grep '^Location' | tail -n 1 | awk '{print $2}')
      if [[ -z "$resulturl" ]]; then
        echo "[info] No new location found" >&2
      else
        echo "[info] New location found" >&2
        url=$resulturl
        echo "[info] REprocessing $url" >&2
        # No --fail here: the resolved URL is emitted even if it errors.
        headers=$(curl --connect-timeout 20 --location -sS --head "$url")
        code=$(http_status "$headers")
        if [[ $code == 3[0-9][0-9] ]]; then
          echo "[info] Second redirect; passing as-is" >&2
        fi
      fi
    fi

    if [[ $code == 2[0-9][0-9] ]]; then
      echo "[info] HTTP $code exists" >&2
    fi

    # Extract the page title. --location makes the fetch follow redirects
    # that were passed through as-is above; the timeout matches the other
    # curl calls.
    pagetitle=$(curl --connect-timeout 20 --location -sS "$url" |
      xidel --data - --css "title" --silent | flatten_title)
    # Fall back to the URL as it appeared in the input when no title exists.
    if [[ -z "$pagetitle" ]]; then
      pagetitle=$pageurl
    fi
    echo "[info] Title: $pagetitle" >&2

    # The title is remote, untrusted text: escape it before embedding it.
    printf '<dt><a href="%s" add_date="%s">%s</a></dt>\n' \
      "$(href_escape "$url")" "$add_date" "$(html_escape "$pagetitle")"
    echo "[info] On to next url" >&2
  done
echo "[info] Completed" >&2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment