uriel1998/plaintext-convert.sh

## plaintext-convert.sh
#!/bin/bash
#set -o errexit
#set -o nounset
##############

# This program converts a plain-text list of urls to the
# bookmark-archiver HTML format.
#
# Requirements: curl, xidel (plus the standard grep, sed and tr)
# Usage:
#     ./plaintext-convert.sh [{filename}]
#
# The filename is optional. If specified, the specified file will be
# read. If not, then stdin will be used instead.
#
# For every http(s) URL found in the input the script
#   1. sends a HEAD request, following redirects, to find out whether the
#      page still answers and which URL it finally resolves to,
#   2. downloads the page and extracts its <title> with xidel,
#   3. prints one <dt><a href="..." add_date="...">title</a></dt> line.
# Bookmark lines go to stdout; all "[info]" progress messages go to
# stderr, so stdout can be redirected straight into the HTML file.
#
# Examples:
#     ./plaintext-convert.sh <path/to/file >list.html
#     ./plaintext-convert.sh urls.txt >urls.html
#
##############

# Print an "[info]" progress message on stderr.
log_info() {
    printf '[info] %s\n' "$*" >&2
}

# Escape the characters that are special in HTML text and in
# double-quoted attribute values.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout
html_escape() {
    # '&' must be replaced first, otherwise the entities produced by the
    # later substitutions would be escaped a second time.
    printf '%s\n' "$1" |
        sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

# Read text on stdin and print every http(s) URL in it, one per line.
# A URL ends at the first whitespace character, double quote, backslash
# or asterisk. Stopping at any whitespace (not only a space) keeps tabs
# and the carriage return of CRLF input out of the URL.
extract_urls() {
    grep -E --only-matching 'https?://[^[:space:]"\*]*'
}

# Read text on stdin, join it into a single line, collapse runs of
# whitespace into one space and trim both ends.
normalize_title() {
    tr '\n\r\t' '   ' | sed -e 's/  */ /g' -e 's/^ //' -e 's/ $//'
}

# Probe a URL with a HEAD request, following redirects.
# Arguments: $1 - URL to probe
# Outputs:   "<status> <redirect-count> <final-url>" on stdout; the status
#            is 000 when no HTTP response was received at all.
probe_url() {
    # The headers themselves are discarded; curl reports the status of
    # the last response, the number of redirects it followed and the URL
    # it ended up at through --write-out.
    curl --connect-timeout 20 --location -sS --head \
        --output /dev/null \
        --write-out '%{http_code} %{num_redirects} %{url_effective}\n' \
        "$1" </dev/null
}

# Download a page and print its normalized <title> text (possibly empty).
# Arguments: $1 - URL to download
fetch_title() {
    curl --connect-timeout 20 --location -sS "$1" </dev/null |
        xidel --data - --css "title" --silent |
        normalize_title
}

# Print one bookmark line.
# Arguments: $1 - URL, $2 - add_date (epoch seconds), $3 - title
format_bookmark() {
    # URL and title come from the network, so both are HTML-escaped to
    # keep the generated markup well-formed.
    printf '<dt><a href="%s" add_date="%s">%s</a></dt>\n' \
        "$(html_escape "$1")" "$2" "$(html_escape "$3")"
}

# Check one URL and, if the page still exists, emit its bookmark line.
# Arguments: $1 - URL as found in the input, $2 - add_date value
# Outputs:   at most one bookmark line on stdout, progress on stderr
process_url() {
    local pageurl=$1 stamp=$2
    local url=$pageurl
    local probe code hops final title

    log_info "Processing $url"
    probe=$(probe_url "$url")
    read -r code hops final <<<"$probe"

    # Only 2xx and 3xx answers count as "the page is there"; anything
    # else (no answer, 4xx, 5xx) means the bookmark is dropped.
    case "$code" in
        2[0-9][0-9] | 3[0-9][0-9]) ;;
        *)
            log_info "Web page is gone!"
            return 0
            ;;
    esac

    # Bookmark the redirect target rather than the stale address.
    if [[ -n "$final" && "$hops" != 0 ]]; then
        log_info "New location found: $final"
        url=$final
    fi

    if [[ "$code" == 3* ]]; then
        # curl could not follow this redirect any further.
        log_info "HTTP $code redirect; passing as-is"
    else
        log_info "HTTP $code exists"
    fi

    title=$(fetch_title "$url")
    if [[ -z "$title" ]]; then
        # No usable title: fall back to the URL as written in the input.
        title=$pageurl
    fi
    log_info "Title: $title"
    format_bookmark "$url" "$stamp" "$title"
    log_info "On to next url"
}

# Entry point.
# Arguments: $1 - optional input file (default: stdin)
main() {
    local input=${1:-/dev/stdin}
    local stamp pageurl

    if [[ $# -gt 0 && ! -r "$input" ]]; then
        printf '[error] Cannot read %s\n' "$input" >&2
        exit 1
    fi

    # Every URL gets its own add_date, one second after the previous one,
    # starting from the current time.
    stamp=$(date +%s)
    extract_urls <"$input" | while IFS= read -r pageurl; do
        stamp=$((stamp + 1))
        process_url "$pageurl" "$stamp"
    done
    log_info "Completed"
}

main "$@"
	#!/bin/bash
	#set -o errexit
	#set -o nounset
	##############

	# This program converts a plain-text list of urls to the
	# bookmark-archiver HTML format.
	#
	# Requirements: curl, xidel (plus the standard grep, sed and tr)
	# Usage:
	#     ./plaintext-convert.sh [{filename}]
	#
	# The filename is optional. If specified, the specified file will be
	# read. If not, then stdin will be used instead.
	#
	# For every http(s) URL found in the input the script
	#   1. sends a HEAD request, following redirects, to find out whether the
	#      page still answers and which URL it finally resolves to,
	#   2. downloads the page and extracts its <title> with xidel,
	#   3. prints one <dt><a href="..." add_date="...">title</a></dt> line.
	# Bookmark lines go to stdout; all "[info]" progress messages go to
	# stderr, so stdout can be redirected straight into the HTML file.
	#
	# Examples:
	#     ./plaintext-convert.sh <path/to/file >list.html
	#     ./plaintext-convert.sh urls.txt >urls.html
	#
	##############

	# Print an "[info]" progress message on stderr.
	log_info() {
	    printf '[info] %s\n' "$*" >&2
	}

	# Escape the characters that are special in HTML text and in
	# double-quoted attribute values.
	# Arguments: $1 - raw text
	# Outputs:   escaped text on stdout
	html_escape() {
	    # '&' must be replaced first, otherwise the entities produced by the
	    # later substitutions would be escaped a second time.
	    printf '%s\n' "$1" |
	        sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
	}

	# Read text on stdin and print every http(s) URL in it, one per line.
	# A URL ends at the first whitespace character, double quote, backslash
	# or asterisk. Stopping at any whitespace (not only a space) keeps tabs
	# and the carriage return of CRLF input out of the URL.
	extract_urls() {
	    grep -E --only-matching 'https?://[^[:space:]"\*]*'
	}

	# Read text on stdin, join it into a single line, collapse runs of
	# whitespace into one space and trim both ends.
	normalize_title() {
	    tr '\n\r\t' '   ' | sed -e 's/  */ /g' -e 's/^ //' -e 's/ $//'
	}

	# Probe a URL with a HEAD request, following redirects.
	# Arguments: $1 - URL to probe
	# Outputs:   "<status> <redirect-count> <final-url>" on stdout; the status
	#            is 000 when no HTTP response was received at all.
	probe_url() {
	    # The headers themselves are discarded; curl reports the status of
	    # the last response, the number of redirects it followed and the URL
	    # it ended up at through --write-out.
	    curl --connect-timeout 20 --location -sS --head \
	        --output /dev/null \
	        --write-out '%{http_code} %{num_redirects} %{url_effective}\n' \
	        "$1" </dev/null
	}

	# Download a page and print its normalized <title> text (possibly empty).
	# Arguments: $1 - URL to download
	fetch_title() {
	    curl --connect-timeout 20 --location -sS "$1" </dev/null |
	        xidel --data - --css "title" --silent |
	        normalize_title
	}

	# Print one bookmark line.
	# Arguments: $1 - URL, $2 - add_date (epoch seconds), $3 - title
	format_bookmark() {
	    # URL and title come from the network, so both are HTML-escaped to
	    # keep the generated markup well-formed.
	    printf '<dt><a href="%s" add_date="%s">%s</a></dt>\n' \
	        "$(html_escape "$1")" "$2" "$(html_escape "$3")"
	}

	# Check one URL and, if the page still exists, emit its bookmark line.
	# Arguments: $1 - URL as found in the input, $2 - add_date value
	# Outputs:   at most one bookmark line on stdout, progress on stderr
	process_url() {
	    local pageurl=$1 stamp=$2
	    local url=$pageurl
	    local probe code hops final title

	    log_info "Processing $url"
	    probe=$(probe_url "$url")
	    read -r code hops final <<<"$probe"

	    # Only 2xx and 3xx answers count as "the page is there"; anything
	    # else (no answer, 4xx, 5xx) means the bookmark is dropped.
	    case "$code" in
	        2[0-9][0-9] | 3[0-9][0-9]) ;;
	        *)
	            log_info "Web page is gone!"
	            return 0
	            ;;
	    esac

	    # Bookmark the redirect target rather than the stale address.
	    if [[ -n "$final" && "$hops" != 0 ]]; then
	        log_info "New location found: $final"
	        url=$final
	    fi

	    if [[ "$code" == 3* ]]; then
	        # curl could not follow this redirect any further.
	        log_info "HTTP $code redirect; passing as-is"
	    else
	        log_info "HTTP $code exists"
	    fi

	    title=$(fetch_title "$url")
	    if [[ -z "$title" ]]; then
	        # No usable title: fall back to the URL as written in the input.
	        title=$pageurl
	    fi
	    log_info "Title: $title"
	    format_bookmark "$url" "$stamp" "$title"
	    log_info "On to next url"
	}

	# Entry point.
	# Arguments: $1 - optional input file (default: stdin)
	main() {
	    local input=${1:-/dev/stdin}
	    local stamp pageurl

	    if [[ $# -gt 0 && ! -r "$input" ]]; then
	        printf '[error] Cannot read %s\n' "$input" >&2
	        exit 1
	    fi

	    # Every URL gets its own add_date, one second after the previous one,
	    # starting from the current time.
	    stamp=$(date +%s)
	    extract_urls <"$input" | while IFS= read -r pageurl; do
	        stamp=$((stamp + 1))
	        process_url "$pageurl" "$stamp"
	    done
	    log_info "Completed"
	}

	main "$@"