sleep-walker/ugly.sh

## ugly.sh
#!/bin/bash

####
# Configuration
#####
DOCS="https://docs.google.com/spreadsheet/pub?hl=en_US&hl=en_US&key=0AqPp4y2wyQsbdGQ1V3pRRDg5NEpGVWpubzdRZ0tjUWc&single=true&gid=0&output=txt"
LICENSES="http://spdx.org/licenses/"


# upper table contains <td align="center"> and the other does not so I will use it to distinguish between them
identifiers_xpath='cat //tr/td[@align="center"]/../td/code/text()'
fullnames_xpath='cat //tr/td[@align="center"]/../td/a[@rel]/text()'

# simple output to stderr
err() {
    # Print all arguments, joined by single spaces, as one line on stderr.
    #
    # printf is used instead of echo so that a message such as "-n" or "-e"
    # is printed literally rather than being taken as an echo option, and
    # the output goes to file descriptor 2 directly instead of re-opening
    # /dev/stderr.
    local IFS=' '
    printf '%s\n' "$*" >&2
}

fetch_or_die() {
    # Write the content of URL $1 to stdout; on download failure report the
    # error through err and exit with status 10.
    #
    # Globals read: DOCS, LICENSES
    local url=$1
    local fixture=

    # DEBUG aid: when a file named DOCS or LICENSES exists in the current
    # directory, it is served instead of downloading the matching URL.
    # Without such a file the real download is performed (previously the
    # call produced nothing in that case).
    case "$url" in
        "$DOCS")     fixture=DOCS ;;
        "$LICENSES") fixture=LICENSES ;;
    esac
    if [[ -n "$fixture" && -r "$fixture" ]]; then
        cat -- "$fixture"
        return
    fi

    # --fail turns HTTP error responses into a non-zero exit status instead
    # of passing the error page on as data; --location follows redirects.
    if ! curl -# --fail --location "$url" -o -; then
        err "Couldn't fetch '$url'"
        exit 10
    fi
}

xpath() {
    # Feed one command to the xmllint shell for an HTML file and print the
    # resulting lines.
    #   $1  path of the HTML file to load
    #   $2  command line sent to the xmllint shell on stdin
    local html_file=$1
    local shell_cmd=$2

    # Lines starting with the ' -------' separator or the '/ >' prompt are
    # shell decoration, not data, so they are dropped.
    printf '%s\n' "$shell_cmd" \
        | xmllint --html --shell "$html_file" \
        | grep -vE '^( -------|/ >)'
}

export LC_ALL=C

#####
# Step 1 - fetch the license spreadsheet
##

# Download the spreadsheet, delete every line containing 'New format' and
# strip trailing blanks from the remaining lines.
fetch_or_die "$DOCS" | sed '/New format/d; s@[[:blank:]]*$@@' > licenses_changes.ntxt

# fetch_or_die runs as a pipeline stage here, i.e. in a subshell, so its
# `exit` cannot terminate this script. Inspect the stage status and stop
# explicitly, removing the incomplete output file.
fetch_status=${PIPESTATUS[0]}
if (( fetch_status != 0 )); then
    rm -f licenses_changes.ntxt
    exit 10
fi

# For every line whose first column is a 'SUSE-' prefixed identifier, emit
#   <identifier>+ TAB <identifier>+
sed -n 's@^\(SUSE-[^[:blank:]]\+\)\t.*@\1+\t\1+@p' licenses_changes.ntxt > licenses_changes.ptxt

#return 2> /dev/null || exit

#####
# Step 2 - download spdx license table
##

# Temporary file holding the downloaded HTML page.
license_tmp=$(mktemp)

# Fetch the HTML page with the license table.
fetch_or_die "$LICENSES" > "$license_tmp"

# Extract identifiers and full names from the HTML, one array element per
# output line of the xpath helper.
readarray -t identifiers < <(xpath "$license_tmp" "$identifiers_xpath")
readarray -t fullnames < <(xpath "$license_tmp" "$fullnames_xpath")

# Both arrays are filled, so the page is no longer needed. Removing it
# before the sanity checks keeps the early exits below from leaking it.
rm -f -- "$license_tmp"

# Sanity checks on the parsed result: the two columns must pair up and
# must not be empty.
if (( ${#identifiers[@]} != ${#fullnames[@]} )); then
    # The table was read from $LICENSES (not from the spreadsheet).
    err "Number of identifiers and fullnames read from '$LICENSES' doesn't match."
    err "identifiers read: ${#identifiers[@]}"
    err "fullnames read: ${#fullnames[@]}"
    exit 2
elif (( ${#identifiers[@]} == 0 )); then
    err "No license read"
    exit 3
fi

# bullshit - tell me why?!

#for i in "${identifiers[@]}"; do
#    echo "$i	$i" >> licenses_changes.ntxt
#    echo "$i+	$i+" >> licenses_changes.ptxt
#done

# Duplicate detection.
#
# Short names: take column 1 of the spreadsheet together with all parsed
# identifiers, de-duplicate, strip a leading 'SUSE-' and report whatever
# appears more than once afterwards.
short_dups=$(
    {
        cut -d$'\t' -f1 licenses_changes.ntxt
        printf '%s\n' "${identifiers[@]}"
    } | sort -u | sed 's@^SUSE-@@' | sort | uniq -d
)

# Descriptions: take column 2 of the spreadsheet together with all parsed
# identifiers and report every line that occurs more than once.
desc_dups=$(
    {
        cut -d$'\t' -f2 licenses_changes.ntxt
        printf '%s\n' "${identifiers[@]}"
    } | sort | uniq -d
)

# Both findings are concatenated directly, without a separator.
dups="${short_dups}${desc_dups}"

# Any duplicate aborts the run with exit status 1.
if [[ -n "$dups" ]]; then
    printf '%s\n' "DUPS $dups"
    exit 1
fi


# Generate README.md: a fixed introduction, one table row per parsed SPDX
# license, then a table of the SUSE- identifiers found in column 1 of
# licenses_changes.ntxt.
{
    # Quoted delimiter: the text is emitted literally.
    cat <<'EOF'

This is the git for openSUSE:Tools/obs-service-format_spec_file
It happens to be *the* repository for valid licenses to be used in openSUSE spec files

# [SPDX Licenses](http://spdx.org/licenses)

License Tag | Description
----------- | -----------
EOF

    # identifiers[n] and fullnames[n] describe the same license.
    for idx in "${!identifiers[@]}"; do
        printf '%s | %s\n' "${identifiers[idx]}" "${fullnames[idx]}"
    done

    cat <<'EOF'
# SUSE Additions

|License Tag|
|-----------|
EOF

    # One sorted, unique "|SUSE-xxx|" row per SUSE- prefixed identifier.
    sed -n 's@^\(SUSE-[^[:blank:]]*\)\t.*@|\1|@p' licenses_changes.ntxt \
        | sort -u
} > README.md

# Generate licenses_changes.txt: the literal header "First line" followed
# by the sorted, de-duplicated union of both intermediate files.
printf '%s\n' "First line" > licenses_changes.txt
cat licenses_changes.ntxt licenses_changes.ptxt | sort -u >> licenses_changes.txt

# The intermediate files are not needed any more.
rm licenses_changes.ptxt licenses_changes.ntxt
	#!/bin/bash

	####
	# Configuration
	#####
	DOCS="https://docs.google.com/spreadsheet/pub?hl=en_US&hl=en_US&key=0AqPp4y2wyQsbdGQ1V3pRRDg5NEpGVWpubzdRZ0tjUWc&single=true&gid=0&output=txt"
	LICENSES="http://spdx.org/licenses/"


	# upper table contains <td align="center"> and the other does not so I will use it to distinguish between them
	identifiers_xpath='cat //tr/td[@align="center"]/../td/code/text()'
	fullnames_xpath='cat //tr/td[@align="center"]/../td/a[@rel]/text()'

	# simple output to stderr
	err() {
	    # Print all arguments, joined by single spaces, as one line on stderr.
	    #
	    # printf is used instead of echo so that a message such as "-n" or
	    # "-e" is printed literally rather than being taken as an echo
	    # option, and the output goes to file descriptor 2 directly instead
	    # of re-opening /dev/stderr.
	    local IFS=' '
	    printf '%s\n' "$*" >&2
	}

	fetch_or_die() {
	    # Write the content of URL $1 to stdout; on download failure report
	    # the error through err and exit with status 10.
	    #
	    # Globals read: DOCS, LICENSES
	    local url=$1
	    local fixture=

	    # DEBUG aid: when a file named DOCS or LICENSES exists in the current
	    # directory, it is served instead of downloading the matching URL.
	    # Without such a file the real download is performed (previously the
	    # call produced nothing in that case).
	    case "$url" in
	        "$DOCS")     fixture=DOCS ;;
	        "$LICENSES") fixture=LICENSES ;;
	    esac
	    if [[ -n "$fixture" && -r "$fixture" ]]; then
	        cat -- "$fixture"
	        return
	    fi

	    # --fail turns HTTP error responses into a non-zero exit status
	    # instead of passing the error page on as data; --location follows
	    # redirects.
	    if ! curl -# --fail --location "$url" -o -; then
	        err "Couldn't fetch '$url'"
	        exit 10
	    fi
	}

	xpath() {
	    # Feed one command to the xmllint shell for an HTML file and print
	    # the resulting lines.
	    #   $1  path of the HTML file to load
	    #   $2  command line sent to the xmllint shell on stdin
	    local html_file=$1
	    local shell_cmd=$2

	    # Real pipes (not the escaped '\|' text, which handed the pipe
	    # symbol and grep to xmllint as arguments): lines starting with the
	    # ' -------' separator or the '/ >' prompt are dropped.
	    printf '%s\n' "$shell_cmd" \
	        | xmllint --html --shell "$html_file" \
	        | grep -vE '^( -------|/ >)'
	}

	export LC_ALL=C

	#####
	# Step 1 - fetch the license spreadsheet
	##

	# Download the spreadsheet, delete every line containing 'New format' and
	# strip trailing blanks from the remaining lines. The pipe must be a real
	# '|'; the escaped '\|' handed the pipe symbol and sed to fetch_or_die as
	# arguments.
	fetch_or_die "$DOCS" | sed '/New format/d; s@[[:blank:]]*$@@' > licenses_changes.ntxt

	# fetch_or_die runs as a pipeline stage here, i.e. in a subshell, so its
	# `exit` cannot terminate this script. Inspect the stage status and stop
	# explicitly, removing the incomplete output file.
	fetch_status=${PIPESTATUS[0]}
	if (( fetch_status != 0 )); then
	    rm -f licenses_changes.ntxt
	    exit 10
	fi

	# For every line whose first column is a 'SUSE-' prefixed identifier, emit
	#   <identifier>+ TAB <identifier>+
	sed -n 's@^\(SUSE-[^[:blank:]]\+\)\t.*@\1+\t\1+@p' licenses_changes.ntxt > licenses_changes.ptxt

	#return 2> /dev/null \|\| exit

	#####
	# Step 2 - download spdx license table
	##

	# Temporary file holding the downloaded HTML page.
	license_tmp=$(mktemp)

	# Fetch the HTML page with the license table.
	fetch_or_die "$LICENSES" > "$license_tmp"

	# Extract identifiers and full names from the HTML, one array element per
	# output line of the xpath helper.
	readarray -t identifiers < <(xpath "$license_tmp" "$identifiers_xpath")
	readarray -t fullnames < <(xpath "$license_tmp" "$fullnames_xpath")

	# Both arrays are filled, so the page is no longer needed. Removing it
	# before the sanity checks keeps the early exits below from leaking it.
	rm -f -- "$license_tmp"

	# Sanity checks on the parsed result: the two columns must pair up and
	# must not be empty.
	if (( ${#identifiers[@]} != ${#fullnames[@]} )); then
	    # The table was read from $LICENSES (not from the spreadsheet).
	    err "Number of identifiers and fullnames read from '$LICENSES' doesn't match."
	    err "identifiers read: ${#identifiers[@]}"
	    err "fullnames read: ${#fullnames[@]}"
	    exit 2
	elif (( ${#identifiers[@]} == 0 )); then
	    err "No license read"
	    exit 3
	fi

	# bullshit - tell me why?!

	#for i in "${identifiers[@]}"; do
	# echo "$i $i" >> licenses_changes.ntxt
	# echo "$i+ $i+" >> licenses_changes.ptxt
	#done

	# Duplicate detection (pipelines restored from the escaped '\|' form).
	#
	# Short names: take column 1 of the spreadsheet together with all parsed
	# identifiers, de-duplicate, strip a leading 'SUSE-' and report whatever
	# appears more than once afterwards.
	dups="$(
	    {
	        cut -d$'\t' -f1 licenses_changes.ntxt
	        printf '%s\n' "${identifiers[@]}"
	    } | sort -u | sed 's@^SUSE-@@' | sort | uniq -d)"

	# Descriptions: take column 2 of the spreadsheet together with all parsed
	# identifiers and report every line that occurs more than once. The
	# result is appended directly to the short-name findings.
	dups="${dups}$(
	    {
	        cut -d$'\t' -f2 licenses_changes.ntxt
	        printf '%s\n' "${identifiers[@]}"
	    } | sort | uniq -d)"

	# Any duplicate aborts the run with exit status 1.
	if [[ -n "$dups" ]]; then
	    printf '%s\n' "DUPS $dups"
	    exit 1
	fi


	# Generate README.md: a fixed introduction, one table row per parsed SPDX
	# license, then a table of the SUSE- identifiers found in column 1 of
	# licenses_changes.ntxt.
	{
	    # '<<-' strips leading tabs, so the tab-indented text and terminator
	    # below are read as if they started at column 0; the quoted
	    # delimiter keeps the text literal.
	    cat <<-'EOF'

	This is the git for openSUSE:Tools/obs-service-format_spec_file
	It happens to be *the* repository for valid licenses to be used in openSUSE spec files

	# [SPDX Licenses](http://spdx.org/licenses)

	License Tag | Description
	----------- | -----------
	EOF

	    # identifiers[n] and fullnames[n] describe the same license.
	    for i in "${!identifiers[@]}"; do
	        printf '%s | %s\n' "${identifiers[i]}" "${fullnames[i]}"
	    done

	    cat <<-'EOF'
	# SUSE Additions

	|License Tag|
	|-----------|
	EOF

	    # One sorted, unique "|SUSE-xxx|" row per SUSE- prefixed identifier.
	    # The '*' quantifiers are required: without them only a single
	    # character after 'SUSE-' would match.
	    sed -n 's@^\(SUSE-[^[:blank:]]*\)\t.*@|\1|@p' licenses_changes.ntxt | sort -u
	} > README.md

	# Generate licenses_changes.txt: the literal header "First line" followed
	# by the sorted, de-duplicated union of both intermediate files.
	{
	    echo "First line"
	    # A real pipe is needed here; with the escaped '\|' cat tried to open
	    # files named '|', 'sort' and '-u'.
	    cat licenses_changes.ntxt licenses_changes.ptxt | sort -u
	} > licenses_changes.txt

	# The intermediate files are not needed any more.
	rm licenses_changes.{p,n}txt