bitsgalore/extractURLs.sh

## extractURLs.sh
#!/bin/bash

# Extract all URLs from a document.
#
# Usage: extractURLs.sh fileIn fileOut
#
#   fileIn   document handed to Apache Tika for text extraction
#   fileOut  file that receives the sorted URL list, duplicates removed
#
# Steps:
#
# 1. Use Apache Tika to extract text to a temporary plain text document
# 2. Use xurls to extract URLs from Tika output (and use sort to remove duplicates)
#
# Exit status: 1 if the argument count is wrong, the temporary file cannot
# be created, or Tika fails; otherwise the status of the final
# xurls | sort pipeline.
#
# Dependencies:
#
#  * Java (needed by Apache Tika)
#  * Apache Tika: https://tika.apache.org/
#  * Go (needed by xurls): https://golang.org/
#  * xurls:  https://github.com/mvdan/xurls
#  * sort: https://linux.die.net/man/1/sort
#  * mktemp (used for the intermediate Tika output file)

# Display usage message if command line does not contain expected
# number of arguments
if [ "$#" -ne 2 ] ; then
  echo "Usage: extractURLs.sh fileIn fileOut" >&2
  exit 1
fi

# Location of Tika JAR
tikaJar=~/tika/tika-app-1.16.jar

# File I/O
fileIn="$1"
fileOut="$2"

# Tika output file. A uniquely named temporary file is used so that an
# existing file in the working directory is never overwritten or deleted.
if ! fOutTika=$(mktemp "${TMPDIR:-/tmp}/extractURLs_Tika.XXXXXX") ; then
  echo "extractURLs.sh: cannot create temporary file" >&2
  exit 1
fi

# Clean up the temporary file on every exit path, including failures
trap 'rm -f -- "$fOutTika"' EXIT

# Extract text. Stop if Tika fails, so that an empty URL list is not
# mistaken for a successful run. All expansions are quoted so that paths
# containing spaces or glob characters are passed through intact.
if ! java -jar "$tikaJar" --text "$fileIn" > "$fOutTika" ; then
  echo "extractURLs.sh: Tika failed to extract text from '$fileIn'" >&2
  exit 1
fi

# Extract urls, then remove duplicate entries
xurls "$fOutTika" | sort -u > "$fileOut"
	#!/bin/bash

	# Extract all URLs from a document.
	#
	# Usage: extractURLs.sh fileIn fileOut
	#
	#   fileIn   document handed to Apache Tika for text extraction
	#   fileOut  file that receives the sorted URL list, duplicates removed
	#
	# Steps:
	#
	# 1. Use Apache Tika to extract text to a temporary plain text document
	# 2. Use xurls to extract URLs from Tika output (and use sort to remove duplicates)
	#
	# Exit status: 1 if the argument count is wrong, the temporary file cannot
	# be created, or Tika fails; otherwise the status of the final
	# xurls | sort pipeline.
	#
	# Dependencies:
	#
	# * Java (needed by Apache Tika)
	# * Apache Tika: https://tika.apache.org/
	# * Go (needed by xurls): https://golang.org/
	# * xurls: https://github.com/mvdan/xurls
	# * sort: https://linux.die.net/man/1/sort
	# * mktemp (used for the intermediate Tika output file)

	# Display usage message if command line does not contain expected
	# number of arguments
	if [ "$#" -ne 2 ] ; then
	  echo "Usage: extractURLs.sh fileIn fileOut" >&2
	  exit 1
	fi

	# Location of Tika JAR
	tikaJar=~/tika/tika-app-1.16.jar

	# File I/O
	fileIn="$1"
	fileOut="$2"

	# Tika output file. A uniquely named temporary file is used so that an
	# existing file in the working directory is never overwritten or deleted.
	if ! fOutTika=$(mktemp "${TMPDIR:-/tmp}/extractURLs_Tika.XXXXXX") ; then
	  echo "extractURLs.sh: cannot create temporary file" >&2
	  exit 1
	fi

	# Clean up the temporary file on every exit path, including failures
	trap 'rm -f -- "$fOutTika"' EXIT

	# Extract text. Stop if Tika fails, so that an empty URL list is not
	# mistaken for a successful run. All expansions are quoted so that paths
	# containing spaces or glob characters are passed through intact.
	if ! java -jar "$tikaJar" --text "$fileIn" > "$fOutTika" ; then
	  echo "extractURLs.sh: Tika failed to extract text from '$fileIn'" >&2
	  exit 1
	fi

	# Extract urls, then remove duplicate entries (a real pipe, not an
	# escaped "\|", so that the xurls output actually reaches sort)
	xurls "$fOutTika" | sort -u > "$fileOut"