This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Extract all URLs from a document.
#
# Usage:
#
#   extractURLs.sh fileIn fileOut
#
#   fileIn   document to extract URLs from (passed to Apache Tika)
#   fileOut  file that receives the sorted, de-duplicated list of URLs
#
# Steps:
#
# 1. Use Apache Tika to extract the document's text to a temporary
#    plain-text file
# 2. Use xurls to extract URLs from the Tika output (and use sort to
#    remove duplicates)
#
# Exit status:
#
# * 0 on success
# * 1 on a usage error, or when the temporary file cannot be created,
#   or when the Tika or xurls/sort step fails
#
# Dependencies:
#
# * Java (needed by Apache Tika)
# * Apache Tika: https://tika.apache.org/
# * Go (needed by xurls): https://golang.org/
# * xurls: https://github.com/mvdan/xurls
# * sort: https://linux.die.net/man/1/sort
# * mktemp: https://linux.die.net/man/1/mktemp

# Treat unset variables as errors, and make a pipeline fail when any of
# its stages fails (so an xurls failure is not masked by sort).
set -u -o pipefail

# Display usage message if command line does not contain expected
# number of arguments
if [ "$#" -ne 2 ] ; then
  echo "Usage: extractURLs.sh fileIn fileOut" >&2
  exit 1
fi

# Location of Tika JAR
tikaJar=~/tika/tika-app-1.16.jar

# File I/O
fileIn="$1"
fileOut="$2"

# Tika output file. A unique temporary file is used instead of a name
# derived from fileIn, so that an existing file in the working directory
# is never overwritten and concurrent runs cannot collide.
fOutTika=$(mktemp) || {
  echo "extractURLs.sh: cannot create temporary file" >&2
  exit 1
}

# Clean up the temporary file on every exit path, including failures
trap 'rm -f -- "$fOutTika"' EXIT

# Extract text
if ! java -jar "$tikaJar" --text "$fileIn" > "$fOutTika" ; then
  echo "extractURLs.sh: Tika text extraction failed for: $fileIn" >&2
  exit 1
fi

# Extract urls, then remove duplicate entries
if ! xurls "$fOutTika" | sort -u > "$fileOut" ; then
  echo "extractURLs.sh: URL extraction failed; could not write: $fileOut" >&2
  exit 1
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment