Created
June 27, 2017 22:10
-
-
Save pbhj/4dedca1e980d6a102433403c0f435524 to your computer and use it in GitHub Desktop.
bookmarkive filter script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# bookmarkive filter script
#
# Reads URLs (one per line) from stdin, normalises them, drops unwanted
# entries and writes the remaining unique lines, sorted, to stdout.
#
# Usage: bookmarkive-filter < urls.txt > filtered.txt
set -e

# Normalise each line:
#   - strip hash endings ("#..." fragments; this will kill twitter-style
#     "#!" URLs)
#   - remove session id strings ("&sid=..." through end of line)
sed -e 's/#.*//' -e 's/&sid=.*$//' |
  # Disabled: remove terminal /; but why?
  #   sed -e 's/\/$//' |
  # Disabled: adhoc removal of urls based on keywords
  #   grep -i -v -e 4chan -e facebook -e watchseries -e videoweed -e tesco \
  #     -e dropbox -e flickr -e 'block\.opendns' |
  # Drop non-web schemes, 192.* addresses over http and flash media.
  # -i is --ignore-case, -v is --invert-match.
  grep -i -v \
    -e '^javascript' \
    -e '^file' \
    -e '^about' \
    -e '^http://192\.' \
    -e '\.swf' \
    -e '\.flv' |
  # Sort and de-duplicate last: stripping endings can turn distinct input
  # lines into identical ones that are not adjacent in the original order,
  # which a plain uniq would miss. LC_ALL=C gives byte-wise, reproducible
  # comparison.
  LC_ALL=C sort -u
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment