natural language processing with shell
# extract all URLs from a text file
cat file.txt | egrep -o 'https?://[^ ]+' | sed -e 's/^https/http/' | sed -e 's/\W\+$//' | sort | uniq -c | sort -bnr

# extract domains from URLs found in a text file
cat file.txt | egrep -o 'https?://[^ ]+' | sed -e 's/^https/http/' | sed -e 's/\W\+$//' | sed -e 's/^http:\/\///' | sed -e 's/\/.*$//' | sort | uniq -c | sort -bnr

# extract email addresses
cat file.txt | grep -i -o '[A-Z0-9._%+-]\+@[A-Z0-9.-]\+\.[A-Z]\{2,4\}' | sort | uniq -c | sort -bnr

# list all words in a text file
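# the gist preview truncates here and the author's command is lost; a plausible
# completion in the same style follows. The tr-based tokenization (lower-case
# everything, squeeze every run of non-letters into a newline) is an
# assumption, not the original command.
cat file.txt | tr '[:upper:]' '[:lower:]' | tr -cs '[:alpha:]' '\n' | sort | uniq -c | sort -bnr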
tika2text.sh
#!/bin/bash

# tika2text.sh - given a directory, recursively extract text from files
# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame, distributed under a GNU Public License
# March 27, 2017 - a second cut; works with a directory
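# the body of the script is truncated in this preview; what follows is a
# minimal sketch of the sort of loop it likely contains, not the original.
# the jar name (tika-app.jar) and the output directory (./text) are assumptions
DIRECTORY="$1"
OUTPUT='./text'
mkdir -p "$OUTPUT"
find "$DIRECTORY" -type f | while read -r FILE; do

	# Tika's --text switch writes extracted plain text to standard output
	java -jar tika-app.jar --text "$FILE" > "$OUTPUT/$( basename "$FILE" ).txt"

done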
gist:8984187
sub extracter {

	# given a (CrossRef) DOI, parse the Link header of an HTTP request to get full-text URLs
	# see also: https://prospect.crossref.org/splash/
	# Eric Lease Morgan <emorgan@nd.edu>
	# February 12, 2014 - first cut

	# require
	use HTTP::Request;
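The preview truncates the rest of the subroutine. As a self-contained sketch of the same idea, and not the original body, something like this fetches a DOI's Link header and harvests the bracketed URLs; the placeholder DOI, the use of LWP::UserAgent, and the simplified parsing are all assumptions:

use strict;
use warnings;
use LWP::UserAgent;

# hypothetical DOI, for illustration only
my $doi      = '10.5555/12345678';
my $ua       = LWP::UserAgent->new;
my $response = $ua->head( "https://doi.org/$doi" );

# full-text locations, when present, arrive as <url>; rel="..." clauses in the Link header
my $link = $response->header( 'Link' ) || '';
my @urls = ( $link =~ /<([^>]+)>/g );
print "$_\n" foreach @urls;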
gist:8438082
sub slurp {

	# read the named file into a single scalar and return it
	my $f = shift;
	open( my $fh, '<', $f ) or die "Can't open $f: $!\n";
	my $r = do { local $/; <$fh> };    # undefining $/ reads the whole file at once
	close $fh;
	return $r;

}
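A hypothetical call, reading an entire file into one scalar:

my $text = slurp( 'file.txt' );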