natural language processing with shell
# extract all URLs from a text file
cat file.txt | egrep -o 'https?://[^ ]+' | sed -e 's/^https/http/' | sed -e 's/\W\+$//' | sort | uniq -c | sort -bnr

# extract domains from URLs found in a text file
cat file.txt | egrep -o 'https?://[^ ]+' | sed -e 's/^https/http/' | sed -e 's/\W\+$//' | sed -e 's/^http:\/\///' | sed -e 's/\/.*$//' | sort | uniq -c | sort -bnr

# extract email addresses
cat file.txt | grep -i -o '[A-Z0-9._%+-]\+@[A-Z0-9.-]\+\.[A-Z]\{2,4\}' | sort | uniq -c | sort -bnr

# list all words in a text file
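# the gist preview truncates here and the author's command is lost; a plausible
# completion in the same style follows. The tr-based tokenization (lower-case
# everything, squeeze every run of non-letters into a newline) is an
# assumption, not the original command.
cat file.txt | tr '[:upper:]' '[:lower:]' | tr -cs '[:alpha:]' '\n' | sort | uniq -c | sort -bnr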
tika2text.sh
#!/bin/bash

# tika2text.sh - given a directory, recursively extract text from files
# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame, distributed under a GNU Public License
# March 27, 2017 - a second cut; works with a directory
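# the body of the script is truncated in this preview; what follows is a
# minimal sketch of the sort of loop it likely contains, not the original.
# the jar name (tika-app.jar) and the output directory (./text) are assumptions
DIRECTORY="$1"
OUTPUT='./text'
mkdir -p "$OUTPUT"
find "$DIRECTORY" -type f | while read -r FILE; do

	# Tika's --text switch writes extracted plain text to standard output
	java -jar tika-app.jar --text "$FILE" > "$OUTPUT/$( basename "$FILE" ).txt"

done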
gist:8984187
sub extracter {

	# given a (CrossRef) DOI, parse the Link header of an HTTP request to get full-text URLs
	# see also: https://prospect.crossref.org/splash/
	# Eric Lease Morgan <emorgan@nd.edu>
	# February 12, 2014 - first cut

	# require
	use HTTP::Request;
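The preview truncates the rest of the subroutine. As a self-contained sketch of the same idea, and not the original body, something like this fetches a DOI's Link header and harvests the bracketed URLs; the placeholder DOI, the use of LWP::UserAgent, and the simplified parsing are all assumptions:

use strict;
use warnings;
use LWP::UserAgent;

# hypothetical DOI, for illustration only
my $doi      = '10.5555/12345678';
my $ua       = LWP::UserAgent->new;
my $response = $ua->head( "https://doi.org/$doi" );

# full-text locations, when present, arrive as <url>; rel="..." clauses in the Link header
my $link = $response->header( 'Link' ) || '';
my @urls = ( $link =~ /<([^>]+)>/g );
print "$_\n" foreach @urls;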
gist:8438082
sub slurp {

	# read the named file into a single scalar and return it
	my $f = shift;
	open( my $fh, '<', $f ) or die "Can't open $f: $!\n";
	my $r = do { local $/; <$fh> };    # undefining $/ reads the whole file at once
	close $fh;
	return $r;

}
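A hypothetical call, reading an entire file into one scalar:

my $text = slurp( 'file.txt' );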