bmschmidt/923dce0330d72486ee8d ("Starting to make")
Last active December 18, 2015
grams=3
corpus="eng-all"
searchstring="attention"
simultaneousDownloads="16"
# THIS PROCESS IS EXTREMELY RESOURCE-INTENSIVE -- DO NOT RUN IT ON A LARK, OR IF YOU DON'T UNDERSTAND WHAT IT DOES,
# BECAUSE WASTING ENERGY AND BANDWIDTH IS BAD FOR THE ENVIRONMENT.
# To ensure you don't run it trivially, I've included an obvious command in the code that quickly stops the download.
# If you can't find or fix this, you probably shouldn't be running the script!
# This simultaneously downloads ngrams files, up to 16 at a time, and while reading them reduces them down to just the lines
# that include a search string (actually, a regular expression) you define.
# It uses the 2009 version of ngrams because the 2011 one includes a very large number of parts of speech. Changing the script around will download the longer one.
# Running on 16 processes from a computer receiving 40 MB a second, it takes about an hour to run on the 3-grams.
# I'm not sure, but it appears to be processor-bound on a 3.06 GHz Core 2 Duo iMac from 2010.
# This is a lot of data to stream. The 3-grams are about 70 GB compressed: Netflix is something like 2 GB an hour, so this is the
# equivalent of streaming all four seasons of Arrested Development in a row. I wouldn't do it over wifi.
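The bandwidth estimate above can be sanity-checked in the shell. This uses the post's own round figures (70 GB corpus, 2 GB per streamed hour), not measured values:

```shell
# Rough video-streaming equivalent of pulling the whole 3-gram corpus.
compressed_gb=70        # approximate compressed size of the 2009 3-grams, per the note above
netflix_gb_per_hour=2   # rough streaming rate cited above
echo "$(( compressed_gb / netflix_gb_per_hour )) hours of video-equivalent traffic"
# prints "35 hours of video-equivalent traffic"
```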
curl -L --silent books.google.com/ngrams/datasets | # Get Google's list of the sets
perl -ne 'if ($_ =~ m/.*(http.*\.zip).*/) {print $1 . "\n"}' | # Pull out just the URLs of the files from that
grep "$corpus-${grams}gram-2009" | # Only use the corpus we're interested in
sed "s/\\.zip//g;s/.*-//g" | # Get the name of the file, without all the verbiage around it
head -1 | xargs -P $simultaneousDownloads -n 1 -I fileName sh -c "
curl -L --silent \"http://storage.googleapis.com/books/ngrams/books/googlebooks-${corpus}-${grams}gram-20090715-fileName.zip\" |
gunzip -c |
grep \"$searchstring\" > fileName.output; echo 'done with fileName'; "
cat *.output > myngrams.txt
zip -m myngrams.txt.zip myngrams.txt
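To see what the filename-extraction step of the pipeline produces without downloading anything, you can run the same sed expression against a sample zip URL. The URL below is only an illustrative guess at the shape of what the datasets page lists; the real list is scraped from books.google.com/ngrams/datasets:

```shell
# The sed step strips ".zip" and everything up to the last hyphen,
# leaving just the file index that xargs splices back into the download URL.
url="http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-3gram-20090715-0.zip"
echo "$url" | sed "s/\.zip//g;s/.*-//g"
# prints "0"
```

`xargs -I fileName` then substitutes each such index into the storage URL template, so the extraction only has to recover the one part of the URL that varies between files.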