chris-asl/extract-queries.sh

## extract-queries.sh
#!/bin/sh
#
# Extract search queries (the "p" URL parameter) from web-server access-log
# lines.
#
# Usage:  extract-queries.sh < access.log
# Input:  log lines on standard input.
# Output: one URL-decoded query per line on standard output.
# Needs:  a grep that supports -P (PCRE), plus python3 or python on PATH.
#
# "set -e" is deliberately not used: grep exits with status 1 when it selects
# no line, which is a normal outcome here.

# Keep a line if it contains cc=hep, or if it contains no cc= at all (the
# default collection is hep). The filter is applied case-insensitively.
# NOTE(review): "cc=hep" is not anchored at its end, so a longer value that
# merely starts with "hep" passes too -- confirm that this is acceptable.
FILTER_HEP_ONLY='(cc=hep)|^(?!.*cc=)'

# 1. Capture everything that follows "p=" up to the next "&". Double quotes
# are excluded from the captured text so that a match can never run past the
# end of the quoted request/referer field it started in.
# e.g.
# 95.233.191.71 - - [24/Nov/2015:11:48:32 +0100] "GET /search?ln=en&ln=en&p=f+a+greco%2C+mario&of=hcs&action_search=Search&sf=earliestdate&so=d&rm=&rg=25&sc=0 HTTP/1.1" 200 21882 "http://inspirehep.net/search?ln=en&p=f+a+greco%2C+mario&jrec=26&sf=earliestdate" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:42.0) Gecko/20100101 Firefox/42.0" 307139
#
# 2. The positive lookahead requires the query to be followed by "&". An
# occurrence that is the last parameter of a quoted URL (i.e. followed by '"')
# is therefore skipped.
# NOTE(review): grep -o prints every match on a line, so a referer that
# carries the query followed by "&" (as in the example above) is emitted as a
# second copy of the same query.
#
# 3. The {2,} quantifier drops one-character queries, which are probably
# "hacking" attempts, like "p=1".
MATCH_ONLY_QUERY_PART='(?<=p=)([^&"]{2,})(?=&)'

# Prefer python3; fall back to whatever "python" is. The decoder program below
# runs unchanged on Python 2 and Python 3.
if command -v python3 >/dev/null 2>&1; then
  PYTHON_BIN=python3
else
  PYTHON_BIN=python
fi

# URL-decodes standard input line by line ("+" becomes a space, then %XX
# escapes are expanded). It works on raw bytes so that log lines in any
# encoding pass through untouched, and it writes no extra trailing newline.
URL_DECODE_PY=$(
  cat <<'EOF'
import sys
if sys.version_info[0] >= 3:
    from urllib.parse import unquote_to_bytes

    def decode(raw):
        return unquote_to_bytes(raw.replace(b"+", b" "))

    source, sink = sys.stdin.buffer, sys.stdout.buffer
else:
    from urllib import unquote_plus as decode

    source, sink = sys.stdin, sys.stdout
for line in source:
    sink.write(decode(line))
EOF
)

# Reads log lines from standard input and writes the decoded queries to
# standard output. The exit status is the one of the last grep in the pipeline.
extract_queries() {
  # The patterns are quoted: they contain "?", "*" and "[...]", which the
  # shell would otherwise try to expand as filename globs.
  grep -Pi -e "${FILTER_HEP_ONLY}" |
    grep -Pos -e "${MATCH_ONLY_QUERY_PART}" |
    "${PYTHON_BIN}" -c "${URL_DECODE_PY}" |
    # Filter out queries that after url decoding are exactly "$" (-x: whole
    # line, -F: literal string). Queries that merely contain "$" are kept.
    # e.g.
    # 137.138.201.151 - - [12/Nov/2014:08:41:14 +0100] "GET /search?ln=en&ln=en&p=%24&of=hb&action_search=Search&sf=earliestdate&so=d&rm=&rg=25&sc=0 HTTP/1.1" 200 68323 "http://inspirelb.cern.ch/search?ln=en&p=&of=hb&action_search=Search&sf=earliestdate&so=d" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36" 584134
    grep -vxF -e '$'
}

extract_queries
	#!/bin/sh
	#
	# Extract search queries (the "p" URL parameter) from web-server access-log
	# lines.
	#
	# Input:  log lines on standard input.
	# Output: one URL-decoded query per line on standard output.
	# Needs:  a grep that supports -P (PCRE), plus python3 or python on PATH.
	#
	# "set -e" is deliberately not used: grep exits with status 1 when it selects
	# no line, which is a normal outcome here.

	# Keep a line if it contains cc=hep, or if it contains no cc= at all (the
	# default collection is hep). The filter is applied case-insensitively.
	# The alternation must be a bare "|": in PCRE a backslash-escaped pipe is a
	# literal pipe character, not an alternation.
	# NOTE(review): "cc=hep" is not anchored at its end, so a longer value that
	# merely starts with "hep" passes too -- confirm that this is acceptable.
	FILTER_HEP_ONLY='(cc=hep)|^(?!.*cc=)'

	# 1. Capture everything that follows "p=" up to the next "&". Double quotes
	# are excluded from the captured text so that a match can never run past the
	# end of the quoted request/referer field it started in.
	# e.g.
	# 95.233.191.71 - - [24/Nov/2015:11:48:32 +0100] "GET /search?ln=en&ln=en&p=f+a+greco%2C+mario&of=hcs&action_search=Search&sf=earliestdate&so=d&rm=&rg=25&sc=0 HTTP/1.1" 200 21882 "http://inspirehep.net/search?ln=en&p=f+a+greco%2C+mario&jrec=26&sf=earliestdate" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:42.0) Gecko/20100101 Firefox/42.0" 307139
	#
	# 2. The positive lookahead requires the query to be followed by "&". An
	# occurrence that is the last parameter of a quoted URL (i.e. followed by '"')
	# is therefore skipped.
	# NOTE(review): grep -o prints every match on a line, so a referer that
	# carries the query followed by "&" (as in the example above) is emitted as a
	# second copy of the same query.
	#
	# 3. The {2,} quantifier drops one-character queries, which are probably
	# "hacking" attempts, like "p=1".
	MATCH_ONLY_QUERY_PART='(?<=p=)([^&"]{2,})(?=&)'

	# Prefer python3; fall back to whatever "python" is. The decoder program below
	# runs unchanged on Python 2 and Python 3.
	if command -v python3 >/dev/null 2>&1; then
		PYTHON_BIN=python3
	else
		PYTHON_BIN=python
	fi

	# URL-decodes standard input line by line ("+" becomes a space, then %XX
	# escapes are expanded). It works on raw bytes so that log lines in any
	# encoding pass through untouched, and it writes no extra trailing newline.
	# "<<-" strips the leading tabs, so the Python code itself is indented with
	# spaces only.
	URL_DECODE_PY=$(
		cat <<-'EOF'
		import sys
		if sys.version_info[0] >= 3:
		    from urllib.parse import unquote_to_bytes

		    def decode(raw):
		        return unquote_to_bytes(raw.replace(b"+", b" "))

		    source, sink = sys.stdin.buffer, sys.stdout.buffer
		else:
		    from urllib import unquote_plus as decode

		    source, sink = sys.stdin, sys.stdout
		for line in source:
		    sink.write(decode(line))
		EOF
	)

	# Reads log lines from standard input and writes the decoded queries to
	# standard output. The exit status is the one of the last grep in the pipeline.
	extract_queries() {
		# The patterns are quoted: they contain "?", "*" and "[...]", which the
		# shell would otherwise try to expand as filename globs.
		grep -Pi -e "${FILTER_HEP_ONLY}" |
			grep -Pos -e "${MATCH_ONLY_QUERY_PART}" |
			"${PYTHON_BIN}" -c "${URL_DECODE_PY}" |
			# Filter out queries that after url decoding are exactly "$" (-x: whole
			# line, -F: literal string). Queries that merely contain "$" are kept.
			# e.g.
			# 137.138.201.151 - - [12/Nov/2014:08:41:14 +0100] "GET /search?ln=en&ln=en&p=%24&of=hb&action_search=Search&sf=earliestdate&so=d&rm=&rg=25&sc=0 HTTP/1.1" 200 68323 "http://inspirelb.cern.ch/search?ln=en&p=&of=hb&action_search=Search&sf=earliestdate&so=d" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36" 584134
			grep -vxF -e '$'
	}

	extract_queries