codieboomboom/4 - Data Wrangling

## 4 - Data Wrangling
# Find words that have at least 3 "A" or "a" and do not end with 's.
# What is the most common 2-character ending?
#
# Arguments: $1 - word list to scan, one word per line
#                 (default: /usr/share/dict/words)
# Outputs:   "count ending" lines on stdout, ordered by ascending count, so
#            the most common ending is the last line
most_common_endings() {
  local words_file=${1:-/usr/share/dict/words}

  # '[Aa].*[Aa].*[Aa]' accepts three a's anywhere in the word; the pattern is
  # deliberately not tied to the end of the line.
  # The 's exclusion is its own grep -v so it tests the final two characters.
  # sort -n orders by the numeric count that uniq -c puts in front.
  grep -E '[Aa].*[Aa].*[Aa]' -- "$words_file" \
    | grep -v "'s\$" \
    | sed -E 's/.*(..)$/\1/' \
    | sort \
    | uniq -c \
    | sort -n
}

most_common_endings
# Calculate the average kernel start-up time from the journalctl log.
#
# Reads journal text on stdin, keeps the "Startup finished in ... (kernel)"
# lines, extracts the kernel figure and prints the mean in seconds with three
# decimals. Prints nothing when no line can be parsed. Only figures written
# as seconds with a decimal point (e.g. "3.204s") are recognised.
average_kernel_startup() {
  # sed -n with the p flag emits a line only when the substitution matched,
  # so unparsed lines never reach the average. The [^0-9.] guard stops the
  # greedy ".*" from swallowing the leading digits of the kernel figure, and
  # the figure is matched wherever it sits because firmware/loader times can
  # come before it.
  grep 'Startup finished' \
    | sed -nE 's/^.*[^0-9.]([0-9]+\.[0-9]+)s \(kernel\).*$/\1/p' \
    | LC_ALL=C awk '{ total += $1 } END { if (NR > 0) printf "%.3f\n", total / NR }'
}

journalctl | average_kernel_startup
# Syntax for in-place substitution: -i makes sed write the edited text back
# into input.txt instead of printing it to stdout.
# NOTE(review): this is the GNU sed form; BSD/macOS sed expects -i '' instead.
sed -i 's/regrex/substitution/' input.txt

# List the messages that are not common amongst the past 3 boots.

# Print the messages that occur fewer than 3 times in a combined boot log.
#
# Arguments: $1 - log file to analyse (default: boot_log.txt)
# Outputs:   "count message" lines on stdout, rarest first
uncommon_boot_messages() {
  local log_file=${1:-boot_log.txt}

  # Step 2: strip the leading "Mon DD HH:MM:SS" timestamp, the field that
  #         varies between otherwise identical messages.
  # Step 3: sort, tally with uniq -c, then order numerically by the tally.
  # Step 4: keep tallies below 3 (assumption: a tally of 3 or more means the
  #         message showed up in more than one boot).
  sed -E 's/^[A-Za-z]{3} [0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}//' "$log_file" \
    | sort \
    | uniq -c \
    | sort -n \
    | awk '$1 < 3'
}

# Step 1: collect the log of the current boot and of the two before it.
# The redirection sits on the whole loop: with ">" inside the loop body the
# file would be truncated on every pass and only the last boot would remain.
for i in 0 -1 -2; do
  journalctl -b "$i"
done > boot_log.txt

uncommon_boot_messages boot_log.txt

# Data wrangling from the FBI dataset.
fbi_table_url='https://ucr.fbi.gov/crime-in-the-u.s/2016/crime-in-the-u.s.-2016/topic-pages/tables/table-1'

# Download the page once so that every column is cut from the same copy,
# instead of issuing one request per column.
fbi_page=$(mktemp)
curl -fsS "$fbi_table_url" > "$fbi_page"

# Step 1: extract the first column (th.group0 cells) into one file.
# The regular expressions are quoted so the shell cannot expand them as globs
# against files in the current directory.
pup -c 'th.group0 text{}' < "$fbi_page" | grep -E '[0-9A-Za-z]{4}' > extracted_data.txt

# Step 2: extract the second column into one file (make sure there are no
# blank entries): first the th.group1 cell text, then the td.group1 values.
{
  pup -c 'th.group1 text{}' < "$fbi_page" | grep -E '[A-Za-z]'
  pup -c 'td.group1 text{}' < "$fbi_page" | grep -E '[0-9]'
} > temp.txt

# Step 3: paste the two columns together; the result replaces
# extracted_data.txt only when paste succeeded.
paste extracted_data.txt temp.txt > final.txt && mv final.txt extracted_data.txt

rm -f -- "$fbi_page"

# Alternative pipeline: pair up the group0/group1 cells in a single pipeline.
# The regular expressions are quoted so the shell cannot expand them as globs
# against files in the current directory.
# awk 'NF' removes blank lines, while the last awk binds the fields together
# by joining every odd line with the line after it, based on this thread:
# https://stackoverflow.com/questions/14067523/moving-every-second-row-to-a-new-column-with-awk
curl 'https://ucr.fbi.gov/crime-in-the-u.s/2016/crime-in-the-u.s.-2016/topic-pages/tables/table-1' \
  | pup -c 'table' \
  | grep -E -C 1 'group[0-1][^0-9]' \
  | pup 'text{}' \
  | tr -s " \n" \
  | grep '[^-]' \
  | awk 'NF' \
  | awk '{printf "%s%s",$0,NR%2?FS:RS}'
	# Find words that have at least 3 "A" or "a" and do not end with 's.
	# What is the most common 2-character ending? (It is the last line printed.)
	# The 's exclusion is a separate grep -v; sort -n orders by the numeric tally.
	grep -E '[Aa].*[Aa].*[Aa]' /usr/share/dict/words \
		| grep -v "'s\$" \
		| sed -E 's/.*(..)$/\1/' \
		| sort \
		| uniq -c \
		| sort -n
	# Calculate the average kernel start-up time from the journalctl log.
	# sed -n ... p emits only the lines it could parse; the [^0-9.] guard keeps
	# the greedy ".*" from eating the first digits of the kernel figure.
	journalctl \
		| grep 'Startup finished' \
		| sed -nE 's/^.*[^0-9.]([0-9]+\.[0-9]+)s \(kernel\).*$/\1/p' \
		| LC_ALL=C awk '{ total += $1 } END { if (NR > 0) printf "%.3f\n", total / NR }'
	# Syntax for in-place substitution
	sed -i 's/regrex/substitution/' input.txt

	# List the messages that are not common amongst the past 3 boots.
	# Step 1: find and extract the messages from the past 3 boots. The
	# redirection is on the whole loop so that every boot ends up in the file.
	for i in 0 -1 -2; do
		journalctl -b "$i"
	done > boot_log.txt
	# Step 2: use sed to remove the varying field (the timestamp).
	# Step 3: sort, tally, then sort numerically by the tally.
	# Step 4: use awk to drop tallies of 3 and above (assumption: these come
	# from more than 1 boot).
	sed -E 's/^[A-Za-z]{3} [0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}//' boot_log.txt \
		| sort \
		| uniq -c \
		| sort -n \
		| awk '$1 < 3'

	# Data wrangling from the FBI dataset.
	# The page is downloaded once so that every column comes from the same copy.
	fbi_table_url='https://ucr.fbi.gov/crime-in-the-u.s/2016/crime-in-the-u.s.-2016/topic-pages/tables/table-1'
	fbi_page=$(mktemp)
	curl -fsS "$fbi_table_url" > "$fbi_page"
	# Step 1: extract the first column into one file (patterns are quoted so
	# the shell cannot glob-expand them).
	pup -c 'th.group0 text{}' < "$fbi_page" | grep -E '[0-9A-Za-z]{4}' > extracted_data.txt
	# Step 2: extract the second column into one file (make sure there are no
	# blank entries).
	pup -c 'th.group1 text{}' < "$fbi_page" | grep -E '[A-Za-z]' > temp.txt
	pup -c 'td.group1 text{}' < "$fbi_page" | grep -E '[0-9]' >> temp.txt
	# Step 3: paste them together.
	paste extracted_data.txt temp.txt > final.txt && mv final.txt extracted_data.txt
	rm -f -- "$fbi_page"

	# Alternative pipeline
	# awk 'NF' removes blank lines, while the last awk binds the fields together
	# by joining every odd line with the line after it, based on this thread:
	# https://stackoverflow.com/questions/14067523/moving-every-second-row-to-a-new-column-with-awk
	curl "$fbi_table_url" \
		| pup -c 'table' \
		| grep -E -C 1 'group[0-1][^0-9]' \
		| pup 'text{}' \
		| tr -s " \n" \
		| grep '[^-]' \
		| awk 'NF' \
		| awk '{printf "%s%s",$0,NR%2?FS:RS}'