a collection of cool bash scripts
# cool bash code snippets
# search a directory for all lines that match a pattern (not perfect but useful) ------
## e.g. grep searches for all lines matching "::" in `R/` to determine package calls
## -h hides the file names; -i ignores case
## sed -E uses extended regular expressions to match groups and keep only group 2;
## we then sort and deduplicate with -u
grep -hi :: -R R/* | sed -E 's/(.*)([ ]+[a-z]+::)(.*)/\2/g' | sort -u
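## a quick sanity check with a throwaway file (hypothetical /tmp paths):
mkdir -p /tmp/R && echo 'x <- dplyr::mutate(df)' > /tmp/R/demo.R
grep -hi :: -R /tmp/R/* | sed -E 's/(.*)([ ]+[a-z]+::)(.*)/\2/g' | sort -u # -> " dplyr::" (leading space captured by the group)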
# COUNT COLUMNS -----------------
## for each file, print its name and how many tab-delimited ($'\t')
## columns its first line has, then sort
find . -type f -exec awk -F $'\t' -v d={} '{print d, NF; exit}' {} \; | sort
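## a quick sanity check with a throwaway file (hypothetical /tmp path):
printf 'a\tb\tc\n' > /tmp/demo.tsv
find /tmp -name 'demo.tsv' -type f -exec awk -F $'\t' -v d={} '{print d, NF; exit}' {} \; # -> /tmp/demo.tsv 3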
# find and kill active ssh connections --------------
lsof -i -n | grep ssh | awk '!seen[$2]++' | awk '{print $2}' | while read -r line; do kill $line; done
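## same pipeline as a dry run: echo the PIDs instead of killing them
lsof -i -n | grep ssh | awk '!seen[$2]++' | awk '{print $2}' | while read -r line; do echo "would kill $line"; done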
# use awk to parse columns because `column` does poorly with empty space. -------------
## OFS = output field separator (we set it to comma-space, ', ');
## $1=$1 forces awk to rebuild the record with the new OFS, then we print the whole line $0
## to understand $1=$1, see point 27 of
## http://www.catonmat.net/blog/awk-one-liners-explained-part-two/
head -n 1000 some_file.txt | awk -F ',' '{OFS=", ";$1=$1; print $0}' | column -s $',' -t | less
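## a minimal demo of the $1=$1 trick (inline data, no file needed):
echo 'a,b,c' | awk -F ',' '{OFS=", "; $1=$1; print $0}' # -> a, b, c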
# CONVERT CSV TO PSV WITHOUT REMOVING COMMAS BETWEEN DOUBLE QUOTES ----------------
## the first awk line splits on double quotes; assuming evenly paired quotes, it takes
## every other (unquoted) field and replaces commas with pipes
## the sed chunk finds any pipe (|) and adds a space after it (| );
## this is because `column` incorrectly parses empty fields, i.e. || fails,
## but | | (with a space) does not. the trailing `1` is awk shorthand for the
## default action `{print $0}`
## the rest is standard, but we are now delimiting on '|' (pipes) not ','
awk -F'"' -v OFS='"' '{ for (i=1; i<=NF; i+=2) gsub(",", "|", $i) } 1' some_file_with_double_quotes.csv |
sed 's/|/| /g' |
column -s'|' -t |
less
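## a minimal demo of the quote-aware swap (inline data, no file needed):
printf 'x,"a,b",z\n' | awk -F'"' -v OFS='"' '{ for (i=1; i<=NF; i+=2) gsub(",", "|", $i) } 1' # -> x|"a,b"|z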
# DETERMINE IF TEXT EXISTS IN FIRST 5 LINES, PRINT FILE NAME IF NOT ----------------------
## create an empty file, then loop through psv files. if we do NOT find the word "value"
## in the first 5 lines, echo the file name to `list_files.txt`
touch list_files.txt
for i in *psv; do
  if ! head -n 5 "$i" | grep -q "value"; then echo "$i" >> list_files.txt; fi
done
# remove BOM at beginning of file ------------------
awk '{ gsub(/\xef\xbb\xbf/,""); print }' file_name.csv
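## to first check whether a file actually starts with a UTF-8 BOM (assumes `xxd` is available):
head -c 3 file_name.csv | xxd # a BOM shows up as "efbb bf"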
# for a single column of data, count occurrences of each unique value and sort by count ------------
awk -F'|' '{print $4}' some_file.psv | sort | uniq -c | sort -n | less
# COUNT BY DELIM; APPEND DELIM IF LESS THAN EXPECTED --------------
## count the fields split by the -F separator. if fewer than some value (e.g. 20),
## pad the line with the missing field separators;
## else print the line as is
## source: https://stackoverflow.com/questions/37295695/how-to-use-printf-to-print-a-character-multiple-times
## understanding `%*s` is explained here:
## https://www.gnu.org/software/gawk/manual/html_node/Format-Modifiers.html#Format-Modifiers
## the magic is in the first part:
## {s=sprintf("%*s",20-1-NF,""); gsub(/ /," |",s); print NF "\t" s FS $0}
## %*s takes its width from an argument: it pads the empty string "" to a
## width of 20-1-NF, i.e. prints 20-1-NF spaces (when 20-1-NF > 0).
## the `gsub` then replaces each space with the separator (" |" in this case),
## and the result is printed as `s`.
cat some_file.psv |
  awk -F'|' '{
    if (NF < 20) {
      s = sprintf("%*s", 20-1-NF, "")
      gsub(/ /, " |", s)
      print NF "\t" s FS $0
    } else {
      print NF "\t" $0
    }
  }'
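## a minimal demo of the %*s padding trick on its own:
awk 'BEGIN {s = sprintf("%*s", 5, ""); gsub(/ /, " |", s); print "[" s "]"}' # -> [ | | | | |]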
# FIND FILES THAT ARE SMALL AND REMOVE THEM -------------
## this avoids the "argument list too long" error from calling `ls` directly
## looks at the 5th column of `ls -l`, which has the file size in bytes.
## if below 100 bytes, print the path to the file;
## can then delete or do something to said files, like `xargs rm -f`
find -L ./data-raw/graphite -type f | xargs ls -l |
awk '$5 < 100 {print $9}' | xargs rm -f
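## an alternative that skips the `ls` parsing: use find's own size test
## (-size -100c means "smaller than 100 bytes"); preview first, then remove
find -L ./data-raw/graphite -type f -size -100c
find -L ./data-raw/graphite -type f -size -100c | xargs rm -f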
# COUNT DELIMITERS PER LINE -----------------------
## useful for seeing if you have delimiter errors, e.g. if you expect only three fields,
## you'd see exactly two pipes (||) per line;
## if you see more or fewer, something is wrong
## here, the delimiter is paired with '\n', e.g. pipe delim = '|\n'
## tr -d means delete; -c takes the complement of a list of characters,
## so we find and delete all characters (-d) EXCEPT
## pipes '|' and new lines '\n' (-c)
## we use `sort | uniq -c` to order and count instances
cat host-status-apps.psv | tr -d -c '|\n' | sort | uniq -c
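## a minimal demo (inline data): the second line is missing a field, so its count differs
printf 'a|b|c\nd|e\n' | tr -d -c '|\n' | sort | uniq -c # -> 1 "|" and 1 "||"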
# split a file into chunks (here 250M) --------------------
< big-file.csv parallel --pipe --block 250M 'cat > raw/chunks/big-file-chunk-{#}'
# same thing but faster
## a negative block size (-4) means each job slot gets that many blocks,
## resulting in e.g. 4*8 = 32 blocks to process across 8 jobs (-j8)
parallel --skip-first-line --pipepart --block -4 -j8 -a some-big-file.csv \
  'tee raw/chunks/some-big-file-chunk-{#} > /dev/null'
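## sanity check the split: total chunk lines should equal the original line count
## minus the header line skipped by --skip-first-line
wc -l raw/chunks/some-big-file-chunk-* | tail -n 1
wc -l some-big-file.csv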
# ITERATE URLS, SAVE CONTENT, APPENDING NUMS ------------------
## iterate through a file of urls, tmp.txt, then curl each and save to numbered files
## must run in `bash` so we can export the function for use with parallel
function curl_iter () { curl -s -g "$2" > "tbl$1.csv"; }
export -f curl_iter
cat -n tmp.txt | grep http | tr '\t' ',' | parallel --colsep="," curl_iter
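## a minimal demo of what the pipeline feeds to curl_iter (hypothetical urls):
printf 'https://example.com/a\nhttps://example.com/b\n' > tmp.txt
cat -n tmp.txt | grep http | tr '\t' ',' # -> "     1,https://example.com/a" etc.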
# FLATTEN SIMPLE JSON OBJECTS (NO NESTING) TO CSV ---------------------
## SOURCE: https://stackoverflow.com/a/32965227/3987905
curl 'https://jsonplaceholder.typicode.com/posts' |
  jq -r '(map(keys) | add | unique) as $cols |
    map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv'
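## a self-contained demo of the same filter (inline JSON, no network needed):
echo '[{"a":1,"b":2},{"b":4,"c":5}]' |
  jq -r '(map(keys) | add | unique) as $cols |
    map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv'
## -> "a","b","c" then 1,2, then ,4,5 (missing keys become empty fields)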
# BASIC CURL WITH STATUS CODE CHECK, LOG ---------------
## curl a URL, print status code and input (assumes single input)
curl_line() {
  line=$1
  URL="https://example.com/query?somevalue=${line}&format=JSON"
  # echo $URL
  req=$(curl -i -s "$URL")
  # the ^M below is a literal carriage return (type ctrl-v ctrl-m in the shell);
  # the sed range keeps lines up to the blank CRLF line that ends the http header,
  # and awk grabs field 2 (the status code) from the first line
  http_status=$(echo "$req" | sed -n '1,/^^M$/p' | awk 'NR==1{print $2}')
  echo $http_status $line | tee -a log-curl-line.txt
  if [ "$http_status" == "200" ]
  then
    echo "$req" | # want the raw, interpolated text of the variable echo'd
      # removes the http header by excluding lines with carriage returns (hidden ^M)
      grep -v $'\r' |
      tee -a curl-line.json > /dev/null # append and suppress stdout
  fi
}
export -f curl_line # export function (only works in bash)
# need to remove old file so we can start fresh and append (tee -a)
rm -f curl-line.json log-curl-line.txt
cat some-file-single-column-line.psv |
parallel -j1 curl_line
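## hypothetical follow-up: retry only the inputs that did not return 200,
## reading the "status input" lines logged above
awk '$1 != 200 {print $2}' log-curl-line.txt | parallel -j1 curl_line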
# QUICKLY CONVERT any .md to .html (built to be added to ~/.zshrc) --------
function md2html () {
  /usr/local/bin/pandoc --standalone \
    --template=https://raw.githubusercontent.com/tajmone/pandoc-goodies/master/templates/html5/github/GitHub.html5 \
    --highlight-style=pygments \
    --css=https://bootswatch.com/3/lumen/bootstrap.min.css \
    --metadata pagetitle="$1" "$1" -o "${1%.*}.html"
}
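## usage: writes README.html next to the input file
md2html README.md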