a collection of cool bash scripts
# cool bash code snippets
# search a directory for all lines that match a pattern (not perfect but useful) ------
## e.g. grep searches for all lines matching "::" in `R/` to determine package calls
## -h hides the file names; -i ignores case
## sed -E uses extended regular expressions to match groups and keep only group 2;
## we then sort and deduplicate with -u
grep -hi :: -R R/* | sed -E 's/(.*)([ ]+[a-z]+::)(.*)/\2/g' | sort -u
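## a quick sanity check with a throwaway file (hypothetical /tmp paths):
mkdir -p /tmp/R && echo 'x <- dplyr::mutate(df)' > /tmp/R/demo.R
grep -hi :: -R /tmp/R/* | sed -E 's/(.*)([ ]+[a-z]+::)(.*)/\2/g' | sort -u # -> " dplyr::" (leading space captured by the group)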
# COUNT COLUMNS -----------------
## for each file, print its name and how many tab-delimited ($'\t')
## columns its first line has, then sort
find . -type f -exec awk -F $'\t' -v d={} '{print d, NF; exit}' {} \; | sort
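## a quick sanity check with a throwaway file (hypothetical /tmp path):
printf 'a\tb\tc\n' > /tmp/demo.tsv
find /tmp -name 'demo.tsv' -type f -exec awk -F $'\t' -v d={} '{print d, NF; exit}' {} \; # -> /tmp/demo.tsv 3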
# find and kill active ssh connections --------------
lsof -i -n | grep ssh | awk '!seen[$2]++' | awk '{print $2}' | while read -r line; do kill $line; done
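## same pipeline as a dry run: echo the PIDs instead of killing them
lsof -i -n | grep ssh | awk '!seen[$2]++' | awk '{print $2}' | while read -r line; do echo "would kill $line"; done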
# use awk to parse columns because `column` does poorly with empty space. -------------
## OFS = output field separator (we set it to comma-space, ', ');
## $1=$1 forces awk to rebuild the record with the new OFS, then we print the whole line $0
## to understand $1=$1, see point 27 of
## http://www.catonmat.net/blog/awk-one-liners-explained-part-two/
head -n 1000 some_file.txt | awk -F ',' '{OFS=", ";$1=$1; print $0}' | column -s $',' -t | less
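## a minimal demo of the $1=$1 trick (inline data, no file needed):
echo 'a,b,c' | awk -F ',' '{OFS=", "; $1=$1; print $0}' # -> a, b, c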
# CONVERT CSV TO PSV WITHOUT REMOVING COMMAS BETWEEN DOUBLE QUOTES ----------------
## the first awk line splits on double quotes; assuming evenly paired quotes, it takes
## every other (unquoted) field and replaces commas with pipes
## the sed chunk finds any pipe (|) and adds a space after it (| );
## this is because `column` incorrectly parses empty fields, i.e. || fails,
## but | | (with a space) does not. the trailing `1` is awk shorthand for the
## default action `{print $0}`
## the rest is standard, but we are now delimiting on '|' (pipes) not ','
awk -F'"' -v OFS='"' '{ for (i=1; i<=NF; i+=2) gsub(",", "|", $i) } 1' some_file_with_double_quotes.csv |
sed 's/|/| /g' |
column -s'|' -t |
less
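## a minimal demo of the quote-aware swap (inline data, no file needed):
printf 'x,"a,b",z\n' | awk -F'"' -v OFS='"' '{ for (i=1; i<=NF; i+=2) gsub(",", "|", $i) } 1' # -> x|"a,b"|z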
# DETERMINE IF TEXT EXISTS IN FIRST 5 LINES, PRINT FILE NAME IF NOT ----------------------
## create an empty file, then loop through psv files. if we do NOT find the word "value"
## in the first 5 lines, echo the file name to `list_files.txt`
touch list_files.txt
for i in *psv; do
  if ! head -n 5 "$i" | grep -q "value"; then echo "$i" >> list_files.txt; fi
done
# remove BOM at beginning of file ------------------
awk '{ gsub(/\xef\xbb\xbf/,""); print }' file_name.csv
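## to first check whether a file actually starts with a UTF-8 BOM (assumes `xxd` is available):
head -c 3 file_name.csv | xxd # a BOM shows up as "efbb bf"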
# for a single column of data, count occurrences of each unique value and sort by count ------------
awk -F'|' '{print $4}' some_file.psv | sort | uniq -c | sort -n | less
# COUNT BY DELIM; APPEND DELIM IF LESS THAN EXPECTED --------------
## count the fields split by the -F separator. if fewer than some value (e.g. 20),
## pad the line with the missing field separators;
## else print the line as is
## source: https://stackoverflow.com/questions/37295695/how-to-use-printf-to-print-a-character-multiple-times
## understanding `%*s` is explained here:
## https://www.gnu.org/software/gawk/manual/html_node/Format-Modifiers.html#Format-Modifiers
## the magic is in the first part:
## {s=sprintf("%*s",20-1-NF,""); gsub(/ /," |",s); print NF "\t" s FS $0}
## %*s takes its width from an argument: it pads the empty string "" to a
## width of 20-1-NF, i.e. prints 20-1-NF spaces (when 20-1-NF > 0).
## the `gsub` then replaces each space with the separator (" |" in this case),
## and the result is printed as `s`.
cat some_file.psv |
  awk -F'|' '{
    if (NF < 20) {
      s = sprintf("%*s", 20-1-NF, "")
      gsub(/ /, " |", s)
      print NF "\t" s FS $0
    } else {
      print NF "\t" $0
    }
  }'
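## a minimal demo of the %*s padding trick on its own:
awk 'BEGIN {s = sprintf("%*s", 5, ""); gsub(/ /, " |", s); print "[" s "]"}' # -> [ | | | | |]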
# FIND FILES THAT ARE SMALL AND REMOVE THEM -------------
## this avoids the "argument list too long" error from calling `ls` directly
## looks at the 5th column of `ls -l`, which has the file size in bytes.
## if below 100 bytes, print the path to the file;
## can then delete or do something to said files, like `xargs rm -f`
find -L ./data-raw/graphite -type f | xargs ls -l |
awk '$5 < 100 {print $9}' | xargs rm -f
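## an alternative that skips the `ls` parsing: use find's own size test
## (-size -100c means "smaller than 100 bytes"); preview first, then remove
find -L ./data-raw/graphite -type f -size -100c
find -L ./data-raw/graphite -type f -size -100c | xargs rm -f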
# COUNT DELIMITERS PER LINE -----------------------
## useful for seeing if you have delimiter errors, e.g. if you expect only three fields,
## you'd see exactly two pipes (||) per line;
## if you see more or fewer, something is wrong
## here, the delimiter is paired with '\n', e.g. pipe delim = '|\n'
## tr -d means delete; -c takes the complement of a list of characters,
## so we find and delete all characters (-d) EXCEPT
## pipes '|' and new lines '\n' (-c)
## we use `sort | uniq -c` to order and count instances
cat host-status-apps.psv | tr -d -c '|\n' | sort | uniq -c
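## a minimal demo (inline data): the second line is missing a field, so its count differs
printf 'a|b|c\nd|e\n' | tr -d -c '|\n' | sort | uniq -c # -> 1 "|" and 1 "||"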
# split a file into chunks (here 250M) --------------------
< big-file.csv parallel --pipe --block 250M 'cat > raw/chunks/big-file-chunk-{#}'
# same thing but faster
## a negative block size (-4) means each job slot gets that many blocks,
## resulting in e.g. 4*8 = 32 blocks to process across 8 jobs (-j8)
parallel --skip-first-line --pipepart --block -4 -j8 -a some-big-file.csv \
  'tee raw/chunks/some-big-file-chunk-{#} > /dev/null'
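## sanity check the split: total chunk lines should equal the original line count
## minus the header line skipped by --skip-first-line
wc -l raw/chunks/some-big-file-chunk-* | tail -n 1
wc -l some-big-file.csv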
# ITERATE URLS, SAVE CONTENT, APPENDING NUMS ------------------
## iterate through a file of urls, tmp.txt, then curl each and save to numbered files
## must run in `bash` so we can export the function for use with parallel
function curl_iter () { curl -s -g "$2" > "tbl$1.csv"; }
export -f curl_iter
cat -n tmp.txt | grep http | tr '\t' ',' | parallel --colsep="," curl_iter
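## a minimal demo of what the pipeline feeds to curl_iter (hypothetical urls):
printf 'https://example.com/a\nhttps://example.com/b\n' > tmp.txt
cat -n tmp.txt | grep http | tr '\t' ',' # -> "     1,https://example.com/a" etc.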
# FLATTEN SIMPLE JSON OBJECTS (NO NESTING) TO CSV ---------------------
## SOURCE: https://stackoverflow.com/a/32965227/3987905
curl 'https://jsonplaceholder.typicode.com/posts' |
  jq -r '(map(keys) | add | unique) as $cols |
    map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv'
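## a self-contained demo of the same filter (inline JSON, no network needed):
echo '[{"a":1,"b":2},{"b":4,"c":5}]' |
  jq -r '(map(keys) | add | unique) as $cols |
    map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv'
## -> "a","b","c" then 1,2, then ,4,5 (missing keys become empty fields)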
# BASIC CURL WITH STATUS CODE CHECK, LOG ---------------
## curl a URL, print status code and input (assumes single input)
curl_line() {
  line=$1
  URL="https://example.com/query?somevalue=${line}&format=JSON"
  # echo $URL
  req=$(curl -i -s "$URL")
  # the ^M below is a literal carriage return (type ctrl-v ctrl-m in the shell);
  # the sed range keeps lines up to the blank CRLF line that ends the http header,
  # and awk grabs field 2 (the status code) from the first line
  http_status=$(echo "$req" | sed -n '1,/^^M$/p' | awk 'NR==1{print $2}')
  echo $http_status $line | tee -a log-curl-line.txt
  if [ "$http_status" == "200" ]
  then
    echo "$req" | # want the raw, interpolated text of the variable echo'd
      # removes the http header by excluding lines with carriage returns (hidden ^M)
      grep -v $'\r' |
      tee -a curl-line.json > /dev/null # append and suppress stdout
  fi
}
export -f curl_line # export function (only works in bash)
# need to remove old file so we can start fresh and append (tee -a)
rm -f curl-line.json log-curl-line.txt
cat some-file-single-column-line.psv |
parallel -j1 curl_line
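## hypothetical follow-up: retry only the inputs that did not return 200,
## reading the "status input" lines logged above
awk '$1 != 200 {print $2}' log-curl-line.txt | parallel -j1 curl_line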
# QUICKLY CONVERT any .md to .html (built to be added to ~/.zshrc) --------
function md2html () {
  /usr/local/bin/pandoc --standalone \
    --template=https://raw.githubusercontent.com/tajmone/pandoc-goodies/master/templates/html5/github/GitHub.html5 \
    --highlight-style=pygments \
    --css=https://bootswatch.com/3/lumen/bootstrap.min.css \
    --metadata pagetitle="$1" "$1" -o "${1%.*}.html"
}
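## usage: writes README.html next to the input file
md2html README.md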