Skip to content

Instantly share code, notes, and snippets.

@tsjames9991
Created April 22, 2019 08:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tsjames9991/51b29fc0627bd48c11e42b05573371ef to your computer and use it in GitHub Desktop.
Save tsjames9991/51b29fc0627bd48c11e42b05573371ef to your computer and use it in GitHub Desktop.
HTML2CSV
#!/bin/bash
### Converts the Spark dispatcher logs, which are served as an HTML page, into CSV so that the data can be reused for other purposes.
# Collapse every blank-line-separated paragraph of `textfile` into a single
# line of `rawfile.csv`, joining the paragraph's lines with ", ".
# The intermediate files `dump.txt` and `textfile` are deleted afterwards.
create_csv() {
  # An empty RS puts awk in paragraph mode: each run of non-blank lines is
  # one record, so replacing the embedded newlines yields one row per record.
  local join_paragraph='BEGIN { RS = "" } { gsub(/\n/, ", "); print $0 }'

  awk "$join_paragraph" textfile > rawfile.csv
  rm -f dump.txt textfile
}
# Download the HTML page and reduce it to plain text, one value per line,
# written to `dump.txt`.
# Arguments: $1 - URL to fetch (optional; defaults to the built-in URL).
# Outputs:   dump.txt (left empty when the download fails);
#            table.html is used as scratch space and removed.
# Returns:   0 on success, 1 when curl fails.
html_clean() {
  local url=${1:-URL_TO_BE_HIT}

  # -f makes curl fail on HTTP errors instead of saving the error page,
  # which would otherwise be parsed as if it were the log table.
  if ! curl -sf "$url" > table.html; then
    printf 'html_clean: failed to download %s\n' "$url" >&2
    : > dump.txt
    rm -f table.html
    return 1
  fi

  # 1. strip every HTML tag
  # 2. drop blank lines and squeeze whitespace; the explicit NF test keeps
  #    lines such as "0", which a bare '$1=$1' pattern treats as false
  # 3. drop the first 11 and the last 2 remaining lines (presumably the page
  #    text surrounding the table -- confirm against the real page)
  sed -e 's/<[^>]*>//g' table.html \
    | awk 'NF { $1 = $1; print }' \
    | sed -e '1,11d' \
    | sed -e '$d' \
    | sed -e '$d' > dump.txt
  rm -f table.html
}
# Regroup the flat list of values in `dump.txt` into blank-line-separated
# records in `textfile`, preceded by the header line.
# Arguments: $1 - number of lines that make up one record (optional,
#                 defaults to 7).
# Outputs:   textfile (overwritten).
clean_text() {
  local -r cols=${1:-7}
  local line
  local count=0

  {
    # The header is followed by a blank line so that it forms a record of
    # its own; without it, a paragraph-based join merges the header with
    # the first data record.
    printf '%s\n\n' "COMMA_SEPERATED_HEADERS_OF_TABLES"

    # `read` without IFS= trims surrounding whitespace from each value.
    # The `|| [[ -n ... ]]` keeps a final line that lacks a newline.
    while read -r line || [[ -n "$line" ]]; do
      printf '%s\n' "$line"
      count=$(( count + 1 ))
      if (( count >= cols )); then
        # End of record: emit the separating blank line.
        printf '\n'
        count=0
      fi
    done < dump.txt
  } > textfile
}
# Copy to `csvfile.csv` the rows of `rawfile.csv` whose timestamp falls on
# the day N days ago, within the current hour and the current ten-minute
# window (e.g. at 08:29 the window is 08:20:00-08:29:59).
# Arguments: $1 - how many days to look back (optional, defaults to 5).
# Outputs:   csvfile.csv (overwritten).
# Returns:   the exit status of grep (1 when no row matches).
# Requires GNU date for the --date option.
check_status() {
  local -r days_back=${1:-5}
  local prev_date hhmm hour minute_tens

  prev_date=$(date --date="${days_back} days ago" +'%Y/%m/%d') || return
  # Read the clock once so that the hour and the minute digit always belong
  # to the same instant, even when the call straddles a rollover.
  hhmm=$(date +'%H:%M') || return
  hour=${hhmm%%:*}
  minute_tens=${hhmm:3:1}

  grep -- "${prev_date} ${hour}:${minute_tens}[0-9]:[0-9][0-9]" rawfile.csv > csvfile.csv
}
### Main script starts here
# Run the pipeline stages in order: download and strip the HTML, regroup the
# values into records, join the records into CSV rows, then filter the rows.
for stage in html_clean clean_text create_csv check_status; do
  "$stage"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment