Last active
August 29, 2015 14:12
-
-
Save alvaromuir/a74c5a42082f9c0f0702 to your computer and use it in GitHub Desktop.
Script to prep a downloaded DFA file for Hadoop import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Script to prep a downloaded DFA file for transfer and ETL to a warehouse
# or Hadoop cluster.
#
# Steps performed:
#   1. strips the leading header rows and the single trailing footer row
#   2. takes a sample of the result via the python `subsample` tool
#      https://pypi.python.org/pypi/subsample/0.0.6
#   3. gzips the prepared file for export (the uncompressed copy is kept)
#
# Arguments:
#   $1 - path to the downloaded DFA report (required)
#   $2 - number of leading lines to skip, a non-negative integer (required)
#   $3 - name of the prepared output file (optional, default: prepped_dfa_file)
#   $4 - name of the sample file (optional, default: <output>_sample)
#
# Exit status: 0 on success, 1 on bad arguments, a missing input file or a
# missing `subsample` tool; any failing command aborts the run (set -e).
#
# ex:
#   $ ./prep_dfa_import.sh 1234_downloaded_dfa.csv 10 new_file.csv sample.csv
# @alvaromuir, 04.19.2015

# -u catches typos in variable names; pipefail makes a failure in the first
# stage of the strip pipeline abort the script instead of being hidden.
set -euo pipefail

readonly re='^[0-9]+$'
readonly USAGECMD="Usage: $0 <file> <num lines to skip> <opt: outputname> <opt: sample_outputname>"

# Print an error message followed by the usage line on stderr, then exit 1.
usage_error() {
  printf '%s\n' "$1" >&2
  printf '%s\n' "$USAGECMD" >&2
  exit 1
}

# Validate the arguments, then strip, sample and gzip the report.
main() {
  # ${n:-} keeps `set -u` from tripping on omitted positional parameters.
  local input=${1:-}
  local skip=${2:-}
  local output=${3:-}
  local sampleoutput=${4:-}

  if [[ -z "$input" ]]; then
    usage_error "ERROR - No dfa report supplied."
  fi
  echo "- has file parameter"

  if [[ -z "$skip" ]]; then
    usage_error "ERROR - num of rows in $input to skip not supplied"
  fi
  # The numeric check must run for every non-empty value; otherwise garbage
  # such as "abc" would be interpolated straight into the line-skipping step.
  if ! [[ "$skip" =~ $re ]]; then
    usage_error "ERROR - num of rows in $input to skip must be a non-negative integer, got '$skip'"
  fi
  echo "- has skip num"

  if [[ ! -e "$input" ]]; then
    printf '%s\n' "ERROR - $input not found. Please check path and file name." >&2
    # A missing input is a failure; do not report success to the caller.
    exit 1
  fi
  echo "- has valid file"

  # Fail before writing anything if the sampling tool is not installed.
  if ! command -v subsample >/dev/null 2>&1; then
    printf '%s\n' "ERROR - 'subsample' not found on PATH (see https://pypi.python.org/pypi/subsample)." >&2
    exit 1
  fi

  if [[ -z "$output" ]]; then
    output="prepped_dfa_file"
    echo "- output not supplied, using $output"
  else
    echo "- using output paramater of $output"
  fi

  if [[ -z "$sampleoutput" ]]; then
    sampleoutput="${output}_sample"
    echo "- sample file not supplied, output to $sampleoutput"
  else
    echo "- sampling to paramater $sampleoutput"
  fi

  echo ""
  echo " ... preping ${input##*/} ..."

  echo "removing header and footer rows ..."
  # `tail -n +K` starts output at line K, so K = skip + 1 drops exactly
  # `skip` lines -- including none at all when skip is 0 (the sed range
  # `1,0d` would still delete line 1). `10#` forces base 10 so a value like
  # "08" is not rejected as invalid octal. `sed '$d'` drops the footer row.
  tail -n "+$(( 10#$skip + 1 ))" -- "$input" | sed '$d' > "$output"
  echo "done!"

  echo "sampling data, this could take a while ..."
  # NOTE(review): `>>` appends, so re-running with the same sample name grows
  # the sample file rather than replacing it -- kept as in the original;
  # confirm whether truncation (`>`) was intended.
  subsample -n 750 "$output" >> "$sampleoutput"
  echo "done!"

  echo "gzipping file ..."
  # -k keeps the uncompressed file next to the .gz.
  gzip -k -- "$output"
  echo "done !"

  echo ""
  echo "Your prepared file is $output.gz, and your sample is $sampleoutput"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment