The command line for data science is useful for:
- Quickly checking large CSV files
- Checking data on a server, like in Google Cloud
- Merging CSV files quickly
- Replacing tabs with commas or similar formatting
# Page through a CSV with long lines chopped instead of wrapped.
less -S file.csv

# Same view, but with the comma-separated fields aligned into a table first.
column -s, -t file.csv | less -S

# For Large file: align and page only the first 100 lines.
head -n 100 file.csv | column -s, -t | less -S

# Count the lines in a file.
wc -l file

# Search inside the pager: type "/" and search for word.
less -S file.csv

# Print only the second comma-separated field of every line.
cut -f2 -d, file.csv
# Show only the rows containing "dog"; grep reads the file itself, so no cat is needed.
grep 'dog' file.csv | less -S
# Merge two CSV files into merged.csv: all of file_1.csv, followed by
# file_2.csv starting from its second line.
{
  cat file_1.csv
  tail -n +2 file_2.csv # Append without header
} > merged.csv
# Create three sample inputs (file_1.csv .. file_3.csv) as copies of csv/GBvideos.csv.
for i in 1 2 3; do
  cp csv/GBvideos.csv "file_${i}.csv"
done
# Merge every file_*.csv found under the current directory (recursively) into
# merged.csv, keeping a single header row taken from file_1.csv.
head -n 1 file_1.csv > merged.csv # Header
# The pattern is quoted so that find, not the shell, expands it. tail runs once
# per file so no "==> name <==" banners end up in the data. find does not
# guarantee any particular file order.
find . -name 'file_*.csv' -exec tail -n +2 {} \; >> merged.csv
# Merge every file_*.csv in the current directory into merged.csv, keeping a
# single header row taken from file_1.csv.
head -n 1 file_1.csv > merged.csv # Header
for file in file_*.csv; do
  [ -e "$file" ] || continue # Glob matched nothing: skip the literal pattern
  tail -n +2 "$file" >> merged.csv # Append without header
done
# Replace every tab in "file" with a comma and print the result to stdout.
# tr is used because "\t" inside a sed pattern is not guaranteed by POSIX,
# whereas tr's '\t' escape is.
tr '\t' ',' < file
# Turn every comma that sits outside double quotes into "|".
# Each substitution rewrites the first remaining unquoted comma on the line;
# "t" jumps back to the label for as long as a substitution succeeded.
sed -E \
  -e ':next_comma' \
  -e 's/^(([^",]|"[^"]*")*),/\1|/' \
  -e 't next_comma' \
  file.csv
# Example: drop the thousands separator from a pound amount (prints £1245.20).
sed -E 's/(£[0-9])\,([0-9]{3}\.[0-9]{2})/\1\2/' <<< '£1,245.20'