Last active
July 19, 2024 12:59
-
-
Save fabiolimace/7bfb971bae404b170e279019a2316f6b to your computer and use it in GitHub Desktop.
Convert a fixed-width file to a delimiter-separated file such as CSV and TSV.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# | |
# Convert a fixed-width file to a delimiter-separated file such as CSV and TSV. | |
# | |
# Positional parameters: | |
# | |
# FIELD_WIDTHS: a space-separated list of the number of characters in each field, in order of occurrence.
# SEPARATOR: the character used to separate the fields in the output. The default separator is a tabulation. | |
# INPUT_FILE: the optional input file or `/dev/stdin` if no input file is provided. | |
# OUTPUT_FILE: the optional output file or `/dev/stdout` if no output file is provided. | |
# | |
# Usage: | |
# | |
# cat <<EOF | ./fixed-width-format-cut-paste.sh "5 5 5 5 5" "," "/dev/stdin" "/dev/stdout"
# 1111122222333334444455555 | |
# 1234512345123451234512345 | |
# EOF | |
# 11111,22222,33333,44444,55555 | |
# 12345,12345,12345,12345,12345 | |
# | |
# Read: https://en.wikipedia.org/wiki/Flat-file_database#Fixed-width_formats | |
# | |
#######################################
# Convert fixed-width records to delimiter-separated records with cut and paste.
#
# Arguments:
#   $1 - whitespace-separated list of field widths (required)
#   $2 - separator; backslash escapes such as \t are interpreted (default: tab)
#   $3 - input file (default: /dev/stdin)
#   $4 - output file (default: /dev/stdout)
# Outputs:   the delimited records, written to the output file
# Returns:   0 on success, 2 on invalid arguments, non-zero on any other failure
#
# The function body is a subshell "( ... )": the EXIT trap set inside it removes
# the temporary directory on every exit path, "exit" only leaves the subshell,
# and none of the working variables leak into the caller.
#######################################
main() (
  field_widths="${1-}"
  input_file="${3:-/dev/stdin}"
  output_file="${4:-/dev/stdout}"

  # %b interprets escapes such as \t without using the argument as the printf
  # format itself, so a separator containing "%" is taken literally.
  printf -v separator '%b' "${2-\t}"

  # Split the widths on blanks and newlines without pathname expansion.
  # "read -d ''" reports failure when it hits end of input, which is expected.
  IFS=$' \t\n' read -r -d '' -a widths <<< "${field_widths}" || true

  if (( ${#widths[@]} == 0 )); then
    printf 'Usage: %s FIELD_WIDTHS [SEPARATOR] [INPUT_FILE] [OUTPUT_FILE]\n' \
      "${0##*/}" >&2
    exit 2
  fi
  for width in "${widths[@]}"; do
    # 10# forces base 10 so a width such as "08" is not parsed as octal.
    if [[ ! "${width}" =~ ^[0-9]+$ ]] || (( 10#${width} == 0 )); then
      printf 'invalid field width: %s\n' "${width}" >&2
      exit 2
    fi
  done

  tempdir=$(mktemp -d) || exit 1
  trap 'rm -rf -- "${tempdir}"' EXIT

  # cut scans the input once per field, so spool it to a regular file first;
  # this also makes a non-seekable input such as a pipe usable.
  cat -- "${input_file}" > "${tempdir}/input-file" || exit 1

  # Remember the per-field files in creation order. Globbing "field-*" would
  # sort them lexicographically (field-10 before field-2) and scramble columns.
  field_files=()
  position=1
  for width in "${widths[@]}"; do
    width=$(( 10#${width} ))
    field_file="${tempdir}/field-${#field_files[@]}"
    cut -c "${position}-$(( position + width - 1 ))" \
      "${tempdir}/input-file" > "${field_file}" || exit 1
    field_files+=("${field_file}")
    position=$(( position + width ))
  done

  paste -d "${separator}" -- "${field_files[@]}" > "${output_file}"
)

main "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f
# | |
# Convert a fixed-width file to a delimiter-separated file such as CSV and TSV. | |
# | |
# Parameters: | |
# | |
# FIELD_WIDTHS: a space-separated list of the number of characters in each field, in order of occurrence.
# SEPARATOR: the character used to separate the fields in the output. The default separator is a tabulation. | |
# | |
# Usage: | |
# | |
# cat <<EOF | awk -v FIELD_WIDTHS="5 5 5 5 5" -v SEPARATOR="," -f fixed-width-format.awk | |
# 1111122222333334444455555 | |
# 1234512345123451234512345 | |
# EOF | |
# 11111,22222,33333,44444,55555 | |
# 12345,12345,12345,12345,12345 | |
# | |
# It may be useful for implementations of AWK other than GNU's AWK, which supports splitting fixed-width files into fields. | |
# | |
# Read: https://en.wikipedia.org/wiki/Flat-file_database#Fixed-width_formats | |
# | |
# Resolve the output separator and parse the field widths once, before any
# record is read.
BEGIN {
    # Compare against the empty string instead of testing truthiness: a
    # separator such as "0" is numerically false and would otherwise be
    # silently replaced by the default tab.
    SEP = (SEPARATOR != "") ? SEPARATOR : "\t"
    # split() returns the number of elements it stored; length(array) is not
    # available in every awk implementation.
    NUMBER_OF_FIELDS = split(FIELD_WIDTHS, FIELD_WIDTHS_ARRAY)
}
# Rewrite the current record ($0) with SEP placed between its fixed-width
# fields.
#
# Reads the globals SEP, NUMBER_OF_FIELDS and FIELD_WIDTHS_ARRAY. Takes no
# arguments; the names in the parameter list are awk's idiom for local
# variables. Any text beyond the last declared field is kept as one extra
# trailing field, and no separator is emitted after the field that reaches
# the end of the record.
function split_into_fields(    record, start, width, k) {
    record = ""
    start = 1
    for (k = 1; k <= NUMBER_OF_FIELDS; k++) {
        width = FIELD_WIDTHS_ARRAY[k]
        # Stop once this field reaches the end of the record: nothing is left
        # to separate it from.
        if (start + width > length($0)) {
            break
        }
        # Slice from the untouched record, so the separator's own length never
        # shifts the offsets (multi-character separators work).
        record = record substr($0, start, width) SEP
        start += width
    }
    $0 = record substr($0, start)
}
# Main rule: delimit every input record in place, then emit it.
{
    split_into_fields()
    print
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# | |
# Convert a fixed-width file to a delimiter-separated file such as CSV and TSV. | |
# | |
# Positional parameters: | |
# | |
# FIELD_WIDTHS: a space-separated list of the number of characters in each field, in order of occurrence.
# SEPARATOR: the character used to separate the fields in the output. The default separator is a tabulation. | |
# INPUT_FILE: the optional file to be converted or `/dev/stdin` if no input file is provided. | |
# | |
# Usage: | |
# | |
# cat <<EOF | ./fixed-width-format.sh "5 5 5 5 5" "," "/dev/stdin" | |
# 1111122222333334444455555 | |
# 1234512345123451234512345 | |
# EOF | |
# 11111,22222,33333,44444,55555 | |
# 12345,12345,12345,12345,12345 | |
# | |
# Read: https://en.wikipedia.org/wiki/Flat-file_database#Fixed-width_formats | |
# | |
# Positional parameters: field widths, separator and input file.
field_widths="${1}"
# %b interprets backslash escapes such as \t without using the argument as the
# printf format itself, so a separator containing "%" is taken literally.
# The default (when no second argument is given) is a tab.
printf -v separator '%b' "${2-\t}"
input_file="${3:-/dev/stdin}"
#######################################
# Insert the separator between the fixed-width fields of one record.
# Globals:   field_widths (read), separator (read)
# Arguments: $1 - the record to split
# Outputs:   the delimited record on stdout, followed by a newline
#
# Text beyond the last declared field is kept as one extra trailing field, and
# no separator is emitted after the field that reaches the end of the record.
#######################################
split_into_fields() {
  local line="${1}"
  local result=""
  local offset=0
  local width
  # Deliberately unquoted: field_widths is a whitespace-separated list.
  # shellcheck disable=SC2086
  for width in ${field_widths}; do
    # Stop once this field reaches the end of the record: nothing is left to
    # separate it from.
    if (( offset + width >= ${#line} )); then
      break
    fi
    # Slice from the untouched record, so the separator's own length never
    # shifts the offsets (multi-character separators work).
    result+="${line:offset:width}${separator}"
    offset=$(( offset + width ))
  done
  # printf rather than echo: a record such as "-n" must be printed verbatim.
  printf '%s\n' "${result}${line:offset}"
}
# Convert the input one record at a time.
# IFS= stops read from trimming leading and trailing blanks, which would shift
# every column of a space-padded fixed-width record. The "|| [[ -n ... ]]"
# clause still processes a final line that lacks a terminating newline.
while IFS= read -r line || [[ -n "${line}" ]]; do
  split_into_fields "${line}"
done < "${input_file}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment