Skip to content

Instantly share code, notes, and snippets.

@wkoffel
Created April 22, 2013 17:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wkoffel/5436879 to your computer and use it in GitHub Desktop.
Save wkoffel/5436879 to your computer and use it in GitHub Desktop.
Awk script for splitting very large files. Customize the field and record separators and size for the split.
# split.awk --- do split in awk
#
# Uses the ord() and chr() library functions, which are defined later in this file
# usage: split [-num] [file] [outname]
# Initialization: set the record separators, build the character-code
# table used by ord(), and parse the command line
#     [-num] [file] [outname]
# into `count` (records per output file) and `outfile` (output-name prefix).
BEGIN {
    # Input and output records are delimited by the ASCII record
    # separator character (octal 036); customize as needed.
    RS = "\036"
    ORS = "\036"
    _ord_init()         # populate the table that ord() reads

    outfile = "x"       # default output-name prefix
    count = 10000000    # default number of records per output file

    if (ARGC > 4)
        usage()

    i = 1
    if (ARGV[i] ~ /^-[[:digit:]]+$/) {
        count = -ARGV[i]    # negating "-num" yields num
        # A count of zero would leave the first output file unwritten and
        # then switch to a new file on every record, so reject it.
        if (count < 1)
            usage()
        ARGV[i] = ""        # blank it so awk does not open it as a file
        i++
    }
    # test argv in case reading from stdin instead of file
    if (i in ARGV)
        i++                 # skip data file name
    if (i in ARGV) {
        outfile = ARGV[i]
        ARGV[i] = ""        # blank it so awk does not open it as a file
    }

    # Output files are named <outfile>aa, <outfile>ab, ... <outfile>zz.
    s1 = s2 = "a"
    out = (outfile s1 s2)
}
# Per-record rule: append the current record to the current output file,
# advancing to the next two-letter suffix once `count` records have been
# written to it.
{
    tcount++
    if (tcount > count) {
        # Current file is full: close it and pick the next suffix.
        close(out)
        if (s1 == "z" && s2 == "z") {
            # All suffixes from "aa" through "zz" have been used.
            printf("split: %s is too large to split\n",
                   FILENAME) > "/dev/stderr"
            exit 1
        }
        if (s2 != "z")
            s2 = chr(ord(s2) + 1)
        else {
            # Second letter wraps around; bump the first letter.
            s1 = chr(ord(s1) + 1)
            s2 = "a"
        }
        out = (outfile s1 s2)
        tcount = 1      # this record is the first in the new file
    }
    print > out
}
# usage --- print the usage message on standard error and exit with status 1
#
# `e` is a local variable, not a parameter; call as usage().
function usage(    e)
{
    e = "usage: split [-num] [file] [outname]"
    # Use printf with an explicit newline rather than print: ORS is set to
    # "\036" in the BEGIN rule, so print would terminate the message with
    # that control character instead of a newline.
    printf("%s\n", e) > "/dev/stderr"
    exit 1
}
# ord --- return the numeric code of the first character of str
#
# The value is looked up in the global _ord_ table; `ch` is a local
# variable, not a parameter.
function ord(str,    ch)
{
    ch = substr(str, 1, 1)      # characters after the first are ignored
    return _ord_[ch]
}
# chr --- return the character whose numeric code is c
function chr(c)
{
    # Adding zero coerces c to a number, so "%c" formats it as a
    # character code rather than as a string.
    c += 0
    return sprintf("%c", c)
}
# _ord_init --- fill the global _ord_ table, mapping each character to
# its numeric code
#
# The range of codes is chosen by probing where the alert character "\a"
# lives. `first`, `last`, `code` and `ch` are local variables.
function _ord_init(    first, last, code, ch)
{
    # Start from the fallback range (ebcdic!) and narrow it when one of
    # the ASCII probes matches.
    first = 0
    last = 255
    if (sprintf("%c", 7) == "\a")               # regular ascii: BEL is 7
        last = 127
    else if (sprintf("%c", 128 + 7) == "\a")    # ascii, mark parity
        first = 128

    code = first
    while (code <= last) {
        ch = sprintf("%c", code)
        _ord_[ch] = code
        code++
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment