Skip to content

Instantly share code, notes, and snippets.

@brodygov
Created June 21, 2018 22:40
Show Gist options
  • Save brodygov/0d12abde37ff967de0af084c307dae45 to your computer and use it in GitHub Desktop.
Save brodygov/0d12abde37ff967de0af084c307dae45 to your computer and use it in GitHub Desktop.
Split an `aws s3 ls` listing file into per-date listing files
package main
import (
"bufio"
"fmt"
"log"
"os"
"path"
"strings"
"syscall"
)
// create_file creates outfilename and opens it write-only with mode 0644.
// The open is exclusive (O_CREATE|O_EXCL), so it returns an error instead of
// clobbering a file that already exists.
func create_file(outfilename string) (*os.File, error) {
	const (
		flags = os.O_WRONLY | os.O_CREATE | os.O_EXCL
		perm  = 0644
	)
	return os.OpenFile(outfilename, flags, perm)
}
// usage prints the command-line synopsis to standard error, using the base
// name of the running executable as the program name.
func usage() {
	const format = "usage: %s S3_LISTING OUT_DIRECTORY\n" +
		"Run through S3_LISTING, which should be the output of 'aws s3 ls', and split\n" +
		"out each entry by its modification date into separate listing files in\n" +
		"OUT_DIRECTORY.\n"

	prog := path.Base(os.Args[0])
	fmt.Fprintf(os.Stderr, format, prog)
}
func main() {
if len(os.Args) != 3 {
usage()
os.Exit(1)
}
infilename := os.Args[1]
outdir := os.Args[2]
fmt.Printf("Reading from %s to %s\n", infilename, outdir)
file, err := os.Open(infilename)
if err != nil {
log.Fatal("failed to open input file: ", err)
}
defer file.Close()
out_handles := make(map[string]*os.File)
scanner := bufio.NewScanner(file)
for scanner.Scan() {
words := strings.SplitN(scanner.Text(), " ", 2)
date := words[0]
rest := words[1]
if date == "" {
continue
}
outfilename := outdir + "/listing." + date + ".txt"
outf, ok := out_handles[outfilename]
if !ok {
outf, err = create_file(outfilename)
if err != nil {
// we only expect EMFILE
patherr, ok := err.(*os.PathError)
if !ok {
log.Fatal("not sure how we got here: ", err)
}
if patherr.Err != syscall.EMFILE {
log.Fatal("error creating output file: ", patherr)
}
log.Print("Hit EMFILE, closing all file handles")
// close all handles
for _, handle := range out_handles {
handle.Close()
}
out_handles = make(map[string]*os.File)
// retry
outf, err = create_file(outfilename)
if err != nil {
log.Fatal("retry fail", err)
}
}
out_handles[outfilename] = outf
}
if _, err := outf.Write([]byte(date + " " + rest + "\n")); err != nil {
log.Fatal("failed to write to outfile ", outfilename, " ", err, " date: ", date, " rest: "+rest)
}
}
}
#!/usr/bin/env ruby
# Print the command-line synopsis for this script to standard error.
def usage
  program = File.basename($0)
  STDERR.puts <<-EOM
usage: #{program} S3_LISTING OUT_DIRECTORY
Run through S3_LISTING, which should be the output of \`aws s3 ls\`, and split
out each entry by its modification date into separate listing files in
OUT_DIRECTORY.
  EOM
end
# Announce on standard output that +filename+ is being created, then create it
# and return the write-only File handle. The open is exclusive
# (CREAT | EXCL), so an existing file raises instead of being overwritten.
def create_file_noclobber(filename)
  exclusive_write = File::WRONLY | File::CREAT | File::EXCL
  puts "Creating #{filename.inspect}"
  File.open(filename, exclusive_write)
end
# Entry point: validate the arguments, then stream S3_LISTING line by line and
# append each entry to OUT_DIRECTORY/listing.<date>.txt, where <date> is the
# first whitespace-separated field of the line.
if ARGV.length != 2
  usage
  exit 1
end

infile = ARGV.fetch(0)
outdir = ARGV.fetch(1).chomp('/')

puts "Splitting #{infile.inspect} by date into #{outdir.inspect}"

# keep output file handles around to minimize system calls
out_handles = {}
# Every output path created during this run, including ones whose handle has
# since been closed by the EMFILE recovery below.
created = {}

# Increase our file handle limit. Only the soft limit is raised, capped at the
# hard limit, so the hard limit is left untouched and the call cannot fail
# with EPERM when the hard limit is below 1024.
soft_limit, hard_limit = Process.getrlimit(:NOFILE)
puts "rlimit NOFILE: #{soft_limit}"
if soft_limit < 1024
  Process.setrlimit(:NOFILE, [1024, hard_limit].min, hard_limit)
end

File.open(infile, 'r') do |inf|
  inf.each_line do |line|
    date, rest = line.split(' ', 2)

    # A blank line splits to an empty array, which leaves date nil.
    next if date.nil? || date == 'PRE'

    # rest normally carries the line's trailing newline. It is nil or lacks
    # the newline for a single-field line or an unterminated final line, so
    # make sure every record written ends with exactly one newline.
    rest = "#{rest}\n" unless rest.to_s.end_with?("\n")

    outfile = "#{outdir}/listing.#{date}.txt"

    begin
      out_handles[outfile] ||=
        if created.key?(outfile)
          # Already created earlier in this run and closed since; an
          # exclusive create would raise EEXIST, so append instead.
          File.open(outfile, File::WRONLY | File::APPEND)
        else
          create_file_noclobber(outfile).tap { created[outfile] = true }
        end
    rescue Errno::EMFILE
      # With nothing left to close, retrying could never succeed.
      raise if out_handles.empty?

      STDERR.puts 'Hit EMFILE, closing all file handles'
      out_handles.each_value(&:close)
      out_handles.clear
      retry
    end

    out_handles.fetch(outfile) << date + ' ' + rest
  end
end

# exit! below skips normal interpreter shutdown, so close the handles (which
# flushes their write buffers) and flush stdout explicitly first.
out_handles.each_value(&:close)

puts 'Finished'
STDOUT.flush

# don't bother garbage collecting, just bail out real fast.
# Pass an explicit success status: a bare exit! reports failure.
exit!(true)
#!/bin/bash
# Strict mode: abort on any command failure (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Print the command-line synopsis for this script to standard error.
usage() {
    {
        printf 'usage: %s S3_LISTING OUT_DIRECTORY\n' "$(basename "$0")"
        printf '%s\n' \
            'Run through S3_LISTING, which should be the output of `aws s3 ls`, and split' \
            'out each entry by its modification date into separate listing files in' \
            'OUT_DIRECTORY.'
    } >&2
}
# Trace a command on standard error as "+ <args>", then execute it with its
# arguments passed through unchanged. Returns the command's exit status.
run() {
    printf '+ %s\n' "$*" >&2
    "$@"
}
# Entry point: validate the arguments, then append every entry of S3_LISTING
# to OUT_DIRECTORY/listing.<date>.txt, where <date> is the first
# whitespace-separated field of the line.

# Exactly two arguments are required, matching the usage text.
if [ $# -ne 2 ]; then
    usage
    exit 1
fi

infile="$1"
# Drop one trailing slash so the paths built below do not contain "//".
outdir="${2%/}"

echo "Splitting $infile by date into $outdir/"

# read returns non-zero on a final line that has no trailing newline, but
# still fills in the variables; the extra test keeps that last record from
# being silently dropped.
while read -r date rest_line || [ -n "$date" ]; do
    # Skip blank lines and the "PRE <prefix>/" entries that `aws s3 ls` prints
    # for prefixes.
    if [ -z "$date" ] || [ "$date" = "PRE" ]; then
        continue
    fi

    outfile="$outdir/listing.$date.txt"

    # printf rather than echo so the data is never interpreted as echo
    # options or escape sequences.
    printf '%s %s\n' "$date" "$rest_line" >> "$outfile"
done < "$infile"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment