Created
June 21, 2018 22:40
-
-
Save brodygov/0d12abde37ff967de0af084c307dae45 to your computer and use it in GitHub Desktop.
Split s3 file by date
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"fmt" | |
"log" | |
"os" | |
"path" | |
"strings" | |
"syscall" | |
) | |
// create_file opens outfilename write-only, creating it with mode 0644.
//
// The file is opened with O_EXCL, so the call fails if outfilename already
// exists; an existing listing is never truncated or overwritten.
func create_file(outfilename string) (*os.File, error) {
	const (
		flags = os.O_WRONLY | os.O_CREATE | os.O_EXCL
		perm  = 0644
	)
	return os.OpenFile(outfilename, flags, perm)
}
// usage writes the command-line help text to standard error, substituting
// the base name of the running executable for the program name.
func usage() {
	const helpFormat = `usage: %s S3_LISTING OUT_DIRECTORY
Run through S3_LISTING, which should be the output of 'aws s3 ls', and split
out each entry by its modification date into separate listing files in
OUT_DIRECTORY.
`
	prog := path.Base(os.Args[0])
	fmt.Fprintf(os.Stderr, helpFormat, prog)
}
func main() { | |
if len(os.Args) != 3 { | |
usage() | |
os.Exit(1) | |
} | |
infilename := os.Args[1] | |
outdir := os.Args[2] | |
fmt.Printf("Reading from %s to %s\n", infilename, outdir) | |
file, err := os.Open(infilename) | |
if err != nil { | |
log.Fatal("failed to open input file: ", err) | |
} | |
defer file.Close() | |
out_handles := make(map[string]*os.File) | |
scanner := bufio.NewScanner(file) | |
for scanner.Scan() { | |
words := strings.SplitN(scanner.Text(), " ", 2) | |
date := words[0] | |
rest := words[1] | |
if date == "" { | |
continue | |
} | |
outfilename := outdir + "/listing." + date + ".txt" | |
outf, ok := out_handles[outfilename] | |
if !ok { | |
outf, err = create_file(outfilename) | |
if err != nil { | |
// we only expect EMFILE | |
patherr, ok := err.(*os.PathError) | |
if !ok { | |
log.Fatal("not sure how we got here: ", err) | |
} | |
if patherr.Err != syscall.EMFILE { | |
log.Fatal("error creating output file: ", patherr) | |
} | |
log.Print("Hit EMFILE, closing all file handles") | |
// close all handles | |
for _, handle := range out_handles { | |
handle.Close() | |
} | |
out_handles = make(map[string]*os.File) | |
// retry | |
outf, err = create_file(outfilename) | |
if err != nil { | |
log.Fatal("retry fail", err) | |
} | |
} | |
out_handles[outfilename] = outf | |
} | |
if _, err := outf.Write([]byte(date + " " + rest + "\n")); err != nil { | |
log.Fatal("failed to write to outfile ", outfilename, " ", err, " date: ", date, " rest: "+rest) | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Print the command-line help text to standard error, using the base name of
# the running script as the program name.
def usage
  program = File.basename($0)
  STDERR.puts <<-EOM
usage: #{program} S3_LISTING OUT_DIRECTORY
Run through S3_LISTING, which should be the output of \`aws s3 ls\`, and split
out each entry by its modification date into separate listing files in
OUT_DIRECTORY.
  EOM
end
# Announce on standard output that +filename+ is being created, then open it
# write-only and return the File.
#
# File::EXCL makes the open raise Errno::EEXIST when the file already exists,
# so an existing listing is never overwritten.
def create_file_noclobber(filename)
  puts "Creating #{filename.inspect}"
  open_flags = File::WRONLY | File::CREAT | File::EXCL
  File.open(filename, open_flags)
end
# Entry point: split the listing named by ARGV[0] into per-date files under
# the directory named by ARGV[1].
if ARGV.length != 2
  usage
  exit 1
end

infile = ARGV.fetch(0)
outdir = ARGV.fetch(1).chomp('/')

puts "Splitting #{infile.inspect} by date into #{outdir.inspect}"

# keep output file handles around to minimize system calls
out_handles = {}
# Paths created during this run, whether or not their handle is still open.
# create_file_noclobber refuses to open an existing file, so a file whose
# handle was dropped during EMFILE recovery must be reopened for append.
created = {}

# Increase our file handle limit
cur_rlimit = Process.getrlimit(:NOFILE).first
puts "rlimit NOFILE: #{cur_rlimit}"
Process.setrlimit(:NOFILE, 1024) if cur_rlimit < 1024

File.open(infile, 'r') do |inf|
  inf.each_line do |line|
    date, rest = line.split(' ', 2)
    # date is nil for a blank or whitespace-only line
    next if date.nil? || date.empty? || date == 'PRE'

    # rest is nil when the line has a single field and no newline; always
    # emit exactly one newline-terminated record
    rest = rest.to_s
    rest += "\n" unless rest.end_with?("\n")

    outfile = "#{outdir}/listing.#{date}.txt"

    begin
      out_handles[outfile] ||=
        if created.key?(outfile)
          File.open(outfile, File::WRONLY | File::APPEND)
        else
          create_file_noclobber(outfile)
        end
    rescue Errno::EMFILE
      # Nothing left to close means retrying cannot help; re-raise instead of
      # looping forever.
      raise if out_handles.empty?

      STDERR.puts 'Hit EMFILE, closing all file handles'
      out_handles.each_value(&:close)
      out_handles.clear
      retry
    end
    created[outfile] = true

    out_handles.fetch(outfile) << date + ' ' + rest
  end
end

# exit! below skips normal interpreter shutdown, so close the output files
# (flushing their buffers) and flush stdout explicitly first.
out_handles.each_value(&:close)
out_handles.clear

puts 'Finished'
STDOUT.flush

# don't bother garbage collecting, just bail out real fast
exit!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
set -euo pipefail | |
# Print the command-line help text to standard error, using the base name of
# the running script as the program name.
usage() {
    printf >&2 '%s\n' \
        "usage: $(basename "$0") S3_LISTING OUT_DIRECTORY" \
        "Run through S3_LISTING, which should be the output of \`aws s3 ls\`, and split" \
        "out each entry by its modification date into separate listing files in" \
        "OUT_DIRECTORY."
}
# Echo a command line to standard error prefixed with "+ ", then execute it.
# The exit status is that of the executed command.
run() {
    printf >&2 '+ %s\n' "$*"
    "$@"
}
# Entry point: split the listing named by $1 into per-date files under the
# directory named by $2 (a single trailing slash on $2 is dropped).
if [ $# -lt 2 ]; then
    usage
    exit 1
fi

infile="$1"
outdir="${2%/}"

echo "Splitting $infile by date into $outdir/"

# The first whitespace-separated field of each line is the date; the remainder
# is kept as-is. read returns non-zero on a final line that lacks a trailing
# newline but still fills the variables, so the extra -n test keeps that last
# record from being dropped.
while read -r date rest_line || [ -n "$date" ]; do
    if [ -z "$date" ] || [ "$date" = "PRE" ]; then
        continue
    fi

    outfile="$outdir/listing.$date.txt"

    #if [ ! -e "$outfile" ]; then
    #    echo "Creating '$outfile'"
    #fi

    # printf writes the fields verbatim regardless of shell echo settings
    printf '%s %s\n' "$date" "$rest_line" >> "$outfile"
done < "$infile"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment