Skip to content

Instantly share code, notes, and snippets.

@kizashi1122
Created September 29, 2016 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kizashi1122/44ee35ea5bcc090b0f54fd1edbba3ade to your computer and use it in GitHub Desktop.
Save kizashi1122/44ee35ea5bcc090b0f54fd1edbba3ade to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
#
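# Split a data file as evenly as possible for k-fold cross validation.
#
# Usage:
# ./split_file_evenly.pl data_file 10
#
# In this case, 10 pairs of training and test sets are written to the
# current directory as 01-learn/01-test ... 10-learn/10-test.
# The number of folds defaults to 10 when the second argument is omitted.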
#
use strict;
use warnings;
use autodie qw/open close/;
use feature 'say';
# --- command-line arguments ---------------------------------------------
# $file: path of the data file to split (required).
# $k:    number of folds; falls back to 10 when omitted, empty or 0.
my $file = shift
    or die "Usage: $0 data_file [k]\n";
my $k = shift || 10;
die "Number of folds must be a positive integer, got '$k'\n"
    unless $k =~ /\A[1-9][0-9]*\z/;

# --- line count -----------------------------------------------------------
# Count with readline, i.e. exactly the way the split loop below enumerates
# lines. Counting "\n" bytes instead would miss a final line that has no
# trailing newline, and that line would then never be part of any test set.
my $line_count = 0;
open my $count_fh, '<', $file;
while (my $line = <$count_fh>) {
    $line_count++;
}
close $count_fh;

# Fold sizes, divided as evenly as possible (array ref with $k entries).
my $fold_sizes = split_n($line_count, $k);

# --- write one learn/test pair per fold -----------------------------------
# $from/$to delimit the 1-based, inclusive line range that forms the test
# data of the current fold; every other line goes to the learn file.
my $from = 1;
for my $i (0 .. $k - 1) {
    # An empty fold (more folds than lines) yields $to < $from, so no line
    # matches the range and the test file is simply left empty.
    my $to = $from + $fold_sizes->[$i] - 1;

    my $learnf = sprintf("%02d-learn", $i + 1);
    my $testf  = sprintf("%02d-test", $i + 1);

    # autodie makes every open/close below throw on failure.
    open my $in,  '<', $file;
    open my $fhl, '>', $learnf;
    open my $fht, '>', $testf;

    my $curpos = 1;
    while (my $line = <$in>) {
        my $out = ($curpos >= $from && $curpos <= $to) ? $fht : $fhl;
        print {$out} $line;
        $curpos++;
    }

    close $in;
    close $fhl;
    close $fht;

    # The next fold's test range starts right after this one.
    $from = $to + 1;
}
# split_n($total, $buckets)
#
# Distribute $total items over $buckets groups as evenly as possible.
# Every group receives int($total / $buckets) items and the first
# ($total % $buckets) groups receive one additional item.
#
# Returns a reference to an array holding the size of each group, in order.
sub split_n {
    my ($total, $buckets) = @_;

    my $base  = int($total / $buckets);
    my $extra = $total % $buckets;

    # Groups with an index below the remainder take one of the leftovers.
    my @sizes = map { $_ < $extra ? $base + 1 : $base } 0 .. $buckets - 1;

    return \@sizes;
}
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment