Created
September 29, 2016 13:56
-
-
Save kizashi1122/44ee35ea5bcc090b0f54fd1edbba3ade to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
#
# Split a data file as evenly as possible for n-fold cross validation.
#
# Usage:
#   ./split_file_evenly.pl data_file 10
#
# In this case, 10 training and test sets are generated.
#
use strict;
use warnings;
use autodie qw/open close/;
use feature 'say';

# Command-line arguments: the data file to split and the number of folds
# (10 when the second argument is omitted or false).
my $file = shift;
die "Usage: $0 data_file [folds]\n"
    unless defined $file && length $file;

my $k = shift || 10;
die "Number of folds must be a positive integer, got '$k'\n"
    unless $k =~ /\A[1-9][0-9]*\z/;

# Count the lines of the input file by reading it in 1 MiB chunks.
# The last byte seen is remembered so that a final line without a trailing
# newline is still counted; otherwise that line could never be assigned to
# a test set.
my $line_count = 0;
my $last_byte  = "\n";
open my $count_fh, '<', $file;
while (1) {
    my $bytes_read = sysread($count_fh, my $chunk, 2**20);
    # sysread is not covered by the autodie import list above, so a read
    # error has to be checked by hand.
    die "Cannot read '$file': $!\n" unless defined $bytes_read;
    last if $bytes_read == 0;

    $line_count += ($chunk =~ tr/\n//);
    $last_byte = substr($chunk, -1);
}
close $count_fh;
$line_count++ if $last_byte ne "\n";

# Sizes of the k test sets, divided as evenly as possible.
my $fold_sizes = split_n($line_count, $k);

# For each fold, lines $from .. $to (1-based, inclusive) form the test set
# and all remaining lines form the training set.
my $from = 1;
for my $i (0 .. $k - 1) {
    # A fold of size 0 gives $to == $from - 1, i.e. an empty test range.
    my $to = $from + $fold_sizes->[$i] - 1;

    my $learn_file = sprintf("%02d-learn", $i + 1);
    my $test_file  = sprintf("%02d-test",  $i + 1);

    open my $in_fh,    '<', $file;
    open my $learn_fh, '>', $learn_file;
    open my $test_fh,  '>', $test_file;

    my $line_no = 0;
    while (my $line = <$in_fh>) {
        $line_no++;
        if ($line_no >= $from && $line_no <= $to) {
            print {$test_fh} $line;
        }
        else {
            print {$learn_fh} $line;
        }
    }

    close $in_fh;
    close $learn_fh;
    close $test_fh;

    # The next fold's test range starts right after this one.
    $from = $to + 1;
}
# Divide a total into $k parts whose sizes differ by at most one.
#
# Arguments:
#   $l - total number of items (e.g. lines) to distribute; must not be negative
#   $k - number of parts; must be a positive integer
#
# Returns a reference to an array of $k part sizes. Every part gets
# int($l / $k) items and the first ($l % $k) parts get one extra, so the
# sizes sum to $l and larger parts always come first.
#
# Dies with a descriptive message when $k is not a positive integer (instead
# of failing with "Illegal division by zero") or when $l is negative.
sub split_n {
    my ($total, $part_count) = @_;

    die "split_n: number of parts must be a positive integer\n"
        unless defined $part_count && $part_count =~ /\A[1-9][0-9]*\z/;
    die "split_n: total must not be negative\n"
        unless defined $total && $total >= 0;

    my $base  = int($total / $part_count);
    my $extra = $total % $part_count;

    # Start with equal shares, then hand out the remainder one item at a
    # time to the leading parts. When $extra is 0 the range is empty.
    my @sizes = ($base) x $part_count;
    $sizes[$_]++ for 0 .. $extra - 1;

    return \@sizes;
}
__END__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment