robert-b-clarke/gtfs-splitter.pl

## gtfs-splitter.pl
=head1 NAME

gtfs-splitter.pl

=head1 DESCRIPTION

Hastily assembled and extremely unguaranteed Perl script for dividing a GTFS feed into smaller per-agency feeds

=head1 SYNOPSIS

  from command line

  > unzip BigGTFS.ZIP
  > perl gtfs-splitter.pl BigGTFS SmallerGTFS

  #then zip up the individual gtfs directories

=cut


#!/usr/bin/perl
use Modern::Perl;
use Text::CSV_XS;
use Data::Dumper;
use File::Path qw/mkpath/;
use File::Copy;

my %opts;    # not read anywhere in the visible script

# Positional arguments: the unzipped source feed directory, then the root
# directory that receives one sub-directory per agency.
my $container_path = $ARGV[0] or die "Need an input path";
my $output_path = $ARGV[1] or die "Need an output path";

#list of files we're gonna process; the order matters because each file's
#agency lookup relies on id mappings recorded while reading earlier files
my @filenames = ('agency.txt', 'routes.txt', 'trips.txt', 'calendar.txt', 'stop_times.txt', 'calendar_dates.txt','stops.txt');

#split GTFS files according to agency
my %route_to_agency = (); #route id to agency id mapping
my %trip_to_agency = (); #trip id to agency id mapping
my %service_to_agency = (); #service id to agency id mapping
my %stop_to_agency = (); #stopid to agency - this one contains arrays

my $csv_in = Text::CSV_XS->new();

foreach my $filename (@filenames) {
    my $filepath = join('/', ($container_path, $filename));
    open my $fh_in, '<', $filepath or die "can't open $filepath: $!";
    say "-----\nProcessing $filename\n----";

    my $header_line = <$fh_in>;
    unless (defined $header_line) {
        warn "$filename is empty, nothing to split";
        close $fh_in;
        next;
    }

    # Parse a BOM-free copy so the first column name is usable as a hash key;
    # the untouched header line is still what gets written to the outputs.
    (my $header_text = $header_line) =~ s/\A\xEF\xBB\xBF//;
    $csv_in->parse($header_text)
        or die "can't parse header of $filename: " . $csv_in->error_diag();
    my $headers = [$csv_in->fields];

    while (my $row_line = <$fh_in>) {
        next unless $row_line =~ /\S/;    # ignore blank lines
        unless ($csv_in->parse($row_line)) {
            warn "skipping unparseable line $. of $filename: " . $csv_in->error_diag();
            next;
        }
        my $row = [$csv_in->fields];

        # A row may belong to several agencies (e.g. a shared stop), in which
        # case it is written to each agency's copy of the file.
        my @agencies = guess_agency($headers, $row, $filename);
        foreach my $agency (@agencies) {
            my $target_dir = agency_dir($output_path, $agency);
            my $output_fh = output_fh($target_dir, $filename, $header_line);
            print {$output_fh} $row_line;
        }
    }
    close $fh_in;
}

#just copy feed_info.txt into every agency directory; the file is optional in
#GTFS, so a feed without one is not an error
my $feed_info_path = join('/', ($container_path, 'feed_info.txt'));
my %agencies_hash = reverse %route_to_agency;    # agency ids become the keys, collapsing duplicates
if (-e $feed_info_path) {
    foreach my $agency (keys %agencies_hash) {
        my $target_dir = agency_dir($output_path, $agency);
        my $agency_feed_info = join '/', ($target_dir, 'feed_info.txt');
        copy($feed_info_path, $agency_feed_info)
            or warn "can't copy $feed_info_path to $agency_feed_info: $!";
    }
}

#warn Dumper(\%stop_to_agency);
exit();

# Work out which agency (or agencies) one parsed GTFS row belongs to, and
# record the id mappings that rows of later files are resolved through.
#
# Arguments:
#   $headers  - array ref of column names taken from the file's header line
#   $row      - array ref of field values, positionally matching $headers
#   $filename - name of the GTFS file the row came from (e.g. 'trips.txt')
#
# Returns a list of agency ids: one id for most rows, every agency serving the
# stop for a row resolved through stop_id, and an empty list (after a warning)
# when no agency can be determined.
#
# Side effects: updates the file-level %route_to_agency, %trip_to_agency,
# %service_to_agency and %stop_to_agency hashes.
#
# Dies with "no agency" when a trips.txt row names a route that has no agency.
sub guess_agency {
    my ($headers, $row, $filename) = @_;

    # Column name => field value for this row.
    my %record = ();
    $record{ $headers->[$_] } = $row->[$_] for 0 .. $#{$headers};

    if ($filename eq 'agency.txt') {
        return $record{agency_id};
    }

    if ($filename eq 'routes.txt') {
        $route_to_agency{ $record{route_id} } = $record{agency_id};
        return $record{agency_id};
    }

    if ($filename eq 'trips.txt') {
        my $agency_id = $route_to_agency{ $record{route_id} };
        # Test for a usable id rather than truthiness so agency id "0" works.
        die "no agency" unless (defined $agency_id && length $agency_id);
        $trip_to_agency{ $record{trip_id} } = $agency_id;
        # NOTE(review): a service_id shared by trips of several agencies keeps
        # only the agency of the last such trip read.
        $service_to_agency{ $record{service_id} } = $agency_id;
        return $agency_id;
    }

    if ($filename eq 'stop_times.txt') {
        my $trip_agency = $trip_to_agency{ $record{trip_id} };
        unless (defined $trip_agency) {
            # Unknown trip: do not record an undef agency against the stop.
            warn "can't process record from $filename";
            return;
        }
        my $stop_agencies = $stop_to_agency{ $record{stop_id} } //= [];
        push @$stop_agencies, $trip_agency
            unless grep { $_ eq $trip_agency } @$stop_agencies;
        return $trip_agency;
    }

    # Any other file: try trip_id, then service_id, then stop_id.
    my ($trip_id, $service_id, $stop_id) = @record{qw(trip_id service_id stop_id)};
    if (defined $trip_id && defined $trip_to_agency{$trip_id}) {
        return $trip_to_agency{$trip_id};
    }
    if (defined $service_id && defined $service_to_agency{$service_id}) {
        return $service_to_agency{$service_id};
    }
    if (defined $stop_id && defined $stop_to_agency{$stop_id}) {
        return @{ $stop_to_agency{$stop_id} };
    }

    #warn "can't process record from $filename with details ".Dumper(\%record);
    warn "can't process record from $filename";
    return;
}

# Join two path segments with '/'.  Call sites pass the output root first and
# the agency id second, giving "<root>/<agency>".
sub agency_dir {
    my ($parent_dir, $child_name) = @_;
    my @segments = ($parent_dir, $child_name);
    return join('/', @segments);
}

# Open one agency's copy of a GTFS file for appending, creating it first (with
# the header line) when it does not exist yet.
#
# Arguments:
#   $dir         - directory holding the agency's files; created if missing
#   $filename    - file name within $dir (e.g. 'trips.txt')
#   $header_line - header line, written verbatim once to a newly created file
#
# Returns a write filehandle positioned at the end of the file.
# Dies if the file cannot be opened.
sub output_fh {
    my ($dir, $filename, $header_line) = @_;
    mkpath($dir); #make dir if we don't have it already

    my $output_path = join('/', ($dir, $filename));

    # Decide up front whether the header is needed.  A plain existence test
    # avoids opening a throwaway read handle, and unlike a failed read-open it
    # cannot mistake an existing-but-unreadable file for a missing one (which
    # would then be truncated).
    my $is_new_file = !-e $output_path;

    open my $fh, '>>', $output_path or die "can't open $output_path: $!";
    print {$fh} $header_line if $is_new_file;
    return $fh;
}
	=head1 NAME

	gtfs-splitter.pl

	=head1 DESCRIPTION

	Hastily assembled and extremely unguaranteed Perl script for dividing a GTFS feed into smaller per-agency feeds

	=head1 SYNOPSIS

	from command line

	> unzip BigGTFS.ZIP
	> perl gtfs-splitter.pl BigGTFS SmallerGTFS

	#then zip up the individual gtfs directories

	=cut


	#!/usr/bin/perl
	use Modern::Perl;
	use Text::CSV_XS;
	use Data::Dumper;
	use File::Path qw/mkpath/;
	use File::Copy;

	my %opts;    # not read anywhere in the visible script

	# Positional arguments: the unzipped source feed directory, then the root
	# directory that receives one sub-directory per agency.
	my $container_path = $ARGV[0] or die "Need an input path";
	my $output_path = $ARGV[1] or die "Need an output path";

	#list of files we're gonna process; the order matters because each file's
	#agency lookup relies on id mappings recorded while reading earlier files
	my @filenames = ('agency.txt', 'routes.txt', 'trips.txt', 'calendar.txt', 'stop_times.txt', 'calendar_dates.txt','stops.txt');

	#split GTFS files according to agency
	my %route_to_agency = (); #route id to agency id mapping
	my %trip_to_agency = (); #trip id to agency id mapping
	my %service_to_agency = (); #service id to agency id mapping
	my %stop_to_agency = (); #stopid to agency - this one contains arrays

	my $csv_in = Text::CSV_XS->new();

	foreach my $filename (@filenames) {
	    my $filepath = join('/', ($container_path, $filename));
	    open my $fh_in, '<', $filepath or die "can't open $filepath: $!";
	    say "-----\nProcessing $filename\n----";

	    my $header_line = <$fh_in>;
	    unless (defined $header_line) {
	        warn "$filename is empty, nothing to split";
	        close $fh_in;
	        next;
	    }

	    # Parse a BOM-free copy so the first column name is usable as a hash key;
	    # the untouched header line is still what gets written to the outputs.
	    (my $header_text = $header_line) =~ s/\A\xEF\xBB\xBF//;
	    $csv_in->parse($header_text)
	        or die "can't parse header of $filename: " . $csv_in->error_diag();
	    my $headers = [$csv_in->fields];

	    while (my $row_line = <$fh_in>) {
	        next unless $row_line =~ /\S/;    # ignore blank lines
	        unless ($csv_in->parse($row_line)) {
	            warn "skipping unparseable line $. of $filename: " . $csv_in->error_diag();
	            next;
	        }
	        my $row = [$csv_in->fields];

	        # A row may belong to several agencies (e.g. a shared stop), in which
	        # case it is written to each agency's copy of the file.
	        my @agencies = guess_agency($headers, $row, $filename);
	        foreach my $agency (@agencies) {
	            my $target_dir = agency_dir($output_path, $agency);
	            my $output_fh = output_fh($target_dir, $filename, $header_line);
	            print {$output_fh} $row_line;
	        }
	    }
	    close $fh_in;
	}

	#just copy feed_info.txt into every agency directory; the file is optional in
	#GTFS, so a feed without one is not an error
	my $feed_info_path = join('/', ($container_path, 'feed_info.txt'));
	my %agencies_hash = reverse %route_to_agency;    # agency ids become the keys, collapsing duplicates
	if (-e $feed_info_path) {
	    foreach my $agency (keys %agencies_hash) {
	        my $target_dir = agency_dir($output_path, $agency);
	        my $agency_feed_info = join '/', ($target_dir, 'feed_info.txt');
	        copy($feed_info_path, $agency_feed_info)
	            or warn "can't copy $feed_info_path to $agency_feed_info: $!";
	    }
	}

	#warn Dumper(\%stop_to_agency);
	exit();

	# Work out which agency (or agencies) one parsed GTFS row belongs to, and
	# record the id mappings that rows of later files are resolved through.
	#
	# Arguments:
	#   $headers  - array ref of column names taken from the file's header line
	#   $row      - array ref of field values, positionally matching $headers
	#   $filename - name of the GTFS file the row came from (e.g. 'trips.txt')
	#
	# Returns a list of agency ids: one id for most rows, every agency serving the
	# stop for a row resolved through stop_id, and an empty list (after a warning)
	# when no agency can be determined.
	#
	# Side effects: updates the file-level %route_to_agency, %trip_to_agency,
	# %service_to_agency and %stop_to_agency hashes.
	#
	# Dies with "no agency" when a trips.txt row names a route that has no agency.
	sub guess_agency {
	    my ($headers, $row, $filename) = @_;

	    # Column name => field value for this row.
	    my %record = ();
	    $record{ $headers->[$_] } = $row->[$_] for 0 .. $#{$headers};

	    if ($filename eq 'agency.txt') {
	        return $record{agency_id};
	    }

	    if ($filename eq 'routes.txt') {
	        $route_to_agency{ $record{route_id} } = $record{agency_id};
	        return $record{agency_id};
	    }

	    if ($filename eq 'trips.txt') {
	        my $agency_id = $route_to_agency{ $record{route_id} };
	        # Test for a usable id rather than truthiness so agency id "0" works.
	        die "no agency" unless (defined $agency_id && length $agency_id);
	        $trip_to_agency{ $record{trip_id} } = $agency_id;
	        # NOTE(review): a service_id shared by trips of several agencies keeps
	        # only the agency of the last such trip read.
	        $service_to_agency{ $record{service_id} } = $agency_id;
	        return $agency_id;
	    }

	    if ($filename eq 'stop_times.txt') {
	        my $trip_agency = $trip_to_agency{ $record{trip_id} };
	        unless (defined $trip_agency) {
	            # Unknown trip: do not record an undef agency against the stop.
	            warn "can't process record from $filename";
	            return;
	        }
	        my $stop_agencies = $stop_to_agency{ $record{stop_id} } //= [];
	        push @$stop_agencies, $trip_agency
	            unless grep { $_ eq $trip_agency } @$stop_agencies;
	        return $trip_agency;
	    }

	    # Any other file: try trip_id, then service_id, then stop_id.
	    my ($trip_id, $service_id, $stop_id) = @record{qw(trip_id service_id stop_id)};
	    if (defined $trip_id && defined $trip_to_agency{$trip_id}) {
	        return $trip_to_agency{$trip_id};
	    }
	    if (defined $service_id && defined $service_to_agency{$service_id}) {
	        return $service_to_agency{$service_id};
	    }
	    if (defined $stop_id && defined $stop_to_agency{$stop_id}) {
	        return @{ $stop_to_agency{$stop_id} };
	    }

	    #warn "can't process record from $filename with details ".Dumper(\%record);
	    warn "can't process record from $filename";
	    return;
	}

	# Join two path segments with '/'.  Call sites pass the output root first and
	# the agency id second, giving "<root>/<agency>".
	sub agency_dir {
	    my ($parent_dir, $child_name) = @_;
	    my @segments = ($parent_dir, $child_name);
	    return join('/', @segments);
	}

	# Open one agency's copy of a GTFS file for appending, creating it first (with
	# the header line) when it does not exist yet.
	#
	# Arguments:
	#   $dir         - directory holding the agency's files; created if missing
	#   $filename    - file name within $dir (e.g. 'trips.txt')
	#   $header_line - header line, written verbatim once to a newly created file
	#
	# Returns a write filehandle positioned at the end of the file.
	# Dies if the file cannot be opened.
	sub output_fh {
	    my ($dir, $filename, $header_line) = @_;
	    mkpath($dir); #make dir if we don't have it already

	    my $output_path = join('/', ($dir, $filename));

	    # Decide up front whether the header is needed.  A plain existence test
	    # avoids opening a throwaway read handle, and unlike a failed read-open it
	    # cannot mistake an existing-but-unreadable file for a missing one (which
	    # would then be truncated).
	    my $is_new_file = !-e $output_path;

	    open my $fh, '>>', $output_path or die "can't open $output_path: $!";
	    print {$fh} $header_line if $is_new_file;
	    return $fh;
	}