nathanhaigh/gff3_split.pl

## gff3_split.pl
#!/usr/bin/env perl
use strict;
use warnings;

my $bed_file = shift @ARGV;
my $suffix_fmt = '_part%01d';

open(my $bed_fh, '<', $bed_file) || die "Couldn't open BED file: $!\n";

my %cut_positions;
my %current_position;

# Process the BED file so we have coordinates at which to cut the pseudomolecule
while(<$bed_fh>) {
  chomp;
  my @cols = split /\t/;

  $current_position{$cols[3]} += $cols[2];

  push @{$cut_positions{$cols[3]}}, $current_position{$cols[3]};
}
close $bed_fh;


my %current_part_idx;
my %current_offset;
while (<>) {
  chomp;
  if (/^##sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/) {
    # We have sequence-region pragmas, we'll use them and modify them according to the required cuts
    my $id = $1;
    my $start = $2;
    my $end = $3;
    $current_part_idx{$id} = 0;
    $current_offset{$id} = 0;
    #print "$id\t$start\t$end\n";

    if (exists $cut_positions{$id}) {
      # we have cut positions for this sequence
      if ($cut_positions{$id}[-1] == $end) {
        # we have a cut defined at the end of the landmark, remove it
        pop @{$cut_positions{$id}}
      }

      if (scalar(@{$cut_positions{$id}}) == 0) {
        # No cuts defined within the landmark, no processing required for this sequence-region
        print "##sequence-region $id $start $end\n";
        next;
      }

      for (my $i=0; $i < scalar(@{$cut_positions{$id}}); $i++) {
        my $part_end = $i > 0 ? $cut_positions{$id}[$i] - $cut_positions{$id}[$i-1] : $cut_positions{$id}[$i];
        printf "##sequence-region %s$suffix_fmt %d %d\n",
          $id,
          $i+1,
          1,
          $part_end;
        if ($i == scalar(@{$cut_positions{$id}})-1) {
          # last part needs flushing
          printf "##sequence-region %s$suffix_fmt %d %d\n",
            $id,
            $i+2,
            1,
            $end-$part_end;
        }
      }
    }
    next;

  } elsif (/^#/) {
    # print lines starting with '#' without modification
    print "$_\n";
    next;
  }

  my @cols = split /\t/;

  # Only redefine the seqid, start and end, if we are actually cutting up the landmark
  if (scalar(@{$cut_positions{$cols[0]}}) > 0) {
    if (defined $cut_positions{$cols[0]}[$current_part_idx{$cols[0]}] && $cols[3] >= $cut_positions{$cols[0]}[$current_part_idx{$cols[0]}] ) {
      # Once beyond the current part, adjust the part idx and bp offset from the start of the landmark for future calculations
      $current_offset{$cols[0]}+=$cut_positions{$cols[0]}[$current_part_idx{$cols[0]}];
      $current_part_idx{$cols[0]}++;
    }
    # Modify the seqid, start and end coordinates appropriately
    $cols[3] -= $current_offset{$cols[0]};
    $cols[4] -= $current_offset{$cols[0]};
    $cols[0] = sprintf "%s$suffix_fmt", $cols[0], $current_part_idx{$cols[0]}+1;
  }

  # print out the feature
  print join("\t", @cols), "\n";
}

## test.bed
chr1_part1	0	3	chr1	0	3
chr1_part2	0	5	chr1	3	8
chrUn	0	10	chrUn	0	10

## test.gff3
##gff-version 3
##sequence-region chr1 1 8
##sequence-region chrUn 1 10
chr1	source	match	1	2	.	.	.	ID=Match_first_2_bases_part1
chr1	source	match	1	3	.	.	.	ID=Match_all_bases_part1
chr1	source	match	2	4	.	.	.	ID=Match_overlapping_split
chr1	source	match	4	5	.	.	.	ID=Match_first_2_bases_part2
chr1	source	match	4	8	.	.	.	ID=Match_all_bases_part2
chrUn	source	match	1	10	.	.	.	ID=Match_all_bases_unsplit
	#!/usr/bin/env perl
	use strict;
	use warnings;

	my $bed_file = shift @ARGV;
	my $suffix_fmt = '_part%01d';

	open(my $bed_fh, '<', $bed_file) \|\| die "Couldn't open BED file: $!\n";

	my %cut_positions;
	my %current_position;

	# Process the BED file so we have coordinates at which to cut the pseudomolecule
	while(<$bed_fh>) {
	chomp;
	my @cols = split /\t/;

	$current_position{$cols[3]} += $cols[2];

	push @{$cut_positions{$cols[3]}}, $current_position{$cols[3]};
	}
	close $bed_fh;


	my %current_part_idx;
	my %current_offset;
	while (<>) {
	chomp;
	if (/^##sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/) {
	# We have sequence-region pragmas, we'll use them and modify them according to the required cuts
	my $id = $1;
	my $start = $2;
	my $end = $3;
	$current_part_idx{$id} = 0;
	$current_offset{$id} = 0;
	#print "$id\t$start\t$end\n";

	if (exists $cut_positions{$id}) {
	# we have cut positions for this sequence
	if ($cut_positions{$id}[-1] == $end) {
	# we have a cut defined at the end of the landmark, remove it
	pop @{$cut_positions{$id}}
	}

	if (scalar(@{$cut_positions{$id}}) == 0) {
	# No cuts defined within the landmark, no processing required for this sequence-region
	print "##sequence-region $id $start $end\n";
	next;
	}

	for (my $i=0; $i < scalar(@{$cut_positions{$id}}); $i++) {
	my $part_end = $i > 0 ? $cut_positions{$id}[$i] - $cut_positions{$id}[$i-1] : $cut_positions{$id}[$i];
	printf "##sequence-region %s$suffix_fmt %d %d\n",
	$id,
	$i+1,
	1,
	$part_end;
	if ($i == scalar(@{$cut_positions{$id}})-1) {
	# last part needs flushing
	printf "##sequence-region %s$suffix_fmt %d %d\n",
	$id,
	$i+2,
	1,
	$end-$part_end;
	}
	}
	}
	next;

	} elsif (/^#/) {
	# print lines starting with '#' without modification
	print "$_\n";
	next;
	}

	my @cols = split /\t/;

	# Only redefine the seqid, start and end, if we are actually cutting up the landmark
	if (scalar(@{$cut_positions{$cols[0]}}) > 0) {
	if (defined $cut_positions{$cols[0]}[$current_part_idx{$cols[0]}] && $cols[3] >= $cut_positions{$cols[0]}[$current_part_idx{$cols[0]}] ) {
	# Once beyond the current part, adjust the part idx and bp offset from the start of the landmark for future calculations
	$current_offset{$cols[0]}+=$cut_positions{$cols[0]}[$current_part_idx{$cols[0]}];
	$current_part_idx{$cols[0]}++;
	}
	# Modify the seqid, start and end coordinates appropriately
	$cols[3] -= $current_offset{$cols[0]};
	$cols[4] -= $current_offset{$cols[0]};
	$cols[0] = sprintf "%s$suffix_fmt", $cols[0], $current_part_idx{$cols[0]}+1;
	}

	# print out the feature
	print join("\t", @cols), "\n";
	}
	chr1_part1 0 3 chr1 0 3
	chr1_part2 0 5 chr1 3 8
	chrUn 0 10 chrUn 0 10
	##gff-version 3
	##sequence-region chr1 1 8
	##sequence-region chrUn 1 10
	chr1 source match 1 2 . . . ID=Match_first_2_bases_part1
	chr1 source match 1 3 . . . ID=Match_all_bases_part1
	chr1 source match 2 4 . . . ID=Match_overlapping_split
	chr1 source match 4 5 . . . ID=Match_first_2_bases_part2
	chr1 source match 4 8 . . . ID=Match_all_bases_part2
	chrUn source match 1 10 . . . ID=Match_all_bases_unsplit