Last active
June 8, 2017 01:33
-
-
Save nathanhaigh/f43eb593c873a6164962c4b7f7ec0509 to your computer and use it in GitHub Desktop.
Perl script to perform coordinate transformation on GFF3 files when reference sequences are split into multiple pieces.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
my $bed_file = shift @ARGV; | |
my $suffix_fmt = '_part%01d'; | |
open(my $bed_fh, '<', $bed_file) || die "Couldn't open BED file: $!\n"; | |
my %cut_positions; | |
my %current_position; | |
# Process the BED file so we have coordinates at which to cut the pseudomolecule | |
while(<$bed_fh>) { | |
chomp; | |
my @cols = split /\t/; | |
$current_position{$cols[3]} += $cols[2]; | |
push @{$cut_positions{$cols[3]}}, $current_position{$cols[3]}; | |
} | |
close $bed_fh; | |
my %current_part_idx; | |
my %current_offset; | |
while (<>) { | |
chomp; | |
if (/^##sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/) { | |
# We have sequence-region pragmas, we'll use them and modify them according to the required cuts | |
my $id = $1; | |
my $start = $2; | |
my $end = $3; | |
$current_part_idx{$id} = 0; | |
$current_offset{$id} = 0; | |
#print "$id\t$start\t$end\n"; | |
if (exists $cut_positions{$id}) { | |
# we have cut positions for this sequence | |
if ($cut_positions{$id}[-1] == $end) { | |
# we have a cut defined at the end of the landmark, remove it | |
pop @{$cut_positions{$id}} | |
} | |
if (scalar(@{$cut_positions{$id}}) == 0) { | |
# No cuts defined within the landmark, no processing required for this sequence-region | |
print "##sequence-region $id $start $end\n"; | |
next; | |
} | |
for (my $i=0; $i < scalar(@{$cut_positions{$id}}); $i++) { | |
my $part_end = $i > 0 ? $cut_positions{$id}[$i] - $cut_positions{$id}[$i-1] : $cut_positions{$id}[$i]; | |
printf "##sequence-region %s$suffix_fmt %d %d\n", | |
$id, | |
$i+1, | |
1, | |
$part_end; | |
if ($i == scalar(@{$cut_positions{$id}})-1) { | |
# last part needs flushing | |
printf "##sequence-region %s$suffix_fmt %d %d\n", | |
$id, | |
$i+2, | |
1, | |
$end-$part_end; | |
} | |
} | |
} | |
next; | |
} elsif (/^#/) { | |
# print lines starting with '#' without modification | |
print "$_\n"; | |
next; | |
} | |
my @cols = split /\t/; | |
# Only redefine the seqid, start and end, if we are actually cutting up the landmark | |
if (scalar(@{$cut_positions{$cols[0]}}) > 0) { | |
if (defined $cut_positions{$cols[0]}[$current_part_idx{$cols[0]}] && $cols[3] >= $cut_positions{$cols[0]}[$current_part_idx{$cols[0]}] ) { | |
# Once beyond the current part, adjust the part idx and bp offset from the start of the landmark for future calculations | |
$current_offset{$cols[0]}+=$cut_positions{$cols[0]}[$current_part_idx{$cols[0]}]; | |
$current_part_idx{$cols[0]}++; | |
} | |
# Modify the seqid, start and end coordinates appropriately | |
$cols[3] -= $current_offset{$cols[0]}; | |
$cols[4] -= $current_offset{$cols[0]}; | |
$cols[0] = sprintf "%s$suffix_fmt", $cols[0], $current_part_idx{$cols[0]}+1; | |
} | |
# print out the feature | |
print join("\t", @cols), "\n"; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
chr1_part1 0 3 chr1 0 3 | |
chr1_part2 0 5 chr1 3 8 | |
chrUn 0 10 chrUn 0 10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##gff-version 3 | |
##sequence-region chr1 1 8 | |
##sequence-region chrUn 1 10 | |
chr1 source match 1 2 . . . ID=Match_first_2_bases_part1 | |
chr1 source match 1 3 . . . ID=Match_all_bases_part1 | |
chr1 source match 2 4 . . . ID=Match_overlapping_split | |
chr1 source match 4 5 . . . ID=Match_first_2_bases_part2 | |
chr1 source match 4 8 . . . ID=Match_all_bases_part2 | |
chrUn source match 1 10 . . . ID=Match_all_bases_unsplit |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment