Skip to content

Instantly share code, notes, and snippets.

@dbolser-ebi
Created July 1, 2014 09:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dbolser-ebi/27ddc1190a905c83f392 to your computer and use it in GitHub Desktop.
Save dbolser-ebi/27ddc1190a905c83f392 to your computer and use it in GitHub Desktop.
#!perl
use strict;
use warnings;
## Easy manipulation of sets of integers (arbitrary intervals)
use Set::IntRange;
## Files
my $gene_ranges = "urartu_gene_ranges";
#my $gene_ranges = "tauschii_gene_ranges";
my $atac_result = "urartuvstauschii.atac.uids";
open G, '<', $gene_ranges or die;
open A, '<', $atac_result or die;
my %gene_sets;
my $gene_space;
while(<G>){
chomp;
my ($id, $st, $en) = split/\s+/;
if(!exists $gene_sets{$id}){
$gene_sets{$id} = Set::IntRange->new(1, $en);
$gene_sets{$id}->Interval_Fill($st, $en);
}
else{
if($en>abs($gene_sets{$id})){
$gene_sets{$id}->Resize(1, $en);
}
$gene_sets{$id}->Interval_Fill($st, $en);
}
$gene_space += $en-$st+1;
}
warn "loaded genes on ", scalar keys %gene_sets, " seq-regions\n";
warn "gene space is ", $gene_space, " bp\n";
my %atac_sets;
my $atac_space;
while(<A>){
## Only want to look at match lines of this type
#next unless /^M u /;
next unless /^M r /;
chomp;
my @row = split/\s+/;
my $id1 = $row[4];
my $st1 = $row[5]+1;
my $en1 = $row[5]+$row[6];
die unless $row[7] eq 1;
my $id2 = $row[8];
my $st2 = $row[9]+1;
my $en2 = $row[9]+$row[10];
my $ori = $row[11];
if(!exists $atac_sets{$id1}){
$atac_sets{$id1} = Set::IntRange->new(1, $en1);
$atac_sets{$id1}->Interval_Fill($st1, $en1);
}
else{
if($en1>abs($atac_sets{$id1})){
$atac_sets{$id1}->Resize(1, $en1);
}
$atac_sets{$id1}->Interval_Fill($st1, $en1);
}
if(!exists $atac_sets{$id2}){
$atac_sets{$id2} = Set::IntRange->new(1, $en2);
$atac_sets{$id2}->Interval_Fill($st2, $en2);
}
else{
if($en2>abs($atac_sets{$id2})){
$atac_sets{$id2}->Resize(1, $en2);
}
$atac_sets{$id2}->Interval_Fill($st2, $en2);
}
$atac_space += $en1-$st1+1;
}
warn "loaded atacs on ", scalar keys %atac_sets, " seq-regions\n";
warn "atac space is ", $atac_space, " bp\n";
## What is the gene coverage?
my $coverage;
for my $id (keys %gene_sets){
next unless exists $atac_sets{$id};
my ($biggest) =
sort(abs($gene_sets{$id}), abs($atac_sets{$id}));
$gene_sets{$id}->Resize(1, $biggest);
$atac_sets{$id}->Resize(1, $biggest);
my $union = Set::IntRange->new(1, $biggest);
$union->Union($gene_sets{$id}, $atac_sets{$id});
$coverage += $union->Norm;
}
print "coverage is ", $coverage, " bp\n";
print $coverage / $gene_space * 100, "\n";
__END__
## Create two sets to play with
my $set_len = 1000000000;
my $set_a = Set::IntRange->new(1, $set_len);
my $set_b = Set::IntRange->new(1, $set_len);
## Create a few k 1_000 long ranges in both...
my $range_len = 1000;
for (1..5000){
my $ra = int(rand($set_len-$range_len));
$set_a->Interval_Fill($ra,$ra+$range_len);
my $rb = int(rand($set_len-$range_len));
$set_b->Interval_Fill($rb,$rb+$range_len);
}
## Now find the size of the union
my $set_u = Set::IntRange->new(1, $set_len);
$set_u->Union($set_a, $set_b);
print $set_u->Norm, "\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment