Created
July 1, 2014 09:59
-
-
Save dbolser-ebi/27ddc1190a905c83f392 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl | |
use strict; | |
use warnings; | |
## Easy manipulation of sets of integers (arbitrary intervals) | |
use Set::IntRange; | |
## Files | |
my $gene_ranges = "urartu_gene_ranges"; | |
#my $gene_ranges = "tauschii_gene_ranges"; | |
my $atac_result = "urartuvstauschii.atac.uids"; | |
open G, '<', $gene_ranges or die; | |
open A, '<', $atac_result or die; | |
my %gene_sets; | |
my $gene_space; | |
while(<G>){ | |
chomp; | |
my ($id, $st, $en) = split/\s+/; | |
if(!exists $gene_sets{$id}){ | |
$gene_sets{$id} = Set::IntRange->new(1, $en); | |
$gene_sets{$id}->Interval_Fill($st, $en); | |
} | |
else{ | |
if($en>abs($gene_sets{$id})){ | |
$gene_sets{$id}->Resize(1, $en); | |
} | |
$gene_sets{$id}->Interval_Fill($st, $en); | |
} | |
$gene_space += $en-$st+1; | |
} | |
warn "loaded genes on ", scalar keys %gene_sets, " seq-regions\n"; | |
warn "gene space is ", $gene_space, " bp\n"; | |
my %atac_sets; | |
my $atac_space; | |
while(<A>){ | |
## Only want to look at match lines of this type | |
#next unless /^M u /; | |
next unless /^M r /; | |
chomp; | |
my @row = split/\s+/; | |
my $id1 = $row[4]; | |
my $st1 = $row[5]+1; | |
my $en1 = $row[5]+$row[6]; | |
die unless $row[7] eq 1; | |
my $id2 = $row[8]; | |
my $st2 = $row[9]+1; | |
my $en2 = $row[9]+$row[10]; | |
my $ori = $row[11]; | |
if(!exists $atac_sets{$id1}){ | |
$atac_sets{$id1} = Set::IntRange->new(1, $en1); | |
$atac_sets{$id1}->Interval_Fill($st1, $en1); | |
} | |
else{ | |
if($en1>abs($atac_sets{$id1})){ | |
$atac_sets{$id1}->Resize(1, $en1); | |
} | |
$atac_sets{$id1}->Interval_Fill($st1, $en1); | |
} | |
if(!exists $atac_sets{$id2}){ | |
$atac_sets{$id2} = Set::IntRange->new(1, $en2); | |
$atac_sets{$id2}->Interval_Fill($st2, $en2); | |
} | |
else{ | |
if($en2>abs($atac_sets{$id2})){ | |
$atac_sets{$id2}->Resize(1, $en2); | |
} | |
$atac_sets{$id2}->Interval_Fill($st2, $en2); | |
} | |
$atac_space += $en1-$st1+1; | |
} | |
warn "loaded atacs on ", scalar keys %atac_sets, " seq-regions\n"; | |
warn "atac space is ", $atac_space, " bp\n"; | |
## What is the gene coverage? | |
my $coverage; | |
for my $id (keys %gene_sets){ | |
next unless exists $atac_sets{$id}; | |
my ($biggest) = | |
sort(abs($gene_sets{$id}), abs($atac_sets{$id})); | |
$gene_sets{$id}->Resize(1, $biggest); | |
$atac_sets{$id}->Resize(1, $biggest); | |
my $union = Set::IntRange->new(1, $biggest); | |
$union->Union($gene_sets{$id}, $atac_sets{$id}); | |
$coverage += $union->Norm; | |
} | |
print "coverage is ", $coverage, " bp\n"; | |
print $coverage / $gene_space * 100, "\n"; | |
__END__ | |
## Create two sets to play with | |
my $set_len = 1000000000; | |
my $set_a = Set::IntRange->new(1, $set_len); | |
my $set_b = Set::IntRange->new(1, $set_len); | |
## Create a few k 1_000 long ranges in both... | |
my $range_len = 1000; | |
for (1..5000){ | |
my $ra = int(rand($set_len-$range_len)); | |
$set_a->Interval_Fill($ra,$ra+$range_len); | |
my $rb = int(rand($set_len-$range_len)); | |
$set_b->Interval_Fill($rb,$rb+$range_len); | |
} | |
## Now find the size of the union | |
my $set_u = Set::IntRange->new(1, $set_len); | |
$set_u->Union($set_a, $set_b); | |
print $set_u->Norm, "\n"; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment