andrewyatz/synteny_to_psl.pl

## synteny_to_psl.pl
#!/bin/env perl

use strict;
use warnings;

use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Utils::Scalar qw(wrap_array);
use IO::String;

my $O = options();
my $DBA = database();
my $MLSS = mlss();
my $SYNTENY = synteny();
my $FH = fh();
print_synteny();

if(!$O->{file}) {
  print ${$FH->string_ref};
}

close($FH);

exit 0;

sub options {
  my $opts = {};
  my @flags = qw(
    ensembl ensemblgenomes database=s species=s@ version=i file=s help man
  );
  GetOptions($opts, @flags) or pod2usage(1);

  _exit( undef, 0, 1) if $opts->{help};
	_exit( undef, 0, 2) if $opts->{man};

	if($opts->{ensemblgenomes} && $opts->{ensembl}) {
	  _exit( 'Can only specify -ensembl or -ensemblgenomes', 1, 2);
	}

	my $species = wrap_array($opts->{species});
	if(scalar(@{$species}) != 2) {
	  _exit( 'Script requires two -species attributes', 1, 2);
	}

	$opts->{database} = 'multi' unless $opts->{database};

	return $opts;
}

sub database {
  my %args;

  if($O->{ensemblgenomes}) {
    %args = (
      -HOST => 'mysql.ebi.ac.uk', -PORT => 4157, -USER => 'anonymous'
    );
  }
  elsif($O->{ensembl}) {
    %args = (
      -HOST => 'ensembldb.ensembl.org', -PORT => 5306, -USER => 'anonymous'
    );
  }
  else {
    _exit('Not aware of which server I should connect to. Specify -ensembl or -ensemblgenomes', 2, 1);
  }

  if($O->{version}) {
    $args{-DB_VERSION} = $O->{version};
  }

  Bio::EnsEMBL::Registry->load_registry_from_db(%args);
  my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($O->{database}, 'compara');

  if(! defined $dba) {
    _exit($O->{database}.' did not result in a valid DB. Try again specifying a different -database parameter. Also check your -version switch and API version', 2, 1);
  }

  return $dba;
}

sub mlss {
  my $species = $O->{species};
  my $gdba = $DBA->get_GenomeDBAdaptor();
  my $mlssa = $DBA->get_MethodLinkSpeciesSetAdaptor();
  my @gdbs = map { $gdba->fetch_by_registry_name($_) } @{$species};
  my $mlss = $mlssa->fetch_by_method_link_type_GenomeDBs('SYNTENY', \@gdbs);
  return $mlss;
}

sub synteny {
  my $sra = $DBA->get_SyntenyRegionAdaptor();
  my $regions = $sra->fetch_all_by_MethodLinkSpeciesSet($MLSS);
  return $regions;
}

sub fh {
  if($O->{file}) {
    return IO::File->new($O->{file}, 'w');
  }
  return IO::String->new();
}

sub print_synteny {
  my ($source_gdb, $target_gdb) = @{$MLSS->species_set()};
  my $sname = $source_gdb->name();
  my $tname = $target_gdb->name();

  print $FH sprintf(
    'track name=synteny_%1$s description="%1$s %2$s synteny projections"',
    $sname, $tname);
  print $FH "\n";

  foreach my $s (@{$SYNTENY}) {
    my $regions = $s->get_all_DnaFragRegions();
    my ($source) = grep { $_->genome_db()->dbID() eq $source_gdb->dbID() } @{$regions};
    my ($target) = grep { $_->genome_db()->dbID() eq $target_gdb->dbID() } @{$regions};

    my $create_elements = sub {
      my ($frag) = @_;
      return (
        $frag->dnafrag()->name(), #name of region
        $frag->dnafrag()->length(), #length of region e.g. chr1 in human
        $frag->dnafrag_start(), #start of alignment
        $frag->dnafrag_end() #end of alignment
      );
    };

    my @data = (
      $source->length(),
      0, #misMatches
      0, #repMatches
      0, #nCount
      0, #qNumInsert
      0, #qBaseInsert
      0, #tNumInsert
      0, #tBaseInsert
      ( $source->dnafrag_strand() ) ? '+' : '-', #strand,
      $create_elements->($source), #query info
      $create_elements->($target), #target info
      1, # number of blocks
      $source->length(), #block sizes
      $source->dnafrag_start(), #query start blocks
      $target->dnafrag_start() #target start blocks
    );

    print $FH join(q{ }, @data), "\n";
  }

  return;
}

sub _exit {
  my ($msg, $status, $verbose) = @_;
  print STDERR $msg, "\n" if defined $msg;
  pod2usage( -exitstatus => $status, -verbose => $verbose );
}

__END__
=pod

=head1 NAME

list_synteny.pl

=head1 SYNOPSIS

	./list_synteny.pl -species human -species mouse -ensembl -database multi [see opts] [ --help | --man ]

=head1 DESCRIPTION

Returns a PSL to screen or to a specified file location

=head1 OPTIONS

=over 8

=item B<--species>

Accepts 2 invocations to specify the species to look for synteny between

=item B<--ensembl>

Specify to connect to the public ensembl database

=item B<--ensemblgenomes>

Specify to connect to the public ensembl genomes database

=item B<--database>

Specify the name of the database to connect to. If going to ensembl
use multi otherwise if connecting to ensembl genomes use one like
plants, metazoa, protists, bacteria or fungi (not all DBs have synteny
projections though).

=item B<--version>

Optional integer which can be used to force the databases we load. Useful if
your API does not match the DB you are connecting to.

=item B<--file>

Location of the file to write to. If not specified will write to STDOUT

=back

=head1 VERSION

1

=head1 LICENSE

 Copyright (c) 1999-2010 The European Bioinformatics Institute and
 Genome Research Limited.  All rights reserved.

 This software is distributed under a modified Apache license.
 For license details, please see

   http://www.ensembl.org/info/about/code_licence.html

=cut
	#!/bin/env perl

	use strict;
	use warnings;

	use Getopt::Long;
	use Pod::Usage;
	use Bio::EnsEMBL::Registry;
	use Bio::EnsEMBL::Utils::Scalar qw(wrap_array);
	use IO::String;

	my $O = options();
	my $DBA = database();
	my $MLSS = mlss();
	my $SYNTENY = synteny();
	my $FH = fh();
	print_synteny();

	if(!$O->{file}) {
	print ${$FH->string_ref};
	}

	close($FH);

	exit 0;

	sub options {
	my $opts = {};
	my @flags = qw(
	ensembl ensemblgenomes database=s species=s@ version=i file=s help man
	);
	GetOptions($opts, @flags) or pod2usage(1);

	_exit( undef, 0, 1) if $opts->{help};
	_exit( undef, 0, 2) if $opts->{man};

	if($opts->{ensemblgenomes} && $opts->{ensembl}) {
	_exit( 'Can only specify -ensembl or -ensemblgenomes', 1, 2);
	}

	my $species = wrap_array($opts->{species});
	if(scalar(@{$species}) != 2) {
	_exit( 'Script requires two -species attributes', 1, 2);
	}

	$opts->{database} = 'multi' unless $opts->{database};

	return $opts;
	}

	sub database {
	my %args;

	if($O->{ensemblgenomes}) {
	%args = (
	-HOST => 'mysql.ebi.ac.uk', -PORT => 4157, -USER => 'anonymous'
	);
	}
	elsif($O->{ensembl}) {
	%args = (
	-HOST => 'ensembldb.ensembl.org', -PORT => 5306, -USER => 'anonymous'
	);
	}
	else {
	_exit('Not aware of which server I should connect to. Specify -ensembl or -ensemblgenomes', 2, 1);
	}

	if($O->{version}) {
	$args{-DB_VERSION} = $O->{version};
	}

	Bio::EnsEMBL::Registry->load_registry_from_db(%args);
	my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($O->{database}, 'compara');

	if(! defined $dba) {
	_exit($O->{database}.' did not result in a valid DB. Try again specifying a different -database parameter. Also check your -version switch and API version', 2, 1);
	}

	return $dba;
	}

	sub mlss {
	my $species = $O->{species};
	my $gdba = $DBA->get_GenomeDBAdaptor();
	my $mlssa = $DBA->get_MethodLinkSpeciesSetAdaptor();
	my @gdbs = map { $gdba->fetch_by_registry_name($_) } @{$species};
	my $mlss = $mlssa->fetch_by_method_link_type_GenomeDBs('SYNTENY', \@gdbs);
	return $mlss;
	}

	sub synteny {
	my $sra = $DBA->get_SyntenyRegionAdaptor();
	my $regions = $sra->fetch_all_by_MethodLinkSpeciesSet($MLSS);
	return $regions;
	}

	sub fh {
	if($O->{file}) {
	return IO::File->new($O->{file}, 'w');
	}
	return IO::String->new();
	}

	sub print_synteny {
	my ($source_gdb, $target_gdb) = @{$MLSS->species_set()};
	my $sname = $source_gdb->name();
	my $tname = $target_gdb->name();

	print $FH sprintf(
	'track name=synteny_%1$s description="%1$s %2$s synteny projections"',
	$sname, $tname);
	print $FH "\n";

	foreach my $s (@{$SYNTENY}) {
	my $regions = $s->get_all_DnaFragRegions();
	my ($source) = grep { $_->genome_db()->dbID() eq $source_gdb->dbID() } @{$regions};
	my ($target) = grep { $_->genome_db()->dbID() eq $target_gdb->dbID() } @{$regions};

	my $create_elements = sub {
	my ($frag) = @_;
	return (
	$frag->dnafrag()->name(), #name of region
	$frag->dnafrag()->length(), #length of region e.g. chr1 in human
	$frag->dnafrag_start(), #start of alignment
	$frag->dnafrag_end() #end of alignment
	);
	};

	my @data = (
	$source->length(),
	0, #misMatches
	0, #repMatches
	0, #nCount
	0, #qNumInsert
	0, #qBaseInsert
	0, #tNumInsert
	0, #tBaseInsert
	( $source->dnafrag_strand() ) ? '+' : '-', #strand,
	$create_elements->($source), #query info
	$create_elements->($target), #target info
	1, # number of blocks
	$source->length(), #block sizes
	$source->dnafrag_start(), #query start blocks
	$target->dnafrag_start() #target start blocks
	);

	print $FH join(q{ }, @data), "\n";
	}

	return;
	}

	sub _exit {
	my ($msg, $status, $verbose) = @_;
	print STDERR $msg, "\n" if defined $msg;
	pod2usage( -exitstatus => $status, -verbose => $verbose );
	}

	__END__
	=pod

	=head1 NAME

	list_synteny.pl

	=head1 SYNOPSIS

	./list_synteny.pl -species human -species mouse -ensembl -database multi [see opts] [ --help \| --man ]

	=head1 DESCRIPTION

	Returns a PSL to screen or to a specified file location

	=head1 OPTIONS

	=over 8

	=item B<--species>

	Accepts 2 invocations to specify the species to look for synteny between

	=item B<--ensembl>

	Specify to connect to the public ensembl database

	=item B<--ensemblgenomes>

	Specify to connect to the public ensembl genomes database

	=item B<--database>

	Specify the name of the database to connect to. If going to ensembl
	use multi otherwise if connecting to ensembl genomes use one like
	plants, metazoa, protists, bacteria or fungi (not all DBs have synteny
	projections though).

	=item B<--version>

	Optional integer which can be used to force the databases we load. Useful if
	your API does not match the DB you are connecting to.

	=item B<--file>

	Location of the file to write to. If not specified will write to STDOUT

	=back

	=head1 VERSION

	1

	=head1 LICENSE

	Copyright (c) 1999-2010 The European Bioinformatics Institute and
	Genome Research Limited. All rights reserved.

	This software is distributed under a modified Apache license.
	For license details, please see

	http://www.ensembl.org/info/about/code_licence.html

	=cut