-
-
Save suryasaha/8420489 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# gff3chado_update.pl
#
# Bulk-update featureloc.fmin/fmax in a chado (PostgreSQL) database from a
# GFF3 file, matching each feature's GFF3 ID attribute against
# feature.uniquename.  Rows are streamed into a temp table via COPY, then a
# single UPDATE ... FROM applies all location changes.
use strict;
use warnings;
use autodie qw/open close/;
use IO::File;    # was used below but never loaded in the original
use DBI;
use Getopt::Long;
use Bio::GFF3::LowLevel qw/gff3_parse_feature/;

my ( $dsn, $user, $pass );
GetOptions(
    'dsn=s'        => \$dsn,
    'u|user=s'     => \$user,
    'p|password=s' => \$pass
);
die "no input file given\n" if !$ARGV[0];

my $dbh = DBI->connect( $dsn, $user, $pass,
    { PrintError => 1, RaiseError => 1 } );

# Staging table for the new coordinates.  "end" is a reserved word in
# PostgreSQL so it must be double-quoted; the original SQL was also missing
# the comma after the start column.
$dbh->do(
    <<'TEMP'
CREATE TEMP TABLE temp_feature_update (
   id varchar(56) NOT NULL,
   start integer NOT NULL,
   "end" integer NOT NULL
)
TEMP
);

my $input = IO::File->new( $ARGV[0], 'r' );

# Stream rows into the staging table.  The table name must match the one
# created above (the original copied into a non-existent "temp_feature"),
# and each COPY text-format data row must be newline-terminated.
$dbh->do(qq{COPY temp_feature_update(id,start,"end") FROM STDIN});
while ( my $line = $input->getline ) {
    next if $line =~ /^#/;    # skip GFF3 pragmas and comment lines
    my $feature_hashref = gff3_parse_feature($line);
    my $id = $feature_hashref->{attributes}->{ID}->[0];
    next if !defined $id;     # features without an ID cannot be matched
    $dbh->pg_putcopydata(
        "$id\t$feature_hashref->{start}\t$feature_hashref->{end}\n");
}
$dbh->pg_putcopyend;

# Apply the updates in one statement.  In PostgreSQL's UPDATE ... FROM the
# target table must NOT be repeated in the FROM list (doing so creates an
# unconstrained self-join), and SET columns must not be table-qualified.
$dbh->do(
    <<'UPDATEONLY'
UPDATE featureloc
   SET fmin = tmpf.start,
       fmax = tmpf."end"
  FROM feature, temp_feature_update tmpf
 WHERE feature.feature_id = featureloc.feature_id
   AND feature.uniquename = tmpf.id
UPDATEONLY
);

$dbh->disconnect;
$input->close;
=head1 NAME | |
gff3chado_update.pl - Update feature location of existing feature from gff3 file | |
=head1 SYNOPSIS | |
perl gff3chado_update.pl (--dsn=<DBI dsn>) (-u|--user=<username>) (-p|--password=<password> ) (<gff3 file>) | |
=head1 OPTIONS | |
--dsn=DBI dsn Database dsn. | |
-u <username>, --user=<username> Database user name. | |
-p <password>, --password=<password> Database password. | |
=head1 DESCRIPTION | |
The script reads a gff3 file and then updates the feature locations of existing features in chado | |
database. It is intended to be run after a bulk gff3 upload, or when the features are expected
to be present in the database. It also expects the following behaviour ... | |
=over | |
=item | |
All the features in gff3 should have an ID attribute. | |
=item | |
The value of ID should be stored in uniquename column of feature table. | |
=back | |
=head2 Further improvement | |
=over | |
=item | |
It updates the feature locations of all matching features. It could be improved
by updating only those features whose locations have actually changed.
=item | |
Run the entire loading in single transaction. | |
=item | |
For very large number of updates, the temp table could have an index on id column. | |
=item | |
The foreign key constraints on feature and featureloc tables might be turned | |
off for faster update. Having an index on uniquename column might help because it | |
is not clear whether the default chado indices would be sufficient. | |
=back | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Only does updates of featureloc.fmin and featureloc.fmax based on feature_id