Skip to content

Instantly share code, notes, and snippets.

@allenday
Last active February 4, 2020 03:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allenday/3520d7779d424d1648b630bc03bded54 to your computer and use it in GitHub Desktop.
Save allenday/3520d7779d424d1648b630bc03bded54 to your computer and use it in GitHub Desktop.
Load GFF v3 data to BigQuery
SELECT
seq_id,source,type,seq_start,seq_end,
ST_MAKELINE(ST_GEOGPOINT(LOG(seq_start)/LOG(10),-0.01),ST_GEOGPOINT(LOG(seq_end)/LOG(10),0.01)) AS range_log10,
score,strand,phase,attributes
FROM `project.dataset.tablename`
LIMIT 1000
[
{
"mode": "NULLABLE",
"name": "seq_id",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "source",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "type",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "seq_start",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "seq_end",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "score",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "strand",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "phase",
"type": "INTEGER"
},
{
"mode": "REPEATED",
"name": "attributes",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "tag",
"type": "STRING"
},
{
"mode": "REPEATED",
"name": "value",
"type": "STRING"
}
]
}
]
#!/usr/bin/perl
# gsutil cat gs://some/where.gff | grep -vE '^#' | perl ./gff_to_bigquery.pl | gsutil cp - gs://some/where.gff.json
use strict;
use JSON;
while ( my $line = <> ) {
chomp $line;
my ($seqid_,$source_,$type_,$start_,$end_,$score_,$strand_,$phase_,$attributes) = split /\t/, $line;
my $seqid = $seqid_ eq '.' ? "" : $seqid_;
my $source = $source_ eq '.' ? "" : $source_;
my $type = $type_ eq '.' ? "" : $type_;
my $start = $start_ eq '.' ? "" : int($start_);
my $end = $end_ eq '.' ? "" : int($end_);
my $score = $score_ eq '.' ? "" : $score_;
my $strand = $strand_ eq '+' ? 1 : $strand_ eq '-' ? -1 : "";
my $phase = $phase_ eq '.' ? "" : int($phase_);
my @attr = split ';', $attributes;
my @aout = ();
foreach my $a (@attr) {
my ( $k, $v ) = $a =~ m/^(.+?)=(.+?)$/;
my @v = split ',', $v;
my %attr = ();
#$attr{ $k } = \@v;
$attr{ tag } = $k;
$attr{ value } = \@v;
push @aout, \%attr;
}
print JSON::encode_json({
seq_id => $seqid,
source => $source,
type => $type,
seq_start => $start,
seq_end => $end,
score => $score,
strand => $strand,
phase => $phase,
attributes => \@aout,
}), "\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment