Created
January 23, 2019 15:29
-
-
Save willmclaren/95ffc7a6479a9dc9eb937fc80754b6cf to your computer and use it in GitHub Desktop.
VCF to JSON starter code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict; | |
use Getopt::Long; | |
use FileHandle; | |
our ($CAN_USE_PERLIO_GZIP, $CAN_USE_GZIP, $CAN_USE_IO_UNCOMPRESS); | |
BEGIN { | |
# check PerlIO::gzip | |
if (eval q{ require PerlIO::gzip; 1 }) { | |
$CAN_USE_PERLIO_GZIP = 1; | |
} | |
# check gzip | |
if (`which gzip` =~ /\/gzip/) { | |
$CAN_USE_GZIP = 1; | |
} | |
# check IO::Uncompress::Gunzip | |
if(eval q{ use IO::Uncompress::Gunzip qw($GunzipError); 1}) { | |
$CAN_USE_IO_UNCOMPRESS = 1; | |
} | |
} | |
our @VARIANT_FIELDS = qw( | |
CLIN_SIG SOMATIC PHENO PUBMED | |
AF | |
MAX_AF MAX_AF_POPS | |
AFR_AF AMR_AF EAS_AF EUR_AF SAS_AF | |
AA_AF EA_AF | |
gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF | |
ExAC_AF ExAC_Adj_AF ExAC_AFR_AF ExAC_AMR_AF ExAC_EAS_AF ExAC_FIN_AF ExAC_NFE_AF ExAC_OTH_AF ExAC_SAS_AF | |
); | |
# configure from command line opts | |
my $config = configure(scalar @ARGV); | |
# run the main sub routine | |
main($config); | |
sub configure { | |
my $args = shift; | |
# set defaults | |
my $config = { | |
start => 1, | |
limit => 1e12, | |
output_file => 'stdout', | |
vcf_info_field => 'CSQ', | |
delimiter => "\t", | |
}; | |
GetOptions( | |
$config, | |
'help|h', # displays help message | |
'test=i', # test run on n lines | |
'delimiter|d=s', # delimiter | |
'input_file|i=s', # input file | |
'output_file|o=s', # output file | |
'gz', # force read as gzipped | |
'vcf_info_field=s', # VCF INFO field name to parse | |
) or die("ERROR: Failed to parse command-line flags\n"); | |
# print usage message if requested or no args supplied | |
if($config->{help} || !$args) { | |
&usage; | |
exit(0); | |
} | |
return $config; | |
} | |
sub main { | |
my $config = shift; | |
# input | |
my $in_fh; | |
if(my $input_file = $config->{input_file}) { | |
# check defined input file exists | |
die("ERROR: Could not find input file $input_file\n") unless -e $input_file; | |
if($input_file =~ /\.gz$/ || -B $input_file || $config->{gz}) { | |
$in_fh = get_compressed_filehandle($input_file, 1); | |
} | |
else { | |
$in_fh = FileHandle->new(); | |
$in_fh->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); | |
} | |
} | |
else { | |
$in_fh = 'STDIN'; | |
} | |
# output | |
my $out_fh = FileHandle->new(); | |
if($config->{output_file} =~ /stdout/i) { | |
$out_fh = *STDOUT; | |
} | |
else { | |
$out_fh->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ".$config->{output_file}."\n"); | |
} | |
my (@raw_headers, @headers, $line_number); | |
my $count = 0; | |
my $first = 1; | |
my $vcf_info_field = $config->{vcf_info_field}; | |
while(<$in_fh>) { | |
chomp; | |
$config->{line_number}++; | |
# header line? | |
if(/^\#/ || $first) { | |
push @raw_headers, $_; | |
$first = 0; | |
# print $_; | |
} | |
else { | |
$line_number++; | |
last if $config->{test} && $line_number > $config->{test}; | |
# parse headers before processing input | |
if(!(scalar @headers)) { | |
die("ERROR: No headers found in input file\n") unless scalar @raw_headers; | |
# write headers to output | |
unless($config->{count} || $config->{list} || $config->{no_header}) { | |
print $out_fh join("\n", @raw_headers); | |
print $out_fh "\n"; | |
} | |
# parse into data structures | |
parse_headers($config, \@raw_headers); | |
@headers = @{$config->{headers} || $config->{col_headers}}; | |
$config->{allowed_fields}->{$_} = 1 for (@{$config->{headers}}, @{$config->{col_headers}}); | |
} | |
my $line = $_; | |
my (@data, @chunks); | |
# get main data | |
my $main_data = parse_line($line, $config->{col_headers}, $config->{delimiter}); | |
# get info fields | |
foreach my $info_field(split /\;/, (split /\s+/, $line)[7]) { | |
my ($field, $value) = split /\=/, $info_field; | |
$main_data->{$field} = $value; | |
} | |
# get CSQ stuff | |
while($line =~ m/($vcf_info_field)\=(.+?)(\;|$|\s)/g) { | |
push @chunks, split('\,', $2); | |
# foreach my $chunk(@chunks) { | |
# $parsed = parse_line($chunk, \@headers, '\|'); | |
# } | |
$main_data->{$vcf_info_field} = [map {parse_line($_, \@headers, '\|')} @chunks]; | |
} | |
use Data::Dumper; | |
print Dumper $main_data; | |
last if $count >= $config->{limit} + $config->{start} - 1; | |
} | |
} | |
} | |
sub parse_headers { | |
my $config = shift; | |
my $raw_headers = shift; | |
foreach my $raw_header(@$raw_headers) { | |
# remove and count hash characters | |
my $hash_count = $raw_header =~ s/\#//g; | |
# field definition (VCF) | |
if($hash_count == 2) { | |
my $vcf_info_field = $config->{vcf_info_field}; | |
if($raw_header =~ /INFO\=\<ID\=($vcf_info_field)/) { | |
$raw_header =~ m/Format\: (.+?)\"/; | |
$config->{headers} = [split '\|', $1]; | |
} | |
elsif($raw_header =~ /INFO\=\<ID\=(.+?)\,/) { | |
$config->{allowed_fields}->{$1} = 1; | |
} | |
elsif($raw_header =~ m/ (.+?) \:/) { | |
$config->{allowed_fields}->{$1} = 1; | |
} | |
} | |
# column headers | |
else { | |
$config->{col_headers} = [split $config->{delimiter}, $raw_header]; | |
} | |
} | |
} | |
sub parse_line { | |
my $line = shift; | |
my $headers = shift; | |
my $delimiter = shift; | |
my $main_data = shift; | |
my %data = %{$main_data || {}}; | |
chomp $line; | |
my @split = split($delimiter, $line); | |
$data{$headers->[$_]} = $split[$_] for 0..$#split; | |
if(defined($data{Extra})) { | |
foreach my $chunk(split /\;/, $data{Extra}) { | |
my ($key, $value) = split /\=/, $chunk; | |
$data{$key} = $value; | |
} | |
} | |
# clean | |
foreach my $key(keys %data) { | |
$data{$key} = undef if | |
$data{$key} eq '' || ($key ne 'Allele' && $data{$key} eq '-'); | |
} | |
# set missing fields to undef | |
for(@$headers) { | |
$data{$_} = undef unless exists($data{$_}); | |
} | |
return \%data; | |
} | |
sub get_compressed_filehandle { | |
my ($file, $multi) = @_; | |
die("ERROR: No file given\n") unless $file; | |
die("ERROR: File $file does not exist\n") unless -e $file; | |
die("ERROR: File $file does not look like a binary file\n") unless -B $file; | |
if($CAN_USE_PERLIO_GZIP && !$multi) { | |
open my $fh, "<:gzip", $file or die("ERROR: $!"); | |
return $fh; | |
} | |
elsif($CAN_USE_GZIP) { | |
open my $fh, "gzip -dc $file |" or die("ERROR: $!"); | |
return $fh; | |
} | |
elsif($CAN_USE_IO_UNCOMPRESS) { | |
my $fh; | |
eval q{ $fh = IO::Uncompress::Gunzip->new($file, MultiStream => $multi) or die("ERROR: $GunzipError"); }; | |
die($@) if $@; | |
return $fh; | |
} | |
else { | |
die("Cannot read from compressed or binary file"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment