Skip to content

Instantly share code, notes, and snippets.

@willmclaren
Created January 23, 2019 15:29
Show Gist options
  • Save willmclaren/95ffc7a6479a9dc9eb937fc80754b6cf to your computer and use it in GitHub Desktop.
Save willmclaren/95ffc7a6479a9dc9eb937fc80754b6cf to your computer and use it in GitHub Desktop.
VCF to JSON starter code
use strict;
use Getopt::Long;
use FileHandle;
our ($CAN_USE_PERLIO_GZIP, $CAN_USE_GZIP, $CAN_USE_IO_UNCOMPRESS);
BEGIN {
# check PerlIO::gzip
if (eval q{ require PerlIO::gzip; 1 }) {
$CAN_USE_PERLIO_GZIP = 1;
}
# check gzip
if (`which gzip` =~ /\/gzip/) {
$CAN_USE_GZIP = 1;
}
# check IO::Uncompress::Gunzip
if(eval q{ use IO::Uncompress::Gunzip qw($GunzipError); 1}) {
$CAN_USE_IO_UNCOMPRESS = 1;
}
}
our @VARIANT_FIELDS = qw(
CLIN_SIG SOMATIC PHENO PUBMED
AF
MAX_AF MAX_AF_POPS
AFR_AF AMR_AF EAS_AF EUR_AF SAS_AF
AA_AF EA_AF
gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF
ExAC_AF ExAC_Adj_AF ExAC_AFR_AF ExAC_AMR_AF ExAC_EAS_AF ExAC_FIN_AF ExAC_NFE_AF ExAC_OTH_AF ExAC_SAS_AF
);
# configure from command line opts
my $config = configure(scalar @ARGV);
# run the main sub routine
main($config);
sub configure {
my $args = shift;
# set defaults
my $config = {
start => 1,
limit => 1e12,
output_file => 'stdout',
vcf_info_field => 'CSQ',
delimiter => "\t",
};
GetOptions(
$config,
'help|h', # displays help message
'test=i', # test run on n lines
'delimiter|d=s', # delimiter
'input_file|i=s', # input file
'output_file|o=s', # output file
'gz', # force read as gzipped
'vcf_info_field=s', # VCF INFO field name to parse
) or die("ERROR: Failed to parse command-line flags\n");
# print usage message if requested or no args supplied
if($config->{help} || !$args) {
&usage;
exit(0);
}
return $config;
}
sub main {
my $config = shift;
# input
my $in_fh;
if(my $input_file = $config->{input_file}) {
# check defined input file exists
die("ERROR: Could not find input file $input_file\n") unless -e $input_file;
if($input_file =~ /\.gz$/ || -B $input_file || $config->{gz}) {
$in_fh = get_compressed_filehandle($input_file, 1);
}
else {
$in_fh = FileHandle->new();
$in_fh->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
}
}
else {
$in_fh = 'STDIN';
}
# output
my $out_fh = FileHandle->new();
if($config->{output_file} =~ /stdout/i) {
$out_fh = *STDOUT;
}
else {
$out_fh->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ".$config->{output_file}."\n");
}
my (@raw_headers, @headers, $line_number);
my $count = 0;
my $first = 1;
my $vcf_info_field = $config->{vcf_info_field};
while(<$in_fh>) {
chomp;
$config->{line_number}++;
# header line?
if(/^\#/ || $first) {
push @raw_headers, $_;
$first = 0;
# print $_;
}
else {
$line_number++;
last if $config->{test} && $line_number > $config->{test};
# parse headers before processing input
if(!(scalar @headers)) {
die("ERROR: No headers found in input file\n") unless scalar @raw_headers;
# write headers to output
unless($config->{count} || $config->{list} || $config->{no_header}) {
print $out_fh join("\n", @raw_headers);
print $out_fh "\n";
}
# parse into data structures
parse_headers($config, \@raw_headers);
@headers = @{$config->{headers} || $config->{col_headers}};
$config->{allowed_fields}->{$_} = 1 for (@{$config->{headers}}, @{$config->{col_headers}});
}
my $line = $_;
my (@data, @chunks);
# get main data
my $main_data = parse_line($line, $config->{col_headers}, $config->{delimiter});
# get info fields
foreach my $info_field(split /\;/, (split /\s+/, $line)[7]) {
my ($field, $value) = split /\=/, $info_field;
$main_data->{$field} = $value;
}
# get CSQ stuff
while($line =~ m/($vcf_info_field)\=(.+?)(\;|$|\s)/g) {
push @chunks, split('\,', $2);
# foreach my $chunk(@chunks) {
# $parsed = parse_line($chunk, \@headers, '\|');
# }
$main_data->{$vcf_info_field} = [map {parse_line($_, \@headers, '\|')} @chunks];
}
use Data::Dumper;
print Dumper $main_data;
last if $count >= $config->{limit} + $config->{start} - 1;
}
}
}
sub parse_headers {
my $config = shift;
my $raw_headers = shift;
foreach my $raw_header(@$raw_headers) {
# remove and count hash characters
my $hash_count = $raw_header =~ s/\#//g;
# field definition (VCF)
if($hash_count == 2) {
my $vcf_info_field = $config->{vcf_info_field};
if($raw_header =~ /INFO\=\<ID\=($vcf_info_field)/) {
$raw_header =~ m/Format\: (.+?)\"/;
$config->{headers} = [split '\|', $1];
}
elsif($raw_header =~ /INFO\=\<ID\=(.+?)\,/) {
$config->{allowed_fields}->{$1} = 1;
}
elsif($raw_header =~ m/ (.+?) \:/) {
$config->{allowed_fields}->{$1} = 1;
}
}
# column headers
else {
$config->{col_headers} = [split $config->{delimiter}, $raw_header];
}
}
}
sub parse_line {
my $line = shift;
my $headers = shift;
my $delimiter = shift;
my $main_data = shift;
my %data = %{$main_data || {}};
chomp $line;
my @split = split($delimiter, $line);
$data{$headers->[$_]} = $split[$_] for 0..$#split;
if(defined($data{Extra})) {
foreach my $chunk(split /\;/, $data{Extra}) {
my ($key, $value) = split /\=/, $chunk;
$data{$key} = $value;
}
}
# clean
foreach my $key(keys %data) {
$data{$key} = undef if
$data{$key} eq '' || ($key ne 'Allele' && $data{$key} eq '-');
}
# set missing fields to undef
for(@$headers) {
$data{$_} = undef unless exists($data{$_});
}
return \%data;
}
sub get_compressed_filehandle {
my ($file, $multi) = @_;
die("ERROR: No file given\n") unless $file;
die("ERROR: File $file does not exist\n") unless -e $file;
die("ERROR: File $file does not look like a binary file\n") unless -B $file;
if($CAN_USE_PERLIO_GZIP && !$multi) {
open my $fh, "<:gzip", $file or die("ERROR: $!");
return $fh;
}
elsif($CAN_USE_GZIP) {
open my $fh, "gzip -dc $file |" or die("ERROR: $!");
return $fh;
}
elsif($CAN_USE_IO_UNCOMPRESS) {
my $fh;
eval q{ $fh = IO::Uncompress::Gunzip->new($file, MultiStream => $multi) or die("ERROR: $GunzipError"); };
die($@) if $@;
return $fh;
}
else {
die("Cannot read from compressed or binary file");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment