willmclaren/vcf2json.pl

## vcf2json.pl
use strict;
use Getopt::Long;
use FileHandle;

our ($CAN_USE_PERLIO_GZIP, $CAN_USE_GZIP, $CAN_USE_IO_UNCOMPRESS);

BEGIN {

  # check PerlIO::gzip
  if (eval q{ require PerlIO::gzip; 1 }) {
    $CAN_USE_PERLIO_GZIP = 1;
  }

  # check gzip
  if (`which gzip` =~ /\/gzip/) {
    $CAN_USE_GZIP = 1;
  }

  # check IO::Uncompress::Gunzip
  if(eval q{ use IO::Uncompress::Gunzip qw($GunzipError); 1}) {
    $CAN_USE_IO_UNCOMPRESS = 1;
  }
}

our @VARIANT_FIELDS = qw(
  CLIN_SIG SOMATIC PHENO PUBMED
  AF
  MAX_AF MAX_AF_POPS
  AFR_AF AMR_AF EAS_AF EUR_AF SAS_AF
  AA_AF EA_AF
  gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF
  ExAC_AF ExAC_Adj_AF ExAC_AFR_AF ExAC_AMR_AF ExAC_EAS_AF ExAC_FIN_AF ExAC_NFE_AF ExAC_OTH_AF ExAC_SAS_AF
);

# configure from command line opts
my $config = configure(scalar @ARGV);

# run the main sub routine
main($config);

sub configure {
  my $args = shift;

  # set defaults
  my $config = {
    start => 1,
    limit => 1e12,
    output_file => 'stdout',
    vcf_info_field => 'CSQ',
    delimiter => "\t",
  };

  GetOptions(
    $config,
    'help|h',                  # displays help message

    'test=i',                  # test run on n lines
    'delimiter|d=s',           # delimiter

    'input_file|i=s',          # input file
    'output_file|o=s',         # output file
    'gz',                      # force read as gzipped
    'vcf_info_field=s',        # VCF INFO field name to parse
  ) or die("ERROR: Failed to parse command-line flags\n");

  # print usage message if requested or no args supplied
  if($config->{help} || !$args) {
    &usage;
    exit(0);
  }

  return $config;
}

sub main {
  my $config = shift;

  # input
  my $in_fh;

  if(my $input_file = $config->{input_file}) {
    # check defined input file exists
    die("ERROR: Could not find input file $input_file\n") unless -e $input_file;

    if($input_file =~ /\.gz$/ || -B $input_file || $config->{gz}) {
      $in_fh = get_compressed_filehandle($input_file, 1);
    }
    else {
      $in_fh = FileHandle->new();
      $in_fh->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
    }
  }
  else {
    $in_fh = 'STDIN';
  }

  # output
  my $out_fh = FileHandle->new();

  if($config->{output_file} =~ /stdout/i) {
    $out_fh = *STDOUT;
  }
  else {
    $out_fh->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ".$config->{output_file}."\n");
  }

  my (@raw_headers, @headers, $line_number);
  my $count = 0;
  my $first = 1;

  my $vcf_info_field = $config->{vcf_info_field};

  while(<$in_fh>) {
    chomp;

    $config->{line_number}++;

    # header line?
    if(/^\#/ || $first) {
      push @raw_headers, $_;
      $first = 0;
      # print $_;
    }
    else {
      $line_number++;
      last if $config->{test} && $line_number > $config->{test};

      # parse headers before processing input
      if(!(scalar @headers)) {
        die("ERROR: No headers found in input file\n") unless scalar @raw_headers;

        # write headers to output
        unless($config->{count} || $config->{list} || $config->{no_header}) {
          print $out_fh join("\n", @raw_headers);
          print $out_fh "\n";
        }

        # parse into data structures
        parse_headers($config, \@raw_headers);
        @headers = @{$config->{headers} || $config->{col_headers}};
        $config->{allowed_fields}->{$_} = 1 for (@{$config->{headers}}, @{$config->{col_headers}});
      }

      my $line = $_;
      my (@data, @chunks);

      # get main data
      my $main_data = parse_line($line, $config->{col_headers}, $config->{delimiter});

      # get info fields
      foreach my $info_field(split /\;/, (split /\s+/, $line)[7]) {
        my ($field, $value) = split /\=/, $info_field;
        $main_data->{$field} = $value;
      }

      # get CSQ stuff
      while($line =~ m/($vcf_info_field)\=(.+?)(\;|$|\s)/g) {
        push @chunks, split('\,', $2);

        # foreach my $chunk(@chunks) {
        #   $parsed = parse_line($chunk, \@headers, '\|');
        # }

        $main_data->{$vcf_info_field} = [map {parse_line($_, \@headers, '\|')} @chunks];
      }

      use Data::Dumper;
      print Dumper $main_data;

      last if $count >= $config->{limit} + $config->{start} - 1;
    }
  }
}

sub parse_headers {
  my $config = shift;
  my $raw_headers = shift;

  foreach my $raw_header(@$raw_headers) {

    # remove and count hash characters
    my $hash_count = $raw_header =~ s/\#//g;

    # field definition (VCF)
    if($hash_count == 2) {
      my $vcf_info_field = $config->{vcf_info_field};

      if($raw_header =~ /INFO\=\<ID\=($vcf_info_field)/) {
        $raw_header =~ m/Format\: (.+?)\"/;
        $config->{headers} = [split '\|', $1];
      }
      elsif($raw_header =~ /INFO\=\<ID\=(.+?)\,/) {
        $config->{allowed_fields}->{$1} = 1;
      }
      elsif($raw_header =~ m/ (.+?) \:/) {
        $config->{allowed_fields}->{$1} = 1;
      }
    }

    # column headers
    else {
      $config->{col_headers} = [split $config->{delimiter}, $raw_header];
    }
  }
}

sub parse_line {
  my $line = shift;
  my $headers = shift;
  my $delimiter = shift;
  my $main_data = shift;

  my %data = %{$main_data || {}};

  chomp $line;
  my @split = split($delimiter, $line);

  $data{$headers->[$_]} = $split[$_] for 0..$#split;

  if(defined($data{Extra})) {
    foreach my $chunk(split /\;/, $data{Extra}) {
      my ($key, $value) = split /\=/, $chunk;
      $data{$key} = $value;
    }
  }

  # clean
  foreach my $key(keys %data) {
    $data{$key} = undef if
      $data{$key} eq '' || ($key ne 'Allele' && $data{$key} eq '-');
  }

  # set missing fields to undef
  for(@$headers) {
    $data{$_} = undef unless exists($data{$_});
  }

  return \%data;
}

sub get_compressed_filehandle {
  my ($file, $multi) = @_;

  die("ERROR: No file given\n") unless $file;
  die("ERROR: File $file does not exist\n") unless -e $file;
  die("ERROR: File $file does not look like a binary file\n") unless -B $file;

  if($CAN_USE_PERLIO_GZIP && !$multi) {
    open my $fh, "<:gzip", $file or die("ERROR: $!");
    return $fh;
  }
  elsif($CAN_USE_GZIP) {
    open my $fh, "gzip -dc $file |" or die("ERROR: $!");
    return $fh;
  }
  elsif($CAN_USE_IO_UNCOMPRESS) {
    my $fh;
    eval q{ $fh = IO::Uncompress::Gunzip->new($file, MultiStream => $multi) or die("ERROR: $GunzipError"); };
    die($@) if $@;
    return $fh;
  }
  else {
    die("Cannot read from compressed or binary file");
  }
}
	use strict;
	use Getopt::Long;
	use FileHandle;

	our ($CAN_USE_PERLIO_GZIP, $CAN_USE_GZIP, $CAN_USE_IO_UNCOMPRESS);

	BEGIN {

	# check PerlIO::gzip
	if (eval q{ require PerlIO::gzip; 1 }) {
	$CAN_USE_PERLIO_GZIP = 1;
	}

	# check gzip
	if (`which gzip` =~ /\/gzip/) {
	$CAN_USE_GZIP = 1;
	}

	# check IO::Uncompress::Gunzip
	if(eval q{ use IO::Uncompress::Gunzip qw($GunzipError); 1}) {
	$CAN_USE_IO_UNCOMPRESS = 1;
	}
	}

	our @VARIANT_FIELDS = qw(
	CLIN_SIG SOMATIC PHENO PUBMED
	AF
	MAX_AF MAX_AF_POPS
	AFR_AF AMR_AF EAS_AF EUR_AF SAS_AF
	AA_AF EA_AF
	gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF
	ExAC_AF ExAC_Adj_AF ExAC_AFR_AF ExAC_AMR_AF ExAC_EAS_AF ExAC_FIN_AF ExAC_NFE_AF ExAC_OTH_AF ExAC_SAS_AF
	);

	# configure from command line opts
	my $config = configure(scalar @ARGV);

	# run the main sub routine
	main($config);

	sub configure {
	my $args = shift;

	# set defaults
	my $config = {
	start => 1,
	limit => 1e12,
	output_file => 'stdout',
	vcf_info_field => 'CSQ',
	delimiter => "\t",
	};

	GetOptions(
	$config,
	'help\|h', # displays help message

	'test=i', # test run on n lines
	'delimiter\|d=s', # delimiter

	'input_file\|i=s', # input file
	'output_file\|o=s', # output file
	'gz', # force read as gzipped
	'vcf_info_field=s', # VCF INFO field name to parse
	) or die("ERROR: Failed to parse command-line flags\n");

	# print usage message if requested or no args supplied
	if($config->{help} \|\| !$args) {
	&usage;
	exit(0);
	}

	return $config;
	}

	sub main {
	my $config = shift;

	# input
	my $in_fh;

	if(my $input_file = $config->{input_file}) {
	# check defined input file exists
	die("ERROR: Could not find input file $input_file\n") unless -e $input_file;

	if($input_file =~ /\.gz$/ \|\| -B $input_file \|\| $config->{gz}) {
	$in_fh = get_compressed_filehandle($input_file, 1);
	}
	else {
	$in_fh = FileHandle->new();
	$in_fh->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
	}
	}
	else {
	$in_fh = 'STDIN';
	}

	# output
	my $out_fh = FileHandle->new();

	if($config->{output_file} =~ /stdout/i) {
	$out_fh = *STDOUT;
	}
	else {
	$out_fh->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ".$config->{output_file}."\n");
	}

	my (@raw_headers, @headers, $line_number);
	my $count = 0;
	my $first = 1;

	my $vcf_info_field = $config->{vcf_info_field};

	while(<$in_fh>) {
	chomp;

	$config->{line_number}++;

	# header line?
	if(/^\#/ \|\| $first) {
	push @raw_headers, $_;
	$first = 0;
	# print $_;
	}
	else {
	$line_number++;
	last if $config->{test} && $line_number > $config->{test};

	# parse headers before processing input
	if(!(scalar @headers)) {
	die("ERROR: No headers found in input file\n") unless scalar @raw_headers;

	# write headers to output
	unless($config->{count} \|\| $config->{list} \|\| $config->{no_header}) {
	print $out_fh join("\n", @raw_headers);
	print $out_fh "\n";
	}

	# parse into data structures
	parse_headers($config, \@raw_headers);
	@headers = @{$config->{headers} \|\| $config->{col_headers}};
	$config->{allowed_fields}->{$_} = 1 for (@{$config->{headers}}, @{$config->{col_headers}});
	}

	my $line = $_;
	my (@data, @chunks);

	# get main data
	my $main_data = parse_line($line, $config->{col_headers}, $config->{delimiter});

	# get info fields
	foreach my $info_field(split /\;/, (split /\s+/, $line)[7]) {
	my ($field, $value) = split /\=/, $info_field;
	$main_data->{$field} = $value;
	}

	# get CSQ stuff
	while($line =~ m/($vcf_info_field)\=(.+?)(\;\|$\|\s)/g) {
	push @chunks, split('\,', $2);

	# foreach my $chunk(@chunks) {
	# $parsed = parse_line($chunk, \@headers, '\\|');
	# }

	$main_data->{$vcf_info_field} = [map {parse_line($_, \@headers, '\\|')} @chunks];
	}

	use Data::Dumper;
	print Dumper $main_data;

	last if $count >= $config->{limit} + $config->{start} - 1;
	}
	}
	}

	sub parse_headers {
	my $config = shift;
	my $raw_headers = shift;

	foreach my $raw_header(@$raw_headers) {

	# remove and count hash characters
	my $hash_count = $raw_header =~ s/\#//g;

	# field definition (VCF)
	if($hash_count == 2) {
	my $vcf_info_field = $config->{vcf_info_field};

	if($raw_header =~ /INFO\=\<ID\=($vcf_info_field)/) {
	$raw_header =~ m/Format\: (.+?)\"/;
	$config->{headers} = [split '\\|', $1];
	}
	elsif($raw_header =~ /INFO\=\<ID\=(.+?)\,/) {
	$config->{allowed_fields}->{$1} = 1;
	}
	elsif($raw_header =~ m/ (.+?) \:/) {
	$config->{allowed_fields}->{$1} = 1;
	}
	}

	# column headers
	else {
	$config->{col_headers} = [split $config->{delimiter}, $raw_header];
	}
	}
	}

	sub parse_line {
	my $line = shift;
	my $headers = shift;
	my $delimiter = shift;
	my $main_data = shift;

	my %data = %{$main_data \|\| {}};

	chomp $line;
	my @split = split($delimiter, $line);

	$data{$headers->[$_]} = $split[$_] for 0..$#split;

	if(defined($data{Extra})) {
	foreach my $chunk(split /\;/, $data{Extra}) {
	my ($key, $value) = split /\=/, $chunk;
	$data{$key} = $value;
	}
	}

	# clean
	foreach my $key(keys %data) {
	$data{$key} = undef if
	$data{$key} eq '' \|\| ($key ne 'Allele' && $data{$key} eq '-');
	}

	# set missing fields to undef
	for(@$headers) {
	$data{$_} = undef unless exists($data{$_});
	}

	return \%data;
	}

	sub get_compressed_filehandle {
	my ($file, $multi) = @_;

	die("ERROR: No file given\n") unless $file;
	die("ERROR: File $file does not exist\n") unless -e $file;
	die("ERROR: File $file does not look like a binary file\n") unless -B $file;

	if($CAN_USE_PERLIO_GZIP && !$multi) {
	open my $fh, "<:gzip", $file or die("ERROR: $!");
	return $fh;
	}
	elsif($CAN_USE_GZIP) {
	open my $fh, "gzip -dc $file \|" or die("ERROR: $!");
	return $fh;
	}
	elsif($CAN_USE_IO_UNCOMPRESS) {
	my $fh;
	eval q{ $fh = IO::Uncompress::Gunzip->new($file, MultiStream => $multi) or die("ERROR: $GunzipError"); };
	die($@) if $@;
	return $fh;
	}
	else {
	die("Cannot read from compressed or binary file");
	}
	}