# ynx0/vcf2csv.py

## vcf2csv.py
#!/usr/bin/env python3

import vobject
import csv
import sys

from functools import reduce
from collections.abc import Iterable

from pathlib import Path


def flatten(l):
	"""
	Walk an arbitrarily nested iterable depth-first and lazily yield its leaves.

	str and bytes count as leaves, so they are never split into characters.
	An argument that is not iterable (or is itself a str/bytes) is yielded
	back unchanged, which lets callers pass scalars in directly.

	modified form of https://stackoverflow.com/a/2158532
	"""
	is_leaf = isinstance(l, (str, bytes)) or not isinstance(l, Iterable)
	if is_leaf:
		yield l
		return

	# each child gets the same leaf test one level down
	for child in l:
		yield from flatten(child)

# this script assumes many contacts in a single vcf file, not many files for a single contact

def read_vcf(vcf_path):
	"""
	Return the complete text of the vcf file at vcf_path.

	vcf_path may be a str or a Path. The extension check ignores case, so
	both "contacts.vcf" and "CONTACTS.VCF" are accepted.

	Raises ValueError when the extension is not .vcf. This is an explicit
	raise rather than an assert so the check still runs under `python -O`.
	"""
	vcf_path = Path(vcf_path)
	if vcf_path.suffix.lower() != '.vcf':
		raise ValueError(f"Error: tried to read a vcf file, but got one with extension {vcf_path.suffix}")
	# decode as UTF-8 regardless of the platform locale; the -sig variant also
	# drops a leading byte-order mark so it does not end up in the returned text
	with open(vcf_path, "r", encoding="utf-8-sig") as f:
		return f.read()


def cards_from_text(vcf_text):
	"""
	Parse vcf_text with vobject and return every component it produces as a list.

	validate=True is passed straight through to vobject.readComponents.
	"""
	parsed_components = vobject.readComponents(vcf_text, validate=True)
	# materialise the result so callers can iterate it more than once
	return list(parsed_components)


def card_to_row(vcf_card) -> dict:
	"""
	Turn one vcf card into a flat {attribute name: text} dict for csv.DictWriter.

	every vcf card stores its data in a value called 'contents'
	contents is a dict which stores attributes and their values
	each attribute can contain one or more entries in the form of a list
	collectively we call this list attribute_data, while each element is an attribute value

	For each attribute value only the first leaf of its (possibly nested)
	.value is kept; it is converted with str(), stripped, and non-breaking
	spaces are replaced by plain spaces. All values of one attribute are then
	joined with ', '. A value that flattens to nothing becomes '' instead of
	raising IndexError.
	"""

	# defined once, outside the loop, instead of being rebuilt per attribute
	def process_attr(attr_val):
		# removes any nesting if it exists, mainly for org which has an extra layer;
		# next() with a default avoids an IndexError on an empty value
		first_leaf = next(flatten(attr_val.value), '')
		return str(first_leaf).strip().replace('\xa0', ' ')

	return {
		attr_name: ', '.join(process_attr(attr_val) for attr_val in attr_data)
		for attr_name, attr_data in vcf_card.contents.items()
	}


def gen_fieldnames(vcards) -> list:
	"""
	Return the sorted union of the attribute names used by any of the cards.

	The result is the csv header: every column that at least one card needs,
	with no missing and no extraneous columns. An empty vcards yields [].
	"""
	# start from an empty set so that zero cards gives an empty header
	# instead of reduce() failing on an empty sequence
	attribute_names = set()
	for card in vcards:
		attribute_names.update(card.contents)
	return sorted(attribute_names)


def vcf2csv(vcf_path: Path):
	"""
	Convert the vcf file at vcf_path into "<stem of vcf_path>.csv".

	The output file is opened by bare name, so it is created relative to the
	current working directory. Despite the .csv extension the columns are
	separated by tabs. One row is written per card; the header is the union
	of attribute names over all cards.

	vcf_path may also be given as a plain str.
	"""
	# .stem below needs a Path; coercing here keeps str arguments working
	vcf_path = Path(vcf_path)
	vcards = cards_from_text(read_vcf(vcf_path))
	fieldnames = gen_fieldnames(vcards)
	# explicit encoding so non-ASCII contact data does not depend on the platform locale
	with open(f"{vcf_path.stem}.csv", "w", newline='', encoding="utf-8") as f:
		writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
		writer.writeheader()
		writer.writerows(card_to_row(card) for card in vcards)

def main():
	"""
	Command-line entry point.

	Without a positional argument, prints the usage text and exits with
	status 1. Otherwise the first argument is treated as the vcf path and
	handed to vcf2csv; any further arguments are ignored.
	"""
	cli_args = sys.argv[1:]
	if not cli_args:
		print("Usage: ./vcf2csv.py <path/to/vcf file.vcf>")
		print("Outputs: <original file name>.csv")
		sys.exit(1)

	vcf2csv(Path(cli_args[0]))


# Run the conversion only when this file is executed as a script, not when it
# is imported. The guard body is just the call to main(): a whitespace-flattened
# second copy of the module used to follow it here and made the file unparsable.
if __name__ == '__main__':
	main()