Skip to content

Instantly share code, notes, and snippets.

@theosanderson
Created April 26, 2024 22:46
Show Gist options
  • Save theosanderson/2ba5429aeeec9cfd5b4f2e0eb5c7c26a to your computer and use it in GitHub Desktop.
Save theosanderson/2ba5429aeeec9cfd5b4f2e0eb5c7c26a to your computer and use it in GitHub Desktop.
Translating GISAID named files to EPI_ISL files
# Example of how to run: tar -xJOf sequences_fasta_2024_04_25.tar.xz sequences.fasta | python translate.py | pv -l | zstd > seqs.fa.zst
import sys
import tarfile
import pandas as pd
def load_virus_to_accession_dict(tar_path):
    """Load the virus-name -> accession-ID mapping from a .tar.xz archive.

    The first regular file in the archive whose name ends in ``.tsv`` is
    parsed as tab-separated text; only the 'Virus name' and 'Accession ID'
    columns are read.

    Args:
        tar_path: Path to the xz-compressed tar archive holding the .tsv.

    Returns:
        A dict mapping each 'Virus name' string to its 'Accession ID'
        string. Rows missing either value are skipped, so callers that fall
        back to the original name on a miss never receive NaN. If a virus
        name appears on several rows, the last row wins.

    Raises:
        IndexError: If the archive contains no .tsv file (the same exception
            type the earlier list-indexing implementation raised, now with
            an explanatory message).
    """
    with tarfile.open(tar_path, 'r:xz') as tar:
        # Iterate lazily and stop at the first match rather than calling
        # getmembers(), which has to scan the whole compressed archive
        # before anything can be read.
        tsv_member = next(
            (m for m in tar if m.isfile() and m.name.endswith('.tsv')),
            None,
        )
        if tsv_member is None:
            raise IndexError(f'no .tsv file found in {tar_path!r}')
        with tar.extractfile(tsv_member) as file:
            # dtype=str keeps names/IDs as text even if a column happens to
            # look numeric, so lookups by FASTA header string still match.
            df = pd.read_csv(
                file,
                sep='\t',
                usecols=['Virus name', 'Accession ID'],
                dtype=str,
            )
    # Drop incomplete rows: a NaN accession would otherwise be emitted
    # verbatim as a header instead of falling back to the virus name.
    df = df.dropna()
    return df.set_index('Virus name')['Accession ID'].to_dict()
def process_fasta(virus_to_accession, input_stream=None, output_stream=None):
    """Rewrite FASTA headers using a name -> accession mapping.

    Each record is read from the input, its header is truncated at the first
    '|' and looked up in ``virus_to_accession``; the record is then written
    with the mapped header (or the truncated name itself when there is no
    mapping) followed by the sequence joined onto a single line.

    Args:
        virus_to_accession: Mapping from virus name to replacement header.
        input_stream: Iterable of text lines to read; defaults to sys.stdin.
        output_stream: Text stream to write to; defaults to sys.stdout.

    Notes:
        Lines that appear before the first '>' header do not belong to any
        record and are ignored. A record whose header name is empty is still
        written (with an empty header) rather than being silently dropped.
    """
    source = sys.stdin if input_stream is None else input_stream
    sink = sys.stdout if output_stream is None else output_stream

    def emit(name, chunks):
        # Write one record: translated header, then the one-line sequence.
        print(f'>{virus_to_accession.get(name, name)}', file=sink)
        print(''.join(chunks), file=sink)

    # None means "no header seen yet"; an empty string is a real (if odd)
    # record name, so the two cases must not share a sentinel.
    current_name = None
    sequence_data = []
    for line in source:
        line = line.strip()
        if line.startswith('>'):  # New sequence header
            if current_name is not None:
                emit(current_name, sequence_data)
            # Drop the '>' and everything from the first '|' onwards.
            current_name = line[1:].split('|')[0]
            # Always start the new record with a fresh buffer so data can
            # never carry over from whatever preceded this header.
            sequence_data = []
        elif current_name is not None:
            sequence_data.append(line)
    # Output the last sequence
    if current_name is not None:
        emit(current_name, sequence_data)
def main(tar_path=None):
    """Translate FASTA headers on stdin to accession IDs on stdout.

    Args:
        tar_path: Path to the .tar.xz archive containing the metadata .tsv.
            When omitted, the first command-line argument is used if one was
            given; otherwise the original default file name
            'metadata_tsv_2024_04_25.tar.xz' in the working directory.
    """
    if tar_path is None:
        # The metadata file name is date-stamped, so let the caller supply
        # it on the command line instead of editing the script each time.
        if len(sys.argv) > 1:
            tar_path = sys.argv[1]
        else:
            tar_path = 'metadata_tsv_2024_04_25.tar.xz'
    # Load dictionary
    virus_to_accession = load_virus_to_accession_dict(tar_path)
    # Process FASTA data
    process_fasta(virus_to_accession)
# Run the translation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment