Shell one-liner that parses the FASTA headers of the NCBI RNA file for the Burmese python (Python molurus bivittatus) genome. It will likely work on the RNA FASTA files of other NCBI genomes as well.
output fields:
- transcript ID
- full transcript ID w/ version (.1, .2, etc.)
- full gene identifier (watch out for spaces and weird symbols)
- gene symbol
- transcript variant (watch out for spaces), with NA meaning none
- type of transcript (mRNA, ncRNA, etc.)
# Parse NCBI RNA FASTA headers into a six-column, tab-separated table.
#
# Input  (stdin):  FASTA text; only lines starting with ">" are used.
# Output (stdout): one row per header with the columns
#   1. transcript ID without version   (text before the first "." of column 2)
#   2. transcript ID with version      (first space-delimited header token)
#   3. full gene identifier            (description without the trailing symbol)
#   4. gene symbol                     (last word of the description, parentheses removed)
#   5. transcript variant              ("NA" when the header has none)
#   6. type of transcript              (last ", "-separated piece, e.g. mRNA)
# A column that cannot be derived from a header is written as "NA" so every
# row always has six fields.
#
# Expected header layout (assumed from the RefSeq deflines this was written
# for; headers with a different layout will give odd columns 3-6):
#   >ACC.V [PREDICTED: ]Genus species <gene description> (<SYMBOL>)[, transcript variant <V>], <type>
# NOTE(review): organism names longer than two words (e.g. with a subspecies)
# leave the extra word(s) at the start of column 3.
parse_ncbi_rna_headers() {
  awk -v OFS='\t' '
    /^>/ {
      header = substr($0, 2)

      # Split the accession (first token) from the free-text description.
      sp = index(header, " ")
      if (sp) {
        acc  = substr(header, 1, sp - 1)
        desc = substr(header, sp + 1)
      } else {
        acc  = header
        desc = ""
      }
      dot  = index(acc, ".")
      base = (dot ? substr(acc, 1, dot - 1) : acc)

      # Drop the optional "PREDICTED: " tag, then the two-word organism name.
      # The tag is optional so that curated records, which lack it, do not
      # lose the first word of their gene description.
      sub(/^PREDICTED: /, "", desc)
      sub(/^[^ ]+ [^ ]+( |$)/, "", desc)

      n = split(desc, part, ", ")
      rna_type = (n >= 1 ? part[n] : "NA")

      # Require the full phrase "transcript variant": a bare /^transcript/
      # test would also match gene names such as "transcription factor 7".
      variant  = "NA"
      gene_end = n - 1
      if (n >= 2 && part[n - 1] ~ /^transcript variant/) {
        variant  = part[n - 1]
        gene_end = n - 2
      }

      # Re-join every remaining piece so gene names that themselves contain
      # ", " are kept whole instead of being cut at their last comma.
      gene = ""
      for (i = 1; i <= gene_end; i++)
        gene = gene (i > 1 ? ", " : "") part[i]

      if (gene == "") {
        gene_name = "NA"
        symbol    = "NA"
      } else {
        # The symbol is the last space-delimited word; a one-word gene is
        # reported as both identifier and symbol.
        gene_name = gene
        symbol    = gene
        if (match(gene, / [^ ]*$/)) {
          gene_name = substr(gene, 1, RSTART - 1)
          symbol    = substr(gene, RSTART + 1)
        }
        gsub(/[()]/, "", symbol)
      }

      print base, acc, gene_name, symbol, variant, rna_type
    }
  '
}

# Input archive and the output table derived from its name
# (…_rna.fna.gz -> …_rna_metadata_parsed.tsv).
rna_fasta="GCF_000186305.1_Python_molurus_bivittatus-5.0.2_rna.fna.gz"
metadata_tsv="${rna_fasta%.fna.gz}_metadata_parsed.tsv"

# Decompress once and parse in a single pass. `gzip -dc` is used rather than
# `zcat` because some platforms ship a zcat that only accepts .Z files.
# The readability check keeps an existing table from being overwritten with
# an empty file when the archive is missing; no `exit` is used so the snippet
# stays safe to paste into an interactive shell.
if [[ -r "$rna_fasta" ]]; then
  gzip -dc -- "$rna_fasta" | parse_ncbi_rna_headers > "$metadata_tsv"
else
  printf 'cannot read input file: %s\n' "$rna_fasta" >&2
fi