This gist collects several strategies for parsing and correcting GenBank files, presented in cookbook format.
# Delete every line that matches PATTERN; the remaining lines go to outputFile.gb.
sed '/PATTERN/d' someFile.gb > outputFile.gb
# Append a new line "TEXT N" after each line that matches PATTERN
# (the text is added to $_ before perl -p prints it); non-matching lines pass through unchanged.
perl -pe '$_.= qq(TEXT N\n) if /PATTERN/' outputFile.gb > otherOutfile.gb
sed can be used for this purpose by matching PATTERN. To substitute the value of a shell variable such as $NM, wrap the sed expression in double quotes so that the shell expands the variable before sed runs.
# Value that will replace the "<unknown id>" placeholder.
NM="some value"
# The double quotes let the shell expand $NM inside the sed expression, so every
# "ACCESSION <unknown id>" becomes "ACCESSION " followed by the value of NM.
# The input file is left untouched; the result is written to genbankFile.mod.gb.
sed "s/ACCESSION <unknown id>/ACCESSION $NM/g" genbankFile.gb > genbankFile.mod.gb
Command-line perl can also be used:
# Rewrite a line that starts (after optional whitespace) with "ORGANISM ." so that
# it reads "ORGANISM Macaca mulatta"; the leading whitespace is part of the match
# and is therefore dropped. All other lines are printed unchanged.
perl -pe 's/^\s*ORGANISM \./ORGANISM Macaca mulatta/' genbankFile.gb > genbankFile.mod.gb
# For every *.gb file in the current directory, fill in the placeholder
# ACCESSION/VERSION identifiers with the file's base name and set the ORGANISM
# line. The corrected copy is written next to the input as <input>.gb
# (e.g. sample.gb -> sample.gb.gb); the input file itself is not modified.
for i in *.gb ; do
  # When nothing matches, the glob stays literal ("*.gb"); skip it.
  [ -e "$i" ] || continue
  # NM is the file name with its last extension removed (sample.gb -> sample).
  NM=$(echo "$i" | rev | cut -d. -f2- | rev)
  # replace <unknown id> with name string
  sed "s/ACCESSION <unknown id>/ACCESSION $NM/g" "${i}" > "${i}.tmp.1"
  # when converting from genbank to embl
  # Biopython looks at the VERSION line for the identifier when converting a gb file
  sed "s/VERSION <unknown id>/VERSION $NM/g" "${i}.tmp.1" > "${i}.tmp.2"
  # misc aesthetic cleanup
  # Reads .tmp.2, the output of the VERSION step (no .tmp.3 is ever produced).
  perl -pe 's/^\s*ORGANISM \./ORGANISM Macaca mulatta/' "${i}.tmp.2" > "${i}.gb"
  # Remove only the intermediates created for this file, not every *.tmp.* in the directory.
  rm -- "${i}.tmp.1" "${i}.tmp.2"
done
bash
#!/bin/bash
for i in *.gb ; do
NM=$(echo "$i" | rev | cut -d. -f2- | rev) ;
# replace <unknown id> with name string
sed "s/ACCESSION <unknown id>/ACCESSION $NM/g" ${i} > ${i}.tmp.1 ;
sed "s/VERSION <unknown id>/VERSION $NM/g" ${i}.tmp.1 > ${i}.tmp.2 ;
# when converting from genbank to embl
# Biopython looks at the VERSION line for the identifier when converting a gb file
perl -pe 's/^\s*ORGANISM \./ORGANISM Macaca mulatta/' ${i}.tmp.2 > ${i}.gb ;
rm *.tmp.* ;
done
# then loop through the files, and use biopython to modify each to an embl file
python
#!/Users/caskey/anaconda3/bin/python
# requires bash env variable FILENAME to be set or script will fail
import sys
import os
from Bio import SeqIO
# File to convert, taken from the FILENAME environment variable (a missing
# variable raises KeyError); every '/' in the value is replaced by ':'.
fname = os.environ['FILENAME'].replace('/', ':')
# Directory that holds the input and receives the output, taken from PWD.
presentDir = os.environ['PWD']
# Input path, and the output path formed by appending '.embl' to it.
fnameIn = f"{presentDir}/{fname}"
outFileName = f"{fnameIn}.embl"
# Convert the GenBank file to EMBL; the value returned by SeqIO.convert is kept in count.
count = SeqIO.convert(fnameIn, "genbank", outFileName, "embl")