Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Created November 17, 2012 03:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gregcaporaso/4093047 to your computer and use it in GitHub Desktop.
Save gregcaporaso/4093047 to your computer and use it in GitHub Desktop.
Quick-and-dirty script that creates a barcode-read FASTQ file from a sequence-read FASTQ file whose headers contain the barcodes.

USAGE: extract_fastq_barcodes_from_header.py input_reads.fastq barcode_reads.fastq

#!/usr/bin/env python
# File created on 16 Nov 2012
from __future__ import division
__author__ = "Greg Caporaso"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.5.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Development"
from sys import argv
from unittest import TestCase, main as test_main
def script_main(input_f):
    """Yield a barcode fastq record for each record of a sequence fastq.

    input_f: iterable of lines (an open file or a list of strings) in
     four-line fastq format. The barcode is taken to be the text after
     the last ':' in each header line.

    Each yielded string holds four newline-separated lines -- the original
    header, the barcode, '+', and a placeholder quality string made of one
    'b' per barcode base -- with no trailing newline.

    A line is only considered a header at a record boundary: once a header
    has been seen, the following three lines (sequence, '+', quality) are
    skipped unconditionally. This matters because '@' is a valid quality
    character, so a quality line may itself begin with '@'. Blank or
    otherwise unrecognized lines found where a header is expected are
    ignored.
    """
    # How many lines of the current record still have to be skipped before
    # the next header can appear.
    body_lines_left = 0
    for line in input_f:
        if body_lines_left:
            body_lines_left -= 1
            continue
        if not line.startswith('@'):
            # Not inside a record and not a header (e.g. a trailing blank
            # line): ignore it and keep looking for the next header.
            continue
        header = line.strip()
        barcode = header.split(':')[-1]
        # fastq requires the quality string to be as long as the sequence.
        yield '%s\n%s\n+\n%s' % (header, barcode, 'b' * len(barcode))
        body_lines_left = 3
class ScriptTests(TestCase):
    """Unit tests for script_main, driven by the module-level fixtures."""

    def setUp(self):
        """Prepare the example input lines and the expected output text."""
        # script_main consumes an iterable of lines, so present the fixture
        # as a list of lines rather than one big string.
        self.fake_file = fake_file.split('\n')
        self.expected_output = expected_output

    def test_main(self):
        """ expected barcode fastq is generated """
        # expected_output ends with a newline after its last record, which
        # is how the command-line code writes its output (a newline after
        # every record). Joining the records with newlines alone would omit
        # that final newline and the comparison could never succeed.
        observed = ''.join('%s\n' % rec
                           for rec in script_main(self.fake_file))
        self.assertEqual(observed, self.expected_output)
# Test fixture: four fastq records (header, sequence, '+', quality), each
# header ending in ':' followed by a 12-base barcode. ScriptTests.setUp splits
# this text on newlines to imitate the lines of an input file; because the
# text ends with a newline, that split produces a trailing empty string.
fake_file = """@M141:79:749142:1:1101:16224:1417 1:N:0:CGACTAATGTGT
TACGTAGGTGGCAAGCGTTAGCCGGAATTATTGGGCGTAAAGCGCGAGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGACACTGTAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTGAAATGCGCAGAGATATGGAGGAACACCAGTAGCTAAGTCTACTTTCTGGACTGTAACTGACGCTGAGCTGCGACAGCGTGGGGATCAAACA
+
=9<==>9+--,55<@@EEEC+8AC+CCE-AAA.ACCCCCDAFFEAC>C555--*55<+55C+DDEDE3=C==4444+44@D@A33<@D@DE)@0@############################################################################################################################################################
@M141:79:749142:1:1101:16633:1423 1:N:0:CGACTTATGTGA
TACGTAGGTGGCAAGCGTTATCCGGAAGTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCTGTGACATGCGCAGAGATATGGAGACACACCAGTGGCGAAGGCGACGTTCTGGTCTGTAACTGACGCTGATGTGCACAAGCGTGGGGATCACACA
+
,5<==>>+<@@<@<@@E6+>+AACA=C+8ADAF=EE7>CEEF@ECCD>5>CEEDACCD5<CE@EDEEEEEDE@@@:+4@DDD==@@:2;@98@8::2296<E(;;6<EE<E;(66;(/;?66;<;<<E;?/.///96<EEE(6;?<?=E######################################################################################################
@M141:79:749142:1:1101:15549:1428 1:N:0:CGACTTATGTGT
TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTGTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTTGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTGAAATGCGCAGAGATATGGAGGAACATCAGTGGCGAAGGCGACTTGCTCGTCTGTAACTGACGCTGATGTGCGAAAGCGTGGTGATCCATCA
+
5<???BB?B?<5?B,<CC6CC8ECCEAFBEAA09A7>C>CCFG:D7>C>>>E=CD<<5+4CFFFFDFCDFFFF?@D,4DFD@@>>@DDEEE<A;<***1:;B;;?ACAEA?*0:*0?CC//::0A:AACA*008?EE8::::??/:0:A??AAC?################################################################################################
@M141:79:749142:1:1101:16736:1437 1:N:0:CGACTTATGTAT
TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTCGCGGTGAAATGCGCAGAGATATGGAGGCACACCAGTGGCGAAGGCGACCTACTGGTCTGTTACTGACGCTGATGTGCGCAAGCTAGAGGATCAAACA
+
<???,5?9BB9<?-ABC@>C;CFA=CEHHHFFFGHF+>CEHHHHHHHHHHAEHH7CCD+DCCCFHDDBD@444B,??DFFFEEEEEEE,;C@B;:B**28AEEE*C:*0:?:*0:AECAE*0??E*00*:?**0/*0**0?EEECEEC:A:*/*08'2?')00??EAA?820:A#############################################################################
"""
# Test fixture: the barcode fastq text expected for the records in fake_file.
# Each record keeps the original header, uses the barcode from the end of that
# header as its sequence, and carries a 12-character 'b' quality string.
# Note that the text ends with a newline after the last record.
expected_output = """@M141:79:749142:1:1101:16224:1417 1:N:0:CGACTAATGTGT
CGACTAATGTGT
+
bbbbbbbbbbbb
@M141:79:749142:1:1101:16633:1423 1:N:0:CGACTTATGTGA
CGACTTATGTGA
+
bbbbbbbbbbbb
@M141:79:749142:1:1101:15549:1428 1:N:0:CGACTTATGTGT
CGACTTATGTGT
+
bbbbbbbbbbbb
@M141:79:749142:1:1101:16736:1437 1:N:0:CGACTTATGTAT
CGACTTATGTAT
+
bbbbbbbbbbbb
"""
if __name__ == "__main__":
    # Command-line entry point.
    #   no arguments   -> print the usage line and run the unit tests
    #   one argument   -> print the usage line and exit with status 1
    #   two (or more)  -> read argv[1], write barcode records to argv[2]
    usage = "USAGE: extract_fastq_barcodes_from_header.py input_reads.fastq barcode_reads.fastq"
    if len(argv) == 1:
        # Parenthesized single-string prints behave the same whether print
        # is a statement or a function.
        print(usage)
        print("\nTest output:\n")
        test_main()
    elif len(argv) < 3:
        # The output path is missing; report it instead of failing with an
        # IndexError on argv[2].
        print(usage)
        raise SystemExit(1)
    else:
        from sys import version_info
        # 'U' requests universal newlines on Python 2. Python 3 text mode
        # already does that, and Python 3.11 removed the 'U' flag.
        read_mode = 'U' if version_info[0] < 3 else 'r'
        # Open the input first so an unreadable input path does not create
        # or truncate the output file; the with-blocks close both files even
        # if processing fails part-way.
        with open(argv[1], read_mode) as input_f:
            with open(argv[2], 'w') as output_f:
                for rec in script_main(input_f):
                    # Records come without a trailing newline.
                    output_f.write(rec)
                    output_f.write('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment