Skip to content

Instantly share code, notes, and snippets.

@walterst
Last active December 26, 2015 14:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save walterst/7164604 to your computer and use it in GitHub Desktop.
Save walterst/7164604 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
""" Usage
python extract_bcs_from_fastq_ends.py X Y Z A B C D E
Where
X: input fastq file with barcodes at the beginning and ends of the reads
Y: output fastq barcodes file
Z: output reads fastq file (with barcodes removed)
A: length of forward barcode (int)
B: length of reverse barcode (int)
C: True/False flag to reverse complement the forward barcode
D: True/False flag to reverse complement the reverse barcode
E: True/False switch order of barcodes (False-order is barcode at start of
the read followed by barcode at end of read, True-order is barcode at end of
read followed by barcode at the beginning of the read).
"""
from sys import argv
from cogent.parse.fastq import MinimalFastqParser
from cogent import DNA
f = open(argv[1], "U")
bc_out = open(argv[2], "w")
read_out = open(argv[3], "w")
f_barcode_size = int(argv[4])
r_barcode_size = int(argv[5])
rc_f_bc = argv[6]
if rc_f_bc == "True":
rc_f_bc = True
elif rc_f_bc == "False":
rc_f_bc = False
else:
raise ValueError,("Fifth parameter must either be True or False")
rc_r_bc = argv[7]
if rc_r_bc == "True":
rc_r_bc = True
elif rc_r_bc == "False":
rc_r_bc = False
else:
raise ValueError,("Sixth parameter must either be True or False")
switch_bc_order = argv[8]
if switch_bc_order == "True":
switch_bc_order = True
elif switch_bc_order == "False":
switch_bc_order = False
else:
raise ValueError,("Seventh parameter must either be True or False")
for data in MinimalFastqParser(f, strict=False):
curr_label = data[0].strip()
curr_f_bc_read = data[1].strip()[0:f_barcode_size]
curr_read = data[1].strip()[f_barcode_size:-r_barcode_size]
curr_r_bc_read = data[1].strip()[-r_barcode_size:]
if rc_f_bc:
curr_f_bc_read = DNA.rc(curr_f_bc_read)
if rc_r_bc:
curr_r_bc_read = DNA.rc(curr_r_bc_read)
curr_f_bc_qual = data[2].strip()[0:f_barcode_size]
curr_qual = data[2].strip()[f_barcode_size:-r_barcode_size]
curr_r_bc_qual = data[2].strip()[-r_barcode_size:]
if rc_f_bc:
curr_f_bc_qual = curr_f_bc_qual[::-1]
if rc_r_bc:
curr_r_bc_qual = curr_r_bc_qual[::-1]
bc_out.write("@%s\n" % curr_label)
if switch_bc_order:
final_bc_read = curr_r_bc_read + curr_f_bc_read
final_bc_qual = curr_r_bc_qual + curr_f_bc_qual
else:
final_bc_read = curr_f_bc_read + curr_r_bc_read
final_bc_qual = curr_f_bc_qual + curr_r_bc_qual
bc_out.write("%s\n" % final_bc_read)
bc_out.write("+\n")
bc_out.write("%s\n" % final_bc_qual)
read_out.write("@%s\n" % curr_label)
read_out.write("%s\n" % curr_read)
read_out.write("+\n")
read_out.write("%s\n" % curr_qual)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment