Skip to content

Instantly share code, notes, and snippets.

@jmasonherr
Created July 8, 2016 19:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmasonherr/05e4741c27900a0a686ed649b9685795 to your computer and use it in GitHub Desktop.
Save jmasonherr/05e4741c27900a0a686ed649b9685795 to your computer and use it in GitHub Desktop.
"""
No dependency Python script for converting uBiome's raw data .zip
into a single FASTQ file. Based on
https://gist.github.com/boydgreenfield/805ac27e0a6b9a5adea7
Usage:
>>>python ubiome2fastq.py ubiomedatafile.zip
"""
import argparse
import shutil
import zipfile
import gzip
import os
from subprocess import call
def enforce_headers(f1_header, f2_header):
if f1_header[0] != "@" or f2_header[0] != "@":
print 'HEADERS:', f1_header, f2_header
raise Exception("Invalid input FASTQ files.")
if f1_header.strip().split(" ")[0] != f2_header.strip().split(" ")[0]:
raise Exception("Input FASTQ files do not share headers. "
"Check and re-run w/o --strict.")
def join_interleaved(f1, f2, f_out, strict=False):
readlines = True
ix = 0
f1_lines = []
f2_lines = []
def flush_buffer(f_out, f1_lines, f2_lines):
if f1_lines and f2_lines:
assert len(f1_lines) == 4
assert len(f2_lines) == 4
f_out.write(f1_lines[0])
f_out.write(f1_lines[1])
f_out.write(f1_lines[2])
f_out.write(f1_lines[3])
f_out.write(f2_lines[0])
f_out.write(f2_lines[1])
f_out.write(f2_lines[2])
f_out.write(f2_lines[3])
f1_lines = []
f2_lines = []
return f1_lines, f2_lines
while readlines:
f1_line = f1.readline()
f2_line = f2.readline()
if f1_line == "":
readlines = False
if f2_line != "":
raise Exception("Input FASTQ files do not match in length.")
break
if ix % 4 == 0: # Header #1
if strict:
enforce_headers(f1_line, f2_line)
f1_lines, f2_lines = flush_buffer(f_out, f1_lines, f2_lines)
# Fill buffer up to 4 lines
f1_lines.append(f1_line)
f2_lines.append(f2_line)
ix += 1
_, _ = flush_buffer(f_out, f1_lines, f2_lines)
f1.close()
f2.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=("Convert uBiome's raw .zip into a single FASTQ. "))
parser.add_argument("zipfile", help="Zipfile of raw data from uBiome")
args = parser.parse_args()
path, filename = os.path.split(args.zipfile)
base_name = filename.replace('.zip', '')
tmp_folder = os.path.join(path, '%s_ubiome_extract_tmp' % base_name)
out_folder = os.path.join(path, base_name)
os.makedirs(out_folder)
zip = zipfile.ZipFile(args.zipfile).extractall(tmp_folder)
r1_files = sorted([x for x in os.listdir(tmp_folder) if '_R1_' in x])
outfile_name = os.path.join(out_folder, '%s.fastq.gz' % base_name)
f_out = gzip.open(outfile_name, mode='w')
for r1_file in r1_files:
r2_file = r1_file.replace('_R1_', '_R2_')
join_interleaved(
gzip.open(os.path.join(tmp_folder, r1_file), 'r'),
gzip.open(os.path.join(tmp_folder, r2_file), mode='r'),
f_out)
f_out.close()
shutil.rmtree(tmp_folder)
call(["fastqc", outfile_name])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment