Last active
February 27, 2020 16:17
-
-
Save meren/7632184 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import IlluminaUtils.lib.fastqlib as u | |
import IlluminaUtils.lib.fastalib as f | |
from IlluminaUtils.utils.helperfunctions import reverse_complement | |
# call this script like this: | |
# | |
# python demultiplex.py barcode_to_sample_name_file index_fastq_file quality_filtered_fasta_file | |
# | |
# expected file formats are shown below. | |
# | |
barcode_to_sample_name_file = sys.argv[1] | |
index_fastq_file = sys.argv[2] | |
quality_filtered_fasta_file = sys.argv[3] | |
# barcode_to_sample_name_file [TAB DELIMITED]: | |
# RR6 AATCAGTCTCGT | |
# RR7 AATCCGTACAGC | |
# RTG5 ACACCTGGTGAT | |
# RTG8 TATCGTTGACCA | |
# TGB5 TTACTGTGCGAT | |
# TGB8 AGGCTACACGAC | |
# CG5 CTAACCTCCGCT | |
# CG7 GAACCAAAGGAT | |
# MSN6 GTATGCGCTGTA | |
# MSN7 GTACATACCGGT | |
# | |
# | |
# index_fastq_file [FASTQ]: | |
# @HWI-M00626:113:000000000-A5RP4:1:1101:15686:1330 2:N:0: | |
# TGTAATTGTCGC | |
# + | |
# BBBBBFFFFFBB | |
# @HWI-M00626:113:000000000-A5RP4:1:1101:15825:1331 2:N:0: | |
# GAACTAGTCACC | |
# + | |
# 3>>ABFDFFFFF | |
# @HWI-M00626:113:000000000-A5RP4:1:1101:15868:1331 2:N:0: | |
# GTACCTAATTGC | |
# + | |
# ABBBBFFFFFFF | |
# @HWI-M00626:113:000000000-A5RP4:1:1101:15411:1334 2:N:0: | |
# TGACCGGAATCA | |
# + | |
# ABBBBBBBBBBF | |
# @HWI-M00626:113:000000000-A5RP4:1:1101:15612:1334 2:N:0: | |
# GAAGTTGGAAGT | |
# + | |
# CDDDCFFFFFFF | |
# | |
# quality_filtered_fasta_file [FASTA]: | |
# >HWI-M00626:113:000000000-A5RP4:1:1101:15686:1330 1:N:0:|o:247|m/o:0.008097|MR:n=0;r1=0;r2=2|Q30:n/a|mismatches:2 | |
# tacagGggatgcaagtgttatccggaattattgggcgtaaagcgtctgcaggttgatatttaagtcttttgttaaatctttgggcttaacccaaaaactgcaaaggaaactat | |
# >HWI-M00626:113:000000000-A5RP4:1:1101:15411:1334 1:N:0:|o:247|m/o:0.000000|MR:n=0;r1=0;r2=0|Q30:n/a|mismatches:0 | |
# tacgtaggtgtcaagcgttgtccggatttattgggcgtaaagcgcccgcaggcggcttagtaagttcgaggtgaaatctcccggctcaactgggagggtgcctcgaaaactac | |
# >HWI-M00626:113:000000000-A5RP4:1:1101:15612:1334 1:N:0:|o:247|m/o:0.000000|MR:n=0;r1=0;r2=0|Q30:n/a|mismatches:0 | |
# tacatagggtgcaagcgttgtccggaattattgggcgtaaagagctcgtaggtggtttgttgcgtcgggtgtgaaattcaggggctcaacccctgacctgcactcgatacggg | |
# >HWI-M00626:113:000000000-A5RP4:1:1101:15507:1335 1:N:0:|o:247|m/o:0.000000|MR:n=0;r1=0;r2=0|Q30:n/a|mismatches:0 | |
# tacggagggtgcaagcgttatccggatttactgggtttaaagggagcgcaggcggtcgtgtaagtcagaggtgaaagcccggcgctcaacgtcggaaccgcctttgatattgc | |
# >HWI-M00626:113:000000000-A5RP4:1:1101:15526:1335 1:N:0:|o:247|m/o:0.000000|MR:n=0;r1=0;r2=0|Q30:n/a|mismatches:0 | |
# tacggagggtgcaagcgttatccggatttactgggtttaaagggagcgcaggcggtcgtgtaagtcagaggtgaaagcccggcgctcaacgtcggaaccgcctttgatattgc | |
# step 1 | |
barcode_to_sample = {} | |
for line in open(barcode_to_sample_name_file).readlines(): | |
sample, barcode = line.strip().split() | |
barcode_to_sample[barcode] = sample | |
# step 2 | |
read_id_to_sample = {} | |
index_fastq = u.FastQSource(index_fastq_file, compressed = index_fastq_file.endswith('.gz')) | |
missing_barcode = 0 | |
while index_fastq.next(raw = True): | |
# revcomp? | |
index_fastq.entry.sequence = reverse_complement(index_fastq.entry.sequence) | |
if not barcode_to_sample.has_key(index_fastq.entry.sequence): | |
missing_barcode += 1 | |
continue | |
if index_fastq.p_available: | |
index_fastq.print_percentage('(missing barcode: %d)' % missing_barcode) | |
read_id_to_sample[index_fastq.entry.header_line.split()[0]] = barcode_to_sample[index_fastq.entry.sequence] | |
sys.stderr.write('\n') | |
# final step | |
fasta = f.SequenceSource(quality_filtered_fasta_file) | |
output = f.FastaOutput(quality_filtered_fasta_file + '-DEMULTIPLEXED.fa') | |
missing_read_id = 0 | |
while fasta.next(): | |
read_id = fasta.id.split()[0] | |
if read_id not in read_id_to_sample: | |
missing_read_id += 1 | |
continue | |
if fasta.pos % 1000 == 0: | |
sys.stderr.write('\rpos: %d (missing read: %d)' % (fasta.pos, missing_read_id)) | |
sys.stderr.flush() | |
output.write_id(read_id_to_sample[read_id] + '_%d|%s' % (fasta.pos, fasta.id)) | |
output.write_seq(fasta.seq, split = False) | |
output.close() | |
sys.stderr.write('\n') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment