Skip to content

Instantly share code, notes, and snippets.

@meren
Last active February 27, 2020 16:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save meren/7632184 to your computer and use it in GitHub Desktop.
Save meren/7632184 to your computer and use it in GitHub Desktop.
import sys
import IlluminaUtils.lib.fastqlib as u
import IlluminaUtils.lib.fastalib as f
from IlluminaUtils.utils.helperfunctions import reverse_complement
# call this script like this:
#
# python demultiplex.py barcode_to_sample_name_file index_fastq_file quality_filtered_fasta_file
#
# expected file formats are shown below.
#
barcode_to_sample_name_file = sys.argv[1]
index_fastq_file = sys.argv[2]
quality_filtered_fasta_file = sys.argv[3]
# barcode_to_sample_name_file [TAB DELIMITED]:
# RR6 AATCAGTCTCGT
# RR7 AATCCGTACAGC
# RTG5 ACACCTGGTGAT
# RTG8 TATCGTTGACCA
# TGB5 TTACTGTGCGAT
# TGB8 AGGCTACACGAC
# CG5 CTAACCTCCGCT
# CG7 GAACCAAAGGAT
# MSN6 GTATGCGCTGTA
# MSN7 GTACATACCGGT
#
#
# index_fastq_file [FASTQ]:
# @HWI-M00626:113:000000000-A5RP4:1:1101:15686:1330 2:N:0:
# TGTAATTGTCGC
# +
# BBBBBFFFFFBB
# @HWI-M00626:113:000000000-A5RP4:1:1101:15825:1331 2:N:0:
# GAACTAGTCACC
# +
# 3>>ABFDFFFFF
# @HWI-M00626:113:000000000-A5RP4:1:1101:15868:1331 2:N:0:
# GTACCTAATTGC
# +
# ABBBBFFFFFFF
# @HWI-M00626:113:000000000-A5RP4:1:1101:15411:1334 2:N:0:
# TGACCGGAATCA
# +
# ABBBBBBBBBBF
# @HWI-M00626:113:000000000-A5RP4:1:1101:15612:1334 2:N:0:
# GAAGTTGGAAGT
# +
# CDDDCFFFFFFF
#
# quality_filtered_fasta_file [FASTA]:
# >HWI-M00626:113:000000000-A5RP4:1:1101:15686:1330 1:N:0:|o:247|m/o:0.008097|MR:n=0;r1=0;r2=2|Q30:n/a|mismatches:2
# tacagGggatgcaagtgttatccggaattattgggcgtaaagcgtctgcaggttgatatttaagtcttttgttaaatctttgggcttaacccaaaaactgcaaaggaaactat
# >HWI-M00626:113:000000000-A5RP4:1:1101:15411:1334 1:N:0:|o:247|m/o:0.000000|MR:n=0;r1=0;r2=0|Q30:n/a|mismatches:0
# tacgtaggtgtcaagcgttgtccggatttattgggcgtaaagcgcccgcaggcggcttagtaagttcgaggtgaaatctcccggctcaactgggagggtgcctcgaaaactac
# >HWI-M00626:113:000000000-A5RP4:1:1101:15612:1334 1:N:0:|o:247|m/o:0.000000|MR:n=0;r1=0;r2=0|Q30:n/a|mismatches:0
# tacatagggtgcaagcgttgtccggaattattgggcgtaaagagctcgtaggtggtttgttgcgtcgggtgtgaaattcaggggctcaacccctgacctgcactcgatacggg
# >HWI-M00626:113:000000000-A5RP4:1:1101:15507:1335 1:N:0:|o:247|m/o:0.000000|MR:n=0;r1=0;r2=0|Q30:n/a|mismatches:0
# tacggagggtgcaagcgttatccggatttactgggtttaaagggagcgcaggcggtcgtgtaagtcagaggtgaaagcccggcgctcaacgtcggaaccgcctttgatattgc
# >HWI-M00626:113:000000000-A5RP4:1:1101:15526:1335 1:N:0:|o:247|m/o:0.000000|MR:n=0;r1=0;r2=0|Q30:n/a|mismatches:0
# tacggagggtgcaagcgttatccggatttactgggtttaaagggagcgcaggcggtcgtgtaagtcagaggtgaaagcccggcgctcaacgtcggaaccgcctttgatattgc
# step 1
barcode_to_sample = {}
for line in open(barcode_to_sample_name_file).readlines():
sample, barcode = line.strip().split()
barcode_to_sample[barcode] = sample
# step 2
read_id_to_sample = {}
index_fastq = u.FastQSource(index_fastq_file, compressed = index_fastq_file.endswith('.gz'))
missing_barcode = 0
while index_fastq.next(raw = True):
# revcomp?
index_fastq.entry.sequence = reverse_complement(index_fastq.entry.sequence)
if not barcode_to_sample.has_key(index_fastq.entry.sequence):
missing_barcode += 1
continue
if index_fastq.p_available:
index_fastq.print_percentage('(missing barcode: %d)' % missing_barcode)
read_id_to_sample[index_fastq.entry.header_line.split()[0]] = barcode_to_sample[index_fastq.entry.sequence]
sys.stderr.write('\n')
# final step
fasta = f.SequenceSource(quality_filtered_fasta_file)
output = f.FastaOutput(quality_filtered_fasta_file + '-DEMULTIPLEXED.fa')
missing_read_id = 0
while fasta.next():
read_id = fasta.id.split()[0]
if read_id not in read_id_to_sample:
missing_read_id += 1
continue
if fasta.pos % 1000 == 0:
sys.stderr.write('\rpos: %d (missing read: %d)' % (fasta.pos, missing_read_id))
sys.stderr.flush()
output.write_id(read_id_to_sample[read_id] + '_%d|%s' % (fasta.pos, fasta.id))
output.write_seq(fasta.seq, split = False)
output.close()
sys.stderr.write('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment