Created
August 22, 2012 14:00
-
-
Save yannickwurm/3425892 to your computer and use it in GitHub Desktop.
Paste left read and right read from Illumina into a single megaread
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# (copy) Yannick Wurm - google me. | |
# All right reserved. | |
help = "Usage: \n" + | |
" #{$0} MySequences_R1.fastq MySequences_R2.fastq > MySequences_R1R2pasted.fastq\n" + | |
"Joins two paired reads into a single megaread. Can be helpful for deduplication.\n" + | |
"The two fastq files must have reads in the same order. Some error checking is performed.\n" | |
##require 'bio-faster' # not used because I can only read one at a time. (there is no "next_entry" method) | |
##require 'bio' # not used because I couldnt easily create a new fastq with pasted quality scores | |
if ARGV.length != 2 then | |
warn help | |
exit | |
end | |
fastq_R1, fastq_R2 = ARGV[0], ARGV[1] | |
[fastq_R1, fastq_R2].each do |file| | |
raise IOError "Cannot find: #{file}" unless File.exists?(file) | |
raise IOError "Empty file: #{file}" if File.zero?(file) | |
end | |
fastq_R1_stream = File.open(fastq_R1, 'r') | |
fastq_R2_stream = File.open(fastq_R2, 'r') | |
warn "Will merge #{fastq_R1} and #{fastq_R2}. If something crashes there is a problem with your fastq files" | |
while TRUE | |
r1_id_line = fastq_R1_stream.gets | |
r2_id_line = fastq_R2_stream.gets | |
if r1_id_line.nil? and r2_id_line.nil? | |
warn "Reached end of both files. Done." | |
exit | |
end | |
if r1_id_line.nil? or r2_id_line.nil? | |
warn "ERROR: End of one file but not the other" | |
warn "#{fastq_R1} contains: #{r1_id_line}" | |
warn "#{fastq_R2} contains: #{r2_id_line}" | |
warn "Prematurely stopping here" | |
end | |
r1_id = r1_id_line.split(' ')[0] #only id before space | |
r1_seq = fastq_R1_stream.gets.chomp | |
fastq_R1_stream.gets # skip the line starting with "+" | |
r1_quals = fastq_R1_stream.gets.chomp | |
r2_id = r2_id_line.split(' ')[0] | |
r2_seq = fastq_R2_stream.gets.chomp | |
fastq_R2_stream.gets.chomp | |
r2_quals = fastq_R2_stream.gets.chomp | |
raise IOError, "Different ids in both files: first: #{r1_id}, second: #{r2_id}" if r1_id != r2_id | |
puts [r1_id, | |
r1_seq + r2_seq, | |
'+', | |
r1_quals + r2_quals | |
].join("\n") | |
end | |
warn "Done." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment