yannickwurm/mergePairedReads.rb

## mergePairedReads.rb
#!/usr/bin/env ruby
# (copy) Yannick Wurm - google me.
# All right reserved.

help = "Usage: \n" +
       "      #{$0} MySequences_R1.fastq MySequences_R2.fastq > MySequences_R1R2pasted.fastq\n" +
       "Joins two paired reads into a single megaread. Can be helpful for deduplication.\n" +
       "The two fastq files must have reads in the same order. Some error checking is performed.\n"


##require 'bio-faster' # not used because I can only read one at a time. (there is no "next_entry" method)
##require 'bio'        # not used because I couldnt easily create a new fastq with pasted quality scores

if ARGV.length != 2 then
  warn help
  exit
end


fastq_R1, fastq_R2 = ARGV[0], ARGV[1]
[fastq_R1, fastq_R2].each do |file|
  raise IOError "Cannot find: #{file}" unless File.exists?(file)
  raise IOError "Empty file:  #{file}" if     File.zero?(file)
end


fastq_R1_stream = File.open(fastq_R1, 'r')
fastq_R2_stream = File.open(fastq_R2, 'r')

warn "Will merge #{fastq_R1} and #{fastq_R2}. If something crashes there is a problem with your fastq files"

while TRUE
  r1_id_line = fastq_R1_stream.gets
  r2_id_line = fastq_R2_stream.gets

  if r1_id_line.nil? and r2_id_line.nil?
    warn "Reached end of both files. Done."
    exit
  end
  if r1_id_line.nil? or r2_id_line.nil?
    warn "ERROR: End of one file but not the other"
    warn "#{fastq_R1} contains: #{r1_id_line}"
    warn "#{fastq_R2} contains: #{r2_id_line}"
    warn "Prematurely stopping here"
  end

  r1_id    = r1_id_line.split(' ')[0]   #only id before space
  r1_seq   = fastq_R1_stream.gets.chomp
  fastq_R1_stream.gets  # skip the line starting with "+"
  r1_quals = fastq_R1_stream.gets.chomp


  r2_id    = r2_id_line.split(' ')[0]
  r2_seq   = fastq_R2_stream.gets.chomp
  fastq_R2_stream.gets.chomp
  r2_quals = fastq_R2_stream.gets.chomp


  raise IOError, "Different ids in both files: first: #{r1_id}, second: #{r2_id}" if r1_id != r2_id

  puts [r1_id,
        r1_seq + r2_seq,
        '+',
        r1_quals + r2_quals
       ].join("\n")
end
warn "Done."
	#!/usr/bin/env ruby
	# (copy) Yannick Wurm - google me.
	# All right reserved.

	help = "Usage: \n" +
	" #{$0} MySequences_R1.fastq MySequences_R2.fastq > MySequences_R1R2pasted.fastq\n" +
	"Joins two paired reads into a single megaread. Can be helpful for deduplication.\n" +
	"The two fastq files must have reads in the same order. Some error checking is performed.\n"


	##require 'bio-faster' # not used because I can only read one at a time. (there is no "next_entry" method)
	##require 'bio' # not used because I couldnt easily create a new fastq with pasted quality scores

	if ARGV.length != 2 then
	warn help
	exit
	end


	fastq_R1, fastq_R2 = ARGV[0], ARGV[1]
	[fastq_R1, fastq_R2].each do \|file\|
	raise IOError "Cannot find: #{file}" unless File.exists?(file)
	raise IOError "Empty file: #{file}" if File.zero?(file)
	end


	fastq_R1_stream = File.open(fastq_R1, 'r')
	fastq_R2_stream = File.open(fastq_R2, 'r')

	warn "Will merge #{fastq_R1} and #{fastq_R2}. If something crashes there is a problem with your fastq files"

	while TRUE
	r1_id_line = fastq_R1_stream.gets
	r2_id_line = fastq_R2_stream.gets

	if r1_id_line.nil? and r2_id_line.nil?
	warn "Reached end of both files. Done."
	exit
	end
	if r1_id_line.nil? or r2_id_line.nil?
	warn "ERROR: End of one file but not the other"
	warn "#{fastq_R1} contains: #{r1_id_line}"
	warn "#{fastq_R2} contains: #{r2_id_line}"
	warn "Prematurely stopping here"
	end

	r1_id = r1_id_line.split(' ')[0] #only id before space
	r1_seq = fastq_R1_stream.gets.chomp
	fastq_R1_stream.gets # skip the line starting with "+"
	r1_quals = fastq_R1_stream.gets.chomp



	r2_id = r2_id_line.split(' ')[0]
	r2_seq = fastq_R2_stream.gets.chomp
	fastq_R2_stream.gets.chomp
	r2_quals = fastq_R2_stream.gets.chomp


	raise IOError, "Different ids in both files: first: #{r1_id}, second: #{r2_id}" if r1_id != r2_id

	puts [r1_id,
	r1_seq + r2_seq,
	'+',
	r1_quals + r2_quals
	].join("\n")
	end
	warn "Done."