Skip to content

Instantly share code, notes, and snippets.

@blahah
Last active December 17, 2015 21:59
Show Gist options
  • Save blahah/5678470 to your computer and use it in GitHub Desktop.
Save blahah/5678470 to your computer and use it in GitHub Desktop.
Snippet to extract consensus joined sequences from Sanger sequencing with forward and reverse primers (parsing data in the format provided by Source Bioscience), and some extra code to reconstruct the original fragment sequences for this specific case (virus protein sequencing with forward and revers primers from two overlapping fragments from e…
require 'rubygems'
require 'bio'
# load sequences, reverse-complement-align the pairs
seqs = {}
# get all .seq files
Dir.chdir('../raw') do
Dir['*.seq'].each do |file|
# load as fasta format
Bio::FastaFormat.open(file).each do |f|
# grab each fasta entry and extract metadata from id
virus, gene, orientation = f.entry_id.scan(/[0-9]+_([0-9]+)_([0-9][Ab])([FR])_.+/).first
seqs[virus] = {} if !seqs.has_key? virus
seqs[virus][gene] = {} if !seqs[virus].has_key? gene
# store in hash
seqs[virus][gene][orientation] = f.to_seq
end
end
end
def stripN(seq)
# this is ugly - a recursive end-search would be much prettier
seq = seq.gsub(/^[atgc]{0,3}n+[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*/, '')
seq = seq.gsub(/n+[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}$/, '')
return seq
end
def checkSequence(seq, cutoff)
# return true if sequence < cutoff% N
c = seq.composition
c = c['n'] / c.values.reduce(:+).to_f
if c < cutoff
return true
else
return false
end
end
# store final sequences
final4a = {}
final2b = {}
def storeSeq(key, seq, hash)
hash[key] = seq
end
# for each virus
seqs.each_pair do |virus, vdata|
# for each gene
vdata.each_pair do |gene, gdata|
output = gene == '4A' ? final4a : final2b
# get F and R
f, r = gdata['F'], gdata['R']
f.na; r.na
f, r = stripN(f), stripN(r).complement
keepf, keepr = checkSequence(f, 0.2), checkSequence(r, 0.2)
if keepf && keepr
# both sequences good
# perform alignment of the two ends
clustal = Bio::ClustalW.new
aln = clustal.query_alignment([f, r.complement]).alignment
seq = Bio::Sequence::NA.new(aln.consensus_string(threshold=0.5, :gap_mode=>-1))
storeSeq(virus, seq, output)
elsif keepf
puts "reverse #{virus}:#{gene} sequence was low quality"
storeSeq(virus, f, output)
elsif keepr
puts "forward #{virus}:#{gene} sequence was low quality"
storeSeq(virus, r, output)
else
puts "both #{virus}:#{gene} sequences were low quality"
end
end
end
File.open('../joined_sequences/joined.fa', 'w') do |out|
final4a.each_pair do |virus, foura|
twob = final2b[virus]
clustal = Bio::ClustalW.new
aln = clustal.query_alignment([twob, foura]).alignment
seq = Bio::Sequence::NA.new(aln.consensus_string(threshold=0.5, :gap_mode=>-1))
out.puts seq.to_fasta(virus, 60)
end
end
File.open('../joined_sequences/4a.fa', 'w') do |out|
final4a.each_pair do |virus, seq|
out.puts seq.to_fasta(virus, 60)
end
end
File.open('../joined_sequences/2b.fa', 'w') do |out|
final2b.each_pair do |virus, seq|
out.puts seq.to_fasta(virus, 60)
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment