harrisj/meter.rb

## gistfile1.txt
So, basically, scanning a sentence becomes like this:

1. Split the sentence into words
2. For each word, create a cleaned version (basically strip off quotes and punctuation) and look up in the dictionary.
3. If not found, try a few fallbacks based on stemming rules or similar.
4. Otherwise, add the word to a term_misses table.

This code is really ugly (sorry!), and you might notice that it actually talks about meter rather than just syllable counts. This is because the background image for the haikus is generated from the meter. But I don't want to work out the meter for new words (sorry, a syllable count is enough), so for those I just return `-` as the meter (otherwise it's a combination of 1s and 0s).

HAIKU (11000 1010011 11110)
"That obviously
wasn't even a thought back
when this was written."

Then, I have a separate process for dumping the term_misses to a file, one to each line. I can then go through and add syllable counts for other terms and reload the same file to remove that term from term misses and add it to the terms dictionary.

## meter.rb
  # Returns the meter for a single word together with a "varies" flag.
  #
  # The stripped, upcased word is looked up with `where(:term => ...)`
  # (presumably a query on the enclosing model; the class header is not
  # visible from here). When no record exists, a series of fallback rules
  # rewrites the word (trailing punctuation, numbers, compounds, prefixes,
  # suffixes) and recurses on the pieces.
  #
  # word - the term to scan; may be nil or blank.
  #
  # Returns two values (an Array when captured): the meter String, built
  # from the stored meters of the pieces plus literal 0/1 digits for
  # suffixes, and a flag taken from `syllable_count_varies?` of the matched
  # record(s). Blank input yields ['', true]; a lone en dash ['', false].
  #
  # Raises MissingTermsError (carrying the unresolved piece) when neither
  # the lookup nor any fallback rule can resolve the word.
  def self.meter(word)
    if word.blank?
      return '', true
    end

    if word =~ /^\u2013$/
      return '', false
    end

    word = word.strip

    term_record = where(:term => word.upcase).first
    unless term_record.nil?
      return term_record.meter, term_record.syllable_count_varies?
    end

    # otherwise, second chance time!
    # NOTE: the order of the `when` clauses matters; the first match wins.
    case word
    # when /^[\u2014\u00B6\u2026]$/
    # #when /^[—¶…]$/
    #   0
    when /[\.?!]$/
      # Trailing sentence punctuation: retry without the last character.
      return meter(word[0, word.length - 1])
    when /^1?\d$/, /^\d0$/
      # 0-19 and round tens are translated through NUMBERS_TO_NAMES.
      return meter(NUMBERS_TO_NAMES[word])
    when /^(\d)(\d)$/
      # Other two-digit numbers: the round ten followed by the ones digit.
      tens, ones = $1, $2
      m1, a1 = meter(NUMBERS_TO_NAMES["#{tens}0"])
      m2, a2 = meter(ones)
      return "#{m1}#{m2}", a1 || a2
    # when /^(\d)(\d{2})$/
    #   m1, a1 = meter($1)
    #   m2, a2 = meter("HUNDRED")

    #   if $2 == '00'
    #     m3 = ''
    #     a3 = true
    #   else
    #     m3, a3 = meter($2)
    #   end

    #   return "#{m1}#{m2}#{m3}", a1||a2||a3
    when /^200?(\d{1,2})$/ # 2001
      # Fixed "110" prefix followed by the meter of the trailing digits.
      m, a = meter($1)
      return "110#{m}", a
    # when /^(\d{2})(\d{2})$/  # early years
    #   m1, awk1 = meter($1)
    #   m2, awk2 = meter($2)
    #   return "#{m1}#{m2}", awk1 || awk2
    #   m1 = meter($1)
    #   pct = meter("percent")
    #   return "#{m1[0]}"
    # when /^(.+)\.(com|net|org)$/  # amazon.com
    #   1 + syllables($1) + syllables($2)
    # when /^(([^.])\.)+[^.]\.?$/  # G.O.P.
    #   word.split(".").inject(0) {|sum, w| sum + syllables(w)}
    # when /^([\$\u20AC\uFFE5\u00A5\u00A3])(.+)/
    # #when /^([\$\u20AC￥¥£])(.+)/
    # #when /^([\$€￥¥£])(.+)/
    #   syllables($1) + syllables($2) # 13 dollars
    # when /^@/
    #   1 + syllables(word[1,word.length]) # user name
    # when /(.+)[',\?\)\]\}]$/
    #   syllables(word[0, word.length-1])
    # when /'s$/  # possesive
    #   syllables(word[0,word.length-2])
    #when /&/
    #   subwords = word.split(/&/)
    #   1 + subwords.inject(0) {|sum, w| sum + syllables(w)}
    when /[\-\/_]/
      # Compound joined by -, / or _: concatenate the meters of the parts;
      # the flag is set as soon as any part sets it.
      awkward = false

      combined = word.split(/[\-\/_]/).map { |w| m, a = meter(w); awkward ||= a; m }.join('')
      return combined, awkward
    when /^(anti|un|non|pre|post|re|micro|super|hyper|mega|over|cyber|ultra)(.+)/
      # Known prefix: scan the prefix and the remainder separately.
      prefix, rest = $1, $2
      w1, a1 = meter(prefix)
      w2, a2 = meter(rest)
      return "#{w1}#{w2}", a1 || a2
    when /(?:ably|ibly)$/
      # Both alternatives must be anchored at the end of the word. With
      # "ably" unanchored, a word merely containing "ably" kept matching
      # this branch and recursed without end.
      return meter("#{word[0, word.length-1]}e")
    when /ily$/
      m, a = meter("#{word[0, word.length-3]}y")
      return "#{m}0", a
    when /'s$/
      return meter(word[0, word.length-2])
    when /ly$/
      m, a = meter(word[0, word.length-2])
      return "#{m}0", a
    when /(iest|iness)$/
      m, a = meter("#{word[0, word.length - $1.length]}y")
      return "#{m}01", a
    when /(ness|less|like|ish|ing)$/
      m, a = meter(word[0, word.length-$1.length])
      return "#{m}0", a
    when /ism$/
      # The stem has to go through meter like every other suffix rule;
      # previously the raw stem text was returned with "00" appended.
      m, a = meter(word[0, word.length - 3])
      return "#{m}00", a
    when /ies$/
      return meter("#{word[0, word.length - 3]}y")
    when /[^s]s$/
      # Simple plural: drop the final "s".
      return meter(word[0, word.length-1])
    end

    # else it's an error
    raise MissingTermsError.new([word])
  end

## meter_from_sentence.rb
  # Scans a sentence word by word and returns the meter of each word.
  #
  # A copy of the sentence is squished, leading bullet-like characters and
  # a leading quote followed by whitespace are removed, and the rest is
  # split on whitespace. Each token is cleaned with clean_word; tokens that
  # clean down to nothing are skipped.
  #
  # sentence - the text to scan; the caller's string is not modified.
  #
  # Returns an Array of [original_token, meter, flag] triples as reported
  # by Term.meter, or [] for a blank sentence.
  #
  # Raises MissingTermsError listing every cleaned term Term.meter could
  # not resolve, after recording each of them with TermMiss.add.
  def self.meter_from_sentence(sentence)
    return [] if sentence.blank?

    text = sentence.dup
    text.squish!
    text.gsub!(/^[—¶•*]+/m, '')
    text.gsub!(/^"\s+/m, '')

    scanned = []
    unknown = []

    text.split(/\s+/).each do |token|
      term = clean_word(token)
      next if term.blank?

      begin
        pattern, varies = Term.meter(term)
        scanned << [token, pattern, varies]
      rescue MissingTermsError
        # Keep scanning so every unknown term is collected in one pass.
        scanned << [token, nil, true]
        unknown << term
      end
    end

    return scanned if unknown.empty?

    unknown.each { |term| TermMiss.add(term) }
    raise MissingTermsError.new(unknown)
  end
	So, basically, scanning a sentence becomes like this:

	1. Split the sentence into words
	2. For each word, create a cleaned version (basically strip off quotes and punctuation) and look up in the dictionary.
	3. If not found, try a few fallbacks based on stemming rules or similar.
	4. Otherwise, add the word to a term_misses table.

	This code is really ugly (sorry!), and you might notice that it actually talks about meter rather than just syllable counts. This is because the background image for the haikus is generated from the meter. But I don't want to work out the meter for new words (sorry, a syllable count is enough), so for those I just return `-` as the meter (otherwise it's a combination of 1s and 0s).

	HAIKU (11000 1010011 11110)
	"That obviously
	wasn't even a thought back
	when this was written."

	Then, I have a separate process for dumping the term_misses to a file, one to each line. I can then go through and add syllable counts for other terms and reload the same file to remove that term from term misses and add it to the terms dictionary.
	# Returns the meter for a single word together with a "varies" flag.
	#
	# The stripped, upcased word is looked up with `where(:term => ...)`
	# (presumably a query on the enclosing model; the class header is not
	# visible from here). When no record exists, a series of fallback rules
	# rewrites the word (trailing punctuation, numbers, compounds, prefixes,
	# suffixes) and recurses on the pieces.
	#
	# word - the term to scan; may be nil or blank.
	#
	# Returns two values (an Array when captured): the meter String, built
	# from the stored meters of the pieces plus literal 0/1 digits for
	# suffixes, and a flag taken from `syllable_count_varies?` of the matched
	# record(s). Blank input yields ['', true]; a lone en dash ['', false].
	#
	# Raises MissingTermsError (carrying the unresolved piece) when neither
	# the lookup nor any fallback rule can resolve the word.
	def self.meter(word)
	  if word.blank?
	    return '', true
	  end

	  if word =~ /^\u2013$/
	    return '', false
	  end

	  word = word.strip

	  term_record = where(:term => word.upcase).first
	  unless term_record.nil?
	    return term_record.meter, term_record.syllable_count_varies?
	  end

	  # otherwise, second chance time!
	  # NOTE: the order of the `when` clauses matters; the first match wins.
	  case word
	  # when /^[\u2014\u00B6\u2026]$/
	  # #when /^[—¶…]$/
	  #   0
	  when /[\.?!]$/
	    # Trailing sentence punctuation: retry without the last character.
	    return meter(word[0, word.length - 1])
	  when /^1?\d$/, /^\d0$/
	    # 0-19 and round tens are translated through NUMBERS_TO_NAMES.
	    return meter(NUMBERS_TO_NAMES[word])
	  when /^(\d)(\d)$/
	    # Other two-digit numbers: the round ten followed by the ones digit.
	    tens, ones = $1, $2
	    m1, a1 = meter(NUMBERS_TO_NAMES["#{tens}0"])
	    m2, a2 = meter(ones)
	    return "#{m1}#{m2}", a1 || a2
	  # when /^(\d)(\d{2})$/
	  #   m1, a1 = meter($1)
	  #   m2, a2 = meter("HUNDRED")

	  #   if $2 == '00'
	  #     m3 = ''
	  #     a3 = true
	  #   else
	  #     m3, a3 = meter($2)
	  #   end

	  #   return "#{m1}#{m2}#{m3}", a1||a2||a3
	  when /^200?(\d{1,2})$/ # 2001
	    # Fixed "110" prefix followed by the meter of the trailing digits.
	    m, a = meter($1)
	    return "110#{m}", a
	  # when /^(\d{2})(\d{2})$/ # early years
	  #   m1, awk1 = meter($1)
	  #   m2, awk2 = meter($2)
	  #   return "#{m1}#{m2}", awk1 || awk2
	  #   m1 = meter($1)
	  #   pct = meter("percent")
	  #   return "#{m1[0]}"
	  # when /^(.+)\.(com|net|org)$/ # amazon.com
	  #   1 + syllables($1) + syllables($2)
	  # when /^(([^.])\.)+[^.]\.?$/ # G.O.P.
	  #   word.split(".").inject(0) {|sum, w| sum + syllables(w)}
	  # when /^([\$\u20AC\uFFE5\u00A5\u00A3])(.+)/
	  # #when /^([\$\u20AC￥¥£])(.+)/
	  # #when /^([\$€￥¥£])(.+)/
	  #   syllables($1) + syllables($2) # 13 dollars
	  # when /^@/
	  #   1 + syllables(word[1,word.length]) # user name
	  # when /(.+)[',\?\)\]\}]$/
	  #   syllables(word[0, word.length-1])
	  # when /'s$/ # possesive
	  #   syllables(word[0,word.length-2])
	  #when /&/
	  #   subwords = word.split(/&/)
	  #   1 + subwords.inject(0) {|sum, w| sum + syllables(w)}
	  when /[\-\/_]/
	    # Compound joined by -, / or _: concatenate the meters of the parts;
	    # the flag is set as soon as any part sets it.
	    awkward = false

	    combined = word.split(/[\-\/_]/).map { |w| m, a = meter(w); awkward ||= a; m }.join('')
	    return combined, awkward
	  when /^(anti|un|non|pre|post|re|micro|super|hyper|mega|over|cyber|ultra)(.+)/
	    # Known prefix: scan the prefix and the remainder separately.
	    prefix, rest = $1, $2
	    w1, a1 = meter(prefix)
	    w2, a2 = meter(rest)
	    return "#{w1}#{w2}", a1 || a2
	  when /(?:ably|ibly)$/
	    # Both alternatives must be anchored at the end of the word. With
	    # "ably" unanchored, a word merely containing "ably" kept matching
	    # this branch and recursed without end.
	    return meter("#{word[0, word.length-1]}e")
	  when /ily$/
	    m, a = meter("#{word[0, word.length-3]}y")
	    return "#{m}0", a
	  when /'s$/
	    return meter(word[0, word.length-2])
	  when /ly$/
	    m, a = meter(word[0, word.length-2])
	    return "#{m}0", a
	  when /(iest|iness)$/
	    m, a = meter("#{word[0, word.length - $1.length]}y")
	    return "#{m}01", a
	  when /(ness|less|like|ish|ing)$/
	    m, a = meter(word[0, word.length-$1.length])
	    return "#{m}0", a
	  when /ism$/
	    # The stem has to go through meter like every other suffix rule;
	    # previously the raw stem text was returned with "00" appended.
	    m, a = meter(word[0, word.length - 3])
	    return "#{m}00", a
	  when /ies$/
	    return meter("#{word[0, word.length - 3]}y")
	  when /[^s]s$/
	    # Simple plural: drop the final "s".
	    return meter(word[0, word.length-1])
	  end

	  # else it's an error
	  raise MissingTermsError.new([word])
	end
	# Scans a sentence word by word and returns the meter of each word.
	#
	# A copy of the sentence is squished, leading bullet-like characters and
	# a leading quote followed by whitespace are removed, and the rest is
	# split on whitespace. Each token is cleaned with clean_word; tokens that
	# clean down to nothing are skipped.
	#
	# sentence - the text to scan; the caller's string is not modified.
	#
	# Returns an Array of [original_token, meter, flag] triples as reported
	# by Term.meter, or [] for a blank sentence.
	#
	# Raises MissingTermsError listing every cleaned term Term.meter could
	# not resolve, after recording each of them with TermMiss.add.
	def self.meter_from_sentence(sentence)
	  return [] if sentence.blank?

	  sentence = sentence.dup
	  sentence.squish!
	  sentence.gsub!(/^[—¶•*]+/m, '')
	  sentence.gsub!(/^"\s+/m, '')

	  words = sentence.split(/\s+/)
	  #sentence.split(/[\s\u2013]+/)
	  #words = sentence.split(/[\s\– ]/)

	  line = []
	  term_misses = []

	  words.each do |word|
	    clean_term = clean_word(word)
	    next if clean_term.blank?

	    begin
	      word_meter, awkward = Term.meter(clean_term)
	      line << [word, word_meter, awkward]
	    rescue MissingTermsError
	      # Keep scanning so every unknown term is collected in one pass.
	      line << [word, nil, true]
	      term_misses << clean_term
	    end
	  end

	  if term_misses.any?
	    term_misses.each do |miss|
	      TermMiss.add(miss)
	    end

	    raise MissingTermsError.new(term_misses)
	  end

	  line
	end