Skip to content

Instantly share code, notes, and snippets.

@mieko
Created May 16, 2011 04:20
Show Gist options
  • Save mieko/973937 to your computer and use it in GitHub Desktop.
Save mieko/973937 to your computer and use it in GitHub Desktop.
Scan free-form text for human-readable dates (according to Chronic)
require 'chronic'
# Reopens Chronic to add a lossless free-form text scanner: Chronic.scan
# walks a string, finds runs of tokens Chronic can parse as dates/times,
# and yields each match together with its exact character range in the
# original text.
module Chronic
  class << self
    # Scans +text+ for human-readable date/time expressions.
    #
    # Yields three values for every match found:
    #   result    - the matched substring, taken verbatim from +text+
    #               (trailing punctuation stripped)
    #   out_range - the begin...end character range of +result+ in +text+
    #   joined    - the normalized token text corresponding to out_range
    #
    # Returns an Enumerator when no block is given.
    def scan(text)
      return enum_for(:scan, text) unless block_given?

      # `words` is lossless: join('') reproduces the original string, which
      # lets us map token indices back to exact character offsets.
      words = scan_by_re(text, /\s+/).to_a

      # Character range (begin...end) of each word in the original string.
      word_offsets = []
      # Back-pointer from each generated token to the word it came from.
      rev_map = []
      # Tokens fed to Chronic's taggers. pre_normalize can split one word
      # into several, producing multiple tokens per word.
      tokens = []

      strpos = 0
      words.each.with_index do |word, word_index|
        word_offsets.push(strpos ... strpos + word.size)
        strpos += word.size
        normalized = pre_normalize(word)
        normalized.split(/\s+/).each do |v|
          # Strip whitespace, brackets, and trailing ?!" before tokenizing.
          tokens.push(Token.new(v.gsub(/\s+|[\[\]\(\)]|[?!"]\z/, '')))
          rev_map.push(word_index)
        end
      end

      # Run Chronic's standard taggers so date-like tokens get tagged.
      tokens = Repeater.scan(tokens, {})
      [Grabber, Pointer, Scalar, Ordinal, Separator,
       TimeZone].each do |tokenizer|
        tokens = tokenizer.scan(tokens)
      end

      # Walk through the tokens, finding long runs of tags.
      b, e = 0, 0
      loop do
        # Advance b to the next tagged, non-separator token.
        b += 1 while (b < tokens.size) && (!tokens[b].tagged? ||
                                           tokens[b].get_tag(Separator))
        # Walk e forward, greedily extending the run while Chronic can
        # still parse the joined text; stop at the longest parsable run.
        e = b + 1
        has_parsed = false
        loop do
          break if e >= tokens.size
          break if !tokens[e].tagged?
          candidate = tokens[b...e].map(&:word).join(' ')
          parse_result = (parse(candidate) rescue nil)
          # Once we have parsed successfully, a failure means we extended
          # one token too far: back up and stop.
          e -= 1 and break if has_parsed && !parse_result
          has_parsed = has_parsed || (!!parse_result)
          e += 1
        end

        # e won't move if b was at the end.
        break if b >= tokens.size

        tok_range = b ... e
        b = e # setup for next iteration

        # BUGFIX: recompute `joined` from the finalized token range so the
        # yielded normalized text always matches the yielded character
        # range. Previously a stale value from an earlier inner-loop pass
        # (off by one token) -- or even from a prior match entirely --
        # could be yielded here.
        joined = tokens[tok_range].map(&:word).join(' ')

        # word_offsets/rev_map are parallel arrays; the range of source
        # words must include the word of the run's last token.
        word_indices = rev_map[tok_range.begin] .. rev_map[tok_range.end - 1]
        out_range = word_offsets[word_indices]
        out_range = out_range.first.begin ... out_range.last.end
        result = text[out_range]
        # Trim trailing punctuation and shrink the range to match.
        result.gsub!(/[."?'@,;]*\z/i, '')
        out_range = out_range.begin ... out_range.begin + result.size
        # Special case for English text: a bare "am" is not a time.
        next if result.match /\Aam\z/i
        yield result, out_range, joined
      end
    end

    private

    # Like Regexp-based String#split, but lossless: the separator matches
    # are yielded into the stream too, so join('') rebuilds the input
    # exactly. Returns an Enumerator when no block is given.
    def scan_by_re(text, re)
      return self.enum_for(:scan_by_re, text, re) unless block_given?
      pos = 0
      while (md = text.match(re, pos))
        b, e = md.begin(0), md.end(0)
        yield text[pos...b] unless pos == b # text before the separator
        yield text[b...e]                   # the separator itself
        pos = e
      end
      # Trailing text after the last separator, if any.
      last_s = text[pos ... text.size]
      yield last_s unless last_s.empty?
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment