@msukmanowsky / serpico.rb (secret gist)
Search keyword extraction with Ruby. Work in progress.
require 'uri'
require 'cgi'
require 'json'
# Add String#strip_arbitrary: strips any of the given characters from both
# ends of the string (like String#strip, but for an arbitrary character set).
class String
  def strip_arbitrary(chars)
    r = chars.chars.map { |c| Regexp.quote(c) }.join
    self.gsub(/(^[#{r}]*)|([#{r}]*$)/, '')
  end
end
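# Example (illustrative values):
#   "/q=(.*)/".strip_arbitrary('/')  # => "q=(.*)"
#   "--note--".strip_arbitrary('-')  # => "note"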
module Serpico
  # Value object describing a successful keyword extraction.
  class ExtractResult
    attr_reader :engine_name, :keyword, :parser

    def initialize(engine_name, keyword, parser)
      @engine_name = engine_name
      @keyword = keyword
      @parser = parser
    end
  end
  class SearchEngineParser
    attr_reader :engine_name, :keyword_extractor, :link_macro, :charsets

    def initialize(engine_name, keyword_extractor, link_macro, charsets)
      @engine_name = engine_name
      # Normalize to arrays so callers may pass a single extractor/charset
      @keyword_extractor = keyword_extractor.is_a?(Array) ? keyword_extractor : [keyword_extractor]
      @link_macro = link_macro
      charsets = [charsets] unless charsets.is_a?(Array)
      @charsets = charsets.map { |x| x.downcase }
    end

    def get_serp_url(base_url, keyword)
      return nil if @link_macro.nil?
      # '{k}' in the link macro is a placeholder for the keyword
      "#{base_url}/#{@link_macro.gsub('{k}', keyword)}"
    end
    def parse(serp_url)
      begin
        uri_parts = URI(serp_url)
      rescue URI::InvalidURIError
        return nil # malformed URLs
      end
      return nil if uri_parts.query.nil? # no query string to extract from

      query = CGI::parse(uri_parts.query)
      keyword = nil
      @keyword_extractor.each do |extractor|
        if extractor.start_with? "/"
          # Regular expression extractor, stored as "/pattern/". Match it
          # against the raw query string and take the first capture group
          # as the keyword (assumption: the pattern captures the keyword).
          regex = Regexp.new(extractor.strip_arbitrary('/'))
          match = regex.match(uri_parts.query)
          if match && match[1]
            keyword = match[1]
            break
          end
        else
          # We have a plain query-string parameter extractor
          if query.has_key? extractor
            keyword = query[extractor][0]
            break
          end
        end
      end
      return nil if keyword.nil?
      ExtractResult.new(@engine_name, keyword, self)
    end
  end
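  # Example (illustrative): a parser whose keyword_extractor is 'q' turns
  #   'http://www.google.com/search?q=ruby'
  # into an ExtractResult with keyword "ruby"; regex-style extractors are
  # written as "/pattern/" with the keyword in the first capture group.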
  # ISO 3166-1 alpha-2 country codes (plus 'uk'), used to collapse
  # country-specific hosts into their lossy form below
  @country_codes = ['af', 'ax', 'al', 'dz', 'as', 'ad', 'ao', 'ai', 'aq', 'ag', 'ar', 'am', 'aw', 'au', 'at', 'az', 'bs', 'bh', 'bd', 'bb', 'by', 'be', 'bz', 'bj', 'bm', 'bt', 'bo', 'bq', 'ba', 'bw', 'bv', 'br', 'io', 'bn', 'bg', 'bf', 'bi', 'kh', 'cm', 'ca', 'cv', 'ky', 'cf', 'td', 'cl', 'cn', 'cx', 'cc', 'co', 'km', 'cg', 'cd', 'ck', 'cr', 'ci', 'hr', 'cu', 'cw', 'cy', 'cz', 'dk', 'dj', 'dm', 'do', 'ec', 'eg', 'sv', 'gq', 'er', 'ee', 'et', 'fk', 'fo', 'fj', 'fi', 'fr', 'gf', 'pf', 'tf', 'ga', 'gm', 'ge', 'de', 'gh', 'gi', 'gr', 'gl', 'gd', 'gp', 'gu', 'gt', 'gg', 'gn', 'gw', 'gy', 'ht', 'hm', 'va', 'hn', 'hk', 'hu', 'is', 'in', 'id', 'ir', 'iq', 'ie', 'im', 'il', 'it', 'jm', 'jp', 'je', 'jo', 'kz', 'ke', 'ki', 'kp', 'kr', 'kw', 'kg', 'la', 'lv', 'lb', 'ls', 'lr', 'ly', 'li', 'lt', 'lu', 'mo', 'mk', 'mg', 'mw', 'my', 'mv', 'ml', 'mt', 'mh', 'mq', 'mr', 'mu', 'yt', 'mx', 'fm', 'md', 'mc', 'mn', 'me', 'ms', 'ma', 'mz', 'mm', 'na', 'nr', 'np', 'nl', 'nc', 'nz', 'ni', 'ne', 'ng', 'nu', 'nf', 'mp', 'no', 'om', 'pk', 'pw', 'ps', 'pa', 'pg', 'py', 'pe', 'ph', 'pn', 'pl', 'pt', 'pr', 'qa', 're', 'ro', 'ru', 'rw', 'bl', 'sh', 'kn', 'lc', 'mf', 'pm', 'vc', 'ws', 'sm', 'st', 'sa', 'sn', 'rs', 'sc', 'sl', 'sg', 'sx', 'sk', 'si', 'sb', 'so', 'za', 'gs', 'ss', 'es', 'lk', 'sd', 'sr', 'sj', 'sz', 'se', 'ch', 'sy', 'tw', 'tj', 'tz', 'th', 'tl', 'tg', 'tk', 'to', 'tt', 'tn', 'tr', 'tm', 'tc', 'tv', 'ug', 'ua', 'ae', 'gb', 'us', 'um', 'uy', 'uz', 'vu', 've', 'vn', 'vg', 'vi', 'wf', 'eh', 'ye', 'zm', 'zw', 'uk']
  # Reduce a host name to its "lossy" form: strip leading machine/www/search
  # labels, drop mobile 'm.' labels, and replace country-specific pieces
  # with the '{}' placeholder used by the engine definitions.
  def Serpico.get_lossy_domain(domain)
    codes = @country_codes.join('|')
    domain.gsub(/^(\w+[0-9]*|search)\./, '')
          .gsub(/(^|\.)m\./, '\1')
          .gsub(/(\.(com|org|net|co|it|edu))?\.(#{codes})(\/|$)/, '.{}\4')
          .gsub(/(^|\.)(#{codes})\./, '\1{}.')
  end
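  # Examples (illustrative):
  #   Serpico.get_lossy_domain('www.google.com')   # => "google.com"
  #   Serpico.get_lossy_domain('www.google.co.uk') # => "google.{}"
  #   Serpico.get_lossy_domain('search.yahoo.com') # => "yahoo.com"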
  # Cache the parsed engine table at the module level
  @engines = nil
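  # search_engines.json is expected in piwik-style form: a map of
  #   domain => [engine name, extractor(s), link macro?, charset(s)?]
  # Illustrative shape only (not the real file contents):
  #   { "google.{}":        ["Google", "q", "search?q={k}"],
  #     "images.google.{}": ["Google Images", "q"] }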
  def Serpico.get_search_engines
    return @engines if not @engines.nil?

    @engines = {}
    # Group the engine definitions by engine name; within a group, the
    # first definition supplies defaults that later ones may override
    piwik_engines = JSON.load(IO.read('search_engines.json'))
    grouped = piwik_engines.to_a.group_by { |a| a[1][0] }
    grouped.each do |engine_name, rule_definitions|
      defaults = {
        :extractor => nil,
        :link_macro => nil,
        :charsets => ['utf-8']
      }
      rule_definitions.each_index do |i|
        domain, definition = rule_definitions[i]
        rule = definition[1..-1]
        if i == 0
          defaults[:extractor] = rule[0]
          defaults[:link_macro] = rule[1] if rule.count >= 2
          defaults[:charsets] = rule[2] if rule.count >= 3
          @engines[domain] = SearchEngineParser.new(engine_name, defaults[:extractor], defaults[:link_macro], defaults[:charsets])
          next
        end
        args = [engine_name, defaults[:extractor], defaults[:link_macro], defaults[:charsets]]
        args[1] = rule[0] if rule.count >= 1
        args[2] = rule[1] if rule.count >= 2
        args[3] = rule[2] if rule.count >= 3
        @engines[domain] = SearchEngineParser.new(*args)
      end
    end
    @engines
  end
  def Serpico.get_parser(referring_url)
    engines = get_search_engines
    begin
      uri_parts = URI.parse(referring_url)
    rescue URI::InvalidURIError
      return nil # malformed URLs
    end
    return nil if uri_parts.host.nil? # not an absolute URL
    # First try to look up a search engine by the full host name, in case
    # we have a direct entry for it; otherwise fall back to the lossy form
    parser = nil
    if engines.has_key? uri_parts.host
      parser = engines[uri_parts.host]
    else
      lossy_domain = get_lossy_domain(uri_parts.host)
      parser = engines[lossy_domain] if engines.has_key? lossy_domain
    end
    parser
  end
  def Serpico.is_serp(referring_url)
    parser = get_parser(referring_url)
    return false if parser.nil?
    !parser.parse(referring_url).nil?
  end
  def Serpico.extract(serp_url, parser=nil, lower_case=true, trimmed=true, collapse_whitespace=true)
    parser = get_parser(serp_url) if parser.nil?
    return nil if parser.nil?
    result = parser.parse(serp_url)
    return nil if result.nil?
    # Normalize the extracted keyword string in place
    keyword = result.keyword
    keyword.downcase! if lower_case
    keyword.strip! if trimmed
    keyword.gsub!(/\s+/, ' ') if collapse_whitespace
    result
  end
end
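
# Usage sketch (assumes a piwik-style search_engines.json in the working
# directory; the referrer URL below is illustrative):
if __FILE__ == $0
  url = 'http://www.google.com/search?q=ruby+keyword+extraction'
  puts Serpico.is_serp(url) # => true, if google.com resolves to an engine
  result = Serpico.extract(url)
  puts result.keyword unless result.nil? # => "ruby keyword extraction"
end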