
Search keyword extraction with Ruby. Work in progress. A short usage sketch follows the file.

serpico.rb
require 'uri'
require 'cgi'
require 'json'
 
 
# Add String#strip_arbitrary, which strips any of the given characters from both ends.
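# e.g. "//foo/bar//".strip_arbitrary('/')  # => "foo/bar"
#      "xxhixx".strip_arbitrary('x')       # => "hi"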
class String
  def strip_arbitrary(chars)
    r = chars.chars.map { |c| Regexp.quote(c) }.join
    self.gsub(/(^[#{r}]*)|([#{r}]*$)/, '')
  end
end
 
 
module Serpico
  class ExtractResult
    attr_reader :engine_name, :keyword, :parser

    def initialize(engine_name, keyword, parser)
      @engine_name = engine_name
      @keyword = keyword
      @parser = parser
    end
  end
 
  # Wraps one engine's extraction rules: query-string keys or /regex/
  # extractors, an optional link macro for rebuilding SERP URLs, and the
  # accepted charsets.
  class SearchEngineParser

    attr_reader :engine_name, :keyword_extractor, :link_macro, :charsets

    def initialize(engine_name, keyword_extractor, link_macro, charsets)
      @engine_name = engine_name
      # Normalize both to arrays so the rest of the class can iterate
      # without special-casing single values.
      @keyword_extractor = keyword_extractor.is_a?(Array) ? keyword_extractor : [keyword_extractor]
      @link_macro = link_macro
      charsets = [charsets] unless charsets.is_a?(Array)
      @charsets = charsets.map { |x| x.downcase }
    end

    # Substitute the keyword into the link macro to rebuild a results-page
    # URL. URL-escaping the keyword is an assumption, since the macro
    # expects it in query-string form.
    def get_serp_url(base_url, keyword)
      return nil if @link_macro.nil?
      return "#{base_url}/#{@link_macro.gsub('{k}', CGI.escape(keyword))}"
    end
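    # e.g. with link_macro "search?q={k}":
    #   parser.get_serp_url("https://www.google.com", "foo bar")
    #   # => "https://www.google.com/search?q=foo+bar"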
 
    def parse(serp_url)
      begin
        uri_parts = URI(serp_url)
      rescue
        return nil # malformed URLs
      end

      return nil if uri_parts.query.nil? # no query string to extract from

      query = CGI::parse(uri_parts.query)

      keyword = nil
      @keyword_extractor.each do |extractor|
        if extractor.start_with? "/"
          # Regular expression extractor: match the raw query string and
          # take the first capture group as the keyword (an assumption;
          # TODO confirm this against the Piwik definitions)
          extractor = extractor.strip_arbitrary('/')
          regex = Regexp.new(extractor)
          match = regex.match(uri_parts.query)
          if match && match[1]
            keyword = match[1]
            break
          end
        else
          # We have a regular query string extractor
          if query.has_key? extractor
            keyword = query[extractor][0]
            break
          end
        end
      end

      return nil if keyword.nil?

      return ExtractResult.new(@engine_name, keyword, self)
    end
  end
 
  @country_codes = ['af', 'ax', 'al', 'dz', 'as', 'ad', 'ao', 'ai', 'aq', 'ag', 'ar', 'am', 'aw', 'au', 'at', 'az', 'bs', 'bh', 'bd', 'bb', 'by', 'be', 'bz', 'bj', 'bm', 'bt', 'bo', 'bq', 'ba', 'bw', 'bv', 'br', 'io', 'bn', 'bg', 'bf', 'bi', 'kh', 'cm', 'ca', 'cv', 'ky', 'cf', 'td', 'cl', 'cn', 'cx', 'cc', 'co', 'km', 'cg', 'cd', 'ck', 'cr', 'ci', 'hr', 'cu', 'cw', 'cy', 'cz', 'dk', 'dj', 'dm', 'do', 'ec', 'eg', 'sv', 'gq', 'er', 'ee', 'et', 'fk', 'fo', 'fj', 'fi', 'fr', 'gf', 'pf', 'tf', 'ga', 'gm', 'ge', 'de', 'gh', 'gi', 'gr', 'gl', 'gd', 'gp', 'gu', 'gt', 'gg', 'gn', 'gw', 'gy', 'ht', 'hm', 'va', 'hn', 'hk', 'hu', 'is', 'in', 'id', 'ir', 'iq', 'ie', 'im', 'il', 'it', 'jm', 'jp', 'je', 'jo', 'kz', 'ke', 'ki', 'kp', 'kr', 'kw', 'kg', 'la', 'lv', 'lb', 'ls', 'lr', 'ly', 'li', 'lt', 'lu', 'mo', 'mk', 'mg', 'mw', 'my', 'mv', 'ml', 'mt', 'mh', 'mq', 'mr', 'mu', 'yt', 'mx', 'fm', 'md', 'mc', 'mn', 'me', 'ms', 'ma', 'mz', 'mm', 'na', 'nr', 'np', 'nl', 'nc', 'nz', 'ni', 'ne', 'ng', 'nu', 'nf', 'mp', 'no', 'om', 'pk', 'pw', 'ps', 'pa', 'pg', 'py', 'pe', 'ph', 'pn', 'pl', 'pt', 'pr', 'qa', 're', 'ro', 'ru', 'rw', 'bl', 'sh', 'kn', 'lc', 'mf', 'pm', 'vc', 'ws', 'sm', 'st', 'sa', 'sn', 'rs', 'sc', 'sl', 'sg', 'sx', 'sk', 'si', 'sb', 'so', 'za', 'gs', 'ss', 'es', 'lk', 'sd', 'sr', 'sj', 'sz', 'se', 'ch', 'sy', 'tw', 'tj', 'tz', 'th', 'tl', 'tg', 'tk', 'to', 'tt', 'tn', 'tr', 'tm', 'tc', 'tv', 'ug', 'ua', 'ae', 'gb', 'us', 'um', 'uy', 'uz', 'vu', 've', 'vn', 'vg', 'vi', 'wf', 'eh', 'ye', 'zm', 'zw', 'uk']
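  # Collapse the country-specific parts of a host into the "{}" placeholder
  # so one rule can cover every regional variant.
  # e.g. get_lossy_domain("www.google.co.uk")  # => "google.{}"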
  def Serpico.get_lossy_domain(domain)
    codes = @country_codes.join('|')
    return domain.gsub(/^(\w+[0-9]*|search)\./, '')
                 .gsub(/(^|\.)m\./, '\1')
                 .gsub(/(\.(com|org|net|co|it|edu))?\.(#{codes})(\/|$)/, '.{}\4')
                 .gsub(/(^|\.)(#{codes})\./, '\1{}.')
  end
 
 
  # Cache it at the module level
  @engines = nil

  # Build the domain => SearchEngineParser table from search_engines.json,
  # which is assumed to use the Piwik format: domain => [engine name,
  # extractor(s), link macro, charsets].
  def Serpico.get_search_engines
    return @engines unless @engines.nil?

    @engines = {}

    # Group the engines by engine name; the first definition in each group
    # supplies the defaults that later definitions for the same engine
    # inherit.
    piwik_engines = JSON.load(IO.read('search_engines.json'))
    grouped = piwik_engines.to_a.group_by { |a| a[1][0] }

    grouped.each do |engine_name, rule_definitions|
      defaults = {
        :extractor => nil,
        :link_macro => nil,
        :charsets => ['utf-8']
      }

      rule_definitions.each_index do |i|
        domain, definition = rule_definitions[i]
        rule = definition[1..-1]

        if i == 0
          defaults[:extractor] = rule[0]
          defaults[:link_macro] = rule[1] if rule.count >= 2
          defaults[:charsets] = rule[2] if rule.count >= 3

          @engines[domain] = SearchEngineParser.new(engine_name, defaults[:extractor], defaults[:link_macro], defaults[:charsets])
          next
        end

        args = [engine_name, defaults[:extractor], defaults[:link_macro], defaults[:charsets]]
        args[1] = rule[0] if rule.count >= 1
        args[2] = rule[1] if rule.count >= 2
        args[3] = rule[2] if rule.count >= 3

        @engines[domain] = SearchEngineParser.new(*args)
      end
    end

    return @engines
  end
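  # e.g. get_parser("https://www.google.com/search?q=foo") returns the
  # parser registered for "google.com", assuming that key exists in the JSON.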
  def Serpico.get_parser(referring_url)
    engines = get_search_engines
    begin
      uri_parts = URI::parse referring_url
    rescue
      return nil # malformed URLs
    end
    return nil if uri_parts.host.nil? # relative or schemeless URLs

    # First try to look up a search engine by the host name in case we have
    # a direct entry for it
    parser = nil
    if engines.has_key? uri_parts.host
      parser = engines[uri_parts.host]
    else
      lossy_domain = get_lossy_domain(uri_parts.host)
      parser = engines[lossy_domain] if engines.has_key? lossy_domain
    end
    return parser
  end
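  # True if the referrer looks like a search engine results page we can
  # actually pull a keyword out of.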
  def Serpico.is_serp(referring_url)
    parser = get_parser(referring_url)
    return false if parser.nil?
    return !parser.parse(referring_url).nil?
  end
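  # Extract the (normalized) keyword from a results-page URL, or nil if the
  # URL is not recognized.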
  def Serpico.extract(serp_url, parser=nil, lower_case=true, trimmed=true, collapse_whitespace=true)
    parser = get_parser(serp_url) if parser.nil?
    return nil if parser.nil?
    result = parser.parse(serp_url)
    return nil if result.nil?

    # parse returns an ExtractResult, so normalize a copy of its keyword
    # string rather than the result object itself.
    keyword = result.keyword.dup
    keyword.downcase! if lower_case
    keyword.strip! if trimmed
    keyword.gsub!(/\s+/, ' ') if collapse_whitespace
    return keyword
  end
end
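
A minimal usage sketch, assuming a Piwik-style search_engines.json sits next to the script and defines google.com; actual results depend on that file:

require_relative 'serpico'

url = 'https://www.google.com/search?q=foo+bar'
Serpico.is_serp(url)   # => true
Serpico.extract(url)   # => "foo bar"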
