Skip to content

Instantly share code, notes, and snippets.

@wflanagan
Created December 7, 2011 13:06
Show Gist options
  • Save wflanagan/1442731 to your computer and use it in GitHub Desktop.
Save wflanagan/1442731 to your computer and use it in GitHub Desktop.
complete links function
# Builds one metadata hash per usable anchor in `links` and memoizes the list
# in @complete_links.
#
# Relies on helpers defined outside this method: `links` (the anchors to
# inspect), `url` (the base the hrefs are resolved against), `log`, and
# `link_info_for_pagelink` (post-processes each hash before it is stored).
# `blank?` / `stringify_keys` are presumably ActiveSupport extensions -- confirm.
#
# opts - Hash of options:
#        :limit - when set, only the first :limit entries of `links` are
#                 considered (before any filtering).
#
# Anchors are skipped when the href is missing, empty, already seen, or uses
# the javascript: / mailto: schemes, and when resolving or processing them
# raises (the error is printed and the anchor dropped).
#
# Returns the Array of hashes produced by
# `link_info_for_pagelink(...).stringify_keys`; an empty Array when the link
# list is not a Nokogiri::XML::NodeSet.
def complete_links(opts = {})
  # Memoized: a non-blank cached result wins, so `opts` only take effect on
  # the call that actually builds the list. A blank result is rebuilt.
  return @complete_links unless @complete_links.blank?

  # slice(0, n) keeps the first n links; an inclusive 0..n range would keep
  # n + 1.
  link_list = opts[:limit] ? links.slice(0, opts[:limit]) : links

  @complete_links = []
  return @complete_links unless link_list.is_a?(Nokogiri::XML::NodeSet) # to try to solve bug with bad encoding on a nodeset

  # Escaped characters that get in the way of resolving hrefs. The pairs are
  # applied strictly in this order (repeats included), so double-escaped
  # sequences such as "%253D" collapse to "=". Only uppercase hex is handled.
  unescapes = [
    ["%25", "%"], ["%3D", "="], ["%26", "&"], ["%3A", ":"], ["%2F", "/"],
    ["%3F", "?"], ["%3D", "="], ["%25", "%"], ["%3B", ";"],
    ["%3A", ":"], ["%2F", "/"]
  ]
  seen = {} # raw hrefs already emitted; Hash lookup instead of Array#include?

  link_list.each do |link|
    begin
      unless link.class.to_s.include?("Nokogiri::XML::Element")
        log "TYPHOEUSPATCH: ERROR: NonXMLElement #{link.inspect}"
        next
      end
    rescue => e
      # A failure while checking/logging is reported, then the link is still
      # processed below.
      puts "TYPHOEUS: ERROR: #{e.inspect} #{e.backtrace.first}"
    end

    href = nil
    begin
      href = link["href"]
      next if href.nil? || href.length == 0
      next if seen.key?(href)
      next if href.downcase =~ /^(?:javascript|mailto)\:/
    rescue => e
      puts "TYPHOEUSPATCH: ERROR: #{href} #{e.inspect} #{e.backtrace.first}"
      next
    end

    begin
      begin
        parameter_cleaned_href = unescapes.inject(href) { |str, (escaped, plain)| str.gsub(escaped, plain) }
        x = Addressable::URI.parse(url).join(parameter_cleaned_href)
      rescue
        puts "TYPHOEUSPATCH: Error: ParseError: #{href}"
        next
      end

      # Normalize: default a missing path first (so the squeeze cannot hit
      # nil), collapse repeated slashes, drop empty queries and fragments.
      x.path = "/" if x.path.nil?
      x.path = x.path.squeeze('/')
      x.query = nil if x.query && x.query.empty?
      x.fragment = nil
      # x = PostRank::URI.c18n(PostRank::URI.unescape(x.to_s))

      images = link.css('img').map do |img|
        {:src => img["src"], :alt => img["alt"], :title => img["title"]}
      end

      link_info = {
        'href'         => href,
        'ref'          => link['ref'],
        'rel'          => link['rel'],
        'title'        => link['title'],
        'alt'          => link['alt'],
        'images'       => images,
        'text'         => link.text.to_s,
        'url'          => x.to_s,
        'absolute_url' => x.to_s
      }

      @complete_links << link_info_for_pagelink(link_info).stringify_keys
      seen[href] = true
    rescue
      puts "TYPHOEUSPATCH: ERROR: ParseError: #{href}"
      next
    end
  end

  @complete_links
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment