Skip to content

Instantly share code, notes, and snippets.

@chand1012
Created May 15, 2017 02:12
Show Gist options
  • Save chand1012/50381ed9dd885ec2428a94fbc2e747a6 to your computer and use it in GitHub Desktop.
Save chand1012/50381ed9dd885ec2428a94fbc2e747a6 to your computer and use it in GitHub Desktop.
Web crawler made in Ruby. Designed for Wikipedia.
require 'rest-client'
require 'nokogiri'
require 'random_methods'
require 'os'
#v1.1: Added help section
#v1.2: Fixed bug with Ruby 2.2.3
#v1.3: Added change log
wikiLink = "wikipedia.org"  # domain fragment used to recognise Wikipedia URLs
urlList = []                # matched links collected during the crawl (literal [] over Array.new)
url = nil                   # URL entered by the user, normalised to include a scheme
css_selector = nil          # 'p' for Wikipedia pages, 'body' otherwise
mainUrl = nil               # domain of the entered URL, assigned after input is read
#init
# init: prompt the user for a URL (supporting the /help and /changes commands),
# then download the page and select the nodes to scan for links.
# Any failure during download/parse falls through to the rescue and exits.
begin
  # get the Url from the user
  puts "Enter '/help' for help\nEnter '/changes' for change log"
  loop do
    print "Enter Url:"
    url = gets.chomp
    if url['http://'] || url['https://']  # also accept https URLs, not just http
      puts "Already contains http://, Skipping..."
      break
    elsif url == '/help'  # help section
      puts "This program is made to assist people in the quick finding of webpages"
      puts "relevant to a specific topic."
      puts "At the moment, the program is optimized for wikipedia and more basic sites, \nas it may not find links on more modern websites."
      puts "Start by entering a url on your topic ie: \nhttp://en.wikipedia.org/wiki/Ruby_(programming_language)"
      puts "Enter the name of the file that you want the links to be in,\nie:'ruby'"
      puts "Then enter the number of topics you want to search for, for example, 3"
      puts "Enter each topic in their line, use underscores(_) instead of spaces, \nand do not capitalize the first letter."
      puts "The program will automatically search the webpage for your topics and then output them to the file name you chose."
    elsif url == '/changes'  # changes section
      puts 'v1.0: Program released'
      puts 'v1.1: Added help section'
      puts 'v1.2: Fixed compatibility issues with Ruby 2.2.2'
      puts 'v1.3: Added change log'
      puts 'Planned changes:'
      puts "Fix bug making it print the link with only 2 letters ie: '.or' instead of '.org'"
      puts 'Make user interface'
    else
      puts "Adding http://..."
      url = "http://#{url}"
      break
    end
  end
  # download and init the webpage
  page = Nokogiri::HTML(RestClient.get(url))
  # BUG FIX: the original compared mainUrl (still nil here — it is only assigned
  # after this begin/rescue) to wikiLink, so 'p' was never selected. Test the
  # entered URL instead: Wikipedia articles keep their links inside <p> tags.
  if url[wikiLink]
    css_selector = 'p'
  else
    css_selector = 'body'
  end
  paras = page.css(css_selector)
rescue  # NOTE(review): bare rescue catches any StandardError, not only 404s
  puts "404: Page not found."
  puts "Please restart client with valid url!"
  if OS.windows?
    puts `pause`
  else
    sleep(5)
  end
  exit
end
# Ask for the output file name and the list of topics to search for.
print 'Enter Output file name and extension:'
outputFile = gets.chomp
# if it has no extension, default to txt
outputFile += ".txt" unless outputFile['.']
# get search items
print "Enter number of items to search for:"
searchTimes = gets.chomp.to_i
searchItems = []
1.upto(searchTimes) do |x|  # 1.upto replaces the manual x counter
  print "Enter item #{x}:"
  searchItems << gets.chomp  # << avoids allocating a new array per item
end
# NOTE(review): String#domain is provided by the random_methods gem — confirm
# it returns the bare domain (e.g. "wikipedia.org") for subdomain URLs.
mainUrl = url.domain
# process the page for each item: pull every href out of the selected nodes,
# keep the ones matching a search item, and filter out noise links.
paras.each do |link|
  hrefs = link.css('a').map { |a| a['href'] }.compact.uniq
  searchItems.each do |item|
    hrefs.each do |href|
      # a link matches when it contains the item as typed or Capitalized
      next unless href[item] || href[item.capitalize]
      if href == url
        puts "Duplicate link. Skipping..."
      elsif href['Citation_needed'] || href['Help'] || href['File:'] || href['Category:'] || href['Talk:'] || href['Special:'] || href['Template']
        # Wikipedia meta/maintenance pages are not article content
        puts "Contains Unacceptable character(s). Skipping...."
      elsif href['http://'] && !href[mainUrl]
        puts "External Link Found!"
        urlList << href  # << instead of += [..]: no per-link array allocation
      elsif href['/wiki/'] && !href['#']
        puts 'Link Found!'
        remoteUrl = wikiLink + href  # relative wiki link -> "wikipedia.org/wiki/..."
        urlList << remoteUrl
      end
    end
  end
end
# De-duplicate the collected links, report them, and write them to the file.
urlList = urlList.compact.uniq
urlList.each do |site|
  puts "Adding link #{site} to file..."
end
# NOTE(review): Array#write_all comes from the random_methods gem — presumably
# one element per line; confirm before relying on the file format.
urlList.write_all(outputFile)
puts "Search complete!"
if OS.windows?  # a predicate already returns a boolean; never compare == true
  puts `pause`
else
  sleep(5)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment