Skip to content

Instantly share code, notes, and snippets.

@chand1012
Created May 15, 2017 02:12
Show Gist options
  • Save chand1012/50381ed9dd885ec2428a94fbc2e747a6 to your computer and use it in GitHub Desktop.
Save chand1012/50381ed9dd885ec2428a94fbc2e747a6 to your computer and use it in GitHub Desktop.
Web crawler made in Ruby. Designed for Wikipedia.
require 'rest-client'
require 'nokogiri'
require 'random_methods'
require 'os'
#v1.1: Added help section
#v1.2: Fixed bug with Ruby 2.2.3
#v1.3: Added change log
wikiLink = "wikipedia.org"  # domain fragment used to recognise Wikipedia URLs
urlList = []                # matched links collected during the crawl (literal [] over Array.new)
url = nil                   # URL entered by the user, normalised to include a scheme
css_selector = nil          # 'p' for Wikipedia pages, 'body' otherwise
mainUrl = nil               # domain of the entered URL, assigned after input is read
#init
# init: prompt the user for a URL (supporting the /help and /changes commands),
# then download the page and select the nodes to scan for links.
# Any failure during download/parse falls through to the rescue and exits.
begin
  # get the Url from the user
  puts "Enter '/help' for help\nEnter '/changes' for change log"
  loop do
    print "Enter Url:"
    url = gets.chomp
    if url['http://'] || url['https://']  # also accept https URLs, not just http
      puts "Already contains http://, Skipping..."
      break
    elsif url == '/help'  # help section
      puts "This program is made to assist people in the quick finding of webpages"
      puts "relevant to a specific topic."
      puts "At the moment, the program is optimized for wikipedia and more basic sites, \nas it may not find links on more modern websites."
      puts "Start by entering a url on your topic ie: \nhttp://en.wikipedia.org/wiki/Ruby_(programming_language)"
      puts "Enter the name of the file that you want the links to be in,\nie:'ruby'"
      puts "Then enter the number of topics you want to search for, for example, 3"
      puts "Enter each topic in their line, use underscores(_) instead of spaces, \nand do not capitalize the first letter."
      puts "The program will automatically search the webpage for your topics and then output them to the file name you chose."
    elsif url == '/changes'  # changes section
      puts 'v1.0: Program released'
      puts 'v1.1: Added help section'
      puts 'v1.2: Fixed compatibility issues with Ruby 2.2.2'
      puts 'v1.3: Added change log'
      puts 'Planned changes:'
      puts "Fix bug making it print the link with only 2 letters ie: '.or' instead of '.org'"
      puts 'Make user interface'
    else
      puts "Adding http://..."
      url = "http://#{url}"
      break
    end
  end
  # download and init the webpage
  page = Nokogiri::HTML(RestClient.get(url))
  # BUG FIX: the original compared mainUrl (still nil here — it is only assigned
  # after this begin/rescue) to wikiLink, so 'p' was never selected. Test the
  # entered URL instead: Wikipedia articles keep their links inside <p> tags.
  if url[wikiLink]
    css_selector = 'p'
  else
    css_selector = 'body'
  end
  paras = page.css(css_selector)
rescue  # NOTE(review): bare rescue catches any StandardError, not only 404s
  puts "404: Page not found."
  puts "Please restart client with valid url!"
  if OS.windows?
    puts `pause`
  else
    sleep(5)
  end
  exit
end
# Ask for the output file name and the list of topics to search for.
print 'Enter Output file name and extension:'
outputFile = gets.chomp
# if it has no extension, default to txt
outputFile += ".txt" unless outputFile['.']
# get search items
print "Enter number of items to search for:"
searchTimes = gets.chomp.to_i
searchItems = []
1.upto(searchTimes) do |x|  # 1.upto replaces the manual x counter
  print "Enter item #{x}:"
  searchItems << gets.chomp  # << avoids allocating a new array per item
end
# NOTE(review): String#domain is provided by the random_methods gem — confirm
# it returns the bare domain (e.g. "wikipedia.org") for subdomain URLs.
mainUrl = url.domain
# process the page for each item: pull every href out of the selected nodes,
# keep the ones matching a search item, and filter out noise links.
paras.each do |link|
  hrefs = link.css('a').map { |a| a['href'] }.compact.uniq
  searchItems.each do |item|
    hrefs.each do |href|
      # a link matches when it contains the item as typed or Capitalized
      next unless href[item] || href[item.capitalize]
      if href == url
        puts "Duplicate link. Skipping..."
      elsif href['Citation_needed'] || href['Help'] || href['File:'] || href['Category:'] || href['Talk:'] || href['Special:'] || href['Template']
        # Wikipedia meta/maintenance pages are not article content
        puts "Contains Unacceptable character(s). Skipping...."
      elsif href['http://'] && !href[mainUrl]
        puts "External Link Found!"
        urlList << href  # << instead of += [..]: no per-link array allocation
      elsif href['/wiki/'] && !href['#']
        puts 'Link Found!'
        remoteUrl = wikiLink + href  # relative wiki link -> "wikipedia.org/wiki/..."
        urlList << remoteUrl
      end
    end
  end
end
# De-duplicate the collected links, report them, and write them to the file.
urlList = urlList.compact.uniq
urlList.each do |site|
  puts "Adding link #{site} to file..."
end
# NOTE(review): Array#write_all comes from the random_methods gem — presumably
# one element per line; confirm before relying on the file format.
urlList.write_all(outputFile)
puts "Search complete!"
if OS.windows?  # a predicate already returns a boolean; never compare == true
  puts `pause`
else
  sleep(5)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment