@matthewd673
Last active February 24, 2023 05:39
Explore how Wikipedia articles connect to each other

wikiweb.rb

A very poor use of time.

Build & run

  • Download wikiweb.rb
  • Requires Ruby 3.2.1 (or anything reasonably recent)
  • Requires nokogiri (gem install nokogiri) — a quick check is shown just below
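If you want to confirm that nokogiri is actually available before running the script, this one-liner should print the gem's version (any reasonably recent version is fine):

ruby -e 'require "nokogiri"; puts Nokogiri::VERSION'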

Usage

ruby wikiweb.rb <article-url>

You can also specify:

  • -q|--quiet: run quietly (suppress the per-article output; the final summary still prints)
  • -d|--depth: specify a crawl depth (e.g.: -d 3). Default is 1.

For example:

ruby wikiweb.rb https://en.wikipedia.org/wiki/Physalis_peruviana -d 2
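Without -q, the output is shaped roughly like the sketch below (the child URLs and counts are placeholders, not real results): each newly discovered article is printed indented under its parent, followed by a bracketed count of the articles it links to, and a summary line closes the run.

 https://en.wikipedia.org/wiki/Physalis_peruviana
  https://en.wikipedia.org/wiki/<linked article>
   [N articles]
  https://en.wikipedia.org/wiki/<another linked article>
   [N articles]
  ...
  [N articles]

N articles linked at depth 2 (excluding N duplicates)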
# wikiweb.rb
# Matthew Daly 2023
# @matthewd673
require "net/http"
require "nokogiri"
# parse command line args
if ARGV.length < 1
  puts "ruby wikiweb.rb <article-url>"
  puts "\t -q --quiet: omit article links"
  puts "\t -d --depth <number>: crawl to a given depth (default 1)"
  exit
end
# kinda ugly but also kinda clever imo
url = ""
depth = 1
@quiet = false
mode = ""
for i in 0...ARGV.length
  if ARGV[i].eql?("-d") || ARGV[i].eql?("--depth")
    mode = "depth"
    next
  elsif ARGV[i].eql?("-q") || ARGV[i].eql?("--quiet")
    @quiet = true
    next
  end
  if mode.eql?("") && url.eql?("")
    url = ARGV[i]
  elsif mode.eql?("depth")
    depth = ARGV[i].to_i()
  end
  mode = ""
end
@urls = []
@dupCount = 0
# GET and parse a page for /wiki/ links
def scan_page(page_url, depth, prefix)
  return if depth == 0
  # http GET request
  page_url = "https://en.wikipedia.org" + page_url unless page_url.include?("://")
  res = Net::HTTP.get_response(URI(page_url))
  if !res.is_a?(Net::HTTPSuccess)
    puts prefix + "Failed to load URL (" + res.code + ")"
    return
  end
  # pull links out of page
  page = Nokogiri::HTML5.parse(res.body)
  links = page.xpath("//a")
  puts prefix + page_url unless @quiet
  # add each link to list and make recursive call
  count = 0
  links.each { |l|
    next if l["href"] == nil
    # filter non-article links
    if !l["href"].start_with?("/wiki/") ||
       l["href"].include?(":") ||
       l["href"].eql?("/wiki/Main_Page")
      next
    end
    # drop any #section fragment so each article is only counted once
    href = l["href"].split("#")[0]
    if !@urls.include?(href)
      @urls.push(href)
      count = count + 1
      scan_page(href, depth - 1, prefix + " ")
    else
      @dupCount = @dupCount + 1
    end
  }
  puts prefix + " [" + count.to_s() + " articles]" unless @quiet
end
# run scanner and print result
scan_page(url, depth, " ")
puts "\n" + @urls.length.to_s() + " articles linked at depth " + depth.to_s() + " (excluding " + @dupCount.to_s() + " duplicates)"