# hunj/path_strip.rb

## path_strip.rb
# path_strip(input_file, domain, output_file)
# reads an xml file containing <loc>http://domain/...</loc> entries and
# writes the path part of every entry for +domain+ to the output file,
# one "link_N,path" row per entry (N counts up from 1).
#
# Entries are recognised anywhere on a line, so indented entries, CRLF
# line endings, a last line without a trailing newline and several
# entries on a single line are all handled.
## Parameters:
# +input_file+ name of the input xml file
# +domain+ the domain of the URL to exclude in the result
# +output_file+ name of the output file, ending in .csv (preferred)
## Returns:
# the number of rows written (the same number is printed with +p+)
## Raises:
# RuntimeError when any argument is not a String; errors raised by
# File.readlines / File.open propagate unchanged.
def path_strip(input_file, domain, output_file)
  raise "domain must be string form" unless domain.is_a? String
  raise "invalid input file name" unless input_file.is_a? String
  raise "invalid output file name" unless output_file.is_a? String

  # Capture the path directly rather than slicing fixed offsets off the
  # line, which only worked for lines that were exactly "<loc>...</loc>\n".
  loc_pattern = %r{<loc>http://#{Regexp.quote(domain)}/(.*?)</loc>}

  # Read the input before opening the output so that an unreadable input
  # file does not leave an empty output file behind.
  lines = File.readlines(input_file)

  num = 0
  # Block form closes the output file even if a write raises.
  File.open(output_file, "w") do |result_file|
    lines.each do |line|
      line.scan(loc_pattern) do |(path)|
        num += 1
        result_file.puts "link_#{num},#{path}"
      end
    end
  end
  p num
end

# example: extract the www.example.com paths listed in ./sitemap.xml into ./result.csv
path_strip("./sitemap.xml", "www.example.com", "./result.csv")
	# path_strip(input_file, domain, output_file)
	# reads an xml file containing <loc>http://domain/...</loc> entries and
	# writes the path part of every entry for +domain+ to the output file,
	# one "link_N,path" row per entry (N counts up from 1).
	#
	# Entries are recognised anywhere on a line, so indented entries, CRLF
	# line endings, a last line without a trailing newline and several
	# entries on a single line are all handled.
	## Parameters:
	# +input_file+ name of the input xml file
	# +domain+ the domain of the URL to exclude in the result
	# +output_file+ name of the output file, ending in .csv (preferred)
	## Returns:
	# the number of rows written (the same number is printed with +p+)
	## Raises:
	# RuntimeError when any argument is not a String; errors raised by
	# File.readlines / File.open propagate unchanged.
	def path_strip(input_file, domain, output_file)
	  raise "domain must be string form" unless domain.is_a? String
	  raise "invalid input file name" unless input_file.is_a? String
	  raise "invalid output file name" unless output_file.is_a? String

	  # Capture the path directly rather than slicing fixed offsets off the
	  # line, which only worked for lines that were exactly "<loc>...</loc>\n".
	  loc_pattern = %r{<loc>http://#{Regexp.quote(domain)}/(.*?)</loc>}

	  # Read the input before opening the output so that an unreadable input
	  # file does not leave an empty output file behind.
	  lines = File.readlines(input_file)

	  num = 0
	  # Block form closes the output file even if a write raises.
	  File.open(output_file, "w") do |result_file|
	    lines.each do |line|
	      line.scan(loc_pattern) do |(path)|
	        num += 1
	        result_file.puts "link_#{num},#{path}"
	      end
	    end
	  end
	  p num
	end

	# example: extract the www.example.com paths listed in ./sitemap.xml into ./result.csv
	path_strip("./sitemap.xml", "www.example.com", "./result.csv")