# agaelebe/Reuters21578_sgml_to_txt.rb

## Reuters21578_sgml_to_txt.rb
# =Split Reuters-21578
# =(Found at: http://www.daviddlewis.com/resources/testcollections/reuters21578/)
# =SGML files into separate TXT files
#
# Documents are selected when their TOPICS attribute is "YES" and they list at least one topic.
# The text content of each selected document's <TEXT> element is written into a directory
# chosen according to its LEWISSPLIT type (train/test) and topic.
# Documents with more than one topic are written to more than one folder.
# Only documents that have a topic listed in the 'used_topics' Array are selected.
# This pre-processing is useful for text categorization applications.
#
# Author: Hugo.Borges _at_ gmail
#
# Release date: 2008 09 02

require 'libxml'
require 'iconv'

reuters_dir = "reuters21578" # location of the sgml files to process
write_dir = "reuters21578txt" # write dir

# Base names of the "<name>.txt" files inside reuters_dir; each of those
# files is read as a list of topics, one per line.
main_topics = %w{commodities metals financial energy}

# used_topics is a sorted Array with every topic that we want to use.
# Each line is stripped (so CRLF line endings and trailing blanks do not
# prevent a match later on) and blank lines are dropped.
# The non-bang flatten/sort are used on purpose: flatten! returns nil when
# nothing had to be flattened (for example when main_topics is empty), which
# made the chained sort! raise NoMethodError.
used_topics = main_topics.collect { |topic|
  File.readlines("#{reuters_dir}/#{topic}.txt").collect { |line| line.strip }
}.flatten.reject { |name| name.empty? }.sort

# Create the output tree if it is missing. File.directory? checks the literal
# path, whereas the Dir[...] glob test would misbehave on paths that contain
# glob metacharacters such as "*", "?" or "[".
[write_dir, "#{write_dir}/test", "#{write_dir}/train"].each do |dir|
  Dir.mkdir(dir) unless File.directory?(dir)
end


# Walk every regular file found in reuters_dir. Each file is read line by
# line; lines are buffered until a closing </REUTERS> tag is seen, at which
# point the buffered lines are parsed as one XML document and, when the
# document qualifies, its text is written once per matching topic to
#   <write_dir>/<train|test>/<topic>/reut21578_<NEWID>.txt
#
# Entries are filtered with File.file? instead of slice(2, 50): the order
# returned by Dir.entries is not guaranteed, so "." and ".." are not
# necessarily the first two entries, and the slice silently capped the run
# at 50 entries.
Dir.entries(reuters_dir).sort.each do |filename|
  path = File.join(reuters_dir, filename)
  next unless File.file?(path)

  new_file_lines = []

  # File.readlines closes the file handle itself (File.open(...).readlines
  # left it open until garbage collection).
  File.readlines(path).each do |line|
    line.delete!("#&") # remove characters that confuse libxml parser
    line.sub!(/\<\!DOCTYPE.*?\>/, "") # remove doctype (not used)
    # Iconv.iconv returns an Array of converted strings. Join it explicitly:
    # Array#to_s only concatenates on Ruby 1.8, on 1.9+ it returns the
    # inspect form (["..."]) and would corrupt the document.
    line = Iconv.iconv('utf-8', 'cp1251', line).join # convert to utf-8
    new_file_lines << line

    next unless line =~ /<\/REUTERS>/ # keep buffering until the document ends

    # Same Array#to_s pitfall as above: build the XML text with join, then
    # reset the buffer right away so every early exit below starts clean.
    xml_text = new_file_lines.join
    new_file_lines = []

    doc = LibXML::XML::Parser.string(xml_text).parse
    root = doc.find('/REUTERS')[0]

    next unless root.attributes.get_attribute("TOPICS").value == "YES"

    # Only TRAIN and TEST documents belong to the split. Previously every
    # value other than "TRAIN" (including "NOT-USED") fell through to "test".
    doc_type = case root.attributes.get_attribute("LEWISSPLIT").value
               when "TRAIN" then "train"
               when "TEST" then "test"
               end
    next if doc_type.nil?

    doc_id = root.attributes.get_attribute("NEWID").value

    text_node = doc.find('/REUTERS/TEXT')[0]
    next if text_node.nil? # nothing to write for this document
    text = text_node.content

    # Iterating an empty result set is a no-op, so no separate
    # "has at least one topic" check is required.
    doc.find('/REUTERS/TOPICS/D').each do |topic|
      klass = topic.content
      next unless used_topics.include?(klass)

      topic_dir = "#{write_dir}/#{doc_type}/#{klass}"
      Dir.mkdir(topic_dir) unless File.directory?(topic_dir)
      File.open("#{topic_dir}/reut21578_#{doc_id}.txt", "w") do |file|
        file.write(text)
      end
    end
  end
end
	# =Split Reuters-21578
	# =(Found at: http://www.daviddlewis.com/resources/testcollections/reuters21578/)
	# =SGML files into separate TXT files
	#
	# Documents selected are those from LEWIS SPLIT that have at least one topic.
	# Documents (only the body of text) are put in directories according to their type (train/test) and topic.
	# Documents with more than one topic are written in more than one folder.
	# Only documents that have a topic listed in 'used_topics' Array are selected.
	# This pre-processing is useful for text categorization applications.
	#
	# Author: Hugo.Borges _at_ gmail
	#
	# Release date: 2008 09 02

	require 'libxml'
	require 'iconv'

	reuters_dir = "reuters21578" # location of the sgml files to process
	write_dir = "reuters21578txt" # write dir

	# Base names of the "<name>.txt" files inside reuters_dir; each of those
	# files is read as a list of topics, one per line.
	main_topics = %w{commodities metals financial energy}

	# used_topics is a sorted Array with every topic that we want to use.
	# The block parameters were written as \|topic\| / \|line\| (escaped
	# pipes), which is a syntax error; they are plain |...| here.
	# Each line is stripped and blank lines are dropped. The non-bang
	# flatten/sort are used on purpose: flatten! returns nil when nothing had
	# to be flattened, which made the chained sort! raise NoMethodError.
	used_topics = main_topics.collect { |topic|
	  File.readlines("#{reuters_dir}/#{topic}.txt").collect { |line| line.strip }
	}.flatten.reject { |name| name.empty? }.sort

	# Create the output tree if it is missing. File.directory? checks the
	# literal path, whereas the Dir[...] glob test would misbehave on paths
	# that contain glob metacharacters such as "*", "?" or "[".
	[write_dir, "#{write_dir}/test", "#{write_dir}/train"].each do |dir|
	  Dir.mkdir(dir) unless File.directory?(dir)
	end


	# Walk every regular file found in reuters_dir. Each file is read line by
	# line; lines are buffered until a closing </REUTERS> tag is seen, at which
	# point the buffered lines are parsed as one XML document and, when the
	# document qualifies, its text is written once per matching topic to
	#   <write_dir>/<train|test>/<topic>/reut21578_<NEWID>.txt
	#
	# The block parameters were written with escaped pipes (\|filename\| etc.),
	# which is a syntax error; they are plain |...| here.
	# Entries are filtered with File.file? instead of slice(2, 50): the order
	# returned by Dir.entries is not guaranteed, so "." and ".." are not
	# necessarily the first two entries, and the slice silently capped the run
	# at 50 entries.
	Dir.entries(reuters_dir).sort.each do |filename|
	  path = File.join(reuters_dir, filename)
	  next unless File.file?(path)

	  new_file_lines = []

	  # File.readlines closes the file handle itself (File.open(...).readlines
	  # left it open until garbage collection).
	  File.readlines(path).each do |line|
	    line.delete!("#&") # remove characters that confuse libxml parser
	    line.sub!(/\<\!DOCTYPE.*?\>/, "") # remove doctype (not used)
	    # Iconv.iconv returns an Array of converted strings. Join it explicitly:
	    # Array#to_s only concatenates on Ruby 1.8, on 1.9+ it returns the
	    # inspect form (["..."]) and would corrupt the document.
	    line = Iconv.iconv('utf-8', 'cp1251', line).join # convert to utf-8
	    new_file_lines << line

	    next unless line =~ /<\/REUTERS>/ # keep buffering until the document ends

	    # Same Array#to_s pitfall as above: build the XML text with join, then
	    # reset the buffer right away so every early exit below starts clean.
	    xml_text = new_file_lines.join
	    new_file_lines = []

	    doc = LibXML::XML::Parser.string(xml_text).parse
	    root = doc.find('/REUTERS')[0]

	    next unless root.attributes.get_attribute("TOPICS").value == "YES"

	    # Only TRAIN and TEST documents belong to the split. Previously every
	    # value other than "TRAIN" (including "NOT-USED") fell through to "test".
	    doc_type = case root.attributes.get_attribute("LEWISSPLIT").value
	               when "TRAIN" then "train"
	               when "TEST" then "test"
	               end
	    next if doc_type.nil?

	    doc_id = root.attributes.get_attribute("NEWID").value

	    text_node = doc.find('/REUTERS/TEXT')[0]
	    next if text_node.nil? # nothing to write for this document
	    text = text_node.content

	    # Iterating an empty result set is a no-op, so no separate
	    # "has at least one topic" check is required.
	    doc.find('/REUTERS/TOPICS/D').each do |topic|
	      klass = topic.content
	      next unless used_topics.include?(klass)

	      topic_dir = "#{write_dir}/#{doc_type}/#{klass}"
	      Dir.mkdir(topic_dir) unless File.directory?(topic_dir)
	      File.open("#{topic_dir}/reut21578_#{doc_id}.txt", "w") do |file|
	        file.write(text)
	      end
	    end
	  end
	end