chrislloyd/naive_bayes.txt

## gistfile1.txt
#!/usr/bin/env ruby

# gem install colored
require 'colored'

# Builds a word-presence matrix (one CSV row per message, one column per
# distinct token, plus a trailing "class" column) from the message files of
# two newsgroup directories.

# Styles +text+ with the blue/bold String helpers that the `colored` gem adds.
# Falls back to the plain text when those helpers are not available, so the
# progress output never aborts a run.
#
# @param text [String]
# @return [String]
def highlight(text)
  text.respond_to?(:blue) ? text.blue.bold : text
end

# Returns everything after the first blank line of +raw+. The part before the
# blank line is discarded (presumably the message headers -- NOTE(review):
# confirm against the corpus).
#
# @param raw [String] full file contents
# @return [String, nil] nil when +raw+ contains no blank line
def message_body(raw)
  raw.split("\n\n", 2)[1]
end

# Splits +body+ on single whitespace characters, strips every character that
# is not an ASCII letter or digit from each piece, and drops pieces that end
# up empty. Duplicates are kept; case is preserved.
#
# @param body [String]
# @return [Array<String>]
def tokenize(body)
  body.split(/\s/).map { |word| word.gsub(/[^a-zA-Z0-9]/, '') }.reject(&:empty?)
end

# Reads every file matched by +glob+ once and returns the usable ones.
# A file is skipped (with a message on stderr) when it has no blank line or
# when reading/parsing it raises a StandardError. Only StandardError is
# rescued so that Ctrl-C and `exit` still stop the script.
#
# @param glob [String] pattern handed to Dir[]
# @return [Array<Hash>] hashes with :filename, :body and :tokens keys
def load_documents(glob)
  Dir[glob].each_with_object([]) do |filename, documents|
    puts "Extracting features for #{highlight(filename)}"
    begin
      body = message_body(File.read(filename))
      if body
        documents << { filename: filename, body: body, tokens: tokenize(body) }
      else
        warn "Skipping #{filename}: no blank line separating header from body"
      end
    rescue StandardError => e
      warn "Skipping #{filename}: #{e.message}"
    end
  end
end

# Writes the feature matrix for the files matched by +glob+ to +output_path+.
#
# The header row lists every distinct token in order of first appearance,
# followed by "class". Each following row holds, per token, 1 when the token
# occurs anywhere in the message body as a case-sensitive substring and 0
# otherwise, followed by the directory name of the file.
#
# NOTE(review): the cells are substring presence flags, not word counts
# ("the" is flagged for a body containing "other"). This matches what the
# script has always emitted; confirm it is intended before changing it.
#
# @param glob [String] pattern selecting the message files
# @param output_path [String] CSV file to (over)write
# @return [void]
def build_dataset(glob = '{rec.sport.hockey,talk.politics.guns}/*', output_path = 'woo.csv')
  documents = load_documents(glob)
  # One uniq over all tokens keeps first-appearance order without re-scanning
  # the growing vocabulary for every file.
  features = documents.flat_map { |doc| doc[:tokens] }.uniq

  File.open(output_path, 'w') do |output|
    output.puts [*features, 'class'].join(',')

    documents.each do |doc|
      puts "Searching #{highlight(doc[:filename])}"
      # Tokens are purely alphanumeric, so a plain substring test is
      # equivalent to matching them as regular expressions.
      flags = features.map { |feature| doc[:body].include?(feature) ? 1 : 0 }
      output.puts [*flags, File.dirname(doc[:filename])].join(',')
    end
  end
end

build_dataset if __FILE__ == $PROGRAM_NAME

## naive_bayes.txt
Time taken to build model: 2.91 seconds

=== Stratified cross-validation ===
=== Summary ===

Correctly Classified Instances         695               91.8098 %
Incorrectly Classified Instances        62                8.1902 %
Kappa statistic                          0.8354
Mean absolute error                      0.0825
Root mean squared error                  0.2847
Relative absolute error                 16.5228 %
Root relative squared error             56.9882 %
Total Number of Instances              757

=== Detailed Accuracy By Class ===

               TP Rate   FP Rate   Precision   Recall  F-Measure   ROC Area  Class
                 0.962     0.129      0.889     0.962     0.924      0.964    rec.sport.hockey
                 0.871     0.038      0.955     0.871     0.911      0.971    talk.politics.guns
Weighted Avg.    0.918     0.085      0.921     0.918     0.918      0.968

=== Confusion Matrix ===

   a   b   <-- classified as
 378  15 |   a = rec.sport.hockey
  47 317 |   b = talk.politics.guns

## smo.txt
Time taken to build model: 10.21 seconds

=== Stratified cross-validation ===
=== Summary ===

Correctly Classified Instances         737               97.358  %
Incorrectly Classified Instances        20                2.642  %
Kappa statistic                          0.9471
Mean absolute error                      0.0264
Root mean squared error                  0.1625
Relative absolute error                  5.2917 %
Root relative squared error             32.5319 %
Total Number of Instances              757

=== Detailed Accuracy By Class ===

               TP Rate   FP Rate   Precision   Recall  F-Measure   ROC Area  Class
                 0.972     0.025      0.977     0.972     0.974      0.974    rec.sport.hockey
                 0.975     0.028      0.97      0.975     0.973      0.974    talk.politics.guns
Weighted Avg.    0.974     0.026      0.974     0.974     0.974      0.974

=== Confusion Matrix ===

   a   b   <-- classified as
 382  11 |   a = rec.sport.hockey
   9 355 |   b = talk.politics.guns

## zeror.txt
Time taken to build model: 0.02 seconds

=== Stratified cross-validation ===
=== Summary ===

Correctly Classified Instances         393               51.9155 %
Incorrectly Classified Instances       364               48.0845 %
Kappa statistic                          0
Mean absolute error                      0.4993
Root mean squared error                  0.4996
Relative absolute error                100      %
Root relative squared error            100      %
Total Number of Instances              757

=== Detailed Accuracy By Class ===

               TP Rate   FP Rate   Precision   Recall  F-Measure   ROC Area  Class
                 1         1          0.519     1         0.683      0.494    rec.sport.hockey
                 0         0          0         0         0          0.494    talk.politics.guns
Weighted Avg.    0.519     0.519      0.27      0.519     0.355      0.494

=== Confusion Matrix ===

   a   b   <-- classified as
 393   0 |   a = rec.sport.hockey
 364   0 |   b = talk.politics.guns
	#!/usr/bin/env ruby

	# gem install colored
	require 'colored'

	# Builds a word-presence matrix (one CSV row per message, one column per
	# distinct token, plus a trailing "class" column) from the message files of
	# two newsgroup directories.

	# Styles +text+ with the blue/bold String helpers that the `colored` gem adds.
	# Falls back to the plain text when those helpers are not available, so the
	# progress output never aborts a run.
	#
	# @param text [String]
	# @return [String]
	def highlight(text)
	  text.respond_to?(:blue) ? text.blue.bold : text
	end

	# Returns everything after the first blank line of +raw+. The part before the
	# blank line is discarded (presumably the message headers -- NOTE(review):
	# confirm against the corpus).
	#
	# @param raw [String] full file contents
	# @return [String, nil] nil when +raw+ contains no blank line
	def message_body(raw)
	  raw.split("\n\n", 2)[1]
	end

	# Splits +body+ on single whitespace characters, strips every character that
	# is not an ASCII letter or digit from each piece, and drops pieces that end
	# up empty. Duplicates are kept; case is preserved.
	#
	# @param body [String]
	# @return [Array<String>]
	def tokenize(body)
	  body.split(/\s/).map { |word| word.gsub(/[^a-zA-Z0-9]/, '') }.reject(&:empty?)
	end

	# Reads every file matched by +glob+ once and returns the usable ones.
	# A file is skipped (with a message on stderr) when it has no blank line or
	# when reading/parsing it raises a StandardError. Only StandardError is
	# rescued so that Ctrl-C and `exit` still stop the script.
	#
	# @param glob [String] pattern handed to Dir[]
	# @return [Array<Hash>] hashes with :filename, :body and :tokens keys
	def load_documents(glob)
	  Dir[glob].each_with_object([]) do |filename, documents|
	    puts "Extracting features for #{highlight(filename)}"
	    begin
	      body = message_body(File.read(filename))
	      if body
	        documents << { filename: filename, body: body, tokens: tokenize(body) }
	      else
	        warn "Skipping #{filename}: no blank line separating header from body"
	      end
	    rescue StandardError => e
	      warn "Skipping #{filename}: #{e.message}"
	    end
	  end
	end

	# Writes the feature matrix for the files matched by +glob+ to +output_path+.
	#
	# The header row lists every distinct token in order of first appearance,
	# followed by "class". Each following row holds, per token, 1 when the token
	# occurs anywhere in the message body as a case-sensitive substring and 0
	# otherwise, followed by the directory name of the file.
	#
	# NOTE(review): the cells are substring presence flags, not word counts
	# ("the" is flagged for a body containing "other"). This matches what the
	# script has always emitted; confirm it is intended before changing it.
	#
	# @param glob [String] pattern selecting the message files
	# @param output_path [String] CSV file to (over)write
	# @return [void]
	def build_dataset(glob = '{rec.sport.hockey,talk.politics.guns}/*', output_path = 'woo.csv')
	  documents = load_documents(glob)
	  # One uniq over all tokens keeps first-appearance order without re-scanning
	  # the growing vocabulary for every file.
	  features = documents.flat_map { |doc| doc[:tokens] }.uniq

	  File.open(output_path, 'w') do |output|
	    output.puts [*features, 'class'].join(',')

	    documents.each do |doc|
	      puts "Searching #{highlight(doc[:filename])}"
	      # Tokens are purely alphanumeric, so a plain substring test is
	      # equivalent to matching them as regular expressions.
	      flags = features.map { |feature| doc[:body].include?(feature) ? 1 : 0 }
	      output.puts [*flags, File.dirname(doc[:filename])].join(',')
	    end
	  end
	end

	build_dataset if __FILE__ == $PROGRAM_NAME
	Time taken to build model: 2.91 seconds

	=== Stratified cross-validation ===
	=== Summary ===

	Correctly Classified Instances 695 91.8098 %
	Incorrectly Classified Instances 62 8.1902 %
	Kappa statistic 0.8354
	Mean absolute error 0.0825
	Root mean squared error 0.2847
	Relative absolute error 16.5228 %
	Root relative squared error 56.9882 %
	Total Number of Instances 757

	=== Detailed Accuracy By Class ===

	TP Rate FP Rate Precision Recall F-Measure ROC Area Class
	0.962 0.129 0.889 0.962 0.924 0.964 rec.sport.hockey
	0.871 0.038 0.955 0.871 0.911 0.971 talk.politics.guns
	Weighted Avg. 0.918 0.085 0.921 0.918 0.918 0.968

	=== Confusion Matrix ===

	a b <-- classified as
	378 15 | a = rec.sport.hockey
	47 317 | b = talk.politics.guns
	Time taken to build model: 10.21 seconds

	=== Stratified cross-validation ===
	=== Summary ===

	Correctly Classified Instances 737 97.358 %
	Incorrectly Classified Instances 20 2.642 %
	Kappa statistic 0.9471
	Mean absolute error 0.0264
	Root mean squared error 0.1625
	Relative absolute error 5.2917 %
	Root relative squared error 32.5319 %
	Total Number of Instances 757

	=== Detailed Accuracy By Class ===

	TP Rate FP Rate Precision Recall F-Measure ROC Area Class
	0.972 0.025 0.977 0.972 0.974 0.974 rec.sport.hockey
	0.975 0.028 0.97 0.975 0.973 0.974 talk.politics.guns
	Weighted Avg. 0.974 0.026 0.974 0.974 0.974 0.974

	=== Confusion Matrix ===

	a b <-- classified as
	382 11 | a = rec.sport.hockey
	9 355 | b = talk.politics.guns
	Time taken to build model: 0.02 seconds

	=== Stratified cross-validation ===
	=== Summary ===

	Correctly Classified Instances 393 51.9155 %
	Incorrectly Classified Instances 364 48.0845 %
	Kappa statistic 0
	Mean absolute error 0.4993
	Root mean squared error 0.4996
	Relative absolute error 100 %
	Root relative squared error 100 %
	Total Number of Instances 757

	=== Detailed Accuracy By Class ===

	TP Rate FP Rate Precision Recall F-Measure ROC Area Class
	1 1 0.519 1 0.683 0.494 rec.sport.hockey
	0 0 0 0 0 0.494 talk.politics.guns
	Weighted Avg. 0.519 0.519 0.27 0.519 0.355 0.494

	=== Confusion Matrix ===

	a b <-- classified as
	393 0 | a = rec.sport.hockey
	364 0 | b = talk.politics.guns