# mattkanwisher/extract_thai_subtitles_to_csv.rb

## extract_thai_subtitles_to_csv.rb
# This program reads a WebVTT subtitle file (e.g. from a site like Viki.com),
# counts how often each word occurs, and prints the result as CSV so that it
# can later be made into an Anki deck.

require "webvtt"
require "ffi-icu"

# Global accumulator: every token accepted by add_word is appended here.
$words = []

# Appends a token to the global $words list unless the token is empty or
# consists only of whitespace and/or punctuation.
#
# $words is later tallied into word frequencies, so separator tokens (spaces,
# the newline between two lines of a cue, "?", and so on) must not be stored.
# Rejecting only a short list of exact strings lets those through, so the
# check is done by character class instead.
#
# x - the candidate token (String).
#
# Returns $words when the token was appended, nil when it was skipped.
def add_word(x)
	# Not a word: nothing but whitespace/punctuation characters (or nothing at all).
	return if x.match?(/\A[[:space:][:punct:]]*\z/)

	$words << x
end

# Breaks one piece of subtitle text into tokens and passes each one to add_word.
#
# Boundaries come from an ICU word BreakIterator created for the "th_TH"
# locale; a fresh iterator is built on every call.
#
# line - the text to segment (String).
#
# Returns whatever ICU::BreakIterator#each_substring returns.
def split_words(line)
	segmenter = ICU::BreakIterator.new(:word, "th_TH")
	segmenter.text = line
	segmenter.each_substring do |token|
		add_word(token)
	end
end


# Reads the WebVTT file named by the first command-line argument, feeds every
# cue's text through split_words, and prints each distinct word with its
# occurrence count, most frequent first, as one "word, count" line per word.
#
# The counts are taken over everything currently held in the global $words.
def extract
	subtitles = WebVTT.read(ARGF.argv.first)

	subtitles.cues.each do |cue|
		# Remove italic markup so the tags themselves are not segmented as words.
		plain = %w[<i> </i>].reduce(cue.text) { |text, tag| text.gsub(tag, "") }
		split_words(plain)
	end

	frequencies = Hash.new(0)
	$words.each { |word| frequencies[word] += 1 }

	# Sorting on the negated count yields descending frequency order.
	frequencies.sort_by { |_word, count| -count }.each do |word, count|
		puts "#{word}, #{count}"
	end
end

# Run the extraction as soon as the file is loaded.
extract()
	# This program reads a WebVTT subtitle file (e.g. from a site like Viki.com),
	# counts how often each word occurs, and prints the result as CSV so that it
	# can later be made into an Anki deck.

	require "webvtt"
	require "ffi-icu"

	# Global accumulator: every token accepted by add_word is appended here.
	$words = []

	# Appends a token to the global $words list unless the token is empty or
	# consists only of whitespace and/or punctuation.
	#
	# $words is later tallied into word frequencies, so separator tokens (spaces,
	# the newline between two lines of a cue, "?", and so on) must not be stored.
	# Rejecting only a short list of exact strings lets those through, so the
	# check is done by character class instead.
	#
	# x - the candidate token (String).
	#
	# Returns $words when the token was appended, nil when it was skipped.
	def add_word(x)
		# Not a word: nothing but whitespace/punctuation characters (or nothing at all).
		return if x.match?(/\A[[:space:][:punct:]]*\z/)

		$words << x
	end

	# Breaks one piece of subtitle text into tokens and passes each one to add_word.
	#
	# Boundaries come from an ICU word BreakIterator created for the "th_TH"
	# locale; a fresh iterator is built on every call.
	#
	# line - the text to segment (String).
	#
	# Returns whatever ICU::BreakIterator#each_substring returns.
	def split_words(line)
		iterator = ICU::BreakIterator.new(:word, "th_TH")
		iterator.text = line
		iterator.each_substring { |x| add_word(x) }
	end


	# Reads the WebVTT file named by the first command-line argument, feeds every
	# cue's text through split_words, and prints each distinct word with its
	# occurrence count, most frequent first, as one "word, count" line per word.
	#
	# The counts are taken over everything currently held in the global $words.
	def extract()
		webvtt = WebVTT.read(ARGF.argv[0])
		webvtt.cues.each do |cue|
			# Remove italic markup so the tags themselves are not segmented as words.
			text = cue.text.gsub("<i>", "").gsub("</i>", "")
			split_words(text)
		end

		res = $words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }

		# Sorting on the negated count yields descending frequency order.
		res.sort_by { |_word, val| -val }.each do |word, val|
			puts "#{word}, #{val}"
		end
	end

	# Run the extraction as soon as the file is loaded.
	extract()