Epigene/gist:cda2a7b148012e0e06d6

## gistfile1.txt
# encoding: Windows-1252

# Reports sentences that occur more than once in te.csv.
#
# Pipeline: read the file, blank out everything except letters, spaces and
# sentence punctuation, remove capitalised words in the middle or at the end
# of a sentence, lowercase the text, split it into sentences, tidy each
# sentence, then print every sentence that appears at least twice (each one
# terminated by ";").

require "unicode_utils/downcase"

# Sentence fragments with this many characters or fewer are ignored.
# NOTE(review): the length is measured on the raw fragment, before leading,
# trailing and repeated spaces are removed.
sentence_length = 20

# Read the whole input and turn every run of tabs / line breaks into one space.
saturs = File.read('te.csv').gsub(/[\t\r\n\v\f]+/, ' ')

# Keep only spaces, ASCII letters, U+00C0..U+017F and the characters ! ? .
# Every run of anything else becomes a single space.
saturs.gsub!(/[^ a-zA-Z\u00C0-\u017F!?.]+/, ' ')

# Remove proper nouns in the middle of a sentence: a capitalised word that
# follows a letter plus spaces and is itself followed by a space.
saturs.gsub!(/(?<=[a-zA-Z\u00C0-\u017F])[ ]+[A-Z\p{Lu}]+[a-z\p{Ll}]*(?=[ ])/, '')

# Remove proper nouns at the end of a sentence: a capitalised word directly
# before a full stop, unless its leading space comes right after ". ".
saturs.gsub!(/(?<!\.\s)[ ][A-Z\p{Lu}]+[a-z\p{Ll}]*(?=\.)/, '')

# Lowercase, split after ! ? . when followed by a space, drop short fragments,
# then normalise each remaining sentence:
#   1. strip leading/trailing spaces,
#   2. collapse repeated spaces,
#   3. drop one trailing full stop (the last sentence keeps its "." after the split),
#   4. append ";" as an explicit sentence terminator.
arr = UnicodeUtils.downcase(saturs, :lv)
                  .split(/[!?.](?=[ ])/)
                  .select { |sentence| sentence.length > sentence_length }
                  .map do |sentence|
  cleaned = sentence.gsub(/\A +| +\z/, '').squeeze(' ').sub(/\.\z/, '')
  "#{cleaned};"
end

# Total number of words across all kept sentences.
words = arr.join.split(/[ ;]/).size

# Count how often each sentence occurs and keep those seen more than once,
# in order of first appearance.
occurrences = arr.each_with_object(Hash.new(0)) { |sentence, counts| counts[sentence] += 1 }
final = occurrences.select { |_sentence, count| count > 1 }.keys
puts final


duplicate_sentences = final.length
# Each duplicated sentence contributes its words once, however often it repeats.
words_in_duplicate_sentences = final.join.tr(';', ' ').split.size

# Disabled summary output. The colour helpers (.yellow/.red/.blue) are not
# defined in this script - presumably they come from a colouring gem; confirm
# before re-enabling.
# 35.times { print "*".yellow}
# puts "\n#{duplicate_sentences}".red+" teikumos atkārtojas ".blue+"#{words_in_duplicate_sentences}".red+" vārdi.".blue
# puts "Kopā tekstā ir #{words} vārdu.".blue
# puts "Unikāli tulkojamie vārdi tātad ir ".blue+"#{words-words_in_duplicate_sentences}".red+".".blue
	# encoding: Windows-1252

	# Reports sentences that occur more than once in te.csv.
	#
	# Pipeline: read the file, blank out everything except letters, spaces and
	# sentence punctuation, remove capitalised words in the middle or at the end
	# of a sentence, lowercase the text, split it into sentences, tidy each
	# sentence, then print every sentence that appears at least twice (each one
	# terminated by ";").

	require "unicode_utils/downcase"

	# Sentence fragments with this many characters or fewer are ignored.
	# NOTE(review): the length is measured on the raw fragment, before leading,
	# trailing and repeated spaces are removed.
	sentence_length = 20

	# Read the whole input and turn every run of tabs / line breaks into one space.
	saturs = File.read('te.csv').gsub(/[\t\r\n\v\f]+/, ' ')

	# Keep only spaces, ASCII letters, U+00C0..U+017F and the characters ! ? .
	# Every run of anything else becomes a single space.
	saturs.gsub!(/[^ a-zA-Z\u00C0-\u017F!?.]+/, ' ')

	# Remove proper nouns in the middle of a sentence: a capitalised word that
	# follows a letter plus spaces and is itself followed by a space.
	saturs.gsub!(/(?<=[a-zA-Z\u00C0-\u017F])[ ]+[A-Z\p{Lu}]+[a-z\p{Ll}]*(?=[ ])/, '')

	# Remove proper nouns at the end of a sentence: a capitalised word directly
	# before a full stop, unless its leading space comes right after ". ".
	saturs.gsub!(/(?<!\.\s)[ ][A-Z\p{Lu}]+[a-z\p{Ll}]*(?=\.)/, '')

	# Lowercase, split after ! ? . when followed by a space, drop short fragments,
	# then normalise each remaining sentence:
	#   1. strip leading/trailing spaces,
	#   2. collapse repeated spaces,
	#   3. drop one trailing full stop (the last sentence keeps its "." after the split),
	#   4. append ";" as an explicit sentence terminator.
	arr = UnicodeUtils.downcase(saturs, :lv)
	                  .split(/[!?.](?=[ ])/)
	                  .select { |sentence| sentence.length > sentence_length }
	                  .map do |sentence|
	  cleaned = sentence.gsub(/\A +| +\z/, '').squeeze(' ').sub(/\.\z/, '')
	  "#{cleaned};"
	end

	# Total number of words across all kept sentences.
	words = arr.join.split(/[ ;]/).size

	# Count how often each sentence occurs and keep those seen more than once,
	# in order of first appearance.
	occurrences = arr.each_with_object(Hash.new(0)) { |sentence, counts| counts[sentence] += 1 }
	final = occurrences.select { |_sentence, count| count > 1 }.keys
	puts final


	duplicate_sentences = final.length
	# Each duplicated sentence contributes its words once, however often it repeats.
	words_in_duplicate_sentences = final.join.tr(';', ' ').split.size

	# Disabled summary output. The colour helpers (.yellow/.red/.blue) are not
	# defined in this script - presumably they come from a colouring gem; confirm
	# before re-enabling.
	# 35.times { print "*".yellow}
	# puts "\n#{duplicate_sentences}".red+" teikumos atkārtojas ".blue+"#{words_in_duplicate_sentences}".red+" vārdi.".blue
	# puts "Kopā tekstā ir #{words} vārdu.".blue
	# puts "Unikāli tulkojamie vārdi tātad ir ".blue+"#{words-words_in_duplicate_sentences}".red+".".blue