Created
May 9, 2014 12:14
-
-
Save Epigene/cda2a7b148012e0e06d6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: Windows-1252
require "unicode_utils/downcase"

# Reports sentences that occur more than once in a text file and counts how
# many words those repetitions account for.
#
# Usage: ruby <this script> [path]    (path defaults to "te.csv")
#
# Each duplicated sentence is printed once, in its normalized form
# (lower-cased, non-letters and capitalized words removed) followed by ";".

sentence_length = 20 # Change this to suit how short a sentence is useless.
input_path = ARGV.first || 'te.csv'

# Read the whole text; every run of tabs/newlines/form feeds becomes one space.
saturs = File.read(input_path).gsub(/[\t\r\n\v\f]+/, ' ')

# Keep only spaces, letters (ASCII and U+00C0..U+017F) and the terminators
# "!", "?" and "."; any other run of characters becomes one space.
saturs.gsub!(/[^ a-zA-Z\u00C0-\u017F\!\?\.]+/, ' ')

# Remove capitalized words (treated as proper nouns) in the middle of a
# sentence: preceded by a letter plus spaces and followed by a space.
saturs.gsub!(/(?<=[a-zA-Z\u00C0-\u017F])[ ]+[A-Z\p{Lu}]+[a-z\p{Ll}]*(?=[ ])/, '')

# Remove capitalized words that stand directly before a period (sentence end).
saturs.gsub!(/(?<!\.\s)[ ][A-Z\p{Lu}]+[a-z\p{Ll}]*(?=\.)/, '')

# Lower-case the text (passing :lv as the language id) and split it after
# "!", "?" or "." when a space follows. Each fragment is then tidied: a
# leftover trailing period is dropped first, so that the space in "foo ." is
# trimmed as well, then space runs are collapsed and the edges trimmed.
fragments = UnicodeUtils.downcase(saturs, :lv).split(/[!?.](?=[ ])/)
cleaned = fragments.map do |fragment|
  fragment.sub(/\.\z/, '').squeeze(' ').gsub(/\A +| +\z/, '')
end

# The length filter runs on the cleaned text, so padding spaces can no longer
# push an empty or tiny fragment over the threshold.
arr = cleaned.select { |teikums| teikums.length > sentence_length }

# Total number of words in all kept sentences.
words = arr.inject(0) { |total, teikums| total + teikums.split(' ').size }

# Occurrence count per distinct sentence; keep those seen more than once.
occurrences = arr.each_with_object(Hash.new(0)) { |teikums, counts| counts[teikums] += 1 }
repeated = occurrences.select { |_teikums, count| count > 1 }
final = repeated.keys

# One line per duplicated sentence, terminated with ";".
puts final.map { |teikums| "#{teikums};" }

duplicate_sentences = final.length

# Words contributed by the repeats only: every occurrence beyond the first.
# This makes "words - words_in_duplicate_sentences" the number of words in
# distinct sentences even when a sentence occurs three or more times.
words_in_duplicate_sentences = repeated.inject(0) do |total, (teikums, count)|
  total + teikums.split(' ').size * (count - 1)
end

# Optional colored summary, left disabled: the .yellow/.red/.blue helpers are
# not provided by anything this script requires. The lines report the number
# of duplicated sentences and their repeated words, the total word count, and
# the unique words left to translate (words - words_in_duplicate_sentences).
# 35.times { print "*".yellow}
# puts "\n#{duplicate_sentences}".red+" teikumos atkārtojas ".blue+"#{words_in_duplicate_sentences}".red+" vārdi.".blue
# puts "Kopā tekstā ir #{words} vārdu.".blue
# puts "Unikāli tulkojamie vārdi tātad ir ".blue+"#{words-words_in_duplicate_sentences}".red+".".blue
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment