Created
September 15, 2008 20:12
-
-
Save aurelian/10929 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
- | |
- - o | |
- a | |
- m | |
- l | |
- i | |
- v | |
- s | |
- e | |
- c | |
- - al | |
- ai | |
- pe | |
- la | |
- "\xC3\xAEn" | |
- cu | |
- de | |
- ca | |
- eu | |
- tu | |
- el | |
- ea | |
- ei | |
- mi | |
- "m\xC4\x83" | |
- "\xC8\x9Bi" | |
- te | |
- "\xC3\xAEl" | |
- "\xC3\xAEi" | |
- ne | |
- ni | |
- "v\xC4\x83" | |
- vi | |
- le | |
- li | |
- se | |
- ta | |
- sa | |
- ce | |
- fi | |
- am | |
- au | |
- "a\xC8\x99" | |
- ar | |
- oi | |
- om | |
- or | |
- va | |
- "\xC8\x99i" | |
- ci | |
- "c\xC4\x83" | |
- nu | |
- ul | |
- ia | |
- "\xC3\x8En" | |
- "\xC8\x98i" | |
- in | |
- "s\xC4\x83" | |
- da | |
- si | |
- un | |
- "d\xC4\x83" | |
- ii | |
- an | |
- il | |
- ba | |
- - cel | |
- cea | |
- cei | |
- ale | |
- sub | |
- din | |
- mai | |
- cum | |
- noi | |
- voi | |
- ele | |
- mie | |
- "\xC3\xAEmi" | |
- "\xC8\x9Bie" | |
- "\xC3\xAE\xC8\x9Bi" | |
- lui | |
- lor | |
- "\xC3\xAE\xC8\x99i" | |
- sie | |
- meu | |
- mea | |
- mei | |
- "t\xC4\x83u" | |
- "t\xC4\x83i" | |
- "s\xC4\x83u" | |
- "s\xC4\x83i" | |
- "\xC4\x83la" | |
- "\xC4\x83ia" | |
- aia | |
- cui | |
- una | |
- alt | |
- "c\xC3\xA2t" | |
- tot | |
- unu | |
- doi | |
- opt | |
- era | |
- fiu | |
- fii | |
- fie | |
- fim | |
- "a\xC8\x9Bi" | |
- are | |
- "o\xC8\x9Bi" | |
- vei | |
- vom | |
- vor | |
- dar | |
- iar | |
- sau | |
- ori | |
- "a\xC8\x99a" | |
- pot | |
- dat | |
- pus | |
- ani | |
- mii | |
- lei | |
- i-a | |
- l-a | |
- isi | |
- "i\xC8\x99i" | |
- s-a | |
- loc | |
- asa | |
- azi | |
- cam | |
- "c\xC3\xAEt" | |
- fel | |
- mod | |
- - unui | |
- unei | |
- unor | |
- cele | |
- "f\xC4\x83r\xC4\x83" | |
- spre | |
- prin | |
- "p\xC3\xA2n\xC4\x83" | |
- "dup\xC4\x83" | |
- mult | |
- mine | |
- tine | |
- "nou\xC4\x83" | |
- "vou\xC4\x83" | |
- sine | |
- mele | |
- tale | |
- sale | |
- "\xC4\x83sta" | |
- asta | |
- alea | |
- acel | |
- acei | |
- acea | |
- cine | |
- care | |
- unde | |
- "c\xC3\xA2nd" | |
- unul | |
- unii | |
- alta | |
- "alt\xC4\x83" | |
- "al\xC8\x9Bi" | |
- alte | |
- vreo | |
- "c\xC3\xA2t\xC4\x83" | |
- "c\xC3\xA2\xC8\x9Bi" | |
- "c\xC3\xA2te" | |
- "at\xC3\xA2t" | |
- "to\xC8\x9Bi" | |
- ceva | |
- doua | |
- trei | |
- "\xC8\x99ase" | |
- noua | |
- zece | |
- sunt | |
- "e\xC8\x99ti" | |
- este | |
- eram | |
- erai | |
- erau | |
- "fi\xC8\x9Bi" | |
- fost | |
- avem | |
- avea | |
- "ve\xC8\x9Bi" | |
- nici | |
- "\xC3\xAEns\xC4\x83" | |
- deci | |
- "de\xC8\x99i" | |
- "dac\xC4\x83" | |
- ieri | |
- mare | |
- doar | |
- spus | |
- acum | |
- face | |
- avut | |
- bine | |
- "fa\xC8\x9B\xC4\x83" | |
- "\xC3\xAEnc\xC4\x83" | |
- nu-l | |
- anii | |
- zeci | |
- de-a | |
- fara | |
- "f\xC4\x83ra" | |
- "far\xC4\x83" | |
- le-a | |
- l-au | |
- abia | |
- pana | |
- "p\xC3\xA2na" | |
- "pan\xC4\x83" | |
- i-au | |
- s-au | |
- si-a | |
- "\xC8\x99i-a" | |
- luat | |
- "dou\xC4\x83" | |
- pare | |
- desi | |
- sint | |
- inca | |
- "inc\xC4\x83" | |
- cand | |
- sa-l | |
- "s\xC4\x83-l" | |
- aici | |
- atat | |
- deja | |
- dupa | |
- mica | |
- "mic\xC4\x83" | |
- "dat\xC4\x83" | |
- data | |
- apoi | |
- "at\xC3\xAEt" | |
- ceea | |
- "c\xC3\xAEnd" | |
- "c\xC3\xAEte" | |
- "c\xC3\xAE\xC8\x9Bi" | |
- daca | |
- "\xC3\xAEnca" | |
- "\xC3\xAEntr" | |
- l-am | |
- "p\xC3\xAEn\xC4\x83" | |
- plus | |
- prea | |
- s-ar | |
- "s\xC4\x83-i" | |
- "s\xC3\xAEnt" | |
- - "ni\xC8\x99te" | |
- celui | |
- celei | |
- celor | |
- "c\xC4\x83tre" | |
- "l\xC3\xA2ng\xC4\x83" | |
- peste | |
- "dec\xC3\xA2t" | |
- "mult\xC4\x83" | |
- "mul\xC8\x9Bi" | |
- multe | |
- "pu\xC8\x9Bin" | |
- "sie\xC8\x99i" | |
- "\xC4\x83\xC8\x99tia" | |
- astea | |
- acest | |
- acela | |
- "\xC4\x83luia" | |
- aceia | |
- "\xC4\x83lora" | |
- aceea | |
- "\xC4\x83leia" | |
- acele | |
- "c\xC4\x83rui" | |
- "c\xC4\x83rei" | |
- "c\xC4\x83ror" | |
- cuiva | |
- orice | |
- unele | |
- unuia | |
- uneia | |
- unora | |
- altul | |
- "al\xC8\x9Bii" | |
- altui | |
- altei | |
- altor | |
- vreun | |
- "c\xC3\xA2tor" | |
- "at\xC3\xA2ta" | |
- "at\xC3\xA2\xC8\x9Bi" | |
- "c\xC3\xA2tva" | |
- "toat\xC4\x83" | |
- toate | |
- totul | |
- nimic | |
- patru | |
- cinci | |
- "\xC8\x99apte" | |
- doime | |
- ambii | |
- prima | |
- "era\xC8\x9Bi" | |
- fiind | |
- "ave\xC8\x9Bi" | |
- aveam | |
- aveai | |
- aveau | |
- "\xC3\xAEnc\xC3\xA2t" | |
- poate | |
- putea | |
- chiar | |
- "f\xC4\x83cut" | |
- parte | |
- spune | |
- numai | |
- le-au | |
- "\xC8\x98i-au" | |
- "\xC8\x99i-au" | |
- "s\xC4\x83-\xC8\x99i" | |
- sa-si | |
- "s\xC4\x83-si" | |
- "sa-\xC8\x99i" | |
- "c\xC4\x83-\xC8\x99i" | |
- "ca-\xC8\x99i" | |
- ca-si | |
- "c\xC4\x83-si" | |
- langa | |
- "l\xC3\xA2nga" | |
- catre | |
- facem | |
- facut | |
- multi | |
- putin | |
- acolo | |
- altii | |
- "adic\xC4\x83" | |
- anume | |
- atare | |
- "at\xC3\xAEti" | |
- cumva | |
- "dec\xC3\xAEt" | |
- dintr | |
- "\xC3\xAEnc\xC3\xAEt" | |
- "\xC3\xAEntre" | |
- mereu | |
- "poat\xC4\x83" | |
- "s\xC4\x83-mi" | |
- "s\xC4\x83-\xC8\x9Bi" | |
- - despre | |
- pentru | |
- dintre | |
- "\xC3\xAEnspre" | |
- foarte | |
- "pu\xC8\x9Bin\xC4\x83" | |
- "pu\xC8\x9Bini" | |
- "pu\xC8\x9Bine" | |
- destul | |
- destui | |
- "\xC3\xAEnsumi" | |
- "\xC3\xAEns\xC4\x83mi" | |
- "\xC3\xAEnsu\xC8\x9Bi" | |
- "\xC3\xAEns\xC4\x83\xC8\x9Bi" | |
- "\xC3\xAEnsu\xC8\x99i" | |
- "\xC3\xAEns\xC4\x83\xC8\x99i" | |
- "\xC3\xAEn\xC8\x99ine" | |
- "\xC3\xAEnsene" | |
- "\xC3\xAEn\xC8\x99iv\xC4\x83" | |
- "\xC3\xAEnsev\xC4\x83" | |
- "\xC3\xAEn\xC8\x99i\xC8\x99i" | |
- "\xC3\xAEnse\xC8\x99i" | |
- "\xC3\xAEnsele" | |
- nostru | |
- "no\xC8\x99tri" | |
- vostru | |
- "vo\xC8\x99tri" | |
- acesta | |
- "\xC4\x83stuia" | |
- "\xC4\x83stora" | |
- "\xC4\x83steia" | |
- "ace\xC8\x99ti" | |
- aceste | |
- acelui | |
- acelea | |
- acelor | |
- acelei | |
- cineva | |
- oricui | |
- altele | |
- altuia | |
- alteia | |
- altora | |
- vreuna | |
- "c\xC3\xA2tora" | |
- "at\xC3\xA2\xC8\x9Bia" | |
- "at\xC3\xA2tea" | |
- "at\xC3\xA2tor" | |
- "oric\xC3\xA2t" | |
- "c\xC3\xA2\xC8\x9Biva" | |
- "c\xC3\xA2teva" | |
- cutare | |
- nimeni | |
- treime | |
- sutime | |
- ambele | |
- "\xC3\xAEndoit" | |
- "\xC3\xAEnt\xC3\xA2ia" | |
- primul | |
- primii | |
- primei | |
- suntem | |
- "avea\xC8\x9Bi" | |
- "a\xC8\x99adar" | |
- "totu\xC8\x99i" | |
- atunci | |
- astfel | |
- "exist\xC4\x83" | |
- asupra | |
- doilea | |
- ultima | |
- intr-o | |
- "\xC3\xAEntr-o" | |
- niciun | |
- nicuna | |
- "facu\xC8\x9Bi" | |
- facuti | |
- cativa | |
- "c\xC3\xA2tiva" | |
- "ca\xC8\x9Biva" | |
- putina | |
- "pu\xC8\x9Bina" | |
- "putin\xC4\x83" | |
- altfel | |
- "ast\xC4\x83zi" | |
- "at\xC3\xAE\xC8\x9Bia" | |
- "c\xC4\x83reia" | |
- "c\xC4\x83rora" | |
- "c\xC4\x83ruia" | |
- "c\xC3\xAEteva" | |
- "c\xC3\xAE\xC8\x9Biva" | |
- "\xC3\xAEnapoi" | |
- oarece | |
- oricum | |
- "s\xC3\xAEntem" | |
- tocmai | |
- uneori | |
- - printre | |
- "destul\xC4\x83" | |
- destule | |
- "noastr\xC4\x83" | |
- noastre | |
- "voastr\xC4\x83" | |
- voastre | |
- "ace\xC8\x99tia" | |
- aceasta | |
- acestea | |
- acestui | |
- acestor | |
- "aceast\xC4\x83" | |
- acestei | |
- acelora | |
- aceleia | |
- "acela\xC8\x99i" | |
- "aceia\xC8\x99i" | |
- "aceea\xC8\x99i" | |
- oricine | |
- vreunul | |
- vreunii | |
- vreunui | |
- vreunei | |
- vreunor | |
- oricare | |
- fiecare | |
- "at\xC3\xA2tora" | |
- "oric\xC3\xA2t\xC4\x83" | |
- "oric\xC3\xA2\xC8\x9Bi" | |
- "oric\xC3\xA2te" | |
- "c\xC3\xA2torva" | |
- tuturor | |
- altceva | |
- "nim\xC4\x83nui" | |
- "am\xC3\xA2ndoi" | |
- ambilor | |
- ambelor | |
- "\xC3\xAEntreit" | |
- "\xC3\xAEnsutit" | |
- "\xC3\xAEnt\xC3\xA2iul" | |
- primele | |
- "sunte\xC8\x9Bi" | |
- trebuie | |
- aproape | |
- miliard | |
- ultimul | |
- "\xC3\xAEntr-un" | |
- intr-un | |
- sunteti | |
- "al\xC4\x83turi" | |
- "\xC3\xAEnainte" | |
- oarecui | |
- "s\xC3\xAEnte\xC8\x9Bi" | |
- - acestuia | |
- acestora | |
- acesteia | |
- "acelea\xC8\x99i" | |
- "cel\xC4\x83lalt" | |
- "ceilal\xC8\x9Bi" | |
- "cealalt\xC4\x83" | |
- altcuiva | |
- vreunele | |
- vreunuia | |
- vreuneia | |
- vreunora | |
- "oric\xC4\x83rui" | |
- "oric\xC4\x83rei" | |
- "oric\xC4\x83ror" | |
- "fiec\xC4\x83rui" | |
- "fiec\xC4\x83rei" | |
- "oric\xC3\xA2tor" | |
- oarecare | |
- "am\xC3\xA2ndou\xC4\x83" | |
- primului | |
- primilor | |
- primelor | |
- niciunul | |
- milioane | |
- asemenea | |
- deasupra | |
- oarecine | |
- printr-o | |
- - "aceluia\xC8\x99i" | |
- "acelora\xC8\x99i" | |
- "aceleia\xC8\x99i" | |
- celuilalt | |
- celelalte | |
- altcineva | |
- "oric\xC4\x83ruia" | |
- "oric\xC4\x83reia" | |
- "oric\xC4\x83rora" | |
- "fiec\xC4\x83ruia" | |
- "fiec\xC4\x83reia" | |
- "oric\xC3\xA2tora" | |
- "am\xC3\xA2nduror" | |
- "\xC3\xAEmpotriva" | |
- niciodata | |
- - "celorlal\xC8\x9Bi" | |
- celeilalte | |
- celorlalte | |
- "am\xC3\xA2ndurora" | |
- | |
- | |
- - "dumneavoastr\xC4\x83" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'yaml' | |
# Strips stop words from +text+ and returns the remaining words, lower-cased
# and joined by single spaces.
#
# A word is a run of word characters that may contain inner hyphens or
# apostrophes (e.g. "s-a", "l-au"). POSIX bracket classes are used instead of
# \w because \w only matches ASCII in Ruby >= 1.9, which would break up or
# drop words containing diacritics.
#
# @param text [String] the text to filter
# @param stop_words [Array<Array<String>, nil>, nil] stop words grouped by
#   word length in characters (index 1 holds one-letter words, index 2
#   two-letter words, ...). When nil, the table is read from
#   'ro-stop_words5.yml' in the current directory. Callers filtering many
#   texts can load the table once and pass it in to avoid re-reading the file.
# @return [String] the surviving words separated by single spaces
#
# NOTE(review): the bundled YAML list spells diacritics as "\xNN" escapes;
# confirm that the YAML parser in use decodes them to the intended UTF-8 text.
def remove_stop_words(text, stop_words = nil)
  stop_words ||= YAML.load_file('ro-stop_words5.yml')
  words = text.downcase.scan(/[[:word:]](?:[[:word:]\-']*[[:word:]])?/)
  kept = words.reject do |word|
    # Only the bucket for this word's length needs to be searched.
    bucket = stop_words[word.length]
    bucket && bucket.include?(word)
  end
  kept.join(" ")
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Named HTML character references mapped to their Unicode code points:
# the Latin-1 range (U+00A0..U+00FF) plus the five markup-significant ones.
$htmlenc2unicode = {
  "&nbsp;"   => 160, # U+00A0
  "&iexcl;"  => 161, # U+00A1
  "&cent;"   => 162, # U+00A2
  "&pound;"  => 163, # U+00A3
  "&curren;" => 164, # U+00A4
  "&yen;"    => 165, # U+00A5
  "&brvbar;" => 166, # U+00A6
  "&sect;"   => 167, # U+00A7
  "&uml;"    => 168, # U+00A8
  "&copy;"   => 169, # U+00A9
  "&ordf;"   => 170, # U+00AA
  "&laquo;"  => 171, # U+00AB
  "&not;"    => 172, # U+00AC
  "&shy;"    => 173, # U+00AD
  "&reg;"    => 174, # U+00AE
  "&macr;"   => 175, # U+00AF
  "&deg;"    => 176, # U+00B0
  "&plusmn;" => 177, # U+00B1
  "&sup2;"   => 178, # U+00B2
  "&sup3;"   => 179, # U+00B3
  "&acute;"  => 180, # U+00B4
  "&micro;"  => 181, # U+00B5
  "&para;"   => 182, # U+00B6
  "&middot;" => 183, # U+00B7
  "&cedil;"  => 184, # U+00B8
  "&sup1;"   => 185, # U+00B9
  "&ordm;"   => 186, # U+00BA
  "&raquo;"  => 187, # U+00BB
  "&frac14;" => 188, # U+00BC
  "&frac12;" => 189, # U+00BD
  "&frac34;" => 190, # U+00BE
  "&iquest;" => 191, # U+00BF
  "&Agrave;" => 192, # U+00C0
  "&Aacute;" => 193, # U+00C1
  "&Acirc;"  => 194, # U+00C2
  "&Atilde;" => 195, # U+00C3
  "&Auml;"   => 196, # U+00C4
  "&Aring;"  => 197, # U+00C5
  "&AElig;"  => 198, # U+00C6
  "&Ccedil;" => 199, # U+00C7
  "&Egrave;" => 200, # U+00C8
  "&Eacute;" => 201, # U+00C9
  "&Ecirc;"  => 202, # U+00CA
  "&Euml;"   => 203, # U+00CB
  "&Igrave;" => 204, # U+00CC
  "&Iacute;" => 205, # U+00CD
  "&Icirc;"  => 206, # U+00CE
  "&Iuml;"   => 207, # U+00CF
  "&ETH;"    => 208, # U+00D0
  "&Ntilde;" => 209, # U+00D1
  "&Ograve;" => 210, # U+00D2
  "&Oacute;" => 211, # U+00D3
  "&Ocirc;"  => 212, # U+00D4
  "&Otilde;" => 213, # U+00D5
  "&Ouml;"   => 214, # U+00D6
  "&times;"  => 215, # U+00D7
  "&Oslash;" => 216, # U+00D8
  "&Ugrave;" => 217, # U+00D9
  "&Uacute;" => 218, # U+00DA
  "&Ucirc;"  => 219, # U+00DB
  "&Uuml;"   => 220, # U+00DC
  "&Yacute;" => 221, # U+00DD
  "&THORN;"  => 222, # U+00DE
  "&szlig;"  => 223, # U+00DF
  "&agrave;" => 224, # U+00E0
  "&aacute;" => 225, # U+00E1
  "&acirc;"  => 226, # U+00E2
  "&atilde;" => 227, # U+00E3
  "&auml;"   => 228, # U+00E4
  "&aring;"  => 229, # U+00E5
  "&aelig;"  => 230, # U+00E6
  "&ccedil;" => 231, # U+00E7
  "&egrave;" => 232, # U+00E8
  "&eacute;" => 233, # U+00E9
  "&ecirc;"  => 234, # U+00EA
  "&euml;"   => 235, # U+00EB
  "&igrave;" => 236, # U+00EC
  "&iacute;" => 237, # U+00ED
  "&icirc;"  => 238, # U+00EE
  "&iuml;"   => 239, # U+00EF
  "&eth;"    => 240, # U+00F0
  "&ntilde;" => 241, # U+00F1
  "&ograve;" => 242, # U+00F2
  "&oacute;" => 243, # U+00F3
  "&ocirc;"  => 244, # U+00F4
  "&otilde;" => 245, # U+00F5
  "&ouml;"   => 246, # U+00F6
  "&divide;" => 247, # U+00F7
  "&oslash;" => 248, # U+00F8
  "&ugrave;" => 249, # U+00F9
  "&uacute;" => 250, # U+00FA
  "&ucirc;"  => 251, # U+00FB
  "&uuml;"   => 252, # U+00FC
  "&yacute;" => 253, # U+00FD
  "&thorn;"  => 254, # U+00FE
  "&yuml;"   => 255, # U+00FF
  "&quot;"   => 34,  # U+0022
  "&amp;"    => 38,  # U+0026
  "&lt;"     => 60,  # U+003C
  "&gt;"     => 62,  # U+003E
  "&apos;"   => 39   # U+0027
}

# Code-point fixes applied after entity decoding (more may be added later,
# e.g. Windows-specific ones).
$post_process = {
  160 => 32,  # U+00A0 (no-break space)   => U+0020 (plain space)
  350 => 536, # U+015E (S with cedilla)   => U+0218 (S with comma below)
  351 => 537, # U+015F (s with cedilla)   => U+0219 (s with comma below)
  354 => 538, # U+0162 (T with cedilla)   => U+021A (T with comma below)
  355 => 539  # U+0163 (t with cedilla)   => U+021B (t with comma below)
}

# Decodes the named HTML entities listed in $htmlenc2unicode and then applies
# the $post_process code-point substitutions.
#
# The string is modified in place and also returned.
#
# @param str [String] text to decode; must be mutable (not frozen)
# @return [String] the same +str+ object, after decoding
def translate2utf8(str)
  # Decode every entity in one pass so that the output of one replacement is
  # never re-scanned: "&amp;lt;" must become "&lt;", not "<".
  entity_pattern = Regexp.union($htmlenc2unicode.keys)
  str.gsub!(entity_pattern) { |entity| [$htmlenc2unicode[entity]].pack("U") }
  $post_process.each { |from, to| str.gsub!([from].pack("U"), [to].pack("U")) }
  # gsub! returns nil when nothing changed, so return the string explicitly.
  str
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment