Skip to content

Instantly share code, notes, and snippets.

@maidis
Created February 11, 2019 12:31
Show Gist options
  • Save maidis/237fc5a791e9f3f2a9f2ebd40ab5c9fb to your computer and use it in GitHub Desktop.
Save maidis/237fc5a791e9f3f2a9f2ebd40ab5c9fb to your computer and use it in GitHub Desktop.
Nutuk Frekans Analizi
#include <iostream>
#include <algorithm>
#include <fstream>
#include <string>
#include <unordered_map>
#include <set>
#include <functional>
// Counts how often each whitespace-separated word occurs in "olanlar.txt"
// and prints the result to stdout as "word :: count", highest count first.
// Returns 1 (after reporting on stderr) when the input file cannot be opened.
int main()
{
    std::unordered_map<std::string, int> frekans;

    // The file is only read, so open it read-only; a read/write std::fstream
    // would refuse to open a file the user has no write permission for.
    std::ifstream text("olanlar.txt");
    if (!text)
    {
        std::cerr << "olanlar.txt could not be opened" << std::endl;
        return 1;
    }

    // Test the extraction itself: checking the stream *before* reading lets
    // the final, failed read through and counts a bogus empty word.
    std::string word;
    while (text >> word)
        ++frekans[word];

    // https://thispointer.com/how-to-sort-a-map-by-value-in-c/
    using Entry = std::pair<std::string, int>;

    // Orders entries by descending count. Entries with equal counts compare
    // equivalent, which is why a multiset (not a set) is required below.
    const auto compFunctor = [](const Entry& elem1, const Entry& elem2)
    {
        return elem1.second > elem2.second;
    };

    std::multiset<Entry, decltype(compFunctor)> setOfWords(
        frekans.begin(), frekans.end(), compFunctor);

    // '\n' instead of std::endl: no need to flush after every line.
    for (const Entry& element : setOfWords)
        std::cout << element.first << " :: " << element.second << '\n';
}
bey :: 2359
paşa :: 2278
millet :: 1751
milliye :: 1157
hükümet :: 1090
efendi :: 1043
istanbul :: 918
meclis :: 914
kendi :: 831
karşı :: 701
suret :: 677
vaziyet :: 647
devlet :: 632
ordu :: 629
telgraf :: 624
ben :: 619
sivas :: 618
bütün :: 618
ali :: 614
hareket :: 605
tarih :: 601
memleket :: 566
kumandan :: 563
kuvvet :: 560
kemal :: 558
arz :: 552
fırka :: 539
kabul :: 539
büyük :: 530
kongre :: 524
kumanda :: 520
ankara :: 513
hazret :: 507
cemiyet :: 506
zaman :: 493
mustafa :: 493
cevap :: 492
kolordu :: 415
şifre :: 413
vatan :: 406
vekil :: 390
hak :: 390
heyet :: 389
rauf :: 386
idare :: 380
umumi :: 380
nazar :: 379
evvel :: 372
vesika :: 365
husus :: 364
mesele :: 363
cephe :: 359
taraf :: 359
çalış :: 359
maksat :: 357
hukuk :: 355
ingiliz :: 353
reis :: 348
rica :: 346
vali :: 346
vazife :: 341
lüzum :: 340
erzurum :: 335
düşman :: 334
ara :: 334
nazır :: 331
namına :: 326
yalnız :: 326
teşkil :: 317
gönder :: 316
işgal :: 312
baş :: 310
emir :: 309
milli :: 309
anadol :: 307
kabine :: 306
aynı :: 306
nokta :: 305
teklif :: 305
teşkilat :: 304
mütalaa :: 289
intihap :: 284
mühim :: 280
mebus :: 280
icap :: 274
temsili :: 271
riyaset :: 271
söz :: 269
zevat :: 263
esas :: 260
netice :: 260
fikir :: 255
dikkat :: 251
türk :: 246
ismet :: 241
talep :: 240
takip :: 239
karar :: 238
devam :: 237
yeni :: 235
umumiye :: 233
cemal :: 229
vilayet :: 229
tayin :: 228
ferit :: 223
doğru :: 223
murahhas :: 216
hilafet :: 212
cumhuriyet :: 211
taarruz :: 211
itilaf :: 208
tatbik :: 207
dahiliye :: 204
temas :: 202
havali :: 202
arzu :: 202
istiklal :: 202
ilan :: 201
telakki :: 200
sebep :: 199
tamamen :: 199
efendim :: 199
türki :: 198
trakya :: 197
arkadaş :: 195
itimat :: 195
package zemberek.examples.morphology;
import zemberek.core.logging.Log;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.SingleAnalysis;
import zemberek.morphology.analysis.WordAnalysis;
import java.util.Scanner;
import java.io.File;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.FileNotFoundException;
/**
 * Reads {@code text.txt} token by token, strips trailing punctuation and runs
 * every token longer than two characters through Zemberek's morphological
 * analyzer.
 *
 * <p>The first lemma of the first analysis is appended to {@code olanlar.txt};
 * tokens for which no analysis/lemma is available are printed to stdout and
 * appended to {@code olmayanlar.txt}.
 */
public class StemmingAndLemmatization {

  /** Directory that holds the input text and receives both output files. */
  private static final String BASE_DIR =
      "/home/maidis/İndirilenler/zemberek-nlp-master/examples/src/main/java/zemberek/examples/morphology/";

  public static void main(String[] args) {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

    // try-with-resources closes the scanner and both writers on every path,
    // including when an exception escapes the loop. The input is opened first
    // so that a missing text.txt is reported before the output files are
    // created/truncated (and instead of failing later with a
    // NullPointerException on an unassigned scanner).
    try (Scanner lines = new Scanner(new File(BASE_DIR + "text.txt"));
         BufferedWriter writer = new BufferedWriter(new FileWriter(BASE_DIR + "olmayanlar.txt"));
         BufferedWriter writer2 = new BufferedWriter(new FileWriter(BASE_DIR + "olanlar.txt"))) {

      while (lines.hasNextLine()) {
        try (Scanner tokens = new Scanner(lines.nextLine())) {
          while (tokens.hasNext()) {
            // Drop punctuation (and surrounding whitespace) at the end of the token.
            String s = tokens.next().replaceAll("\\s*\\p{Punct}+\\s*$", "");

            // Tokens of one or two characters are skipped entirely.
            if (s.length() > 2) {
              try {
                WordAnalysis results = morphology.analyze(s);
                SingleAnalysis result = results.getAnalysisResults().get(0);
                String lemma = result.getLemmas().get(0).toString();
                Log.info(lemma);
                writer2.write(lemma + '\n');
              } catch (IndexOutOfBoundsException e) {
                // get(0) found nothing: the analyzer produced no analysis
                // (or no lemma) for this token, so record it as unresolved.
                System.out.println(s);
                writer.write(s + '\n');
              }
            }
          }
        }
      }
    } catch (java.io.IOException ioe) {
      // Covers a missing input file (FileNotFoundException is an IOException)
      // as well as read/write failures; report it instead of swallowing it.
      ioe.printStackTrace();
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment