Skip to content

Instantly share code, notes, and snippets.

@Glamdring
Last active July 24, 2017 21:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Glamdring/949602415fa229e674b7fc2006224e86 to your computer and use it in GitHub Desktop.
Save Glamdring/949602415fa229e674b7fc2006224e86 to your computer and use it in GitHub Desktop.
Анализ на предизборните пропагандни книги на Монитор/Телеграф
package test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.jooq.tools.StringUtils;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
public class BookAnalysis {
private static final Set<String> stopwords = new HashSet<>(Arrays.asList("това", "като", "през", "които",
"да се", "който", "след", "което", "която", "години", "година", "този", "тази", "тези", "един",
"може", "една", "едно", "беше", "вече", "както", "не", "само", "обаче", "част", "когато",
"за да", "преди", "него", "част", "е", "те", "чрез", "дори", "около", "че", "може", "сред",
"пред", "и", "били", "била", "също", "тогава", "заради", "г.", "той", "нито", "какво", "покъсно",
"бяха", "на", "се", "е", "била", "били", "така", "няма", "защото", "няколко", "бъде", "у",
"нас", "трябва", "между"));
private static final Set<String> parties = new HashSet<>(Arrays.asList("БСП", "ДПС", "ДСБ", "АБВ", "СДС", "ДБГ"));
public static void main(String[] args) throws Exception {
String text = FileUtils.readFileToString(new File("C:\\Users\\bozho\\Downloads\\monitor-book-1.txt"));
text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ").replace("- ", "")
.replace(",", "").replace(".", "").replace("“", "").replace("„", "").replace("”", "").replace("\"", "");
Multiset<String> ngrams = HashMultiset.create();
for (int n = 1; n <= 4; n++) {
for (String ngram : ngrams(n, text)) {
if ((ngram.length() > 3 || parties.contains(ngram.toUpperCase())) && !stopwords.contains(ngram)) {
ngrams.add(ngram);
}
}
}
Multiset<String> ordered = Multisets.copyHighestCountFirst(ngrams);
for (String ngram : ordered.elementSet()) {
int count = ordered.count(ngram);
if (count >= 3) {
System.out.println(ngram + "," + count);
}
}
}
public static List<String> ngrams(int n, String str) {
List<String> ngrams = new ArrayList<String>();
String[] words = str.split(" ");
for (int i = 0; i < words.length - n + 1; i++)
ngrams.add(concat(words, i, i + n));
return ngrams;
}
public static String concat(String[] words, int start, int end) {
StringBuilder sb = new StringBuilder();
for (int i = start; i < end; i++) {
String word = words[i].toLowerCase().trim();
if (stopwords.contains(word)) {
return "";
}
sb.append((i > start ? " " : "") + word);
}
return sb.toString();
}
public static void mainDownload(String[] args) throws Exception {
int book = 0;
String[] prefixes = new String[] {"http://www.monitor.bg/fakti/books/b3/b3_Page_", "http://www.monitor.bg/fakti/books/kniga_4/full_4-1_Page_"};
int[] lengths = new int[] {242, 273};
String[] rootdirs = new String[] {"c:/tmp/monitor/book1/", "c:/tmp/monitor/book2/"};
new File(rootdirs[book]).mkdirs();
for (int i = 1; i <= lengths[book]; i ++) {
try {
String url = prefixes[book] + StringUtils.leftPad(String.valueOf(i), 3, '0') + ".jpg";
IOUtils.copy(new URL(url).openStream(), new FileOutputStream(rootdirs[book] + i + ".jpg"));
} catch (IOException ex){
ex.printStackTrace();
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment