Анализ на предизборните пропагандни книги на Монитор/Телеграф
package test; | |
import java.io.File; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.net.URL; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.HashSet; | |
import java.util.List; | |
import java.util.Set; | |
import org.apache.commons.io.FileUtils; | |
import org.apache.commons.io.IOUtils; | |
import org.jooq.tools.StringUtils; | |
import com.google.common.collect.HashMultiset; | |
import com.google.common.collect.Multiset; | |
import com.google.common.collect.Multisets; | |
public class BookAnalysis { | |
private static final Set<String> stopwords = new HashSet<>(Arrays.asList("това", "като", "през", "които", | |
"да се", "който", "след", "което", "която", "години", "година", "този", "тази", "тези", "един", | |
"може", "една", "едно", "беше", "вече", "както", "не", "само", "обаче", "част", "когато", | |
"за да", "преди", "него", "част", "е", "те", "чрез", "дори", "около", "че", "може", "сред", | |
"пред", "и", "били", "била", "също", "тогава", "заради", "г.", "той", "нито", "какво", "покъсно", | |
"бяха", "на", "се", "е", "била", "били", "така", "няма", "защото", "няколко", "бъде", "у", | |
"нас", "трябва", "между")); | |
private static final Set<String> parties = new HashSet<>(Arrays.asList("БСП", "ДПС", "ДСБ", "АБВ", "СДС", "ДБГ")); | |
public static void main(String[] args) throws Exception { | |
String text = FileUtils.readFileToString(new File("C:\\Users\\bozho\\Downloads\\monitor-book-1.txt")); | |
text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ").replace("- ", "") | |
.replace(",", "").replace(".", "").replace("“", "").replace("„", "").replace("”", "").replace("\"", ""); | |
Multiset<String> ngrams = HashMultiset.create(); | |
for (int n = 1; n <= 4; n++) { | |
for (String ngram : ngrams(n, text)) { | |
if ((ngram.length() > 3 || parties.contains(ngram.toUpperCase())) && !stopwords.contains(ngram)) { | |
ngrams.add(ngram); | |
} | |
} | |
} | |
Multiset<String> ordered = Multisets.copyHighestCountFirst(ngrams); | |
for (String ngram : ordered.elementSet()) { | |
int count = ordered.count(ngram); | |
if (count >= 3) { | |
System.out.println(ngram + "," + count); | |
} | |
} | |
} | |
public static List<String> ngrams(int n, String str) { | |
List<String> ngrams = new ArrayList<String>(); | |
String[] words = str.split(" "); | |
for (int i = 0; i < words.length - n + 1; i++) | |
ngrams.add(concat(words, i, i + n)); | |
return ngrams; | |
} | |
public static String concat(String[] words, int start, int end) { | |
StringBuilder sb = new StringBuilder(); | |
for (int i = start; i < end; i++) { | |
String word = words[i].toLowerCase().trim(); | |
if (stopwords.contains(word)) { | |
return ""; | |
} | |
sb.append((i > start ? " " : "") + word); | |
} | |
return sb.toString(); | |
} | |
public static void mainDownload(String[] args) throws Exception { | |
int book = 0; | |
String[] prefixes = new String[] {"http://www.monitor.bg/fakti/books/b3/b3_Page_", "http://www.monitor.bg/fakti/books/kniga_4/full_4-1_Page_"}; | |
int[] lengths = new int[] {242, 273}; | |
String[] rootdirs = new String[] {"c:/tmp/monitor/book1/", "c:/tmp/monitor/book2/"}; | |
new File(rootdirs[book]).mkdirs(); | |
for (int i = 1; i <= lengths[book]; i ++) { | |
try { | |
String url = prefixes[book] + StringUtils.leftPad(String.valueOf(i), 3, '0') + ".jpg"; | |
IOUtils.copy(new URL(url).openStream(), new FileOutputStream(rootdirs[book] + i + ".jpg")); | |
} catch (IOException ex){ | |
ex.printStackTrace(); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment