Last active
July 24, 2017 21:52
-
-
Save Glamdring/949602415fa229e674b7fc2006224e86 to your computer and use it in GitHub Desktop.
Анализ на предизборните пропагандни книги на Монитор/Телеграф
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package test; | |
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.jooq.tools.StringUtils;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
public class BookAnalysis { | |
private static final Set<String> stopwords = new HashSet<>(Arrays.asList("това", "като", "през", "които", | |
"да се", "който", "след", "което", "която", "години", "година", "този", "тази", "тези", "един", | |
"може", "една", "едно", "беше", "вече", "както", "не", "само", "обаче", "част", "когато", | |
"за да", "преди", "него", "част", "е", "те", "чрез", "дори", "около", "че", "може", "сред", | |
"пред", "и", "били", "била", "също", "тогава", "заради", "г.", "той", "нито", "какво", "покъсно", | |
"бяха", "на", "се", "е", "била", "били", "така", "няма", "защото", "няколко", "бъде", "у", | |
"нас", "трябва", "между")); | |
private static final Set<String> parties = new HashSet<>(Arrays.asList("БСП", "ДПС", "ДСБ", "АБВ", "СДС", "ДБГ")); | |
public static void main(String[] args) throws Exception { | |
String text = FileUtils.readFileToString(new File("C:\\Users\\bozho\\Downloads\\monitor-book-1.txt")); | |
text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ").replace("- ", "") | |
.replace(",", "").replace(".", "").replace("“", "").replace("„", "").replace("”", "").replace("\"", ""); | |
Multiset<String> ngrams = HashMultiset.create(); | |
for (int n = 1; n <= 4; n++) { | |
for (String ngram : ngrams(n, text)) { | |
if ((ngram.length() > 3 || parties.contains(ngram.toUpperCase())) && !stopwords.contains(ngram)) { | |
ngrams.add(ngram); | |
} | |
} | |
} | |
Multiset<String> ordered = Multisets.copyHighestCountFirst(ngrams); | |
for (String ngram : ordered.elementSet()) { | |
int count = ordered.count(ngram); | |
if (count >= 3) { | |
System.out.println(ngram + "," + count); | |
} | |
} | |
} | |
public static List<String> ngrams(int n, String str) { | |
List<String> ngrams = new ArrayList<String>(); | |
String[] words = str.split(" "); | |
for (int i = 0; i < words.length - n + 1; i++) | |
ngrams.add(concat(words, i, i + n)); | |
return ngrams; | |
} | |
public static String concat(String[] words, int start, int end) { | |
StringBuilder sb = new StringBuilder(); | |
for (int i = start; i < end; i++) { | |
String word = words[i].toLowerCase().trim(); | |
if (stopwords.contains(word)) { | |
return ""; | |
} | |
sb.append((i > start ? " " : "") + word); | |
} | |
return sb.toString(); | |
} | |
public static void mainDownload(String[] args) throws Exception { | |
int book = 0; | |
String[] prefixes = new String[] {"http://www.monitor.bg/fakti/books/b3/b3_Page_", "http://www.monitor.bg/fakti/books/kniga_4/full_4-1_Page_"}; | |
int[] lengths = new int[] {242, 273}; | |
String[] rootdirs = new String[] {"c:/tmp/monitor/book1/", "c:/tmp/monitor/book2/"}; | |
new File(rootdirs[book]).mkdirs(); | |
for (int i = 1; i <= lengths[book]; i ++) { | |
try { | |
String url = prefixes[book] + StringUtils.leftPad(String.valueOf(i), 3, '0') + ".jpg"; | |
IOUtils.copy(new URL(url).openStream(), new FileOutputStream(rootdirs[book] + i + ".jpg")); | |
} catch (IOException ex){ | |
ex.printStackTrace(); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment