Skip to content

Instantly share code, notes, and snippets.

@ahmetaa
Last active August 29, 2015 14:07
Show Gist options
  • Save ahmetaa/e56556482685dd423613 to your computer and use it in GitHub Desktop.
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Collects unigram and bigram counts from files or directories.
* It finds the matching single and two word expressions and their counts.
*/
public class MultiWordExpressionFinder {

    // Bigrams and their counts. The map key is the two bigram words
    // concatenated: for the bigram "ya da" the key is "yada".
    Map<String, BigramCount> bigramCountMap = new HashMap<>();

    // Unigrams and their counts.
    Map<String, Integer> unigramCounts = new HashMap<>();

    // Extracts the text between <field name="TEXT"> ... </field> tags.
    // Compiled once; DOTALL lets '.' match newlines inside the content.
    private static final Pattern CONTENT_PATTERN = Pattern.compile(
            "(?:<field name=\"TEXT\">)(.+?)(?:</field>)",
            Pattern.DOTALL | Pattern.MULTILINE);

    /**
     * Returns the text content of the xml file. It uses a regular expression to
     * retrieve the content from between {@code <field name="TEXT">....</field>} tags.
     *
     * @param xmlFile UTF-8 encoded xml file to read.
     * @return content of the first TEXT field, or an empty string if no match is found.
     * @throws IOException if the file cannot be read.
     */
    private String getContentFromXml(File xmlFile) throws IOException {
        try (BufferedReader reader = Files.newBufferedReader(xmlFile.toPath(), StandardCharsets.UTF_8)) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            Matcher m = CONTENT_PATTERN.matcher(sb.toString());
            return m.find() ? m.group(1) : "";
        }
    }

    /**
     * Adds unigrams, bigrams and their counts from the content of all xml files in {@code dir}.
     *
     * @param dir directory to scan; only files ending with ".xml" are processed.
     * @throws IllegalArgumentException if {@code dir} is not a directory or cannot be listed.
     * @throws IOException if a file cannot be read.
     */
    public void addAllFileContentFromDirectory(File dir) throws IOException {
        if (!dir.isDirectory()) {
            throw new IllegalArgumentException(dir + " must be a directory.");
        }
        File[] files = dir.listFiles();
        if (files == null) {
            throw new IllegalArgumentException(" Error while listing " + dir);
        }
        for (File file : files) {
            if (file.getName().endsWith(".xml")) {
                addContent(getContentFromXml(file));
            }
        }
    }

    /** Holds a bigram ("word1 word2") together with its occurrence count. */
    static class BigramCount {
        final String bigramStr; // holds a string as "word1 word2"
        int count = 0;

        BigramCount(String w1, String w2) {
            this.bigramStr = w1 + " " + w2;
        }

        void increment() {
            count++;
        }
    }

    static final Locale TURKISH_LOCALE = new Locale("tr");

    /**
     * Cleans and applies primitive tokenization to the input text.
     * It splits the text into tokens at spaces and sentence boundary characters,
     * then records the unigram and bigram counts in the count maps.
     * Bigram map keys are the bigram words connected without a space:
     * for the bigram {@code ya da}, the map key is {@code yada}.
     *
     * @param text raw text to process.
     */
    public void addContent(String text) {
        // Lower-case with Turkish rules, then replace every character that is not
        // in the Turkish alphabet or a sentence boundary with " # " so that such
        // characters break word adjacency (the '#' token is never empty-matched
        // as a unigram because it contains no boundary character itself, but it
        // prevents false bigrams across punctuation).
        text = text.toLowerCase(TURKISH_LOCALE).replaceAll("[^a-zçğıöşü.!?: ]", " # ");
        // Primitive tokenization on spaces and sentence boundaries.
        String[] tokens = text.split("[?:.! ]+");
        for (int i = 0; i < tokens.length; i++) {
            addUnigram(tokens[i]);
            if (i < tokens.length - 1) {
                addBigram(tokens[i], tokens[i + 1]);
            }
        }
    }

    /** Increments the count of unigram {@code w}; empty tokens are ignored. */
    private void addUnigram(String w) {
        if (w.isEmpty()) {
            return;
        }
        unigramCounts.merge(w, 1, Integer::sum);
    }

    /**
     * Increments the count of the bigram {@code (w1, w2)}; empty tokens are ignored.
     * The map key is the concatenation {@code w1 + w2}.
     */
    private void addBigram(String w1, String w2) {
        if (w1.isEmpty() || w2.isEmpty()) {
            return;
        }
        // BUG FIX: the original called the non-existent one-argument
        // Map.put(bigramCount); the key must be supplied as well.
        String connected = w1 + w2;
        bigramCountMap.computeIfAbsent(connected, k -> new BigramCount(w1, w2)).increment();
    }

    /**
     * Writes every unigram that also occurs as a connected bigram key
     * (e.g. unigram "yada" vs bigram "ya da") with both counts to {@code file}.
     *
     * @param file output file, written as UTF-8.
     * @throws IOException if the file cannot be created or written.
     */
    public void exportPairs(File file) throws IOException {
        try (PrintWriter pw = new PrintWriter(file, "UTF-8")) {
            for (Map.Entry<String, Integer> entry : unigramCounts.entrySet()) {
                BigramCount bigramCount = bigramCountMap.get(entry.getKey());
                if (bigramCount != null) {
                    pw.println(entry.getKey() + " -> " + entry.getValue() + " , " +
                            bigramCount.bigramStr + " -> " + bigramCount.count);
                }
            }
        }
    }

    /** Processes all xml files under "htbig" and writes matching pairs to result.txt. */
    public static void main(String[] args) throws IOException {
        long start = System.currentTimeMillis();
        MultiWordExpressionFinder finder = new MultiWordExpressionFinder();
        finder.addAllFileContentFromDirectory(new File("htbig"));
        finder.exportPairs(new File("result.txt"));
        System.out.println("Elapsed : " + ((double) System.currentTimeMillis() - start) / 1000d + " seconds.");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment