Last active
August 29, 2015 14:07
-
-
Save ahmetaa/e56556482685dd423613 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
import java.nio.charset.StandardCharsets; | |
import java.nio.file.Files; | |
import java.util.HashMap; | |
import java.util.Locale; | |
import java.util.Map; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Collects unigram and bigram counts from files or directories.
 * It finds the matching single and two word expressions and their counts.
 * For example it can detect that "ya da" and "yada" both occur in a corpus
 * and report both spellings with their frequencies.
 */
public class MultiWordExpressionFinder {

    // contains bigrams and their counts. key of the map holds the combined bigram words.
    // for example for "ya da" bigram, key is "yada"
    Map<String, BigramCount> bigramCountMap = new HashMap<>();

    // unigrams and their counts.
    Map<String, Integer> unigramCounts = new HashMap<>();

    // Captures the text between <field name="TEXT"> ... </field>. DOTALL lets
    // the content span multiple lines.
    private static final Pattern CONTENT_PATTERN = Pattern.compile(
            "(?:<field name=\"TEXT\">)(.+?)(?:</field>)",
            Pattern.DOTALL | Pattern.MULTILINE);

    /**
     * Returns the text content from the xml file. It uses a regular expression to retrieve the content
     * from between "<field name="TEXT">....</field>" tags.
     *
     * @param xmlFile UTF-8 encoded xml file to read.
     * @return the captured content, or an empty string if the tag is not found.
     * @throws IOException if the file cannot be read.
     */
    private String getContentFromXml(File xmlFile) throws IOException {
        try (BufferedReader reader = Files.newBufferedReader(xmlFile.toPath(), StandardCharsets.UTF_8)) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep the line break: without it, words at line boundaries would be
                // glued together. The pattern uses DOTALL so '.' still matches '\n'.
                sb.append(line).append('\n');
            }
            Matcher m = CONTENT_PATTERN.matcher(sb.toString());
            return m.find() ? m.group(1) : "";
        }
    }

    /**
     * Adds unigrams, bigrams and their counts from the content of xml files from the dir.
     *
     * @param dir directory containing ".xml" files; other files are ignored.
     * @throws IllegalArgumentException if dir is not a directory or cannot be listed.
     * @throws IOException if a file cannot be read.
     */
    public void addAllFileContentFromDirectory(File dir) throws IOException {
        if (!dir.isDirectory()) {
            throw new IllegalArgumentException(dir + " must be a directory.");
        }
        File[] files = dir.listFiles();
        if (files == null) {
            throw new IllegalArgumentException(" Error while listing " + dir);
        }
        for (File file : files) {
            if (file.getName().endsWith(".xml")) {
                addContent(getContentFromXml(file));
            }
        }
    }

    /** Holds a bigram ("word1 word2") together with its occurrence count. */
    static class BigramCount {
        final String bigramStr; // holds a string as "word1 word2"
        int count = 0;

        BigramCount(String w1, String w2) {
            this.bigramStr = w1 + " " + w2;
        }

        void increment() {
            count++;
        }
    }

    static final Locale TURKISH_LOCALE = new Locale("tr");

    /**
     * Cleans and applies primitive tokenization to the input text.
     * it splits it to tokens from spaces and sentence boundaries.
     * Then it puts the unigram and bigram content to count maps.
     * However bigram map keys are actually connected bigram words.
     * Such as for bigram `ya da`, map key is `yada`
     *
     * @param text raw text to tokenize and count.
     */
    public void addContent(String text) {
        // to lower case and replace all characters with " # " except turkish alphabet and sentence boundary characters.
        text = text.toLowerCase(TURKISH_LOCALE).replaceAll("[^a-zçğıöşü.!?: ]", " # ");
        // primitive tokenization
        String[] tokens = text.split("[?:.! ]+");
        for (int i = 0; i < tokens.length; i++) {
            addUnigram(tokens[i]);
            if (i < tokens.length - 1) {
                addBigram(tokens[i], tokens[i + 1]);
            }
        }
    }

    /** Increments the count of a single word; empty tokens are skipped. */
    private void addUnigram(String w) {
        if (w.length() == 0) {
            return;
        }
        unigramCounts.merge(w, 1, Integer::sum);
    }

    /**
     * Increments the count of the (w1, w2) bigram. The map key is the words
     * written together ("yada" for "ya da"); empty tokens are skipped.
     */
    private void addBigram(String w1, String w2) {
        if (w1.length() == 0 || w2.length() == 0) {
            return;
        }
        String connected = w1 + w2;
        // BUG FIX: the original called bigramCountMap.put(bigramCount) with a single
        // argument, which does not compile. computeIfAbsent both fixes it and
        // removes the explicit null check.
        bigramCountMap.computeIfAbsent(connected, k -> new BigramCount(w1, w2)).increment();
    }

    /**
     * Writes to the file every unigram that also occurs as a connected bigram,
     * in the form: "yada -> 12 , ya da -> 34".
     *
     * @param file output file, written as UTF-8.
     * @throws IOException if the file cannot be written.
     */
    public void exportPairs(File file) throws IOException {
        try (PrintWriter pw = new PrintWriter(file, "UTF-8")) {
            for (Map.Entry<String, Integer> entry : unigramCounts.entrySet()) {
                String unigram = entry.getKey();
                BigramCount bigramCount = bigramCountMap.get(unigram);
                if (bigramCount != null) {
                    pw.println(unigram + " -> " + entry.getValue() + " , " +
                            bigramCount.bigramStr + " -> " + bigramCount.count);
                }
            }
        }
    }

    /**
     * Collects counts from a corpus directory and exports the matching pairs.
     * Optional arguments generalize the hard-coded paths of the original:
     * args[0] = input directory (default "htbig"), args[1] = output file
     * (default "result.txt").
     */
    public static void main(String[] args) throws IOException {
        long start = System.currentTimeMillis();
        String inputDir = args.length > 0 ? args[0] : "htbig";
        String outputFile = args.length > 1 ? args[1] : "result.txt";
        MultiWordExpressionFinder finder = new MultiWordExpressionFinder();
        finder.addAllFileContentFromDirectory(new File(inputDir));
        finder.exportPairs(new File(outputFile));
        System.out.println("Elapsed : " + ((double) System.currentTimeMillis() - start) / 1000d + " seconds.");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment