Last active
August 29, 2015 14:07
-
-
Save ahmetaa/e56556482685dd423613 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
import java.nio.charset.StandardCharsets; | |
import java.nio.file.Files; | |
import java.util.HashMap; | |
import java.util.Locale; | |
import java.util.Map; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Collects unigram and bigram counts from files or directories.
 * It finds the matching single and two word expressions and their counts.
 * For example it can detect that "ya da" and "yada" both occur in a corpus
 * and report both spellings with their frequencies.
 */
public class MultiWordExpressionFinder {

    // contains bigrams and their counts. key of the map holds the combined bigram words.
    // for example for "ya da" bigram, key is "yada"
    Map<String, BigramCount> bigramCountMap = new HashMap<>();

    // unigrams and their counts.
    Map<String, Integer> unigramCounts = new HashMap<>();

    // Captures the text between <field name="TEXT"> ... </field>. DOTALL lets
    // the content span multiple lines.
    private static final Pattern CONTENT_PATTERN = Pattern.compile(
            "(?:<field name=\"TEXT\">)(.+?)(?:</field>)",
            Pattern.DOTALL | Pattern.MULTILINE);

    /**
     * Returns the text content from the xml file. It uses a regular expression to retrieve the content
     * from between "<field name="TEXT">....</field>" tags.
     *
     * @param xmlFile UTF-8 encoded xml file to read.
     * @return the captured content, or an empty string if the tag is not found.
     * @throws IOException if the file cannot be read.
     */
    private String getContentFromXml(File xmlFile) throws IOException {
        try (BufferedReader reader = Files.newBufferedReader(xmlFile.toPath(), StandardCharsets.UTF_8)) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep the line break: without it, words at line boundaries would be
                // glued together. The pattern uses DOTALL so '.' still matches '\n'.
                sb.append(line).append('\n');
            }
            Matcher m = CONTENT_PATTERN.matcher(sb.toString());
            return m.find() ? m.group(1) : "";
        }
    }

    /**
     * Adds unigrams, bigrams and their counts from the content of xml files from the dir.
     *
     * @param dir directory containing ".xml" files; other files are ignored.
     * @throws IllegalArgumentException if dir is not a directory or cannot be listed.
     * @throws IOException if a file cannot be read.
     */
    public void addAllFileContentFromDirectory(File dir) throws IOException {
        if (!dir.isDirectory()) {
            throw new IllegalArgumentException(dir + " must be a directory.");
        }
        File[] files = dir.listFiles();
        if (files == null) {
            throw new IllegalArgumentException(" Error while listing " + dir);
        }
        for (File file : files) {
            if (file.getName().endsWith(".xml")) {
                addContent(getContentFromXml(file));
            }
        }
    }

    /** Holds a bigram ("word1 word2") together with its occurrence count. */
    static class BigramCount {
        final String bigramStr; // holds a string as "word1 word2"
        int count = 0;

        BigramCount(String w1, String w2) {
            this.bigramStr = w1 + " " + w2;
        }

        void increment() {
            count++;
        }
    }

    static final Locale TURKISH_LOCALE = new Locale("tr");

    /**
     * Cleans and applies primitive tokenization to the input text.
     * it splits it to tokens from spaces and sentence boundaries.
     * Then it puts the unigram and bigram content to count maps.
     * However bigram map keys are actually connected bigram words.
     * Such as for bigram `ya da`, map key is `yada`
     *
     * @param text raw text to tokenize and count.
     */
    public void addContent(String text) {
        // to lower case and replace all characters with " # " except turkish alphabet and sentence boundary characters.
        text = text.toLowerCase(TURKISH_LOCALE).replaceAll("[^a-zçğıöşü.!?: ]", " # ");
        // primitive tokenization
        String[] tokens = text.split("[?:.! ]+");
        for (int i = 0; i < tokens.length; i++) {
            addUnigram(tokens[i]);
            if (i < tokens.length - 1) {
                addBigram(tokens[i], tokens[i + 1]);
            }
        }
    }

    /** Increments the count of a single word; empty tokens are skipped. */
    private void addUnigram(String w) {
        if (w.length() == 0) {
            return;
        }
        unigramCounts.merge(w, 1, Integer::sum);
    }

    /**
     * Increments the count of the (w1, w2) bigram. The map key is the words
     * written together ("yada" for "ya da"); empty tokens are skipped.
     */
    private void addBigram(String w1, String w2) {
        if (w1.length() == 0 || w2.length() == 0) {
            return;
        }
        String connected = w1 + w2;
        // BUG FIX: the original called bigramCountMap.put(bigramCount) with a single
        // argument, which does not compile. computeIfAbsent both fixes it and
        // removes the explicit null check.
        bigramCountMap.computeIfAbsent(connected, k -> new BigramCount(w1, w2)).increment();
    }

    /**
     * Writes to the file every unigram that also occurs as a connected bigram,
     * in the form: "yada -> 12 , ya da -> 34".
     *
     * @param file output file, written as UTF-8.
     * @throws IOException if the file cannot be written.
     */
    public void exportPairs(File file) throws IOException {
        try (PrintWriter pw = new PrintWriter(file, "UTF-8")) {
            for (Map.Entry<String, Integer> entry : unigramCounts.entrySet()) {
                String unigram = entry.getKey();
                BigramCount bigramCount = bigramCountMap.get(unigram);
                if (bigramCount != null) {
                    pw.println(unigram + " -> " + entry.getValue() + " , " +
                            bigramCount.bigramStr + " -> " + bigramCount.count);
                }
            }
        }
    }

    /**
     * Collects counts from a corpus directory and exports the matching pairs.
     * Optional arguments generalize the hard-coded paths of the original:
     * args[0] = input directory (default "htbig"), args[1] = output file
     * (default "result.txt").
     */
    public static void main(String[] args) throws IOException {
        long start = System.currentTimeMillis();
        String inputDir = args.length > 0 ? args[0] : "htbig";
        String outputFile = args.length > 1 ? args[1] : "result.txt";
        MultiWordExpressionFinder finder = new MultiWordExpressionFinder();
        finder.addAllFileContentFromDirectory(new File(inputDir));
        finder.exportPairs(new File(outputFile));
        System.out.println("Elapsed : " + ((double) System.currentTimeMillis() - start) / 1000d + " seconds.");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment