vladholubiev/PDFwordsFrequency.java

## PDFwordsFrequency.java
package ua.samosfator.zno.english.wordsFrequency;

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.util.Comparator.comparing;
import static java.util.stream.Collectors.toMap;

/**
 * Extracts the text of a PDF file, counts how often each matched word occurs
 * and writes the counts, ordered by ascending frequency, to a JSON file.
 *
 * <p>Counts accumulate in {@link #dict} for the lifetime of the instance, so
 * each call to {@link #getWords(String)} rewrites the output file with the
 * running totals of every PDF processed so far.
 */
public class Data {
    /** Location of the generated report. */
    private static final String OUTPUT_PATH = "E:\\results.json";

    /**
     * Matches whole words of two or more ASCII letters that are not a single
     * letter repeated (the comparison is case-sensitive because it happens
     * before the word is lower-cased). Compiled once and reused.
     */
    private static final Pattern WORD_PATTERN =
            Pattern.compile("\\b([a-zA-Z]*([a-zA-Z])[a-zA-Z]*(?!\\2)[a-zA-Z]+)\\b");

    ArrayList<String> words = new ArrayList<>();
    Map<String, Integer> dict = new HashMap<>();

    /**
     * Parses the PDF at {@code filePath}, adds its words to the frequency
     * dictionary and writes the sorted dictionary to the output file.
     *
     * <p>Failures are reported on standard error via
     * {@link Exception#printStackTrace()} and are not propagated, so one
     * unreadable file does not abort the caller.
     *
     * @param filePath path of the PDF file to read
     * @throws IOException declared for callers; failures are currently caught
     *                     and printed inside this method
     */
    public void getWords(String filePath) throws IOException {
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        try (InputStream in = new FileInputStream(new File(filePath))) {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            cosDoc = parser.getDocument();
            pdDoc = new PDDocument(cosDoc);
            String parsedText = new PDFTextStripper().getText(pdDoc);
            createDictionary(parsedText);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the documents on success as well as on failure.
            closeDocuments(pdDoc, cosDoc);
        }
    }

    /**
     * Closes the PDF documents, reporting (not propagating) close failures.
     * Closing the {@code PDDocument} also closes the {@code COSDocument} it
     * wraps, so the latter is closed directly only when no wrapper exists.
     */
    private static void closeDocuments(PDDocument pdDoc, COSDocument cosDoc) {
        try {
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception closeFailure) {
            closeFailure.printStackTrace();
        }
    }

    /**
     * Adds every word matched in {@code parsedText} to {@link #dict},
     * lower-cased, then sorts the dictionary and writes it out.
     */
    private void createDictionary(String parsedText) throws IOException {
        Matcher m = WORD_PATTERN.matcher(parsedText);
        while (m.find()) {
            // Locale.ROOT keeps the lower-casing independent of the default
            // locale (e.g. "I" must not become a dotless i under Turkish).
            String word = m.group(1).toLowerCase(java.util.Locale.ROOT);
            dict.merge(word, 1, Integer::sum);
        }
        sortDict(dict);
    }

    /**
     * Copies {@code dict} into an insertion-ordered map sorted by ascending
     * count and hands it to {@link #writeToFile(Map)}.
     */
    private void sortDict(Map<String, Integer> dict) throws IOException {
        Map<String, Integer> sortedDict = dict.entrySet().stream()
                .sorted(Map.Entry.comparingByValue())
                .collect(toMap(Map.Entry::getKey, Map.Entry::getValue,
                        (e1, e2) -> e1, LinkedHashMap::new));
        writeToFile(sortedDict);
    }

    /**
     * Writes {@code dict} as a JSON object, one {@code "word" : "count"}
     * member per line, in the map's iteration order. Counts are written as
     * JSON strings. Keys consist of ASCII letters only, so no escaping is
     * required.
     */
    private void writeToFile(Map<String, Integer> dict) throws IOException {
        // try-with-resources closes (and flushes) the writer even if a write fails.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(OUTPUT_PATH))) {
            out.write("{");
            boolean first = true;
            for (Map.Entry<String, Integer> entry : dict.entrySet()) {
                if (!first) {
                    // Separator goes between members only: a trailing comma
                    // before '}' is not valid JSON.
                    out.write(",");
                    out.newLine();
                }
                out.write("\"" + entry.getKey() + "\" : \"" + entry.getValue() + "\"");
                first = false;
            }
            if (!first) {
                out.newLine();
            }
            out.write("}");
        }
    }
}
	package ua.samosfator.zno.english.wordsFrequency;

	import org.apache.pdfbox.cos.COSDocument;
	import org.apache.pdfbox.pdfparser.PDFParser;
	import org.apache.pdfbox.pdmodel.PDDocument;
	import org.apache.pdfbox.util.PDFTextStripper;

	import java.io.*;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.LinkedHashMap;
	import java.util.Map;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import static java.util.Comparator.comparing;
	import static java.util.stream.Collectors.toMap;

	/**
	 * Extracts the text of a PDF file, counts how often each matched word occurs
	 * and writes the counts, ordered by ascending frequency, to a JSON file.
	 *
	 * <p>Counts accumulate in {@link #dict} for the lifetime of the instance, so
	 * each call to {@link #getWords(String)} rewrites the output file with the
	 * running totals of every PDF processed so far.
	 */
	public class Data {
	    /** Location of the generated report. */
	    private static final String OUTPUT_PATH = "E:\\results.json";

	    /**
	     * Matches whole words of at least four ASCII letters whose fourth letter
	     * differs from the second (case-sensitive, checked before lower-casing).
	     * Compiled once and reused.
	     *
	     * <p>NOTE(review): the unquantified single-letter classes look
	     * unintentional (possibly lost {@code *} quantifiers) - confirm the
	     * intended pattern before relying on these counts.
	     */
	    private static final Pattern WORD_PATTERN =
	            Pattern.compile("\\b([a-zA-Z]([a-zA-Z])[a-zA-Z](?!\\2)[a-zA-Z]+)\\b");

	    ArrayList<String> words = new ArrayList<>();
	    Map<String, Integer> dict = new HashMap<>();

	    /**
	     * Parses the PDF at {@code filePath}, adds its words to the frequency
	     * dictionary and writes the sorted dictionary to the output file.
	     *
	     * <p>Failures are reported on standard error via
	     * {@link Exception#printStackTrace()} and are not propagated, so one
	     * unreadable file does not abort the caller.
	     *
	     * @param filePath path of the PDF file to read
	     * @throws IOException declared for callers; failures are currently caught
	     *                     and printed inside this method
	     */
	    public void getWords(String filePath) throws IOException {
	        PDDocument pdDoc = null;
	        COSDocument cosDoc = null;
	        try (InputStream in = new FileInputStream(new File(filePath))) {
	            PDFParser parser = new PDFParser(in);
	            parser.parse();
	            cosDoc = parser.getDocument();
	            pdDoc = new PDDocument(cosDoc);
	            String parsedText = new PDFTextStripper().getText(pdDoc);
	            createDictionary(parsedText);
	        } catch (Exception e) {
	            e.printStackTrace();
	        } finally {
	            // Release the documents on success as well as on failure.
	            closeDocuments(pdDoc, cosDoc);
	        }
	    }

	    /**
	     * Closes the PDF documents, reporting (not propagating) close failures.
	     * Closing the {@code PDDocument} also closes the {@code COSDocument} it
	     * wraps, so the latter is closed directly only when no wrapper exists.
	     */
	    private static void closeDocuments(PDDocument pdDoc, COSDocument cosDoc) {
	        try {
	            if (pdDoc != null) {
	                pdDoc.close();
	            } else if (cosDoc != null) {
	                cosDoc.close();
	            }
	        } catch (Exception closeFailure) {
	            closeFailure.printStackTrace();
	        }
	    }

	    /**
	     * Adds every word matched in {@code parsedText} to {@link #dict},
	     * lower-cased, then sorts the dictionary and writes it out.
	     */
	    private void createDictionary(String parsedText) throws IOException {
	        Matcher m = WORD_PATTERN.matcher(parsedText);
	        while (m.find()) {
	            // Locale.ROOT keeps the lower-casing independent of the default
	            // locale (e.g. "I" must not become a dotless i under Turkish).
	            String word = m.group(1).toLowerCase(java.util.Locale.ROOT);
	            dict.merge(word, 1, Integer::sum);
	        }
	        sortDict(dict);
	    }

	    /**
	     * Copies {@code dict} into an insertion-ordered map sorted by ascending
	     * count and hands it to {@link #writeToFile(Map)}.
	     */
	    private void sortDict(Map<String, Integer> dict) throws IOException {
	        Map<String, Integer> sortedDict = dict.entrySet().stream()
	                .sorted(Map.Entry.comparingByValue())
	                .collect(toMap(Map.Entry::getKey, Map.Entry::getValue,
	                        (e1, e2) -> e1, LinkedHashMap::new));
	        writeToFile(sortedDict);
	    }

	    /**
	     * Writes {@code dict} as a JSON object, one {@code "word" : "count"}
	     * member per line, in the map's iteration order. Counts are written as
	     * JSON strings. Keys consist of ASCII letters only, so no escaping is
	     * required.
	     */
	    private void writeToFile(Map<String, Integer> dict) throws IOException {
	        // try-with-resources closes (and flushes) the writer even if a write fails.
	        try (BufferedWriter out = new BufferedWriter(new FileWriter(OUTPUT_PATH))) {
	            out.write("{");
	            boolean first = true;
	            for (Map.Entry<String, Integer> entry : dict.entrySet()) {
	                if (!first) {
	                    // Separator goes between members only: a trailing comma
	                    // before '}' is not valid JSON.
	                    out.write(",");
	                    out.newLine();
	                }
	                out.write("\"" + entry.getKey() + "\" : \"" + entry.getValue() + "\"");
	                first = false;
	            }
	            if (!first) {
	                out.newLine();
	            }
	            out.write("}");
	        }
	    }
	}