Skip to content

Instantly share code, notes, and snippets.

@vladholubiev
Created June 26, 2014 18:51
Show Gist options
  • Save vladholubiev/64ffee86547165c6ff52 to your computer and use it in GitHub Desktop.
Save vladholubiev/64ffee86547165c6ff52 to your computer and use it in GitHub Desktop.
Returns most frequent words by occurrence in a .pdf
package ua.samosfator.zno.english.wordsFrequency;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.Comparator.comparing;
import static java.util.stream.Collectors.toMap;
/**
 * Extracts the text of a PDF file, counts how often each word occurs, and writes
 * the counts (ordered from least to most frequent) to a JSON-like results file.
 *
 * <p>Counts are kept in the {@link #dict} instance field, so calling
 * {@link #getWords(String)} several times on the same instance accumulates the
 * counts of all processed files.
 */
public class Data {
    /** Fixed location the word counts are written to. */
    private static final String RESULTS_PATH = "E:\\results.json";

    /**
     * Matches words of two or more ASCII letters that are not just one letter
     * repeated: the negative lookahead on group 2 requires some letter to be
     * followed by a different one, so "a" and "aa" are skipped while "ab" matches.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern WORD_PATTERN =
            Pattern.compile("\\b([a-zA-Z]*([a-zA-Z])[a-zA-Z]*(?!\\2)[a-zA-Z]+)\\b");

    ArrayList<String> words = new ArrayList<>();
    Map<String, Integer> dict = new HashMap<>();

    /**
     * Parses the PDF at {@code filePath}, adds its words to {@link #dict} and
     * rewrites the results file.
     *
     * <p>Any failure while reading, parsing or writing is reported on standard
     * error and not propagated, so a bad file does not abort the caller. The PDF
     * documents and the input stream are always released, whether or not
     * processing succeeded.
     *
     * @param filePath path of the PDF file to analyse
     * @throws IOException declared for interface compatibility; failures are
     *         currently reported via the stack trace instead
     */
    public void getWords(String filePath) throws IOException {
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        try (InputStream in = new FileInputStream(new File(filePath))) {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            cosDoc = parser.getDocument();
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            String parsedText = pdfStripper.getText(pdDoc);
            createDictionary(parsedText, filePath);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the documents on success as well as on failure.
            closeDocuments(cosDoc, pdDoc);
        }
    }

    /**
     * Closes both PDF documents independently, so a failure to close one does not
     * prevent closing the other. Close failures are reported, not thrown.
     */
    private static void closeDocuments(COSDocument cosDoc, PDDocument pdDoc) {
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds every word found in {@code parsedText} (lower-cased) to {@link #dict},
     * then sorts the dictionary and writes it out.
     *
     * @param parsedText text extracted from the PDF
     * @param filePath   path of the source PDF (currently unused)
     * @throws IOException if the results file cannot be written
     */
    private void createDictionary(String parsedText, String filePath) throws IOException {
        Matcher m = WORD_PATTERN.matcher(parsedText);
        while (m.find()) {
            // Locale.ROOT keeps the keys stable regardless of the JVM's default
            // locale (e.g. Turkish would map "I" to a dotless i).
            String word = m.group(1).toLowerCase(Locale.ROOT);
            dict.merge(word, 1, Integer::sum);
        }
        sortDict(dict);
    }

    /**
     * Orders the entries by ascending count (least frequent first) and writes the
     * ordered map to the results file.
     *
     * @param dict word counts to sort
     * @throws IOException if the results file cannot be written
     */
    private void sortDict(Map<String, Integer> dict) throws IOException {
        // LinkedHashMap preserves the sorted encounter order of the stream.
        Map<String, Integer> sortedDict = dict.entrySet().stream()
                .sorted(comparing(Map.Entry::getValue))
                .collect(toMap(Map.Entry::getKey, Map.Entry::getValue,
                        (e1, e2) -> e1, LinkedHashMap::new));
        writeToFile(sortedDict);
    }

    /**
     * Writes the map as a JSON object, one {@code "word" : "count"} pair per line,
     * in the map's iteration order. Counts are written as quoted strings.
     *
     * @param dict word counts to write
     * @throws IOException if the file cannot be opened or written
     */
    private void writeToFile(Map<String, Integer> dict) throws IOException {
        // try-with-resources closes (and flushes) the writer even if a write fails.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(RESULTS_PATH))) {
            out.write("{");
            boolean first = true;
            for (Map.Entry<String, Integer> entry : dict.entrySet()) {
                if (!first) {
                    // Separator goes between entries only: a trailing comma before
                    // the closing brace would make the output invalid JSON.
                    out.write(",");
                    out.newLine();
                }
                out.write("\"" + entry.getKey() + "\" : \"" + entry.getValue() + "\"");
                first = false;
            }
            if (!first) {
                out.newLine();
            }
            out.write("}");
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment