Skip to content

Instantly share code, notes, and snippets.

@thetekst
Last active October 9, 2022 12:21
Show Gist options
  • Save thetekst/2ede74924e3c847e2080e6e55662f709 to your computer and use it in GitHub Desktop.
Save thetekst/2ede74924e3c847e2080e6e55662f709 to your computer and use it in GitHub Desktop.
Count the most popular words in a PDF
import com.google.common.base.CharMatcher;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Counts how often each word occurs in the file {@code words} inside the user's home
 * directory (one word per line) and writes the result to the file {@code hit} in the same
 * directory, one {@code word:count} entry per line, most frequent word first.
 */
class Scratch {

    /** Characters removed from the end of every word before it is counted. */
    private static final String TRAILING_PUNCTUATION = ",.:";

    /**
     * Reads the word list, counts the words and writes the ranking.
     *
     * @param args ignored
     * @throws IOException if the input file cannot be read or the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        final LinkedHashMap<String, Long> sortedWords = read();
        write(sortedWords);
    }

    /**
     * Loads {@code <user.home>/words} and counts the occurrences of every word.
     *
     * <p>Each line is treated as one word: trailing {@code ,}, {@code .} and {@code :} are
     * stripped, lines that are empty afterwards are skipped, and the rest is lower-cased.
     *
     * @return the words mapped to their counts, iterating from the highest count to the
     *     lowest; words with the same count are ordered alphabetically
     * @throws IOException if the file cannot be read
     */
    private static LinkedHashMap<String, Long> read() throws IOException {
        final var readPath = Paths.get(System.getProperty("user.home") + "/words");
        // The alphabetical tie-break keeps the output stable; without it, words with equal
        // counts would appear in HashMap iteration order.
        final var byCountDescThenWord =
                Collections.reverseOrder(Map.Entry.<String, Long>comparingByValue())
                        .thenComparing(Map.Entry.<String, Long>comparingByKey());
        return Files.readAllLines(readPath)
                .stream()
                .map(Scratch::stripTrailingPunctuation)
                // A line consisting only of punctuation (or a blank line) is not a word.
                .filter(word -> !word.isEmpty())
                // Locale.ROOT: the default locale must not influence the keys
                // (a Turkish locale would turn "I" into a dotless "ı").
                .map(word -> word.toLowerCase(Locale.ROOT))
                .collect(Collectors.groupingBy(word -> word, Collectors.counting()))
                .entrySet()
                .stream()
                .sorted(byCountDescThenWord)
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        Map.Entry::getValue,
                        (first, second) -> first,
                        LinkedHashMap::new));
    }

    /** Returns {@code word} without any run of {@code ,}, {@code .} or {@code :} at its end. */
    private static String stripTrailingPunctuation(String word) {
        int end = word.length();
        while (end > 0 && TRAILING_PUNCTUATION.indexOf(word.charAt(end - 1)) >= 0) {
            end--;
        }
        return word.substring(0, end);
    }

    /**
     * Writes the counts to {@code <user.home>/hit} as UTF-8, one {@code word:count} entry per
     * line, in the iteration order of the given map. An existing file is replaced.
     *
     * @param sortedWords the words and their counts, already in output order
     * @throws IOException if the file cannot be written
     */
    private static void write(LinkedHashMap<String, Long> sortedWords) throws IOException {
        final var writePath = Paths.get(System.getProperty("user.home") + "/hit");
        final var collect = sortedWords.entrySet().stream()
                .map(entry -> entry.getKey() + ":" + entry.getValue())
                .collect(Collectors.joining("\n"));
        Files.write(writePath, collect.getBytes(StandardCharsets.UTF_8));
    }
}
# Extract the PDF text, put one token per line (splitting on any whitespace, including the
# form feeds pdftotext emits between pages), keep tokens that start with an ASCII letter,
# and store them in $HOME/words, the file Scratch.read() loads.
pdftotext "$HOME/my_book.pdf" - | tr -s '[:space:]' '\n' | sort | grep '^[A-Za-z]' > "$HOME/words"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment