Skip to content

Instantly share code, notes, and snippets.

@dhagan
Created September 29, 2015 00:31
Show Gist options
  • Save dhagan/091fbdadb597f20b7571 to your computer and use it in GitHub Desktop.
Save dhagan/091fbdadb597f20b7571 to your computer and use it in GitHub Desktop.
import java.io.*;
import java.nio.file.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line concordance generator.
 *
 * <p>Reads a text file, splits it into sentences and words, and prints every distinct word
 * (case-insensitively, in alphabetical order) together with its total number of occurrences and
 * the 1-based numbers of the sentences in which it occurs.
 *
 * <p>Usage: {@code java concordance <filename>}
 */
public class concordance {

    /**
     * Matches a single word. The optional trailing group re-attaches the final dot of
     * abbreviations such as "i.e." (a dot that directly follows a ".x" sequence).
     * Compiled once because it is applied to every sentence.
     */
    private static final Pattern WORD_PATTERN =
            Pattern.compile("((\\b[^\\s]+\\b)((?<=\\.\\w).)?)");

    /**
     * Matches the whitespace between two sentences: it must be preceded by '.', '!' or '?'
     * and followed by an upper-case ASCII letter.
     */
    private static final Pattern SENTENCE_BOUNDARY =
            Pattern.compile("(?<=[.!?])\\s+(?=[A-Z])");

    /**
     * Program entry point. Prints a usage message and exits when no file name is supplied,
     * otherwise delegates to {@link #_main(String[])}.
     *
     * @param args command-line arguments; {@code args[0]} is the path of the text file
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Proper Usage is: java concordance <<filename>>");
            System.exit(0);
        }
        _main(args);
    }

    /**
     * Reads the file named by {@code args[0]}, builds its concordance and prints it.
     *
     * <p>The whole file is loaded into memory; very large inputs would need a streaming
     * strategy instead. If the file cannot be read, a message is printed and the method returns
     * without printing a concordance.
     *
     * @param args command-line arguments; {@code args[0]} is the path of the text file
     */
    static void _main(String[] args) {
        String myText;
        try {
            // Decode explicitly as UTF-8 so the result does not depend on the platform charset.
            myText = new String(
                    Files.readAllBytes(Paths.get(args[0])),
                    java.nio.charset.StandardCharsets.UTF_8);
        } catch (NoSuchFileException e) {
            System.out.print(args[0] + " File not found! Please check that the file exists.");
            return;
        } catch (IOException e) {
            // Any other I/O failure (permissions, directory, ...) is not a "not found" problem.
            System.out.print(args[0] + " could not be read: " + e);
            return;
        }
        printMap(buildConcordance(myText));
    }

    /**
     * Builds the concordance of a text.
     *
     * <p>Words are lower-cased with {@link Locale#ROOT} so that keys do not depend on the
     * default locale. A word that occurs several times in one sentence contributes that
     * sentence number once per occurrence.
     *
     * @param text the full text to analyse
     * @return a map, sorted by word, from each lower-cased word to its occurrence data
     */
    static Map<String, WordInfo> buildConcordance(String text) {
        // A TreeMap keeps the keys sorted, so no second sorted copy is needed for printing.
        Map<String, WordInfo> concordanceMap = new TreeMap<>();
        int sentenceNumber = 1;
        for (String sentence : splitSentences(text)) {
            for (String word : splitWords(sentence)) {
                String key = word.toLowerCase(Locale.ROOT);
                WordInfo info = concordanceMap.get(key);
                if (info == null) {
                    // The constructor records the first occurrence itself.
                    concordanceMap.put(key, new WordInfo(key, sentenceNumber));
                } else {
                    info.WordCount++;
                    info.SentenceNumbers.add(sentenceNumber);
                }
            }
            sentenceNumber++;
        }
        return concordanceMap;
    }

    /**
     * Prints one line per map entry to standard output, in the map's iteration order.
     *
     * <p>Each line is the key left-aligned in a 20-character column, followed by
     * <code>{count:n1,n2,...}</code> where the numbers are the recorded sentence numbers.
     *
     * @param map the concordance to print
     */
    public static void printMap(Map<String, WordInfo> map) {
        for (Map.Entry<String, WordInfo> entry : map.entrySet()) {
            WordInfo wordInfo = entry.getValue();
            // Prefix and suffix wrap the comma-separated list as "{count:...}".
            StringJoiner value = new StringJoiner(",", "{" + wordInfo.WordCount + ":", "}");
            for (Integer number : wordInfo.SentenceNumbers) {
                value.add(number.toString());
            }
            System.out.println(String.format("%-20s %s", entry.getKey(), value.toString()));
        }
    }

    /**
     * Splits a sentence into words, dropping surrounding punctuation but keeping the trailing
     * dot of abbreviations such as "i.e.".
     *
     * @param sentence the sentence to split
     * @return the words in order of appearance; empty if the sentence contains no word
     */
    static List<String> splitWords(String sentence) {
        List<String> allMatches = new ArrayList<>();
        Matcher matcher = WORD_PATTERN.matcher(sentence);
        while (matcher.find()) {
            allMatches.add(matcher.group());
        }
        return allMatches;
    }

    /**
     * Splits a text blob into sentences.
     *
     * <p>A sentence boundary is whitespace that follows '.', '!' or '?' and precedes an
     * upper-case ASCII letter; the terminating punctuation stays with its sentence.
     *
     * @param text the text to split
     * @return the sentences in order; a text without any boundary yields a one-element array
     */
    static String[] splitSentences(String text) {
        return SENTENCE_BOUNDARY.split(text);
    }

    /**
     * Occurrence data for a single word.
     *
     * <p>Known deficiency: the fields are public and mutable; a later iteration should
     * encapsulate them.
     */
    public static class WordInfo {

        /** The word this entry describes. */
        public String Word;

        /** Total number of occurrences recorded so far. */
        public int WordCount;

        /** Sentence number of every recorded occurrence, in order of discovery. */
        public List<Integer> SentenceNumbers = new ArrayList<>();

        /**
         * Creates an entry for a word seen for the first time.
         *
         * @param word the word
         * @param sentenceNumber the number of the sentence containing the first occurrence
         */
        public WordInfo(String word, int sentenceNumber) {
            Word = word;
            WordCount = 1;
            SentenceNumbers.add(sentenceNumber);
        }
    }
}
@sara251186
Copy link

thanks for sharing this.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment