Skip to content

Instantly share code, notes, and snippets.

Created November 27, 2017 20:17
Show Gist options
  • Save anonymous/7dd63c5e5127f89b4b55a2f36a68bdc6 to your computer and use it in GitHub Desktop.
Save anonymous/7dd63c5e5127f89b4b55a2f36a68bdc6 to your computer and use it in GitHub Desktop.
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
/**
 * Builds a positional word index of a UTF-8 text file.
 *
 * <p>Usage: {@code java WordStatLineIndex <input-file> <output-file>}.
 *
 * <p>A word is a maximal run of characters that are letters, apostrophes
 * ({@code '}) or dash punctuation. Words are lower-cased and written to the
 * output file in sorted order, one per line, in the form
 * {@code word count line:pos line:pos ...}, where {@code line} is the 1-based
 * line number and {@code pos} is the 1-based position of the word within that
 * line.
 */
public class WordStatLineIndex {
    /**
     * Scratch buffer used while reading the input. It is shared mutable state and
     * is kept public only so that existing references to it keep compiling.
     */
    public static final char[] buf = new char[256];

    /** Charset used for both reading the input and writing the output. */
    private static final Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Program entry point.
     *
     * @param args exactly two elements: the input file path and the output file
     *             path; any other length prints {@code "Wrong input"} to stderr
     *             and returns without doing anything
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("Wrong input");
            return;
        }
        try {
            Map<String, List<String>> index;
            try (InputStreamReader in = new InputStreamReader(new FileInputStream(args[0]), UTF_8)) {
                index = buildIndex(in);
            }
            Files.write(Paths.get(args[1]), formatLines(index), UTF_8);
        } catch (IOException e) {
            // Best effort, as before: report the failure and return normally.
            System.err.println("I/O error: " + e);
        }
    }

    /**
     * Reads all characters from {@code in} and collects, for every word, the list
     * of {@code line:pos} occurrences in reading order.
     *
     * @param in source of characters; not closed by this method
     * @return sorted map from lower-cased word to its occurrences
     * @throws IOException if reading fails
     */
    private static Map<String, List<String>> buildIndex(InputStreamReader in) throws IOException {
        Map<String, List<String>> index = new TreeMap<>();
        StringBuilder word = new StringBuilder();
        int lineId = 1;
        int wordId = 1;
        int read;
        while ((read = in.read(buf)) > 0) {
            for (int i = 0; i < read; i++) {
                char c = buf[i];
                if (isWordChar(c)) {
                    word.append(c);
                    continue;
                }
                if (word.length() > 0) {
                    record(index, word, lineId, wordId);
                    wordId++;
                }
                // The pending word is recorded before the line counter advances,
                // so a word directly followed by '\n' is attributed to its own line.
                if (c == '\n') {
                    lineId++;
                    wordId = 1;
                }
            }
        }
        // Input that does not end with a separator leaves one last word pending.
        if (word.length() > 0) {
            record(index, word, lineId, wordId);
        }
        return index;
    }

    /**
     * Tells whether {@code c} is part of a word. Classification is done per UTF-16
     * {@code char}, so surrogate halves are treated as separators.
     */
    private static boolean isWordChar(char c) {
        return Character.isLetter(c)
                || c == '\''
                || Character.getType(c) == Character.DASH_PUNCTUATION;
    }

    /**
     * Appends the occurrence {@code lineId:wordId} for the word held in
     * {@code word} and then empties the builder.
     */
    private static void record(Map<String, List<String>> index, StringBuilder word,
                               int lineId, int wordId) {
        // Locale.ROOT keeps the keys independent of the JVM's default locale
        // (e.g. Turkish would otherwise map 'I' to dotless 'ı').
        String key = word.toString().toLowerCase(Locale.ROOT);
        index.computeIfAbsent(key, k -> new ArrayList<>()).add(lineId + ":" + wordId);
        word.setLength(0);
    }

    /**
     * Renders each index entry as {@code word count line:pos ...}, in map order.
     */
    private static List<String> formatLines(Map<String, List<String>> index) {
        List<String> lines = new ArrayList<>(index.size());
        for (Map.Entry<String, List<String>> entry : index.entrySet()) {
            List<String> positions = entry.getValue();
            lines.add(entry.getKey() + " " + positions.size() + " " + String.join(" ", positions));
        }
        return lines;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment