Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save commonquail/e680ac0cb78a09c5caa1c8483a6c0d6b to your computer and use it in GitHub Desktop.
Save commonquail/e680ac0cb78a09c5caa1c8483a6c0d6b to your computer and use it in GitHub Desktop.
Processing Large Files in Java, Variation 7
*.class
indiv18.zip
itcont.txt
sample.txt
# Build/run helper for the Java program below.
# CLASS_NAME is the class that is compiled and launched by the targets here.
CLASS_NAME = ReadFileJavaApplicationBufferedReader7
# Pattern rule: compile any .java file into the matching .class with javac.
%.class: %.java
javac $<
# test: run the program (max heap 4G) against the truncated sample.txt.
.PHONY: test
test: $(CLASS_NAME).class sample.txt
java -Xmx4G $(CLASS_NAME) sample.txt
# run: run the program (max heap 4G) against the full itcont.txt.
# Note that itcont.txt is not listed as a prerequisite of this target.
.PHONY: run
run: $(CLASS_NAME).class
java -Xmx4G $(CLASS_NAME) itcont.txt
# download-large-file: convenience alias that makes sure itcont.txt exists.
.PHONY: download-large-file
download-large-file: itcont.txt
# Fetch the zip archive with wget.
indiv18.zip:
wget https://www.fec.gov/files/bulk-downloads/2018/indiv18.zip
# Extract only itcont.txt from the downloaded archive.
itcont.txt: indiv18.zip
unzip indiv18.zip itcont.txt
# sample.txt is the first 44000 lines of itcont.txt.
sample.txt: itcont.txt
head -n 44000 $< > $@
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import static java.util.stream.Collectors.counting;
import static java.util.stream.Collectors.groupingBy;
/**
 * Reads a {@code '|'}-separated text file line by line with a
 * {@link BufferedReader} and prints a few statistics about it: the names found
 * at some fixed line indexes, the total line count, the number of lines per
 * "year-month" key derived from the DATE field, and the most frequent first
 * name. After each section the elapsed time since program start is printed.
 *
 * <p>Usage: {@code java ReadFileJavaApplicationBufferedReader7 <file>}
 */
public class ReadFileJavaApplicationBufferedReader7 {

    /** Character that separates the fields of a line. */
    private static final char SEPARATOR = '|';

    /**
     * Applied to a NAME field: finds the first {@code ", "} and captures the
     * run of characters after it (and after any further whitespace) up to the
     * next comma or space. The captured group is treated as the first name.
     */
    private static final Pattern FIRST_NAME = Pattern.compile(", \\s*([^, ]+)");

    /** Zero-based line indexes whose names are printed as a spot check. */
    private static final int[] SAMPLE_INDEXES = {0, 432, 43243};

    /**
     * Returns the time between {@code startTime} and now in milliseconds.
     *
     * @param startTime the instant to measure from
     * @return elapsed milliseconds from {@code startTime} to the current instant
     */
    static long between(Instant startTime) {
        return Duration.between(startTime, Instant.now()).toMillis();
    }

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the path of the file to read
     * @throws IllegalArgumentException if no file path is given
     * @throws IOException if the file cannot be read, or if a line does not
     *         contain the fields this program extracts
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            throw new IllegalArgumentException(
                    "Usage: java ReadFileJavaApplicationBufferedReader7 <file>");
        }

        Instant startTime = Instant.now();

        try (BufferedReader b = Files.newBufferedReader(Path.of(args[0]))) {
            List<String> names = new ArrayList<>();
            List<String> dates = new ArrayList<>();
            List<String> firstNames = new ArrayList<>();

            // Reused for every line; "YYYY-MM" needs 7 characters.
            StringBuilder sb = new StringBuilder(7);

            System.out.println("Reading file using " + Caller.getName());

            /*
             * Line format:
             *
             * 0 | 1 | 2 | 3 | DATE | 5 | 6 | NAME | 8 | ...
             *   ^   ^   ^   ^      ^   ^   ^      ^
             *   1   2   3   4      5   6   7      8
             */
            long lineNumber = 0;
            String readLine;
            while ((readLine = b.readLine()) != null) {
                lineNumber++;

                // Locate the separators around DATE (4th and 5th) and NAME
                // (7th and 8th). Counting starts at the very beginning of the
                // line so the result does not depend on how long field 0 is.
                int sep4 = nthIndexOf(readLine, SEPARATOR, 4, -1);
                int sep5 = sep4 < 0 ? -1 : readLine.indexOf(SEPARATOR, sep4 + 1);
                int sep7 = sep5 < 0 ? -1 : nthIndexOf(readLine, SEPARATOR, 2, sep5);
                int sep8 = sep7 < 0 ? -1 : readLine.indexOf(SEPARATOR, sep7 + 1);
                if (sep8 < 0) {
                    // Fail loudly instead of recording data from the wrong field.
                    throw new IOException("Line " + lineNumber
                            + ": expected at least 8 '" + SEPARATOR + "' separators");
                }

                // extract dates: first 4 characters, a dash, then the next 2
                String rawDate = readLine.substring(sep4 + 1, sep5).strip();
                if (rawDate.length() < 6) {
                    throw new IOException("Line " + lineNumber + ": DATE field \""
                            + rawDate + "\" is shorter than 6 characters");
                }
                sb.setLength(0);
                sb.append(rawDate, 0, 4)
                        .append('-')
                        .append(rawDate, 4, 6);
                dates.add(sb.toString());

                // get all the names
                String name = readLine.substring(sep7 + 1, sep8).strip();
                names.add(name);

                // extract first names; lines whose NAME has no match contribute none
                var matcher = FIRST_NAME.matcher(name);
                if (matcher.find()) {
                    firstNames.add(matcher.group(1));
                }
            }

            for (int i : SAMPLE_INDEXES) {
                if (i < names.size()) {
                    System.out.println("Name: " + names.get(i) + " at index: " + i);
                } else {
                    // Short input: report it rather than abort the whole run.
                    System.out.println("No name at index: " + i
                            + " (file has only " + names.size() + " lines)");
                }
            }
            System.out.println("Name time: " + between(startTime) + "ms");

            System.out.println("Total file line count: " + names.size());
            System.out.println("Line count time: " + between(startTime) + "ms");

            Map<String, Long> dateMap = dates.stream()
                    .collect(groupingBy(date -> date, counting()));
            dateMap.forEach((date, count)
                    -> System.out.println("Donations per month and year: " + date + " and donation count: " + count));
            System.out.println("Donations time: " + between(startTime) + "ms");

            Map<String, Long> nameMap = firstNames.stream()
                    .collect(groupingBy(name -> name, counting()));
            if (nameMap.isEmpty()) {
                // Collections.max would throw NoSuchElementException here.
                System.out.println("No first names were found.");
            } else {
                Entry<String, Long> common = Collections.max(nameMap.entrySet(), Entry.comparingByValue());
                System.out.println("The most common first name is: " + common.getKey() + " and it occurs: " + common.getValue() + " times.");
            }
            System.out.println("Most common name time: " + between(startTime) + "ms");
        }
    }

    /**
     * Finds the nth occurrence of {@code c} in {@code s} <em>after</em> index
     * {@code start}. To find a character at index 0, call with {@code start}
     * equal to -1.
     *
     * @param s the string to search
     * @param c the character to look for
     * @param n how many occurrences to count; if {@code n <= 0} nothing is
     *        searched and {@code start} is returned unchanged
     * @param start the index after which the search begins
     * @return the index of the nth occurrence, or -1 if {@code s} holds fewer
     *         than {@code n} occurrences after {@code start}
     */
    static int nthIndexOf(String s, char c, int n, int start) {
        int x = start;
        while (n-- > 0) {
            x = s.indexOf(c, x + 1);
            if (x < 0) {
                // Stop here: searching again from -1 + 1 == 0 would wrap
                // around and report an occurrence that was already counted.
                return -1;
            }
        }
        return x;
    }
}
class Caller {
    /**
     * Reports which class invoked this method.
     *
     * @return the class name of the invoking stack frame with everything up to
     *         and including the last {@code '.'} removed, or an empty string
     *         if the walked stack has no frame beyond this method's own
     */
    public static String getName() {
        StackWalker walker = StackWalker.getInstance();

        // The walk is started from this method, so the first frame is
        // getName itself and the one after it belongs to whoever called it.
        String qualifiedName = walker.walk(frames -> frames
                .skip(1)
                .findFirst()
                .map(StackWalker.StackFrame::getClassName)
                .orElse(""));

        // Greedy match up to the last dot strips the package prefix.
        return qualifiedName.replaceFirst("^.*\\.", "");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment