Skip to content

Instantly share code, notes, and snippets.

@sshine
Created July 1, 2019 14:51
Show Gist options
  • Save sshine/6678b8ecad0eeaaac2304321236a3b74 to your computer and use it in GitHub Desktop.
Save sshine/6678b8ecad0eeaaac2304321236a3b74 to your computer and use it in GitHub Desktop.
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.*;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.function.*;
import java.util.stream.*;
public class MailHandler implements IMailHandler {
/**
* Since we cannot extend {@link IMailInfo} with the ability to compare by
* {@link IMailInfo#getTime()}, we provide a {@link Comparator<IMailInfo>}.
*/
private static final Comparator<IMailInfo> mailInfoComparatorByTime = new Comparator<>() {
@Override
public int compare(IMailInfo one, IMailInfo other) {
return one.getTime().compareTo(other.getTime());
}
};
private final String directory;
private final Random random;
private Map<String, List<IMailInfo>> archive = new HashMap<>();
private Integer archiveCount = 0;
/**
* Given a file-system directory path, provide a {@link MailHandler}
* that can import messages in individual files in the subdirectories
* of {@param directory}.
*
* @param directory the directory from which to import messages
*/
public MailHandler(final String directory, final Random random) {
if (directory == null) throw new AssertionError();
if (random == null) throw new AssertionError();
this.directory = directory;
this.random = random;
}
/**
* While this function can fail with file-system and parse errors,
* I can think of no meaningful way to handle them except give a
* warning and skip further processing of a particular file or directory.
*
* Given a variation of {@link IMailHandler}, it should probably throw.
*
* @param maxMails the limit at which to stop importing
*/
@Override
public void doImport(final Integer maxMails) {
if (maxMails < 0) throw new AssertionError();
// If MailParser were not an inner class, it would be possible to
// declare static methods in it, and so instantiation would not be
// necessary.
final MailParser mailParser = new MailParser();
try {
Files.walk(Paths.get(directory))
.filter(Files::isRegularFile)
.filter(path -> !path.endsWith("DELETIONS.txt"))
.limit(maxMails)
.map(mailParser::parseMailInfoSafely)
.filter(mailInfo -> mailInfo != null)
.forEach(this::archiveMailInfo)
;
} catch (IOException e) {
System.err.println(e);
}
}
/**
* The assignment does not specify whether {@param maxTime} can
* be null to signify no maximum time constraint. Since a better
* API design for this option would be to provide a separate
* {@code IMailHandler#search(String)}, I've chosen to go with
* assertions.
*
* The assignment also does not specify how to deal with emails
* that have no 'Date' field; for now they are simply excluded in
* search results.
*
* @param emailAddress the sender or recipient of an email
* @param maxTime the latest {@link IMailInfo#getTime()} allowed
* @return a list of emails that match the search criteria
*/
@Override
public List<IMailInfo> search(final String emailAddress, final Date maxTime) {
if (emailAddress == null) throw new AssertionError();
if (maxTime == null) throw new AssertionError();
return archive.getOrDefault(emailAddress, Collections.emptyList())
.stream()
.takeWhile(mailInfo -> mailInfo.getTime().before(maxTime))
.collect(Collectors.toList())
;
}
/**
* I've chosen to use Reservoir Sampling:
*
* https://en.wikipedia.org/wiki/Reservoir_sampling
*
* The two alternatives that I could think of are:
*
* - Create a copy of the set of {@link IMailInfo}s and
* iteratively move {@param nSamples} of those to a
* result set; this potentially uses a lot of memory,
* making it hard to extend the current solution to an
* out-of-memory one.
*
* - Iteratively pick {@param nSamples}, but put them back
* if already picked; this potentially uses infinite time.
*
* Some considerations with regards to this: One could sample...
*
* - among email addresses uniformly,
* - among email messages uniformly,
* - among email message references uniformly,
* - among some other property, uniformly or non-uniformly.
*
* Since the assignment does not specify any constraints to the
* statistical distribution of samples, another benefit of
* reservoir sampling is that it requires no additional data
* structures besides {@code archiveCount}.
*
* @param nSamples the number of samples
* @return a list of randomly sampled emails
*/
@Override
public List<IMailInfo> sample(final Integer nSamples) {
return mailInfoStream().collect(new ReservoirSampler<IMailInfo>(random, nSamples));
}
/**
* Produces a {@link Stream<IMailInfo>} from archive.
*/
private Stream<IMailInfo> mailInfoStream() {
return archive.values().stream().flatMap(List::stream);
}
/**
* Archive an {@link IMailInfo} by each email address in its 'From' / 'To' fields.
*/
private void archiveMailInfo(final IMailInfo mailInfo) {
if (mailInfo == null) throw new AssertionError();
if (mailInfo.getFrom() != null) {
archiveMailInfoByAddress(mailInfo.getFrom(), mailInfo);
}
for (String to : mailInfo.getTo()) {
archiveMailInfoByAddress(to, mailInfo);
}
}
/**
* Archive {@param mailInfo} by {@param emailAddress}, sorted by {@link IMailInfo#getTime()}.
*
* Ideally, I would use a skip list internally:
*
* https://en.wikipedia.org/wiki/Skip_list
*
* It has both O(log n) lookups and insertions.
*
* An ArrayList has O(n) insertions.
*
* The Java standard library does not provide a skip list implementation.
* As I was encouraged to not focus too much on performance, but on data
* structures and clear code, I've chosen to go with a solution that, if
* provided with an underlying {@link List<IMailInfo>} that supports
* O(log n) inserts, this method may support that seamlessly.
*/
private void archiveMailInfoByAddress(final String emailAddress, final IMailInfo mailInfo) {
if (emailAddress == null) throw new AssertionError();
if (mailInfo == null) throw new AssertionError();
List<IMailInfo> mailInfos = archive.computeIfAbsent(emailAddress, _emailAddress -> new ArrayList<>());
int pos = Collections.binarySearch(mailInfos, mailInfo, mailInfoComparatorByTime);
// Archive MailInfo regardless of there being an email with the same 'Date'
mailInfos.add(pos < 0 ? -pos-1 : pos, mailInfo);
archiveCount++;
}
/**
* Given the constraints of the assignment to only hand in two files,
* and to only use the standard library, I've chosen to make this an
* inner class.
*/
private class MailParser {
/**
* Set of headers to extract.
*/
private final Set<String> headers = Set.of("To", "From", "Subject", "Date");
/**
* Assume that 'Date' fields are RFC 2822-compatible until proven otherwise.
*/
private final SimpleDateFormat dateFormat =
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z (z)");
/**
* Dummy constructor due to lack of inner class static methods.
*/
public MailParser() {
}
/**
* This wrapper is made with the premise that functions should
* do one thing (Robert C. Martin), and try-catch is one thing.
*
* An alternative would be to propagate exception handling up.
*/
public IMailInfo parseMailInfoSafely(final Path path) {
if (path == null) throw new AssertionError();
try {
return parseMailInfo(path);
} catch (IOException | ParseException e) {
System.err.println("Exception while parsing " + path + ": " + e.getMessage());
e.printStackTrace();
return null;
}
}
/**
* Static constructor for {@link IMailInfo} based on the file
* contents of {@param path}. In a production environment, it
* would be even better to rely on more fully covering email
* message parsers.
*/
private IMailInfo parseMailInfo(final Path path) throws IOException, ParseException {
if (path == null) throw new AssertionError();
Stream<String> lines = new BufferedReader(new FileReader(String.valueOf(path))).lines();
// Stream<String> lines = Files.lines(Paths.get(String.valueOf(path)));
Map<String, String> foundHeaders;
try {
foundHeaders = extractHeaders(path, lines);
} catch (Exception e) {
System.err.println("Wat: " + path);
foundHeaders = new HashMap<>();
}
List<String> to = extractMailList(foundHeaders.get("To"));
String from = foundHeaders.get("From");
String subject = foundHeaders.get("Subject");
Date time = extractDate(foundHeaders.get("Date"));
return new MailInfo(to, from, subject, time, path);
}
/**
* Split multiple emails in an ad-hoc way: Prefer ";" over ",".
*
* Example that won't work:
*
* To: Peter "Stillborn, The" Sellers <info@petersellers.com>.
*
* Ideally, use an (approximately) RFC-compliant email parser.
*/
private List<String> extractMailList(final String headerValue) {
if (headerValue == null) {
return Collections.emptyList();
}
String separator = headerValue.contains(";") ? ";" : ",";
return Arrays.asList(headerValue.split(separator + "\\s*"));
}
/**
* Handle missing 'Date' headers.
*/
private Date extractDate(final String headerValue) throws ParseException {
if (headerValue == null) return null;
return dateFormat.parse(headerValue);
}
/**
* The following parser is stateful in order to handle headers that
* split across multiple lines. The last header name is kept while
* processing lines that are possibly continuations of that header.
*
* The headers marked for extraction are hardcoded for now.
*/
private Map<String, String> extractHeaders(final Path path, final Stream<String> lines) {
if (path == null) throw new AssertionError();
if (lines == null) throw new AssertionError();
Set<String> missingHeaders = new HashSet<>(headers);
Map<String, String> foundHeaders = new HashMap<>(headers.size());
String headerName = null;
for (String line : (Iterable<String>) lines::iterator) {
// End parsing when we can
if (missingHeaders.isEmpty()) {
break;
}
// Mail body begins
if (line.isEmpty()) {
break;
}
// Some headers are split with line breaks followed by whitespace indentation
if ((headerName != null) && Character.isWhitespace(line.charAt(0))) {
foundHeaders.computeIfPresent(
headerName,
(_headerName, headerValue) -> headerValue + line.trim());
continue;
}
String[] header = line.split(": ", 2);
if (header.length < 2) {
System.err.println("Unknown header line '" + line + "' in " + path);
continue;
}
// Skip repeated headers for now; in principle it would be better to warn.
headerName = header[0];
String headerValue = header[1];
if (missingHeaders.contains(headerName)) {
missingHeaders.remove(headerName);
foundHeaders.put(headerName, headerValue);
}
}
return foundHeaders;
}
}
/**
* Inspired by:
*
* http://trevershick.github.io/java8/2015/12/02/reservoir-based-sampling.html
*
* (I don't presume to know much about stream collectors, but I thought that
* it was pretty cool to make it this way for a concise, generic solution that
* reads well in {@link #sample(Integer)}.)
*
* I've only changed variable names, indentation and injection of {@link Random}.
*/
private class ReservoirSampler<T> implements Collector<T, List<T>, List<T>> {
private final Random random;
private final int nSamples;
private int count = 0;
public ReservoirSampler(final Random random, final int nSamples) {
this.random = random;
this.nSamples = nSamples;
}
private void addIt(final List<T> in, T s) {
if (in.size() < nSamples) {
in.add(s);
}
else {
int replaceInIndex = (int) (random.nextDouble() * (nSamples + (count++) + 1));
if (replaceInIndex < nSamples) {
in.set(replaceInIndex, s);
}
}
}
@Override
public Supplier<List<T>> supplier() {
return ArrayList::new;
}
@Override
public BiConsumer<List<T>, T> accumulator() {
return this::addIt;
}
@Override
public BinaryOperator<List<T>> combiner() {
return (left, right) -> {
left.addAll(right);
return left;
};
}
@Override
public Set<java.util.stream.Collector.Characteristics> characteristics() {
return EnumSet.of(
Collector.Characteristics.UNORDERED,
Collector.Characteristics.IDENTITY_FINISH);
}
@Override
public Function<List<T>, List<T>> finisher() {
return Function.identity();
}
}
}
import java.nio.file.Path;
import java.util.*;
public class MailInfo implements IMailInfo {
private final List<String> to;
private final String from;
private final String subject;
private final Date time;
private final Path path;
public MailInfo(final List<String> to, final String from, final String subject, final Date time, final Path path) {
this.to = to;
this.from = from;
this.subject = subject;
this.time = time;
this.path = path;
}
@Override
public List<String> getTo() {
return to;
}
@Override
public String getFrom() {
return from;
}
@Override
public String getSubject() {
return subject;
}
@Override
public Date getTime() {
return time;
}
@Override
public String pretty() {
return
"From: " + from + "\n" +
"To: " + String.join(", ", to) + "\n" +
"Subject: " + subject + "\n" +
"Date: " + time + "\n" +
"\nSee the full email at " + path + "\n"
;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment