-
-
Save sshine/6678b8ecad0eeaaac2304321236a3b74 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import java.nio.file.*; | |
import java.text.ParseException; | |
import java.text.SimpleDateFormat; | |
import java.util.*; | |
import java.util.function.*; | |
import java.util.stream.*; | |
public class MailHandler implements IMailHandler { | |
/** | |
* Since we cannot extend {@link IMailInfo} with the ability to compare by | |
* {@link IMailInfo#getTime()}, we provide a {@link Comparator<IMailInfo>}. | |
*/ | |
private static final Comparator<IMailInfo> mailInfoComparatorByTime = new Comparator<>() { | |
@Override | |
public int compare(IMailInfo one, IMailInfo other) { | |
return one.getTime().compareTo(other.getTime()); | |
} | |
}; | |
private final String directory; | |
private final Random random; | |
private Map<String, List<IMailInfo>> archive = new HashMap<>(); | |
private Integer archiveCount = 0; | |
/** | |
* Given a file-system directory path, provide a {@link MailHandler} | |
* that can import messages in individual files in the subdirectories | |
* of {@param directory}. | |
* | |
* @param directory the directory from which to import messages | |
*/ | |
public MailHandler(final String directory, final Random random) { | |
if (directory == null) throw new AssertionError(); | |
if (random == null) throw new AssertionError(); | |
this.directory = directory; | |
this.random = random; | |
} | |
/** | |
* While this function can fail with file-system and parse errors, | |
* I can think of no meaningful way to handle them except give a | |
* warning and skip further processing of a particular file or directory. | |
* | |
* Given a variation of {@link IMailHandler}, it should probably throw. | |
* | |
* @param maxMails the limit at which to stop importing | |
*/ | |
@Override | |
public void doImport(final Integer maxMails) { | |
if (maxMails < 0) throw new AssertionError(); | |
// If MailParser were not an inner class, it would be possible to | |
// declare static methods in it, and so instantiation would not be | |
// necessary. | |
final MailParser mailParser = new MailParser(); | |
try { | |
Files.walk(Paths.get(directory)) | |
.filter(Files::isRegularFile) | |
.filter(path -> !path.endsWith("DELETIONS.txt")) | |
.limit(maxMails) | |
.map(mailParser::parseMailInfoSafely) | |
.filter(mailInfo -> mailInfo != null) | |
.forEach(this::archiveMailInfo) | |
; | |
} catch (IOException e) { | |
System.err.println(e); | |
} | |
} | |
/** | |
* The assignment does not specify whether {@param maxTime} can | |
* be null to signify no maximum time constraint. Since a better | |
* API design for this option would be to provide a separate | |
* {@code IMailHandler#search(String)}, I've chosen to go with | |
* assertions. | |
* | |
* The assignment also does not specify how to deal with emails | |
* that have no 'Date' field; for now they are simply excluded in | |
* search results. | |
* | |
* @param emailAddress the sender or recipient of an email | |
* @param maxTime the latest {@link IMailInfo#getTime()} allowed | |
* @return a list of emails that match the search criteria | |
*/ | |
@Override | |
public List<IMailInfo> search(final String emailAddress, final Date maxTime) { | |
if (emailAddress == null) throw new AssertionError(); | |
if (maxTime == null) throw new AssertionError(); | |
return archive.getOrDefault(emailAddress, Collections.emptyList()) | |
.stream() | |
.takeWhile(mailInfo -> mailInfo.getTime().before(maxTime)) | |
.collect(Collectors.toList()) | |
; | |
} | |
/** | |
* I've chosen to use Reservoir Sampling: | |
* | |
* https://en.wikipedia.org/wiki/Reservoir_sampling | |
* | |
* The two alternatives that I could think of are: | |
* | |
* - Create a copy of the set of {@link IMailInfo}s and | |
* iteratively move {@param nSamples} of those to a | |
* result set; this potentially uses a lot of memory, | |
* making it hard to extend the current solution to an | |
* out-of-memory one. | |
* | |
* - Iteratively pick {@param nSamples}, but put them back | |
* if already picked; this potentially uses infinite time. | |
* | |
* Some considerations with regards to this: One could sample... | |
* | |
* - among email addresses uniformly, | |
* - among email messages uniformly, | |
* - among email message references uniformly, | |
* - among some other property, uniformly or non-uniformly. | |
* | |
* Since the assignment does not specify any constraints to the | |
* statistical distribution of samples, another benefit of | |
* reservoir sampling is that it requires no additional data | |
* structures besides {@code archiveCount}. | |
* | |
* @param nSamples the number of samples | |
* @return a list of randomly sampled emails | |
*/ | |
@Override | |
public List<IMailInfo> sample(final Integer nSamples) { | |
return mailInfoStream().collect(new ReservoirSampler<IMailInfo>(random, nSamples)); | |
} | |
/** | |
* Produces a {@link Stream<IMailInfo>} from archive. | |
*/ | |
private Stream<IMailInfo> mailInfoStream() { | |
return archive.values().stream().flatMap(List::stream); | |
} | |
/** | |
* Archive an {@link IMailInfo} by each email address in its 'From' / 'To' fields. | |
*/ | |
private void archiveMailInfo(final IMailInfo mailInfo) { | |
if (mailInfo == null) throw new AssertionError(); | |
if (mailInfo.getFrom() != null) { | |
archiveMailInfoByAddress(mailInfo.getFrom(), mailInfo); | |
} | |
for (String to : mailInfo.getTo()) { | |
archiveMailInfoByAddress(to, mailInfo); | |
} | |
} | |
/** | |
* Archive {@param mailInfo} by {@param emailAddress}, sorted by {@link IMailInfo#getTime()}. | |
* | |
* Ideally, I would use a skip list internally: | |
* | |
* https://en.wikipedia.org/wiki/Skip_list | |
* | |
* It has both O(log n) lookups and insertions. | |
* | |
* An ArrayList has O(n) insertions. | |
* | |
* The Java standard library does not provide a skip list implementation. | |
* As I was encouraged to not focus too much on performance, but on data | |
* structures and clear code, I've chosen to go with a solution that, if | |
* provided with an underlying {@link List<IMailInfo>} that supports | |
* O(log n) inserts, this method may support that seamlessly. | |
*/ | |
private void archiveMailInfoByAddress(final String emailAddress, final IMailInfo mailInfo) { | |
if (emailAddress == null) throw new AssertionError(); | |
if (mailInfo == null) throw new AssertionError(); | |
List<IMailInfo> mailInfos = archive.computeIfAbsent(emailAddress, _emailAddress -> new ArrayList<>()); | |
int pos = Collections.binarySearch(mailInfos, mailInfo, mailInfoComparatorByTime); | |
// Archive MailInfo regardless of there being an email with the same 'Date' | |
mailInfos.add(pos < 0 ? -pos-1 : pos, mailInfo); | |
archiveCount++; | |
} | |
/** | |
* Given the constraints of the assignment to only hand in two files, | |
* and to only use the standard library, I've chosen to make this an | |
* inner class. | |
*/ | |
private class MailParser { | |
/** | |
* Set of headers to extract. | |
*/ | |
private final Set<String> headers = Set.of("To", "From", "Subject", "Date"); | |
/** | |
* Assume that 'Date' fields are RFC 2822-compatible until proven otherwise. | |
*/ | |
private final SimpleDateFormat dateFormat = | |
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z (z)"); | |
/** | |
* Dummy constructor due to lack of inner class static methods. | |
*/ | |
public MailParser() { | |
} | |
/** | |
* This wrapper is made with the premise that functions should | |
* do one thing (Robert C. Martin), and try-catch is one thing. | |
* | |
* An alternative would be to propagate exception handling up. | |
*/ | |
public IMailInfo parseMailInfoSafely(final Path path) { | |
if (path == null) throw new AssertionError(); | |
try { | |
return parseMailInfo(path); | |
} catch (IOException | ParseException e) { | |
System.err.println("Exception while parsing " + path + ": " + e.getMessage()); | |
e.printStackTrace(); | |
return null; | |
} | |
} | |
/** | |
* Static constructor for {@link IMailInfo} based on the file | |
* contents of {@param path}. In a production environment, it | |
* would be even better to rely on more fully covering email | |
* message parsers. | |
*/ | |
private IMailInfo parseMailInfo(final Path path) throws IOException, ParseException { | |
if (path == null) throw new AssertionError(); | |
Stream<String> lines = new BufferedReader(new FileReader(String.valueOf(path))).lines(); | |
// Stream<String> lines = Files.lines(Paths.get(String.valueOf(path))); | |
Map<String, String> foundHeaders; | |
try { | |
foundHeaders = extractHeaders(path, lines); | |
} catch (Exception e) { | |
System.err.println("Wat: " + path); | |
foundHeaders = new HashMap<>(); | |
} | |
List<String> to = extractMailList(foundHeaders.get("To")); | |
String from = foundHeaders.get("From"); | |
String subject = foundHeaders.get("Subject"); | |
Date time = extractDate(foundHeaders.get("Date")); | |
return new MailInfo(to, from, subject, time, path); | |
} | |
/** | |
* Split multiple emails in an ad-hoc way: Prefer ";" over ",". | |
* | |
* Example that won't work: | |
* | |
* To: Peter "Stillborn, The" Sellers <info@petersellers.com>. | |
* | |
* Ideally, use an (approximately) RFC-compliant email parser. | |
*/ | |
private List<String> extractMailList(final String headerValue) { | |
if (headerValue == null) { | |
return Collections.emptyList(); | |
} | |
String separator = headerValue.contains(";") ? ";" : ","; | |
return Arrays.asList(headerValue.split(separator + "\\s*")); | |
} | |
/** | |
* Handle missing 'Date' headers. | |
*/ | |
private Date extractDate(final String headerValue) throws ParseException { | |
if (headerValue == null) return null; | |
return dateFormat.parse(headerValue); | |
} | |
/** | |
* The following parser is stateful in order to handle headers that | |
* split across multiple lines. The last header name is kept while | |
* processing lines that are possibly continuations of that header. | |
* | |
* The headers marked for extraction are hardcoded for now. | |
*/ | |
private Map<String, String> extractHeaders(final Path path, final Stream<String> lines) { | |
if (path == null) throw new AssertionError(); | |
if (lines == null) throw new AssertionError(); | |
Set<String> missingHeaders = new HashSet<>(headers); | |
Map<String, String> foundHeaders = new HashMap<>(headers.size()); | |
String headerName = null; | |
for (String line : (Iterable<String>) lines::iterator) { | |
// End parsing when we can | |
if (missingHeaders.isEmpty()) { | |
break; | |
} | |
// Mail body begins | |
if (line.isEmpty()) { | |
break; | |
} | |
// Some headers are split with line breaks followed by whitespace indentation | |
if ((headerName != null) && Character.isWhitespace(line.charAt(0))) { | |
foundHeaders.computeIfPresent( | |
headerName, | |
(_headerName, headerValue) -> headerValue + line.trim()); | |
continue; | |
} | |
String[] header = line.split(": ", 2); | |
if (header.length < 2) { | |
System.err.println("Unknown header line '" + line + "' in " + path); | |
continue; | |
} | |
// Skip repeated headers for now; in principle it would be better to warn. | |
headerName = header[0]; | |
String headerValue = header[1]; | |
if (missingHeaders.contains(headerName)) { | |
missingHeaders.remove(headerName); | |
foundHeaders.put(headerName, headerValue); | |
} | |
} | |
return foundHeaders; | |
} | |
} | |
/** | |
* Inspired by: | |
* | |
* http://trevershick.github.io/java8/2015/12/02/reservoir-based-sampling.html | |
* | |
* (I don't presume to know much about stream collectors, but I thought that | |
* it was pretty cool to make it this way for a concise, generic solution that | |
* reads well in {@link #sample(Integer)}.) | |
* | |
* I've only changed variable names, indentation and injection of {@link Random}. | |
*/ | |
private class ReservoirSampler<T> implements Collector<T, List<T>, List<T>> { | |
private final Random random; | |
private final int nSamples; | |
private int count = 0; | |
public ReservoirSampler(final Random random, final int nSamples) { | |
this.random = random; | |
this.nSamples = nSamples; | |
} | |
private void addIt(final List<T> in, T s) { | |
if (in.size() < nSamples) { | |
in.add(s); | |
} | |
else { | |
int replaceInIndex = (int) (random.nextDouble() * (nSamples + (count++) + 1)); | |
if (replaceInIndex < nSamples) { | |
in.set(replaceInIndex, s); | |
} | |
} | |
} | |
@Override | |
public Supplier<List<T>> supplier() { | |
return ArrayList::new; | |
} | |
@Override | |
public BiConsumer<List<T>, T> accumulator() { | |
return this::addIt; | |
} | |
@Override | |
public BinaryOperator<List<T>> combiner() { | |
return (left, right) -> { | |
left.addAll(right); | |
return left; | |
}; | |
} | |
@Override | |
public Set<java.util.stream.Collector.Characteristics> characteristics() { | |
return EnumSet.of( | |
Collector.Characteristics.UNORDERED, | |
Collector.Characteristics.IDENTITY_FINISH); | |
} | |
@Override | |
public Function<List<T>, List<T>> finisher() { | |
return Function.identity(); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.nio.file.Path; | |
import java.util.*; | |
public class MailInfo implements IMailInfo { | |
private final List<String> to; | |
private final String from; | |
private final String subject; | |
private final Date time; | |
private final Path path; | |
public MailInfo(final List<String> to, final String from, final String subject, final Date time, final Path path) { | |
this.to = to; | |
this.from = from; | |
this.subject = subject; | |
this.time = time; | |
this.path = path; | |
} | |
@Override | |
public List<String> getTo() { | |
return to; | |
} | |
@Override | |
public String getFrom() { | |
return from; | |
} | |
@Override | |
public String getSubject() { | |
return subject; | |
} | |
@Override | |
public Date getTime() { | |
return time; | |
} | |
@Override | |
public String pretty() { | |
return | |
"From: " + from + "\n" + | |
"To: " + String.join(", ", to) + "\n" + | |
"Subject: " + subject + "\n" + | |
"Date: " + time + "\n" + | |
"\nSee the full email at " + path + "\n" | |
; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment