Skip to content

Instantly share code, notes, and snippets.

@steghio
Last active April 19, 2018 12:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save steghio/015dac3326d7cfe63223d16e60d2b515 to your computer and use it in GitHub Desktop.
Save steghio/015dac3326d7cfe63223d16e60d2b515 to your computer and use it in GitHub Desktop.
File utilities
package com.blogspot.groglogs.fileutils.structures;
import java.util.Objects;
public class FileEntry implements Comparable<FileEntry>{
public long timestamp;
public String data;
public String filename;
public FileEntry(long timestamp, String data, String filename){
this.timestamp = timestamp;
this.data = data;
this.filename = filename;
}
@Override
public boolean equals(Object other) {
if (other == this) return true;
if (!(other instanceof FileEntry)) {
return false;
}
FileEntry fe = (FileEntry) other;
return this.timestamp == fe.timestamp && this.data == fe.data && this.filename == fe.filename;
}
@Override
public int hashCode() {
return Objects.hash(timestamp, data, filename);
}
@Override
public int compareTo(FileEntry other) {
if(this.timestamp < other.timestamp) return -1;
if(this.timestamp == other.timestamp) return 0;
return 1;
}
}
package com.blogspot.groglogs.fileutils;
import com.blogspot.groglogs.fileutils.structures.FileEntry;
import java.io.*;
import java.util.*;
public class FileUtils {
//helper for mergeFiles
//Items are # separated, the timestamp is always the first item
//If item is malformed, have it raise the exception itself
private static Long getTimestamp(String line){
StringBuilder sb = new StringBuilder();
for(int i = 0; i < line.length(); i++){
char c = line.charAt(i);
if(c == '#') break;
sb.append(c);
}
return Long.valueOf(sb.toString());
}
//helper for mergeFiles
//reads a line from the input reader and stores it in the min heap
//returns what has been read so that the caller can handle removal from the map if needed
//If we encounter an error during read, have it propagate the exception
private static String processLine(BufferedReader br, PriorityQueue<FileEntry> sortedLines, String key) throws IOException{
String line = br.readLine();
//only process the line if there is data
if(line != null){
Long timestamp = getTimestamp(line);
FileEntry ff = new FileEntry(timestamp, line, key);
sortedLines.add(ff);
}
//return the line so that the caller can handle the map removal
return line;
}
//helper for mergeSortedFiles
private static void mergeFiles(HashMap<String, BufferedReader> readers, BufferedWriter bw){
PriorityQueue<FileEntry> sortedLines = new PriorityQueue<>(readers.size());
//initialize min-heap with the first element from each file
//use an iterator, if any file is empty we can remove it from the map
for(Iterator<Map.Entry<String, BufferedReader>> it = readers.entrySet().iterator(); it.hasNext(); ) {
Map.Entry<String, BufferedReader> entry = it.next();
BufferedReader br = entry.getValue();
try {
String line = processLine(br, sortedLines, entry.getKey());
//if null, we reached the end of file, remove the item from the map and close the reader
if (line == null){
it.remove();
br.close();
}
}catch(IOException e){
System.err.println("Error while reading line from file: " + entry.getKey() + " - " + e.getMessage());
throw new RuntimeException("Aborting processing due to read errors. Check log for details");
}
}
//keep looping over the heap until it's empty
//every time we read a line, get the next one from the same file and add it to the heap
while(!sortedLines.isEmpty()){
FileEntry fe = sortedLines.poll();
//write this entry to the out file
try {
bw.write(fe.data);
bw.newLine();
}catch(IOException e){
System.err.println("Error while writing line to output file: " + e.getMessage());
throw new RuntimeException("Aborting processing due to write errors. Check log for details");
}
//read, if any, the next line from the same file and put it in the min heap
BufferedReader br = readers.get(fe.filename);
if(br != null){
try {
String line = processLine(br, sortedLines, fe.filename);
//if null, we reached the end of file, remove the item from the map and close the reader
if (line == null){
readers.remove(fe.filename);
br.close();
}
}catch(IOException e){
System.err.println("Error while reading line from file: " + fe.filename + " - " + e.getMessage());
throw new RuntimeException("Aborting processing due to read errors. Check log for details");
}
}
//if the reader is null, remove it from the map
else{
readers.remove(fe.filename);
}
}
//close out file
try {
bw.close();
}catch(IOException e){
throw new RuntimeException("Error while closing out file: " + e.getMessage());
}
}
/*
Given N files where each line contains a timestamp (long) and some data sorted in ascending order
merge them in a single file also sorted in ascending order.
Files can be arbitrarily big and are passed in input as a full path list.
File content has the format:
timestamp#data
where timestamp is a long
*/
public static void mergeSortedFiles(List<String> files, String outfile){
if(files == null || files.size() < 2) throw new IllegalArgumentException("Input file list must contain at least two files");
if(outfile == null || outfile.isEmpty()) throw new IllegalArgumentException("Must specify output file");
//create output file
File f = new File(outfile);
BufferedWriter bw;
try{
if(!f.createNewFile()) throw new IllegalArgumentException("Output file: " + outfile + " already exists");
bw = new BufferedWriter(new FileWriter(f));
}catch(IOException e){
throw new RuntimeException("Error while creating out file: " + outfile + e.getMessage());
}
//map from filename to its reader
HashMap<String, BufferedReader> readers = new HashMap<>();
List<String> errorFiles = new ArrayList<>();
//open each file in read mode and keep a reader for each one
for(String file : files) {
try{
BufferedReader br = new BufferedReader(new FileReader(file));
readers.put(file, br);
}catch (FileNotFoundException e) {
errorFiles.add("File: " + file + " not found: " + e.getMessage());
}
}
//print out any error we encountered, but do not fail the operation
if(!errorFiles.isEmpty()){
for(String s : errorFiles) System.err.println(s);
}
//abort execution only if no file or only one file could be successfully opened, delete output file too
if(readers.isEmpty()){
f.delete();
throw new RuntimeException("No input file could be opened. See error log for details");
}
if(readers.size() < 2){
f.delete();
throw new RuntimeException("At least two files need to be successfully opened for merging. See error log for details");
}
//merge the files
mergeFiles(readers, bw);
//close each remaining file, if any, and log errors but do not abort the operation
errorFiles = new ArrayList<>();
for(Map.Entry entry : readers.entrySet()){
try {
((BufferedReader)entry.getValue()).close();
}catch (IOException e){
errorFiles.add("Error while closing file reader: " + entry.getKey() + " - " + e.getMessage());
}
}
//log any error encountered while closing the files
if(!errorFiles.isEmpty()){
for(String s : errorFiles) System.err.println(s);
}
}
}
package com.blogspot.groglogs.test.fileutils;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
import static com.blogspot.groglogs.fileutils.FileUtils.mergeSortedFiles;
import static org.junit.Assert.assertEquals;
public class MergeFilesJTests {
List<String> files;
String outfile;
@Test
public void badInput() {
files = null;
try{
mergeSortedFiles(files, outfile);
}catch(IllegalArgumentException e){
System.out.println("null input file list got IllegalArgumentException: " + e.getMessage());
}
files = new ArrayList<>();
try{
mergeSortedFiles(files, outfile);
}catch(IllegalArgumentException e){
System.out.println("Empty input file list got IllegalArgumentException: " + e.getMessage());
}
files = new ArrayList<>();
files.add("/home/grog/Desktop/file1");
try{
mergeSortedFiles(files, outfile);
}catch(IllegalArgumentException e){
System.out.println("single element input file list got IllegalArgumentException: " + e.getMessage());
}
files = new ArrayList<>();
files.add("/home/grog/Desktop/file1");
files.add("/home/grog/Desktop/file2");
outfile = null;
try{
mergeSortedFiles(files, outfile);
}catch(IllegalArgumentException e){
System.out.println("null out file got IllegalArgumentException: " + e.getMessage());
}
files = new ArrayList<>();
files.add("/home/grog/Desktop/file1");
files.add("/home/grog/Desktop/file2");
outfile = "";
try{
mergeSortedFiles(files, outfile);
}catch(IllegalArgumentException e){
System.out.println("empty out file got IllegalArgumentException: " + e.getMessage());
}
files = new ArrayList<>();
files.add("/home/grog/Desktop/file1");
files.add("/home/grog/Desktop/file2");
outfile = "/home/grog/Desktop/file1";
try{
mergeSortedFiles(files, outfile);
}catch(IllegalArgumentException e){
System.out.println("out file already exists got IllegalArgumentException: " + e.getMessage());
}
}
@Test
public void merge3files() {
files = new ArrayList<>();
files.add("/home/grog/Desktop/file1");
files.add("/home/grog/Desktop/file2");
files.add("/home/grog/Desktop/file3");
outfile = "/home/grog/Desktop/outfile";
mergeSortedFiles(files, outfile);
}
}
@steghio
Copy link
Author

steghio commented Nov 21, 2017

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment