Skip to content

Instantly share code, notes, and snippets.

@irineu
Last active December 11, 2016 18:11
Show Gist options
  • Save irineu/856b0cf716d1815b44c27198a93ef74a to your computer and use it in GitHub Desktop.
Save irineu/856b0cf716d1815b44c27198a93ef74a to your computer and use it in GitHub Desktop.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/*
* This code consists in:
*
* 1- Calculate the max lines that can be read to RAM
* 2- After read the max lines, sort them and save that piece of sorted list in a new file
* 3- Repeat process 1 and 2 until reach at the end of input file
* 4- Take 2 files of sorted pieces, read both first lines, compare who is greater then append in new file. Repeat this process for the next lines until both files be complete compared and appended into the new file.
* 5- Repeat step 4 for all next pairs of sorted pieces files
* 6- After "merge" all files sorted pieces, we have a half of files from them (merged in this case), repeat the 4 and 5 until the merged file became a single file
* 7- For check purpose, we can read the final file (complete sorted), and compare the first line to the next and repeat this process until reach the end of file to validate the result.
*/
public class ExternalSort {
/**
* These params are used to get the average size (in bytes) of each line to be stored
*/
private final static int WORDS_PER_LINE = 16;
private final static int AVERAGE_LETTERS_PER_WORDS = 12;
/**
* The max available memory (in MB)
*/
private final static int MAX_MEMORY = 5;
/**
* The max lines that can be readable at time
*/
private static int MAX_LINES = (MAX_MEMORY * (1024 * 1024)) / (WORDS_PER_LINE * AVERAGE_LETTERS_PER_WORDS);
/**
* Total of read lines after split, will be used for validate the result
*/
private static int TOTAL_LINES = 0;
private static boolean DONE = false;
/**
* External Sort Class, will start at constructor
* @param path The path of input file to be sorted
*/
public ExternalSort(String path) {
resetTempDir();
List<String> splitFiles = split(path);
String resultPath = externalMergeSort(splitFiles);
DONE = true;
checkResult(resultPath);
}
/**
* Will split a big file into small parts based on MAX_LINES
* @param path The path of input file to be sorted
* @return
*/
private List<String> split(String path) {
List<String> files = new ArrayList<>();
//Read the file
try (BufferedReader br = new BufferedReader(new FileReader(path))) {
int index=0;
double startTime = System.currentTimeMillis();
while (true) {
List<String> chunk = new ArrayList<>();
String line = null;
//Add the lines to a list until reach the MAX_LINES size or reach the end of file
for(int i=0; i<MAX_LINES && (line = br.readLine()) != null;i++){
chunk.add(line);
TOTAL_LINES++;
}
//Sort the list, i think of write an in-memory sort algorithm is not the propose of this test, so i've used the std java to sort
Collections.sort(chunk, String.CASE_INSENSITIVE_ORDER);
String outputFile = System.getProperty("user.dir") + "/temp/chunk-"+index;
//Save the list into a file
BufferedWriter bw = new BufferedWriter(new FileWriter(outputFile));
for( String s : chunk){
bw.write(s);
bw.newLine();
}
bw.close();
//Add the chunk path to a list
files.add(outputFile);
//if there is no more lines (the for-loop does not found a next line), break the loop
if(line == null) break;
index++;
}
System.out.format("STEP 1 Completed! ");
System.out.format("Readed lines: %d, ",TOTAL_LINES);
System.out.format("Total of chunks: %d, ",files.size());
System.out.format("Max lines per chunk: %d ,",MAX_LINES);
System.out.format("Time spent: %.0fms \r\n", System.currentTimeMillis() - startTime);
} catch (FileNotFoundException e) {
System.err.println("The file in the given path was not found");
System.exit(1);
} catch (IOException e) {
e.printStackTrace();
System.exit(1);
}
return files;
}
/**
* Will recursively merge a list of files into a single file
* It's a shorthand for start from depth 0
* @param files Must be a list of paths
* @return
*/
private String externalMergeSort(List<String> files) {
return externalMergeSort(files,0);
}
/**
* Will recursively merge a list of files into a single file
*
* @param files Must be a list of paths
* @param depth Will set a prefix to the file to specify the recursive depth
* @return
*/
private String externalMergeSort(List<String> files, int depth) {
double stepStartTime = System.currentTimeMillis();
//Chunks will be mapped here
List<String> mergedFiles = new ArrayList<>();
//Will jump 2 by 2, for check files A-B C-D E-F ...
for(int i=0;i<files.size();i=i+2){
String fileAB = System.getProperty("user.dir") + "/temp/"+depth+"-chunk-"+i+"_"+(i+1);
try{
BufferedReader brA = new BufferedReader(new FileReader(files.get(i)));
BufferedReader brB = null;
//check if total of chunks not is pair
if(i+1 < files.size())
brB = new BufferedReader(new FileReader(files.get(i+1)));
BufferedWriter bw = new BufferedWriter(new FileWriter(fileAB));
String lineA = null;
String lineB = null;
//if brB is null (total os chunks aren't pair and this is the last one)
if(brB !=null){
while(true){
//Read the first/next line of both files only if it's null
//it may not null if the last iteration the line was greater than the other
if(lineA == null) lineA = brA.readLine();
if(lineB == null) lineB = brB.readLine();
//if both == null, we are at EOF of both files, break the loop for move forward
if(lineA == null && lineB == null) break;
if(lineA != null && lineB != null){
//Check each one and append who is lower than other
//the greater line will stay for the next iteration
//if both are equal, append both
if(lineA.compareToIgnoreCase(lineB) > 0){
bw.write(lineB);
lineB = null;
}else if(lineA.compareToIgnoreCase(lineB) < 0){
bw.write(lineA);
lineA = null;
}else{
bw.write(lineA);
bw.write(lineB);
lineA = null;
lineB = null;
}
}else{
//If one line is null but the other not, maybe one file is completely scanned
//In this case, we should append the line where isn't null
if(lineA!=null){
bw.write(lineA);
lineA = null;
}else{
bw.write(lineB);
lineB = null;
}
}
bw.newLine();
}
}else{
// brB is null, total os chunks aren't pair and this is the last one
// So let's append brA on this file
while((lineA = brA.readLine()) !=null){
bw.write(lineA);
bw.newLine();
}
}
bw.close();
//add the merged file chunk (A+B) to a list
mergedFiles.add(fileAB);
}catch(Exception e){
e.printStackTrace();
}
}
System.out.format("Depth %d,Time spent: %.0fms \r\n", depth,System.currentTimeMillis() - stepStartTime);
//Remove all old temp files
for(String f : files){
File tempFile = new File(f);
tempFile.delete();
}
//if have more than 1 chunk file, we need re-merge until it became a single file
if(mergedFiles.size() > 1){
return externalMergeSort(mergedFiles, depth+1);
}else{
//We have a final merged file
return mergedFiles.get(0);
}
}
/**
* Create a temp dir (if necessary) and/or remove all (old temp) files from it
*/
private void resetTempDir() {
File tempPath = new File(System.getProperty("user.dir") + "/temp");
if(!tempPath.exists()){
tempPath.mkdir();
}else if(tempPath.isFile()){
tempPath.delete();
tempPath.mkdir();
}
String internalFiles[] = tempPath.list();
for(String chunkPath: internalFiles){
File chunkFile = new File(tempPath.getPath(),chunkPath);
chunkFile.delete();
}
}
/**
* Check a sorted file, line per line to validate
* Any exception will result in the software termination
*
* @param outputPath Must be the path of a sorted file
*/
private void checkResult(String outputPath) {
try (BufferedReader br = new BufferedReader(new FileReader(outputPath))) {
String line = null;
String lastLine = null;
int lineCount = 0;
while((line = br.readLine()) != null){
if(lastLine == null){
lastLine = line;
}
if(lastLine.compareToIgnoreCase(line) > 0){
System.err.println("Woops! The sort solution does not work!");
System.exit(1);
}
lastLine = line;
lineCount++;
}
System.out.println("Everything is OK! Checked lines: "+lineCount);
} catch (FileNotFoundException e) {
System.err.format("File output path not found, please check if %s exists\r\n",outputPath);
e.printStackTrace();
System.exit(1);
} catch (IOException e) {
e.printStackTrace();
System.exit(1);
}
}
/**
* It will just print some useful information about JVM memory and process duration
*/
private static void startMemoryMonitor() {
new Thread(new Runnable() {
private long lastLog = -1;
private long min = -1;
private long max = 0;
double startTime = System.currentTimeMillis();
@Override
public void run() {
while(!DONE){
try {
System.gc();
Runtime rt = Runtime.getRuntime();
long memoryLog = (rt.totalMemory() - rt.freeMemory()) / 1024 / 1024;
if(memoryLog != lastLog){
lastLog = memoryLog;
System.out.print("JVM Current Used Memory:"+memoryLog+"mb\r");
}
if(min == -1) min = memoryLog;
if(memoryLog < min) min = memoryLog;
if(memoryLog > max) max = memoryLog;
Thread.sleep(10);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
System.out.format("Spent time to complete all operation: %.0fms, JVM min memory: %dmb - peak: %dmb \r\n",System.currentTimeMillis() - startTime, min,max);
}
}).start();
}
public static void main(String[] args) {
if(args.length == 1){
startMemoryMonitor();
new ExternalSort(args[0]);
}else{
System.err.println("You must specify the path o the file e.g: java ExternalSort /home/user/input.txt");
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment