Skip to content

Instantly share code, notes, and snippets.

@awwsmm
Created August 2, 2018 13:14
Show Gist options
  • Save awwsmm/6dbd9899ee082106e00d1266be16e1ff to your computer and use it in GitHub Desktop.
Save awwsmm/6dbd9899ee082106e00d1266be16e1ff to your computer and use it in GitHub Desktop.
Finds the last non-empty line/row of a CSV or XLS(X) file
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.FormulaEvaluator;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
/// !!! depends on SimplestCSVParser : http://bit.ly/2AhVKaF
/// !!! depends on CountLines : http://bit.ly/2vsa36T
// Note: tokens containing only whitespace or whitespace in quotes are considered "empty"
// so a CSV line like: ""," ",'',,
// ...would be considered an empty line
public class LastLine {
// private constructor in utility class
private LastLine(){}
public static Integer inFile (String fileName) {
//--------------------------------------------------------------------------
// CSV file
//--------------------------------------------------------------------------
if (fileName.endsWith(".csv")) {
final File file = new File(fileName);
RandomAccessFile in = null;
try { // try to open the file as random-access
in = new RandomAccessFile(file, "r");
} catch (FileNotFoundException ex) {
System.err.println("File cannot be found: '" + fileName + "'");
return null;
}
// length of the file in bytes
final int fileLength = (int) file.length();
// start with this, double if there aren't any newlines in here
int maxBytes = 1024;
// read the last 1% of the file, or the last kB, whichever is smaller
int bytesToRead = Math.min((int)(0.01*fileLength), maxBytes);
// skip the last N characters in this buffer
// (use this when we've already checked lines at the end of the buffer)
int skipLastNBytes = 0;
// how many lines have we skipped from the end?
int currentLine = 0;
// did we find the last non-blank line? If so, we're done!
int lastLine = -1;
// loop until we find the last line
// break if we read the entire file and can't find any data
while (lastLine < 0) {
// make sure that bytesToRead isn't longer than the file
bytesToRead = Math.min(fileLength, maxBytes);
// move backwards so we can read forwards
int currentFilePos = fileLength - bytesToRead;
// don't read the last few bytes at the end
// (if we've already found that these are blank lines)
bytesToRead -= skipLastNBytes;
// byte array read from file
byte[] bytes = new byte[bytesToRead];
try { // seek to position within file
in.seek(currentFilePos);
} catch (IOException ex) {
System.err.println("Problem seeking to end of file.");
return null;
}
try { // read bytes into array
in.read(bytes);
} catch (IOException ex) {
System.err.println("Problem reading from file.");
return null;
}
// check that bytes contains at least one newline
int lineStart = -1;
int lineEnd = -1;
// assume this line is blank
boolean isBlankLine = true;
// loop backwards over bytes
for (int ii = bytesToRead-1; ii >= 0; --ii) {
if (bytes[(int)ii] == 0xA) { // line break
if (lineEnd >= 0) {
lineStart = ii+1;
break;
} else {
lineEnd = ii+1;
}
} else continue;
}
if (lineStart == -1) {
if (lineEnd != -1) {
// this line starts at the beginning of the file
if (currentFilePos == 0) {
lineStart = 0;
// couldn't find line start because buffer cut it off
} else {
maxBytes *= 2;
continue;
}
// have we read the whole file? it must be empty, then
} else if (bytesToRead == 0) {
break;
// if neither line end nor beginning could be found
} else {
maxBytes *= 2;
continue;
}
}
// convert line to parsed CSV tokens
String[] tokens = parseLineOfCSV(new String(bytes, lineStart, lineEnd-lineStart));
// loop over tokens -- if any values are found, then the line isn't empty
for (String token : tokens) {
// if token is all whitespace, move to next token
String trimmedToken = token.trim();
if (trimmedToken.equals("")) continue;
// if token is quotes "" or single-quotes '' surrounding whitespace, also empty
if (
( ( trimmedToken.startsWith("\"") && trimmedToken.endsWith("\"") )
|| ( trimmedToken.startsWith("'" ) && trimmedToken.endsWith("'" ) ) )
&& ( trimmedToken.substring(1, trimmedToken.length()-1).trim().equals(""))
) continue;
// token must contain something -> line isn't blank
isBlankLine = false;
break;
}
// if we found a non-blank line, quit!
if (!isBlankLine) {
lastLine = currentLine;
break;
// if this is a blank line, skip it the next time we read
} else {
skipLastNBytes += (lineEnd-lineStart);
currentLine++;
continue; // try again
}
}
if (lastLine < 0) return lastLine;
return nLinesInTextFile(fileName) - lastLine - 1;
//--------------------------------------------------------------------------
// XLS(X) file
//--------------------------------------------------------------------------
} else if (fileName.endsWith(".xls") || fileName.endsWith(".xlsx")) {
// get the Workbook
XSSFWorkbook wb = null;
try {
wb = new XSSFWorkbook(new File(fileName));
} catch (InvalidFormatException ex) {
System.err.println("File has *.xls or *.xlsx extension, but isn't actually an Excel file.");
return null;
} catch (FileNotFoundException ex) {
System.err.println("Couldn't open file '" + fileName + "'\n");
return null;
} catch (IOException ex) {
System.err.println("Couldn't open file '" + fileName + "'\n");
return null;
}
// get the first sheet /// only the first sheet!
Sheet sheet = wb.getSheetAt(0);
// get the number of rows
if (sheet == null) return -1;
// get last row, which may or may not be empty
int lastRow = sheet.getLastRowNum();
// DataFormatter formats any type of cell as a String cell
DataFormatter fmt = new DataFormatter();
// FormulaEvaluator evaluates formulae in cells
FormulaEvaluator eval = wb.getCreationHelper().createFormulaEvaluator();
// last non-empty row
int lastLine = -1;
// loop over rows to find last non-empty one
for (int rr = lastRow; rr >= 0; rr--) {
Row row = sheet.getRow(rr);
if (row == null) continue;
// loop over columns in this row
int nCols = row.getLastCellNum();
for (int cc = 0; cc < nCols; ++cc) {
Cell cell = row.getCell(cc);
eval.evaluate(cell);
String token = fmt.formatCellValue(cell, eval);
// if token is all whitespace, move to next token
String trimmedToken = token.trim();
if (trimmedToken.equals("")) continue;
// if token is quotes "" or single-quotes '' surrounding whitespace, also empty
if (
( ( trimmedToken.startsWith("\"") && trimmedToken.endsWith("\"") )
|| ( trimmedToken.startsWith("'" ) && trimmedToken.endsWith("'" ) ) )
&& ( trimmedToken.substring(1, trimmedToken.length()-1).trim().equals(""))
) continue;
// token must contain something -> line isn't blank
lastLine = rr;
break;
}
if (lastLine > -1) break;
}
return lastLine;
} else {
System.err.println("Unrecognized file type given to Calculate.lastLineInFile() ('"
+ fileName + "'). Returning -1.");
return -1;
}
}
/// copied from SimplestCSVParser : http://bit.ly/2AhVKaF
public static String[] parseLineOfCSV (String line) {
String regex = "(?:,|\\n|^)((?:\"(?:(?:\"\")*(?:[^\"])*)*\")|(?:[^\",\\n]*)|(?:\\n|$))";
Matcher matcher = Pattern.compile(regex).matcher(line);
ArrayList<String> list = new ArrayList<>();
while (matcher.find()) list.add(matcher.group(1));
if (line.charAt(0) == ',') list.add(0, "");
// and convert to an array of Strings
String[] tokens = new String[list.size()];
tokens = list.toArray(tokens);
return tokens;
}
/// copied from CountLines : http://bit.ly/2vsa36T
public static int nLinesInTextFile (String fileName) {
try (InputStream is = new BufferedInputStream(new FileInputStream(fileName))) {
final int bufferSize = 4096; // 4kB
byte[] c = new byte[bufferSize];
int readChars = is.read(c);
if (readChars == -1) return 0;
int count = 0;
while (readChars == bufferSize) {
for (int ii = 0; ii < bufferSize; )
if (c[ii++] == '\n') ++count;
readChars = is.read(c);
}
while (readChars != -1) {
for (int ii = 0; ii < readChars; ++ii)
if (c[ii] == '\n') ++count;
readChars = is.read(c);
}
return count == 0 ? 1 : count;
} catch (IOException ex) {
System.err.println("Cannot open file.");
return -1;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment