Created
August 2, 2018 13:14
-
-
Save awwsmm/6dbd9899ee082106e00d1266be16e1ff to your computer and use it in GitHub Desktop.
Finds the last non-empty line/row of a CSV or XLS(X) file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File; | |
import java.io.FileNotFoundException; | |
import java.io.IOException; | |
import java.io.RandomAccessFile; | |
import org.apache.poi.ss.usermodel.Cell; | |
import org.apache.poi.ss.usermodel.DataFormatter; | |
import org.apache.poi.ss.usermodel.FormulaEvaluator; | |
import org.apache.poi.ss.usermodel.Row; | |
import org.apache.poi.ss.usermodel.Sheet; | |
import org.apache.poi.xssf.usermodel.XSSFWorkbook; | |
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; | |
/// !!! depends on SimplestCSVParser : http://bit.ly/2AhVKaF | |
/// !!! depends on CountLines : http://bit.ly/2vsa36T | |
// Note: tokens containing only whitespace or whitespace in quotes are considered "empty" | |
// so a CSV line like: ""," ",'',, | |
// ...would be considered an empty line | |
public class LastLine { | |
// private constructor in utility class | |
private LastLine(){} | |
public static Integer inFile (String fileName) { | |
//-------------------------------------------------------------------------- | |
// CSV file | |
//-------------------------------------------------------------------------- | |
if (fileName.endsWith(".csv")) { | |
final File file = new File(fileName); | |
RandomAccessFile in = null; | |
try { // try to open the file as random-access | |
in = new RandomAccessFile(file, "r"); | |
} catch (FileNotFoundException ex) { | |
System.err.println("File cannot be found: '" + fileName + "'"); | |
return null; | |
} | |
// length of the file in bytes | |
final int fileLength = (int) file.length(); | |
// start with this, double if there aren't any newlines in here | |
int maxBytes = 1024; | |
// read the last 1% of the file, or the last kB, whichever is smaller | |
int bytesToRead = Math.min((int)(0.01*fileLength), maxBytes); | |
// skip the last N characters in this buffer | |
// (use this when we've already checked lines at the end of the buffer) | |
int skipLastNBytes = 0; | |
// how many lines have we skipped from the end? | |
int currentLine = 0; | |
// did we find the last non-blank line? If so, we're done! | |
int lastLine = -1; | |
// loop until we find the last line | |
// break if we read the entire file and can't find any data | |
while (lastLine < 0) { | |
// make sure that bytesToRead isn't longer than the file | |
bytesToRead = Math.min(fileLength, maxBytes); | |
// move backwards so we can read forwards | |
int currentFilePos = fileLength - bytesToRead; | |
// don't read the last few bytes at the end | |
// (if we've already found that these are blank lines) | |
bytesToRead -= skipLastNBytes; | |
// byte array read from file | |
byte[] bytes = new byte[bytesToRead]; | |
try { // seek to position within file | |
in.seek(currentFilePos); | |
} catch (IOException ex) { | |
System.err.println("Problem seeking to end of file."); | |
return null; | |
} | |
try { // read bytes into array | |
in.read(bytes); | |
} catch (IOException ex) { | |
System.err.println("Problem reading from file."); | |
return null; | |
} | |
// check that bytes contains at least one newline | |
int lineStart = -1; | |
int lineEnd = -1; | |
// assume this line is blank | |
boolean isBlankLine = true; | |
// loop backwards over bytes | |
for (int ii = bytesToRead-1; ii >= 0; --ii) { | |
if (bytes[(int)ii] == 0xA) { // line break | |
if (lineEnd >= 0) { | |
lineStart = ii+1; | |
break; | |
} else { | |
lineEnd = ii+1; | |
} | |
} else continue; | |
} | |
if (lineStart == -1) { | |
if (lineEnd != -1) { | |
// this line starts at the beginning of the file | |
if (currentFilePos == 0) { | |
lineStart = 0; | |
// couldn't find line start because buffer cut it off | |
} else { | |
maxBytes *= 2; | |
continue; | |
} | |
// have we read the whole file? it must be empty, then | |
} else if (bytesToRead == 0) { | |
break; | |
// if neither line end nor beginning could be found | |
} else { | |
maxBytes *= 2; | |
continue; | |
} | |
} | |
// convert line to parsed CSV tokens | |
String[] tokens = parseLineOfCSV(new String(bytes, lineStart, lineEnd-lineStart)); | |
// loop over tokens -- if any values are found, then the line isn't empty | |
for (String token : tokens) { | |
// if token is all whitespace, move to next token | |
String trimmedToken = token.trim(); | |
if (trimmedToken.equals("")) continue; | |
// if token is quotes "" or single-quotes '' surrounding whitespace, also empty | |
if ( | |
( ( trimmedToken.startsWith("\"") && trimmedToken.endsWith("\"") ) | |
|| ( trimmedToken.startsWith("'" ) && trimmedToken.endsWith("'" ) ) ) | |
&& ( trimmedToken.substring(1, trimmedToken.length()-1).trim().equals("")) | |
) continue; | |
// token must contain something -> line isn't blank | |
isBlankLine = false; | |
break; | |
} | |
// if we found a non-blank line, quit! | |
if (!isBlankLine) { | |
lastLine = currentLine; | |
break; | |
// if this is a blank line, skip it the next time we read | |
} else { | |
skipLastNBytes += (lineEnd-lineStart); | |
currentLine++; | |
continue; // try again | |
} | |
} | |
if (lastLine < 0) return lastLine; | |
return nLinesInTextFile(fileName) - lastLine - 1; | |
//-------------------------------------------------------------------------- | |
// XLS(X) file | |
//-------------------------------------------------------------------------- | |
} else if (fileName.endsWith(".xls") || fileName.endsWith(".xlsx")) { | |
// get the Workbook | |
XSSFWorkbook wb = null; | |
try { | |
wb = new XSSFWorkbook(new File(fileName)); | |
} catch (InvalidFormatException ex) { | |
System.err.println("File has *.xls or *.xlsx extension, but isn't actually an Excel file."); | |
return null; | |
} catch (FileNotFoundException ex) { | |
System.err.println("Couldn't open file '" + fileName + "'\n"); | |
return null; | |
} catch (IOException ex) { | |
System.err.println("Couldn't open file '" + fileName + "'\n"); | |
return null; | |
} | |
// get the first sheet /// only the first sheet! | |
Sheet sheet = wb.getSheetAt(0); | |
// get the number of rows | |
if (sheet == null) return -1; | |
// get last row, which may or may not be empty | |
int lastRow = sheet.getLastRowNum(); | |
// DataFormatter formats any type of cell as a String cell | |
DataFormatter fmt = new DataFormatter(); | |
// FormulaEvaluator evaluates formulae in cells | |
FormulaEvaluator eval = wb.getCreationHelper().createFormulaEvaluator(); | |
// last non-empty row | |
int lastLine = -1; | |
// loop over rows to find last non-empty one | |
for (int rr = lastRow; rr >= 0; rr--) { | |
Row row = sheet.getRow(rr); | |
if (row == null) continue; | |
// loop over columns in this row | |
int nCols = row.getLastCellNum(); | |
for (int cc = 0; cc < nCols; ++cc) { | |
Cell cell = row.getCell(cc); | |
eval.evaluate(cell); | |
String token = fmt.formatCellValue(cell, eval); | |
// if token is all whitespace, move to next token | |
String trimmedToken = token.trim(); | |
if (trimmedToken.equals("")) continue; | |
// if token is quotes "" or single-quotes '' surrounding whitespace, also empty | |
if ( | |
( ( trimmedToken.startsWith("\"") && trimmedToken.endsWith("\"") ) | |
|| ( trimmedToken.startsWith("'" ) && trimmedToken.endsWith("'" ) ) ) | |
&& ( trimmedToken.substring(1, trimmedToken.length()-1).trim().equals("")) | |
) continue; | |
// token must contain something -> line isn't blank | |
lastLine = rr; | |
break; | |
} | |
if (lastLine > -1) break; | |
} | |
return lastLine; | |
} else { | |
System.err.println("Unrecognized file type given to Calculate.lastLineInFile() ('" | |
+ fileName + "'). Returning -1."); | |
return -1; | |
} | |
} | |
/// copied from SimplestCSVParser : http://bit.ly/2AhVKaF | |
public static String[] parseLineOfCSV (String line) { | |
String regex = "(?:,|\\n|^)((?:\"(?:(?:\"\")*(?:[^\"])*)*\")|(?:[^\",\\n]*)|(?:\\n|$))"; | |
Matcher matcher = Pattern.compile(regex).matcher(line); | |
ArrayList<String> list = new ArrayList<>(); | |
while (matcher.find()) list.add(matcher.group(1)); | |
if (line.charAt(0) == ',') list.add(0, ""); | |
// and convert to an array of Strings | |
String[] tokens = new String[list.size()]; | |
tokens = list.toArray(tokens); | |
return tokens; | |
} | |
/// copied from CountLines : http://bit.ly/2vsa36T | |
public static int nLinesInTextFile (String fileName) { | |
try (InputStream is = new BufferedInputStream(new FileInputStream(fileName))) { | |
final int bufferSize = 4096; // 4kB | |
byte[] c = new byte[bufferSize]; | |
int readChars = is.read(c); | |
if (readChars == -1) return 0; | |
int count = 0; | |
while (readChars == bufferSize) { | |
for (int ii = 0; ii < bufferSize; ) | |
if (c[ii++] == '\n') ++count; | |
readChars = is.read(c); | |
} | |
while (readChars != -1) { | |
for (int ii = 0; ii < readChars; ++ii) | |
if (c[ii] == '\n') ++count; | |
readChars = is.read(c); | |
} | |
return count == 0 ? 1 : count; | |
} catch (IOException ex) { | |
System.err.println("Cannot open file."); | |
return -1; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment