Last active
June 30, 2020 04:07
-
-
Save VenkataRaju/10702187 to your computer and use it in GitHub Desktop.
Parses Excel(only .xlsx) file in SAX style
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package excelparser; | |
import com.google.common.base.Joiner; | |
import static com.google.common.base.Preconditions.*; | |
import com.google.common.collect.Maps; | |
import com.google.common.collect.Sets; | |
import java.io.File; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.util.HashSet; | |
import java.util.Map; | |
import java.util.Set; | |
import javax.xml.parsers.ParserConfigurationException; | |
import javax.xml.parsers.SAXParserFactory; | |
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |
import org.apache.poi.openxml4j.opc.OPCPackage; | |
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; | |
import org.apache.poi.xssf.eventusermodel.XSSFReader; | |
import org.apache.poi.xssf.eventusermodel.XSSFReader.SheetIterator; | |
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler; | |
import org.apache.poi.xssf.model.StylesTable; | |
import org.xml.sax.ContentHandler; | |
import org.xml.sax.InputSource; | |
import org.xml.sax.SAXException; | |
import org.xml.sax.XMLReader; | |
/** | |
* Parses XLSX Excel files in SAX style. | |
* After parsing each row calls the {@link RowDataHandler} | |
* | |
* @author Venkata Raju | |
*/ | |
public final class ExcelParser | |
{ | |
/** | |
* A call back interface, which will be called after parsing each row in the Excel file | |
*/ | |
public interface RowDataHandler | |
{ | |
/** | |
* @param rowNum | |
* Starts with 1, to be consistent with Excel row number | |
*/ | |
void rowData(String sheetName, int rowNum, Map<String, String> valueByHeader); | |
} | |
public static class ParserException extends RuntimeException | |
{ | |
public ParserException(String message) | |
{ | |
super(message); | |
} | |
public ParserException(String message, Throwable t) | |
{ | |
super(message, t); | |
} | |
} | |
public static final class MissingDataException extends ParserException | |
{ | |
public MissingDataException(String message) | |
{ | |
super(message); | |
} | |
} | |
private final Map<String, ? extends Set<String>> requiredHeadersBySheetName; | |
private final RowDataHandler rowDataHandler; | |
private final XMLReader xmlReader; | |
/** | |
* @param requiredHeadersBySheetName | |
* e.g. {@code ImmutableMap.of("MySheetName", ImmutableSet.of("FifthColumnHeader", "SecondColumnHeader"));} | |
* | |
* @throws NullPointerException if key/value is null or value contains null in {@code requiredHeadersBySheetName}, or {@code rowDataHandler} is null | |
* @throws ParserException if any internal error occurs | |
*/ | |
public ExcelParser(Map<String, ? extends Set<String>> requiredHeadersBySheetName, | |
RowDataHandler rowDataHandler) | |
{ | |
requiredHeadersBySheetName.forEach((key, value) -> | |
{ | |
checkNotNull(key, "null key"); | |
checkNotNull(value, "null value for key: %s", key); | |
if (value.contains(null)) | |
throw new NullPointerException("value set (for key: " + key + ") contains null value"); | |
}); | |
this.requiredHeadersBySheetName = requiredHeadersBySheetName; | |
this.rowDataHandler = checkNotNull(rowDataHandler); | |
try | |
{ | |
xmlReader = SAXParserFactory.newInstance().newSAXParser().getXMLReader(); | |
} | |
catch (ParserConfigurationException | SAXException e) | |
{ | |
throw new ParserException(e.getMessage(), e); | |
} | |
} | |
/** | |
* Parses .xlsx file(cell by cell) and calls {@link RowDataHandler#rowData(String, int, Map)} after parsing each row | |
* | |
* @throws ParserException if reading Excel file fails | |
* @throws MissingDataException if any sheet in the file or header in the sheet is missing | |
*/ | |
public void parseExcel(File excelFile) | |
{ | |
Set<String> sheetsFound = Sets.newHashSetWithExpectedSize(requiredHeadersBySheetName.size()); | |
try (OPCPackage opcPkg = OPCPackage.open(excelFile);) | |
{ | |
ReadOnlySharedStringsTable readOnlyStrsTbl = new ReadOnlySharedStringsTable(opcPkg); | |
XSSFReader xssfReader = new XSSFReader(opcPkg); | |
StylesTable stylesTbl = xssfReader.getStylesTable(); | |
for (SheetIterator sheetsIt = (SheetIterator) xssfReader.getSheetsData(); | |
sheetsIt.hasNext() && (sheetsFound.size() < requiredHeadersBySheetName.size());) | |
{ | |
try (InputStream sheetStream = sheetsIt.next()) | |
{ | |
String sheetName = sheetsIt.getSheetName(); | |
Set<String> requiredHeaders = requiredHeadersBySheetName.get(sheetName); | |
if (requiredHeaders == null) | |
continue; | |
sheetsFound.add(sheetName); | |
/*Maintains mapping between Column and Header (e.g {"A": "My First Column Name", "C": "My Third Column Name"}*/ | |
Map<String, String> headerByColumn = Maps.newHashMapWithExpectedSize(requiredHeaders.size()); | |
ContentHandler excelSheetHandler = new XSSFSheetXMLHandler(stylesTbl, readOnlyStrsTbl, | |
new XSSFSheetXMLHandler.SheetContentsHandler() | |
{ | |
int rowNum; | |
Map<String, String> cellValueByHeader; | |
@Override | |
public void headerFooter(String text, boolean isHeader, String tagName) | |
{// No op | |
} | |
@Override | |
public void startRow(int rowNum) | |
{ | |
this.rowNum = rowNum + 1;// To match Excel row numbering | |
cellValueByHeader = Maps.newHashMapWithExpectedSize(requiredHeaders.size()); | |
} | |
@Override | |
public void cell(String cellRef/*e.g. C23*/, String formattedVal/*Value in the Cell*/) | |
{ | |
if (rowNum == 1) | |
{// Create mappings between First row and Column from Cell Ref. | |
if (requiredHeaders.contains(formattedVal)) | |
headerByColumn.put(getColumn(cellRef), formattedVal); | |
return; | |
} | |
String header = headerByColumn.get(getColumn(cellRef)); | |
if (header != null)// We are tracking this column cells | |
cellValueByHeader.put(header, formattedVal); | |
} | |
@Override | |
public void endRow() | |
{ | |
if (rowNum == 1) | |
{// Check for missing Columns | |
if (headerByColumn.size() < requiredHeaders.size()) | |
{ | |
Set<String> missingHeaders = Sets.difference(requiredHeaders, new HashSet<>(headerByColumn.values())); | |
throw new MissingDataException("Missing header(s): " + Joiner.on(", ").join(missingHeaders) | |
+ ", Sheet: " + sheetName); | |
} | |
return; | |
} | |
rowDataHandler.rowData(sheetName, rowNum, cellValueByHeader); | |
} | |
}, true/* Take formulas directly, not their results */); | |
xmlReader.setContentHandler(excelSheetHandler); | |
InputSource sheetSource = new InputSource(sheetStream); | |
xmlReader.parse(sheetSource); | |
} | |
} | |
if (sheetsFound.size() < requiredHeadersBySheetName.size()) | |
{// Check for missing Sheets | |
Set<String> missingSheets = Sets.difference(requiredHeadersBySheetName.keySet(), sheetsFound); | |
throw new MissingDataException("Missing sheet(s): " + Joiner.on(", ").join(missingSheets) | |
+ ", Excel file: " + excelFile.getAbsolutePath()); | |
} | |
} | |
catch (IOException | OpenXML4JException | ParserException | SAXException e) | |
{ | |
throw (e instanceof ParserException) ? (ParserException) e : new ParserException(e.getMessage(), e); | |
} | |
} | |
/** | |
* gets Column(e.g. C) for cellRef (e.g. C23) by ignoring row number(e.g. 23) | |
*/ | |
private static String getColumn(String cellRef) | |
{ | |
for (int i = 1; i < cellRef.length(); i++) | |
if (Character.isDigit(cellRef.charAt(i))) | |
return cellRef.substring(0, i); | |
throw new IllegalArgumentException("Invalid cellRef: " + cellRef); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello Venkata, How are you?
I wanted to use this program to parse Excel file but not able to figure out how to use the Usage section that you have put here.. Can you please advise or help?
ExcelParser excelParser = new ExcelParser(headersBySheetName, (String sheetName, int rowNum, Map<String, String> valueByHeader) ->
{
System.out.println("SheetName: " + sheetName + ", rowNum: " + rowNum + ", ValueInCellByHeader: " + valueByHeader);
});
excelParser.parseExcel(new File("Excel.xlsx"));