Skip to content

Instantly share code, notes, and snippets.

@VenkataRaju
Last active June 30, 2020 04:07
Show Gist options
  • Save VenkataRaju/10702187 to your computer and use it in GitHub Desktop.
Save VenkataRaju/10702187 to your computer and use it in GitHub Desktop.
Parses Excel(only .xlsx) file in SAX style
package excelparser;
import com.google.common.base.Joiner;
import static com.google.common.base.Preconditions.*;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFReader.SheetIterator;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
* Parses XLSX Excel files in SAX style.
* After parsing each row calls the {@link RowDataHandler}
*
* @author Venkata Raju
*/
public final class ExcelParser
{
/**
* A call back interface, which will be called after parsing each row in the Excel file
*/
public interface RowDataHandler
{
/**
* @param rowNum
* Starts with 1, to be consistent with Excel row number
*/
void rowData(String sheetName, int rowNum, Map<String, String> valueByHeader);
}
public static class ParserException extends RuntimeException
{
public ParserException(String message)
{
super(message);
}
public ParserException(String message, Throwable t)
{
super(message, t);
}
}
public static final class MissingDataException extends ParserException
{
public MissingDataException(String message)
{
super(message);
}
}
private final Map<String, ? extends Set<String>> requiredHeadersBySheetName;
private final RowDataHandler rowDataHandler;
private final XMLReader xmlReader;
/**
* @param requiredHeadersBySheetName
* e.g. {@code ImmutableMap.of("MySheetName", ImmutableSet.of("FifthColumnHeader", "SecondColumnHeader"));}
*
* @throws NullPointerException if key/value is null or value contains null in {@code requiredHeadersBySheetName}, or {@code rowDataHandler} is null
* @throws ParserException if any internal error occurs
*/
public ExcelParser(Map<String, ? extends Set<String>> requiredHeadersBySheetName,
RowDataHandler rowDataHandler)
{
requiredHeadersBySheetName.forEach((key, value) ->
{
checkNotNull(key, "null key");
checkNotNull(value, "null value for key: %s", key);
if (value.contains(null))
throw new NullPointerException("value set (for key: " + key + ") contains null value");
});
this.requiredHeadersBySheetName = requiredHeadersBySheetName;
this.rowDataHandler = checkNotNull(rowDataHandler);
try
{
xmlReader = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
}
catch (ParserConfigurationException | SAXException e)
{
throw new ParserException(e.getMessage(), e);
}
}
/**
* Parses .xlsx file(cell by cell) and calls {@link RowDataHandler#rowData(String, int, Map)} after parsing each row
*
* @throws ParserException if reading Excel file fails
* @throws MissingDataException if any sheet in the file or header in the sheet is missing
*/
public void parseExcel(File excelFile)
{
Set<String> sheetsFound = Sets.newHashSetWithExpectedSize(requiredHeadersBySheetName.size());
try (OPCPackage opcPkg = OPCPackage.open(excelFile);)
{
ReadOnlySharedStringsTable readOnlyStrsTbl = new ReadOnlySharedStringsTable(opcPkg);
XSSFReader xssfReader = new XSSFReader(opcPkg);
StylesTable stylesTbl = xssfReader.getStylesTable();
for (SheetIterator sheetsIt = (SheetIterator) xssfReader.getSheetsData();
sheetsIt.hasNext() && (sheetsFound.size() < requiredHeadersBySheetName.size());)
{
try (InputStream sheetStream = sheetsIt.next())
{
String sheetName = sheetsIt.getSheetName();
Set<String> requiredHeaders = requiredHeadersBySheetName.get(sheetName);
if (requiredHeaders == null)
continue;
sheetsFound.add(sheetName);
/*Maintains mapping between Column and Header (e.g {"A": "My First Column Name", "C": "My Third Column Name"}*/
Map<String, String> headerByColumn = Maps.newHashMapWithExpectedSize(requiredHeaders.size());
ContentHandler excelSheetHandler = new XSSFSheetXMLHandler(stylesTbl, readOnlyStrsTbl,
new XSSFSheetXMLHandler.SheetContentsHandler()
{
int rowNum;
Map<String, String> cellValueByHeader;
@Override
public void headerFooter(String text, boolean isHeader, String tagName)
{// No op
}
@Override
public void startRow(int rowNum)
{
this.rowNum = rowNum + 1;// To match Excel row numbering
cellValueByHeader = Maps.newHashMapWithExpectedSize(requiredHeaders.size());
}
@Override
public void cell(String cellRef/*e.g. C23*/, String formattedVal/*Value in the Cell*/)
{
if (rowNum == 1)
{// Create mappings between First row and Column from Cell Ref.
if (requiredHeaders.contains(formattedVal))
headerByColumn.put(getColumn(cellRef), formattedVal);
return;
}
String header = headerByColumn.get(getColumn(cellRef));
if (header != null)// We are tracking this column cells
cellValueByHeader.put(header, formattedVal);
}
@Override
public void endRow()
{
if (rowNum == 1)
{// Check for missing Columns
if (headerByColumn.size() < requiredHeaders.size())
{
Set<String> missingHeaders = Sets.difference(requiredHeaders, new HashSet<>(headerByColumn.values()));
throw new MissingDataException("Missing header(s): " + Joiner.on(", ").join(missingHeaders)
+ ", Sheet: " + sheetName);
}
return;
}
rowDataHandler.rowData(sheetName, rowNum, cellValueByHeader);
}
}, true/* Take formulas directly, not their results */);
xmlReader.setContentHandler(excelSheetHandler);
InputSource sheetSource = new InputSource(sheetStream);
xmlReader.parse(sheetSource);
}
}
if (sheetsFound.size() < requiredHeadersBySheetName.size())
{// Check for missing Sheets
Set<String> missingSheets = Sets.difference(requiredHeadersBySheetName.keySet(), sheetsFound);
throw new MissingDataException("Missing sheet(s): " + Joiner.on(", ").join(missingSheets)
+ ", Excel file: " + excelFile.getAbsolutePath());
}
}
catch (IOException | OpenXML4JException | ParserException | SAXException e)
{
throw (e instanceof ParserException) ? (ParserException) e : new ParserException(e.getMessage(), e);
}
}
/**
* gets Column(e.g. C) for cellRef (e.g. C23) by ignoring row number(e.g. 23)
*/
private static String getColumn(String cellRef)
{
for (int i = 1; i < cellRef.length(); i++)
if (Character.isDigit(cellRef.charAt(i)))
return cellRef.substring(0, i);
throw new IllegalArgumentException("Invalid cellRef: " + cellRef);
}
}
@VenkataRaju
Copy link
Author

VenkataRaju commented Apr 15, 2014

Usage:

Map<String, Set<String>> headersBySheetName = new HashMap<>();
Set<String> headers = new HashSet<>(Arrays.asList("Header2", "Header1"));
headersBySheetName.put("Sheet1", headers);

ExcelParser excelParser = new ExcelParser(headersBySheetName, (String sheetName, int rowNum, Map<String, String> valueByHeader) ->
{
    System.out.println("SheetName: " + sheetName + ", rowNum: " + rowNum + ", ValueInCellByHeader: " + valueByHeader);
});
excelParser.parseExcel(new File("Excel.xlsx"));

dependencies

{
  compile group: 'com.google.guava', name: 'guava', version: '17.0-rc2'
  compile group: 'org.apache.poi', name: 'poi', version: '3.10-FINAL'
  compile group: 'org.apache.poi', name: 'poi-ooxml', version: '3.10-FINAL'
}

@vineetgupta007
Copy link

Hello Venkata, How are you?

I wanted to use this program to parse Excel file but not able to figure out how to use the Usage section that you have put here.. Can you please advise or help?
ExcelParser excelParser = new ExcelParser(headersBySheetName, (String sheetName, int rowNum, Map<String, String> valueByHeader) ->
{
System.out.println("SheetName: " + sheetName + ", rowNum: " + rowNum + ", ValueInCellByHeader: " + valueByHeader);
});
excelParser.parseExcel(new File("Excel.xlsx"));


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment