Skip to content

Instantly share code, notes, and snippets.

@jdhenckel
Last active October 29, 2018 14:12
Show Gist options
  • Save jdhenckel/311c73f199f628bf2106348f32405658 to your computer and use it in GitHub Desktop.
Save jdhenckel/311c73f199f628bf2106348f32405658 to your computer and use it in GitHub Desktop.
CSV (comma-separated values) parser written in Java that works with files exported from Excel
//-------------------------------------------------------------------------------
// This file is made available under the Creative Commons CC0 1.0 Universal Public Domain Dedication.
// The person who associated a work with this deed has dedicated the work to the public domain by
// waiving all of his or her rights to the work worldwide under copyright law, including all related
// and neighboring rights, to the extent allowed by law. You can copy, modify, distribute and perform
// the work, even for commercial purposes, all without asking permission.
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
//--------------------------------------------------------------------------------
// Parser for files that contain comma separated values.
// This is designed to handle EXCEL exported files. For example
/* ==== SAMPLE BEGIN ====
"Name", "Age", Description
John Henckel,52,This is a string of text
Carin Pie,28,"This
text is broken
over \"several\" lines."
Phen "Bob" Weirsby,,missing age
==== SAMPLE END ==== */
// Notice that newline is a record delimiter, except within quotes. Also within quotes
// you can have escape sequences \n \" \\ \t. Other escape sequences are not changed.
// The " character is allowed in a non-quoted field if it is not the first character.
// Fields not in quotes are trimmed for whitespace.
/**
 * Parser for text that contains comma separated values (CSV).
 *
 * <p>Rules implemented by this class:
 * <ul>
 *   <li>A newline ends a record and a comma ends a field, except inside a quoted field.</li>
 *   <li>A field is quoted when its first non-whitespace character is {@code "}.</li>
 *   <li>Inside quotes the escape sequences {@code \n}, {@code \t}, {@code \\} and {@code \"}
 *       are interpreted; any other backslash sequence is kept unchanged.</li>
 *   <li>Inside quotes a doubled quote ({@code ""}) produces one literal quote character.</li>
 *   <li>Non-quoted fields are trimmed of leading and trailing whitespace; a {@code "} that is
 *       not the first character of such a field is ordinary data.</li>
 *   <li>Non-whitespace text between a closing quote and the next delimiter is discarded and
 *       reported as a warning.</li>
 *   <li>Lines that contain nothing but whitespace are skipped.</li>
 * </ul>
 *
 * <p>Warning messages use a 0-based record number and a 1-based field number.
 */
public class ParserForCSV {

    /** Initial guess for the number of fields per record. */
    private static final int DEFAULT_FIELD_HINT = 10;

    /** Upper bound for the initial capacity of the per-record field list. */
    private static final int MAX_FIELD_HINT = 1000;

    /**
     * Reads every record from {@code file} until end of input.
     *
     * @param file     source of CSV text; it is read to the end but not closed
     * @param messages receives one entry for each warning produced while parsing
     * @return the records in input order, each as an array of field values
     * @throws IOException if reading from {@code file} fails
     */
    public static List<String[]> parse(Reader file, List<String> messages) throws IOException {
        List<String[]> result = new ArrayList<String[]>();
        int n = DEFAULT_FIELD_HINT;
        for (int i = 0;; ++i) {
            String[] row = parseRecord(file, messages, i, n);
            if (row == null) {
                break;
            }
            result.add(row);
            // Grow the size hint towards the widest record seen, at most doubling per step.
            if (row.length > n) {
                n = Math.min(row.length, 2 * n);
            }
        }
        return result;
    }

    /**
     * Reads one record from {@code file}.
     *
     * <p>Blank lines in front of the record are skipped. A record whose only field is an
     * explicitly quoted empty string ({@code ""}) is returned as a one-element array rather
     * than being mistaken for a blank line.
     *
     * @param file      source of CSV text
     * @param messages  receives any warnings
     * @param rowNumber record number used in warning messages
     * @param hint      expected number of fields, used only to size the internal list
     * @return the field values of the record, or {@code null} at end of input
     * @throws IOException if reading from {@code file} fails
     */
    public static String[] parseRecord(Reader file, List<String> messages, int rowNumber, int hint) throws IOException {
        StringBuilder sb = new StringBuilder();
        List<String> row = new ArrayList<String>(Math.min(Math.max(DEFAULT_FIELD_HINT, hint), MAX_FIELD_HINT));
        boolean[] quoted = new boolean[1];
        for (int i = 0;;) {
            int c = readField(file, sb, messages, rowNumber, i, quoted);
            // An empty, unquoted first field means either end of input or a blank line.
            if (i == 0 && sb.length() == 0 && !quoted[0]) {
                if (c == -1) {
                    return null; // end of the file
                }
                if (c == '\n') {
                    continue; // ignore blank lines in the middle of the file
                }
            }
            row.add(sb.toString());
            if (c != ',') {
                break;
            }
            ++i;
        }
        return row.toArray(new String[row.size()]);
    }

    /**
     * Reads a single field from {@code file} into {@code result}.
     *
     * @param file     source of CSV text
     * @param result   buffer that is cleared and then filled with the field value
     * @param messages receives any warnings
     * @param row      record number used in warning messages
     * @param col      0-based field index; warnings report it as {@code col + 1}
     * @return the character that ended the field: {@code ','}, {@code '\n'} or -1 for EOF
     * @throws IOException if reading from {@code file} fails
     */
    public static int parseField(Reader file, StringBuilder result, List<String> messages, int row, int col) throws IOException {
        return readField(file, result, messages, row, col, null);
    }

    /**
     * Implementation of {@link #parseField}. When {@code quotedOut} is not null, element 0 is
     * set to {@code true} if the field was a properly closed quoted field, otherwise to
     * {@code false}.
     */
    private static int readField(Reader file, StringBuilder result, List<String> messages, int row, int col,
            boolean[] quotedOut) throws IOException {
        if (quotedOut != null) {
            quotedOut[0] = false;
        }
        StringBuilder dump = null; // unexpected text found after a closing quote
        result.setLength(0);
        // Number of chars kept so far for a non-quoted field (leading whitespace is not kept).
        // The value -1 means a quoted value is complete and we are looking for the delimiter.
        int numChars = 0;
        int c = file.read();
        for (;;) {
            if (c == '\n' || c == ',' || c == -1) {
                if (dump != null) {
                    messages.add("WARN: Ignoring data in record " + row + " field " + (col + 1) + ": " + dump);
                }
                if (numChars > 0) {
                    // trim trailing whitespace for non-quoted fields
                    while (numChars > 1 && Character.isWhitespace(result.charAt(numChars - 1))) {
                        --numChars;
                    }
                    result.setLength(numChars);
                }
                return c;
            }
            if (numChars == 0 && c == '"') {
                // Opening quote; any whitespace in front of it was never stored.
                for (;;) {
                    c = file.read();
                    if (c == -1) {
                        messages.add("WARN: Unexpected EOF in record " + row + " field " + (col + 1));
                        return c;
                    }
                    if (c == '"') {
                        // Look one character ahead: a doubled quote is a literal quote,
                        // anything else means this quote closed the field.
                        c = file.read();
                        if (c != '"') {
                            break;
                        }
                        result.append('"');
                        continue;
                    }
                    if (c == '\\') {
                        c = file.read();
                        if (c == -1) {
                            // Input ended right after the backslash: keep the backslash
                            // and never store the EOF marker as a character.
                            result.append('\\');
                            messages.add("WARN: Unexpected EOF in record " + row + " field " + (col + 1));
                            return c;
                        }
                        // Interpret \n, \t, \\, and \" in the strings
                        if (c == 'n') {
                            c = '\n';
                        } else if (c == 't') {
                            c = '\t';
                        } else if (c != '"' && c != '\\') {
                            result.append('\\');
                        }
                    }
                    result.append((char) c);
                }
                numChars = -1;
                if (quotedOut != null) {
                    quotedOut[0] = true;
                }
                // c already holds the character after the closing quote, so do not read again.
                continue;
            }
            if (numChars == -1) {
                // After a closing quote only whitespace is expected before the delimiter.
                if (dump != null) {
                    dump.append((char) c);
                } else if (!Character.isWhitespace(c)) {
                    dump = new StringBuilder().append((char) c);
                }
            } else if (numChars > 0 || !Character.isWhitespace(c)) {
                result.append((char) c);
                ++numChars;
            }
            c = file.read();
        }
    }

    /**
     * Demonstration: parses a small embedded sample and prints the resulting rows followed by
     * the parser warnings to standard output.
     */
    public static void TestCase() {
        String data =
            "\"Name\", \"Age\", Description\n" +
            "John Henckel,52,This is a string of text\n" +
            "Thomas Moore,\"52\" years old\n" +
            "Carin Pie,28,\"This\n" +
            "text is broken\n" +
            "over \\\"several\\\" lines.\"\n" +
            "Phen \"Bob\" Weirsby,,\"missing age\n";
        List<String> messages = new ArrayList<>();
        List<String[]> result;
        try {
            result = parse(new StringReader(data), messages);
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
        int i = 0;
        System.out.println("---- result data -----");
        for (String[] row : result) {
            System.out.println("BEGIN ROW " + i++);
            for (String item : row) {
                System.out.println(" DATA: \"" + item + "\"");
            }
        }
        System.out.println("---- parser messages -----");
        for (String m : messages) {
            System.out.println(m);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment