Skip to content

Instantly share code, notes, and snippets.

@basinilya
Last active July 27, 2019 08:36
Show Gist options
  • Save basinilya/bd9ccddad879066155a1e61ea18dbd05 to your computer and use it in GitHub Desktop.
Save basinilya/bd9ccddad879066155a1e61ea18dbd05 to your computer and use it in GitHub Desktop.
package org.foo.csvtokenizer;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.List;
/**
 * Skeleton of a push-style parser: {@link #parse(Reader)} tokenizes the input
 * with a {@link CsvTokenizer} and hands every completed line to
 * {@link #onRecord(List, long)} as a list of field values.
 *
 * <p>The remainder-field and quoted-fields settings are read once, at the
 * start of each {@code parse} call, and passed to the tokenizer's constructor.
 *
 * <p>All per-parse state (tokenizer and record buffer) is local to
 * {@code parse}, so a call that ends with an exception leaves nothing behind
 * that could leak into a later call on the same instance.
 */
public abstract class AbstractPushingCsvParser implements PushingCsvParser {

    /** Handed to the tokenizer as its remainder-field setting. */
    private int remainderField = Integer.MAX_VALUE;

    /** Handed to the tokenizer as its quoted-fields switch. */
    private boolean quotedFields = true;

    /**
     * Receives one parsed line.
     *
     * @param values the field values of the line, in input order; the list is
     *               not reused by the parser after this call
     * @param lineno the value of {@link StreamTokenizer#lineno()} at the moment
     *               the end-of-line token for this record was delivered
     * @throws Exception any failure; it aborts the running {@code parse} call
     */
    @Override
    public abstract void onRecord(List<String> values, long lineno) throws Exception;

    /**
     * Reads {@code reader} to the end, calling {@link #onRecord(List, long)}
     * once per line, and closes the reader before returning or throwing.
     *
     * @param reader the character source; closed by this method
     * @throws Exception whatever the tokenizer, the reader or
     *                   {@code onRecord} throws
     */
    @Override
    public void parse(final Reader reader) throws Exception {
        // try-with-resources closes the caller's reader on every exit path.
        try (Reader sysResource = reader) {
            final CsvTokenizer tokenizer =
                    new CsvTokenizer(sysResource, remainderField, quotedFields);
            // Deliberately local: if onRecord or the reader throws mid-record,
            // the partially collected fields must not show up in the first
            // record of a later parse() call.
            List<String> fields = new ArrayList<>();
            for (;;) {
                switch (tokenizer.nextToken()) {
                case StreamTokenizer.TT_WORD:
                    fields.add(tokenizer.sval);
                    break;
                case StreamTokenizer.TT_EOL:
                    onRecord(fields, tokenizer.lineno());
                    // The callback may keep the list it was given, so start
                    // the next record with a fresh one.
                    fields = new ArrayList<>();
                    break;
                case StreamTokenizer.TT_EOF:
                    return;
                default:
                    throw new RuntimeException("can't happen");
                }
            }
        }
    }

    @Override
    public int getRemainderField() {
        return remainderField;
    }

    @Override
    public void setRemainderField(final int remainderField) {
        this.remainderField = remainderField;
    }

    @Override
    public boolean isQuotedFields() {
        return quotedFields;
    }

    @Override
    public void setQuotedFields(final boolean quotedFields) {
        this.quotedFields = quotedFields;
    }
}
package org.foo.csvtokenizer;
import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayDeque;
import java.util.Deque;
/**
 * A {@link StreamTokenizer} configured to split separator-delimited text into
 * fields.
 *
 * <p>{@link #nextToken()} returns {@code TT_WORD} once per field (value in
 * {@code sval}), {@code TT_EOL} after the last field of a line and
 * {@code TT_EOF} at the end of the input. A field is only emitted when its
 * terminator (separator, end of line or end of input) has been seen, so token
 * types that were read ahead are kept in {@code savedTtypes} and replayed by
 * later calls.
 *
 * <p>The syntax table of the underlying tokenizer is changed while parsing:
 * separators and the double quote are switched between "ordinary" and "word"
 * characters, and line terminators between "whitespace" and "word", depending
 * on whether the parser is inside a quoted section or inside the remainder
 * field.
 */
public class CsvTokenizer extends StreamTokenizer {
/**
 * @param r              the character source
 * @param remainderField number of fields after which separators stop being
 *                       recognized for the rest of the line; values of zero or
 *                       less mean separators are never recognized
 * @param quotedFields   whether {@code '"'} is recognized as a quote character
 *                       while separators are enabled
 */
public CsvTokenizer(final Reader r, final int remainderField, final boolean quotedFields) {
super(r);
this.quotedFields = quotedFields;
this.remainderField = remainderField;
// Start from an empty syntax table and make every character 0..255 a word character.
resetSyntax();
wordChars(0, 255);
// Line terminators are whitespace; together with eolIsSignificant(true) below
// the base class reports them as TT_EOL.
whitespaceChars('\n', '\n');
whitespaceChars('\r', '\r');
// With a non-positive remainderField the separators (and the quote) stay word
// characters, so a whole line is read as one word.
// NOTE(review): enableFS calls the overridable getFs() from the constructor;
// an override must not depend on subclass fields being initialized yet.
if (remainderField > 0) {
enableFS(true);
}
eolIsSignificant(true);
}
/**
 * Returns the next field or line/input terminator.
 *
 * @return {@code TT_WORD} (field value in {@code sval}), {@code TT_EOL} or
 *         {@code TT_EOF}
 * @throws IOException if reading from the underlying reader fails
 * @throws RuntimeException if an ordinary character other than a separator or
 *         {@code '"'} is returned by the base tokenizer
 */
@Override
public int nextToken() throws IOException {
for (;;) {
// Replay a pushed-back token type first; otherwise read a fresh token.
if (!savedTtypes.isEmpty()) {
restore();
} else {
super.nextToken();
}
switch (ttype) {
case TT_WORD:
// Remember the text but do not emit it yet: the field is emitted when the
// following separator, end of line or end of input is processed.
fieldStarted = true;
startNextField(sval);
break;
case TT_EOF:
handleEof();
return ttype;
case TT_EOL:
handleEol();
return ttype;
default:
// An ordinary character: a separator or a double quote.
handleFs();
// handleFs leaves TT_WORD in ttype when a separator completed a field; after
// quote handling ttype is something else and the loop goes on to replay the
// token type pushed by recoverUnquotedTail.
if (ttype == TT_WORD) {
return ttype;
}
}
}
}
/**
 * Returns the separator characters; every character of the returned string is
 * treated as a field separator. This implementation returns the single
 * character {@link PushingCsvParser#DEFAULT_FS}.
 */
protected String getFs() {
return Character.toString(PushingCsvParser.DEFAULT_FS);
}
// Field count after which separators are disabled for the rest of the line.
private final int remainderField;
// Whether '"' is made an ordinary (quote) character while separators are enabled.
private final boolean quotedFields;
// In the main loop: true once the current field has content or a quote.
// Inside readQuotedPart/quoteChoice the same flag means "currently inside quotes".
private boolean fieldStarted;
// Token types read ahead and waiting to be replayed, most recent first.
private final Deque<Integer> savedTtypes = new ArrayDeque<>(2);
// Text of the field currently being built; copied to sval when the field is emitted.
private String savedSval;
// Number of fields started on the current line; set back to zero by resetLine.
private int nFields;
/** Pops the most recently pushed token type into {@code ttype} and returns it. */
private int restore() {
ttype = savedTtypes.pop();
return ttype;
}
/**
 * Switches separator recognition on or off in the syntax table.
 *
 * <p>When enabled, each separator character becomes an ordinary character, and
 * so does {@code '"'} if {@code quotedFields} is set. In every other case
 * {@code '"'} becomes a word character; when disabled, so do the separators.
 */
private void enableFS(final boolean enable) {
final String fs = getFs();
for (int i = 0, len = fs.length(); i < len; i++) {
final char ch = fs.charAt(i);
if (enable) {
ordinaryChar(ch);
} else {
wordChars(ch, ch);
}
}
if (enable && quotedFields) {
ordinaryChar('"');
} else {
wordChars('"', '"');
}
}
/** Records {@code s} as the text of a new field and counts the field. */
private void startNextField(final String s) {
savedSval = s;
nFields++;
}
/**
 * End of input. If fields have been counted on the current line, pushes
 * {@code TT_EOF} for later replay and treats the position as an end of line,
 * so a last line without a line terminator is still terminated. Otherwise
 * {@code ttype} stays {@code TT_EOF}.
 */
private void handleEof() throws IOException {
if (nFields > 0) {
savedTtypes.push(TT_EOF);
handleEol();
}
}
/**
 * End of line, processed in two passes.
 *
 * <p>First pass: an empty field is started if none is pending, {@code TT_EOL}
 * is pushed for replay, the pending field is emitted as {@code TT_WORD} and
 * the line state is reset. Second pass (replay of the pushed {@code TT_EOL}):
 * {@code fieldStarted} is still true and {@code nFields} is zero, so only the
 * flag is cleared and {@code ttype} stays {@code TT_EOL}.
 */
private void handleEol() throws IOException {
if (!fieldStarted) {
// Nothing since the last separator or line start: the field is empty.
startNextField("");
fieldStarted = true;
}
if (nFields > 0) {
savedTtypes.push(TT_EOL);
ttype = TT_WORD;
sval = savedSval;
resetLine();
} else {
fieldStarted = false;
}
}
/**
 * Handles an ordinary-character token: a separator or a double quote.
 *
 * <p>Separator: completes the current field (an empty one if nothing was
 * started), disables separators when the remainder field has been reached, and
 * leaves {@code TT_WORD} with the field text in {@code ttype}/{@code sval}.
 *
 * <p>Quote: at the start of a field the quoted section is read first; in both
 * cases the text that follows is appended up to the next separator or line/input
 * terminator, whose token type is pushed for replay.
 *
 * @throws RuntimeException for any other ordinary character
 */
private void handleFs() throws IOException {
if (getFs().indexOf((char) ttype) == -1) {
if (ttype == '"') {
if (!fieldStarted) {
readQuotedPart();
}
// When the quote appears inside an already started field, ttype is still '"'
// here and recoverUnquotedTail appends it literally.
recoverUnquotedTail();
fieldStarted = true;
} else {
throw new RuntimeException("unexpected ttype: " + ttype);
}
} else {
if (!fieldStarted) {
startNextField("");
}
// From here on the rest of the line is read as one field.
if (nFields == remainderField) {
enableFS(false);
}
fieldStarted = false;
ttype = TT_WORD;
sval = savedSval;
}
}
/**
 * Reads a quoted section that begins at the start of a field and stores its
 * content as the new field text. On return {@code ttype} holds the token that
 * ended the loop, and the syntax table is back in unquoted mode.
 */
private void readQuotedPart() throws IOException {
startQuotedMode();
final StringBuilder sb = new StringBuilder();
// From here until the loop ends, fieldStarted means "inside quotes".
fieldStarted = true;
for (;;) {
super.nextToken();
if (ttype == '"') {
quoteChoice(sb);
} else if (ttype == TT_WORD && fieldStarted) {
sb.append(sval);
} else {
break; // eof, eol, fs, or non-adjacent quote
}
}
startNextField(sb.toString());
endQuotedMode();
}
/**
 * Decides what a quote inside {@link #readQuotedPart()} means. If the previous
 * token already closed the quotes, this quote is the second of a doubled pair:
 * a literal {@code '"'} is appended and quoted mode is entered again.
 * Otherwise the quote closes the quoted section.
 */
private void quoteChoice(final StringBuilder sb) {
if (!fieldStarted) {
sb.append('"');
fieldStarted = true;
startQuotedMode();
} else {
fieldStarted = false;
endQuotedMode();
}
}
/**
 * Syntax table for the inside of quotes: separators and line terminators are
 * word characters, so they become part of the field text; only {@code '"'}
 * remains an ordinary character.
 */
private void startQuotedMode() {
enableFS(false);
// enableFS(false) made the quote a word character; it must stay recognizable.
ordinaryChar('"');
wordChars('\n', '\n');
wordChars('\r', '\r');
}
/** Restores the syntax table used outside quotes. */
private void endQuotedMode() {
enableFS(true);
whitespaceChars('\n', '\n');
whitespaceChars('\r', '\r');
}
/**
 * Appends to the current field text everything up to the next token that is
 * neither a word nor a quote, starting with the token currently in
 * {@code ttype}. Quotes are appended literally. The terminating token type is
 * pushed so that {@link #nextToken()} replays it.
 */
private void recoverUnquotedTail() throws IOException {
for (;;) {
if (ttype == TT_WORD) {
savedSval = savedSval.concat(sval);
} else if (ttype == '"') {
savedSval = savedSval.concat("\"");
} else {
savedTtypes.push(ttype);
break;
}
super.nextToken();
}
}
/**
 * Prepares for the next line: re-enables separators if the remainder field was
 * reached on this line and sets the field count back to zero.
 *
 * @throws RuntimeException if no field was counted on the line
 */
private void resetLine() {
if (nFields == 0) {
throw new RuntimeException("Can't happen");
}
if (nFields >= remainderField && remainderField > 0) {
enableFS(true);
}
nFields = 0;
}
}
package org.foo.csvtokenizer;
import java.io.Reader;
import java.util.List;
/**
 * A parser that reads separator-delimited text and pushes each line to a
 * callback as a list of field values.
 */
public interface PushingCsvParser {

    /** The field separator character used by default. */
    char DEFAULT_FS = '~';

    /**
     * Parses the whole input, reporting each line through
     * {@link #onRecord(List, long)}.
     *
     * @param reader the character source
     * @throws Exception if reading, tokenizing or the callback fails
     */
    void parse(Reader reader) throws Exception;

    /**
     * Callback for one parsed line.
     *
     * @param values the field values of the line
     * @param lineno the line number reported by the parser for this record
     * @throws Exception if the record cannot be processed
     */
    void onRecord(List<String> values, long lineno) throws Exception;

    /** Returns the remainder-field setting. */
    int getRemainderField();

    /**
     * Sets the remainder-field setting.
     *
     * @param remainderField the new value
     */
    void setRemainderField(int remainderField);

    /** Returns whether quoted fields are enabled. */
    boolean isQuotedFields();

    /**
     * Enables or disables quoted fields.
     *
     * @param quotedFields the new value
     */
    void setQuotedFields(boolean quotedFields);
}
package org.foo.csvtokenizer;
import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.junit.Before;
import org.junit.Test;
/**
 * Tests for {@link AbstractPushingCsvParser}: each test parses a small input
 * string and compares the rows passed to {@code onRecord} with the expected
 * rows.
 */
public class TestCsvParser {

    /** Rows collected from the parser's callback during the current test. */
    private final List<List<String>> allRows = new ArrayList<>();

    /** Parser under test; created anew in {@link #setUp()}. */
    private PushingCsvParser parser;

    /** Clears the collected rows and creates a parser that records every row. */
    @Before
    public void setUp() {
        allRows.clear();
        parser = new AbstractPushingCsvParser() {
            @Override
            public void onRecord(final List<String> values, final long lineno)
                    throws Exception {
                allRows.add(values);
            }
        };
    }

    /**
     * A zero-length input is expected to produce no rows, so that an empty
     * file can represent "zero rows".
     */
    @Test
    public void test0ZeroLengthFile() throws Exception {
        doIt("");
    }

    /** Default remainder field is Integer.MAX_VALUE; a lone space is one field. */
    @Test
    public void test100() throws Exception {
        assertEquals(Integer.MAX_VALUE, parser.getRemainderField());
        doIt(" ", asList(" "));
    }

    /** A lone newline is expected to give one row with one empty field. */
    @Test
    public void test200() throws Exception {
        doIt("\n", asList(""));
    }

    /** A lone separator is expected to give two empty fields. */
    @Test
    public void test300() throws Exception {
        doIt("~", asList("", ""));
    }

    /** Two separators are expected to give three empty fields. */
    @Test
    public void test400() throws Exception {
        doIt("~~", asList("", "", ""));
    }

    /** With remainder field 1, everything after the first field stays together. */
    @Test
    public void test500() throws Exception {
        parser.setRemainderField(1);
        doIt("~~\n~~", asList("", "~"), asList("", "~"));
    }

    /** With remainder field 0, each line is expected as a single field. */
    @Test
    public void test600() throws Exception {
        parser.setRemainderField(0);
        doIt("~~\n~~", asList("~~"), asList("~~"));
    }

    /** A trailing separator is expected to add an empty last field. */
    @Test
    public void test700() throws Exception {
        doIt("a~", asList("a", ""));
    }

    /** Blank lines are expected as rows with one empty field each. */
    @Test
    public void test800() throws Exception {
        doIt("\n\na", asList(""), asList(""), asList("a"));
    }

    /** A quoted field may hold a doubled quote, a separator and a newline. */
    @Test
    public void test900() throws Exception {
        final String input = quoted("'one''one~one\none'~two\nthree~four");
        doIt(input,
                asList("one\"one~one\none", "two"),
                asList("three", "four"));
    }

    /** With quoted fields switched off, quotes are expected as plain text. */
    @Test
    public void test950() throws Exception {
        parser.setQuotedFields(false);
        final String input = quoted("'one''one~one\none'~two\nthree~four");
        doIt(input,
                asList("\"one\"\"one", "one"),
                asList("one\"", "two"),
                asList("three", "four"));
    }

    /** A doubled quote inside a quoted field is expected as one quote. */
    @Test
    public void test1000() throws Exception {
        doIt(quoted("'ab''cd'~x"), asList("ab\"cd", "x"));
    }

    /** A separator inside a quoted field is expected to stay in the field. */
    @Test
    public void test1100() throws Exception {
        doIt(quoted("'ab''cd~x'~x"), asList("ab\"cd~x", "x"));
    }

    /** Quotes after the closing quote are expected to be kept literally. */
    @Test
    public void test1200() throws Exception {
        doIt(quoted("'ab'cd'ef''g~x"), asList("abcd\"ef\"\"g", "x"));
    }

    /** Text after the closing quote is expected to be appended to the field. */
    @Test
    public void test1300() throws Exception {
        doIt(quoted("'ab'cd~x"), asList("abcd", "x"));
    }

    /** A quote in the middle of an unquoted field is expected to be literal. */
    @Test
    public void test1400() throws Exception {
        doIt(quoted("ab'cd~x"), asList("ab\"cd", "x"));
    }

    /** Two quotes in the middle of an unquoted field are expected to stay two. */
    @Test
    public void test1500() throws Exception {
        doIt(quoted("ab''cd~x"), asList("ab\"\"cd", "x"));
    }

    /**
     * Parses {@code s} and asserts that exactly {@code expectedRows} were
     * delivered to the callback, in order.
     */
    @SafeVarargs
    private final void doIt(final String s, final List<String>... expectedRows) throws Exception {
        final StringReader input = new StringReader(s);
        parser.parse(input);
        final List<List<String>> expected = asList(expectedRows);
        assertEquals(expected, allRows);
    }

    /** Turns every apostrophe into a double quote to keep test inputs readable. */
    private static String quoted(final String text) {
        return text.replace('\'', '"');
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment