Skip to content

Instantly share code, notes, and snippets.

@jlewi
Created January 19, 2015 18:04
Show Gist options
  • Save jlewi/f01c6ac50cd2d8a7b879 to your computer and use it in GitHub Desktop.
Save jlewi/f01c6ac50cd2d8a7b879 to your computer and use it in GitHub Desktop.
A Dataflow coder for CharSequence objects.
/*
* Copyright (C) 2014 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package contrail.dataflow;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UTFDataFormatException;
import java.nio.charset.Charset;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.util.VarInt;
import com.google.common.io.ByteStreams;
/**
 * A {@code CharSequenceCoder} encodes {@link CharSequence} values as UTF-8 bytes.
 *
 * <p>In a nested (non-whole-stream) context the encoded bytes are prefixed with a
 * VarInt length field. In a whole-stream context the bytes are written without a
 * prefix, and decoding consumes the remainder of the stream.
 *
 * <p>Values are converted with {@link CharSequence#toString()} before encoding,
 * and decoding always produces a {@link String}.
 */
@SuppressWarnings("serial")
public class CharSequenceCoder extends AtomicCoder<CharSequence> {
  /**
   * Returns the shared coder instance.
   *
   * @return the singleton {@code CharSequenceCoder}
   */
  @JsonCreator
  public static CharSequenceCoder of() {
    return INSTANCE;
  }

  /////////////////////////////////////////////////////////////////////////////

  private static final CharSequenceCoder INSTANCE = new CharSequenceCoder();

  /** Holder for the UTF-8 charset used by every encode/decode path. */
  private static class Singletons {
    private static final Charset UTF8 = Charset.forName("UTF-8");
  }

  /**
   * Writes a string with VarInt size prefix, supporting large strings.
   *
   * @param value the string to write
   * @param dos the stream receiving the length prefix followed by the UTF-8 bytes
   * @throws IOException if writing to {@code dos} fails
   */
  private static void writeString(String value, DataOutputStream dos)
      throws IOException {
    byte[] bytes = value.getBytes(Singletons.UTF8);
    VarInt.encode(bytes.length, dos);
    dos.write(bytes);
  }

  /**
   * Reads a string with VarInt size prefix, supporting large strings.
   *
   * @param dis the stream positioned at the length prefix
   * @return the decoded string
   * @throws CoderException if the decoded length is negative
   * @throws IOException if the stream ends early or cannot be read
   */
  private static String readString(DataInputStream dis) throws IOException {
    int len = VarInt.decodeInt(dis);
    if (len < 0) {
      throw new CoderException("Invalid encoded string length: " + len);
    }
    byte[] bytes = new byte[len];
    dis.readFully(bytes);
    return new String(bytes, Singletons.UTF8);
  }

  private CharSequenceCoder() {}

  /**
   * Encodes {@code value} as UTF-8, length-prefixed unless the context is the
   * whole stream.
   *
   * @param value the character sequence to encode; must not be null
   * @param outStream the stream to write to
   * @param context the coding context; {@code isWholeStream} selects the format
   * @throws CoderException if {@code value} is null
   * @throws IOException if writing to {@code outStream} fails
   */
  @Override
  public void encode(CharSequence value, OutputStream outStream, Context context)
      throws IOException {
    if (value == null) {
      throw new CoderException("cannot encode a null String");
    }
    String stringValue = value.toString();
    if (context.isWholeStream) {
      outStream.write(stringValue.getBytes(Singletons.UTF8));
    } else {
      writeString(stringValue, new DataOutputStream(outStream));
    }
  }

  /**
   * Decodes a UTF-8 string, reading either the rest of the stream (whole-stream
   * context) or a single length-prefixed value (nested context).
   *
   * @param inStream the stream to read from
   * @param context the coding context; {@code isWholeStream} selects the format
   * @return the decoded value, always a {@link String}
   * @throws CoderException if the length prefix is invalid or the stream ends
   *     before the announced number of bytes has been read
   * @throws IOException if reading from {@code inStream} fails
   */
  @Override
  public String decode(InputStream inStream, Context context)
      throws IOException {
    if (context.isWholeStream) {
      ByteArrayOutputStream outStream = new ByteArrayOutputStream();
      ByteStreams.copy(inStream, outStream);
      // Decode with the shared Charset constant, like every other path here.
      return new String(outStream.toByteArray(), Singletons.UTF8);
    } else {
      try {
        return readString(new DataInputStream(inStream));
      } catch (EOFException | UTFDataFormatException exn) {
        // These exceptions correspond to decoding problems, so change
        // what kind of exception they're branded as.
        throw new CoderException(exn);
      }
    }
  }

  /** {@inheritDoc} Always {@code true} for this coder. */
  @Override
  public boolean isDeterministic() {
    return true;
  }

  /**
   * Returns the number of bytes {@link #encode} would write for {@code value}.
   *
   * <p>The parameter is declared as {@link CharSequence}, the coder's element
   * type, so that this method overrides the superclass size hook; a
   * {@code String} parameter would only add an unrelated overload.
   *
   * @param value the character sequence to measure; must not be null
   * @param context the coding context; {@code isWholeStream} selects the format
   * @return the UTF-8 payload size, plus the VarInt prefix size when nested
   * @throws CoderException if {@code value} is null
   * @throws Exception if measuring the length prefix fails
   */
  @Override
  protected long getEncodedElementByteSize(CharSequence value, Context context)
      throws Exception {
    if (value == null) {
      throw new CoderException("cannot encode a null String");
    }
    byte[] bytes = value.toString().getBytes(Singletons.UTF8);
    if (context.isWholeStream) {
      return bytes.length;
    }
    // Only the prefix is written to the scratch stream; the payload size is
    // already known, so there is no need to copy the bytes a second time.
    DataOutputStream prefixStream =
        new DataOutputStream(new ByteArrayOutputStream());
    VarInt.encode(bytes.length, prefixStream);
    return (long) prefixStream.size() + bytes.length;
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment