Skip to content

Instantly share code, notes, and snippets.

@rjurney
Last active August 29, 2015 14:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjurney/2315553d696136908fb3 to your computer and use it in GitHub Desktop.
Save rjurney/2315553d696136908fb3 to your computer and use it in GitHub Desktop.
ChooseFieldByValue UDF with problems
public class ChooseFieldByValue extends EvalFunc<Tuple>
{
private TupleFactory tf = TupleFactory.getInstance();
// Enable multiple languages by specifying the model path. See http://text.sourceforge.net/models-1.5/
/**
 * Selects one field of the input by name, where the name is supplied as data.
 *
 * <p>The first input field holds the alias of the field to return. The input schema is scanned
 * from index 0 (so the selector field itself is part of the scan) and the value of the first
 * field whose alias equals that name is appended to the result tuple.
 *
 * @param input tuple whose first field is the name of the field to pick, followed by the
 *     candidate fields
 * @return a one-element tuple holding the matched value, an empty tuple when no alias matches,
 *     or {@code null} when the selector field is null or empty
 * @throws IOException if the input is null, has fewer than two fields, or no input schema is
 *     available to resolve aliases
 */
public Tuple exec(Tuple input) throws IOException
{
  if (input == null || input.size() < 2) {
    throw new IOException(
        "ChooseFieldByValue expects a field name followed by at least one candidate field");
  }

  // Check the selector for null BEFORE calling toString(); a null field would otherwise NPE.
  Object selector = input.get(0);
  if (selector == null) {
    return null;
  }
  String fieldNameToReturn = selector.toString();
  // isEmpty() instead of == "": reference comparison only matches interned literals.
  if (fieldNameToReturn == null || fieldNameToReturn.isEmpty()) {
    return null;
  }

  Schema inputSchema = getInputSchema();
  if (inputSchema == null) {
    throw new IOException("ChooseFieldByValue requires an input schema to resolve field aliases");
  }

  Tuple outTuple = tf.newTuple();
  // Never index past the schema, even if the tuple carries more fields than were declared.
  int fieldCount = Math.min(input.size(), inputSchema.size());
  for (int i = 0; i < fieldCount; i++)
  {
    Schema.FieldSchema fieldSchema = inputSchema.getField(i);
    Object matchedValue = input.get(i);

    // Diagnostic trace; string concatenation is used so null values print as "null"
    // instead of throwing.
    System.err.println("i: " + i);
    System.err.println("fieldSchema.byte: [" + DataType.findTypeName(fieldSchema.type) + "]");
    System.err.println("fieldSchema.alias: [" + fieldSchema.alias + "]");
    System.err.println("fieldNameToReturn: [" + fieldNameToReturn + "]");
    System.err.println("input.get(i): [" + matchedValue + "]");

    // Call equals on the known non-null name: a field may have no alias.
    if (fieldNameToReturn.equals(fieldSchema.alias)) {
      System.err.println("Matched fieldname " + fieldNameToReturn + " with value: " + matchedValue);
      outTuple.append(matchedValue);
      break;
    }
  }
  return outTuple;
}
package datafu.test.pig.util;
import java.util.List;
import junit.framework.Assert;
import org.adrianwalker.multilinestring.Multiline;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.pigunit.PigTest;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.testng.annotations.Test;
import datafu.test.pig.PigTests;
/**
 * PigUnit test for the ChooseFieldByValue UDF, which returns the value of the field whose alias
 * is named by the first argument.
 */
public class ChooseFieldByValueTest extends PigTests
{
  // The Javadoc block below is the Pig script itself (captured by @Multiline), so notes about
  // it live here: the relation has four columns, the first naming which of the other three
  // the UDF should return.
  /**
  define ChooseFieldByValue datafu.pig.util.ChooseFieldByValue();
  data = LOAD 'input' using PigStorage(',') AS (fieldName:chararray, text1:chararray, text2:chararray, text3:chararray);
  data2 = FOREACH data GENERATE ChooseFieldByValue(fieldName,*) as result;
  describe data2;
  data3 = FOREACH data2 GENERATE result;
  STORE data3 INTO 'output';
  */
  @Multiline private static String chooseFieldByValueTest;

  /**
   * Feeds rows of (selector, text1, text2, text3) through the script and checks that each row
   * yields the value of the column named by its selector.
   *
   * @throws Exception if the script cannot be created or run
   */
  @Test
  public void chooseFieldByValueTest() throws Exception
  {
    PigTest test = createPigTestFromString(chooseFieldByValueTest);

    // Exactly four columns per row, matching the LOAD schema. A duplicated leading selector
    // would shift every column right and make text1 always equal the selector name.
    writeLinesToFile("input",
                     "text1,hi,how,are",
                     "text2,you,sir,today",
                     "text3,bob,is,a",
                     "text1,friend,of,mine",
                     "text2,and,I,say",
                     "text3,he,is,nice.");

    // The UDF returns a Tuple, so every output row is a tuple wrapping a one-element tuple.
    assertOutput(test, "data3",
                 "((hi))",
                 "((sir))",
                 "((a))",
                 "((friend))",
                 "((I))",
                 "((nice.))");
  }
}
15:13:44.603 [DEBUG] [TestEventLogger] Gradle test > datafu.test.pig.util.ChooseFieldByValueTest.chooseFieldByValueTest STANDARD_ERROR
15:13:44.604 [DEBUG] [TestEventLogger] i: 0
15:13:44.604 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.604 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.604 [DEBUG] [TestEventLogger] fieldNameToReturn: [text1]
15:13:44.605 [DEBUG] [TestEventLogger] input.get(i): [text1]
15:13:44.605 [DEBUG] [TestEventLogger] i: 1
15:13:44.605 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.606 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.606 [DEBUG] [TestEventLogger] fieldNameToReturn: [text1]
15:13:44.606 [DEBUG] [TestEventLogger] input.get(i): [text1]
15:13:44.606 [DEBUG] [TestEventLogger] i: 2
15:13:44.607 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.607 [DEBUG] [TestEventLogger] fieldSchema.alias: [text1]
15:13:44.607 [DEBUG] [TestEventLogger] fieldNameToReturn: [text1]
15:13:44.607 [DEBUG] [TestEventLogger] input.get(i): [text1]
15:13:44.608 [DEBUG] [TestEventLogger] Matched fieldname text1 with value: text1
15:13:44.608 [DEBUG] [TestEventLogger] i: 0
15:13:44.608 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.609 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.609 [DEBUG] [TestEventLogger] fieldNameToReturn: [text2]
15:13:44.609 [DEBUG] [TestEventLogger] input.get(i): [text2]
15:13:44.610 [DEBUG] [TestEventLogger] i: 1
15:13:44.611 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.611 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.612 [DEBUG] [TestEventLogger] fieldNameToReturn: [text2]
15:13:44.612 [DEBUG] [TestEventLogger] input.get(i): [text2]
15:13:44.612 [DEBUG] [TestEventLogger] i: 2
15:13:44.613 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.613 [DEBUG] [TestEventLogger] fieldSchema.alias: [text1]
15:13:44.614 [DEBUG] [TestEventLogger] fieldNameToReturn: [text2]
15:13:44.614 [DEBUG] [TestEventLogger] input.get(i): [text2]
15:13:44.615 [DEBUG] [TestEventLogger] i: 3
15:13:44.615 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.615 [DEBUG] [TestEventLogger] fieldSchema.alias: [text2]
15:13:44.616 [DEBUG] [TestEventLogger] fieldNameToReturn: [text2]
15:13:44.616 [DEBUG] [TestEventLogger] input.get(i): [you]
15:13:44.617 [DEBUG] [TestEventLogger] Matched fieldname text2 with value: you
15:13:44.617 [DEBUG] [TestEventLogger] i: 0
15:13:44.618 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.618 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.618 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.618 [DEBUG] [TestEventLogger] input.get(i): [text3]
15:13:44.619 [DEBUG] [TestEventLogger] i: 1
15:13:44.619 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.619 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.619 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.620 [DEBUG] [TestEventLogger] input.get(i): [text3]
15:13:44.620 [DEBUG] [TestEventLogger] i: 2
15:13:44.620 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.620 [DEBUG] [TestEventLogger] fieldSchema.alias: [text1]
15:13:44.621 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.621 [DEBUG] [TestEventLogger] input.get(i): [text3]
15:13:44.621 [DEBUG] [TestEventLogger] i: 3
15:13:44.621 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.622 [DEBUG] [TestEventLogger] fieldSchema.alias: [text2]
15:13:44.622 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.622 [DEBUG] [TestEventLogger] input.get(i): [bob]
15:13:44.622 [DEBUG] [TestEventLogger] i: 4
15:13:44.623 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.623 [DEBUG] [TestEventLogger] fieldSchema.alias: [text3]
15:13:44.623 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.623 [DEBUG] [TestEventLogger] input.get(i): [is]
15:13:44.624 [DEBUG] [TestEventLogger] Matched fieldname text3 with value: is
15:13:44.624 [DEBUG] [TestEventLogger] i: 0
15:13:44.624 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.624 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.625 [DEBUG] [TestEventLogger] fieldNameToReturn: [text1]
15:13:44.625 [DEBUG] [TestEventLogger] input.get(i): [text1]
15:13:44.625 [DEBUG] [TestEventLogger] i: 1
15:13:44.625 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.626 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.626 [DEBUG] [TestEventLogger] fieldNameToReturn: [text1]
15:13:44.626 [DEBUG] [TestEventLogger] input.get(i): [text1]
15:13:44.626 [DEBUG] [TestEventLogger] i: 2
15:13:44.626 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.627 [DEBUG] [TestEventLogger] fieldSchema.alias: [text1]
15:13:44.627 [DEBUG] [TestEventLogger] fieldNameToReturn: [text1]
15:13:44.627 [DEBUG] [TestEventLogger] input.get(i): [text1]
15:13:44.627 [DEBUG] [TestEventLogger] Matched fieldname text1 with value: text1
15:13:44.628 [DEBUG] [TestEventLogger] i: 0
15:13:44.628 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.628 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.628 [DEBUG] [TestEventLogger] fieldNameToReturn: [text2]
15:13:44.629 [DEBUG] [TestEventLogger] input.get(i): [text2]
15:13:44.629 [DEBUG] [TestEventLogger] i: 1
15:13:44.629 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.629 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.629 [DEBUG] [TestEventLogger] fieldNameToReturn: [text2]
15:13:44.630 [DEBUG] [TestEventLogger] input.get(i): [text2]
15:13:44.630 [DEBUG] [TestEventLogger] i: 2
15:13:44.630 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.630 [DEBUG] [TestEventLogger] fieldSchema.alias: [text1]
15:13:44.631 [DEBUG] [TestEventLogger] fieldNameToReturn: [text2]
15:13:44.631 [DEBUG] [TestEventLogger] input.get(i): [text2]
15:13:44.631 [DEBUG] [TestEventLogger] i: 3
15:13:44.631 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.631 [DEBUG] [TestEventLogger] fieldSchema.alias: [text2]
15:13:44.632 [DEBUG] [TestEventLogger] fieldNameToReturn: [text2]
15:13:44.632 [DEBUG] [TestEventLogger] input.get(i): [and]
15:13:44.632 [DEBUG] [TestEventLogger] Matched fieldname text2 with value: and
15:13:44.632 [DEBUG] [TestEventLogger] i: 0
15:13:44.633 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.633 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.633 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.633 [DEBUG] [TestEventLogger] input.get(i): [text3]
15:13:44.634 [DEBUG] [TestEventLogger] i: 1
15:13:44.634 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.634 [DEBUG] [TestEventLogger] fieldSchema.alias: [fieldName]
15:13:44.634 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.634 [DEBUG] [TestEventLogger] input.get(i): [text3]
15:13:44.635 [DEBUG] [TestEventLogger] i: 2
15:13:44.635 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.635 [DEBUG] [TestEventLogger] fieldSchema.alias: [text1]
15:13:44.635 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.636 [DEBUG] [TestEventLogger] input.get(i): [text3]
15:13:44.636 [DEBUG] [TestEventLogger] i: 3
15:13:44.636 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.636 [DEBUG] [TestEventLogger] fieldSchema.alias: [text2]
15:13:44.636 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.637 [DEBUG] [TestEventLogger] input.get(i): [he]
15:13:44.637 [DEBUG] [TestEventLogger] i: 4
15:13:44.637 [DEBUG] [TestEventLogger] fieldSchema.byte: [chararray]
15:13:44.637 [DEBUG] [TestEventLogger] fieldSchema.alias: [text3]
15:13:44.638 [DEBUG] [TestEventLogger] fieldNameToReturn: [text3]
15:13:44.638 [DEBUG] [TestEventLogger] input.get(i): [is]
15:13:44.638 [DEBUG] [TestEventLogger] Matched fieldname text3 with value: is
15:13:47.865 [DEBUG] [TestEventLogger]
15:13:47.865 [DEBUG] [TestEventLogger] Gradle test > datafu.test.pig.util.ChooseFieldByValueTest.chooseFieldByValueTest STANDARD_OUT
15:13:47.866 [DEBUG] [TestEventLogger] Values for data3:
15:13:47.868 [DEBUG] [TestEventLogger] ((text1))
15:13:47.868 [DEBUG] [TestEventLogger] ((you))
15:13:47.869 [DEBUG] [TestEventLogger] ((is))
15:13:47.869 [DEBUG] [TestEventLogger] ((text1))
15:13:47.869 [DEBUG] [TestEventLogger] ((and))
15:13:47.870 [DEBUG] [TestEventLogger] ((is))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment