Skip to content

Instantly share code, notes, and snippets.

@aldrinleal
Created March 4, 2013 19:38
Show Gist options
  • Save aldrinleal/5084845 to your computer and use it in GitHub Desktop.
Save aldrinleal/5084845 to your computer and use it in GitHub Desktop.
package com.amalgamood.linker;
import cascading.flow.FlowDef;
import cascading.json.operation.JSONFlatten;
import cascading.json.operation.JSONParser;
import cascading.json.operation.JSONSplitter;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.assembly.Unique;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.TextDelimited;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.MultiSinkTap;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
/**
 * Cascading job that reads text lines from an input path, runs them through a
 * JSON parse / URL extraction / de-duplication / metadata pipeline, and writes
 * the result to two tab-delimited sinks at once.
 *
 * <p>Expected positional arguments (read from the {@code args} field, which is
 * not declared in this class and is presumably inherited from {@code BaseApp}):
 * <ol>
 *   <li>{@code args[0]} - input path, read line by line</li>
 *   <li>{@code args[1]} - output path for the full record
 *       ({@code url, expanded_url, metadata, ends_at})</li>
 *   <li>{@code args[2]} - output path for the {@code ends_at} column only</li>
 * </ol>
 */
public class URLMiner extends BaseApp {
    /** Number of path arguments {@link #initializeInternal()} indexes into. */
    private static final int REQUIRED_ARG_COUNT = 3;

    /**
     * Command-line entry point; hands the raw arguments to {@code execute}.
     *
     * @param args command-line arguments, see the class description
     * @throws Exception whatever {@code execute} propagates
     */
    public static void main(String[] args) throws Exception {
        new URLMiner().execute(args);
    }

    /**
     * Builds the pipe assembly and its source/sink taps and stores the resulting
     * flow definition in {@code flowDef} (not declared here; presumably a
     * {@code BaseApp} field that the base class later runs - confirm).
     *
     * @throws IllegalArgumentException if fewer than three path arguments are available
     * @throws Exception declared by the overridden method
     */
    @Override
    protected void initializeInternal() throws Exception {
        // Fail with a usable message instead of an ArrayIndexOutOfBoundsException
        // when a path is missing from the command line.
        if (args == null || args.length < REQUIRED_ARG_COUNT) {
            throw new IllegalArgumentException(
                    "Usage: URLMiner <input-path> <output-path> <nutch-output-path>");
        }

        // Source: every input line arrives as a single field named "line".
        Tap<?, ?, ?> inTap = new Hfs(new TextLine(new Fields("line")), args[0]);

        Pipe assembly = new Pipe("url-miner");

        // JSONParser declares a single result field, "json".
        assembly = new Each(assembly, new JSONParser(new Fields("json")));

        // NOTE(review): GetUrl is defined elsewhere; presumably it emits the
        // "url" / "expanded_url" fields used below - confirm.
        assembly = new Each(assembly, new GetUrl(), Fields.ALL);

        // De-duplicate on "expanded_url" before the metadata step runs.
        assembly = new Unique(assembly, new Fields("expanded_url"));

        // NOTE(review): ExtractMetadata is defined elsewhere; presumably it adds
        // the "metadata" and "ends_at" fields the sinks select - confirm.
        assembly = new Each(assembly, new Fields("expanded_url"), new ExtractMetadata(), Fields.ALL);

        // Sink 1: the full record, tab-delimited, with the header flag set to true.
        Tap<?, ?, ?> outTap = new Hfs(
                new TextDelimited(new Fields("url", "expanded_url", "metadata", "ends_at"), true, "\t"),
                args[1]);

        // Sink 2: only "ends_at", header flag false. Going by the variable name this
        // is presumably a URL list meant for Nutch - confirm.
        Tap<?, ?, ?> nutchTap = new Hfs(
                new TextDelimited(new Fields("ends_at"), false, "\t"),
                args[2]);

        // A single tail can only bind one sink, so both outputs are wrapped in a MultiSinkTap.
        Tap<?, ?, ?> outputTap = new MultiSinkTap(outTap, nutchTap);

        flowDef = FlowDef.flowDef()
                .addSource(assembly, inTap)
                .addTailSink(assembly, outputTap);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment