Skip to content

Instantly share code, notes, and snippets.

@seregasheypak
seregasheypak / gist:5391266
Created April 15, 2013 21:01
A base test class
package ru.develbureau.mrtesting.mapreduce.minimrcluster;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapreduce.Job;
@seregasheypak
seregasheypak / gist:5391341
Created April 15, 2013 21:13
A concrete class for Apache log parsing mapper, combiner, reducer
package ru.develbureau.mrtesting.mapreduce.itest;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.annotations.BeforeMethod;
@seregasheypak
seregasheypak / gist:5425807
Created April 20, 2013 12:18
MapReduceDriver for unit-testing
package ru.develbureau.mrtesting.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import ru.develbureau.mrtesting.model.LoggedRequest;
import ru.develbureau.mrtesting.parser.ApacheLogParser;
package ru.develbureau.mrtesting.sequencefile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;
import lombok.Builder;
import lombok.Data;
import lombok.NonNull;
// Lombok @Data generates getters/setters, equals/hashCode and toString
// for the fields below (fragment: the closing brace is outside this view).
@Data
public class Visitor {
// unique identifier of the visitor
private String visitorId;
// id of the site the visit belongs to
private Integer siteId;
// visit timestamp — presumably epoch millis; TODO confirm units against writers
private Long visitTs;
// Create a Cassandra (DataStax driver) Cluster from connectionSettings:
// register every configured contact point, then apply pooling options
// only when a minimum pool size was explicitly configured.
// (Fragment: the build()/return part of this method is outside this view.)
private Cluster buildCluster(){
Cluster.Builder builder = Cluster.builder();
connectionSettings.getCassandraAddresses().forEach(builder::addContactPoint);
// pooling options are optional — skipped entirely when minPoolSize is unset
if(connectionSettings.getMinPoolSize() !=null){
builder.withPoolingOptions(createPoolingOptions());
}
// Typical put implementation: atomic insert-if-absent via HBase checkAndPut.
// NOTE(review): hTable is never closed in the visible fragment — a
// finally { hTable.close() } is expected past this view; confirm, otherwise
// this leaks a table handle on every call.
public boolean put(SomeBean bean){
HTableInterface hTable = null;
try {
//Controller holds reference to HConnection.
//It's created once during servlet.init in servlet and shared across several controllers, just an object ref
HConnection hConnection = getConnection();
hTable = hConnection.getTable(getName());
// checkAndPut with expected value null: the put is applied only when the
// cell CF_B:CQ_B does not yet exist, so the return value signals
// "row was newly inserted" vs "already present".
return hTable.checkAndPut(createKey(bean), CF_B, CQ_B, null, createPut(bean));
}
@seregasheypak
seregasheypak / group.scala
Created August 31, 2015 09:19
group and iterate over group values
/**
 * Groups the pipe by 'naturalKey and tags every row in a group with one
 * freshly generated surrogate key shared by the whole group.
 *
 * Bug fixed: `items` is a single-pass Iterator. The original debug
 * `println(items.toList)` exhausted it, so the following
 * `items.map(...).toList` always produced an EMPTY list and the stream
 * emitted nothing. Materialize the iterator exactly once and reuse the list.
 */
def groupAndGenerateNewSurrogateKey: Pipe = {
pipe.groupBy('naturalKey){ group =>
group.mapStream[Long, (Long, Long)]('someValueField -> ('someValueField, 'newSurrogateKey)) { items: Iterator[Long] =>
// one surrogate key per group
val newSurrogateKey = KeyGen.generate()
// consume the iterator ONCE — safe to traverse the List repeatedly
val itemList = items.toList
println(s"new group key:[$newSurrogateKey]") //outputs generated key
println(s"items: $itemList") //correctly outputs grouped items
itemList.map((_, newSurrogateKey))
}
}.project('someValueField, 'newSurrogateKey)
}
// A builder with single spout and bolt. Spout could emit tuples using two streams:
// user events: fieldsGrouping by userId, streamId = UserEvent.class.getSimpleName()
// game events: broadcast to every bolt instance (allGrouping), streamId = GameEvent.class.getSimpleName()
// (Fragment: statement terminator for setSpout and the enclosing method are outside this view.)
builder.setSpout(GenericEventSpout.class.getSimpleName(), new GenericEventSpout(), 2)
builder.setBolt(UserContextBolt.class.getSimpleName(), new UserContextBolt(), 2)
.addConfigurations(env)
// same-user tuples always reach the same bolt task — keyed by Const.userId
.fieldsGrouping(GenericEventSpout.class.getSimpleName(), UserEvent.class.getSimpleName(), new Fields(Const.userId.name()))
// game events are relevant to every task, hence allGrouping
.allGrouping(GenericEventSpout.class.getSimpleName(), GameEvent.class.getSimpleName());
// Here are stream declarations in Spout:
class UntarOperator(BashOperator):
"""
Download and unpack artifact
:param artifact_tar_gz_name: name of the artifact from the previous download step
:type artifact_tar_gz_name: string
:param lookup_task_id: id of the task that downloaded the artifact
:type lookup_task_id: string
:param password