@Shmuma · Created January 12, 2012
Trivial writer to estimate LZO performance
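// Trivial.java: a map-only MapReduce job that writes random (effectively
// incompressible) data through a compression codec, so the job's wall-clock
// time and output size approximate raw codec throughput.
//
// Usage sketch (illustrative, not part of the original gist; assumes a
// 0.20-era "hadoop jar" CLI with the hadoop-lzo jar and native LZO
// libraries installed on the cluster):
//
//   hadoop jar trivial.jar Trivial /tmp/lzo-bench-out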
import java.io.IOException;
import java.util.Date;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Trivial extends Configured implements Tool {

    /**
     * A custom input format that creates virtual inputs of a single string
     * for each map.
     */
    static class RandomInputFormat implements InputFormat<Text, Text> {
        /**
         * Generate the requested number of file splits, with the filename
         * set to the filename of the output file.
         */
        public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
            InputSplit[] result = new InputSplit[numSplits];
            Path outDir = FileOutputFormat.getOutputPath(job);
            for (int i = 0; i < result.length; ++i) {
                result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1,
                                          (String[]) null);
            }
            return result;
        }
        /**
         * Return a single record (filename, "") where the filename is taken
         * from the file split.
         */
        static class RandomRecordReader implements RecordReader<Text, Text> {
            Path name;

            public RandomRecordReader(Path p) {
                name = p;
            }

            public boolean next(Text key, Text value) {
                if (name != null) {
                    key.set(name.getName());
                    name = null;
                    return true;
                }
                return false;
            }

            public Text createKey() {
                return new Text();
            }

            public Text createValue() {
                return new Text();
            }

            public long getPos() {
                return 0;
            }

            public void close() {}

            public float getProgress() {
                return 0.0f;
            }
        }

        public RecordReader<Text, Text> getRecordReader(InputSplit split,
                                                        JobConf job,
                                                        Reporter reporter) throws IOException {
            return new RandomRecordReader(((FileSplit) split).getPath());
        }
    }
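
    /*
     * Note (added commentary): configure() below fills one 100-byte key and one
     * 10 MiB value with random bytes once per task; map() then emits that same
     * pair 1024 times, roughly 10 GiB per map. Random bytes are essentially
     * incompressible, so the run time reflects the codec's worst-case throughput;
     * re-enabling the commented zero-fill gives the best-case extreme instead.
     */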
    static class Map extends MapReduceBase
            implements Mapper<WritableComparable, Writable, BytesWritable, BytesWritable> {
        private Random random = new Random();
        private BytesWritable randomKey = new BytesWritable();
        private BytesWritable randomValue = new BytesWritable();

        public void map(WritableComparable key,
                        Writable value,
                        OutputCollector<BytesWritable, BytesWritable> output,
                        Reporter reporter) throws IOException {
            for (int i = 0; i < 1024; i++) {
                output.collect(randomKey, randomValue);
                reporter.setStatus("Record " + i + " generated");
            }
        }

        private void randomizeBytes(byte[] data, int offset, int length) {
            for (int i = offset + length - 1; i >= offset; --i) {
                // data[i] = (byte)'A';
                data[i] = (byte) random.nextInt(256); // 256, not 255: cover the full byte range
            }
        }

        @Override
        public void configure(JobConf job) {
            randomKey.setSize(100);
            randomValue.setSize(10 * 1024 * 1024);
            randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());
            randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());
            // Zero-fill toggle for a best-case compression run:
            // for (int i = 0; i < randomValue.getLength(); i++)
            //     randomValue.getBytes()[i] = 0;
        }
    }
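
    /*
     * Note (added commentary): the job runs with a single map and no reduces, so
     * the output is one LZO-compressed SequenceFile written directly by the map.
     * The commented job.set() lines are the knobs for comparing LzopCodec,
     * LzoCodec and SnappyCodec under the same workload.
     */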
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf());
        job.setBoolean("mapred.output.compress", true);
        job.set("mapred.output.compression.codec", "com.hadoop.compression.lzo.LzopCodec");
        // job.set("mapred.output.compression.codec", "com.hadoop.compression.lzo.LzoCodec");
        // job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

        job.setJarByClass(Trivial.class);
        job.setJobName("Trivial writer");
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setMapperClass(Map.class);
        job.setInputFormat(RandomInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        job.setNumMapTasks(1);
        System.out.println("Running 1 map.");
        job.setNumReduceTasks(0);

        Date startTime = new Date();
        System.out.println("Job started: " + startTime);
        JobClient.runJob(job);
        Date endTime = new Date();
        System.out.println("Job ended: " + endTime);
        System.out.println("The job took "
                + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new Trivial(), args);
        System.exit(res);
    }
}