@Shmuma · Created January 12, 2012
Trivial writer to estimate LZO performance
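// Trivial.java: a map-only MapReduce job that writes random (effectively
// incompressible) data through a compression codec, so the job's wall-clock
// time and output size approximate raw codec throughput.
//
// Usage sketch (illustrative, not part of the original gist; assumes a
// 0.20-era "hadoop jar" CLI with the hadoop-lzo jar and native LZO
// libraries installed on the cluster):
//
//   hadoop jar trivial.jar Trivial /tmp/lzo-bench-out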
import java.io.IOException;
import java.util.Date;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Trivial extends Configured implements Tool {

    /**
     * A custom input format that creates virtual inputs of a single string
     * for each map.
     */
    static class RandomInputFormat implements InputFormat<Text, Text> {
        /**
         * Generate the requested number of file splits, with the filename
         * set to the filename of the output file.
         */
        public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
            InputSplit[] result = new InputSplit[numSplits];
            Path outDir = FileOutputFormat.getOutputPath(job);
            for (int i = 0; i < result.length; ++i) {
                result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1,
                                          (String[]) null);
            }
            return result;
        }
        /**
         * Return a single record (filename, "") where the filename is taken
         * from the file split.
         */
        static class RandomRecordReader implements RecordReader<Text, Text> {
            Path name;

            public RandomRecordReader(Path p) {
                name = p;
            }

            public boolean next(Text key, Text value) {
                if (name != null) {
                    key.set(name.getName());
                    name = null;
                    return true;
                }
                return false;
            }

            public Text createKey() {
                return new Text();
            }

            public Text createValue() {
                return new Text();
            }

            public long getPos() {
                return 0;
            }

            public void close() {}

            public float getProgress() {
                return 0.0f;
            }
        }

        public RecordReader<Text, Text> getRecordReader(InputSplit split,
                                                        JobConf job,
                                                        Reporter reporter) throws IOException {
            return new RandomRecordReader(((FileSplit) split).getPath());
        }
    }
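
    /*
     * Note (added commentary): configure() below fills one 100-byte key and one
     * 10 MiB value with random bytes once per task; map() then emits that same
     * pair 1024 times, roughly 10 GiB per map. Random bytes are essentially
     * incompressible, so the run time reflects the codec's worst-case throughput;
     * re-enabling the commented zero-fill gives the best-case extreme instead.
     */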
    static class Map extends MapReduceBase
            implements Mapper<WritableComparable, Writable, BytesWritable, BytesWritable> {
        private Random random = new Random();
        private BytesWritable randomKey = new BytesWritable();
        private BytesWritable randomValue = new BytesWritable();

        public void map(WritableComparable key,
                        Writable value,
                        OutputCollector<BytesWritable, BytesWritable> output,
                        Reporter reporter) throws IOException {
            for (int i = 0; i < 1024; i++) {
                output.collect(randomKey, randomValue);
                reporter.setStatus("Record " + i + " generated");
            }
        }

        private void randomizeBytes(byte[] data, int offset, int length) {
            for (int i = offset + length - 1; i >= offset; --i) {
                // data[i] = (byte)'A';
                data[i] = (byte) random.nextInt(256); // 256, not 255: cover the full byte range
            }
        }

        @Override
        public void configure(JobConf job) {
            randomKey.setSize(100);
            randomValue.setSize(10 * 1024 * 1024);
            randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());
            randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());
            // Zero-fill toggle for a best-case compression run:
            // for (int i = 0; i < randomValue.getLength(); i++)
            //     randomValue.getBytes()[i] = 0;
        }
    }
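
    /*
     * Note (added commentary): the job runs with a single map and no reduces, so
     * the output is one LZO-compressed SequenceFile written directly by the map.
     * The commented job.set() lines are the knobs for comparing LzopCodec,
     * LzoCodec and SnappyCodec under the same workload.
     */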
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf());
        job.setBoolean("mapred.output.compress", true);
        job.set("mapred.output.compression.codec", "com.hadoop.compression.lzo.LzopCodec");
        // job.set("mapred.output.compression.codec", "com.hadoop.compression.lzo.LzoCodec");
        // job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

        job.setJarByClass(Trivial.class);
        job.setJobName("Trivial writer");
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setMapperClass(Map.class);
        job.setInputFormat(RandomInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        job.setNumMapTasks(1);
        System.out.println("Running 1 map.");
        job.setNumReduceTasks(0);

        Date startTime = new Date();
        System.out.println("Job started: " + startTime);
        JobClient.runJob(job);
        Date endTime = new Date();
        System.out.println("Job ended: " + endTime);
        System.out.println("The job took "
                + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new Trivial(), args);
        System.exit(res);
    }
}