Hadoop Sequence File - a sample program to create a sequence file (compressed and uncompressed) from a text file, and another to read the sequence file back.
This gist demonstrates how to create a sequence file (compressed and uncompressed) from a text file, and how to read it back.
Includes:
---------
1. Input data and script download
2. Input data-review
3. Data load commands
4. Mapper code
5. Driver code to create the sequence file out of a text file in HDFS
6. Command to run Java program
7. Results of the program run to create sequence file
8. Java program to read a sequence file, and convert to text file
9. Command to run program from #8, with results
10. Note on creating compressed sequence files
11. Driver code to create a compressed sequence file
12. Command to run program in #11 with results
01. Data and code download
-----------------------------
Google:
<<To be added>>
Email me at airawat.blog@gmail.com if you encounter any issues
GitHub:
<<To be added>>
Directory structure
-------------------
formatProject
   data
      departments_sorted
         part-m-00000
   formatConverterTextToSequence
      src
         FormatConverterMapper.java
         FormatConverterTextToSequenceDriver.java
         FormatConverterSequenceToTextDriver.java
         FormatConverterTextToBlckCompSequenceDriver.java
      jars
         formatConverterTextToSequence.jar
         formatConverterSequenceToText.jar
         formatConverterTextToBlkCompSequence.jar
**************************************************
Input text file - departments_sorted/part-m-00000
**************************************************
$ more formatProject/data/departments_sorted/part-m-00000
d001 Marketing
d002 Finance
d003 Human Resources
d004 Production
d005 Development
d006 Quality Management
d007 Sales
d008 Research
d009 Customer Service
**********************************************
HDFS load commands
**********************************************
# Load data
$ hadoop fs -put formatProject/ formatProject/
# Remove unnecessary files
$ hadoop fs -rm -R formatProject/formatConverterTextToSequence/
$ hadoop fs -rm -R formatProject/formatConverterTextToMap/
/*********************************************************************************************************
** Mapper
** formatProject/FormatConverterTextToSequence/src/FormatConverterMapper.java
** Reads text file and emits the contents out as key-value pairs
*********************************************************************************************************/
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Identity-style mapper: emits each line's byte offset as the key
// and the line contents as the value, unchanged
public class FormatConverterMapper extends
    Mapper<LongWritable, Text, LongWritable, Text> {

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    context.write(key, value);
  }
}
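To sanity-check the mapper without a cluster, it can be unit tested with MRUnit. The sketch below is illustrative and not part of the original gist (it assumes MRUnit and JUnit are on the classpath; the test class name is made up).

/*********************************************************************************************************
** Illustrative only - MRUnit test for FormatConverterMapper
*********************************************************************************************************/
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class FormatConverterMapperTest {

  @Test
  public void mapperEmitsInputUnchanged() throws Exception {
    // The mapper should pass the byte offset and line through untouched
    MapDriver.newMapDriver(new FormatConverterMapper())
        .withInput(new LongWritable(0), new Text("d001\tMarketing"))
        .withOutput(new LongWritable(0), new Text("d001\tMarketing"))
        .runTest();
  }
}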
/*********************************************************************************************************
** Driver
** formatProject/FormatConverterTextToSequence/src/FormatConverterTextToSequenceDriver.java
*********************************************************************************************************/
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FormatConverterTextToSequenceDriver extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.printf("Two parameters are required for FormatConverterTextToSequenceDriver - <input dir> <output dir>\n");
      return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(FormatConverterTextToSequenceDriver.class);
    job.setJobName("Create Sequence File, from text file");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(FormatConverterMapper.class);
    // Write the map output directly as a sequence file
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Map-only job; no reduce phase needed
    job.setNumReduceTasks(0);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new FormatConverterTextToSequenceDriver(), args);
    System.exit(exitCode);
  }
}
************************************************
**Command to create sequence file from text file
************************************************
$ hadoop jar formatProject/formatConverterTextToSequence/jars/formatConverterTextToSequence.jar FormatConverterTextToSequenceDriver formatProject/data/departments_sorted/part-m-00000 formatProject/data/departments_sequence
...
$ hadoop fs -ls -R formatProject/data/departments_sequence | awk '{print $8}'
formatProject/data/departments_sequence/_SUCCESS
formatProject/data/departments_sequence/_logs
formatProject/data/departments_sequence/_logs/history
formatProject/data/departments_sequence/_logs/history/cdh-jt01_1376335706356_job_201308121428_0116_conf.xml
formatProject/data/departments_sequence/_logs/history/job_201308121428_0116_1379087496898_akhanolk_Create+Sequence+File%2C+from+text+file
formatProject/data/departments_sequence/part-m-00000
************************************************
**Results
************************************************
$ hadoop fs -text formatProject/data/departments_sequence/part-m-00000
0 d001 Marketing
15 d002 Finance
28 d003 Human Resources
49 d004 Production
65 d005 Development
82 d006 Quality Management
106 d007 Sales
117 d008 Research
131 d009 Customer Service
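The next section converts the sequence file back to text with a map-only job; for a quick look without launching a job, a sequence file can also be read directly with SequenceFile.Reader. A minimal sketch, not part of the original gist (the class name is illustrative):

/*********************************************************************************************************
** Illustrative only - standalone sequence file reader, similar to "hadoop fs -text"
*********************************************************************************************************/
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileDump {

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader = null;
    try {
      reader = new SequenceFile.Reader(fs, new Path(args[0]), conf);
      // The key and value classes are recorded in the sequence file header
      Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
      while (reader.next(key, value)) {
        System.out.println(key + "\t" + value);
      }
    } finally {
      if (reader != null) {
        reader.close();
      }
    }
  }
}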
/*********************************************************************************************************
** Driver
** formatProject/FormatConverterTextToSequence/src/FormatConverterSequenceToTextDriver.java
*********************************************************************************************************/
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FormatConverterSequenceToTextDriver extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.printf("Two parameters need to be supplied - <input dir> and <output dir>\n");
      return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(FormatConverterSequenceToTextDriver.class);
    job.setJobName("Convert Sequence File and Output as Text");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Read the sequence file; the default TextOutputFormat writes plain text
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(FormatConverterMapper.class);

    // Map-only job; no reduce phase needed
    job.setNumReduceTasks(0);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new FormatConverterSequenceToTextDriver(), args);
    System.exit(exitCode);
  }
}
**************************************************************
**Command to create text file from sequence file & results
**************************************************************
$ hadoop jar formatProject/formatConverterTextToSequence/jars/formatConverterSequenceToText.jar FormatConverterSequenceToTextDriver formatProject/data/departments_sequence/part-m-00000 formatProject/data/departments_text
$ hadoop fs -ls -R formatProject/data/departments_text | awk '{print $8}'
formatProject/data/departments_text/_SUCCESS
formatProject/data/departments_text/_logs
formatProject/data/departments_text/_logs/history
formatProject/data/departments_text/_logs/history/cdh-jt01_1376335706356_job_201308121428_0118_conf.xml
formatProject/data/departments_text/_logs/history/job_201308121428_0118_1379089420495_akhanolk_Convert+Sequence+File+and+Output+as+Text
formatProject/data/departments_text/part-m-00000
$ hadoop fs -cat formatProject/data/departments_text/part-m-00000
0 d001 Marketing
15 d002 Finance
28 d003 Human Resources
49 d004 Production
65 d005 Development
82 d006 Quality Management
106 d007 Sales
117 d008 Research
131 d009 Customer Service
**************************************************************
** Compression and sequence files
**************************************************************
To create a compressed sequence file (block compression is the recommended option), only minor additions to the driver code are needed [formatProject/FormatConverterTextToSequence/src/FormatConverterTextToSequenceDriver.java].
The sample code here uses SnappyCodec and block compression:
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
SequenceFileOutputFormat.setOutputCompressionType(job,CompressionType.BLOCK);
The next section includes the code.
/*************************************************************************************************************
** Driver
** formatProject/FormatConverterTextToSequence/src/FormatConverterTextToBlckCompSequenceDriver.java
*************************************************************************************************************/
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FormatConverterTextToBlckCompSequenceDriver extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.printf("Two parameters are required for FormatConverterTextToBlckCompSequenceDriver - <input dir> <output dir>\n");
      return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(FormatConverterTextToBlckCompSequenceDriver.class);
    job.setJobName("Create block compressed Sequence File, from text file");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Compress the output with Snappy
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

    job.setMapperClass(FormatConverterMapper.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // Block compression: batches of records are compressed together,
    // which typically compresses better than per-record compression
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    // Map-only job; no reduce phase needed
    job.setNumReduceTasks(0);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new FormatConverterTextToBlckCompSequenceDriver(), args);
    System.exit(exitCode);
  }
}
*************************************************************************************
**Command to create block compressed(snappy) sequence file from text file + output
*************************************************************************************
$ hadoop jar formatProject/formatConverterTextToSequence/jars/formatConverterTextToBlkCompSequence.jar FormatConverterTextToBlckCompSequenceDriver formatProject/data/departments_sorted/part-m-00000 formatProject/data/departments_sequence_blckcmp
.
$ hadoop fs -ls -R formatProject/data/departments_sequence_blckcmp | awk '{print $8}'
formatProject/data/departments_sequence_blckcmp/_SUCCESS
formatProject/data/departments_sequence_blckcmp/_logs
formatProject/data/departments_sequence_blckcmp/_logs/history
formatProject/data/departments_sequence_blckcmp/_logs/history/cdh-jt01_1376335706356_job_201308121428_0120_conf.xml
formatProject/data/departments_sequence_blckcmp/_logs/history/job_201308121428_0120_1379091181653_akhanolk_Create+block+compressed+Sequence+File%2C+from+text+f
formatProject/data/departments_sequence_blckcmp/part-m-00000
$ hadoop fs -text formatProject/data/departments_sequence_blckcmp/part-m-00000
13/09/13 11:55:38 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
13/09/13 11:55:38 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
13/09/13 11:55:38 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
13/09/13 11:55:38 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
0 d001 Marketing
15 d002 Finance
28 d003 Human Resources
49 d004 Production
65 d005 Development
82 d006 Quality Management
106 d007 Sales
117 d008 Research
131 d009 Customer Service
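Beyond hadoop fs -text, the compression settings are recorded in the sequence file header and can be verified programmatically. A small sketch, not part of the original gist (the class name is illustrative):

/*********************************************************************************************************
** Illustrative only - verify the compression settings recorded in a sequence file header
*********************************************************************************************************/
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

public class SequenceFileCompressionCheck {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(args[0]), conf);
    try {
      // Expect true and SnappyCodec for the file created above
      System.out.println("Block compressed: " + reader.isBlockCompressed());
      System.out.println("Codec: " + (reader.getCompressionCodec() == null
          ? "none" : reader.getCompressionCodec().getClass().getName()));
    } finally {
      reader.close();
    }
  }
}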
egagan commented Jan 9, 2015

How do I remove the offset number at the start of each line of the sequence file / converted text file (the 0, 15, 28, 49, 65, 82, ... values)?
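One possible approach, sketched here as an answer (not from the original thread): have the mapper emit NullWritable as its output key, so only the line contents are written - TextOutputFormat skips null keys. The class name below is illustrative, and the driver would also need job.setOutputKeyClass(NullWritable.class) and job.setOutputValueClass(Text.class).

/*********************************************************************************************************
** Illustrative only - mapper variant that drops the byte-offset key
*********************************************************************************************************/
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FormatConverterValueOnlyMapper extends
    Mapper<LongWritable, Text, NullWritable, Text> {

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // Discard the offset key; write only the line contents
    context.write(NullWritable.get(), value);
  }
}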
