/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.demo

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.SparkContext._
import org.apache.spark.tez.TezConstants
object MyPartitionByMulti {

  def main(args: Array[String]): Unit = {
    // Defaults: 64 reduce partitions over the TPC-H scale-100 lineitem table.
    var reducers = 64
    var inputFile = "/user/hive/external/tpch-100/lineitem"
    if (args != null && args.length > 0) {
      reducers = args(0).toInt
      if (args.length > 1) {
        inputFile = args(1)
      }
    }

    System.setProperty(TezConstants.UPDATE_CLASSPATH, "true")

    try {
      val jobName = "MyPartitionByMulti_" + System.nanoTime()
      val outputPath = jobName + "_out"
      val conf = new SparkConf
      conf.setAppName(jobName)
      val sc = new SparkContext(conf)

      // Key each pipe-delimited row by its 11th column (l_shipdate in the
      // TPC-H lineitem schema), hash-partition into `reducers` partitions,
      // and write the result with one output file per key.
      val source = sc.textFile(inputFile)
      source
        .map(a => (a.split('|')(10), a))
        .partitionBy(new HashPartitioner(reducers))
        .saveAsHadoopFile(outputPath, classOf[Text], classOf[Text], classOf[KeyPerPartitionOutputFormat])

      sc.stop()
      println("DONE")
    } catch {
      case e: Throwable => e.printStackTrace()
    }
  }
}
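
// The gist references KeyPerPartitionOutputFormat above but does not include
// its definition. A minimal sketch, assuming the standard
// MultipleTextOutputFormat idiom for writing one file per key: the key is
// suppressed from each output record and reused as the file name. The class
// name is taken from the saveAsHadoopFile call; the two overrides are the
// documented extension points inherited from
// org.apache.hadoop.mapred.lib.MultipleOutputFormat.
class KeyPerPartitionOutputFormat extends MultipleTextOutputFormat[Any, Any] {

  // Drop the key from the written record so only the value (the full
  // lineitem row) lands in the output file.
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()

  // Route each record to a file named after its key, so every distinct
  // l_shipdate value gets its own file under the job's output path.
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key.asInstanceOf[String]
}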