/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.demo

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.SparkContext._
import org.apache.spark.tez.TezConstants
object MyPartitionByMulti {

  def main(args: Array[String]): Unit = {
    // Defaults: 64 reduce partitions over the TPC-H scale-100 lineitem table.
    var reducers = 64
    var inputFile = "/user/hive/external/tpch-100/lineitem"
    if (args != null && args.length > 0) {
      reducers = args(0).toInt
      if (args.length > 1) {
        inputFile = args(1)
      }
    }

    System.setProperty(TezConstants.UPDATE_CLASSPATH, "true")

    try {
      val jobName = "MyPartitionByMulti_" + System.nanoTime()
      val outputPath = jobName + "_out"
      val conf = new SparkConf
      conf.setAppName(jobName)
      val sc = new SparkContext(conf)

      // Key each pipe-delimited row by its 11th column (l_shipdate in the
      // TPC-H lineitem schema), hash-partition into `reducers` partitions,
      // and write the result with one output file per key.
      val source = sc.textFile(inputFile)
      source
        .map(a => (a.split('|')(10), a))
        .partitionBy(new HashPartitioner(reducers))
        .saveAsHadoopFile(outputPath, classOf[Text], classOf[Text], classOf[KeyPerPartitionOutputFormat])

      sc.stop()
      println("DONE")
    } catch {
      case e: Throwable => e.printStackTrace()
    }
  }
}
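
// The gist references KeyPerPartitionOutputFormat above but does not include
// its definition. A minimal sketch, assuming the standard
// MultipleTextOutputFormat idiom for writing one file per key: the key is
// suppressed from each output record and reused as the file name. The class
// name is taken from the saveAsHadoopFile call; the two overrides are the
// documented extension points inherited from
// org.apache.hadoop.mapred.lib.MultipleOutputFormat.
class KeyPerPartitionOutputFormat extends MultipleTextOutputFormat[Any, Any] {

  // Drop the key from the written record so only the value (the full
  // lineitem row) lands in the output file.
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()

  // Route each record to a file named after its key, so every distinct
  // l_shipdate value gets its own file under the job's output path.
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key.asInstanceOf[String]
}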