risyomei/SparkPairedRDD.scala

## SparkPairedRDD.scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
 * Benchmark job that measures how long a keyed lookup against a large dictionary takes.
 *
 * The job reads a headerless two-column CSV from HDFS as the dictionary, caches it, draws a
 * sample of its rows as the lookup workload, and joins that sample back to the dictionary
 * through Spark SQL. A one-row frame carrying the current timestamp is printed right before
 * and right after the join so the elapsed time can be read from the driver log.
 */
object SparkPairedRDD {

  /**
   * Entry point of the benchmark.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName("Spark Paired")
      .getOrCreate()
    import spark.implicits._

    try {
      // Single-row frame used only to print a timestamp around the measured query.
      val df = Seq(1).toDF("seq")

      // DataFrames are immutable: repartition returns a NEW DataFrame. The repartitioned
      // frame must therefore be the one that is persisted, otherwise the call has no effect.
      val cacheDF = spark.read
        .option("header", false)
        .csv("hdfs://dev01/tmp/dict.txt")
        .toDF("key", "value")
        .repartition(64)
        .persist()

      // Lookup workload: a seeded sample of the dictionary, capped at 201600 rows.
      // It is not persisted, so each action below evaluates the sample again.
      val targetDF = cacheDF.sample(false, 0.8, 7).limit(201600)
      cacheDF.createOrReplaceTempView("master")
      targetDF.createOrReplaceTempView("tgt_keys")

      // count() also materializes the dictionary cache before the timed section starts.
      println("Master Dictionary Record Count: " + cacheDF.count())
      println("Target Record Count: " + targetDF.count)

      // withColumn already names the column, so no extra alias is needed.
      df.withColumn("current_timestamp", current_timestamp()).show(false)
      spark.sql("select master.key,master.value from master,tgt_keys where master.key = tgt_keys.key").show(201600)
      df.withColumn("current_timestamp", current_timestamp()).show(false)
      println("Complete")
    } finally {
      // Release the application's resources even when the job fails.
      spark.stop()
    }
  }
}

## 说明.md

      
    Raw
  

              说明.md
            
          
    数据长这样
# head /Users/lxm/Desktop/dict.txt
100000,100001
100001,100002
100002,100003
100003,100004
100004,100005
100005,100006
100006,100007
100007,100008
100008,100009
100009,100010

提交命令
spark-submit --master yarn --deploy-mode cluster  --num-executors 16 --executor-cores 4 --executor-memory 32g  --driver-memory 32g spark-paired-rdd_2.11-1.0.jar

检查Record数量
201600 = 60秒 × 56消息/秒 × 60次查询/消息
yarn logs -applicationId application_1645008902323_0091  | grep -C2 "Record Count"

...(省略)...

Master Dictionary Record Count: 12010000
Target Record Count: 201600

...(省略)...

检查使用时间 约为8秒
yarn logs -applicationId application_1645008902323_0091  | grep -C2 current_timestamp

...(省略)...

+---+-----------------------+
|seq|current_timestamp      |
+---+-----------------------+
|1  |2022-02-24 02:50:17.143|
--

+---+-----------------------+
|seq|current_timestamp      |
+---+-----------------------+
|1  |2022-02-24 02:50:25.408|

...(省略)...


也就是说，当字典大小为 12010000 条时，查询 201600 条记录大约需要 8 秒。
附录


| 字典大小 | 查询数 | 处理查询用时 |
| --- | --- | --- |
| 12010000 | 201600 (一分钟查询量) | 约8秒 |
| 12010000 | 33600 (10秒钟查询量) | 约6秒 |
| 12010000 | 3360 (1秒钟查询量) | 约6秒 |

	import org.apache.spark.sql.SparkSession
	import org.apache.spark.sql.functions._

	/**
	 * Benchmark job that measures how long a keyed lookup against a large dictionary takes.
	 *
	 * The job reads a headerless two-column CSV from HDFS as the dictionary, caches it, draws a
	 * sample of its rows as the lookup workload, and joins that sample back to the dictionary
	 * through Spark SQL. A one-row frame carrying the current timestamp is printed right before
	 * and right after the join so the elapsed time can be read from the driver log.
	 */
	object SparkPairedRDD {

	  /**
	   * Entry point of the benchmark.
	   *
	   * @param args command-line arguments (not used)
	   */
	  def main(args: Array[String]): Unit = {

	    val spark = SparkSession
	      .builder()
	      .appName("Spark Paired")
	      .getOrCreate()
	    import spark.implicits._

	    try {
	      // Single-row frame used only to print a timestamp around the measured query.
	      val df = Seq(1).toDF("seq")

	      // DataFrames are immutable: repartition returns a NEW DataFrame. The repartitioned
	      // frame must therefore be the one that is persisted, otherwise the call has no effect.
	      val cacheDF = spark.read
	        .option("header", false)
	        .csv("hdfs://dev01/tmp/dict.txt")
	        .toDF("key", "value")
	        .repartition(64)
	        .persist()

	      // Lookup workload: a seeded sample of the dictionary, capped at 201600 rows.
	      // It is not persisted, so each action below evaluates the sample again.
	      val targetDF = cacheDF.sample(false, 0.8, 7).limit(201600)
	      cacheDF.createOrReplaceTempView("master")
	      targetDF.createOrReplaceTempView("tgt_keys")

	      // count() also materializes the dictionary cache before the timed section starts.
	      println("Master Dictionary Record Count: " + cacheDF.count())
	      println("Target Record Count: " + targetDF.count)

	      // withColumn already names the column, so no extra alias is needed.
	      df.withColumn("current_timestamp", current_timestamp()).show(false)
	      spark.sql("select master.key,master.value from master,tgt_keys where master.key = tgt_keys.key").show(201600)
	      df.withColumn("current_timestamp", current_timestamp()).show(false)
	      println("Complete")
	    } finally {
	      // Release the application's resources even when the job fails.
	      spark.stop()
	    }
	  }
	}
字典大小	查询数	处理查询用时
12010000	201600 (一分钟查询量)	约8秒
12010000	33600 (10秒钟查询量)	约6秒
12010000	3360 (1秒钟查询量)	约6秒