# imfht/pyspark_cos_qcloud_example.py

## pyspark_cos_qcloud_example.py
import os

from pyspark import SparkContext

# Word-count example that reads its input from Tencent Cloud COS through the
# hadoop-cos ("cosn://") file system connector.
#
# Prerequisite: download hadoop-cos-x.x.x-shaded.jar from
# https://github.com/tencentyun/hadoop-cos. The jar path below is relative to
# the current working directory.
#
# The jar is handed to spark-submit through `--jars`; a bare jar path would be
# parsed as the primary application resource instead of being added to the
# classpath. The variable has to be set before the SparkContext is created,
# because creating the context is what launches the JVM with these arguments.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--jars ./hadoop-cos-2.8.5-shaded.jar pyspark-shell'
)

# The constructor already yields the active context, so no extra
# `.getOrCreate()` call is needed on the new instance.
sc = SparkContext(appName="wordCount")

# Basic COS configuration; more options are listed at
# https://cloud.tencent.com/document/product/436/6884
# The values "ak" / "sk" are placeholders: replace them with real credentials
# (preferably loaded from the environment rather than committed to source).
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.cosn.userinfo.secretId", "ak")  # access key (SecretId)
hadoop_conf.set("fs.cosn.userinfo.secretKey", "sk")  # secret key (SecretKey)
hadoop_conf.set("fs.cosn.bucket.region", "ap-guangzhou")  # bucket region
# Register the file system implementation that handles the cosn:// scheme.
hadoop_conf.set("fs.cosn.impl", "org.apache.hadoop.fs.CosFileSystem")

# Create an RDD of lines from a COS object. "ap_name/filename" is a
# placeholder -- presumably <bucket>/<object key>; adjust to your bucket.
text_file = sc.textFile("cosn://ap_name/filename")

# Word count: split each line on single spaces, pair every token with 1 and
# sum the ones per token.
words = text_file.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
counts = pairs.reduceByKey(lambda a, b: a + b)

# Write the (word, count) pairs as text. "anywhere" is a placeholder output
# path; the write is what actually triggers the computation above.
counts.saveAsTextFile("anywhere")