Skip to content

Instantly share code, notes, and snippets.

@imfht
Created April 14, 2020 01:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save imfht/63c878208502bf088c35b59cdc203a05 to your computer and use it in GitHub Desktop.
Save imfht/63c878208502bf088c35b59cdc203a05 to your computer and use it in GitHub Desktop.
An example showing how to use Tencent COS with PySpark
import os
from pyspark import SparkContext
# Word count over a text file stored in Tencent Cloud Object Storage (COS),
# read through the hadoop-cos connector's ``cosn://`` filesystem scheme.
#
# Prerequisite: download hadoop-cos-x.x.x-shaded.jar from
# https://github.com/tencentyun/hadoop-cos and place it next to this script.

# PYSPARK_SUBMIT_ARGS has to be set before the SparkContext is created: the
# arguments are handed to spark-submit when the JVM is launched.  The
# connector jar must be passed with --jars so it lands on the driver and
# executor classpaths; a bare jar path would be interpreted as the primary
# application resource instead.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--jars ./hadoop-cos-2.8.5-shaded.jar pyspark-shell'
)

# The constructor already creates (and registers) the context, so no extra
# getOrCreate() call is needed.
sc = SparkContext(appName="wordCount")

# Basic connector configuration; more options are listed at
# https://cloud.tencent.com/document/product/436/6884
# "ak" / "sk" are placeholders -- substitute real credentials at deploy time
# and keep them out of version control.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.cosn.userinfo.secretId", "ak")  # access key id
hadoop_conf.set("fs.cosn.userinfo.secretKey", "sk")  # secret key
hadoop_conf.set("fs.cosn.bucket.region", "ap-guangzhou")  # bucket region
# Register the implementation class that backs the cosn:// scheme.
hadoop_conf.set("fs.cosn.impl", "org.apache.hadoop.fs.CosFileSystem")

# Create an RDD of lines from a COS object (cosn://<bucket>/<key>).
text_file = sc.textFile("cosn://ap_name/filename")

# Classic word count: split every line into words, pair each word with 1,
# then sum the ones per word.
# NOTE: split(" ") produces empty-string tokens for consecutive spaces, so
# "" may show up as a counted word.
counts = (
    text_file.flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

# Persist the (word, count) pairs as text files under the given path.
counts.saveAsTextFile("anywhere")

if __name__ == '__main__':
    # All work above runs at module level; the guard is kept as a no-op.
    pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment