Skip to content

Instantly share code, notes, and snippets.

@hangingman
Created November 13, 2018 13:27
Show Gist options
  • Save hangingman/1818368f9ad6fb3f51c7719c1cd858d9 to your computer and use it in GitHub Desktop.
Save hangingman/1818368f9ad6fb3f51c7719c1cd858d9 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert a local Shift-JIS text file into a UTF-8 CSV with Spark.

Every line of ``./sample-sjis.txt`` becomes one row of a single-column
(``title``) DataFrame, which is then written to ``sample-utf8.csv``.
"""
import io
import sys
from pprint import pprint

from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SparkSession

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8 so that
    # printing the decoded Japanese text does not fail. ``reload`` and
    # ``sys.setdefaultencoding`` are not available like this on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

path = './sample-sjis.txt'

# Decode once at the I/O boundary and drop the line terminators so they do
# not end up inside the 'title' values.
with io.open(path, encoding='shift-jis') as f:
    str_list = [line.rstrip('\r\n') for line in f]

sc = SparkContext()
try:
    # The session object itself is not used below, but creating it is what
    # makes ``RDD.toDF()`` available.
    spark = SparkSession(sc)

    pprint(str_list)

    row = Row('title')
    rdd = sc.parallelize(str_list)
    df = rdd.map(row).toDF()
    df.show()

    # Write the result out. The save mode is set with ``mode()``; it is not a
    # writer option. With 'overwrite' Spark replaces an existing output
    # directory itself, so no manual rmtree (which failed on the very first
    # run, when the directory did not exist yet) is needed.
    df.select('title') \
        .coalesce(1) \
        .write \
        .format('com.databricks.spark.csv') \
        .mode('overwrite') \
        .save('sample-utf8.csv')
finally:
    # Always release the Spark context, even when a step above fails.
    sc.stop()
@hangingman
Copy link
Author

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert a Shift-JIS text object stored in S3 into a UTF-8 CSV with Spark.

The object ``sample-sjis.txt`` is fetched from the bucket
``spark.freestylewiki.xyz`` with boto3, every line becomes one row of a
single-column (``title``) DataFrame, and the result is written back to the
same bucket as ``sample-utf8.csv``.
"""

import sys
from pprint import pprint

import boto3
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SparkSession

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8 so that
    # printing the decoded Japanese text does not fail. ``reload`` and
    # ``sys.setdefaultencoding`` are not available like this on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

s3 = boto3.client('s3')
response = s3.get_object(Bucket='spark.freestylewiki.xyz', Key='sample-sjis.txt')

# The body is one big byte string. Decode it once, then split it into lines:
# parallelizing the undecomposed string would yield one row per *character*
# instead of one row per line.
str_list = response['Body'].read().decode('shift-jis').splitlines()

sc = SparkContext()
try:
    # The session object itself is not used below, but creating it is what
    # makes ``RDD.toDF()`` available.
    spark = SparkSession(sc)

    pprint(str_list)

    row = Row('title')
    rdd = sc.parallelize(str_list)
    df = rdd.map(row).toDF()

    df.show()

    # Write the result out. The save mode is set with ``mode()``; it is not a
    # writer option, so the previous ``option("overwrite", "true")`` had no
    # effect and a rerun failed because the output already existed.
    df.select('title') \
      .coalesce(1) \
      .write \
      .format('com.databricks.spark.csv') \
      .mode('overwrite') \
      .save('s3://spark.freestylewiki.xyz/sample-utf8.csv')
finally:
    # Always release the Spark context, even when a step above fails.
    sc.stop()

@hangingman
Copy link
Author

databricksだけで完結

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert a Shift-JIS text file into a UTF-8 CSV using only the CSV source.

``sample-sjis.txt`` is read through the ``com.databricks.spark.csv`` data
source with the ``charset`` option set to Shift-JIS, its first column is
renamed to ``title``, and that column is written to ``sample-utf8.csv``.
"""

import sys
from pprint import pprint

from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SparkSession

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8 so that
    # printing the decoded Japanese text does not fail. ``reload`` and
    # ``sys.setdefaultencoding`` are not available like this on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

sc = SparkContext()
try:
    spark = SparkSession(sc)

    # No header row and no schema inference: the input is read as plain
    # string columns named _c0, _c1, ... split on tabs.
    df = spark.read \
         .format('com.databricks.spark.csv') \
         .option('header', 'false') \
         .option('inferschema', 'false') \
         .option('charset', 'shift-jis') \
         .option('delimiter', '\t') \
         .load('sample-sjis.txt')

    df = df.withColumnRenamed('_c0', 'title')
    df.show()

    # Write the result out. The save mode is set with ``mode()``; it is not a
    # writer option, so the previous ``option("overwrite", "true")`` had no
    # effect and a rerun failed because the output already existed.
    df.select('title') \
      .coalesce(1) \
      .write \
      .format('com.databricks.spark.csv') \
      .mode('overwrite') \
      .save('sample-utf8.csv')
finally:
    # Always release the Spark context, even when a step above fails.
    sc.stop()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment