pyscript_cdh

Python scripts for CDH

hdfs

pip install hdfs

from hdfs import InsecureClient
hdfs_client = InsecureClient('http://192.168.0.251:50070', user='hive')
# List files
hdfs_client.list('.')
# Create directories
hdfs_client.makedirs('/a/b/c')
# Append to an already existing file by setting the append parameter
with hdfs_client.write('test_liulin.txt', append=True) as writer:
    writer.write('You succeed!')
# Read line by line by setting a delimiter; must be used together with the encoding parameter
with hdfs_client.read('test.txt', encoding='utf-8', delimiter='\n') as reader:
    for line in reader:
        print(line)
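
The same client can also copy whole files between HDFS and the local filesystem. A minimal sketch; the paths below are placeholders for illustration:

# Upload a local file to HDFS (hdfs_path first, then local_path)
hdfs_client.upload('/a/b/c/data.csv', '/tmp/data.csv', overwrite=True)
# Download a file from HDFS back to the local filesystem
hdfs_client.download('/a/b/c/data.csv', '/tmp/data_copy.csv', overwrite=True)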

Reference links:
https://my.oschina.net/wolfoxliu/blog/862015
https://hdfscli.readthedocs.io/en/latest/quickstart.html

hive

from pyhive import hive
cursor = hive.connect('192.168.0.251').cursor()  # connects to HiveServer2 on the default port 10000
cursor.execute('show databases')
print(cursor.fetchall())
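
hive.connect also accepts explicit port, username and database arguments. A minimal sketch; the values and the table name are assumptions, adjust them to your cluster:

from pyhive import hive
conn = hive.connect(host='192.168.0.251', port=10000,
                    username='hive', database='default')
cursor = conn.cursor()
cursor.execute('SELECT * FROM my_table LIMIT 10')  # my_table is a placeholder
for row in cursor.fetchall():
    print(row)
cursor.close()
conn.close()
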
Reference link:
https://github.com/dropbox/PyHive

spark

>>> import os
>>> import tempfile
>>> tempdir = tempfile.mkdtemp()
>>> from pyspark import SparkFiles
>>> path = os.path.join(tempdir, "test.txt")
>>> with open(path, "w") as testFile:
...    _ = testFile.write("100")
>>> sc.addFile(path)
>>> def func(iterator):
...    with open(SparkFiles.get("test.txt")) as testFile:
...        fileVal = int(testFile.readline())
...        return [x * fileVal for x in iterator]
>>> sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect()
[100, 200, 300, 400]
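
The doctest above assumes an already-running SparkContext named sc, as in the pyspark shell. A minimal sketch of creating one yourself; the app name and master URL are placeholders:

from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName('cdh-demo').setMaster('local[*]')  # placeholders
sc = SparkContext(conf=conf)
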
Reference link:
http://spark.apache.org/docs/2.1.0/api/python/pyspark.html

hbase

# Start the HBase Thrift server first
nohup hbase thrift start &
import happybase
connection = happybase.Connection('192.168.0.251')

table = connection.table('table-name')

table.put(b'row-key', {b'family:qual1': b'value1',
                       b'family:qual2': b'value2'})

row = table.row(b'row-key')
print(row[b'family:qual1'])  # prints 'value1'

for key, data in table.rows([b'row-key-1', b'row-key-2']):
    print(key, data)  # prints row key and data for each row

for key, data in table.scan(row_prefix=b'row'):
    print(key, data)  # prints every row whose key starts with 'row'

table.delete(b'row-key')  # delete() returns None
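
For many mutations, happybase can buffer puts and deletes and flush them to the Thrift server in batches. A minimal sketch; the row keys and column names are placeholders:

with table.batch(batch_size=1000) as b:
    b.put(b'row-key-1', {b'family:qual1': b'value1'})
    b.put(b'row-key-2', {b'family:qual1': b'value2'})
    b.delete(b'row-key-3')
# leaving the with-block flushes any remaining buffered mutations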
Reference link:
https://happybase.readthedocs.io/en/latest/index.html

impala

from impala.dbapi import connect
conn = connect(host='my.host.com', port=21050)
cursor = conn.cursor()
cursor.execute('SELECT * FROM mytable LIMIT 100')
print(cursor.description)  # prints the result set's schema
results = cursor.fetchall()
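
impyla can also hand the result set to pandas via impala.util.as_pandas. A minimal sketch, assuming pandas is installed:

from impala.util import as_pandas
cursor.execute('SELECT * FROM mytable LIMIT 100')
df = as_pandas(cursor)  # fetches the remaining rows into a pandas DataFrame
print(df.head())
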
Reference link:
https://github.com/cloudera/impyla