Skip to content

Instantly share code, notes, and snippets.

@priancho
Created August 15, 2017 06:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save priancho/357022fbe63fae8b097a563e43dd885b to your computer and use it in GitHub Desktop.
Save priancho/357022fbe63fae8b097a563e43dd885b to your computer and use it in GitHub Desktop.
Using pyarrow's HdfsClient to read a file in HDFS from Python
# -*- coding: utf-8 -*-
import sys
import os
import subprocess
from subprocess import PIPE
from StringIO import StringIO
from gzip import GzipFile
from pyarrow import HdfsClient
def _capture_stdout(cmd):
    """Run *cmd* and return its standard output, right-stripped.

    The command's exit status is not checked; a failing command simply
    yields whatever it printed (possibly an empty string).
    """
    # universal_newlines=True gives a text string on Python 3 while staying
    # a plain str on Python 2, so the result can go straight into os.environ.
    proc = subprocess.Popen(cmd, stdout=PIPE, universal_newlines=True)
    # communicate() drains and closes the pipe and reaps the child, so no
    # zombie process or open file descriptor is left behind.
    out, _ = proc.communicate()
    return out.rstrip()


def set_env():
    """Prepare the environment variables used to load libhdfs.

    Sets ARROW_LIBHDFS_DIR to the directory of the first ``libhdfs.so``
    reported by ``locate`` and appends the output of
    ``/usr/bin/hdfs classpath --glob`` to CLASSPATH (creating CLASSPATH if
    it is not set). Each value that is set is echoed to stderr.

    If ``locate`` reports nothing, ARROW_LIBHDFS_DIR is left untouched
    (instead of being overwritten with an empty string) and a warning is
    written to stderr.

    Raises:
        OSError: if ``locate`` or ``/usr/bin/hdfs`` cannot be executed.
    """
    # libhdfs.so path ("-l 1" limits locate to a single result)
    libhdfsso_path = _capture_stdout(["locate", "-l", "1", "libhdfs.so"])
    if libhdfsso_path:
        os.environ["ARROW_LIBHDFS_DIR"] = os.path.dirname(libhdfsso_path)
        sys.stderr.write("Set ARROW_LIBHDFS_DIR: %s\n"
                         % (os.environ["ARROW_LIBHDFS_DIR"]))
    else:
        # dirname("") would be "", which would clobber any usable value
        # already present in the environment.
        sys.stderr.write("libhdfs.so not found by locate; "
                         "ARROW_LIBHDFS_DIR left unchanged\n")

    # classpath
    hadoop_cp = _capture_stdout(["/usr/bin/hdfs", "classpath", "--glob"])
    if "CLASSPATH" in os.environ:
        # Keep whatever the caller already had and append Hadoop's entries.
        os.environ["CLASSPATH"] = os.environ["CLASSPATH"] + ":" + hadoop_cp
    else:
        os.environ["CLASSPATH"] = hadoop_cp
    sys.stderr.write("Set CLASSPATH: %s\n"
                     % (os.environ["CLASSPATH"]))
if __name__ == "__main__":
    # Example: print every line of a gzip-compressed text file stored in HDFS.
    # Imported locally: the payload is binary (compressed) data, so a bytes
    # buffer is the right in-memory file object on both Python 2 and 3.
    from io import BytesIO

    set_env()
    hdfs = HdfsClient("namenode-hostname", 8020, user="myaccount")
    filepath = "/user/myaccount/test-data.txt.gz"

    f = hdfs.open(filepath)
    try:
        # We can't use GzipFile(fileobj=f) directly because
        # pyarrow.hdfs.NativeFile.seek() takes only ONE param whereas
        # gzip.GzipFile() requires fileobj's seek() method have TWO
        # params. So the whole file is read into an in-memory buffer first.
        sio = BytesIO(f.read())
    finally:
        # Release the HDFS handle as soon as the data has been copied.
        f.close()

    with GzipFile(fileobj=sio) as g:
        for line in g:
            print(line.rstrip())
    sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment