# roaramburu/register_hdfs.py

## register_hdfs.py
# Step 1 - Import necessary Packages
import cudf as cudf
import pyblazing
from pyblazing import DriverType, FileSystemType, EncryptionType
from pyblazing import SchemaFrom

# Step 2 - Register a File System
def register_hdfs(host="127.0.0.1", port=54310, user="hadoop"):
    """Register the ``tpch_hdfs`` HDFS file system with pyblazing.

    Called with no arguments, this registers exactly the connection the
    script has always used (127.0.0.1:54310, user ``hadoop``). The
    parameters let the same helper target another cluster.

    Args:
        host: Value sent as the ``"host"`` connection parameter.
        port: Value sent as the ``"port"`` connection parameter.
        user: Value sent as the ``"user"`` connection parameter.

    Returns:
        Whatever ``pyblazing.register_file_system`` returns; the same value
        is also printed.
    """
    print('*** Register a HDFS File System ***')
    fs_status = pyblazing.register_file_system(
        # The authority is the name used in the hdfs://tpch_hdfs/... paths.
        authority="tpch_hdfs",
        type=FileSystemType.HDFS,
        root="/",
        params={
            "host": host,
            "port": port,
            "user": user,
            "driverType": DriverType.LIBHDFS3,
            "kerberosTicket": "",
        },
    )
    print(fs_status)
    # Hand the status back so callers can inspect it instead of parsing stdout.
    return fs_status

# The file system is registered before any hdfs://tpch_hdfs/ path is used
# (presumably required for those paths to resolve - confirm against pyblazing).
register_hdfs()

# Step 3 - Define Schema & Register Table
# Column names and dtypes are parallel lists: entry i of each describes column i.
names = ['n_nationkey', 'n_name', 'n_regionkey', 'n_comment']
dtypes = ['int32', 'int64', 'int32', 'int64']
nation_schema = pyblazing.register_table_schema(
    table_name='nation',
    type=SchemaFrom.CsvFile,
    path='hdfs://tpch_hdfs/Data1Mb/nation_0_0.psv',
    delimiter='|',
    dtypes=dtypes,
    names=names,
)
# Map the registered schema to the list of files that back the table.
table_data = {nation_schema: ['hdfs://tpch_hdfs/Data1Mb/nation_0_0.psv']}

# Step 4 - Query w/ SQL
sql = 'select n_nationkey, n_regionkey + n_nationkey as addition from main.nation'

# Run the query first, then echo the statement followed by its result.
result_gdf = pyblazing.run_query_filesystem(sql, table_data)
print(sql)
print(result_gdf)
	# Step 1 - Import necessary Packages
# NOTE(review): this section repeats the script earlier in the file; its
# block structure and the '|' delimiter were restored after being lost in a
# flattened paste. Confirm whether the duplicate is meant to stay.
import cudf as cudf
import pyblazing
from pyblazing import DriverType, FileSystemType, EncryptionType
from pyblazing import SchemaFrom


# Step 2 - Register a File System
def register_hdfs():
    """Register the ``tpch_hdfs`` HDFS file system and print the status."""
    print('* Register a HDFS File System *')
    fs_status = pyblazing.register_file_system(
        authority="tpch_hdfs",
        type=FileSystemType.HDFS,
        root="/",
        params={
            "host": "127.0.0.1",
            "port": 54310,
            "user": "hadoop",
            "driverType": DriverType.LIBHDFS3,
            "kerberosTicket": ""
        }
    )
    print(fs_status)


register_hdfs()

# Step 3 - Define Schema & Register Table
# Column names and dtypes are parallel lists: entry i of each describes column i.
names = ['n_nationkey', 'n_name', 'n_regionkey', 'n_comment']
dtypes = ['int32', 'int64', 'int32', 'int64']
# The delimiter is a single pipe character; the escaped form '\|' was a
# two-character string and did not match the .psv separator used above.
nation_schema = pyblazing.register_table_schema(table_name='nation', type=SchemaFrom.CsvFile, path='hdfs://tpch_hdfs/Data1Mb/nation_0_0.psv', delimiter='|', dtypes=dtypes, names=names)
table_data = {
    nation_schema: ['hdfs://tpch_hdfs/Data1Mb/nation_0_0.psv']
}

# Step 4 - Query w/ SQL
sql = 'select n_nationkey, n_regionkey + n_nationkey as addition from main.nation'

result_gdf = pyblazing.run_query_filesystem(sql, table_data)
print(sql)
print(result_gdf)