Skip to content

Instantly share code, notes, and snippets.

@roaramburu
Last active January 29, 2019 01:55
Show Gist options
  • Save roaramburu/e6f20d9d4037186d970a66f36b8f9a37 to your computer and use it in GitHub Desktop.
Save roaramburu/e6f20d9d4037186d970a66f36b8f9a37 to your computer and use it in GitHub Desktop.
# Step 1 - Import necessary Packages
import cudf as cudf
import pyblazing
from pyblazing import DriverType, FileSystemType, EncryptionType
from pyblazing import SchemaFrom
# Step 2 - Register a File System
def register_hdfs():
    """Register the ``tpch_hdfs`` HDFS file system with pyblazing.

    Prints a banner, calls ``pyblazing.register_file_system`` with the
    hard-coded connection settings below, and prints whatever that call
    returns. Nothing is returned to the caller.
    """
    print('*** Register a HDFS File System ***')
    # Connection settings handed to the driver; the Kerberos ticket is
    # passed as an empty string.
    connection_params = {
        "host": "127.0.0.1",
        "port": 54310,
        "user": "hadoop",
        "driverType": DriverType.LIBHDFS3,
        "kerberosTicket": "",
    }
    registration_result = pyblazing.register_file_system(
        authority="tpch_hdfs",
        type=FileSystemType.HDFS,
        root="/",
        params=connection_params,
    )
    print(registration_result)
register_hdfs()

# Step 3 - Define Schema & Register Table
# The same pipe-delimited file is used both to register the schema and as
# the table's data source, so its path is spelled out once.
nation_path = 'hdfs://tpch_hdfs/Data1Mb/nation_0_0.psv'
names = ['n_nationkey', 'n_name', 'n_regionkey', 'n_comment']
# NOTE(review): n_name and n_comment are declared as int64 here although
# their names suggest text columns -- confirm this is intended for the
# pyblazing version in use.
dtypes = ['int32', 'int64', 'int32', 'int64']
nation_schema = pyblazing.register_table_schema(
    table_name='nation',
    type=SchemaFrom.CsvFile,
    path=nation_path,
    delimiter='|',
    dtypes=dtypes,
    names=names,
)
# Map the registered schema to the list of files backing it.
table_data = {nation_schema: [nation_path]}

# Step 4 - Query w/ SQL
sql = 'select n_nationkey, n_regionkey + n_nationkey as addition from main.nation'
result_gdf = pyblazing.run_query_filesystem(sql, table_data)
# Echo the query text, then the result returned by the query call.
print(sql)
print(result_gdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment