Skip to content

Instantly share code, notes, and snippets.

@mesmacosta
Created January 6, 2020 16:14
pyhive
pandas
sasl>=0.2.1
thrift>=0.10.0
git+https://github.com/cloudera/thrift_sasl # Using master branch in order to get Python 3 SASL patches
import argparse
import logging
import random
import sys
import uuid
from pyhive import hive
_DATA_TYPES = ['TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'FLOAT', 'DOUBLE', 'DECIMAL', 'TIMESTAMP', 'DATE',
'STRING', 'BOOLEAN', 'BINARY', 'ARRAY<STRUCT< key:STRING, value:STRING>>',
'ARRAY <STRING>', 'ARRAY <STRUCT <spouse: STRING, children: ARRAY <STRING>>>',
'ARRAY<DOUBLE>', 'MAP<STRING,DOUBLE>', 'STRUCT < employer: STRING, id: BIGINT, address: STRING >',
'UNIONTYPE<DOUBLE, STRING, ARRAY<string>, STRUCT<a:INT,b:string>>']
_COLUMN_NAMES = ['name', 'address', 'city', 'state', 'date_time', 'paragraph', 'randomdata', 'person', 'credit_card',
'size', 'reason', 'school', 'food', 'location', 'house', 'price', 'cpf', 'cnpj', 'passport',
'security_number', 'phone_number', 'bank_account_number', 'ip_address', 'stocks']
_DESCRIPTION_VALUES = ['This is a random generated column', 'Description for random generated column']
_TABLE_NAMES = ['school_info', 'personal_info', 'persons', 'employees', 'companies', 'store', 'home']
_DATABASE_NAMES = ['school_warehouse', 'company_warehouse', 'on_prem_warehouse', 'factory_warehouse',
'organization_warehouse']
def get_hive_conn(connection_args):
return hive.connect(host=connection_args['host'],
port=connection_args['port'],
username=connection_args['user'],
database=connection_args['database'],
auth=None)
def create_random_hive_data(connection_args):
conn = get_hive_conn(connection_args)
cursor = conn.cursor()
for x in range(4):
database_name, database_stmt = build_create_database_statement()
print('\n' + database_stmt)
cursor.execute(database_stmt)
cursor.execute(build_use_database_statement(database_name))
for y in range(250):
table_stmt = build_create_table_statement()
cursor.execute(table_stmt)
print('\n' + table_stmt)
cursor.execute('show databases')
databases = cursor.fetchall()
print(databases)
cursor.close()
def get_random_data_type():
return random.choice(_DATA_TYPES)
def get_random_databases_name():
return random.choice(_DATABASE_NAMES)
def get_random_column_name():
return random.choice(_COLUMN_NAMES)
def get_random_column_description():
return random.choice(_DESCRIPTION_VALUES)
def get_random_table_name():
return random.choice(_TABLE_NAMES)
def build_create_database_statement():
database_name = '{}{}'.format(get_random_databases_name(),
str(random.randint(1, 100000)))
database_stmt = 'CREATE DATABASE {} '.format(database_name)
return database_name, database_stmt
def build_use_database_statement(database_name):
return 'USE {} '.format(database_name)
def build_create_table_statement():
table_stmt = 'CREATE TABLE {}{} ( '.format(
get_random_table_name(),
uuid.uuid4().hex[:8]
)
table_stmt = '{}{}{} {}'.format(
table_stmt,
get_random_column_name(),
str(random.randint(1, 100000)),
get_random_data_type()
)
for x in range(random.randint(1, 100)):
table_stmt += ' , {}{}'.format(get_random_column_name(), str(random.randint(1, 100000))) + \
' {}'.format(get_random_data_type()) + \
' COMMENT "{}"'.format(get_random_column_description())
table_stmt = '{} )'.format(table_stmt)
return table_stmt
def parse_args():
parser = argparse.ArgumentParser(
description='Command line generate random metadata into a Hive server')
parser.add_argument(
'--hive-host',
help='Your Hive server host',
required=True)
parser.add_argument('--hive-user',
help='Your Hive server user')
parser.add_argument('--hive-database',
help='Your Hive server database name')
parser.add_argument('--hive-port',
help='Your Hive server port',
type=int,
default=1000)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# Enable logging
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
create_random_hive_data({
'database': args.hive_database,
'host': args.hive_host,
'user': args.hive_user,
'port': args.hive_port
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment