Created
January 6, 2020 16:14
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pyhive
pandas
sasl>=0.2.1
thrift>=0.10.0
git+https://github.com/cloudera/thrift_sasl  # Using master branch in order to get Python 3 SASL patches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import logging | |
import random | |
import sys | |
import uuid | |
from pyhive import hive | |
# Hive column types assigned at random to generated columns.  Includes
# nested/complex types (ARRAY, MAP, STRUCT, UNIONTYPE) — some with odd
# spacing/casing on purpose — to exercise downstream metadata parsers.
_DATA_TYPES = ['TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'FLOAT', 'DOUBLE', 'DECIMAL', 'TIMESTAMP', 'DATE',
               'STRING', 'BOOLEAN', 'BINARY', 'ARRAY<STRUCT< key:STRING, value:STRING>>',
               'ARRAY <STRING>', 'ARRAY <STRUCT <spouse: STRING, children: ARRAY <STRING>>>',
               'ARRAY<DOUBLE>', 'MAP<STRING,DOUBLE>', 'STRUCT < employer: STRING, id: BIGINT, address: STRING >',
               'UNIONTYPE<DOUBLE, STRING, ARRAY<string>, STRUCT<a:INT,b:string>>']
# Base names for generated columns; a random numeric suffix is appended
# by build_create_table_statement() to avoid duplicate column names.
_COLUMN_NAMES = ['name', 'address', 'city', 'state', 'date_time', 'paragraph', 'randomdata', 'person', 'credit_card',
                 'size', 'reason', 'school', 'food', 'location', 'house', 'price', 'cpf', 'cnpj', 'passport',
                 'security_number', 'phone_number', 'bank_account_number', 'ip_address', 'stocks']
# Texts used for the COMMENT clause of generated columns.
_DESCRIPTION_VALUES = ['This is a random generated column', 'Description for random generated column']
# Base names for generated tables; a uuid fragment is appended per table.
_TABLE_NAMES = ['school_info', 'personal_info', 'persons', 'employees', 'companies', 'store', 'home']
# Base names for generated databases; a random numeric suffix is appended.
_DATABASE_NAMES = ['school_warehouse', 'company_warehouse', 'on_prem_warehouse', 'factory_warehouse',
                   'organization_warehouse']
def get_hive_conn(connection_args):
    """Open and return a PyHive connection described by *connection_args*.

    *connection_args* must provide the 'host', 'port', 'user' and
    'database' keys; authentication is disabled (auth=None).
    """
    params = {
        'host': connection_args['host'],
        'port': connection_args['port'],
        'username': connection_args['user'],
        'database': connection_args['database'],
        'auth': None,
    }
    return hive.connect(**params)
def create_random_hive_data(connection_args, database_count=4, tables_per_database=250):
    """Populate a Hive server with randomly generated databases and tables.

    Creates *database_count* databases, each filled with
    *tables_per_database* randomly named/typed tables, then prints the
    server's database list.  The defaults reproduce the original
    hard-coded volume (4 databases x 250 tables).

    Args:
        connection_args: dict with 'host', 'port', 'user' and 'database'
            keys, as consumed by get_hive_conn().
        database_count: number of databases to create.
        tables_per_database: number of tables created in each database.
    """
    conn = get_hive_conn(connection_args)
    try:
        cursor = conn.cursor()
        try:
            for _ in range(database_count):
                database_name, database_stmt = build_create_database_statement()
                print('\n' + database_stmt)
                cursor.execute(database_stmt)
                cursor.execute(build_use_database_statement(database_name))
                for _ in range(tables_per_database):
                    table_stmt = build_create_table_statement()
                    cursor.execute(table_stmt)
                    print('\n' + table_stmt)
            cursor.execute('show databases')
            databases = cursor.fetchall()
            print(databases)
        finally:
            # Close the cursor even if a statement fails mid-run.
            cursor.close()
    finally:
        # The original leaked the connection: only the cursor was closed.
        conn.close()
def get_random_data_type():
    """Return one Hive column type picked at random from _DATA_TYPES."""
    index = random.randrange(len(_DATA_TYPES))
    return _DATA_TYPES[index]
def get_random_databases_name():
    """Return a randomly selected database base name from _DATABASE_NAMES."""
    index = random.randrange(len(_DATABASE_NAMES))
    return _DATABASE_NAMES[index]
def get_random_column_name():
    """Return a randomly selected column base name from _COLUMN_NAMES."""
    index = random.randrange(len(_COLUMN_NAMES))
    return _COLUMN_NAMES[index]
def get_random_column_description():
    """Return a randomly selected column COMMENT text from _DESCRIPTION_VALUES."""
    index = random.randrange(len(_DESCRIPTION_VALUES))
    return _DESCRIPTION_VALUES[index]
def get_random_table_name():
    """Return a randomly selected table base name from _TABLE_NAMES."""
    index = random.randrange(len(_TABLE_NAMES))
    return _TABLE_NAMES[index]
def build_create_database_statement():
    """Build a CREATE DATABASE statement with a randomized name.

    Returns:
        A ``(database_name, statement)`` tuple.  The name is a random
        base name suffixed with a random integer to keep it unique-ish;
        the statement keeps the original trailing space.
    """
    suffix = str(random.randint(1, 100000))
    database_name = get_random_databases_name() + suffix
    database_stmt = 'CREATE DATABASE ' + database_name + ' '
    return database_name, database_stmt
def build_use_database_statement(database_name):
    """Return a USE statement switching to *database_name*.

    Keeps the original trailing space after the database name.
    """
    return 'USE %s ' % database_name
def build_create_table_statement():
    """Build a CREATE TABLE statement with a random name and random columns.

    The table name is a random base name plus an 8-character uuid
    fragment.  The first column never carries a COMMENT clause; each of
    the 1-100 following columns gets a random COMMENT (double-quoted,
    as in the original).  Column names get a random numeric suffix.
    """
    table_name = '{}{}'.format(get_random_table_name(), uuid.uuid4().hex[:8])
    # First column: "<name><suffix> <TYPE>", no COMMENT.
    columns = ['{}{} {}'.format(get_random_column_name(),
                                random.randint(1, 100000),
                                get_random_data_type())]
    for _ in range(random.randint(1, 100)):
        columns.append('{}{} {} COMMENT "{}"'.format(
            get_random_column_name(),
            random.randint(1, 100000),
            get_random_data_type(),
            get_random_column_description()))
    # Join with the original ' , ' separator and paren spacing.
    return 'CREATE TABLE {} ( {} )'.format(table_name, ' , '.join(columns))
def parse_args():
    """Parse command line arguments for the random-metadata generator.

    Returns:
        argparse.Namespace with ``hive_host`` (required), ``hive_user``,
        ``hive_database`` (both optional, default None) and ``hive_port``
        (int, default 10000).
    """
    parser = argparse.ArgumentParser(
        description='Command line generate random metadata into a Hive server')
    parser.add_argument(
        '--hive-host',
        help='Your Hive server host',
        required=True)
    parser.add_argument('--hive-user',
                        help='Your Hive server user')
    parser.add_argument('--hive-database',
                        help='Your Hive server database name')
    parser.add_argument('--hive-port',
                        help='Your Hive server port',
                        type=int,
                        # HiveServer2's standard thrift port is 10000; the
                        # previous default of 1000 looked like a typo.
                        default=10000)
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()
    # Verbose logging so Hive/Thrift traffic is visible on stdout.
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    connection_args = {
        'host': args.hive_host,
        'port': args.hive_port,
        'user': args.hive_user,
        'database': args.hive_database,
    }
    create_random_hive_data(connection_args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment