Skip to content

Instantly share code, notes, and snippets.

@shiumachi
Created November 28, 2018 06:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shiumachi/34cb6745e446a0a38ebc703ea9a44d74 to your computer and use it in GitHub Desktop.
Save shiumachi/34cb6745e446a0a38ebc703ea9a44d74 to your computer and use it in GitHub Desktop.
data generator for Hive / Impala demo
import argparse
import random
# Usage line handed to argparse; %(prog)s is expanded to the parser's prog name.
usage = """\
%(prog)s [options]
"""


def init_parser():
    """Build the command-line parser for the data generator.

    Options:
        -f / --filenum    int, default 0. Number of the output file.
        -l / --linenum    int, default 10. Number of lines to generate.
        -t / --type       str, default 'A'. Table type to generate.
        -r / --randomize  flag. Randomize the number of lines.

    Returns:
        The configured argparse.ArgumentParser (nothing is parsed here).
    """
    parser = argparse.ArgumentParser(prog='datagen.py', usage=usage)
    parser.add_argument(
        "-f", "--filenum", type=int, default=0,
        help="No. of file. default:0. file will be written to "
             "/tmp/[filenum].txt (filenum zero-padded to 3 digits, "
             "e.g. /tmp/000.txt)")
    parser.add_argument(
        "-l", "--linenum", type=int, default=10,
        help="No. of line. default:10")
    # No `choices=` on purpose: the caller decides how to treat unknown
    # table types, so any string is still accepted here.
    parser.add_argument(
        "-t", "--type", default='A',
        help="table type: A, B or C. default:A")
    parser.add_argument(
        "-r", "--randomize", action='store_true',
        help="randomize linenum")
    return parser
def get_site_id(num):
    """Return the site id for *num*: the cube of num reduced modulo 73."""
    cubed = num ** 3
    return cubed % 73
def get_domain(num, maxnum=2887):
    """Return a domain string derived from *num*.

    The cube of num is reduced modulo *maxnum*, and the result picks the
    domain:
        below 300   -> "" (empty string)
        300 to 499  -> "mypage.com"
        500 and up  -> "<value>.com", value zero-padded to 3 digits
    """
    bucket = (num ** 3) % maxnum
    if bucket >= 500:
        return "%03d.com" % bucket
    if bucket >= 300:
        return "mypage.com"
    return ""
def generate_func_A(num):
    """Build one comma-separated row for table type A.

    Columns, in output order:
        ri_page_url      STRING  '/NNN.html', NNN = num cubed modulo 757
        ri_refferer_url  STRING  result of get_domain(num)
        ri_site_id       INT     result of get_site_id(num)
    """
    page_path = '/%03d.html' % (num ** 3 % 757)
    referrer = '%s' % get_domain(num)
    fields = (page_path, referrer, get_site_id(num))
    return "%s,%s,%d" % fields
def generate_func_B(num):
    """Build one comma-separated row for table type B.

    Columns, in output order:
        su_site_id  INT     num itself
        su_url      STRING  "mypage.com" when num is 1, otherwise
                            "<num - 1>.com" zero-padded to 3 digits
    """
    site_url = "mypage.com" if num == 1 else '%03d.com' % (num - 1)
    return "%d,%s" % (num, site_url)
def generate_func_C(num):
    """Build one comma-separated row for table type C.

    Columns, in output order:
        id      INT     num itself
        org_id  INT     num cubed modulo 757
        name    STRING  "user" followed by num
    """
    org = (num ** 3) % 757
    user_name = "user{0}".format(num)
    row = (num, org, user_name)
    return "{0},{1},{2}".format(*row)
# Dispatch table: table-type letter -> function that renders one row for it.
generate_func_dict = {
    "A": generate_func_A,
    "B": generate_func_B,
    "C": generate_func_C,
}
def main(argv=None):
    """Generate one demo data file under /tmp.

    Parses the command line, picks the row generator for the requested
    table type and writes one generated row per line to
    /tmp/<filenum zero-padded to 3 digits>.txt.

    Args:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from sys.argv as before.
    """
    args = init_parser().parse_args(argv)

    # Row numbers for file N start at N * 10**9, so different files get
    # non-overlapping numbers as long as linenum stays below 10**9.
    start = args.filenum * (10 ** 9)

    linenum = args.linenum
    if args.randomize:
        # random.random() is in [0.0, 1.0), so the result is in [0, linenum).
        linenum = int(linenum * random.random())

    # Unknown table types deliberately fall back to 'A' instead of failing.
    generate_func = generate_func_dict.get(args.type, generate_func_dict['A'])

    with open('/tmp/%03d.txt' % args.filenum, 'w') as f:
        # range() instead of xrange(): xrange does not exist on Python 3.
        # (On Python 2 this builds the list of row numbers up front.)
        f.writelines(generate_func(i) + '\n'
                     for i in range(start, start + linenum))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment