Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import argparse
import random
from datetime import date, timedelta
from random import shuffle
# Command-line options.
# Every "--no-*" flag is an integer count; argparse exposes each one as an
# attribute with the hyphens turned into underscores (e.g. p.no_sales_quantity).
_COUNT_OPTIONS = (
    # (flag, default, help text)
    ('--no-file', 1, 'number of files. default is 1.'),
    ('--no-line', 10000, 'number of lines. default is 10000.'),
    ('--no-date', 365,
     'number of dates start from 2015-01-01 in YYYY-MM-DD format. default is 365.'),
    ('--no-product', 100, 'no-product: number of products. default is 100.'),
    ('--no-product-category', 3, 'number of product categories. default is 3.'),
    ('--no-store', 10, 'number of stores. default is 10.'),
    ('--no-city', 3, 'number of cities of stores. default is 3.'),
    ('--no-sales_quantity', 107, 'number of sales quantities. default is 107.'),
    # None means "not given"; the entry point replaces it with no_product * 2.
    ('--no-unit_price', None, 'number of unit prices. default is (no_product) * 2.'),
)

parser = argparse.ArgumentParser(description='retail data generator')
for _flag, _default, _help in _COUNT_OPTIONS:
    parser.add_argument(_flag, type=int, default=_default, help=_help)
# The only boolean switch: asks for a reproducible (fixed-seed) data set.
parser.add_argument(
    '--always-same-result',
    action='store_true',
    help='if you need always same result, please set this option. random data is generated by default.',
)
def generate_rand_tables(p):
    """Build the shuffled id tables used to pick values for sales rows.

    Each table is the list ``1..count`` in shuffled order, where ``count``
    is the matching ``p.no_*`` option.

    Args:
        p: parsed options. Reads ``always_same_result`` and the counts
            ``no_sales_quantity``, ``no_unit_price``, ``no_date``,
            ``no_product``, ``no_product_category``, ``no_store``, ``no_city``.

    Returns:
        dict mapping table name (``'sales_quantities'``, ``'unit_prices'``,
        ``'dates'``, ``'products'``, ``'product_categories'``, ``'stores'``,
        ``'cities'``) to its shuffled list of ints.
    """
    # A fixed seed makes the output reproducible; seeding with None is what
    # a bare random.seed() does (seed from system entropy / time).
    random.seed(0 if p.always_same_result is True else None)

    # (table name, number of ids). The order matters: every shuffle consumes
    # RNG state, so reordering would change the reproducible output.
    table_sizes = (
        ('sales_quantities', p.no_sales_quantity),
        ('unit_prices', p.no_unit_price),
        ('dates', p.no_date),
        ('products', p.no_product),
        ('product_categories', p.no_product_category),
        ('stores', p.no_store),
        ('cities', p.no_city),
    )

    rand_tables = {}
    for table_name, size in table_sizes:
        ids = list(range(1, size + 1))
        shuffle(ids)
        rand_tables[table_name] = ids
    return rand_tables
def gen_date_dim(p):
    """Write the date dimension to ``dates.csv`` in the current directory.

    Emits one row per day in the form ``<date_id>,<YYYY-MM-DD>``. Ids start
    at 1 and the first day is 2015-01-01; days are consecutive.

    Args:
        p: parsed options; only ``p.no_date`` (number of days) is read.
    """
    first_day = date(2015, 1, 1)
    # The context manager closes the file even when a write fails part-way,
    # which a trailing f.close() would not.
    with open("dates.csv", "w") as f:
        for offset in range(p.no_date):
            day = first_day + timedelta(days=offset)
            f.write("{0},{1}\n".format(offset + 1, day.isoformat()))
def gen_product_dim(p, rand_tables):
    """Write the product dimension to ``products.csv`` in the current directory.

    Emits one row per product in the form
    ``<product_id>,item<product_id>,category<n>``, where
    ``n = product_id % p.no_product_category`` (so ``n`` ranges over
    ``0 .. no_product_category - 1``).

    Args:
        p: parsed options; reads ``p.no_product`` and
            ``p.no_product_category``.
        rand_tables: not read by this function.
    """
    # The context manager closes the file even when a write (or the modulo,
    # for a zero category count) raises.
    with open("products.csv", "w") as f:
        for product_id in range(1, p.no_product + 1):
            category_no = product_id % p.no_product_category
            f.write("{0},item{0},category{1}\n".format(product_id, category_no))
def gen_store_dim(p, rand_tables):
    """Write the store dimension to ``stores.csv`` in the current directory.

    Emits one row per store in the form
    ``<store_id>,store<store_id>,city<n>``, where
    ``n = store_id % p.no_city`` (so ``n`` ranges over ``0 .. no_city - 1``).

    Args:
        p: parsed options; reads ``p.no_store`` and ``p.no_city``.
        rand_tables: not read by this function.
    """
    # The context manager closes the file even when a write (or the modulo,
    # for a zero city count) raises.
    with open("stores.csv", "w") as f:
        for store_id in range(1, p.no_store + 1):
            city_no = store_id % p.no_city
            f.write("{0},store{0},city{1}\n".format(store_id, city_no))
def gen_sales(p, rand_tables):
    """Write all sales files, one per file number from 1 to ``p.no_file``.

    Args:
        p: parsed options; ``p.no_file`` is the number of files to write.
        rand_tables: lookup tables, passed through to the per-file writer.
    """
    for index in range(p.no_file):
        # File numbers are 1-based.
        gen_sales_per_file(p, rand_tables, index + 1)
def gen_sales_per_file(p, rand_tables, file_no):
    """Write one sales fact file, ``sales<file_no>.csv``, in the current directory.

    The file holds ``p.no_line`` rows. Rows are driven by a running line
    number that goes from ``p.no_line * file_no + 1`` to
    ``p.no_line * (file_no + 1)`` inclusive; each column value is taken from
    its table at index ``line % <table size option>``.

    Row format:
    ``date_id,product_id,store_id,sales_quantity,unit_price,sales_amount``
    with ``sales_amount = sales_quantity * unit_price``.

    Args:
        p: parsed options; reads ``no_line``, ``no_date``, ``no_product``,
            ``no_store``, ``no_sales_quantity`` and ``no_unit_price``.
        rand_tables: dict with the lists ``'dates'``, ``'products'``,
            ``'stores'``, ``'sales_quantities'`` and ``'unit_prices'``.
        file_no: number used both in the file name and to offset the line
            numbers.
    """
    start_line = (p.no_line * file_no) + 1
    end_line = p.no_line * (file_no + 1)

    # Loop-invariant lookups, resolved once instead of once per row.
    dates = rand_tables['dates']
    products = rand_tables['products']
    stores = rand_tables['stores']
    sales_quantities = rand_tables['sales_quantities']
    unit_prices = rand_tables['unit_prices']

    # The context manager closes the file even when building a row raises.
    with open("sales{0}.csv".format(file_no), "w") as f:
        for line in range(start_line, end_line + 1):
            sales_quantity = sales_quantities[line % p.no_sales_quantity]
            unit_price = unit_prices[line % p.no_unit_price]
            fields = (
                dates[line % p.no_date],
                products[line % p.no_product],
                stores[line % p.no_store],
                sales_quantity,
                unit_price,
                sales_quantity * unit_price,
            )
            f.write(','.join(map(str, fields)) + "\n")
if __name__ == '__main__':
    # Script entry point: parse options, then write every CSV file into the
    # current directory.
    p = parser.parse_args()

    # When --no-unit_price is not given, use twice the product count.
    p.no_unit_price = p.no_product * 2 if p.no_unit_price is None else p.no_unit_price

    rand_tables = generate_rand_tables(p)

    # Dimension files first, then the sales fact files.
    gen_date_dim(p)
    gen_product_dim(p, rand_tables)
    gen_store_dim(p, rand_tables)
    gen_sales(p, rand_tables)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.