Ilya Katsov ikatsov

## drl-supply-chain-gym.py
class SimpleSupplyChain(gym.Env):
    """``gym.Env`` subclass that wraps a ``SupplyChainEnvironment`` instance.

    Only construction and ``reset`` are part of this snippet.
    """

    def __init__(self, config):
        """Start a first episode and declare the action/observation spaces.

        ``config`` is accepted but not read by this constructor.
        """
        # reset() runs first, so ``supply_chain`` and ``state`` already exist
        # by the time the spaces are declared.
        self.reset()

        # Continuous action space bounded to [0, 20].
        # NOTE(review): neither Box is given a ``shape`` -- confirm that the
        # installed gym version accepts scalar bounds without one.
        self.action_space = Box(low=0.0, high=20.0)
        self.observation_space = Box(low=-10000, high=10000)

    def reset(self):
        """Create a new simulator, store its initial state, return it as an array."""
        simulator = SupplyChainEnvironment()
        self.supply_chain = simulator
        self.state = simulator.initial_state()
        return self.state.to_array()

## drl-supply-chain-sq-results.py
Optimized policy parameters:
Factory (s, Q) = (0, 20)
Warehouse 1 (s, Q) = (5, 5)
Warehouse 2 (s, Q) = (5, 5)
Warehouse 3 (s, Q) = (5, 10)

Achieved profit: 6871.0

## drl-supply-chain-sq-optimization.py
from ax import optimize

def evaluate_sQPolicy(p):
    """Score one (s, Q) parameterization by simulation.

    ``p`` is a flat mapping holding ``factory_s``/``factory_Q`` and, for each
    of the three warehouses, ``wN_s``/``wN_Q``.  The values are packed into an
    ``SQPolicy`` and the mean of what ``simulate`` returns over 30 episodes is
    returned.  ``env`` is a module-level name that is not part of this snippet.
    """
    factory_s = p['factory_s']
    factory_Q = p['factory_Q']

    # Per-warehouse parameters, in warehouse order 1..3.
    warehouse_s = [p['w1_s'], p['w2_s'], p['w3_s']]
    warehouse_Q = [p['w1_Q'], p['w2_Q'], p['w3_Q']]

    policy = SQPolicy(factory_s, factory_Q, warehouse_s, warehouse_Q)
    episode_results = simulate(env, policy, num_episodes = 30)
    return np.mean(episode_results)

## drl-supply-chain-sq-sim.py
class SQPolicy(object):
    """Holder for the parameters of an (s, Q) replenishment policy.

    The factory has a single safety-stock / reorder-amount pair;
    ``safety_stock`` and ``reorder_amount`` are stored as passed (the
    optimization snippet supplies one list entry per warehouse).
    """

    def __init__(self, factory_safety_stock,
                 factory_reorder_amount, safety_stock, reorder_amount):
        """Store the four policy parameters unchanged on the instance."""
        # Factory-level pair.
        self.factory_safety_stock, self.factory_reorder_amount = (
            factory_safety_stock, factory_reorder_amount)
        # Warehouse-level collections (kept by reference, not copied).
        self.safety_stock, self.reorder_amount = safety_stock, reorder_amount

    def select_action(self, state):
        """Begin building the action for ``state``.

        Only the creation of the ``Action`` object is present in this snippet.
        """
        action = Action(state.warehouse_num)

## drl-supply-chain-env-step.py
class SupplyChainEnvironment(object):
    ...
    def step(self, state, action):
        """Compute the demand and the profit components for one time step.

        Only the first part of the method is present in this snippet: it
        draws the per-warehouse demands and computes revenue, production cost
        and storage cost.  Nothing is returned in the visible part.

        Reads ``action.production_level`` and ``state.stock_levels()``.
        """
        # np.fromfunction calls the lambda ONCE with the whole index array
        # (floats 0..warehouse_num-1), so demand() receives an array ``j``.
        # NOTE(review): as a consequence the single np.random.randint draw
        # inside demand() is shared by all warehouses in a step -- confirm
        # that this is intended rather than independent noise per warehouse.
        demands = np.fromfunction(lambda j: self.demand(j+1, self.t), (self.warehouse_num,))

        # Calculating the reward (profit)
        # Revenue is priced on the full demand vector (no cap by stock here).
        total_revenue = self.unit_price * np.sum(demands)
        total_production_cost = self.unit_cost * action.production_level
        # Stock levels are clipped at zero before weighting, so negative
        # levels add no storage cost.  The vector has warehouse_num + 1
        # entries -- presumably the factory plus each warehouse; verify
        # against State.stock_levels().
        total_storage_cost = np.dot( self.storage_costs,
                np.maximum(state.stock_levels(), np.zeros(self.warehouse_num + 1)) )

## drl-supply-chain-env-demand-function.py
class SupplyChainEnvironment(object):
    ...
    def demand(self, j, t):             # Demand at warehouse j at time t
        return np.round(self.d_max/2 +
                        self.d_max/2*np.sin(2*np.pi*(t + 2*j)/self.T*2) +
                        np.random.randint(0, self.d_var))

## drl-supply-chain-env-initialization.py
class SupplyChainEnvironment(object):
    """Supply-chain simulator; this snippet shows part of its constructor."""

    def __init__(self):
        """Set the episode, demand and pricing constants.

        Other attributes read by the methods of this class (for example
        ``storage_costs`` and ``t``) are not set in the visible part.
        """
        self.T = 26               # Episode duration; also the divisor in demand()'s sine phase
        self.warehouse_num = 3
        self.d_max = 5            # Peak of the seasonal demand term, units (noise is added on top)
        self.d_var = 2            # Exclusive upper bound of the integer noise drawn in demand(), units

        self.unit_price = 100     # Unit price in dollars
        self.unit_cost = 40       # Unit cost in dollars

## drl-supply-chain-env-sa.py
class State(object):
    def __init__(self, warehouse_num, T, demand_history, t = 0):
        """Create a state with empty stocks at time step ``t``.

        ``warehouse_num``, ``T``, ``demand_history`` and ``t`` are stored as
        given; factory and warehouse stocks always start at zero.
        """
        self.warehouse_num = warehouse_num
        self.factory_stock = 0
        # One zero entry per warehouse.
        self.warehouse_stock = np.repeat(0, warehouse_num)
        self.demand_history = demand_history
        self.T = T   # Length of one episode
        self.t = t   # Current time step (defaults to 0)

    def to_array(self):

## drl-supply-chain-hilo-dqn-rllib.py
import ray
import ray.rllib.agents.dqn as dqn

def train_dqn():
  """Build a DQN trainer for ``HiLoPricingEnv`` from RLlib's default config.

  Starts from a copy of ``dqn.DEFAULT_CONFIG``, applies the overrides below
  and constructs ``dqn.DQNTrainer``.  Only the setup is present in this
  snippet; nothing is trained or returned in the visible part.
  """
  overrides = {
    "log_level": "WARN",
    "train_batch_size": 256,
    "buffer_size": 10000,
    # Presumably three hidden layers of 128 units -- see the RLlib DQN docs.
    "hiddens": [128, 128, 128],
  }
  config = dqn.DEFAULT_CONFIG.copy()
  config.update(overrides)
  trainer = dqn.DQNTrainer(config=config, env=HiLoPricingEnv)

## drl-supply-chain-hilo-gym.py
import gym
from gym.spaces import Discrete, Box

class HiLoPricingEnv(gym.Env):
    def __init__(self, config):
        """Reset the environment and declare its action/observation spaces.

        ``config`` is accepted but not read here.  ``price_grid`` and ``T``
        are module-level names defined outside this snippet.
        """
        self.reset()
        # One discrete action per entry of price_grid.
        self.action_space = Discrete(len(price_grid))
        # Observations are float32 vectors of length 2*T with values in [0, 10000].
        self.observation_space = Box(0, 10000, shape=(2*T, ), dtype=np.float32)

    def reset(self):
	class SimpleSupplyChain(gym.Env):
	def __init__(self, config):
	self.reset()
	self.action_space = Box(low=0.0, high=20.0) # Continuous space
	self.observation_space = Box(low=-10000, high=10000)

	def reset(self):
	self.supply_chain = SupplyChainEnvironment()
	self.state = self.supply_chain.initial_state()
	return self.state.to_array()
	Optimized policy parameters:
	Factory (s, Q) = (0, 20)
	Warehouse 1 (s, Q) = (5, 5)
	Warehouse 2 (s, Q) = (5, 5)
	Warehouse 3 (s, Q) = (5, 10)

	Achieved profit: 6871.0
	from ax import optimize

	def evaluate_sQPolicy(p):
	policy = SQPolicy(
	p['factory_s'],
	p['factory_Q'],
	[ p['w1_s'], p['w2_s'], p['w3_s'], ],
	[ p['w1_Q'], p['w2_Q'], p['w3_Q'], ]
	)
	return np.mean(simulate(env, policy, num_episodes = 30))
	class SQPolicy(object):
	def __init__(self, factory_safety_stock,
	factory_reorder_amount, safety_stock, reorder_amount):
	self.factory_safety_stock = factory_safety_stock
	self.factory_reorder_amount = factory_reorder_amount
	self.safety_stock = safety_stock
	self.reorder_amount = reorder_amount

	def select_action(self, state):
	action = Action(state.warehouse_num)
	class SupplyChainEnvironment(object):
	...
	def step(self, state, action):
	demands = np.fromfunction(lambda j: self.demand(j+1, self.t), (self.warehouse_num,))

	# Calculating the reward (profit)
	total_revenue = self.unit_price * np.sum(demands)
	total_production_cost = self.unit_cost * action.production_level
	total_storage_cost = np.dot( self.storage_costs,
	np.maximum(state.stock_levels(), np.zeros(self.warehouse_num + 1)) )
	class SupplyChainEnvironment(object):
	...
	def demand(self, j, t): # Demand at warehouse j at time t
	return np.round(self.d_max/2 +
	self.d_max/2*np.sin(2*np.pi*(t + 2*j)/self.T*2) +
	np.random.randint(0, self.d_var))
	class SupplyChainEnvironment(object):
	def __init__(self):
	self.T = 26 # Episode duration
	self.warehouse_num = 3
	self.d_max = 5 # Maximum demand, units
	self.d_var = 2 # Maximum random demand variation, units

	self.unit_price = 100 # Unit price in dollars
	self.unit_cost = 40 # Unit cost in dollars
	class State(object):
	def __init__(self, warehouse_num, T, demand_history, t = 0):
	self.warehouse_num = warehouse_num
	self.factory_stock = 0
	self.warehouse_stock = np.repeat(0, warehouse_num)
	self.demand_history = demand_history
	self.T = T # Length of one episode
	self.t = t

	def to_array(self):
	import ray
	import ray.rllib.agents.dqn as dqn

	def train_dqn():
	config = dqn.DEFAULT_CONFIG.copy()
	config["log_level"] = "WARN"
	config["train_batch_size"] = 256
	config["buffer_size"] = 10000
	config["hiddens"] = [128, 128, 128]
	trainer = dqn.DQNTrainer(config=config, env=HiLoPricingEnv)
	import gym
	from gym.spaces import Discrete, Box

	class HiLoPricingEnv(gym.Env):
	def __init__(self, config):
	self.reset()
	self.action_space = Discrete(len(price_grid))
	self.observation_space = Box(0, 10000, shape=(2*T, ), dtype=np.float32)

	def reset(self):