Introduction to Reinforcement Learning for Economists
Run on Google Colab to test out the code!
The purpose of this post is to introduce some basic concepts from the reinforcement learning (RL) literature using the setting and notation that is familiar to economists studying infinite horizon dynamic decision problems. In particular, I make use of Rust (1987)'s bus engine replacement problem as a demonstrative example.
Note: the methods discussed here are mainly concerned with solving the dynamic problem, and not with estimation.
For those unfamiliar with RL, here are some recent breakthrough applications of these methods:
- AlphaGo beating human Go champion
- Playing Atari games by only observing raw image input
- Robotic manipulation of arbitrary objects
What is notable about these applications?
They all involve substantial complexity:
- A complex/large state space, as in the case of Go
- Complex transition dynamics, which are analytically intractable, as in the case of Atari games
- Complex policy functions, as is the case with the large number of motors involved in robotic manipulation
The reader should keep these more complex applications in mind. This notebook focuses on a simple case for expositional purposes, one with some particular features that RL struggles with. The real power of these methods only becomes apparent in much more complex settings, which I hope to revisit in future posts.
Setup and Notation¶
The methods presented here are appropriate for finite Markov Decision Processes, characterized by the following:
States: $x \in \mathcal{X}$, finite
Actions: $a \in \mathcal{A}(x)$, finite
Transition probabilities: $p(x'|x,a)$
Flow utility: $u(x,a)$
Discount factor: $\beta$
Policy function: $a^*(x) \in \mathcal{A}(x)$
The agent's problem: $$ \begin{align} \max_{a^*} \ \mathbb{E}_{x_1,x_2,\dots} &\Big[ \sum_{t=0}^\infty \beta^t u(x_t,a_t) \,\Big|\, x_0 \Big] \\ \text{s.t.} \quad a_t &= a^*(x_t) \\ \Pr(x_{t+1}|x_t) &= p(x_{t+1}|x_t,a^*(x_t)) \end{align} $$
We can write the problem in recursive form using the Bellman equation
$$ V(x) = \max_{a \in \mathcal{A}(x)} u(x,a) + \beta \sum_{x' \in \mathcal{X}} V(x') p(x'|x,a) $$
import random
import matplotlib.pyplot as plt
import torch

class dynamic_problem:
    def __init__(self, nbX, nbA, P, u, β):
        self.nbX = nbX
        self.nbA = nbA
        self.P = P
        self.u = u
        self.β = β
Bus engine replacement¶
Recall the problem studied by Rust (1987):
- Bus engines get worn over time and require increasing amounts of maintenance each month.
- Alternatively, the entire engine can be replaced at a high fixed cost
Here, the state variable $x$ represents the mileage of the engine. The action space in every period is $\mathcal{A} = \{0,1\}$, where $a=1$ means replacing the engine.
Utilities are:
- $u(x,0) = -c(x)$
- $u(x,1) = -R - c(0)$
with $c(x) = \theta x$. For the remainder, we fix $\theta = 10^{-3}$ and $R = 8000$.
The transition process for $x$ is s.t.:
- Conditional on $a_t = 0$, $x_{t+1} - x_t \sim$ Exponential($\lambda$)
- Conditional on $a_t = 1$, $x_{t+1} \sim$ Exponential($\lambda$)
This process will be discretized into a 201-point grid for $x \in [0, 300000]$ with a spacing of $1500$ miles in between points.
nbX = 201
nbA = 2

# Constructing the x_grid and transition probabilities
λ = 1/1500

def exp_cdf(c, λ):
    return 1 - torch.exp(-λ * c)

x_grid = torch.linspace(0, 3e5, nbX)
x_diff = torch.maximum(x_grid[:,None] - x_grid[None,:], torch.zeros(1)) # Get all the differences
x_diff = torch.cat([x_diff, torch.ones(1, nbX) * 1e36], 0)
P0 = exp_cdf(x_diff, λ) # Get CDFs at all the interval boundaries
P0 = P0[1:,] - P0[:-1,] # Convert to PMF of the intervals
# Transition probabilities conditional on replacement are just the transition
# probabilities from the first state. The syntax [:,None] adds an empty dimension.
P1 = P0[:,0][:,None].repeat(1,nbX)
P = torch.stack([P0,P1],-1)

# Utilities
θ = 1e-3
R = 8000
u = lambda x : torch.stack([-θ*x, -torch.tensor(R).repeat(len(x))], -1).squeeze()

# Discount factor
β = 0.97

bus_problem = dynamic_problem(nbX, nbA, P, u(x_grid), β)
bus_problem.x_grid = x_grid
bus_problem.x_grid = x_grid
Rust (1987) solution¶
Rust's approach uses the contraction mapping property of the Bellman equation to iterate over choice-specific value functions (which we will call Q-functions).
Let: $$ Q(x,a) \equiv u(x,a) + \beta \sum_{x' \in \mathcal{X}} V(x') p(x'|x,a) $$
Then we have the following contraction mapping: $$ Q(x,a) = u(x,a) + \beta \sum_{x' \in \mathcal{X}} p(x'|x,a) \max_{a' \in \mathcal{A}(x')} Q(x',a') $$
This is almost the contraction mapping from Rust (1987), simply without the random utility component.
This can be iterated on $Q(x,a)$ to solve the problem.
Algorithm¶
Denote the current iteration by $k$. Initialize $k=0$, $Q^0$
For every $k=0,1,\dots$:
- $Q^{k+1}(x,a) = u(x,a) + \beta \sum_{x' \in \mathcal{X}} p(x'|x,a) \max_{a' \in \mathcal{A}(x')} Q^k(x',a'), \quad \forall (x,a)$
- Stop when $|Q^k - Q^{k-1}| < \delta$
Key point: notice that at every iteration $k$, the update is performed over $|\mathcal{X}|\cdot|\mathcal{A}|$ points.
def cmap(self, Q_init, tol, max_iter = float('inf')):
    # Initialize
    Q_k = Q_init.clone()
    k = 0
    diff_hist = []
    diff = tol + 1
    while (diff > tol) & (k < max_iter):
        # Compute new Q
        Q_k1 = self.u + self.β * torch.sum(self.P * Q_k.max(1)[0][:,None,None], 0)
        # Record difference
        diff = torch.norm(Q_k1 - Q_k)
        diff_hist.append(diff)
        # Update Q
        Q_k = Q_k1
        k += 1
    diff_hist = torch.stack(diff_hist)
    if diff <= tol:
        print("Iteration converged after " + "{:,}".format(self.nbX*self.nbA*k) + " function updates")
    else:
        print("Hit maximum iterations after " + "{:,}".format(self.nbX*self.nbA*k) + " function updates")
    return Q_k, diff_hist

dynamic_problem.cmap = cmap
%%time
Q_init = torch.ones(bus_problem.nbX, bus_problem.nbA) * -2000
tol = 1e-8
Q_sol_rust, diff_hist_rust = bus_problem.cmap(Q_init,tol)
Iteration converged after 207,432 function updates CPU times: user 209 ms, sys: 109 ms, total: 318 ms Wall time: 49.5 ms
Computational performance¶
In terms of compute time, the classic method is extremely fast, despite having to perform what seems like a large (>200K) number of updates. Partly this can be attributed to the ease of executing the algorithm in parallel with very little overhead.
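As a quick check of this count (a minimal sketch, assuming bus_problem from the cells above is still in memory), the reported number of function updates is simply the number of contraction iterations times the $|\mathcal{X}|\cdot|\mathcal{A}| = 402$ points updated per iteration:
# 207,432 reported updates correspond to 516 contraction iterations of 402 updates each
print(207432 / (bus_problem.nbX * bus_problem.nbA))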
# Plots
fig, axs = plt.subplots(2, figsize=(10, 10))
axs[0].plot(x_grid, Q_sol_rust[:,1]-Q_sol_rust[:,0], label = "Contraction mapping")
axs[0].set_xlabel("Miles")
axs[0].set_ylabel("Relative value of replacement")
axs[0].legend()
axs[1].plot(x_grid, Q_sol_rust[:,1]-Q_sol_rust[:,0] >= 0, label = "Contraction mapping")
axs[1].set_xlabel("Miles")
axs[1].set_ylabel("Replacement policy")
axs[1].legend()
Benchmark solution¶
By leveraging the contraction mapping property of the Bellman equation and by iterating over all $(x,a)$ pairs equally, we can be sure that the solution we have obtained is correct up to our specified error tolerance.
To benchmark the other solutions, we can compare:
- The difference in payoffs between replacing and not replacing as implied by the computed Q-functions (upper plot).
- The resulting policy function or argmax (lower plot).
Q-learning¶
Q-learning, introduced by Watkins (1989), is another way of solving such an MDP.
An intuitive description of Q-learning is "learning by doing". It makes use of simulation to iterate on the Bellman equation. This affects both which $(x,a)$ pairs are updated at each iteration and how they are updated.
We can break down this approach into two elements: asynchronous updating and stochastic iteration
Asynchronous (and stochastic) update path¶
Whereas Rust (1987) updates every $(x,a)$ at every iteration, the basic Q-learning iteration only updates a single $(x,a)$ per iteration. The choice of $(x,a)$ is determined by simulation: $a$ is chosen according to the current policy, while the next $x$ is then drawn according to the transition probabilities.
Algorithm¶
Denote the current iteration by $k$. Initialize $k=0$, $x^0$, $Q^0$
For every $k=0,1,\dots, K$:
- $a^k = \arg \max_{a} Q^k(x^k,a)$
- Draw $x^{k+1}$ according to $p(x^{k+1}|x^k,a^k)$
- Update:
- If $(x,a) = (x^k,a^k)$: $Q^{k+1}(x,a) = u(x,a) + \beta \sum_{x' \in \mathcal{X}} p(x'|x,a) \max_{a' \in \mathcal{A}(x')} Q^k(x',a')$
- Otherwise: $Q^{k+1}(x,a) = Q^k(x,a)$
- Non-standard: check for convergence by keeping a running history of the size of function updates
Key point: note that, in contrast to the previous solution, the update is only performed at one point in $(x,a)$ space at every iteration. In principle, this leads to a more efficient use of information, since the update of one $(x,a)$ pair can immediately be used in the update of another $(x',a')$ pair instead of using 'outdated' information. This is a general feature of asynchronous iterative methods and usually comes at the cost of reduced parallelizability.
1st issue: Exploration¶
Picking $a^k = \arg \max_a Q^k(x^k,a)$ has the advantage of naturally improving our estimate of $Q$ at the points that occur most frequently in reality.
However, in the beginning, our guess for $Q$ may be substantially off, and we may fail to pick an action $\tilde{a}$ because the current guess for $Q(x,\tilde{a})$ is too low. Since we never pick it, its value is never updated, and the iteration gets stuck on a suboptimal path.
To fix this, we need to incorporate some form of exploration.
The simplest form of this is to pick $a^k$ according to the $\epsilon$-greedy policy:
- With probability $1-\epsilon$: $a^k = \arg \max Q^k(x^k,a)$
- With probability $\epsilon$: pick $a^k$ randomly
Depending on the transition matrix, some exploration in terms of the state space $\mathcal{X}$ may also be needed. This is apparent in the bus engine replacement problem: higher mileages can only be reached after passing through lower mileages, so the estimates at high-mileage states can be very noisy. A simple way to accomplish this is to periodically reset the state variable randomly.
# Asynchronous updating component of Q-learning
@torch.jit.script
def Q_learn0_script(U, P, β:float, nbX:int, nbA:int, Q_init, x_init, eps:float, tol:float, buffer_size:int, max_iter:float = float('inf'), reset_freq:float = float('inf')):
    # Initialize
    Q_k = Q_init.clone()
    x_k = x_init
    a_k = Q_k[x_k].argmax()
    k = 0
    diff_buffer = torch.ones(buffer_size)*tol
    while (torch.norm(diff_buffer) > tol) and (k < max_iter):
        # epsilon-greedy choice of a
        if torch.rand(1) > eps:
            a_k = Q_k[x_k].argmax()
        else:
            a_k = torch.randint(nbA, (1,))[0].long()
        # Calculate the temporal difference and add to buffer
        TD = U[x_k,a_k] + β * torch.sum(P[:,x_k,a_k] * Q_k.max(1)[0]) - Q_k[x_k,a_k]
        diff_buffer[k % buffer_size] = TD
        # Update Q
        Q_k[x_k,a_k] += TD
        # Update x, possibly randomly
        if k % reset_freq == 0:
            x_k = torch.randint(nbX,(1,))[0].long()
        else:
            x_k = torch.multinomial(P[:,x_k,a_k],1)[0]
        k += 1
    return Q_k, diff_buffer, k

def Q_learn0(self, Q_init, x_init, eps:float, tol:float, buffer_size:int, max_iter = float('inf'), reset_freq:float = float('inf')):
    Q_k, diff_buffer, k = Q_learn0_script(self.u, self.P, self.β, self.nbX, self.nbA, Q_init, x_init, eps, tol, buffer_size, max_iter, reset_freq)
    if torch.norm(diff_buffer) <= tol:
        print("Iteration converged after " + "{:,}".format(k) + " function updates")
    else:
        print("Hit maximum iterations after " + "{:,}".format(k) + " function updates")
    return Q_k, diff_buffer

dynamic_problem.Q_learn0 = Q_learn0
# Reasonable training parameters
x_init = torch.tensor(0)
eps = 0.02
tol = 1e-8
buffer_size = bus_problem.nbX*bus_problem.nbA
max_iter = 300000
reset_freq = 20
%time Q_sol_learn0, diff_buffer_learn0 = bus_problem.Q_learn0(Q_init, x_init, eps, tol, buffer_size, max_iter, reset_freq)
Iteration converged after 175,597 function updates CPU times: user 1min, sys: 36.5 s, total: 1min 36s Wall time: 14.1 s
# Plots
axs[0].plot(x_grid, Q_sol_learn0[:,1]-Q_sol_learn0[:,0], label = "Asynchronous updating")
axs[0].legend()
axs[1].plot(x_grid, Q_sol_learn0[:,1]-Q_sol_learn0[:,0] > 0, label = "Asynchronous updating")
axs[1].legend()
fig
Solution comparison¶
Comparing the solution with the benchmark, we see that it generally matches very well, except at a few points. This reflects the stochastic nature of the update path and the consequent difficulty of verifying convergence.
Furthermore, notice that the total number of function updates is comparable to the synchronous and parallel update method, while being much slower due to lack of parallelization and general overhead from executing a loop.
The lackluster improvement in terms of the number of function updates has partly to do with the nature of the problem, which results in heavily skewed sampling of the $(x,a)$ space.
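To quantify the match, here is a minimal check, assuming Q_sol_rust and Q_sol_learn0 from the cells above are still in memory (the exact number will vary across runs):
# Maximum absolute deviation of the asynchronous solution from the benchmark Q-values
print("Max |Q deviation|:", (Q_sol_learn0 - Q_sol_rust).abs().max().item())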
# Randomly pick a state every time
%time _ = bus_problem.Q_learn0(Q_init, x_init, 0.05, tol, buffer_size, max_iter, reset_freq = 1);
Iteration converged after 112,803 function updates CPU times: user 21.3 s, sys: 11.3 s, total: 32.6 s Wall time: 4.42 s
Here, we re-run the algorithm with the reset frequency set to 1, so that the state $x$ is randomized at every iteration. This yields a much stronger improvement in the number of function updates required.
Stochastic iteration¶
Now reconsider the term $\sum_{x' \in \mathcal{X}} p(x'|x,a) \max_{a' \in \mathcal{A}(x')} Q^k(x',a')$. In some cases, this can be very expensive to compute, in particular when the transition probabilities $p(x'|x,a)$ are intractable or take on a complex form.
Recall that we are already drawing the next state $x^{k+1}$ from $p(x'|x,a)$ anyway. We can use this draw to get a very noisy estimate of the above expectation, and average over many periods to smooth out the noise, using the same principle as stochastic optimization.
Essentially, when we visit $(x^k,a^k)$ in iteration $k$ and get a draw of the following state $x^{k+1}$, this gives a noisy estimate of $Q(x^k,a^k)$:
$$\tilde{Q}(x^k,a^k) = u(x^k,a^k) + \beta \max_{a'} Q^k(x^{k+1},a')$$
Because this estimate is noisy, we want to average it over multiple visits of $(x,a)$, so we do not fully update $Q$ to the new value.
Algorithm¶
Initialize $k=0$, $x^0$, $Q^0$.
For every $k=0,1,\dots$:
- $a^k = \arg \max_a Q^k(x^k,a)$
- Draw $x^{k+1}$ according to $p(x'|x^k,a^k)$
- Get the target value: $\tilde{Q}(x^k,a^k) = u(x^k,a^k) + \beta \max_{a'} Q^k(x^{k+1},a')$
- Update:
- $Q^{k+1}(x,a) = Q^{k}(x,a) + \alpha^k (\tilde{Q}(x^k,a^k) - Q^k(x,a))$ if $(x,a) = (x^k,a^k)$
- $Q^{k+1}(x,a) = Q^k(x,a)$ otherwise
Key point: note that the updates do not require integrating over the possible future states $x'$. In this example, with a pre-computed Markov transition matrix and a small state space, this provides little to no advantage. However, in situations with a very large state space and expensive-to-compute transition probabilities, this can be a significant advantage.
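To make the contrast concrete, here is a minimal sketch of what a single update needs in each case, assuming bus_problem from above is in memory (the names Q_tmp, x, a, and x_next are purely illustrative):
# A synchronous update at (x,a) integrates over all 201 possible next states ...
Q_tmp = torch.zeros(bus_problem.nbX, bus_problem.nbA)
x, a = 10, 0
full_expectation = torch.sum(bus_problem.P[:,x,a] * Q_tmp.max(1)[0])  # 201 terms
# ... while a stochastic update only touches a single sampled next state
x_next = torch.multinomial(bus_problem.P[:,x,a], 1)[0]
single_draw = Q_tmp[x_next].max()                                     # 1 term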
2nd issue: Convergence¶
With the new form of updating, we no longer have the convergence guarantees of the contraction mapping.
Instead, $Q^k$ will behave more like a random variable. If $\alpha^k$ is held fixed, it will never completely converge, only bounce around in the neighborhood of the true $Q$.
However, there are conditions on the sequence of $\alpha^k$ under which convergence is guaranteed:
- $\sum_{k=0}^\infty \alpha^k = \infty$
- $\sum_{k=0}^\infty (\alpha^k)^2 < \infty$
(Sutton and Barto, 2020, p. 33)
In practice, these conditions are not strictly observed, particularly because they can lead to very slow convergence rates. However, for econometric applications, guaranteed convergence may be more important.
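For illustration, a decaying schedule such as the hypothetical one below satisfies both conditions (the sum of $\alpha^k$ diverges while the sum of its squares converges); the code that follows instead keeps $\alpha^k$ fixed at 0.1 for simplicity.
# A hypothetical Robbins-Monro learning-rate schedule: α_k = 1/(1+k)^0.7
α_schedule = lambda k: 1.0 / (1 + k)**0.7
print([round(α_schedule(k), 3) for k in (0, 10, 100, 1000)])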
# Full Q-learning
@torch.jit.script
def Q_learn1_script(U, P, β:float, nbX:int, nbA:int, Q_init, x_init, α_k:float, eps:float, tol:float, buffer_size:int, max_iter:float = float('inf'), reset_freq:float = float('inf')):
    # Initialize
    Q_k = Q_init.clone()
    x_k = x_init
    a_k = Q_k[x_k].argmax()
    k = 0
    diff_buffer = torch.ones(buffer_size)*tol + 1
    while (torch.abs(torch.mean(diff_buffer)) > tol) and (k < max_iter):
        # epsilon-greedy choice of a
        if torch.rand(1) > eps:
            a_k = Q_k[x_k].argmax()
        else:
            a_k = torch.randint(nbA, (1,))[0].long()
        # draw next x
        x_k1 = torch.multinomial(P[:,x_k,a_k],1)[0]
        # Calculate the temporal difference and add to buffer
        TD = U[x_k,a_k] + β * Q_k[x_k1].max() - Q_k[x_k,a_k]
        diff_buffer[k % buffer_size] = TD
        # Update Q
        Q_k[x_k,a_k] += α_k * TD
        # Update x, possibly randomly
        if k % reset_freq == 0:
            x_k = torch.randint(nbX,(1,))[0].long()
        else:
            x_k = x_k1
        k += 1
    return Q_k, diff_buffer, k

def Q_learn1(self, Q_init, x_init, α_k:float, eps:float, tol:float, buffer_size:int, max_iter = float('inf'), reset_freq:float = float('inf')):
    Q_k, diff_buffer, k = Q_learn1_script(self.u, self.P, self.β, self.nbX, self.nbA, Q_init, x_init, α_k, eps, tol, buffer_size, max_iter, reset_freq)
    if torch.norm(diff_buffer) <= tol:
        print("Iteration converged after " + "{:,}".format(k) + " function updates")
    else:
        print("Hit maximum iterations after " + "{:,}".format(k) + " function updates")
    return Q_k, diff_buffer

dynamic_problem.Q_learn1 = Q_learn1
x_init = torch.tensor(0)
α_k = 0.1
eps = 0.04
tol = 1e-8
buffer_size = bus_problem.nbX*bus_problem.nbA
max_iter = 1000000
reset_freq = 20
%time Q_sol_learn1, diff_buffer_learn1 = bus_problem.Q_learn1(Q_init, x_init, α_k, eps, tol, buffer_size, max_iter, reset_freq)
Hit maximum iterations after 852,508 function updates CPU times: user 44.2 s, sys: 566 ms, total: 44.7 s Wall time: 43.8 s
# Plot
axs[0].lines.pop()
axs[0].plot(x_grid, Q_sol_learn1[:,1]-Q_sol_learn1[:,0], label = "Q-learning")
axs[0].legend()
axs[1].lines.pop()
axs[1].plot(x_grid, Q_sol_learn1[:,1]-Q_sol_learn1[:,0] > 0, label = "Q-learning")
axs[1].legend()
fig
Solution comparison¶
We can see here that the computed Q-functions approximate the benchmark fairly closely, though the noise due to stochastic iteration is apparent, even leading to an incorrect policy function in some states.
However, one should also note that this was accomplished with roughly 5 times as many updates. Since each update no longer integrates over the $|\mathcal{X}| = 201$ possible next states, in cases where computing the transition probabilities is the main computational burden this still represents a speedup factor of roughly 40!
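As before, here is a minimal check of the claim about the policy function, assuming Q_sol_rust and Q_sol_learn1 from the cells above are still in memory (the count is run-specific):
# Number of states where Q-learning implies a different replacement decision than the benchmark
policy_rust = Q_sol_rust[:,1] - Q_sol_rust[:,0] >= 0
policy_learn1 = Q_sol_learn1[:,1] - Q_sol_learn1[:,0] >= 0
print("States with a different policy:", (policy_rust != policy_learn1).sum().item())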
Deep Q-learning¶
So far, the algorithms presented use a large table to represent the Q-values (they are all tabular algorithms). With large state and action spaces, this implies that (a) a large table is needed to store the values (poor scaling in space complexity) and (b) a large number of updates is needed to reach convergence everywhere.
The recent breakthroughs linked at the beginning of this notebook were only made possible by combining Q-learning with state-of-the-art deep neural networks, which can represent the Q-values much more compactly and are generally flexible enough to approximate whatever underlying structure the Q-values have.
In general, any universal function approximator will work, as we typically expect the Q-values to be smooth enough in $(x,a)$ such that neighboring pairs will have similar Q-values. Using a function approximator will then also speed up convergence, since a single update can affect a larger region of the $(x,a)$ space.
Finally, the use of such an approximation also allows one to dispense with the discretization of a continuous state space.
In the following, let $Q^\theta$ denote an approximation to $Q$ parametrized by $\theta$.
Algorithm¶
Initialize $k=0$, $x^0$, $\theta^0$.
For every $k=0,1,\dots$:
- $a^k = \arg \max_a Q^{\theta^k}(x^k,a)$
- Draw $x^{k+1}$ according to $p(x'|x^k,a^k)$
- Get the target value: $\tilde{Q}^k = u(x^k,a^k) + \beta \max_{a'} Q^{\theta^k}(x^{k+1},a') $
- Update:
- $\theta^{k+1} = \theta^k - \alpha^k \nabla_\theta \frac{1}{2}\big( \tilde{Q}^k - Q^{\theta^k}(x^k,a^k) \big)^2$, holding the target $\tilde{Q}^k$ fixed
Notice: the update step corresponds to one step of gradient descent on the problem $\min_\theta \big(\tilde{Q}^k - Q^{\theta^k}(x^k,a^k)\big)^2$, where $\tilde{Q}^k$ is treated as a constant (this is what the detach() in the code below accomplishes).
# Defining the neural network structure.
from torch import nn
import torch.nn.functional as F
from torch import optim

class Fork_Net(nn.Module):
    def __init__(self, L1_size, L2_size, rescale_factor):
        super(Fork_Net, self).__init__()
        self.fc11 = nn.Linear(1, L1_size)
        self.fc12 = nn.Linear(L1_size, L2_size)
        self.fc13 = nn.Linear(L2_size, 1)
        self.fc21 = nn.Linear(1, L1_size)
        self.fc22 = nn.Linear(L1_size, L2_size)
        self.fc23 = nn.Linear(L2_size, 1)
        self.rescale_factor = rescale_factor

    def forward(self, X):
        x1 = F.relu(self.fc11(X*self.rescale_factor))
        x1 = F.relu(self.fc12(x1))
        x1 = self.fc13(x1)
        x2 = F.relu(self.fc21(X*self.rescale_factor))
        x2 = F.relu(self.fc22(x2))
        x2 = self.fc23(x2)
        return torch.cat([x1,x2], -1)
def deep_Q_learn(self, Q_net, x_init, α_k:float, eps:float, tol:float, buffer_size:int, max_iter:float = float('inf'), reset_freq:float = float('inf'), batch_size = 1):
    # Initialize
    opt_Q = optim.Adam(Q_net.parameters(), lr=α_k)
    x_k = torch.tensor([x_init])
    a_k = Q_net(self.x_grid[x_k]).argmax()
    k = 0
    diff_buffer = torch.ones(buffer_size)*tol + 1
    while (torch.mean(diff_buffer) > tol) and (k < max_iter):
        obj = 0
        for j in range(batch_size):
            # epsilon-greedy choice of a
            if random.random() > eps:
                a_k = Q_net(self.x_grid[x_k]).argmax()
            else:
                a_k = random.randint(0,1)
            # draw next x
            x_k1 = torch.multinomial(self.P[:,x_k[0],a_k],1)
            # Calculate the temporal difference and add to objective
            # (detach is used so that the gradient is not computed wrt the target term)
            TD = self.u[x_k, a_k] + self.β * Q_net(self.x_grid[x_k1]).max().detach() - Q_net(self.x_grid[x_k])[a_k]
            obj += (TD**2)/2
            # Update x, possibly randomly
            if (j + k*batch_size) % reset_freq == 0:
                x_k = torch.tensor([random.randint(0,self.nbX-1)])
            else:
                x_k = x_k1
        diff_buffer[k % buffer_size] = obj[0]/batch_size
        # Update network parameters
        opt_Q.zero_grad()
        (obj/batch_size).backward()
        opt_Q.step()
        k += 1
    if torch.norm(diff_buffer) <= tol:
        print("Iteration converged after " + "{:,}".format(k*batch_size) + " function updates")
    else:
        print("Hit maximum iterations after " + "{:,}".format(k*batch_size) + " function updates")
    return Q_net, diff_buffer

dynamic_problem.deep_Q_learn = deep_Q_learn
Initial parameters¶
The parameters are initialized by the PyTorch library. For context, here are the Q-functions implied by the freshly initialized, untrained network.
# Initialized network
Q_net = Fork_Net(16,16,1/3e5)
plt.plot(Q_net(x_grid[:,None]).detach())
x_init = torch.tensor(0)
α_k = 5e-2
eps = 0.03
tol = 1e-8
buffer_size = bus_problem.nbX*bus_problem.nbA
max_iter = 1500
reset_freq = 20
batch_size = 20
%time Q_net, diff_buffer_deep = bus_problem.deep_Q_learn(Q_net, x_init, α_k, eps, tol, buffer_size, max_iter, reset_freq, batch_size)
Hit maximum iterations after 30,000 function updates CPU times: user 8.41 s, sys: 157 ms, total: 8.56 s Wall time: 8.28 s
Q_sol_deep = Q_net(x_grid[:,None]).detach()
# Plot
axs[0].lines.pop()
axs[0].plot(x_grid, Q_sol_deep[:,1]-Q_sol_deep[:,0], label = "Deep Q-learning")
axs[0].legend()
axs[1].lines.pop()
axs[1].plot(x_grid, Q_sol_deep[:,1]-Q_sol_deep[:,0]>0, label = "Deep Q-learning")
axs[1].legend()
fig
Solution comparison¶
We can immediately see that the approximation excessively smooths out the value functions. In principle this can be remedied with a larger number of iterations and smaller step-sizes, at the cost of increased computing time. This essentially echoes the bias-variance trade-off that is found throughout machine learning and non-parametric estimation.
However, note that even with a clear bias in the value functions, the resulting policy function is not far off, with replacement simply occurring a bit earlier. This demonstrates that a large bias in the value functions need not translate into an equally large bias in the policy function.
Finally, note that this solution was obtained after only 30,000 function updates. Compared to the tabular Q-learning solution, which used roughly 850,000 updates, this represents a speedup factor of nearly 30!
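To put a number on "a bit earlier", and to illustrate the point about dispensing with discretization, here is a short sketch, assuming Q_sol_rust, Q_sol_deep, Q_net, and x_grid from the cells above are still in memory (and that each solution prescribes replacement somewhere on the grid, as in the plots):
# Mileage at which each solution first prescribes replacement
threshold = lambda Q: x_grid[torch.nonzero(Q[:,1] - Q[:,0] >= 0)[0,0]].item()
print("Benchmark threshold:      ", threshold(Q_sol_rust), "miles")
print("Deep Q-learning threshold:", threshold(Q_sol_deep), "miles")
# The network can also be evaluated at mileages that are not grid points
print(Q_net(torch.tensor([[123456.0]])).detach())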
Conclusion¶
The objective of this notebook was to give a brief but concrete introduction to some of the methods used in the modern reinforcement learning literature.
The key advantages are:
- Fewer iterations required to reach an adequate solution
- Far fewer computations of transition probabilities
- Can be used with a simulation model without analytical transition probabilities
- Naturally combines well with function approximation to deal with large state-action spaces
The key disadvantages are:
- Convergence is no longer guaranteed and can be difficult to verify
- Parallelization is possible but less trivial
- Function approximation introduces bias
As a reminder, the example problem here is used for its familiarity to economists, but certainly does a poor job of showing the advantage of reinforcement learning techniques. In future notebooks, I hope to provide other examples that better showcase the power of these tools.
References¶
Setup and notation:
- Rust, J. (1987). Optimal Replacement of GMC Bus Engines: An Empirical Model of Harold Zurcher. Econometrica, 55(5), 999-1033.
Papers in economics:
- Iskhakov, F., Rust, J., & Schjerning, B. (2020). Machine learning and structural econometrics: contrasts and synergies. The Econometrics Journal, 23(3), S81-S124.
- Igami, M. (2020). Artificial intelligence as structural estimation: Deep Blue, Bonanza, and AlphaGo. The Econometrics Journal, 23(3), S1-S24.
Reference textbook:
- Sutton, R. S., & Barto, A. G. (2018). Reinforcement Learning: An Introduction (2nd ed.). MIT Press.
Web resources: