% Create an MDP model with eight states and three actions ("up", "stay", and "down").
MDP = createMDP(14,["up";"stay";"down"]);

% Specify the state transition and reward matrices for the MDP

% state 1 transition and reward: (k,x) = (0,0)
MDP.T(1,2,1) = 1;
MDP.R(1,2,1) = -5;
MDP.T(1,3,2) = 1;
MDP.R(1,3,2) = -2;
MDP.T(1,4,3) = 1;
MDP.R(1,4,3) = -3;

% state 2 transition and reward: (k,x) = (1,1)
MDP.T(2,2,1) = 1;    % dummy transition
MDP.R(2,2,1) = -10;  % dummy transition
MDP.T(2,5,2) = 1;
MDP.R(2,5,2) = -1;
MDP.T(2,6,3) = 1;
MDP.R(2,6,3) = -3;
% state 3 transition and reward: (k,x) = (1,0)
MDP.T(3,5,1) = 1;
MDP.R(3,5,1) = -2;
MDP.T(3,6,2) = 1;
MDP.R(3,6,2) = -4;
MDP.T(3,7,3) = 1;
MDP.R(3,7,3) = -3;
% state 4 transition and reward: (k,x) = (1,-1)
MDP.T(4,6,1) = 1;
MDP.R(4,6,1) = -4;
MDP.T(4,7,2) = 1;
MDP.R(4,7,2) = -5;
MDP.T(4,4,3) = 1;    % dummy transition
MDP.R(4,4,3) = -10;  % dummy transition

% state 5 transition and reward: (k,x) = (2,1)
MDP.T(5,5,1) = 1;    % dummy transition
MDP.R(5,5,1) = -10;  % dummy trnasition
MDP.T(5,8,2) = 1;
MDP.R(5,8,2) = -1;
MDP.T(5,9,3) = 1;
MDP.R(5,9,3) = -1;
% state 6 transition and reward: (k,x) = (2,0)
MDP.T(6,8,1) = 1;
MDP.R(6,8,1) = -2;
MDP.T(6,9,2) = 1;
MDP.R(6,9,2) = -3;
MDP.T(6,10,3) = 1;
MDP.R(6,10,3) = -3;
% state 7 transition and reward: (k,x) = (2,-1)
MDP.T(7,9,1) = 1;
MDP.R(7,9,1) = -2;
MDP.T(7,10,2) = 1;
MDP.R(7,10,2) = -1;
MDP.T(7,7,3) = 1;   % dummy transition
MDP.R(7,7,3) = -10; % dummy transition

% state 8 transition and reward: (k,x) = (3,1)
MDP.T(8,8,1) = 1;   % dummy transition
MDP.R(8,8,1) = -10; % dummy transition
MDP.T(8,11,2) = 1;
MDP.R(8,11,2) = -1;
MDP.T(8,12,3) = 1;
MDP.R(8,12,3) = -2;
% state 9 transition and reward: (k,x) = (3,0)
MDP.T(9,11,1) = 1;
MDP.R(9,11,1) = -3;
MDP.T(9,12,2) = 1;
MDP.R(9,12,2) = -4;
MDP.T(9,13,3) = 1;
MDP.R(9,13,3) = -2;
% state 10 transition and reward: (k,x) = (3,-1)
MDP.T(10,12,1) = 1;
MDP.R(10,12,1) = -1;
MDP.T(10,13,2) = 1;
MDP.R(10,13,2) = -5;
MDP.T(10,10,3) = 1;    % dummy transition
MDP.R(10,10,3) = -10;  % dummy transition

% state 11 transition and reward: (k,x) = (4,1)
MDP.T(11,11,1) = 1;    % dummy transition
MDP.R(11,11,1) = -10;  % dummy transition
MDP.T(11,11,2) = 1;    % dummy transition
MDP.R(11,11,2) = -10;  % dummy transition
MDP.T(11,14,3) = 1;
MDP.R(11,14,3) = -2;
% state 12 transition and reward: (k,x) = (4,0)
MDP.T(12,12,1) = 1;    % dummy transition
MDP.R(12,12,1) = -10;  % dummy transition
MDP.T(12,14,2) = 1;
MDP.R(12,14,2) = -3;
MDP.T(12,12,3) = 1;    % dummy transition
MDP.R(12,12,3) = -10;  % dummy transition
% state 13 transition and reward: (k,x) = (4,-1)
MDP.T(13,14,1) = 1;
MDP.R(13,14,1) = -4;
MDP.T(13,13,2) = 1;    % dummy transition
MDP.R(13,13,2) = -10;  % dummy transition
MDP.T(13,13,3) = 1;    % dummy transition
MDP.R(13,13,3) = -10;  % dummy transition

% state 14 transition and reward: (k,x) = (5,0)
MDP.T(14,14,1) = 1;    % dummy transition
MDP.R(14,14,1) = -10;  % dummy transition
MDP.T(14,14,2) = 1;    % dummy transition
MDP.R(14,14,2) = -10;  % dummy transition
MDP.T(14,14,3) = 1;    % dummy transition
MDP.R(14,14,3) = -10;  % dummy transition

% Specify states "s14" as terminal state of the MDP.
MDP.TerminalStates = ["s14"];

%Create the reinforcement learning MDP environment for this process model.
env = rlMDPEnv(MDP);

% To specify that the initial state of the agent is always state 1, 
% specify a reset function that returns the initial agent state. 
% This function is called at the start of each training episode and 
% simulation. Create an anonymous function handle that sets the initial 
% state to 1.
env.ResetFcn = @() 1;

% Fix the random generator seed for reproducibility.
rng(0)

% Create Q-Learning Agent
obsInfo = getObservationInfo(env);
actInfo = getActionInfo(env);
qTable = rlTable(obsInfo, actInfo);
qFunction = rlQValueFunction(qTable, obsInfo, actInfo);
qOptions = rlOptimizerOptions("LearnRate",1);

% Create a Q-learning agent using this table representation, 
% configuring the epsilon-greedy exploration.
agentOpts = rlQAgentOptions;
agentOpts.DiscountFactor = 1;
agentOpts.EpsilonGreedyExploration.Epsilon = 0.9;
agentOpts.EpsilonGreedyExploration.EpsilonDecay = 0.01;
agentOpts.CriticOptimizerOptions = qOptions;
qAgent = rlQAgent(qFunction,agentOpts); %#ok<NASGU> 

% Train Q-Learning Agent
trainOpts = rlTrainingOptions;
trainOpts.MaxStepsPerEpisode = 50;
trainOpts.MaxEpisodes = 500;
trainOpts.StopTrainingCriteria = "AverageReward";
trainOpts.StopTrainingValue = -10;  
trainOpts.ScoreAveragingWindowLength = 30;

trainingStats = train(qAgent,env,trainOpts); %#ok<UNRCH> 

% Validate Q-Learning Results
Data = sim(qAgent,env);
cumulativeReward = sum(Data.Reward)

QTable = getLearnableParameters(getCritic(qAgent));
QTable{1}