import './frozenlake.css';

import ListGroup from 'react-bootstrap/ListGroup';
import Table from 'react-bootstrap/Table';

import { CopyBlock, dracula } from "react-code-blocks";

import Markdown from 'react-markdown';
import remarkMath from 'remark-math';
import { MathJaxContext, MathJax } from 'better-react-mathjax';
import Alert from 'react-bootstrap/Alert';

import eightByEightPreview from './Files/Resources/8x8preview.gif'
import observeAction from './Files/Resources/ObserveAction.png'
import fourbyfour from './Files/Resources/4x4Image.png'
import eightbyeight from './Files/Resources/8x8Image.png'
import custommap from './Files/Resources/custommap4x4.png'
import largemap from './Files/Resources/largemap.png'


function MyCodeComponent({ code, language }) {
  return (


    


    <CopyBlock
      text={code}
      language={language}
      showLineNumbers={true}
      theme={dracula}
      wrapLines
    />
  );
}
    

function FrozenLake(){

  const markdownContentOne = `
## Brief Gymnasium Overview
[Gymnasium](https://gymnasium.farama.org/) is a useful Python library based on [OpenAI's Gym library](https://github.com/openai/gym?tab=readme-ov-file). This library provides a variety of environments for training and testing reinforcement learning algorithms. You control an agent by choosing from different actions, influencing what state the agent ends up in next. Each environment in Gymnasium provides a non-zero reward (negative or positive) after each step and/or upon reaching the desired end goal. These rewards help reinforcement learning algorithms determine the effectiveness of actions taken in a given state, allowing adjustments in future episodes to determine the best possible action for each state. The environments offer various options that can increase the complexity of finding optimal solutions.

## The Frozen Lake Environment
For the purpose of testing Q-Learning in a relatively simple environment, I will be utilizing the [Frozen Lake](https://gymnasium.farama.org/environments/toy_text/frozen_lake/) environment. The goal of Frozen Lake is to cross a frozen lake from the starting point to the goal without falling into any holes. Frozen Lake comes in both a 4x4 and an 8x8 environment. For this guide, I will be using the 8x8 environment to make the challenge slightly more complex and gather more insights in the process.
`;




  return (
      <div id="frozenlakediv">

      <div className="leftrightdiv">
          <h1>Frozen Lake Reinforcement Learning Step by Step Guide</h1>
          <a href="/#/projects" class="returntorpojects">Return to Projects</a>
      </div>

      <ListGroup variant="flush">
          <ListGroup.Item><b>Category:</b> Reinforcement Learning (Q-Learning)</ListGroup.Item>
          <ListGroup.Item><b>Language:</b> Python</ListGroup.Item>
          <ListGroup.Item><b>Link:</b> <a href="https://github.com/adadamc/FrozenLake_StepByStep" target="_blank">GitHub</a></ListGroup.Item>
          <ListGroup.Item><b>Notes:</b> Guide (via <a href="https://github.com/adadamc/FrozenLake_StepByStep/blob/main/README.md" target="_blank">README</a>) and code (via <a href="https://github.com/adadamc/FrozenLake_StepByStep/blob/main/RL_FrozenLake.ipynb" target="_blank">Jupyter Notebook</a>) also available via above GitHub link</ListGroup.Item>
      </ListGroup>

      <br></br>

      <Alert variant="warning">
      <Alert.Heading>This page is under construction!</Alert.Heading>
      <p>
        The original guide is currently being formatted for this webpage. In the meantime, the full guide is available via GitHub: <a id="speciallink" href="https://github.com/adadamc/FrozenLake_StepByStep" target="_blank">GitHub</a>.
      </p>
    </Alert>

      <br></br>

      <MathJaxContext> 
      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {markdownContentOne}
      </Markdown>

      <img src={eightByEightPreview} alt="Frozen Lake 8x8 Environment" />
      <br></br>
      <br></br>
      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Understanding and Traversing the Environment`}
      </Markdown>

      
      <Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>Space</th>
          <th>Size</th>
          <th>Values</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>Action Space</td>
          <td>Discrete(4)</td>
          <td>0: Move left, 1: Move down, 2: Move right, 3: Move up</td>
        </tr>
        <tr>
          <td>Observation Space</td>
          <td>Discrete(64)</td>
          <td>0-63 (Position calculation: current_row * ncols + current_col)</td>
        </tr>
      </tbody>
    </Table>

    <img src={observeAction} alt="Observation and Action Space" />

    <br></br>
    <br></br>

    In the above image we can see that the observation space is represented as integers from 0-63 where 0 is the starting position (top left) and 63 is the goal position (bottom right). You can calculate a position as the <code>current_row * ncols + current_col</code>. As mentioned in the table, the agent can take 1 of 4 actions:

  <ul>
    <li><b>0:</b> Move left</li>
    <li><b>1:</b> Move down</li>
    <li><b>2:</b> Move right</li>
    <li><b>3:</b> Move up</li>
  </ul>

  In this example, taking action 1 (move down) moves the agent to position 17 (unless is_slippery is enabled, which is explained below).
<br></br>
<br></br>

  <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### is_slippery`}
      </Markdown>

      When creating the environment, <code>is_slippery</code> can be set to True or False.
      If disabled, you will always move in your intended direction. If enabled, you will only move in your intended direction 1/3 of the time, 
      otherwise, you have an equal 1/3 chance of slipping in either perpendicular direction. For example:

      <br></br>
      <br></br>

      <Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>Reached</th>
          <th>Reward</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>Goal</td>
          <td>+1</td>
        </tr>
        <tr>
          <td>Hole</td>
          <td>0</td>
        </tr>
        <tr>
          <td>Frozen</td>
          <td>0</td>
        </tr>
      </tbody>
    </Table>

    <br></br>

    <b>Possible Endings:</b>
    <ul>
    <li>Goal Reached</li>
    <li>Player moves into a hole</li>
    <li>Length of episode (actions taken) reaches 200 (100 if using the 4x4 environment)</li>
  </ul>

  While there is technically no negative reward, reaching the episode limit or falling into a hole is undesirable as we will not get a reward for that episode, and we will be unable to tell if the actions we took in that episode were in the
   "right direction" up to a certain point as we receive no positive feedback for partial progress.
<br></br>
<br></br>
   <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`## Setup
### Libraries / Imports`}
      </Markdown>

      <b>We will need a few libraries in order to:</b>
    <ul>
    <li>Use Gymnasium</li>
    <li>Debug / Monitor Performance of our Q-Learning Algorithm</li>
  </ul>


  These can be imported as:
  <CopyBlock
          text={`import gymnasium as gym
import numpy as np
import time
import math
from matplotlib import pyplot as plt`}
          language="python"
          theme={dracula}
          wrapLines
        />
<br></br>
<Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Initializing the Environment`}
      </Markdown>
      The Frozen Lake environment can be initialized with the <code>gym.make('FrozenLake-v1')</code> command. There are various arguments that 
      can be used to customize the environment.

      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### map_name`}
      </Markdown>

      The <code>map_name</code> argument allows us to specify the map size,
      either <code>4x4</code> or <code>8x8</code>. In our examples, we will be
      using <code>8x8</code>.

      <CopyBlock
          text={`gym.make('FrozenLake-v1', map_name="8x8")`}
          language="python"
          theme={dracula}
          wrapLines
        />
<br></br>
<Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th><code>gym.make('FrozenLake-v1', map_name="8x8")</code></th>
          <th><code>gym.make('FrozenLake-v1', map_name="4x4")</code></th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td><img src={eightbyeight} alt="" /></td>
          <td><img src={fourbyfour} alt="" /></td>
        </tr>
      </tbody>
    </Table>

    <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### desc`}
      </Markdown>
      The <code>desc</code> argument can be used to customize the map.

      <Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>Letter</th>
          <th>Tile</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>S</td>
          <td>Start Tile</td>
        </tr>
        <tr>
          <td>G</td>
          <td>Goal Tile</td>
        </tr>
        <tr>
          <td>F</td>
          <td>Frozen Tile</td>
        </tr>
        <tr>
          <td>H</td>
          <td>Tile with a Hole</td>
        </tr>
      </tbody>
    </Table>

    The <code>desc</code> argument is written in the following format (for 4x4): <code>desc=["SFFF", "FHFH", "FFFH", "HFFG"]</code>.

    For example, <code>env = gym.make("FrozenLake-v1", map_name="4x4", render_mode="human", desc=["SHHH", "HHHH", "GGGG", "FFFF"])</code>
    will create the following environment:
    <br></br>
    <br></br>
    <img src={custommap} alt="" />
    <br></br>
    Additionally, the generate_random_map function can be imported and used to generate a random map of any size.

    <CopyBlock
          text={`from gymnasium.envs.toy_text.frozen_lake import generate_random_map
env = gym.make("FrozenLake-v1", desc=generate_random_map(size=12), render_mode="human")`}
          language="python"
          theme={dracula}
          wrapLines
        />

        <br></br>

        The above code generated this 12x12 environment:
        <br></br>
        <img src={largemap} alt="" />
        <br></br>
        <br></br>
        <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### render_mode`}
      </Markdown>

      To help visualize what is going on in the environment, there are various render options. By default, <code>render_mode</code> is set to None as it
       would not make sense to visually print tens of thousands, or even millions of episodes.

       <Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>render_mode</th>
          <th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>None</td>
          <td>No render is computed <b>(Default)</b></td>
        </tr>
        <tr>
          <td>human</td>
          <td>Continuous rendering in the current display / terminal. The images shown earlier in this guide are all from the human render_mode.</td>
        </tr>
        <tr>
          <td>rgb_array</td>
          <td>Not used in this guide, more info: <a href="https://gymnasium.farama.org/api/env/#gymnasium.Env.render" target="_blank">here</a></td>
        </tr>
        <tr>
          <td>ansi</td>
          <td>Not used in this guide, more info: <a href="https://gymnasium.farama.org/api/env/#gymnasium.Env.render" target="_blank">here</a></td>
        </tr>
        <tr>
          <td>rgb_array_list</td>
          <td>Not used in this guide, more info: <a href="https://gymnasium.farama.org/api/env/#gymnasium.Env.render" target="_blank">here</a></td>
        </tr>
      </tbody>
    </Table>

    <b></b>
    <b>For now, we will create a function <code>run_episodes</code> to allow us to run a specified number of episodes in an environment,
    gather the results, and specify variables in our Q-Learning algorithm.</b>

    <CopyBlock
          text={`def run_episodes(episodes, learning_rate=0.05, discount_factor=0.95, epsilon=1, epsilon_change=0.01, slippery=True, render=None):
    env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=slippery, render_mode=render)`}
          language="python"
          theme={dracula}
          wrapLines
        />

<br></br>
I have set some default values for a lot of these parameters; however,
 optimal values for these can vary based on map size, episode count, the <code>is_slippery</code> setting,
  and any other environment changes. <code>learning_rate</code>, <code>discount_factor</code>, <code>epsilon</code>, and <code>epsilon_change</code> will
   be explained further down in the guide (as they relate more to the Q-Learning calculation that will be used).
    The other parameters do the following:

    <Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>Parameter</th>
          <th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>episodes</td>
          <td>Episode count (ex. 500 episodes would mean that the "game" is played 500 times until we reach one of the end conditions for each episode)</td>
        </tr>
        <tr>
          <td>slippery</td>
          <td><code>True</code> or <code>False</code>, enables or disables <code>is_slippery</code> option in the environment</td>
        </tr>
        <tr>
          <td>render</td>
          <td>One of the above render_mode options can be specified here. <code>'human'</code> is useful, however, it should not be used for a large episode count as it significantly increases runtime</td>
        </tr>
      </tbody>
    </Table>

    <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`## Keeping Track of Results`}
      </Markdown>
      We will create some NumPy arrays to help track results so we can see the effectiveness of our implementation.

      <CopyBlock
          text={`# 64 states (0 to 63) and 4 actions (0 = left, 1 = down, 2 = right, 3 = up)
q = np.zeros((env.observation_space.n, env.action_space.n)) # q-value storage
rng = np.random.default_rng() # random number from 0 to 1 (to determine if random action should be taken)
completions = np.full(episodes,False)
ep_lengths = np.zeros(episodes)
ep_epsilons = np.zeros(episodes)
checkpoints = math.floor(episodes/10) # Print statement at 10% completion intervals`}
          language="python"
          theme={dracula}
          wrapLines
        />

    <br></br>

    <Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>Variable</th>
          <th>Used For</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>q</td>
          <td>Used for q-value storage, this starts off as an array of 0's of size 64x4. This allows us to store a Q-Value for every combination of position and action.</td>
        </tr>
        <tr>
          <td>rng</td>
          <td>Generator used to choose a number between 0 and 1. If the random number generated is lower than our <code>epsilon</code> value, we will take a random action. Otherwise, we will take the best action as determined by our Q-Values.</td>
        </tr>
        <tr>
          <td>completions</td>
          <td>An array with an entry for each episode we run, it starts off as <code>False</code>. If we reach the goal in an episode, the value at the index for the current episode will be changed to <code>True</code>.</td>
        </tr>
        <tr>
          <td>ep_lengths</td>
          <td>An array with an entry for each episode we run, it starts off with 0 values, when an episode completes, we will update the value at the index for the current episode with the amount of actions taken that episode.</td>
        </tr>
        <tr>
          <td>ep_epsilons</td>
          <td>We will be using a decaying epsilon value (to be discussed later in the guide). This array has an entry for each episode, which will be updated with the epsilon value for each individual episode.</td>
        </tr>
        <tr>
          <td>checkpoints</td>
          <td>Just for tracking the progress of running episodes. This calculates how many episodes it takes to reach 10% of episodes completed. We will print out some basic statistics at each 10% checkpoint.</td>
        </tr>
      </tbody>
    </Table>

    <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`## Running Episodes and Q-Learning Implementation`}
      </Markdown>

      Let's look at the code for running each episode, choosing an action, and updating our q-values/results.


    </MathJaxContext>

    <CopyBlock
          text={`for _ in range(episodes):
  state, info = env.reset()
        
  if (_+1)%checkpoints==0:
    print("Ep", _, " , Epsi:", round(epsilon,3), " | Comp:", completions.sum(), " | Success Rate:", round(completions.sum()/_,3)*100,"%")
          

    while True:

      if rng.random() < epsilon:
        action = env.action_space.sample() # Random action
      else:
        action = np.argmax(q[state,:])

      # new_state: After taking the action calculated above, what position are we now in? (0-63)
      # reward: The reward for taking that action (reach goal = +1, reach hole/frozen = 0)
      # terminated: True if the player moves into a hole OR the player reaches the goal
      # truncation: True if the limit (length of episode) is reached, this is 200 for 8x8 env
      # info: number from 0 to 1 with odds of taking the action requested (1/3 if is_slippery, 1 otherwise)
      new_state, reward, terminated, truncated, info = env.step(action)

      if reward == 1:
        completions[_] = True

      q[state,action] = q[state,action] + learning_rate * (reward + discount_factor * max(q[new_state,:]) -q[state,action])

      state = new_state
      ep_lengths[_] += 1

      if terminated or truncated:
        break

    ep_epsilons[_] = epsilon
    epsilon -= epsilon_change # Lower Epsilon by specified amount
    if epsilon < 0:
      epsilon = 0`}
          language="python"
          theme={dracula}
          wrapLines
        />

Let's break it down step by step:
<br></br>
<br></br>
<Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Monitoring Current Progress`}
      </Markdown>

      <CopyBlock
          text={`for _ in range(episodes):
  state, info = env.reset()`}
          language="python"
          theme={dracula}
          wrapLines
        />

In the above code section we run a for loop for the specified number of episodes. We start each iteration with an environment reset. <code>env.reset()</code> will return the starting state (<code>state</code>) and debug info (<code>info</code>).
<br></br>
<br></br>
The values returned by the reset function are as follows:

<Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>Variable</th>
          <th>Returned Value</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>state</td>
          <td><code>0</code> (this is the starting position)</td>
        </tr>
        <tr>
          <td>info</td>
          <td><code>&#123;&#39;prob&#39;: 1&#125;</code> (since you always start in position 0)</td>
        </tr>
      </tbody>
    </Table>

    <CopyBlock
          text={`if (_+1)%checkpoints==0:
  print("Ep", _, " , Epsi:", round(epsilon,3), " | Comp:", completions.sum(), " | Success Rate:", round(completions.sum()/_,3)*100,"%")`}
          language="python"
          theme={dracula}
          wrapLines
        />

<br></br>
The <b>above code section is optional</b>. It is used to print out some of the more important statistics at every 10% checkpoint (ex. if there are 1,000 episodes: this will be printed out every 100 episodes)
<br></br>
<br></br>
Example output: <code>Episode 1999  , Epsilon: 0.78  | Completions so Far: 40  | Success Rate so Far: 2.0 %</code>
<br></br>
<br></br>
This output tells us that as of the 2,000th episode (count starts at 0), the epsilon value is 0.78, 40 of the 2,000 episodes (2.0%) have resulted in reaching the goal.
<br></br>
<br></br>
<Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Epsilon and Exploration`}
      </Markdown>


      <CopyBlock
          text={`while True:
  if rng.random() < epsilon:
    action = env.action_space.sample() # Random action
  else:
    action = np.argmax(q[state,:])`}
          language="python"
          theme={dracula}
          wrapLines
        />

<br></br>
In the above code block, we will run this loop forever
 (there is a <code>break</code> condition later if we fall into a hole, reach the goal, or reach our action limit).  
 This is the point where we decide if we are going to choose a random action or what we
  have currently observed to be the best action. <code>rng.random()</code> returns a number between [0.0,1.0).
   Therefore, if our <code>epsilon</code> value is set to 1: we will <b>always take a random action</b>, this random
    action is chosen through the <code>env.action_space.sample()</code> function. Likewise, if our <code>epsilon</code> value is
     set to 0: we will <b>always take an action based on the optimal q-value in our given state</b>. For
      example, if we are picking an optimal Q-Value and are currently in state position 15, we
       will choose to go left, right, up, or down based on which of the 4 actions are highest in index 15
        of the state dimension in our <code>q</code> array. This is checked via <code>np.argmax(q[state,:])</code>.
<br></br>
<br></br>
<b>It is generally best to start with a very high
 epsilon (1 since we have no current data to base our choices off of)</b>. This ensures
  we initially only choose random actions. Over time, as our <code>q</code> array gets updated, we will
   make more decisions based off of that.


<br></br>
<br></br>
   <Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>Epsilon (ε)</th>
          <th>Odds</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>1</td>
          <td>Random action is chosen 100% of the time</td>
        </tr>
        <tr>
          <td>0.75</td>
          <td>Random action is chosen 75% of the time, 25% of the time we choose what we currently believe is the optimal action given our current state</td>
        </tr>
        <tr>
          <td>0.5</td>
          <td>Random action is chosen 50% of the time, 50% of the time we choose what we currently believe is the optimal action given our current state</td>
        </tr>
        <tr>
          <td>0.25</td>
          <td>Random action is chosen 25% of the time, 75% of the time we choose what we currently believe is the optimal action given our current state</td>
        </tr>
        <tr>
          <td>0</td>
          <td>We always choose what we believe is the optimal action given our current state <b>(no exploration)</b></td>
        </tr>
      </tbody>
    </Table>

    

    It is important to have a balance when it comes to epsilon values. 
    <br></br>
    <br></br>

If it is always high, we will never actually use what we have observed before,
 which will also make it more difficult to find our goal and optimal solution,
  as it is very unlikely our agent will even be able to make it near the goal if it
   can not use the information it has gathered along the way to avoid falling into a hole or going in circles.
   <br></br>
    <br></br>

If the epsilon value is too low, we risk missing out on a potentially
 better solution, since we will always be going with what we <b>think</b> is best.
  Imagine if you are driving with a GPS that has no real-time traffic,
   outdated maps, and old speed limits. It may tell you a road is the best for getting
    to your destination, but what if a new road with less traffic and a higher speed limit has
     been built since then? If we never explore, we risk missing out on a more optimal path.

    <br></br>
    <br></br>
     <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Taking our Action`}
      </Markdown>


      <CopyBlock
        text={`new_state, reward, terminated, truncated, info = env.step(action)

if reward == 1:
  completions[_] = True`}
          language="python"
          theme={dracula}
          wrapLines
        />

<br></br>
Using the <code>env.step()</code> function, we can pass our intended action chosen previously as an argument. Various variables are returned:
 
<br></br>
<br></br>
   <Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>Variable</th>
          <th>Returned Value</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>new_state</td>
          <td>an <code>int</code> value representing our new position (between 0-63 on the 8x8 environment)</td>
        </tr>
        <tr>
          <td>reward</td>
          <td>a <code>number</code> value representing the reward gained from entering our new state (in this case: 1.0 if we reach the goal space, 0.0 otherwise)</td>
        </tr>
        <tr>
          <td>terminated</td>
          <td><code>bool</code> value. <code>True</code> if we reach the goal <b>OR</b> fall into a hole, <code>False</code> otherwise</td>
        </tr>
        <tr>
          <td>truncated</td>
          <td><code>bool</code> value. <code>True</code> if the length limit of the episode (200 for 8x8) is reached, <code>False</code> otherwise</td>
        </tr>
        <tr>
          <td>info</td>
          <td>The chance of us ending up in the new state, <code>&#123;&#39;prob&#39;: 0.3333333333333333&#125;</code> if <code>is_slippery</code> is enabled, <code>&#123;&#39;prob&#39;: 1&#125;</code> otherwise.</td>
        </tr>
      </tbody>
    </Table>

    <b>It is important to remember</b>, if <code>is_slippery</code> is enabled, you will not always go in
     the direction you intended, <code>new_state</code> will return the <b>ACTUAL</b> position you have ended up in.
<br></br>
<br></br>
We will then also update our <code>completions</code> array at the index of the current episode (<code>_</code>) to <code>True</code> if we found the goal and received a reward.

<br></br>
    <br></br>
     <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Updating our Q-Value`}
      </Markdown>


      <CopyBlock
        text={`q[state,action] = q[state,action] + learning_rate * (reward + discount_factor * max(q[new_state,:]) -q[state,action])`}
          language="python"
          theme={dracula}
          wrapLines
        />
<br></br>
The Q-Learning equation is as follows:
<br></br>
<MathJaxContext>
      <MathJax inline>{"$$Q(s, a) \\leftarrow Q(s, a) + \\alpha [r + \\gamma \\max_{a'} Q(s', a') - Q(s, a)]$$"}</MathJax> 
    </MathJaxContext>
Where

<Table striped bordered hover variant="dark" style={{ width: 'auto' }}>
      <thead>
        <tr>
          <th>Variable</th>
          <th>Represents</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td><MathJaxContext><MathJax inline>{"$$Q(s,a)$$"}</MathJax></MathJaxContext></td>
          <td>the Q-Value for state <code>s</code> and action <code>a</code></td>
        </tr>
        <tr>
          <td><MathJaxContext><MathJax inline>{"$$\\alpha$$"}</MathJax></MathJaxContext></td>
          <td>the learning rate (alpha)</td>
        </tr>
        <tr>
          <td><MathJaxContext><MathJax inline>{"$$r$$"}</MathJax></MathJaxContext></td>
          <td>the immediate reward</td>
        </tr>
        <tr>
          <td><MathJaxContext><MathJax inline>{"$$\\gamma$$"}</MathJax></MathJaxContext></td>
          <td>the discount factor (gamma)</td>
        </tr>
        <tr>
          <td><MathJaxContext><MathJax inline>{"$$\\max_{a'} Q(s', a')$$"}</MathJax></MathJaxContext></td>
          <td>the maximum Q-value of our next state</td>
        </tr>
        
      </tbody>
    </Table>

    Going into further detail:
    <br></br>
    <br></br>
     <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### Alpha (Learning Rate)`}
      </Markdown>


      Alpha is our learning rate (usually a number between 0 and 1). A
       larger alpha means that the Q-Value is updated more quickly based on the new information we found.
        This means faster learning, but at the cost of potentially being
         unstable (the value changing too quickly and the Q-Value becoming too high/low based on
          a single update). Lower values are more stable, but
       new information will impact the Q-Value less, potentially requiring more episodes to learn.
       <br></br>
       <br></br>






      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### Gamma (Discount Factor)`}
      </Markdown>

      Gamma is our discount factor. Gamma ranges from 0 
      to 1. When gamma is 0, the agent will
       only consider immediate rewards, meaning if it had the choice of getting 20
        dollars today or 10 dollars today plus an additional 10,000 dollars tomorrow,
         it would choose to get 20 dollars today. It is <b>important to note that in Frozen
          Lake, you only receive a reward for reaching the end goal, so having a low gamma 
          would be particularly problematic since you won't receive a reward the vast majority of the
           time</b>. If gamma is set to 1, future rewards will be valued just as much as the
            immediate reward received. Any value between 0 and 1 will be a balance,
       with lower values lowering the importance of future rewards.
       <br></br>
       <br></br>






      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### Q-Values`}
      </Markdown>

      We store Q-Values to determine what we have currently observed
       to be the best action to take given the current state (which is influenced
        by the values chosen for Gamma, Alpha, Epsilon, etc.). They are stored in the following
         format (for simplicity, imagine there
       are <b>4 possible positions (states 0 to 3) and 2 possible actions (left, right)</b>):
       <br></br>
       <br></br>

       <CopyBlock 
          text={`[[0.5 0.21]
 [0.95 0.01]
 [0.51 0.49]
 [0.02 0.83]]`}
          language="python"
          theme={dracula}
          wrapLines
        />


               <br></br>


In this example, if we are in state 1 (the second row, since states are numbered from 0), we can see
 that we have found that going left is significantly better
  (0.95 vs 0.01) than going right. However, if we are in state 2, left is only
   slightly better (0.51 vs 0.49) than going right. We would still go left
    in both states when deciding by Q-Values since we are only basing our decision on
     the action with the <b>max</b> Q-Value. In this case, having a balanced epsilon value would
      help us still test the other possible actions (going right) to see if better results occur
       that way, instead of always going left.


       <br></br>
       <br></br>






      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Finishing the Episode`}
      </Markdown>










      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Finishing All Episodes and Closing the Environment`}
      </Markdown>






      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Visualizing the Results`}
      </Markdown>








      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`## Results and Observations`}
      </Markdown>






      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Number of Episodes`}
      </Markdown>







      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### Low Episode Count`}
      </Markdown>








      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### High Episode Count`}
      </Markdown>







      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`#### Ideal Episode Count`}
      </Markdown>





      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Learning Rate`}
      </Markdown>




      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Discount Factor`}
      </Markdown>






      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Slippery Mode Enabled`}
      </Markdown>





      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`## Potential Improvements`}
      </Markdown>




      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`### Negative Rewards`}
      </Markdown>




      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`## Conclusion`}
      </Markdown>




      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`## Full Code`}
      </Markdown>


      <CopyBlock 
          text={`
# Runs the requested amount of episodes using Q-Learning
# episodes = # of episodes to run through
# learning_rate = 0 to 1, closer to 1 results in newer episodes taking higher priority in the q-values, changing the values faster
# discount_factor = 0 to 1, closer to 1 values future rewards highly, closer to 0 focuses on immediate rewards more
# epsilon = 0 to 1, chance of taking a random action (1 is always random, 0 is always optimal action as per q values)
# epsilon_change = how much to lower epsilon by per episode (over time max q-value should take priority and exploration minimized)
# slippery = If False, the action requested is always followed through on. If True, the action requested is followed through on 1/3 of the time,
#            and the two perpendicular actions are taken 1/3 of the time each (ex. request=left (1/3 chance), 1/3 chance of slipping up, 1/3 of down)
# render = None for no visualization, "human" to see visualization

def run_episodes(episodes, learning_rate=0.05, discount_factor=0.95, epsilon=1, epsilon_change=0.01, slippery=True, render=None):
    env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=slippery, render_mode=render)

    # 64 states (0 to 63) and 4 actions (0 = left, 1 = down, 2 = right, 3 = up)
    q = np.zeros((env.observation_space.n, env.action_space.n)) # q-value storage
    rng = np.random.default_rng() # random number from 0 to 1 (to determine if random action should be taken)
    completions = np.full(episodes,False)
    ep_lengths = np.zeros(episodes)
    ep_epsilons = np.zeros(episodes)
    checkpoints = math.floor(episodes/10) # Print statement at 10% completion intervals

    print("Ran using the following settings:")
    print("Episodes:", episodes)
    print("Learning Rate:", learning_rate)
    print("Discount Factor:", discount_factor)
    print("Initial Epsilon:", epsilon)
    print("Epsilon Decay (per episode):", epsilon_change)
    print("Slippery:", slippery)
    print("")

    for _ in range(episodes):
        state, info = env.reset()
        
        if (_+1)%checkpoints==0:
            print("Ep", _, " , Epsi:", round(epsilon,3), " | Comp:", completions.sum(), " | Success Rate:", round(completions.sum()/_,3)*100,"%")
          

        while True:

            if rng.random() < epsilon:
                action = env.action_space.sample() # Random action
            else:
                action = np.argmax(q[state,:])

            # new_state: After taking the action calculated above, what position are we now in? (0-63)
            # reward: The reward for taking that action (reach goal = +1, reach hole/frozen = 0)
            # terminated: True if the player moves into a hole OR the player reaches the goal
            # truncation: True if the limit (length of episode) is reached, this is 200 for 8x8 env
            # info: dict holding the probability of the transition just taken (1/3 if is_slippery, 1 otherwise)
            new_state, reward, terminated, truncated, info = env.step(action)

            if reward == 1:
                completions[_] = True

            q[state,action] = q[state,action] + learning_rate * (reward + discount_factor * max(q[new_state,:]) -q[state,action])

            state = new_state
            ep_lengths[_] += 1

            if terminated or truncated:
                break

        ep_epsilons[_] = epsilon
        epsilon -= epsilon_change # Lower Epsilon by specified amount
        if epsilon < 0:
            epsilon = 0

    time.sleep(0.5)
    env.close()
    
    print("\\nSimple Breakdown:")
    print("Episodes:", episodes)
    print("Successful Episodes:", completions.sum())
    print("Failed Episodes:", (episodes-completions.sum()))
    print("Success Rate:", round(((completions.sum())/(episodes))*100,3), "%")
    print("Success Episode Array:", np.convolve(completions, np.ones(100), 'valid'))

    # np.convolve with a window of 100 ones computes the rolling sum over 100 episodes (divide by 100 for the rolling mean)

    fig, axs = plt.subplots(1, 2, figsize=(20, 8))

    axs[0].plot(np.convolve(completions, np.ones(100), 'valid'))
    axs[0].set_title("Successful Episodes")
    axs[0].set_xlabel("Episode")
    axs[0].set_ylabel("# of Successful Episodes Out of Past 100")

    ax2 = axs[0].twinx()
    ax2.plot(np.convolve(ep_epsilons, np.ones(100), 'valid') / 100, color='red')
    ax2.set_ylabel("Epsilon (Rolling 100 Episode Mean)")
    ax2.set_ylim([0, 1])
    

    axs[1].plot(np.convolve(ep_lengths, np.ones(100), 'valid') / 100)
    axs[1].set_title("Episode Lengths")
    axs[1].set_xlabel("Episode")
    axs[1].set_ylabel("Length")
`}
          language="python"
          theme={dracula}
          wrapLines
        />

<br></br>




      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`## Additional Function to Visualize the Optimal Path Based on Q-Values`}
      </Markdown>

      This function can be run at the end of run_episodes and is used to visualize the agent moving based on the final Q-Values found.

      <CopyBlock
          text={`def visualize_best_result(episodes, q, slippery=True):
    print("")
    print("Optimal Result (as determined by Q-Values:")

    action_array = ["Left", "Down", "Right", "Up"]
    actions_taken = 0
    
    env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=slippery, render_mode="human")

    for _ in range(episodes):
        state, info = env.reset()

        while True:
            action = np.argmax(q[state,:])

            new_state, reward, terminated, truncated, info = env.step(action)

            state = new_state

            if _ == 0:
                actions_taken += 1
                print("Action taken:", action_array[action], " | Reward Given:", reward, " | Terminated:", terminated, " | Actions Taken:", actions_taken)

            if terminated or truncated:
                break

    env.close()`}
          language="python"
          theme={dracula}
          wrapLines
        />

<br></br>

      <Markdown 
        remarkPlugins={[remarkMath]} 
        components={{
          math: ({ value }) => <MathJax inline>{value}</MathJax>,
          inlineMath: ({ value }) => <MathJax inline>{value}</MathJax> 
        }}
      >
        {`## Resources Used`}
      </Markdown>


  <ul>
    <li>AI (CSCI 4610U) Lectures: Winter 2024 Semester</li>
    <li>ML2 (CSCI 4052U) Lectures: Fall 2024 Semester</li>
    <li><a href="https://www.youtube.com/watch?v=ZhoIgo3qqLU" target="_blank">https://www.youtube.com/watch?v=ZhoIgo3qqLU</a> - FrozenLake Gymnasium</li>
    <li><a href="https://gymnasium.farama.org/environments/toy_text/frozen_lake/" target="_blank">https://gymnasium.farama.org/environments/toy_text/frozen_lake/</a></li>
  </ul>








  </div>

  )
}

// Expose the FrozenLake component as this module's default export.
export default FrozenLake;