# SARSA.py

import sys
import numpy as np
import random 
import math
import pandas as pd
import Racecar
from random import randint
import matplotlib  
import matplotlib.pyplot as plt  

'''
SARSA RL Algorithm
Variation of Q-learning: On Policy and uses the action performed by the current policy to learn the Q-value.
'''
class SARSA:
    """Tabular SARSA agent for the racetrack simulation in ``Racecar``.

    States are ``(x, y, vx, vy)`` tuples and actions are ``(ax, ay)``
    acceleration pairs.  Both are stored in the Q-table in their ``str(tuple)``
    form (for example ``'(3, 4, 0, -1)'``), which is also the form used for the
    keys of the policy dictionaries returned by this class.

    SARSA is on-policy: the value of ``(state, action)`` is moved towards
    ``reward + discountRate * Q(newState, newAction)`` where ``newAction`` is
    the action the behaviour policy actually takes next.
    """

    # Probability that a chosen acceleration is actually applied.
    ACTION_SUCCESS_PROB = 0.8
    # Probability that getAction() picks a uniformly random action instead of
    # the greedy one.
    EXPLORATION_RATE = 0.8
    # Rewards used for the Q-table updates.
    CRASH_REWARD = -100
    STEP_REWARD = -1
    NOOP_REWARD = 0

    def __init__(self, fileName, learningRate, discountRate, crashVersion):
        """Load the track and build the action list, state list and Q-table.

        fileName     -- track file handed to ``Racecar.Race``
        learningRate -- step size of each Q-table update
        discountRate -- weight of the next state/action value in the target
        crashVersion -- forwarded to ``race.getNewCarStateAfterCrash``
        """
        self.fileName = fileName
        self.race = Racecar.Race(fileName)
        self.rows = self.race.racetrack.rows
        self.cols = self.race.racetrack.cols
        self.actions = self.initActions()           # possible (ax,ay) action combinations
        self.states = self.initStates()             # possible (x,y,vx,vy) state combinations
        self.Qtable = self.initQTable()
        self.learningRate = learningRate
        self.discountRate = discountRate
        self.crashVersion = crashVersion

    @staticmethod
    def _asTuple(value):
        """Return ``value`` as a tuple of ints.

        Accepts either a sequence of numbers or the ``str(tuple)`` form used
        as Q-table labels.  The strings are parsed directly, so no ``eval`` is
        needed.
        """
        if isinstance(value, str):
            value = value.strip().strip('()').split(',')
        return tuple(int(part) for part in value)

    def _stateKey(self, state):
        """Return the Q-table row label for ``state`` (tuple or string).

        Values are coerced to plain ints first so that the label always has
        the same spelling as the entries produced by initStates().
        """
        return str(self._asTuple(state))

    def initActions(self):
        """Return every (ax, ay) combination with components in -1, 0, 1, as strings."""
        accelerations = (-1, 0, 1)
        return [str((ax, ay)) for ax in accelerations for ay in accelerations]

    def initStates(self):
        """Return every (x, y, vx, vy) combination as a string.

        x spans the track rows, y the track columns, and both velocity
        components span -5..5 inclusive.
        """
        velocities = range(-5, 5 + 1)
        return [
            str((x, y, vx, vy))
            for x in range(self.rows)
            for y in range(self.cols)
            for vx in velocities
            for vy in velocities
        ]

    def initQTable(self):
        """Build the Q-table: one row per state, one column per action.

        Every cell starts at a small random value drawn uniformly from
        [0, 0.1) rather than at zero.
        """
        values = np.random.uniform(0, 0.1, size=(len(self.states), len(self.actions)))
        dfTable = pd.DataFrame(values, index=self.states, columns=self.actions)

        print('dfTable: ', dfTable)
        return dfTable

    def Q(self, state, newState, action, newAction, reward):
        """Apply one SARSA update to the Q-table entry for (state, action).

        Q(s, a) <- Q(s, a) + learningRate * (reward + discountRate * Q(s', a') - Q(s, a))

        All four keys must be Q-table labels (strings).
        """
        prediction = self.Qtable.at[state, action]
        target = reward + self.discountRate * self.Qtable.at[newState, newAction]
        print('Q table before: ', prediction)
        # Write with a single indexer: assigning through Qtable.loc[state][action]
        # may modify a temporary copy of the row and leave the table unchanged.
        self.Qtable.at[state, action] = prediction + self.learningRate * (target - prediction)
        print('Q table after: ', self.Qtable.at[state, action])

    def getState(self):
        """Return a state label chosen uniformly at random from all states."""
        return random.choice(self.states)

    def shouldTakeAction(self):
        """Return True with probability ACTION_SUCCESS_PROB (the acceleration is applied)."""
        return bool(np.random.uniform(0, 1) < self.ACTION_SUCCESS_PROB)

    def getNewVelocity(self, vx, vy, ax, ay):
        """Return (vx + ax, vy + ay), each clamped to the car's [minV, maxV] range."""
        lowest, highest = self.race.car.minV, self.race.car.maxV
        newVx = max(lowest, min(highest, vx + ax))
        newVy = max(lowest, min(highest, vy + ay))
        return newVx, newVy

    def validateCoordinates(self, state):
        """Return ``state`` as an (x, y, vx, vy) tuple with an in-bounds position.

        ``state`` may be a tuple or its string form.
        """
        x, y, vx, vy = self._asTuple(state)
        newX, newY = self.validateSingleCoordinates(x, y)
        return (newX, newY, vx, vy)

    def validateSingleCoordinates(self, x, y):
        """Return (x, y) unchanged when it lies inside the track grid.

        Otherwise the position is clamped to the grid and replaced by whatever
        ``race.getClosestTrackPoint`` returns for the clamped point.
        """
        lastRow = self.race.racetrack.rows - 1
        lastCol = self.race.racetrack.cols - 1
        clampedX = min(max(x, 0), lastRow)
        clampedY = min(max(y, 0), lastCol)
        if (clampedX, clampedY) != (x, y):
            return self.race.getClosestTrackPoint((clampedX, clampedY))
        return (x, y)

    def _step(self, state, action):
        """Apply ``action`` in ``state``, update the Q-table, return (newState, newAction).

        ``newState`` is an (x, y, vx, vy) tuple.  ``newAction`` is the action
        label that was used in the Q-table update; an on-policy caller should
        execute exactly this action next.
        """
        parsedState = self._asTuple(state)
        stateKey = str(parsedState)

        if not self.shouldTakeAction():
            # NOTE(review): a failed acceleration leaves the car where it is
            # (no movement with the current velocity) and is rewarded with 0.
            # Confirm this matches the intended track rules.
            self.Q(stateKey, stateKey, action, action, reward=self.NOOP_REWARD)
            return self.validateCoordinates(parsedState), action

        ax, ay = self._asTuple(action)
        x, y, vx, vy = parsedState

        # new velocity from the acceleration, then new position from the velocity
        vx_, vy_ = self.getNewVelocity(vx, vy, ax, ay)
        x_, y_ = self.validateSingleCoordinates(x + vx_, y + vy_)

        # check if moving to the new position makes the car crash
        carCrashed, linePoints, crashCoordinates = self.race.carCrashed(x, y, x_, y_)
        if carCrashed:
            newX, newY, newVx, newVy = self.race.getNewCarStateAfterCrash(
                linePoints, crashCoordinates, self.crashVersion)
            newState = self.validateCoordinates((newX, newY, newVx, newVy))
            reward = self.CRASH_REWARD
        else:
            newState = self.validateCoordinates((x_, y_, vx_, vy_))
            reward = self.STEP_REWARD

        newKey = self._stateKey(newState)
        newAction = self.getAction(newKey)
        self.Q(stateKey, newKey, action, newAction, reward=reward)
        return newState, newAction

    def takeActionGetReward(self, state, action):
        """Apply ``action`` in ``state`` and return the new (x, y, vx, vy) tuple.

        The Q-table entry for (state, action) is updated as a side effect.
        """
        newState, _ = self._step(state, action)
        return newState

    def getAction(self, state):
        """Pick an action label for ``state`` (tuple or string).

        With probability EXPLORATION_RATE a uniformly random action is
        returned; otherwise the action with the highest Q-value in the state's
        row.  Raises KeyError when the state is not in the Q-table.
        """
        row = self.Qtable.loc[self._stateKey(state)]
        if random.uniform(0, 1) > self.EXPLORATION_RATE:
            # idxmax gives the column label; argmax would give its position.
            greedyAction = str(row.idxmax())
            print('using maximal action: ', greedyAction)
            return greedyAction
        randomAction = random.choice(self.actions)
        print('taking random action: ', randomAction)
        return str(randomAction)

    def setInitalState(self):
        """Return a random starting-line position with zero velocity, as a state label."""
        x, y = random.choice(self.race.racetrack.startingLine)
        return self._stateKey((x, y, 0, 0))

    def getBestPolicy(self):
        """Return {state label: action label with the highest Q-value}."""
        return self.Qtable.idxmax(axis=1).to_dict()

    def train(self, steps, episodeLength=10):
        """Run ``steps`` training episodes and return (best policy, race.time).

        Each episode starts on the starting line and lasts at most
        ``episodeLength`` moves, or until the finish line is crossed.
        """
        for step in range(steps):
            currState = self.setInitalState()
            currAction = self.getAction(currState)
            if step % 1000 == 0:
                print('-----------step '+ str(step) + '/' + str(steps) + '-------------')
            for _ in range(episodeLength):
                if self.race.carCrossedFinishLine(currState):
                    break
                self.race.time += 1
                # Execute the action, then continue with the very action that
                # was used in the Q-table update (on-policy).
                newState, currAction = self._step(currState, currAction)
                currState = self._stateKey(newState)

        print('training finished! ')
        print('time: ', self.race.time)
        return self.getBestPolicy(), self.race.time

    def timeBestPolicy(self, policy):
        """Follow ``policy`` from the starting line and return the steps taken.

        Returns the step index at which the finish line is crossed, or the
        step limit (5000) when the car stands still for 5 consecutive steps or
        the limit is reached.  Moves go through takeActionGetReward(), so the
        Q-table keeps being updated during the trial.
        """
        currState = self.setInitalState()
        finalPath = []
        # Consecutive steps with zero velocity
        clockStoped = 0
        maxSteps = 5000

        for step in range(maxSteps):
            # Best action for the current state
            bestAction = policy[currState]

            # If we are at the finish line, stop the time trial
            if self.race.carCrossedFinishLine(currState):
                print('found finish line!')
                print('finalPath: ', finalPath)
                return step

            # Take the action and record (state, action, new state)
            newState = self.takeActionGetReward(currState, bestAction)
            finalPath.append((currState, bestAction, newState))
            currState = self._stateKey(newState)

            # Determine if the car gets stuck
            _, _, vx, vy = self._asTuple(currState)
            if vx == 0 and vy == 0:
                clockStoped += 1
            else:
                clockStoped = 0

            # The car has not been moving for 5 timesteps
            if clockStoped == 5:
                print('car stuck and not moving for a while, returning max time')
                print('finalPath: ', finalPath)
                return maxSteps

        # Step limit reached
        print('program timed out, returning max time')
        print('finalPath: ', finalPath)
        return maxSteps

    def experiment(self, iterations):
        """Train for ``iterations`` episodes, time the resulting policy, print a summary.

        Returns the value of timeBestPolicy() for the learned policy.
        """
        policy, _ = self.train(iterations)
        finishTime = self.timeBestPolicy(policy)
        print('')
        print('')
        print('--------------SARSA Evaluation Results-----------------')
        print('track: ', self.fileName)
        print('iterations: ', iterations)
        print('learning rate: ', self.learningRate)
        print('discount rate: ', self.discountRate)
        print('crash version: ', self.crashVersion)
        return finishTime


'''
Plot multiple lines on one plot
Input: 
    x - x-axis values shared by every line (the commented-out driver below passes the list of training iteration counts)
    mapping - dictionary of crashVersion label: [time until the finish line], one list entry per x value
    title - title of plot
'''
def plotMultiple(x, mapping, title):
    """Draw one labelled line per entry of ``mapping`` on a single figure.

    x       -- x-axis values shared by every line
    mapping -- dictionary of legend label -> list of y values
    title   -- title of the plot
    """
    for label in mapping:
        plt.plot(x, mapping[label], label=label)
    # Axis captions, title and legend, then display the figure
    plt.xlabel('Steps')
    plt.ylabel('Time till finish line')
    plt.title(title)
    plt.legend(loc="upper left")
    plt.show()

'''
Plot single line plot
'''
def plot(fileName, x, y, title):
    """Draw a single line of ``y`` against ``x`` and display it.

    The figure title is the track file name followed by ``title``.
    """
    heading = ' '.join((fileName, title))
    plt.plot(x, y)
    plt.xlabel('Steps')
    plt.ylabel('Time to reach finish line')
    plt.title(heading)
    plt.show()


# Generating graphs for pdf
# iterationList = [600000,800000,1000000]
# timeListForV1 = []
# timeListForV2 = []
# mapping = {}
# for iteration in iterationList:
#     # crash version 1
#     sarsa = SARSA('L-track.txt', 0.3, 0.9, 'v1')
#     time = sarsa.experiment(iteration)
#     timeListForV1.append(time)
    

#     # crash version 2
#     sarsa.crashVersion = 'v2'
#     time2 = sarsa.experiment(iteration)
#     timeListForV2.append(time2)

# mapping['crash version 1'] = timeListForV1
# mapping['crash version 2'] = timeListForV2
# print('final mapping: ', mapping)
# print('plotting the results...')
# plotMultiple(iterationList, mapping, 'SARSA Algorithm on L-track.txt')


# demo purposes 
# train for 1 iteration, crash version 1
# sarsa = SARSA('L-track.txt', 0.5, 0.8, 'v1')
# sarsa.experiment(1)

# train for 1 iteration, crash version 2
# sarsa.crashVersion = 'v2'
# sarsa.experiment(1)