diff --git a/pilot/database/__init__.py b/pilot/database/__init__.py
index e69de29bb..8d4d09005 100644
--- a/pilot/database/__init__.py
+++ b/pilot/database/__init__.py
@@ -0,0 +1 @@
+from .database import database_exists, create_database, save_app
diff --git a/pilot/database/database.py b/pilot/database/database.py
index b9dc30511..e92e16bea 100644
--- a/pilot/database/database.py
+++ b/pilot/database/database.py
@@ -50,6 +50,7 @@
     File,
 ]
 
+
 def get_created_apps():
     return [model_to_dict(app) for app in App.select().where((App.name.is_null(False)) & (App.status.is_null(False)))]
 
@@ -264,7 +265,7 @@ def hash_and_save_step(Model, app_id, unique_data_fields, data_fields, message):
         record = Model.get_by_id(inserted_id)
         logger.debug(yellow(f"{message} with id {record.id}"))
     except IntegrityError as e:
-        print(f"A record with data {unique_data_fields} already exists for {Model.__name__}.")
+        logger.warn(f"A record with data {unique_data_fields} already exists for {Model.__name__}.")
         return None
     return record
 
@@ -288,9 +289,10 @@ def save_development_step(project, prompt_path, prompt_data, messages, llm_respo
 
     development_step = hash_and_save_step(DevelopmentSteps, project.args['app_id'], unique_data, data_fields,
                                           "Saved Development Step")
-    project.checkpoints['last_development_step'] = development_step
+    if development_step is not None:
+        project.checkpoints['last_development_step'] = development_step
 
-    project.save_files_snapshot(development_step.id)
+        project.save_files_snapshot(development_step.id)
 
     return development_step
diff --git a/pilot/helpers/Debugger.py b/pilot/helpers/Debugger.py
index 17eeafcc9..db3f9636b 100644
--- a/pilot/helpers/Debugger.py
+++ b/pilot/helpers/Debugger.py
@@ -1,3 +1,4 @@
+import platform
 import uuid
 
 from const.code_execution import MAX_COMMAND_DEBUG_TRIES, MAX_RECUSION_LAYER
@@ -6,7 +7,7 @@
 from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError
 
 
-class Debugger():
+class Debugger:
     def __init__(self, agent):
         self.agent = agent
         self.recursion_layer = 0
@@ -41,7 +42,12 @@ def debug(self, convo, command=None, user_input=None, issue_description=None, is
         convo.load_branch(function_uuid)
 
         debugging_plan = convo.send_message('dev_ops/debug.prompt',
-            { 'command': command['command'] if command is not None else None, 'user_input': user_input, 'issue_description': issue_description },
+            {
+                'command': command['command'] if command is not None else None,
+                'user_input': user_input,
+                'issue_description': issue_description,
+                'os': platform.system()
+            },
             DEBUG_STEPS_BREAKDOWN)
 
         try:
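The new `'os': platform.system()` entry feeds the rendered debug.prompt (changed later in this diff) so the model knows which shell it is targeting; `platform.system()` returns `'Windows'`, `'Linux'` or `'Darwin'`. A quick standalone check, with illustrative context values:

```python
import platform

context = {
    'command': 'npm install',                       # illustrative
    'user_input': None,
    'issue_description': 'dependencies fail to install',
    'os': platform.system(),                        # e.g. 'Linux', 'Darwin' or 'Windows'
}
print(f"Note that the command will run on a {context['os']} machine.")
```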
diff --git a/pilot/helpers/__init__.py b/pilot/helpers/__init__.py
index e69de29bb..7bacb0417 100644
--- a/pilot/helpers/__init__.py
+++ b/pilot/helpers/__init__.py
@@ -0,0 +1,2 @@
+from .AgentConvo import AgentConvo
+from .Project import Project
diff --git a/pilot/helpers/agents/Developer.py b/pilot/helpers/agents/Developer.py
index 93f9a2c8c..c33109364 100644
--- a/pilot/helpers/agents/Developer.py
+++ b/pilot/helpers/agents/Developer.py
@@ -96,9 +96,16 @@ def step_command_run(self, convo, step, i):
         additional_message = 'Let\'s start with the step #0:\n\n' if i == 0 else f'So far, steps { ", ".join(f"#{j}" for j in range(i)) } are finished so let\'s do step #{i + 1} now.\n\n'
         return run_command_until_success(data['command'], data['timeout'], convo, additional_message=additional_message)
 
-    def step_human_intervention(self, convo, step):
+    def step_human_intervention(self, convo, step: dict):
+        """
+        :param convo:
+        :param step: {'human_intervention_description': 'some description'}
+        :return:
+        """
         while True:
-            human_intervention_description = step['human_intervention_description'] + yellow_bold('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`') if self.run_command is not None else step['human_intervention_description']
+            human_intervention_description = step['human_intervention_description'] + \
+                                             yellow_bold('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`') \
+                if self.run_command is not None else step['human_intervention_description']
 
             response = self.project.ask_for_human_intervention('I need human intervention:',
                 human_intervention_description,
                 cbs={ 'r': lambda conv: run_command_until_success(self.run_command, None, conv, force=True, return_cli_response=True) },
@@ -260,8 +267,11 @@ def execute_task(self, convo, task_steps, test_command=None, reset_convo=True,
     def continue_development(self, iteration_convo, last_branch_name, continue_description=''):
         while True:
             iteration_convo.load_branch(last_branch_name)
-            user_description = ('Here is a description of what should be working: \n\n' + blue_bold(continue_description) + '\n') if continue_description != '' else ''
-            user_description = 'Can you check if the app works please? ' + user_description + '\nIf you want to run the app, ' + yellow_bold('just type "r" and press ENTER and that will run `' + self.run_command + '`')
+            user_description = ('Here is a description of what should be working: \n\n' + blue_bold(continue_description) + '\n') \
+                if continue_description != '' else ''
+            user_description = 'Can you check if the app works please? ' + user_description + \
+                               '\nIf you want to run the app, ' + \
+                               yellow_bold('just type "r" and press ENTER and that will run `' + self.run_command + '`')
             # continue_description = ''
             response = self.project.ask_for_human_intervention(
                 user_description,
diff --git a/pilot/helpers/agents/ProductOwner.py b/pilot/helpers/agents/ProductOwner.py
index f2ba130ea..2dce0705a 100644
--- a/pilot/helpers/agents/ProductOwner.py
+++ b/pilot/helpers/agents/ProductOwner.py
@@ -20,6 +20,8 @@ def __init__(self, project):
         super().__init__('product_owner', project)
 
     def get_project_description(self):
+        # TODO: why save the project before the user has even committed to a name & description?
+        #       The UI saves a record as soon as the user clicks the "Create Project" button.
         self.project.app = save_app(self.project)
 
         self.project.current_step = PROJECT_DESCRIPTION_STEP
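For context, the `cbs` map passed to `ask_for_human_intervention` routes single-letter answers (here `"r"`) to callbacks before treating the input as a normal reply. The sketch below shows only the dispatch pattern; the function body, signature, and return shape are hypothetical stand-ins, not the actual `Project.ask_for_human_intervention` implementation:

```python
def ask_for_human_intervention(prompt, description, cbs=None):
    # Hypothetical sketch: show the request, then route known
    # single-letter answers through the callback map.
    cbs = cbs or {}
    print(prompt)
    print(description)
    answer = input('> ').strip()
    if answer in cbs:
        # e.g. 'r' runs the app and hands back the CLI response
        return {'user_input': answer, 'cli_response': cbs[answer](answer)}
    return {'user_input': answer}
```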
diff --git a/pilot/helpers/agents/__init__.py b/pilot/helpers/agents/__init__.py
index 8b1378917..6170c18fa 100644
--- a/pilot/helpers/agents/__init__.py
+++ b/pilot/helpers/agents/__init__.py
@@ -1 +1,4 @@
-
+from .Architect import Architect, ARCHITECTURE_STEP
+from .CodeMonkey import CodeMonkey, IMPLEMENT_CHANGES, GET_FILES
+from .Developer import Developer, ENVIRONMENT_SETUP_STEP
+from .TechLead import TechLead
diff --git a/pilot/helpers/agents/test_Developer.py b/pilot/helpers/agents/test_Developer.py
index 39dcf1711..555df49f7 100644
--- a/pilot/helpers/agents/test_Developer.py
+++ b/pilot/helpers/agents/test_Developer.py
@@ -1,8 +1,11 @@
 import builtins
+import json
 import os
 import pytest
 from unittest.mock import patch
 
+import requests
+
 from helpers.AgentConvo import AgentConvo
 from dotenv import load_dotenv
 load_dotenv()
@@ -122,4 +125,57 @@ def test_code_changes_manual_test_no(self, mock_get_saved_user_input, mock_chat_
         result = self.developer.test_code_changes(monkey, convo)
 
         # Then
-        assert result == {'success': True, 'user_input': 'continue'}
+        assert result == {'success': True, 'user_input': 'no'}
+
+    @patch('helpers.cli.execute_command', return_value=('stdout:\n```\n\n```', 'DONE'))
+    @patch('helpers.AgentConvo.get_saved_development_step')
+    @patch('helpers.AgentConvo.save_development_step')
+    @patch('utils.llm_connection.requests.post')
+    @patch('utils.questionary.get_saved_user_input')
+    def test_test_code_changes_invalid_json(self, mock_get_saved_user_input,
+                                            mock_requests_post,
+                                            mock_save,
+                                            mock_get_saved_step,
+                                            mock_execute):
+        # Given
+        monkey = None
+        convo = AgentConvo(self.developer)
+        convo.save_branch = lambda branch_name=None: branch_name
+        convo.load_branch = lambda function_uuid=None: function_uuid
+        self.project.developer = self.developer
+
+        # we send a GET_TEST_TYPE spec, but the 1st response is invalid
+        types_in_response = ['command', 'command_test']
+        json_received = []
+
+        def generate_response(*args, **kwargs):
+            json_received.append(kwargs['json'])
+
+            gpt_response = json.dumps({
+                'type': types_in_response.pop(0),
+                'command': {
+                    'command': 'node server.js',
+                    'timeout': 3000
+                }
+            })
+            choice = json.dumps({'delta': {'content': gpt_response}})
+            line = json.dumps({'choices': [json.loads(choice)]}).encode('utf-8')
+
+            response = requests.Response()
+            response.status_code = 200
+            response.iter_lines = lambda: [line]
+            return response
+
+        mock_requests_post.side_effect = generate_response
+
+        mock_questionary = MockQuestionary([''])
+
+        with patch('utils.questionary.questionary', mock_questionary):
+            # When
+            result = self.developer.test_code_changes(monkey, convo)
+
+            # Then
+            assert result == {'success': True, 'cli_response': 'stdout:\n```\n\n```'}
+            assert mock_requests_post.call_count == 2
+            assert "The JSON is invalid at $.type - 'command' is not one of ['automated_test', 'command_test', 'manual_test', 'no_test']" in json_received[1]['messages'][3]['content']
+            assert mock_execute.call_count == 1
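The fake response built in `generate_response` mimics one line of an OpenAI-style streaming body: a JSON object whose `choices[0].delta.content` carries a fragment of the reply. Decoding one such line looks roughly like this (a sketch; the real parsing lives in `stream_gpt_completion`):

```python
import json

# One streamed line whose delta content is itself a JSON string.
line = b'{"choices": [{"delta": {"content": "{\\"type\\": \\"command_test\\"}"}}]}'

chunk = json.loads(line)
fragment = chunk['choices'][0]['delta'].get('content', '')
print(fragment)  # {"type": "command_test"}
```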
diff --git a/pilot/helpers/cli.py b/pilot/helpers/cli.py
index ab0f72209..545129ed8 100644
--- a/pilot/helpers/cli.py
+++ b/pilot/helpers/cli.py
@@ -6,6 +6,7 @@
 import time
 import platform
 
+from logger.logger import logger
 from utils.style import yellow, green, red, yellow_bold, white_bold
 from database.database import get_saved_command_run, save_command_run
 from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError
@@ -15,6 +16,7 @@
 
 interrupted = False
 
+
 def enqueue_output(out, q):
     for line in iter(out.readline, ''):
         if interrupted:  # Check if the flag is set
@@ -22,6 +24,7 @@ def enqueue_output(out, q):
         q.put(line)
     out.close()
 
+
 def run_command(command, root_path, q_stdout, q_stderr, pid_container):
     """
     Execute a command in a subprocess.
@@ -36,6 +39,7 @@ def run_command(command, root_path, q_stdout, q_stderr, pid_container):
     Returns:
         subprocess.Popen: The subprocess object.
     """
+    logger.info(f'Running `{command}`')
     if platform.system() == 'Windows':  # Check the operating system
         process = subprocess.Popen(
             command,
@@ -65,19 +69,19 @@ def run_command(command, root_path, q_stdout, q_stderr, pid_container):
     t_stderr.start()
     return process
 
+
 def terminate_process(pid):
     if platform.system() == "Windows":
         try:
             subprocess.run(["taskkill", "/F", "/T", "/PID", str(pid)])
-        except subprocess.CalledProcessError:
-            # Handle any potential errors here
-            pass
+        except subprocess.CalledProcessError as e:
+            logger.error(f'Error while terminating process: {e}')
     else:  # Unix-like systems
         try:
             os.killpg(pid, signal.SIGKILL)
-        except OSError:
-            # Handle any potential errors here
-            pass
+        except OSError as e:
+            logger.error(f'Error while terminating process: {e}')
+
 
 def execute_command(project, command, timeout=None, force=False):
     """
@@ -112,6 +116,7 @@ def execute_command(project, command, timeout=None, force=False):
         # TODO: I think AutoGPT allows other feedback here, like:
         #       "That's not going to work, let's do X instead"
         #       We don't explicitly make "no" or "skip" options to the user
+        #       see https://github.com/Pythagora-io/gpt-pilot/issues/122
        if answer == 'no':
            return '', 'DONE'
        elif answer == 'skip':
@@ -119,6 +124,7 @@ def execute_command(project, command, timeout=None, force=False):
 
     # TODO when a shell built-in commands (like cd or source) is executed, the output is not captured properly - this will need to be changed at some point
+    # TODO: Windows support
     if "cd " in command or "source " in command:
         command = "bash -c '" + command + "'"
@@ -157,6 +163,7 @@ def execute_command(project, command, timeout=None, force=False):
                 output_line = q.get_nowait()
                 if output_line not in output:
                     print(green('CLI OUTPUT:') + output_line, end='')
+                    logger.info('CLI OUTPUT: ' + output_line)
                     output += output_line
                     break
 
@@ -174,6 +181,7 @@ def execute_command(project, command, timeout=None, force=False):
             if line:
                 output += line
                 print(green('CLI OUTPUT:') + line, end='')
+                logger.info('CLI OUTPUT: ' + line)
 
         # Read stderr
         try:
@@ -184,13 +192,16 @@ def execute_command(project, command, timeout=None, force=False):
             if stderr_line:
                 stderr_output += stderr_line
                 print(red('CLI ERROR:') + stderr_line, end='')  # Print with different color for distinction
+                logger.error('CLI ERROR: ' + stderr_line)
 
     except (KeyboardInterrupt, TimeoutError) as e:
         interrupted = True
         if isinstance(e, KeyboardInterrupt):
-            print("\nCTRL+C detected. Stopping command execution...")
+            print('\nCTRL+C detected. Stopping command execution...')
+            logger.info('CTRL+C detected. Stopping command execution...')
         else:
-            print("\nTimeout detected. Stopping command execution...")
+            print('\nTimeout detected. Stopping command execution...')
+            logger.warn('Timeout detected. Stopping command execution...')
 
         terminate_process(pid_container[0])
@@ -267,7 +278,9 @@ def execute_command_and_check_cli_response(command, timeout, convo):
                                            { 'cli_response': cli_response, 'command': command })
     return cli_response, llm_response
 
-def run_command_until_success(command, timeout, convo, additional_message=None, force=False, return_cli_response=False, is_root_task=False):
+
+def run_command_until_success(command, timeout, convo, additional_message=None, force=False,
+                              return_cli_response=False, is_root_task=False):
     """
     Run a command until it succeeds or reaches a timeout.
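The new `CLI OUTPUT` logging hooks into cli.py's non-blocking reader: each pipe is drained by a background thread into a queue, and the main loop polls the queue so a silent or long-running subprocess never blocks it. A self-contained sketch of that pattern (the command is illustrative and Unix-flavoured):

```python
import subprocess
import threading
from queue import Queue, Empty

def enqueue_output(out, q):
    # Drain the pipe line-by-line on a background thread.
    for line in iter(out.readline, ''):
        q.put(line)
    out.close()

process = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE, text=True)
q_stdout = Queue()
threading.Thread(target=enqueue_output, args=(process.stdout, q_stdout),
                 daemon=True).start()

# Poll without blocking, so timeouts and CTRL+C can also be checked here.
while process.poll() is None or not q_stdout.empty():
    try:
        line = q_stdout.get(timeout=0.1)
        print('CLI OUTPUT:', line, end='')
    except Empty:
        pass
```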
diff --git a/pilot/logger/logger.py b/pilot/logger/logger.py
index 603cb7ace..1327814a7 100644
--- a/pilot/logger/logger.py
+++ b/pilot/logger/logger.py
@@ -1,4 +1,5 @@
 import os
+import re
 import logging
 
 
@@ -31,6 +32,7 @@ def setup_logger():
 
 
 def filter_sensitive_fields(record):
+    # TODO: also remove escape sequences for colors, bold etc
     if isinstance(record.args, dict):  # check if args is a dictionary
         args = record.args.copy()
         for field in sensitive_fields:
@@ -44,6 +46,8 @@ def filter_sensitive_fields(record):
         args_list = ['*****' if arg in sensitive_fields else arg for arg in args_list]
         record.args = tuple(args_list)
 
+    # Remove ANSI escape sequences - colours & bold
+    record.msg = re.sub(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', '', record.msg)
     return record.levelno <= logging.INFO
diff --git a/pilot/main.py b/pilot/main.py
index 0fb7ceb10..14ed573c5 100644
--- a/pilot/main.py
+++ b/pilot/main.py
@@ -70,14 +70,20 @@ def local_print(*args, **kwargs):
     else:
         return local_print, ipc_client_instance
 
+
 if __name__ == "__main__":
     try:
+        # sys.argv.append('--ux-test=' + 'run_command_until_success')
         args = init()
+
         builtins.print, ipc_client_instance = get_custom_print(args)
 
         if '--api-key' in args:
-            os.environ["OPENAI_API_KEY"] = args['--api-key']
+            os.environ["OPENAI_API_KEY"] = args['--api-key']
         if '--get-created-apps-with-steps' in args:
             print({ 'db_data': get_created_apps_with_steps() }, type='info')
+        elif '--ux-test' in args:
+            from test.ux_tests import run_test
+            run_test(args['--ux-test'])
         else:
             # TODO get checkpoint from database and fill the project with it
             project = Project(args, ipc_client_instance=ipc_client_instance)
diff --git a/pilot/prompts/dev_ops/debug.prompt b/pilot/prompts/dev_ops/debug.prompt
index bd0ddf9ca..8db2339e5 100644
--- a/pilot/prompts/dev_ops/debug.prompt
+++ b/pilot/prompts/dev_ops/debug.prompt
@@ -5,7 +5,7 @@ You wanted me to check this - `{{ issue_description }}` but there was a problem{
 ```
 {% endif %}I want you to debug this issue by yourself and I will give you 2 functions that you can use - `run_command` and `implement_code_changes`.
 
-`run_command` function will run a command on the machine and will return the CLI output to you so you can see what to do next.
+`run_command` function will run a command on the machine and will return the CLI output to you so you can see what to do next. Note that the command will run on a {{ os }} machine.
 
 `implement_code_changes` function will change the code where you just need to thoroughly describe what needs to be implemented, I will implement the requested changes and let you know.
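The regex added to `filter_sensitive_fields` strips the colour and bold codes that the `utils.style` helpers embed, so log files stay plain text. A quick demonstration (the sample string is made up):

```python
import re

ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

colored = '\x1b[33;1mSaved Development Step with id 42\x1b[0m'
print(ansi_escape.sub('', colored))  # Saved Development Step with id 42
```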
diff --git a/pilot/prompts/utils/invalid_json.prompt b/pilot/prompts/utils/invalid_json.prompt
new file mode 100644
index 000000000..25742343a
--- /dev/null
+++ b/pilot/prompts/utils/invalid_json.prompt
@@ -0,0 +1,6 @@
+[INST]I received an invalid JSON response. The response was a parseable JSON object, but it is not valid against the schema I provided. The JSON is invalid {{ invalid_reason }}
+
+Please try again with a valid JSON object, referring to the previous JSON schema I provided above.
+
+A response which starts with "I'm sorry for the confusion" would be an example of an invalid response; a preamble must NOT be included.
+[/INST]
diff --git a/pilot/test/ux_tests/README.md b/pilot/test/ux_tests/README.md
new file mode 100644
index 000000000..cb0f00ae6
--- /dev/null
+++ b/pilot/test/ux_tests/README.md
@@ -0,0 +1 @@
+The functions in this directory are used to test specific scenarios of the user experience.
diff --git a/pilot/test/ux_tests/__init__.py b/pilot/test/ux_tests/__init__.py
new file mode 100644
index 000000000..104f87f0c
--- /dev/null
+++ b/pilot/test/ux_tests/__init__.py
@@ -0,0 +1,10 @@
+from .run_command_until_success import run_command_until_success
+
+
+def run_test(test_name: str):
+    print(f'Running UX test "{test_name}"...')
+
+    if test_name == 'run_command_until_success':
+        return run_command_until_success()
+
+    print(f'UX test "{test_name}" not found')
diff --git a/pilot/test/ux_tests/run_command_until_success.py b/pilot/test/ux_tests/run_command_until_success.py
new file mode 100644
index 000000000..8b676129d
--- /dev/null
+++ b/pilot/test/ux_tests/run_command_until_success.py
@@ -0,0 +1,41 @@
+import os
+from helpers.agents import Developer, ENVIRONMENT_SETUP_STEP
+from helpers import AgentConvo, Project
+from helpers.files import update_file
+from database import save_app
+
+
+def run_command_until_success():
+    name = 'run_command_until_success'
+    project = Project({
+            'app_id': '84c2c532-e07c-4694-bcb0-70767c348b07',
+            'name': name,
+            'app_type': '',
+            'user_id': '97510ce7-dbca-44b6-973c-d27346ce4009',
+            'email': '7ed2f578-c791-4719-959c-dedf94394ad3',
+            'password': 'secret',
+        },
+        name=name,
+        architecture=[],
+        user_stories=[]
+    )
+
+    project.root_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                                     '../../../workspace/TestDeveloper'))
+    project.technologies = []
+    project.current_step = ENVIRONMENT_SETUP_STEP
+    project.app = save_app(project)
+
+    update_file(f'{project.root_path}/package.json',
+                '{"dependencies": {"axios": "^1.5.0", "express": "^4.18.2", "mongoose": "^7.5.0"}}')
+
+    developer = Developer(project)
+    developer.run_command = 'npm install'
+
+    convo = AgentConvo(developer)
+    step = {
+        'type': 'human_intervention',
+        'human_intervention_description': 'I want you to test that this process works from the CLI _and_ from the UI.',
+    }
+
+    result = developer.step_human_intervention(convo, step)
diff --git a/pilot/utils/arguments.py b/pilot/utils/arguments.py
index 75dc5c0c3..e6b927d05 100644
--- a/pilot/utils/arguments.py
+++ b/pilot/utils/arguments.py
@@ -92,6 +92,11 @@ def get_email():
 
 # TODO can we make BaseModel.id a CharField with default=uuid4?
 def username_to_uuid(username):
+    """
+    Creates a consistent UUID from a username
+    :param username:
+    :return:
+    """
     sha1 = hashlib.sha1(username.encode()).hexdigest()
     uuid_str = "{}-{}-{}-{}-{}".format(sha1[:8], sha1[8:12], sha1[12:16], sha1[16:20], sha1[20:32])
     return str(uuid.UUID(uuid_str))
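Because `username_to_uuid` is just a SHA-1 digest reshaped into the five UUID field groups (8-4-4-4-12 hex characters), it is deterministic: the same username always yields the same ID, which is what lets the CLI and the UI agree on a user record. A quick check:

```python
import hashlib
import uuid

def username_to_uuid(username):
    # Reshape the 40-char SHA-1 hex digest into 8-4-4-4-12 UUID groups.
    sha1 = hashlib.sha1(username.encode()).hexdigest()
    uuid_str = "{}-{}-{}-{}-{}".format(sha1[:8], sha1[8:12], sha1[12:16], sha1[16:20], sha1[20:32])
    return str(uuid.UUID(uuid_str))

assert username_to_uuid('alice') == username_to_uuid('alice')  # consistent across runs
```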
diff --git a/pilot/utils/function_calling.py b/pilot/utils/function_calling.py
index 00e3af4fc..aba5d6166 100644
--- a/pilot/utils/function_calling.py
+++ b/pilot/utils/function_calling.py
@@ -66,10 +66,8 @@ def parse_agent_response(response, function_calls: Union[FunctionCallSet, None])
         response: The response from the agent.
         function_calls: Optional function calls associated with the response.
 
-    Returns:
-        The post-processed response.
+    Returns: The post-processed response.
     """
-
     if function_calls:
         text = response['text']
         values = list(json.loads(text).values())
diff --git a/pilot/utils/llm_connection.py b/pilot/utils/llm_connection.py
index d683029f0..e81e4c4fa 100644
--- a/pilot/utils/llm_connection.py
+++ b/pilot/utils/llm_connection.py
@@ -7,7 +7,7 @@
 import tiktoken
 import questionary
 
-from jsonschema import validate
+from jsonschema import validate, ValidationError
 from utils.style import red
 from typing import List
 from const.llm import MIN_TOKENS_FOR_GPT_RESPONSE, MAX_GPT_MODEL_TOKENS
@@ -104,6 +104,7 @@ def create_gpt_chat_completion(messages: List[dict], req_type, min_tokens=MIN_TO
     except TokenLimitError as e:
         raise e
     except Exception as e:
+        logger.error(f'The request to {os.getenv("ENDPOINT")} API failed: %s', e)
         print(f'The request to {os.getenv("ENDPOINT")} API failed. Here is the error message:')
         print(e)
 
@@ -139,6 +140,7 @@ def get_tokens_in_messages_from_openai_error(error_message):
     else:
         return None
 
+
 def retry_on_exception(func):
     def wrapper(*args, **kwargs):
         # spinner = None
@@ -158,6 +160,14 @@ def wrapper(*args, **kwargs):
                         logger.info('Received incomplete JSON response from LLM. Asking for the rest...')
                         args[0]['function_buffer'] = e.doc
                         continue
+                elif isinstance(e, ValidationError):
+                    logger.warn('Received invalid JSON response from LLM. Asking to retry...')
+                    logger.info(f'  at {e.json_path} {e.message}')
+                    # eg:
+                    # json_path: '$.type'
+                    # message: "'command' is not one of ['automated_test', 'command_test', 'manual_test', 'no_test']"
+                    args[0]['function_error'] = f'at {e.json_path} - {e.message}'
+                    continue
                 if "context_length_exceeded" in err_str:
                     # spinner_stop(spinner)
                     raise TokenLimitError(get_tokens_in_messages_from_openai_error(err_str), MAX_GPT_MODEL_TOKENS)
@@ -184,6 +194,7 @@ def wrapper(*args, **kwargs):
                 ])).ask()
 
                 # TODO: take user's input into consideration - send to LLM?
+                #       https://github.com/Pythagora-io/gpt-pilot/issues/122
                 if user_message != '':
                     return {}
 
@@ -217,6 +228,11 @@ def stream_gpt_completion(data, req_type):
         data['messages'].append({'role': 'user', 'content': incomplete_json})
         gpt_response = data['function_buffer']
         received_json = True
+    elif 'function_error' in data:
+        invalid_json = get_prompt('utils/invalid_json.prompt', {'invalid_reason': data['function_error']})
+        data['messages'].append({'role': 'user', 'content': invalid_json})
+        received_json = True
+
     # Don't send the `functions` parameter to Open AI, but don't remove it from `data` in case we need to retry
     data = {key: value for key, value in data.items() if not key.startswith('function')}
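The retry loop relies on `jsonschema.ValidationError` exposing both `json_path` and `message`; together they produce exactly the `invalid_reason` string the new test asserts on. A minimal reproduction, using a schema reconstructed from the test rather than copied from the repo's `GET_TEST_TYPE` spec:

```python
import json
from jsonschema import validate, ValidationError

# Reconstructed for illustration; the real schema lives in the function-call specs.
schema = {
    'type': 'object',
    'properties': {
        'type': {'enum': ['automated_test', 'command_test', 'manual_test', 'no_test']},
    },
    'required': ['type'],
}

llm_text = json.dumps({'type': 'command', 'command': {'command': 'node server.js', 'timeout': 3000}})

try:
    validate(json.loads(llm_text), schema)
except ValidationError as e:
    # Mirrors llm_connection.py: this string is fed back via utils/invalid_json.prompt.
    print(f'at {e.json_path} - {e.message}')
    # at $.type - 'command' is not one of ['automated_test', 'command_test', 'manual_test', 'no_test']
```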