diff --git a/pilot/database/__init__.py b/pilot/database/__init__.py
index e69de29bb..8d4d09005 100644
--- a/pilot/database/__init__.py
+++ b/pilot/database/__init__.py
@@ -0,0 +1 @@
+from .database import database_exists, create_database, save_app
diff --git a/pilot/database/database.py b/pilot/database/database.py
index b9dc30511..e92e16bea 100644
--- a/pilot/database/database.py
+++ b/pilot/database/database.py
@@ -50,6 +50,7 @@
     File,
 ]
 
+
 def get_created_apps():
     return [model_to_dict(app) for app in App.select().where((App.name.is_null(False)) & (App.status.is_null(False)))]
 
@@ -264,7 +265,7 @@ def hash_and_save_step(Model, app_id, unique_data_fields, data_fields, message):
         record = Model.get_by_id(inserted_id)
         logger.debug(yellow(f"{message} with id {record.id}"))
     except IntegrityError as e:
-        print(f"A record with data {unique_data_fields} already exists for {Model.__name__}.")
+        logger.warn(f"A record with data {unique_data_fields} already exists for {Model.__name__}.")
         return None
     return record
 
@@ -288,9 +289,10 @@ def save_development_step(project, prompt_path, prompt_data, messages, llm_respo
 
     development_step = hash_and_save_step(DevelopmentSteps, project.args['app_id'], unique_data, data_fields,
                                           "Saved Development Step")
-    project.checkpoints['last_development_step'] = development_step
+    if development_step is not None:
+        project.checkpoints['last_development_step'] = development_step
 
-    project.save_files_snapshot(development_step.id)
+        project.save_files_snapshot(development_step.id)
 
     return development_step
diff --git a/pilot/helpers/Debugger.py b/pilot/helpers/Debugger.py
index 17eeafcc9..db3f9636b 100644
--- a/pilot/helpers/Debugger.py
+++ b/pilot/helpers/Debugger.py
@@ -1,3 +1,4 @@
+import platform
 import uuid
 
 from const.code_execution import MAX_COMMAND_DEBUG_TRIES, MAX_RECUSION_LAYER
@@ -6,7 +7,7 @@
 from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError
 
 
-class Debugger():
+class Debugger:
     def __init__(self, agent):
         self.agent = agent
         self.recursion_layer = 0
@@ -41,7 +42,12 @@ def debug(self, convo, command=None, user_input=None, issue_description=None, is
         convo.load_branch(function_uuid)
 
         debugging_plan = convo.send_message('dev_ops/debug.prompt',
-            { 'command': command['command'] if command is not None else None, 'user_input': user_input, 'issue_description': issue_description },
+            {
+                'command': command['command'] if command is not None else None,
+                'user_input': user_input,
+                'issue_description': issue_description,
+                'os': platform.system()
+            },
             DEBUG_STEPS_BREAKDOWN)
 
         try:
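The new `'os': platform.system()` entry feeds the rendered debug.prompt (changed later in this diff) so the model knows which shell it is targeting; `platform.system()` returns `'Windows'`, `'Linux'` or `'Darwin'`. A quick standalone check, with illustrative context values:

```python
import platform

context = {
    'command': 'npm install',                       # illustrative
    'user_input': None,
    'issue_description': 'dependencies fail to install',
    'os': platform.system(),                        # e.g. 'Linux', 'Darwin' or 'Windows'
}
print(f"Note that the command will run on a {context['os']} machine.")
```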
diff --git a/pilot/helpers/__init__.py b/pilot/helpers/__init__.py
index e69de29bb..7bacb0417 100644
--- a/pilot/helpers/__init__.py
+++ b/pilot/helpers/__init__.py
@@ -0,0 +1,2 @@
+from .AgentConvo import AgentConvo
+from .Project import Project
diff --git a/pilot/helpers/agents/Developer.py b/pilot/helpers/agents/Developer.py
index 93f9a2c8c..c33109364 100644
--- a/pilot/helpers/agents/Developer.py
+++ b/pilot/helpers/agents/Developer.py
@@ -96,9 +96,16 @@ def step_command_run(self, convo, step, i):
         additional_message = 'Let\'s start with the step #0:\n\n' if i == 0 else f'So far, steps { ", ".join(f"#{j}" for j in range(i)) } are finished so let\'s do step #{i + 1} now.\n\n'
         return run_command_until_success(data['command'], data['timeout'], convo, additional_message=additional_message)
 
-    def step_human_intervention(self, convo, step):
+    def step_human_intervention(self, convo, step: dict):
+        """
+        :param convo:
+        :param step: {'human_intervention_description': 'some description'}
+        :return:
+        """
         while True:
-            human_intervention_description = step['human_intervention_description'] + yellow_bold('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`') if self.run_command is not None else step['human_intervention_description']
+            human_intervention_description = step['human_intervention_description'] + \
+                                             yellow_bold('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`') \
+                if self.run_command is not None else step['human_intervention_description']
 
             response = self.project.ask_for_human_intervention('I need human intervention:',
                 human_intervention_description,
                 cbs={ 'r': lambda conv: run_command_until_success(self.run_command, None, conv, force=True, return_cli_response=True) },
@@ -260,8 +267,11 @@ def execute_task(self, convo, task_steps, test_command=None, reset_convo=True,
     def continue_development(self, iteration_convo, last_branch_name, continue_description=''):
         while True:
             iteration_convo.load_branch(last_branch_name)
-            user_description = ('Here is a description of what should be working: \n\n' + blue_bold(continue_description) + '\n') if continue_description != '' else ''
-            user_description = 'Can you check if the app works please? ' + user_description + '\nIf you want to run the app, ' + yellow_bold('just type "r" and press ENTER and that will run `' + self.run_command + '`')
+            user_description = ('Here is a description of what should be working: \n\n' + blue_bold(continue_description) + '\n') \
+                if continue_description != '' else ''
+            user_description = 'Can you check if the app works please? ' + user_description + \
+                               '\nIf you want to run the app, ' + \
+                               yellow_bold('just type "r" and press ENTER and that will run `' + self.run_command + '`')
             # continue_description = ''
             response = self.project.ask_for_human_intervention(
                 user_description,
diff --git a/pilot/helpers/agents/ProductOwner.py b/pilot/helpers/agents/ProductOwner.py
index f2ba130ea..2dce0705a 100644
--- a/pilot/helpers/agents/ProductOwner.py
+++ b/pilot/helpers/agents/ProductOwner.py
@@ -20,6 +20,8 @@ def __init__(self, project):
         super().__init__('product_owner', project)
 
     def get_project_description(self):
+        # TODO: why save the project before the user has even committed to a name & description?
+        #       The UI saves a record as soon as the user clicks the "Create Project" button.
         self.project.app = save_app(self.project)
 
         self.project.current_step = PROJECT_DESCRIPTION_STEP
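For context, the `cbs` map passed to `ask_for_human_intervention` routes single-letter answers (here `"r"`) to callbacks before treating the input as a normal reply. The sketch below shows only the dispatch pattern; the function body, signature, and return shape are hypothetical stand-ins, not the actual `Project.ask_for_human_intervention` implementation:

```python
def ask_for_human_intervention(prompt, description, cbs=None):
    # Hypothetical sketch: show the request, then route known
    # single-letter answers through the callback map.
    cbs = cbs or {}
    print(prompt)
    print(description)
    answer = input('> ').strip()
    if answer in cbs:
        # e.g. 'r' runs the app and hands back the CLI response
        return {'user_input': answer, 'cli_response': cbs[answer](answer)}
    return {'user_input': answer}
```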
diff --git a/pilot/helpers/agents/__init__.py b/pilot/helpers/agents/__init__.py
index 8b1378917..6170c18fa 100644
--- a/pilot/helpers/agents/__init__.py
+++ b/pilot/helpers/agents/__init__.py
@@ -1 +1,4 @@
-
+from .Architect import Architect, ARCHITECTURE_STEP
+from .CodeMonkey import CodeMonkey, IMPLEMENT_CHANGES, GET_FILES
+from .Developer import Developer, ENVIRONMENT_SETUP_STEP
+from .TechLead import TechLead
diff --git a/pilot/helpers/agents/test_Developer.py b/pilot/helpers/agents/test_Developer.py
index 39dcf1711..555df49f7 100644
--- a/pilot/helpers/agents/test_Developer.py
+++ b/pilot/helpers/agents/test_Developer.py
@@ -1,8 +1,11 @@
 import builtins
+import json
 import os
 import pytest
 from unittest.mock import patch
 
+import requests
+
 from helpers.AgentConvo import AgentConvo
 from dotenv import load_dotenv
 load_dotenv()
@@ -122,4 +125,57 @@ def test_code_changes_manual_test_no(self, mock_get_saved_user_input, mock_chat_
         result = self.developer.test_code_changes(monkey, convo)
 
         # Then
-        assert result == {'success': True, 'user_input': 'continue'}
+        assert result == {'success': True, 'user_input': 'no'}
+
+    @patch('helpers.cli.execute_command', return_value=('stdout:\n```\n\n```', 'DONE'))
+    @patch('helpers.AgentConvo.get_saved_development_step')
+    @patch('helpers.AgentConvo.save_development_step')
+    @patch('utils.llm_connection.requests.post')
+    @patch('utils.questionary.get_saved_user_input')
+    def test_test_code_changes_invalid_json(self, mock_get_saved_user_input,
+                                            mock_requests_post,
+                                            mock_save,
+                                            mock_get_saved_step,
+                                            mock_execute):
+        # Given
+        monkey = None
+        convo = AgentConvo(self.developer)
+        convo.save_branch = lambda branch_name=None: branch_name
+        convo.load_branch = lambda function_uuid=None: function_uuid
+        self.project.developer = self.developer
+
+        # we send a GET_TEST_TYPE spec, but the 1st response is invalid
+        types_in_response = ['command', 'command_test']
+        json_received = []
+
+        def generate_response(*args, **kwargs):
+            json_received.append(kwargs['json'])
+
+            gpt_response = json.dumps({
+                'type': types_in_response.pop(0),
+                'command': {
+                    'command': 'node server.js',
+                    'timeout': 3000
+                }
+            })
+            choice = json.dumps({'delta': {'content': gpt_response}})
+            line = json.dumps({'choices': [json.loads(choice)]}).encode('utf-8')
+
+            response = requests.Response()
+            response.status_code = 200
+            response.iter_lines = lambda: [line]
+            return response
+
+        mock_requests_post.side_effect = generate_response
+
+        mock_questionary = MockQuestionary([''])
+
+        with patch('utils.questionary.questionary', mock_questionary):
+            # When
+            result = self.developer.test_code_changes(monkey, convo)
+
+            # Then
+            assert result == {'success': True, 'cli_response': 'stdout:\n```\n\n```'}
+            assert mock_requests_post.call_count == 2
+            assert "The JSON is invalid at $.type - 'command' is not one of ['automated_test', 'command_test', 'manual_test', 'no_test']" in json_received[1]['messages'][3]['content']
+            assert mock_execute.call_count == 1
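The fake response built in `generate_response` mimics one line of an OpenAI-style streaming body: a JSON object whose `choices[0].delta.content` carries a fragment of the reply. Decoding one such line looks roughly like this (a sketch; the real parsing lives in `stream_gpt_completion`):

```python
import json

# One streamed line whose delta content is itself a JSON string.
line = b'{"choices": [{"delta": {"content": "{\\"type\\": \\"command_test\\"}"}}]}'

chunk = json.loads(line)
fragment = chunk['choices'][0]['delta'].get('content', '')
print(fragment)  # {"type": "command_test"}
```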
diff --git a/pilot/helpers/cli.py b/pilot/helpers/cli.py
index ab0f72209..545129ed8 100644
--- a/pilot/helpers/cli.py
+++ b/pilot/helpers/cli.py
@@ -6,6 +6,7 @@
 import time
 import platform
 
+from logger.logger import logger
 from utils.style import yellow, green, red, yellow_bold, white_bold
 from database.database import get_saved_command_run, save_command_run
 from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError
@@ -15,6 +16,7 @@
 
 interrupted = False
 
+
 def enqueue_output(out, q):
     for line in iter(out.readline, ''):
         if interrupted:  # Check if the flag is set
@@ -22,6 +24,7 @@ def enqueue_output(out, q):
         q.put(line)
     out.close()
 
+
 def run_command(command, root_path, q_stdout, q_stderr, pid_container):
     """
     Execute a command in a subprocess.
@@ -36,6 +39,7 @@ def run_command(command, root_path, q_stdout, q_stderr, pid_container):
     Returns:
         subprocess.Popen: The subprocess object.
     """
+    logger.info(f'Running `{command}`')
     if platform.system() == 'Windows':  # Check the operating system
         process = subprocess.Popen(
             command,
@@ -65,19 +69,19 @@ def run_command(command, root_path, q_stdout, q_stderr, pid_container):
     t_stderr.start()
     return process
 
+
 def terminate_process(pid):
     if platform.system() == "Windows":
         try:
             subprocess.run(["taskkill", "/F", "/T", "/PID", str(pid)])
-        except subprocess.CalledProcessError:
-            # Handle any potential errors here
-            pass
+        except subprocess.CalledProcessError as e:
+            logger.error(f'Error while terminating process: {e}')
     else:  # Unix-like systems
         try:
             os.killpg(pid, signal.SIGKILL)
-        except OSError:
-            # Handle any potential errors here
-            pass
+        except OSError as e:
+            logger.error(f'Error while terminating process: {e}')
+
 
 def execute_command(project, command, timeout=None, force=False):
     """
@@ -112,6 +116,7 @@ def execute_command(project, command, timeout=None, force=False):
         # TODO: I think AutoGPT allows other feedback here, like:
         #       "That's not going to work, let's do X instead"
         #       We don't explicitly make "no" or "skip" options to the user
+        #       see https://github.com/Pythagora-io/gpt-pilot/issues/122
        if answer == 'no':
            return '', 'DONE'
        elif answer == 'skip':
@@ -119,6 +124,7 @@ def execute_command(project, command, timeout=None, force=False):
 
     # TODO when a shell built-in commands (like cd or source) is executed, the output is not captured properly - this will need to be changed at some point
+    # TODO: Windows support
     if "cd " in command or "source " in command:
         command = "bash -c '" + command + "'"
@@ -157,6 +163,7 @@ def execute_command(project, command, timeout=None, force=False):
                 output_line = q.get_nowait()
                 if output_line not in output:
                     print(green('CLI OUTPUT:') + output_line, end='')
+                    logger.info('CLI OUTPUT: ' + output_line)
                     output += output_line
                     break
 
@@ -174,6 +181,7 @@ def execute_command(project, command, timeout=None, force=False):
             if line:
                 output += line
                 print(green('CLI OUTPUT:') + line, end='')
+                logger.info('CLI OUTPUT: ' + line)
 
         # Read stderr
         try:
@@ -184,13 +192,16 @@ def execute_command(project, command, timeout=None, force=False):
             if stderr_line:
                 stderr_output += stderr_line
                 print(red('CLI ERROR:') + stderr_line, end='')  # Print with different color for distinction
+                logger.error('CLI ERROR: ' + stderr_line)
 
     except (KeyboardInterrupt, TimeoutError) as e:
         interrupted = True
         if isinstance(e, KeyboardInterrupt):
-            print("\nCTRL+C detected. Stopping command execution...")
+            print('\nCTRL+C detected. Stopping command execution...')
+            logger.info('CTRL+C detected. Stopping command execution...')
         else:
-            print("\nTimeout detected. Stopping command execution...")
+            print('\nTimeout detected. Stopping command execution...')
+            logger.warn('Timeout detected. Stopping command execution...')
 
         terminate_process(pid_container[0])
@@ -267,7 +278,9 @@ def execute_command_and_check_cli_response(command, timeout, convo):
                                            { 'cli_response': cli_response, 'command': command })
     return cli_response, llm_response
 
-def run_command_until_success(command, timeout, convo, additional_message=None, force=False, return_cli_response=False, is_root_task=False):
+
+def run_command_until_success(command, timeout, convo, additional_message=None, force=False,
+                              return_cli_response=False, is_root_task=False):
     """
     Run a command until it succeeds or reaches a timeout.
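The new `CLI OUTPUT` logging hooks into cli.py's non-blocking reader: each pipe is drained by a background thread into a queue, and the main loop polls the queue so a silent or long-running subprocess never blocks it. A self-contained sketch of that pattern (the command is illustrative and Unix-flavoured):

```python
import subprocess
import threading
from queue import Queue, Empty

def enqueue_output(out, q):
    # Drain the pipe line-by-line on a background thread.
    for line in iter(out.readline, ''):
        q.put(line)
    out.close()

process = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE, text=True)
q_stdout = Queue()
threading.Thread(target=enqueue_output, args=(process.stdout, q_stdout),
                 daemon=True).start()

# Poll without blocking, so timeouts and CTRL+C can also be checked here.
while process.poll() is None or not q_stdout.empty():
    try:
        line = q_stdout.get(timeout=0.1)
        print('CLI OUTPUT:', line, end='')
    except Empty:
        pass
```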
diff --git a/pilot/logger/logger.py b/pilot/logger/logger.py
index 603cb7ace..1327814a7 100644
--- a/pilot/logger/logger.py
+++ b/pilot/logger/logger.py
@@ -1,4 +1,5 @@
 import os
+import re
 import logging
 
 
@@ -31,6 +32,7 @@ def setup_logger():
 
 
 def filter_sensitive_fields(record):
+    # TODO: also remove escape sequences for colors, bold etc
     if isinstance(record.args, dict):  # check if args is a dictionary
         args = record.args.copy()
         for field in sensitive_fields:
@@ -44,6 +46,8 @@ def filter_sensitive_fields(record):
         args_list = ['*****' if arg in sensitive_fields else arg for arg in args_list]
         record.args = tuple(args_list)
 
+    # Remove ANSI escape sequences - colours & bold
+    record.msg = re.sub(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', '', record.msg)
     return record.levelno <= logging.INFO
diff --git a/pilot/main.py b/pilot/main.py
index 0fb7ceb10..14ed573c5 100644
--- a/pilot/main.py
+++ b/pilot/main.py
@@ -70,14 +70,20 @@ def local_print(*args, **kwargs):
     else:
         return local_print, ipc_client_instance
 
+
 if __name__ == "__main__":
     try:
+        # sys.argv.append('--ux-test=' + 'run_command_until_success')
         args = init()
+
         builtins.print, ipc_client_instance = get_custom_print(args)
 
         if '--api-key' in args:
-            os.environ["OPENAI_API_KEY"] = args['--api-key']
+            os.environ["OPENAI_API_KEY"] = args['--api-key']
         if '--get-created-apps-with-steps' in args:
             print({ 'db_data': get_created_apps_with_steps() }, type='info')
+        elif '--ux-test' in args:
+            from test.ux_tests import run_test
+            run_test(args['--ux-test'])
         else:
             # TODO get checkpoint from database and fill the project with it
             project = Project(args, ipc_client_instance=ipc_client_instance)
diff --git a/pilot/prompts/dev_ops/debug.prompt b/pilot/prompts/dev_ops/debug.prompt
index bd0ddf9ca..8db2339e5 100644
--- a/pilot/prompts/dev_ops/debug.prompt
+++ b/pilot/prompts/dev_ops/debug.prompt
@@ -5,7 +5,7 @@ You wanted me to check this - `{{ issue_description }}` but there was a problem{
 ```
 {% endif %}I want you to debug this issue by yourself and I will give you 2 functions that you can use - `run_command` and `implement_code_changes`.
 
-`run_command` function will run a command on the machine and will return the CLI output to you so you can see what to do next.
+`run_command` function will run a command on the machine and will return the CLI output to you so you can see what to do next. Note that the command will run on a {{ os }} machine.
 
 `implement_code_changes` function will change the code where you just need to thoroughly describe what needs to be implemented, I will implement the requested changes and let you know.
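The regex added to `filter_sensitive_fields` strips the colour and bold codes that the `utils.style` helpers embed, so log files stay plain text. A quick demonstration (the sample string is made up):

```python
import re

ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

colored = '\x1b[33;1mSaved Development Step with id 42\x1b[0m'
print(ansi_escape.sub('', colored))  # Saved Development Step with id 42
```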
diff --git a/pilot/prompts/utils/invalid_json.prompt b/pilot/prompts/utils/invalid_json.prompt
new file mode 100644
index 000000000..25742343a
--- /dev/null
+++ b/pilot/prompts/utils/invalid_json.prompt
@@ -0,0 +1,6 @@
+[INST]I received an invalid JSON response. The response was a parseable JSON object, but it is not valid against the schema I provided. The JSON is invalid {{ invalid_reason }}
+
+Please try again with a valid JSON object, referring to the previous JSON schema I provided above.
+
+A response which starts with "I'm sorry for the confusion" would be an example of an invalid response; a preamble must NOT be included.
+[/INST]
diff --git a/pilot/test/ux_tests/README.md b/pilot/test/ux_tests/README.md
new file mode 100644
index 000000000..cb0f00ae6
--- /dev/null
+++ b/pilot/test/ux_tests/README.md
@@ -0,0 +1 @@
+The functions in this directory are used to test specific scenarios of the user experience.
diff --git a/pilot/test/ux_tests/__init__.py b/pilot/test/ux_tests/__init__.py
new file mode 100644
index 000000000..104f87f0c
--- /dev/null
+++ b/pilot/test/ux_tests/__init__.py
@@ -0,0 +1,10 @@
+from .run_command_until_success import run_command_until_success
+
+
+def run_test(test_name: str):
+    print(f'Running UX test "{test_name}"...')
+
+    if test_name == 'run_command_until_success':
+        return run_command_until_success()
+
+    print(f'UX test "{test_name}" not found')
diff --git a/pilot/test/ux_tests/run_command_until_success.py b/pilot/test/ux_tests/run_command_until_success.py
new file mode 100644
index 000000000..8b676129d
--- /dev/null
+++ b/pilot/test/ux_tests/run_command_until_success.py
@@ -0,0 +1,41 @@
+import os
+from helpers.agents import Developer, ENVIRONMENT_SETUP_STEP
+from helpers import AgentConvo, Project
+from helpers.files import update_file
+from database import save_app
+
+
+def run_command_until_success():
+    name = 'run_command_until_success'
+    project = Project({
+            'app_id': '84c2c532-e07c-4694-bcb0-70767c348b07',
+            'name': name,
+            'app_type': '',
+            'user_id': '97510ce7-dbca-44b6-973c-d27346ce4009',
+            'email': '7ed2f578-c791-4719-959c-dedf94394ad3',
+            'password': 'secret',
+        },
+        name=name,
+        architecture=[],
+        user_stories=[]
+    )
+
+    project.root_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                                     '../../../workspace/TestDeveloper'))
+    project.technologies = []
+    project.current_step = ENVIRONMENT_SETUP_STEP
+    project.app = save_app(project)
+
+    update_file(f'{project.root_path}/package.json',
+                '{"dependencies": {"axios": "^1.5.0", "express": "^4.18.2", "mongoose": "^7.5.0"}}')
+
+    developer = Developer(project)
+    developer.run_command = 'npm install'
+
+    convo = AgentConvo(developer)
+    step = {
+        'type': 'human_intervention',
+        'human_intervention_description': 'I want you to test that this process works from the CLI _and_ from the UI.',
+    }
+
+    result = developer.step_human_intervention(convo, step)
diff --git a/pilot/utils/arguments.py b/pilot/utils/arguments.py
index 75dc5c0c3..e6b927d05 100644
--- a/pilot/utils/arguments.py
+++ b/pilot/utils/arguments.py
@@ -92,6 +92,11 @@ def get_email():
 
 # TODO can we make BaseModel.id a CharField with default=uuid4?
 def username_to_uuid(username):
+    """
+    Creates a consistent UUID from a username
+    :param username:
+    :return:
+    """
     sha1 = hashlib.sha1(username.encode()).hexdigest()
     uuid_str = "{}-{}-{}-{}-{}".format(sha1[:8], sha1[8:12], sha1[12:16], sha1[16:20], sha1[20:32])
     return str(uuid.UUID(uuid_str))
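Because `username_to_uuid` is just a SHA-1 digest reshaped into the five UUID field groups (8-4-4-4-12 hex characters), it is deterministic: the same username always yields the same ID, which is what lets the CLI and the UI agree on a user record. A quick check:

```python
import hashlib
import uuid

def username_to_uuid(username):
    # Reshape the 40-char SHA-1 hex digest into 8-4-4-4-12 UUID groups.
    sha1 = hashlib.sha1(username.encode()).hexdigest()
    uuid_str = "{}-{}-{}-{}-{}".format(sha1[:8], sha1[8:12], sha1[12:16], sha1[16:20], sha1[20:32])
    return str(uuid.UUID(uuid_str))

assert username_to_uuid('alice') == username_to_uuid('alice')  # consistent across runs
```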
diff --git a/pilot/utils/function_calling.py b/pilot/utils/function_calling.py
index 00e3af4fc..aba5d6166 100644
--- a/pilot/utils/function_calling.py
+++ b/pilot/utils/function_calling.py
@@ -66,10 +66,8 @@ def parse_agent_response(response, function_calls: Union[FunctionCallSet, None])
         response: The response from the agent.
         function_calls: Optional function calls associated with the response.
 
-    Returns:
-        The post-processed response.
+    Returns: The post-processed response.
     """
-
     if function_calls:
         text = response['text']
         values = list(json.loads(text).values())
diff --git a/pilot/utils/llm_connection.py b/pilot/utils/llm_connection.py
index d683029f0..e81e4c4fa 100644
--- a/pilot/utils/llm_connection.py
+++ b/pilot/utils/llm_connection.py
@@ -7,7 +7,7 @@
 import tiktoken
 import questionary
 
-from jsonschema import validate
+from jsonschema import validate, ValidationError
 from utils.style import red
 from typing import List
 from const.llm import MIN_TOKENS_FOR_GPT_RESPONSE, MAX_GPT_MODEL_TOKENS
@@ -104,6 +104,7 @@ def create_gpt_chat_completion(messages: List[dict], req_type, min_tokens=MIN_TO
     except TokenLimitError as e:
         raise e
     except Exception as e:
+        logger.error(f'The request to {os.getenv("ENDPOINT")} API failed: %s', e)
         print(f'The request to {os.getenv("ENDPOINT")} API failed. Here is the error message:')
         print(e)
 
@@ -139,6 +140,7 @@ def get_tokens_in_messages_from_openai_error(error_message):
     else:
         return None
 
+
 def retry_on_exception(func):
     def wrapper(*args, **kwargs):
         # spinner = None
@@ -158,6 +160,14 @@ def wrapper(*args, **kwargs):
                         logger.info('Received incomplete JSON response from LLM. Asking for the rest...')
                         args[0]['function_buffer'] = e.doc
                         continue
+                elif isinstance(e, ValidationError):
+                    logger.warn('Received invalid JSON response from LLM. Asking to retry...')
+                    logger.info(f'  at {e.json_path} {e.message}')
+                    # eg:
+                    # json_path: '$.type'
+                    # message: "'command' is not one of ['automated_test', 'command_test', 'manual_test', 'no_test']"
+                    args[0]['function_error'] = f'at {e.json_path} - {e.message}'
+                    continue
                 if "context_length_exceeded" in err_str:
                     # spinner_stop(spinner)
                     raise TokenLimitError(get_tokens_in_messages_from_openai_error(err_str), MAX_GPT_MODEL_TOKENS)
@@ -184,6 +194,7 @@ def wrapper(*args, **kwargs):
                 ])).ask()
 
                 # TODO: take user's input into consideration - send to LLM?
+                #       https://github.com/Pythagora-io/gpt-pilot/issues/122
                 if user_message != '':
                     return {}
 
@@ -217,6 +228,11 @@ def stream_gpt_completion(data, req_type):
         data['messages'].append({'role': 'user', 'content': incomplete_json})
         gpt_response = data['function_buffer']
         received_json = True
+    elif 'function_error' in data:
+        invalid_json = get_prompt('utils/invalid_json.prompt', {'invalid_reason': data['function_error']})
+        data['messages'].append({'role': 'user', 'content': invalid_json})
+        received_json = True
+
     # Don't send the `functions` parameter to Open AI, but don't remove it from `data` in case we need to retry
     data = {key: value for key, value in data.items() if not key.startswith('function')}
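The retry loop relies on `jsonschema.ValidationError` exposing both `json_path` and `message`; together they produce exactly the `invalid_reason` string the new test asserts on. A minimal reproduction, using a schema reconstructed from the test rather than copied from the repo's `GET_TEST_TYPE` spec:

```python
import json
from jsonschema import validate, ValidationError

# Reconstructed for illustration; the real schema lives in the function-call specs.
schema = {
    'type': 'object',
    'properties': {
        'type': {'enum': ['automated_test', 'command_test', 'manual_test', 'no_test']},
    },
    'required': ['type'],
}

llm_text = json.dumps({'type': 'command', 'command': {'command': 'node server.js', 'timeout': 3000}})

try:
    validate(json.loads(llm_text), schema)
except ValidationError as e:
    # Mirrors llm_connection.py: this string is fed back via utils/invalid_json.prompt.
    print(f'at {e.json_path} - {e.message}')
    # at $.type - 'command' is not one of ['automated_test', 'command_test', 'manual_test', 'no_test']
```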