Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ollama models #95

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/.idea
/apps
7 changes: 6 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
MODEL: "OpenAI" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI or Qwen
MODEL: "Ollama" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI, Qwen or Ollama

# OLLAMA
OLLAMA_API_BASE: "http://localhost:11434/api/chat"
OLLAMA_API_MODEL: "llava:13b"

# OPENAI
OPENAI_API_BASE: "https://api.openai.com/v1/chat/completions"
OPENAI_API_KEY: "sk-" # Set the value to sk-xxx if you host the openai interface for open llm model
OPENAI_API_MODEL: "gpt-4-vision-preview" # The only OpenAI model by now that accepts visual input
Expand Down
9 changes: 6 additions & 3 deletions learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

arg_desc = "AppAgent - exploration phase"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
parser.add_argument("--app")
parser.add_argument("--app", default="Chrome")
parser.add_argument("--user_input", default="1")
parser.add_argument("--task_desc", default="open the baidu.com website")
parser.add_argument("--root_dir", default="./")
args = vars(parser.parse_args())

Expand All @@ -26,7 +28,7 @@
"main interface of the app on your phone.", "yellow")
print_with_color("Choose from the following modes:\n1. autonomous exploration\n2. human demonstration\n"
"Type 1 or 2.", "blue")
user_input = ""
user_input = args["user_input"]
while user_input != "1" and user_input != "2":
user_input = input()

Expand All @@ -36,7 +38,8 @@
app = app.replace(" ", "")

if user_input == "1":
os.system(f"python scripts/self_explorer.py --app {app} --root_dir {root_dir}")
task_desc = args["task_desc"]
os.system(f"python scripts/self_explorer.py --app {app} --root_dir {root_dir} --task_desc '{task_desc}'")
else:
demo_timestamp = int(time.time())
demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f"demo_{app}_%Y-%m-%d_%H-%M-%S")
Expand Down
16 changes: 3 additions & 13 deletions scripts/document_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

import prompts
from config import load_config
from model import OpenAIModel, QwenModel
from utils import print_with_color

from model_parser import parse as model_parse

arg_desc = "AppAgent - Human Demonstration"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
parser.add_argument("--app", required=True)
Expand All @@ -20,18 +21,7 @@

configs = load_config()

if configs["MODEL"] == "OpenAI":
mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
api_key=configs["OPENAI_API_KEY"],
model=configs["OPENAI_API_MODEL"],
temperature=configs["TEMPERATURE"],
max_tokens=configs["MAX_TOKENS"])
elif configs["MODEL"] == "Qwen":
mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
model=configs["QWEN_MODEL"])
else:
print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
sys.exit()
mllm = model_parse(configs)

root_dir = args["root_dir"]
work_dir = os.path.join(root_dir, "apps")
Expand Down
55 changes: 47 additions & 8 deletions scripts/model.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import re
from abc import abstractmethod
from typing import List
Expand Down Expand Up @@ -100,10 +101,10 @@ def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):

def parse_explore_rsp(rsp):
try:
observation = re.findall(r"Observation: (.*?)$", rsp, re.MULTILINE)[0]
think = re.findall(r"Thought: (.*?)$", rsp, re.MULTILINE)[0]
act = re.findall(r"Action: (.*?)$", rsp, re.MULTILINE)[0]
last_act = re.findall(r"Summary: (.*?)$", rsp, re.MULTILINE)[0]
observation = rsp['Observation']
think = rsp['Thought']
act = rsp['Action']
last_act = rsp['Summary']
print_with_color("Observation:", "yellow")
print_with_color(observation, "magenta")
print_with_color("Thought:", "yellow")
Expand Down Expand Up @@ -137,7 +138,7 @@ def parse_explore_rsp(rsp):
print_with_color(f"ERROR: Undefined act {act_name}!", "red")
return ["ERROR"]
except Exception as e:
print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
print_with_color(f"ERROR: an exception occurs while parsing the model response: {e.with_traceback()}", "red")
print_with_color(rsp, "red")
return ["ERROR"]

Expand Down Expand Up @@ -189,16 +190,16 @@ def parse_grid_rsp(rsp):

def parse_reflect_rsp(rsp):
try:
decision = re.findall(r"Decision: (.*?)$", rsp, re.MULTILINE)[0]
think = re.findall(r"Thought: (.*?)$", rsp, re.MULTILINE)[0]
decision = rsp['Decision']
think = rsp['Thought']
print_with_color("Decision:", "yellow")
print_with_color(decision, "magenta")
print_with_color("Thought:", "yellow")
print_with_color(think, "magenta")
if decision == "INEFFECTIVE":
return [decision, think]
elif decision == "BACK" or decision == "CONTINUE" or decision == "SUCCESS":
doc = re.findall(r"Documentation: (.*?)$", rsp, re.MULTILINE)[0]
doc = rsp['Documentation']
print_with_color("Documentation:", "yellow")
print_with_color(doc, "magenta")
return [decision, think, doc]
Expand All @@ -209,3 +210,41 @@ def parse_reflect_rsp(rsp):
print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
print_with_color(rsp, "red")
return ["ERROR"]


class OllamaModel(BaseModel):
    """Multi-modal LLM backend that talks to a local Ollama server's
    ``/api/chat`` endpoint (e.g. a llava model for visual input)."""

    def __init__(self, base_url: str, model: str):
        """
        Args:
            base_url: Full URL of the Ollama chat endpoint,
                e.g. "http://localhost:11434/api/chat".
            model: Ollama model tag, e.g. "llava:13b".
        """
        super().__init__()
        self.base_url = base_url
        self.model = model

    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
        """Send the prompt and screenshot images to the Ollama server.

        Args:
            prompt: Full text prompt for the model.
            images: Paths of image files to attach; each file is
                base64-encoded before sending. The caller's list is
                not modified.

        Returns:
            ``(True, payload)`` on success, where ``payload`` is the
            JSON-decoded message content (a dict, because the request asks
            Ollama for ``"format": "json"``), or ``(False, error_message)``
            on failure.
        """
        # Encode into a new list instead of mutating the caller's argument.
        encoded_images = [encode_image(img) for img in images]
        headers = {
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt,
                    "images": encoded_images
                }
            ],
            "stream": False,
            # Ask the model to answer in JSON so the content can be parsed
            # into a dict for parse_explore_rsp / parse_reflect_rsp.
            "format": "json",
        }
        response = requests.post(self.base_url, headers=headers, json=payload).json()
        if "error" in response:
            return False, response["error"]
        total_duration = response["total_duration"]  # reported in nanoseconds
        print_with_color(f"Request duration is "
                         f"{'{0:.2f}'.format(total_duration / 10 ** 9)}s",
                         "yellow")
        try:
            return True, json.loads(response["message"]["content"])
        except json.JSONDecodeError:
            # The model can emit malformed JSON; report it through the
            # normal error channel instead of raising out of the caller.
            return False, f"Ollama returned non-JSON content: {response['message']['content']}"
24 changes: 24 additions & 0 deletions scripts/model_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import sys
from typing import Optional
from model import BaseModel, OpenAIModel, QwenModel, OllamaModel
from utils import print_with_color


def parse(configs: dict) -> BaseModel:
    """Instantiate the multi-modal LLM backend selected by ``configs["MODEL"]``.

    Args:
        configs: Parsed config.yaml contents. ``"MODEL"`` must be one of
            "OpenAI", "Qwen" or "Ollama", with the matching API keys/URLs set.

    Returns:
        A ready-to-use ``BaseModel`` subclass instance.

    Exits:
        Terminates the process with a non-zero status when the configured
        model type is not supported.
    """
    model_type = configs["MODEL"]
    if model_type == "OpenAI":
        return OpenAIModel(base_url=configs["OPENAI_API_BASE"],
                           api_key=configs["OPENAI_API_KEY"],
                           model=configs["OPENAI_API_MODEL"],
                           temperature=configs["TEMPERATURE"],
                           max_tokens=configs["MAX_TOKENS"])
    if model_type == "Qwen":
        return QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                         model=configs["QWEN_MODEL"])
    if model_type == "Ollama":
        return OllamaModel(base_url=configs["OLLAMA_API_BASE"],
                           model=configs["OLLAMA_API_MODEL"])
    print_with_color(f"ERROR: Unsupported model type {model_type}!", "red")
    # Exit non-zero so shells and calling scripts can detect the failure
    # (the original bare sys.exit() reported success, status 0).
    sys.exit(1)
45 changes: 36 additions & 9 deletions scripts/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,37 +168,54 @@

The task you need to complete is to <task_description>. Your past actions to proceed with this task are summarized as
follows: <last_act>

Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
Your output should include three parts in the given format:
Your output should include the exact four parts (Observation, Thought, Action and Summary) in the following JSON format:
{
"Observation": "your observation",
"Thought": "your thought",
"Action": "text('text')",
"Summary": "your summary"
}
Observation: <Describe what you observe in the image>
Thought: <To complete the given task, what is the next step I should do>
Action: <The function call with the correct parameters to proceed with the task. If you believe the task is completed or
there is nothing to be done, you should output FINISH. You cannot output anything else except a function call or FINISH
in this field.>
there is nothing to be done, you should output FINISH. You cannot output anything else except a function call defined
above or FINISH in this field. You can only take one action at a time, so please directly call the function in python
syntax>
Summary: <Summarize your past actions along with your latest action in one or two sentences. Do not include the numeric
tag in your summary>
You can only take one action at a time, so please directly call the function."""
"""

self_explore_reflect_template = """I will give you screenshots of a mobile app before and after <action> the UI
element labeled with the number '<ui_element>' on the first screenshot. The numeric tag of each element is located at
the center of the element. The action of <action> this UI element was described as follows:
<last_act>
The action was also an attempt to proceed with a larger task, which is to <task_desc>. Your job is to carefully analyze
the difference between the two screenshots to determine if the action is in accord with the description above and at
the same time effectively moved the task forward. Your output should be determined based on the following situations:
the same time effectively moved the task forward. Your output should be determined based on the following situations:
1. BACK
If you think the action navigated you to a page where you cannot proceed with the given task, you should go back to the
previous interface. At the same time, describe the functionality of the UI element concisely in one or two sentences by
observing the difference between the two screenshots. Notice that your description of the UI element should focus on
the general function. Never include the numeric tag of the UI element in your description. You can use pronouns such as
"the UI element" to refer to the element. Your output should be in the following format:
"the UI element" to refer to the element. Your output should be in the following JSON format:
{
"Decision": "BACK",
"Thought": "your thought",
"Documentation": "your documentation"
}
Decision: BACK
Thought: <explain why you think the last action is wrong and you should go back to the previous interface>
Documentation: <describe the function of the UI element>
2. INEFFECTIVE
If you find the action changed nothing on the screen (screenshots before and after the action are identical), you
should continue to interact with other elements on the screen. Notice that if you find the location of the cursor
changed between the two screenshots, then they are not identical. Your output should be in the following format:
changed between the two screenshots, then they are not identical. Your output should be in the following JSON format:
{
"Decision": "INEFFECTIVE",
"Thought": "your thought"
}
Decision: INEFFECTIVE
Thought: <explain why you made this decision>
3. CONTINUE
Expand All @@ -207,7 +224,12 @@
describe the functionality of the UI element concisely in one or two sentences by observing the difference between the
two screenshots. Notice that your description of the UI element should focus on the general function. Never include the
numeric tag of the UI element in your description. You can use pronouns such as "the UI element" to refer to the
element. Your output should be in the following format:
element. Your output should be in the following JSON format:
{
"Decision": "CONTINUE",
"Thought": "your thought",
"Documentation": "your documentation"
}
Decision: CONTINUE
Thought: <explain why you think the action does not reflect the action description above and did not move the given
task forward>
Expand All @@ -216,7 +238,12 @@
If you think the action successfully moved the task forward (even though it did not completed the task), you should
describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI
element should focus on the general function. Never include the numeric tag of the UI element in your description. You
can use pronouns such as "the UI element" to refer to the element. Your output should be in the following format:
can use pronouns such as "the UI element" to refer to the element. Your output should be in the following JSON format:
{
"Decision": "SUCCESS",
"Thought": "your thought",
"Documentation": "your documentation"
}
Decision: SUCCESS
Thought: <explain why you think the action successfully moved the task forward>
Documentation: <describe the function of the UI element>
Expand Down
23 changes: 8 additions & 15 deletions scripts/self_explorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,20 @@
import prompts
from config import load_config
from and_controller import list_all_devices, AndroidController, traverse_tree
from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel
from model import parse_explore_rsp, parse_reflect_rsp
from utils import print_with_color, draw_bbox_multi
from model_parser import parse as model_parse

arg_desc = "AppAgent - Autonomous Exploration"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
parser.add_argument("--app")
parser.add_argument("--task_desc")
parser.add_argument("--root_dir", default="./")
args = vars(parser.parse_args())

configs = load_config()

if configs["MODEL"] == "OpenAI":
mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
api_key=configs["OPENAI_API_KEY"],
model=configs["OPENAI_API_MODEL"],
temperature=configs["TEMPERATURE"],
max_tokens=configs["MAX_TOKENS"])
elif configs["MODEL"] == "Qwen":
mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
model=configs["QWEN_MODEL"])
else:
print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
sys.exit()
mllm = model_parse(configs)

app = args["app"]
root_dir = args["root_dir"]
Expand Down Expand Up @@ -79,8 +70,10 @@
sys.exit()
print_with_color(f"Screen resolution of {device}: {width}x{height}", "yellow")

print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
task_desc = input()
task_desc = args['task_desc']
if not task_desc:
print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
task_desc = input()

round_count = 0
doc_count = 0
Expand Down
16 changes: 3 additions & 13 deletions scripts/task_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
import prompts
from config import load_config
from and_controller import list_all_devices, AndroidController, traverse_tree
from model import parse_explore_rsp, parse_grid_rsp, OpenAIModel, QwenModel
from model import parse_explore_rsp, parse_grid_rsp
from utils import print_with_color, draw_bbox_multi, draw_grid
from model_parser import parse as model_parse

arg_desc = "AppAgent Executor"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
Expand All @@ -21,18 +22,7 @@

configs = load_config()

if configs["MODEL"] == "OpenAI":
mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
api_key=configs["OPENAI_API_KEY"],
model=configs["OPENAI_API_MODEL"],
temperature=configs["TEMPERATURE"],
max_tokens=configs["MAX_TOKENS"])
elif configs["MODEL"] == "Qwen":
mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
model=configs["QWEN_MODEL"])
else:
print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
sys.exit()
mllm = model_parse(configs)

app = args["app"]
root_dir = args["root_dir"]
Expand Down