Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ollama models #95

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/.idea
/apps
7 changes: 6 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
MODEL: "OpenAI" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI or Qwen
MODEL: "Ollama" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI, Qwen or Ollama

# OLLAMA
OLLAMA_API_BASE: "http://localhost:11434/api/chat"
OLLAMA_API_MODEL: "llava:13b"

# OPENAI
OPENAI_API_BASE: "https://api.openai.com/v1/chat/completions"
OPENAI_API_KEY: "sk-" # Set the value to sk-xxx if you host the openai interface for open llm model
OPENAI_API_MODEL: "gpt-4-vision-preview" # The only OpenAI model by now that accepts visual input
Expand Down
9 changes: 6 additions & 3 deletions learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

arg_desc = "AppAgent - exploration phase"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
parser.add_argument("--app")
parser.add_argument("--app", default="Chrome")
parser.add_argument("--user_input", default="1")
parser.add_argument("--task_desc", default="open the baidu.com website")
parser.add_argument("--root_dir", default="./")
args = vars(parser.parse_args())

Expand All @@ -26,7 +28,7 @@
"main interface of the app on your phone.", "yellow")
print_with_color("Choose from the following modes:\n1. autonomous exploration\n2. human demonstration\n"
"Type 1 or 2.", "blue")
user_input = ""
user_input = args["user_input"]
while user_input != "1" and user_input != "2":
user_input = input()

Expand All @@ -36,7 +38,8 @@
app = app.replace(" ", "")

if user_input == "1":
os.system(f"python scripts/self_explorer.py --app {app} --root_dir {root_dir}")
task_desc = args["task_desc"]
os.system(f"python scripts/self_explorer.py --app {app} --root_dir {root_dir} --task_desc '{task_desc}'")
else:
demo_timestamp = int(time.time())
demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f"demo_{app}_%Y-%m-%d_%H-%M-%S")
Expand Down
16 changes: 3 additions & 13 deletions scripts/document_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

import prompts
from config import load_config
from model import OpenAIModel, QwenModel
from utils import print_with_color

from model_parser import parse as model_parse

arg_desc = "AppAgent - Human Demonstration"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
parser.add_argument("--app", required=True)
Expand All @@ -20,18 +21,7 @@

configs = load_config()

if configs["MODEL"] == "OpenAI":
mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
api_key=configs["OPENAI_API_KEY"],
model=configs["OPENAI_API_MODEL"],
temperature=configs["TEMPERATURE"],
max_tokens=configs["MAX_TOKENS"])
elif configs["MODEL"] == "Qwen":
mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
model=configs["QWEN_MODEL"])
else:
print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
sys.exit()
mllm = model_parse(configs)

root_dir = args["root_dir"]
work_dir = os.path.join(root_dir, "apps")
Expand Down
55 changes: 47 additions & 8 deletions scripts/model.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import re
from abc import abstractmethod
from typing import List
Expand Down Expand Up @@ -100,10 +101,10 @@ def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):

def parse_explore_rsp(rsp):
try:
observation = re.findall(r"Observation: (.*?)$", rsp, re.MULTILINE)[0]
think = re.findall(r"Thought: (.*?)$", rsp, re.MULTILINE)[0]
act = re.findall(r"Action: (.*?)$", rsp, re.MULTILINE)[0]
last_act = re.findall(r"Summary: (.*?)$", rsp, re.MULTILINE)[0]
observation = rsp['Observation']
think = rsp['Thought']
act = rsp['Action']
last_act = rsp['Summary']
print_with_color("Observation:", "yellow")
print_with_color(observation, "magenta")
print_with_color("Thought:", "yellow")
Expand Down Expand Up @@ -137,7 +138,7 @@ def parse_explore_rsp(rsp):
print_with_color(f"ERROR: Undefined act {act_name}!", "red")
return ["ERROR"]
except Exception as e:
print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
print_with_color(f"ERROR: an exception occurs while parsing the model response: {e.with_traceback()}", "red")
print_with_color(rsp, "red")
return ["ERROR"]

Expand Down Expand Up @@ -189,16 +190,16 @@ def parse_grid_rsp(rsp):

def parse_reflect_rsp(rsp):
try:
decision = re.findall(r"Decision: (.*?)$", rsp, re.MULTILINE)[0]
think = re.findall(r"Thought: (.*?)$", rsp, re.MULTILINE)[0]
decision = rsp['Decision']
think = rsp['Thought']
print_with_color("Decision:", "yellow")
print_with_color(decision, "magenta")
print_with_color("Thought:", "yellow")
print_with_color(think, "magenta")
if decision == "INEFFECTIVE":
return [decision, think]
elif decision == "BACK" or decision == "CONTINUE" or decision == "SUCCESS":
doc = re.findall(r"Documentation: (.*?)$", rsp, re.MULTILINE)[0]
doc = rsp['Documentation']
print_with_color("Documentation:", "yellow")
print_with_color(doc, "magenta")
return [decision, think, doc]
Expand All @@ -209,3 +210,41 @@ def parse_reflect_rsp(rsp):
print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
print_with_color(rsp, "red")
return ["ERROR"]


class OllamaModel(BaseModel):
    """Multi-modal LLM backend that talks to a local Ollama server's
    ``/api/chat`` endpoint (e.g. a llava model for visual input)."""

    def __init__(self, base_url: str, model: str):
        """
        Args:
            base_url: Full URL of the Ollama chat endpoint,
                e.g. "http://localhost:11434/api/chat".
            model: Ollama model tag, e.g. "llava:13b".
        """
        super().__init__()
        self.base_url = base_url
        self.model = model

    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
        """Send the prompt and screenshot images to the Ollama server.

        Args:
            prompt: Full text prompt for the model.
            images: Paths of image files to attach; each file is
                base64-encoded before sending. The caller's list is
                not modified.

        Returns:
            ``(True, payload)`` on success, where ``payload`` is the
            JSON-decoded message content (a dict, because the request asks
            Ollama for ``"format": "json"``), or ``(False, error_message)``
            on failure.
        """
        # Encode into a new list instead of mutating the caller's argument.
        encoded_images = [encode_image(img) for img in images]
        headers = {
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt,
                    "images": encoded_images
                }
            ],
            "stream": False,
            # Ask the model to answer in JSON so the content can be parsed
            # into a dict for parse_explore_rsp / parse_reflect_rsp.
            "format": "json",
        }
        response = requests.post(self.base_url, headers=headers, json=payload).json()
        if "error" in response:
            return False, response["error"]
        total_duration = response["total_duration"]  # reported in nanoseconds
        print_with_color(f"Request duration is "
                         f"{'{0:.2f}'.format(total_duration / 10 ** 9)}s",
                         "yellow")
        try:
            return True, json.loads(response["message"]["content"])
        except json.JSONDecodeError:
            # The model can emit malformed JSON; report it through the
            # normal error channel instead of raising out of the caller.
            return False, f"Ollama returned non-JSON content: {response['message']['content']}"
24 changes: 24 additions & 0 deletions scripts/model_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import sys
from typing import Optional
from model import BaseModel, OpenAIModel, QwenModel, OllamaModel
from utils import print_with_color


def parse(configs: dict) -> BaseModel:
    """Instantiate the multi-modal LLM backend selected by ``configs["MODEL"]``.

    Args:
        configs: Parsed config.yaml contents. ``"MODEL"`` must be one of
            "OpenAI", "Qwen" or "Ollama", with the matching API keys/URLs set.

    Returns:
        A ready-to-use ``BaseModel`` subclass instance.

    Exits:
        Terminates the process with a non-zero status when the configured
        model type is not supported.
    """
    model_type = configs["MODEL"]
    if model_type == "OpenAI":
        return OpenAIModel(base_url=configs["OPENAI_API_BASE"],
                           api_key=configs["OPENAI_API_KEY"],
                           model=configs["OPENAI_API_MODEL"],
                           temperature=configs["TEMPERATURE"],
                           max_tokens=configs["MAX_TOKENS"])
    if model_type == "Qwen":
        return QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                         model=configs["QWEN_MODEL"])
    if model_type == "Ollama":
        return OllamaModel(base_url=configs["OLLAMA_API_BASE"],
                           model=configs["OLLAMA_API_MODEL"])
    print_with_color(f"ERROR: Unsupported model type {model_type}!", "red")
    # Exit non-zero so shells and calling scripts can detect the failure
    # (the original bare sys.exit() reported success, status 0).
    sys.exit(1)
45 changes: 36 additions & 9 deletions scripts/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,37 +168,54 @@

The task you need to complete is to <task_description>. Your past actions to proceed with this task are summarized as
follows: <last_act>

Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
Your output should include three parts in the given format:
Your output should include the exact four parts (Observation, Thought, Action and Summary) in the following JSON format:
{
"Observation": "your observation",
"Thought": "your thought",
"Action": "text('text')",
"Summary": "your summary"
}
Observation: <Describe what you observe in the image>
Thought: <To complete the given task, what is the next step I should do>
Action: <The function call with the correct parameters to proceed with the task. If you believe the task is completed or
there is nothing to be done, you should output FINISH. You cannot output anything else except a function call or FINISH
in this field.>
there is nothing to be done, you should output FINISH. You cannot output anything else except a function call defined
above or FINISH in this field. You can only take one action at a time, so please directly call the function in python
syntax>
Summary: <Summarize your past actions along with your latest action in one or two sentences. Do not include the numeric
tag in your summary>
You can only take one action at a time, so please directly call the function."""
"""

self_explore_reflect_template = """I will give you screenshots of a mobile app before and after <action> the UI
element labeled with the number '<ui_element>' on the first screenshot. The numeric tag of each element is located at
the center of the element. The action of <action> this UI element was described as follows:
<last_act>
The action was also an attempt to proceed with a larger task, which is to <task_desc>. Your job is to carefully analyze
the difference between the two screenshots to determine if the action is in accord with the description above and at
the same time effectively moved the task forward. Your output should be determined based on the following situations:
the same time effectively moved the task forward. Your output should be determined based on the following situations:
1. BACK
If you think the action navigated you to a page where you cannot proceed with the given task, you should go back to the
previous interface. At the same time, describe the functionality of the UI element concisely in one or two sentences by
observing the difference between the two screenshots. Notice that your description of the UI element should focus on
the general function. Never include the numeric tag of the UI element in your description. You can use pronouns such as
"the UI element" to refer to the element. Your output should be in the following format:
"the UI element" to refer to the element. Your output should be in the following JSON format:
{
"Decision": "BACK",
"Thought": "your thought",
"Documentation": "your documentation"
}
Decision: BACK
Thought: <explain why you think the last action is wrong and you should go back to the previous interface>
Documentation: <describe the function of the UI element>
2. INEFFECTIVE
If you find the action changed nothing on the screen (screenshots before and after the action are identical), you
should continue to interact with other elements on the screen. Notice that if you find the location of the cursor
changed between the two screenshots, then they are not identical. Your output should be in the following format:
changed between the two screenshots, then they are not identical. Your output should be in the following JSON format:
{
"Decision": "INEFFECTIVE",
"Thought": "your thought"
}
Decision: INEFFECTIVE
Thought: <explain why you made this decision>
3. CONTINUE
Expand All @@ -207,7 +224,12 @@
describe the functionality of the UI element concisely in one or two sentences by observing the difference between the
two screenshots. Notice that your description of the UI element should focus on the general function. Never include the
numeric tag of the UI element in your description. You can use pronouns such as "the UI element" to refer to the
element. Your output should be in the following format:
element. Your output should be in the following JSON format:
{
"Decision": "CONTINUE",
"Thought": "your thought",
"Documentation": "your documentation"
}
Decision: CONTINUE
Thought: <explain why you think the action does not reflect the action description above and did not move the given
task forward>
Expand All @@ -216,7 +238,12 @@
If you think the action successfully moved the task forward (even though it did not completed the task), you should
describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI
element should focus on the general function. Never include the numeric tag of the UI element in your description. You
can use pronouns such as "the UI element" to refer to the element. Your output should be in the following format:
can use pronouns such as "the UI element" to refer to the element. Your output should be in the following JSON format:
{
"Decision": "SUCCESS",
"Thought": "your thought",
"Documentation": "your documentation"
}
Decision: SUCCESS
Thought: <explain why you think the action successfully moved the task forward>
Documentation: <describe the function of the UI element>
Expand Down
23 changes: 8 additions & 15 deletions scripts/self_explorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,20 @@
import prompts
from config import load_config
from and_controller import list_all_devices, AndroidController, traverse_tree
from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel
from model import parse_explore_rsp, parse_reflect_rsp
from utils import print_with_color, draw_bbox_multi
from model_parser import parse as model_parse

arg_desc = "AppAgent - Autonomous Exploration"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
parser.add_argument("--app")
parser.add_argument("--task_desc")
parser.add_argument("--root_dir", default="./")
args = vars(parser.parse_args())

configs = load_config()

if configs["MODEL"] == "OpenAI":
mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
api_key=configs["OPENAI_API_KEY"],
model=configs["OPENAI_API_MODEL"],
temperature=configs["TEMPERATURE"],
max_tokens=configs["MAX_TOKENS"])
elif configs["MODEL"] == "Qwen":
mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
model=configs["QWEN_MODEL"])
else:
print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
sys.exit()
mllm = model_parse(configs)

app = args["app"]
root_dir = args["root_dir"]
Expand Down Expand Up @@ -79,8 +70,10 @@
sys.exit()
print_with_color(f"Screen resolution of {device}: {width}x{height}", "yellow")

print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
task_desc = input()
task_desc = args['task_desc']
if not task_desc:
print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
task_desc = input()

round_count = 0
doc_count = 0
Expand Down
16 changes: 3 additions & 13 deletions scripts/task_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
import prompts
from config import load_config
from and_controller import list_all_devices, AndroidController, traverse_tree
from model import parse_explore_rsp, parse_grid_rsp, OpenAIModel, QwenModel
from model import parse_explore_rsp, parse_grid_rsp
from utils import print_with_color, draw_bbox_multi, draw_grid
from model_parser import parse as model_parse

arg_desc = "AppAgent Executor"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
Expand All @@ -21,18 +22,7 @@

configs = load_config()

if configs["MODEL"] == "OpenAI":
mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
api_key=configs["OPENAI_API_KEY"],
model=configs["OPENAI_API_MODEL"],
temperature=configs["TEMPERATURE"],
max_tokens=configs["MAX_TOKENS"])
elif configs["MODEL"] == "Qwen":
mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
model=configs["QWEN_MODEL"])
else:
print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
sys.exit()
mllm = model_parse(configs)

app = args["app"]
root_dir = args["root_dir"]
Expand Down