feat: support LLM process document file (#10966)
Co-authored-by: -LAN- <[email protected]>
hjlarry and laipz8200 authored Nov 22, 2024
1 parent 556de44 commit 08ac368
Showing 37 changed files with 233 additions and 88 deletions.
12 changes: 5 additions & 7 deletions api/core/memory/token_buffer_memory.py
@@ -3,7 +3,6 @@

 from core.app.app_config.features.file_upload.manager import FileUploadConfigManager
 from core.file import file_manager
-from core.file.models import FileType
 from core.model_manager import ModelInstance
 from core.model_runtime.entities import (
     AssistantPromptMessage,
@@ -103,12 +102,11 @@ def get_history_prompt_messages(
                 prompt_message_contents: list[PromptMessageContent] = []
                 prompt_message_contents.append(TextPromptMessageContent(data=message.query))
                 for file in file_objs:
-                    if file.type in {FileType.IMAGE, FileType.AUDIO}:
-                        prompt_message = file_manager.to_prompt_message_content(
-                            file,
-                            image_detail_config=detail,
-                        )
-                        prompt_message_contents.append(prompt_message)
+                    prompt_message = file_manager.to_prompt_message_content(
+                        file,
+                        image_detail_config=detail,
+                    )
+                    prompt_message_contents.append(prompt_message)

                 prompt_messages.append(UserPromptMessage(content=prompt_message_contents))
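The FileType filter is gone, so documents (not just images and audio) in conversation history now reach file_manager.to_prompt_message_content. A minimal sketch of the new loop's effect, using stand-ins rather than Dify's real File and file_manager APIs:

```python
from dataclasses import dataclass

@dataclass
class FakeFile:          # stand-in for core.file.models.File
    type: str            # "image", "audio", or now "document"
    filename: str

def to_content(file: FakeFile, *, image_detail_config=None) -> dict:
    # Illustrative stand-in for file_manager.to_prompt_message_content.
    return {"type": file.type, "source": file.filename}

files = [FakeFile("image", "chart.png"), FakeFile("document", "report.pdf")]
contents = [{"type": "text", "data": "previous user query"}]
for f in files:
    # Pre-commit, the "document" file would have been skipped by the FileType check.
    contents.append(to_content(f, image_detail_config="low"))
print(contents)
```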
2 changes: 1 addition & 1 deletion api/core/model_runtime/entities/message_entities.py
@@ -49,7 +49,7 @@ class PromptMessageFunction(BaseModel):
     function: PromptMessageTool


-class PromptMessageContentType(Enum):
+class PromptMessageContentType(str, Enum):
     """
     Enum class for prompt message content type.
     """
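Mixing in str makes each member a real string, so members compare equal to raw values and serialize cleanly in JSON payloads and Pydantic models. A quick standalone illustration (the TEXT/DOCUMENT values here are assumed, not copied from the file):

```python
import json
from enum import Enum

class ContentType(str, Enum):   # same pattern as PromptMessageContentType
    TEXT = "text"
    DOCUMENT = "document"

# str-mixin members compare equal to their raw values...
assert ContentType.DOCUMENT == "document"
# ...and JSON-encode as plain strings, which plain Enum members do not.
print(json.dumps({"type": ContentType.DOCUMENT}))  # {"type": "document"}
```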
(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 1048576

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 1048576

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 1048576

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 1048576

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 1048576

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 1048576

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 1048576

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 2097152

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 2097152

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 2097152

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 2097152

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 2097152

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 2097152

(file name not shown)
@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 32767
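All fourteen hunks above add the same document entry under features:, which is how these model schemas (likely Gemini variants, given the google provider change below) advertise document input to the workflow layer. A hedged sketch of the resulting gate; ModelFeature.DOCUMENT is an assumption here, inferred from the VISION/AUDIO checks visible in the node.py hunk further down:

```python
from enum import Enum

class ModelFeature(str, Enum):       # simplified stand-in for Dify's ModelFeature
    VISION = "vision"
    AUDIO = "audio"
    VIDEO = "video"
    DOCUMENT = "document"            # assumed member, matching the new YAML flag

# features as they would be parsed from one of the YAML files above
features = [ModelFeature.VISION, ModelFeature.DOCUMENT]

if ModelFeature.DOCUMENT not in features:
    raise ValueError("document type is not supported by this model")
print("document parts allowed for this model")
```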
22 changes: 22 additions & 0 deletions api/core/model_runtime/model_providers/google/llm/llm.py
@@ -16,6 +16,7 @@
 from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta
 from core.model_runtime.entities.message_entities import (
     AssistantPromptMessage,
+    DocumentPromptMessageContent,
     ImagePromptMessageContent,
     PromptMessage,
     PromptMessageContentType,
@@ -35,6 +36,21 @@
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel

+GOOGLE_AVAILABLE_MIMETYPE = [
+    "application/pdf",
+    "application/x-javascript",
+    "text/javascript",
+    "application/x-python",
+    "text/x-python",
+    "text/plain",
+    "text/html",
+    "text/css",
+    "text/md",
+    "text/csv",
+    "text/xml",
+    "text/rtf",
+]
+

 class GoogleLargeLanguageModel(LargeLanguageModel):
     def _invoke(
@@ -370,6 +386,12 @@ def _format_message_to_glm_content(self, message: PromptMessage) -> ContentType:
                     raise ValueError(f"Failed to fetch image data from url {message_content.data}, {ex}")
                 blob = {"inline_data": {"mime_type": mime_type, "data": base64_data}}
                 glm_content["parts"].append(blob)
+            elif c.type == PromptMessageContentType.DOCUMENT:
+                message_content = cast(DocumentPromptMessageContent, c)
+                if message_content.mime_type not in GOOGLE_AVAILABLE_MIMETYPE:
+                    raise ValueError(f"Unsupported mime type {message_content.mime_type}")
+                blob = {"inline_data": {"mime_type": message_content.mime_type, "data": message_content.data}}
+                glm_content["parts"].append(blob)

         return glm_content
     elif isinstance(message, AssistantPromptMessage):
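A runnable sketch of the new DOCUMENT branch in _format_message_to_glm_content: validate the mime type against the allow-list, then wrap the payload (already base64-encoded, as the inline_data usage suggests) as a Gemini part. DocumentContent is a stand-in for DocumentPromptMessageContent:

```python
import base64
from dataclasses import dataclass

GOOGLE_AVAILABLE_MIMETYPE = ["application/pdf", "text/plain"]  # abbreviated from the diff

@dataclass
class DocumentContent:    # stand-in for DocumentPromptMessageContent
    mime_type: str
    data: str             # base64-encoded payload, per the inline_data contract above

def to_glm_part(content: DocumentContent) -> dict:
    # Mirrors the new elif branch: reject unknown mime types, then wrap as inline_data.
    if content.mime_type not in GOOGLE_AVAILABLE_MIMETYPE:
        raise ValueError(f"Unsupported mime type {content.mime_type}")
    return {"inline_data": {"mime_type": content.mime_type, "data": content.data}}

pdf_b64 = base64.b64encode(b"%PDF-1.4 minimal").decode()
print(to_glm_part(DocumentContent(mime_type="application/pdf", data=pdf_b64)))
```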
(file name not shown)
@@ -6,6 +6,7 @@ model_type: llm
 features:
   - vision
   - agent-thought
+  - video
 model_properties:
   mode: chat
   context_size: 32000

(file name not shown)
@@ -6,6 +6,7 @@ model_type: llm
 features:
   - vision
   - agent-thought
+  - video
 model_properties:
   mode: chat
   context_size: 32000

(file name not shown)
@@ -6,6 +6,7 @@ model_type: llm
 features:
   - vision
   - agent-thought
+  - video
 model_properties:
   mode: chat
   context_size: 32768

(file name not shown)
@@ -6,6 +6,7 @@ model_type: llm
 features:
   - vision
   - agent-thought
+  - video
 model_properties:
   mode: chat
   context_size: 8000

(file name not shown)
@@ -6,6 +6,7 @@ model_properties:
   mode: chat
 features:
   - vision
+  - video
 parameter_rules:
   - name: temperature
     use_template: temperature
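These five hunks declare a video feature on vision-capable models (file names not captured in this view). The flag is plain YAML; once parsed, the feature list is just a list of strings, as a two-line check shows (PyYAML assumed available):

```python
import yaml  # PyYAML, assumed available in the api environment

schema = yaml.safe_load("""
features:
  - vision
  - video
model_properties:
  mode: chat
""")
print("video" in schema["features"])  # True -> video parts pass the feature gate
```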
10 changes: 8 additions & 2 deletions api/core/workflow/nodes/llm/exc.py
@@ -26,9 +26,15 @@ class NoPromptFoundError(LLMNodeError):
     """Raised when no prompt is found in the LLM configuration."""


-class NotSupportedPromptTypeError(LLMNodeError):
-    """Raised when the prompt type is not supported."""
+class TemplateTypeNotSupportError(LLMNodeError):
+    def __init__(self, *, type_name: str):
+        super().__init__(f"Prompt type {type_name} is not supported.")


 class MemoryRolePrefixRequiredError(LLMNodeError):
     """Raised when memory role prefix is required for completion model."""
+
+
+class FileTypeNotSupportError(LLMNodeError):
+    def __init__(self, *, type_name: str):
+        super().__init__(f"{type_name} type is not supported by this model")
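The replacement classes move message formatting into the exceptions themselves; the old NotSupportedPromptTypeError took a preformatted string. A standalone demo of both, with LLMNodeError stubbed as a plain Exception:

```python
class LLMNodeError(Exception):   # stub; the real base class lives in this module
    pass

class TemplateTypeNotSupportError(LLMNodeError):
    def __init__(self, *, type_name: str):
        super().__init__(f"Prompt type {type_name} is not supported.")

class FileTypeNotSupportError(LLMNodeError):
    def __init__(self, *, type_name: str):
        super().__init__(f"{type_name} type is not supported by this model")

try:
    raise FileTypeNotSupportError(type_name="video")
except LLMNodeError as exc:
    print(exc)  # video type is not supported by this model
```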
9 changes: 4 additions & 5 deletions api/core/workflow/nodes/llm/node.py
@@ -65,14 +65,15 @@
     ModelConfig,
 )
 from .exc import (
+    FileTypeNotSupportError,
     InvalidContextStructureError,
     InvalidVariableTypeError,
     LLMModeRequiredError,
     LLMNodeError,
     MemoryRolePrefixRequiredError,
     ModelNotExistError,
     NoPromptFoundError,
-    NotSupportedPromptTypeError,
+    TemplateTypeNotSupportError,
     VariableNotFoundError,
 )

@@ -621,9 +622,7 @@ def _fetch_prompt_messages(
                 prompt_content = prompt_messages[0].content.replace("#sys.query#", user_query)
                 prompt_messages[0].content = prompt_content
             else:
-                errmsg = f"Prompt type {type(prompt_template)} is not supported"
-                logger.warning(errmsg)
-                raise NotSupportedPromptTypeError(errmsg)
+                raise TemplateTypeNotSupportError(type_name=str(type(prompt_template)))

         if vision_enabled and user_files:
             file_prompts = []

@@ -671,7 +670,7 @@ def _fetch_prompt_messages(
                         and ModelFeature.AUDIO not in model_config.model_schema.features
                     )
                 ):
-                    continue
+                    raise FileTypeNotSupportError(type_name=content_item.type)
                 prompt_message_content.append(content_item)
             if len(prompt_message_content) == 1 and prompt_message_content[0].type == PromptMessageContentType.TEXT:
                 prompt_message.content = prompt_message_content[0].data
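The last hunk is the behavioral core of this file: an unsupported file part used to be dropped with continue, so the model silently saw fewer inputs; now the node fails fast. A simplified, runnable contrast (the type-to-feature mapping is a guess, since the full condition is collapsed in this view):

```python
class FileTypeNotSupportError(Exception):  # simplified stand-in for the node's exception
    def __init__(self, *, type_name: str):
        super().__init__(f"{type_name} type is not supported by this model")

def collect_contents(items: list[dict], features: set[str]) -> list[dict]:
    # Assumed mapping from content type to the model feature that must be declared.
    required = {"image": "vision", "video": "video", "audio": "audio", "document": "document"}
    kept = []
    for item in items:
        need = required.get(item["type"])
        if need is not None and need not in features:
            # Pre-commit behavior was `continue` (silent drop); now it raises.
            raise FileTypeNotSupportError(type_name=item["type"])
        kept.append(item)
    return kept

print(collect_contents([{"type": "text"}, {"type": "image"}], {"vision"}))
```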
53 changes: 0 additions & 53 deletions api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
@@ -400,59 +400,6 @@ def test_fetch_prompt_messages__basic(faker, llm_node, model_config):
                 )
             },
         ),
-        LLMNodeTestScenario(
-            description="Prompt template with variable selector of File without vision feature",
-            user_query=fake_query,
-            user_files=[],
-            vision_enabled=True,
-            vision_detail=fake_vision_detail,
-            features=[],
-            window_size=fake_window_size,
-            prompt_template=[
-                LLMNodeChatModelMessage(
-                    text="{{#input.image#}}",
-                    role=PromptMessageRole.USER,
-                    edition_type="basic",
-                ),
-            ],
-            expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
-            file_variables={
-                "input.image": File(
-                    tenant_id="test",
-                    type=FileType.IMAGE,
-                    filename="test1.jpg",
-                    transfer_method=FileTransferMethod.REMOTE_URL,
-                    remote_url=fake_remote_url,
-                )
-            },
-        ),
-        LLMNodeTestScenario(
-            description="Prompt template with variable selector of File with video file and vision feature",
-            user_query=fake_query,
-            user_files=[],
-            vision_enabled=True,
-            vision_detail=fake_vision_detail,
-            features=[ModelFeature.VISION],
-            window_size=fake_window_size,
-            prompt_template=[
-                LLMNodeChatModelMessage(
-                    text="{{#input.image#}}",
-                    role=PromptMessageRole.USER,
-                    edition_type="basic",
-                ),
-            ],
-            expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
-            file_variables={
-                "input.image": File(
-                    tenant_id="test",
-                    type=FileType.VIDEO,
-                    filename="test1.mp4",
-                    transfer_method=FileTransferMethod.REMOTE_URL,
-                    remote_url=fake_remote_url,
-                    extension="mp4",
-                )
-            },
-        ),
     ]

     for scenario in test_scenarios:
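Both deleted scenarios asserted the old silent-skip behavior: their expected_messages contain only the query, i.e. the file was expected to vanish from the prompt. With the node now raising FileTypeNotSupportError, equivalent coverage would assert the exception instead. A hypothetical sketch; the fixture and call shapes are assumptions, not the repo's test API:

```python
import pytest

class FileTypeNotSupportError(Exception):   # stand-in for the exception added in exc.py
    def __init__(self, *, type_name: str):
        super().__init__(f"{type_name} type is not supported by this model")

def fetch_prompt_messages_stub(file_type: str, features: set[str]) -> None:
    # Rough collapse of the node's gate; the exact feature mapping is assumed.
    needed = {"image": "vision", "video": "video"}.get(file_type)
    if needed is not None and needed not in features:
        raise FileTypeNotSupportError(type_name=file_type)

def test_video_with_only_vision_feature_raises():
    with pytest.raises(FileTypeNotSupportError):
        fetch_prompt_messages_stub("video", features={"vision"})
```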