Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Support for Knowledge Base Retrieval Params & Preprocessing #61

Merged
merged 1 commit
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions minds/knowledge_bases/knowledge_bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from pydantic import BaseModel

from minds.knowledge_bases.preprocessing import PreprocessingConfig
from minds.rest_api import RestAPI


Expand All @@ -25,6 +26,8 @@ class KnowledgeBaseConfig(BaseModel):
description: str
vector_store_config: Optional[VectorStoreConfig] = None
embedding_config: Optional[EmbeddingConfig] = None
# Params to apply to retrieval pipeline.
params: Optional[Dict] = None


class KnowledgeBaseDocument(BaseModel):
Expand All @@ -39,7 +42,7 @@ def __init__(self, name, api: RestAPI):
self.name = name
self.api = api

def insert_from_select(self, query: str):
def insert_from_select(self, query: str, preprocessing_config: PreprocessingConfig = None):
'''
Inserts select content of a connected datasource into this knowledge base

Expand All @@ -48,9 +51,11 @@ def insert_from_select(self, query: str):
update_request = {
'query': query
}
if preprocessing_config is not None:
update_request['preprocessing'] = preprocessing_config.model_dump()
_ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)

def insert_documents(self, documents: List[KnowledgeBaseDocument]):
def insert_documents(self, documents: List[KnowledgeBaseDocument], preprocessing_config: PreprocessingConfig = None):
'''
Inserts documents directly into this knowledge base

Expand All @@ -59,9 +64,11 @@ def insert_documents(self, documents: List[KnowledgeBaseDocument]):
update_request = {
'rows': [d.model_dump() for d in documents]
}
if preprocessing_config is not None:
update_request['preprocessing'] = preprocessing_config.model_dump()
_ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)

def insert_urls(self, urls: List[str]):
def insert_urls(self, urls: List[str], preprocessing_config: PreprocessingConfig = None):
'''
Crawls URLs & inserts the retrieved webpages into this knowledge base

Expand All @@ -70,9 +77,11 @@ def insert_urls(self, urls: List[str]):
update_request = {
'urls': urls
}
if preprocessing_config is not None:
update_request['preprocessing'] = preprocessing_config.model_dump()
_ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)

def insert_files(self, files: List[str]):
def insert_files(self, files: List[str], preprocessing_config: PreprocessingConfig = None):
'''
Inserts files that have already been uploaded to MindsDB into this knowledge base

Expand All @@ -81,6 +90,8 @@ def insert_files(self, files: List[str]):
update_request = {
'files': files
}
if preprocessing_config is not None:
update_request['preprocessing'] = preprocessing_config.model_dump()
_ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)


Expand Down Expand Up @@ -117,6 +128,8 @@ def create(self, config: KnowledgeBaseConfig) -> KnowledgeBase:
if config.embedding_config.params is not None:
embedding_data.update(config.embedding_config.params)
create_request['embedding_model'] = embedding_data
if config.params is not None:
create_request['params'] = config.params

_ = self.api.post('/knowledge_bases', data=create_request)
return self.get(config.name)
Expand Down
78 changes: 78 additions & 0 deletions minds/knowledge_bases/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field, model_validator


# Default LLM used by LLMConfig for contextual preprocessing (context generation).
DEFAULT_LLM_MODEL = 'gpt-4o'
# Provider string matching the default model above.
DEFAULT_LLM_MODEL_PROVIDER = 'openai'


class TextChunkingConfig(BaseModel):
    '''Configuration for chunking text content before they are inserted into a knowledge base'''
    # Tried in order: split on the first separator that produces chunks within size.
    separators: List[str] = Field(
        default=['\n\n', '\n', ' ', ''],
        description='List of separators to use for splitting text, in order of priority'
    )
    # Target chunk size in characters; must be positive.
    chunk_size: int = Field(
        default=1000,
        description='The target size of each text chunk',
        gt=0
    )
    # Characters shared between consecutive chunks; 0 disables overlap.
    chunk_overlap: int = Field(
        default=200,
        description='The number of characters to overlap between chunks',
        ge=0
    )


class LLMConfig(BaseModel):
    '''LLM settings used for context generation during contextual preprocessing.'''
    model_name: str = Field(default=DEFAULT_LLM_MODEL, description='LLM model to use for context generation')
    provider: str = Field(default=DEFAULT_LLM_MODEL_PROVIDER, description='LLM model provider to use for context generation')
    # default_factory (not default={}) is pydantic's recommended idiom for mutable
    # defaults: each instance gets its own fresh dict.
    params: Dict[str, Any] = Field(default_factory=dict, description='Additional parameters to pass in when initializing the LLM')
Comment on lines +28 to +31

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🙏 Nice



class ContextualConfig(BaseModel):
    '''Configuration specific to contextual preprocessing'''
    # default_factory (not default=LLMConfig()) is pydantic's recommended idiom:
    # it defers construction and gives each instance its own config object,
    # instead of building one shared instance at class-definition time.
    llm_config: LLMConfig = Field(
        default_factory=LLMConfig,
        description='LLM configuration to use for context generation'
    )
    # When None, the backend's built-in context-generation template is used.
    context_template: Optional[str] = Field(
        default=None,
        description='Custom template for context generation'
    )
    # Target chunk size in characters; must be positive.
    chunk_size: int = Field(
        default=1000,
        description='The target size of each text chunk',
        gt=0
    )
    # Characters shared between consecutive chunks; 0 disables overlap.
    chunk_overlap: int = Field(
        default=200,
        description='The number of characters to overlap between chunks',
        ge=0
    )


class PreprocessingConfig(BaseModel):
    '''Complete preprocessing configuration'''
    # Which preprocessing pipeline to run; defaults to plain text chunking.
    type: Literal['contextual', 'text_chunking'] = Field(
        default='text_chunking',
        description='Type of preprocessing to apply'
    )
    # Only the config matching `type` is consulted; the validator below
    # fills in a default instance when the matching one is missing.
    contextual_config: Optional[ContextualConfig] = Field(
        default=None,
        description='Configuration for contextual preprocessing'
    )
    text_chunking_config: Optional[TextChunkingConfig] = Field(
        default=None,
        description='Configuration for text chunking preprocessing'
    )

    @model_validator(mode='after')
    def validate_config_presence(self) -> 'PreprocessingConfig':
        '''Ensure the appropriate config is present for the chosen type'''
        if self.type == 'contextual':
            self.contextual_config = self.contextual_config or ContextualConfig()
        elif self.type == 'text_chunking':
            self.text_chunking_config = self.text_chunking_config or TextChunkingConfig()
        return self
8 changes: 7 additions & 1 deletion tests/unit/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,10 @@ def test_create_knowledge_bases(self, mock_post, mock_get):
name='test_kb',
description='Test knowledge base',
vector_store_config=test_vector_store_config,
embedding_config=test_embedding_config
embedding_config=test_embedding_config,
params={
'k1': 'v1'
}
)
response_mock(mock_get, test_knowledge_base_config.model_dump())

Expand All @@ -152,6 +155,9 @@ def test_create_knowledge_bases(self, mock_post, mock_get):
'provider': test_embedding_config.provider,
'name': test_embedding_config.model,
'k1': 'v1'
},
'params': {
'k1': 'v1'
}
}

Expand Down
Loading