Skip to content

Commit

Permalink
fix overlap and splitter optimization (#2742)
Browse files (browse the repository at this point in the history)
Co-authored-by: jyong <[email protected]>
  • Loading branch information
JohnJyong and JohnJyong authored Mar 7, 2024
1 parent b163545 commit 8ba38e8
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
4 changes: 2 additions & 2 deletions api/core/rag/index_processor/index_processor_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _get_splitter(self, processing_rule: dict,

character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
chunk_size=segmentation["max_tokens"],
- chunk_overlap=0,
+ chunk_overlap=segmentation.get('chunk_overlap', 0),
fixed_separator=separator,
separators=["\n\n", "。", ".", " ", ""],
embedding_model_instance=embedding_model_instance
Expand All @@ -61,7 +61,7 @@ def _get_splitter(self, processing_rule: dict,
# Automatic segmentation
character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
- chunk_overlap=0,
+ chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
separators=["\n\n", "。", ".", " ", ""],
embedding_model_instance=embedding_model_instance
)
Expand Down
4 changes: 2 additions & 2 deletions api/core/splitter/text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def _split_text_with_regex(
if separator:
if keep_separator:
# The parentheses in the pattern keep the delimiters in the result.
- _splits = re.split(f"({separator})", text)
+ _splits = re.split(f"({re.escape(separator)})", text)
splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
if len(_splits) % 2 == 0:
splits += _splits[-1:]
Expand Down Expand Up @@ -94,7 +94,7 @@ def create_documents(
documents.append(new_doc)
return documents

- def split_documents(self, documents: Iterable[Document]) -> list[Document]:
+ def split_documents(self, documents: Iterable[Document] ) -> list[Document]:
"""Split documents."""
texts, metadatas = [], []
for doc in documents:
Expand Down

0 comments on commit 8ba38e8

Please sign in to comment.