Skip to content

Commit

Permalink
Use python-docx to extract docx files (#2654)
Browse files Browse the repository at this point in the history
  • Loading branch information
bowenliang123 authored Mar 7, 2024
1 parent c0b82f8 commit b163545
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 10 deletions.
20 changes: 11 additions & 9 deletions api/core/rag/extractor/word_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class WordExtractor(BaseExtractor):
"""Load pdf files.
"""Load docx files.
Args:
Expand Down Expand Up @@ -46,14 +46,16 @@ def __del__(self) -> None:

def extract(self) -> list[Document]:
"""Load given path as single page."""
import docx2txt

return [
Document(
page_content=docx2txt.process(self.file_path),
metadata={"source": self.file_path},
)
]
from docx import Document as docx_Document

document = docx_Document(self.file_path)
doc_texts = [paragraph.text for paragraph in document.paragraphs]
content = '\n'.join(doc_texts)

return [Document(
page_content=content,
metadata={"source": self.file_path},
)]

@staticmethod
def _is_valid_url(url: str) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ celery==5.2.7
redis~=4.5.4
openpyxl==3.1.2
chardet~=5.1.0
docx2txt==0.8
python-docx~=1.1.0
pypdfium2==4.16.0
resend~=0.7.0
pyjwt~=2.8.0
Expand Down

0 comments on commit b163545

Please sign in to comment.