Merge pull request #339 from netease-youdao/develop_for_v1.3.1

Develop for v1.3.1
netease-youdao · May 17, 2024 · 5f2e9dc · 5f2e9dc
2 parents 961fe9e + a5c85be
commit 5f2e9dc
Show file tree

Hide file tree

Showing 83 changed files with 67,512 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -4,6 +4,17 @@
   <a href="./README_zh.md">简体中文</a>
 </p>
 
+<h1><span style="color:red;">Important things should be said three times.</span></h1>
+
+# [2024-05-17:Latest Installation and Usage Documentation](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md) 
+# [2024-05-17:Latest Installation and Usage Documentation](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md) 
+# [2024-05-17:Latest Installation and Usage Documentation](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md)
+
+## Business contact information:
+### 010-82558901
+![](docs/images/business.jpeg)
+
+
 ## Installation
 
 Requirements:

diff --git a/README_zh.md b/README_zh.md
@@ -3,6 +3,19 @@
   <a href="./README.md">English</a> |
   <a href="./README_zh.md">简体中文</a>
 </p>
+
+
+<h1><span style="color:red;">重要的事情说三遍！</span></h1>
+
+# [2024-05-17:最新的安装和使用文档](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md) 
+# [2024-05-17:最新的安装和使用文档](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md) 
+# [2024-05-17:最新的安装和使用文档](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md)
+
+## 商务问题联系方式：
+### 010-82558901
+![](docs/images/business.jpeg)
+
+
 ## 安装 
 
 要求:

diff --git a/docs/images/business.jpeg b/docs/images/business.jpeg
diff --git a/qanything_kernel/configs/model_config.py b/qanything_kernel/configs/model_config.py
@@ -6,8 +6,20 @@
 
 os_system = platform.system()
 
+#### User configuration section (start) ####
 # 默认的CUDA设备
 CUDA_DEVICE = '0'
+# Whether to use the fast PDF parser; when False, the optimized PDF parser is used instead, at reduced speed
+USE_FAST_PDF_PARSER = True
+# Batch size for rerank; 8 is recommended with 16GB of memory, 16 with 32GB
+LOCAL_RERANK_BATCH = 8
+# Number of worker threads for rerank; defaults to 4, tune to the machine's performance
+LOCAL_RERANK_WORKERS = 4
+# Batch size for embed; 8 is recommended with 16GB of memory, 16 with 32GB
+LOCAL_EMBED_BATCH = 8
+# Number of worker threads for embed; defaults to 4, tune to the machine's performance
+LOCAL_EMBED_WORKERS = 4
+#### User configuration section (end) ####
 
 # 获取项目根目录
 # 获取当前脚本的绝对路径
@@ -18,6 +30,7 @@
 # 如果不存在则创建
 if not os.path.exists(UPLOAD_ROOT_PATH):
     os.makedirs(UPLOAD_ROOT_PATH)
+PDF_MODEL_PATH = os.path.join(root_path, "qanything_kernel/utils/loader/pdf_to_markdown")
 
 nltk_data_path = os.path.join(root_path, 'qanything_kernel/nltk_data')
 
@@ -99,7 +112,6 @@
 print('LOCAL_RERANK_REPO:', LOCAL_RERANK_REPO)
 LOCAL_RERANK_MODEL_NAME = 'rerank'
 LOCAL_RERANK_MAX_LENGTH = 512
-LOCAL_RERANK_BATCH = 8
 
 LOCAL_EMBED_PATH = os.path.join(root_path, 'qanything_kernel/connector/embedding', 'embedding_model_configs_v0.0.1')
 if os_system == 'Darwin':
@@ -111,7 +123,6 @@
 print('LOCAL_EMBED_REPO:', LOCAL_EMBED_REPO)
 LOCAL_EMBED_MODEL_NAME = 'embed'
 LOCAL_EMBED_MAX_LENGTH = 512
-LOCAL_EMBED_BATCH = 8
 
 # VLLM PARAMS
 model_path = os.path.join(root_path, "assets", "custom_models")

diff --git a/qanything_kernel/connector/embedding/embedding_backend.py b/qanything_kernel/connector/embedding/embedding_backend.py
@@ -1,7 +1,7 @@
 """Wrapper around YouDao embedding models."""
 from typing import List
 from qanything_kernel.configs.model_config import LOCAL_EMBED_MODEL_PATH, LOCAL_EMBED_MAX_LENGTH, LOCAL_EMBED_BATCH, \
-    LOCAL_EMBED_PATH, LOCAL_EMBED_REPO
+    LOCAL_EMBED_PATH, LOCAL_EMBED_REPO, LOCAL_EMBED_WORKERS
 from qanything_kernel.utils.general_utils import get_time
 from qanything_kernel.utils.custom_log import debug_logger
 from langchain_core.embeddings import Embeddings
@@ -31,6 +31,7 @@ class EmbeddingBackend(Embeddings):
     def __init__(self, use_cpu):
         self.use_cpu = use_cpu
         self._tokenizer = AutoTokenizer.from_pretrained(LOCAL_EMBED_PATH)
+        self.workers = LOCAL_EMBED_WORKERS  # max_workers for the thread pool in get_len_safe_embeddings
 
     @abstractmethod
     def get_embedding(self, sentences, max_length) -> List:
@@ -41,7 +42,7 @@ def get_len_safe_embeddings(self, texts: List[str]) -> List[List[float]]:
         all_embeddings = []
         batch_size = LOCAL_EMBED_BATCH
 
-        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.workers) as executor:
             futures = []
             for i in range(0, len(texts), batch_size):
                 batch = texts[i:i + batch_size]

diff --git a/qanything_kernel/connector/rerank/rerank_backend.py b/qanything_kernel/connector/rerank/rerank_backend.py
@@ -3,7 +3,7 @@
 from typing import List
 from qanything_kernel.configs.model_config import LOCAL_RERANK_MODEL_PATH, LOCAL_RERANK_MAX_LENGTH, \
     LOCAL_RERANK_MODEL_NAME, \
-    LOCAL_RERANK_BATCH, LOCAL_RERANK_PATH, LOCAL_RERANK_REPO
+    LOCAL_RERANK_BATCH, LOCAL_RERANK_PATH, LOCAL_RERANK_REPO, LOCAL_RERANK_WORKERS
 from qanything_kernel.utils.custom_log import debug_logger
 from qanything_kernel.utils.general_utils import get_time
 from modelscope import snapshot_download
@@ -32,6 +32,7 @@ def __init__(self, use_cpu):
         self.batch_size = LOCAL_RERANK_BATCH
         self.max_length = LOCAL_RERANK_MAX_LENGTH
         self.return_tensors = None
+        self.workers = LOCAL_RERANK_WORKERS
 
     @abstractmethod
     def inference(self, batch) -> List:
@@ -90,7 +91,7 @@ def predict(self,
         tot_batches, merge_inputs_idxs_sort = self.tokenize_preproc(query, passages)
 
         tot_scores = []
-        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.workers) as executor:
             futures = []
             for k in range(0, len(tot_batches), self.batch_size):
                 batch = self._tokenizer.pad(

diff --git a/qanything_kernel/core/local_file.py b/qanything_kernel/core/local_file.py
@@ -1,13 +1,12 @@
 from qanything_kernel.utils.general_utils import *
 from typing import List, Union, Callable
-from qanything_kernel.configs.model_config import UPLOAD_ROOT_PATH, SENTENCE_SIZE, ZH_TITLE_ENHANCE
+from qanything_kernel.configs.model_config import UPLOAD_ROOT_PATH, SENTENCE_SIZE, ZH_TITLE_ENHANCE, USE_FAST_PDF_PARSER
 from langchain.docstore.document import Document
 from qanything_kernel.utils.loader.my_recursive_url_loader import MyRecursiveUrlLoader
 from langchain_community.document_loaders import UnstructuredFileLoader, TextLoader
 from langchain_community.document_loaders import UnstructuredWordDocumentLoader
 from langchain_community.document_loaders import UnstructuredExcelLoader
 from langchain_community.document_loaders import UnstructuredPDFLoader
-import langchain_community.document_loaders.pdf
 from langchain_community.document_loaders import UnstructuredEmailLoader
 from langchain_community.document_loaders import UnstructuredPowerPointLoader
 from qanything_kernel.utils.loader.csv_loader import CSVLoader
@@ -16,6 +15,8 @@
 from qanything_kernel.utils.splitter import ChineseTextSplitter
 from qanything_kernel.utils.loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader, UnstructuredPaddleAudioLoader
 from qanything_kernel.utils.splitter import zh_title_enhance
+from qanything_kernel.utils.loader.self_pdf_loader import PdfLoader
+from qanything_kernel.utils.loader.markdown_parser import convert_markdown_to_langchaindoc
 from sanic.request import File
 import pandas as pd
 import os
@@ -65,6 +66,27 @@ def __init__(self, user_id, kb_id, file: Union[File, str, dict], file_id, file_n
                 f.write(self.file_content)
         debug_logger.info(f'success init localfile {self.file_name}')
 
+    @staticmethod
+    def pdf_process(dos: List[Document]):
+        """Prefix each parsed-PDF doc with its title path; table docs stay whole, text docs are re-split."""
+        new_docs = []
+        for doc in dos:
+            # e.g. metadata={'title_lst': ['#Title', '##Section'], 'has_table': False}
+            # Drop titles that consist only of '#' characters.
+            title_lst = [t for t in doc.metadata['title_lst'] if t.replace('#', '') != '']
+            title_prefix = '\n'.join(title_lst)  # built once, reused for every chunk
+            if doc.metadata['has_table']:
+                # Table docs are appended as-is, without re-splitting.
+                doc.page_content = title_prefix + '\n本段为表格，内容如下：' + doc.page_content
+                new_docs.append(doc)
+                continue
+            chunks = pdf_text_splitter.split_documents([doc])
+            for idx, chunk in enumerate(chunks, start=1):
+                chunk.page_content = title_prefix + f'\n######第{idx}段内容如下：\n' + chunk.page_content
+            new_docs.extend(chunks)
+        return new_docs
+
+    @get_time
     def split_file_to_docs(self, ocr_engine: Callable, sentence_size=SENTENCE_SIZE,
                            using_zh_title_enhance=ZH_TITLE_ENHANCE):
         if self.url:
@@ -82,10 +104,16 @@ def split_file_to_docs(self, ocr_engine: Callable, sentence_size=SENTENCE_SIZE,
             texts_splitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
             docs = loader.load_and_split(texts_splitter)
         elif self.file_path.lower().endswith(".pdf"):
-            loader = UnstructuredPaddlePDFLoader(self.file_path, ocr_engine, self.use_cpu)
-            # texts_splitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
-            # docs = loader.load_and_split(texts_splitter)
-            docs = loader.load()
+            if USE_FAST_PDF_PARSER:
+                loader = UnstructuredPaddlePDFLoader(self.file_path, ocr_engine, self.use_cpu)
+                texts_splitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
+                docs = loader.load_and_split(texts_splitter)
+            else:
+                loader = PdfLoader(filename=self.file_path, root_dir=os.path.dirname(self.file_path))
+                markdown_dir = loader.load_to_markdown()
+                docs = convert_markdown_to_langchaindoc(markdown_dir)
+                docs = self.pdf_process(docs)
+                # print(docs)
         elif self.file_path.lower().endswith(".jpg") or self.file_path.lower().endswith(
                 ".png") or self.file_path.lower().endswith(".jpeg"):
             loader = UnstructuredPaddleImageLoader(self.file_path, ocr_engine, self.use_cpu)
@@ -121,6 +149,7 @@ def split_file_to_docs(self, ocr_engine: Callable, sentence_size=SENTENCE_SIZE,
             debug_logger.info("using_zh_title_enhance %s", using_zh_title_enhance)
             docs = zh_title_enhance(docs)
         print('docs number:', len(docs))
+        # print(docs)
         # 不是csv，xlsx和FAQ的文件，需要再次分割
         if not self.file_path.lower().endswith(".csv") and not self.file_path.lower().endswith(".xlsx") and not self.file_path == 'FAQ':
             new_docs = []
@@ -136,7 +165,8 @@ def split_file_to_docs(self, ocr_engine: Callable, sentence_size=SENTENCE_SIZE,
                         new_docs.append(doc)
             debug_logger.info(f"before 2nd split doc lens: {len(new_docs)}")
             if self.file_path.lower().endswith(".pdf"):
-                docs = pdf_text_splitter.split_documents(new_docs)
+                if USE_FAST_PDF_PARSER:
+                    docs = pdf_text_splitter.split_documents(new_docs)
             else:
                 docs = text_splitter.split_documents(new_docs)
             debug_logger.info(f"after 2nd split doc lens: {len(docs)}")

diff --git a/qanything_kernel/qanything_server/sanic_api.py b/qanything_kernel/qanything_server/sanic_api.py
@@ -18,7 +18,7 @@
 sys.path.append(root_dir)
 
 from qanything_kernel.configs.model_config import DT_7B_MODEL_PATH, \
-    DT_7B_DOWNLOAD_PARAMS, DT_3B_MODEL_PATH, DT_3B_DOWNLOAD_PARAMS
+    DT_7B_DOWNLOAD_PARAMS, DT_3B_MODEL_PATH, DT_3B_DOWNLOAD_PARAMS, PDF_MODEL_PATH
 import qanything_kernel.configs.model_config as model_config
 from qanything_kernel.utils.custom_log import debug_logger
 from qanything_kernel.utils.general_utils import download_file, get_gpu_memory_utilization, check_package_version
@@ -81,18 +81,27 @@
     if os_system != "Linux":
         raise ValueError(f"Unsupported system: {os_system}")
     system_name = 'manylinux_2_28_x86_64'
-    # 官方发布的1.17.1不支持cuda12以上的系统，需要根据官方文档:https://onnxruntime.ai/docs/install/里提到的地址手动下载whl
-    if not check_package_version("onnxruntime-gpu", "1.17.1"):
-        download_url = f"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_apis/packaging/feeds/9387c3aa-d9ad-4513-968c-383f6f7f53b8/pypi/packages/onnxruntime-gpu/versions/1.17.1/onnxruntime_gpu-1.17.1-cp3{python3_version}-cp3{python3_version}-{system_name}.whl/content"
-        debug_logger.info(f'开始从{download_url}下载onnxruntime，也可以手动下载并通过pip install *.whl安装')
-        whl_name = f'onnxruntime_gpu-1.17.1-cp3{python3_version}-cp3{python3_version}-{system_name}.whl'
-        download_file(download_url, whl_name)
-        exit_status = os.system(f"pip install {whl_name}")
-        if exit_status != 0:
-            # raise ValueError(f"安装onnxruntime失败，请手动安装{whl_name}")
-            debug_logger.warning(f"安装onnxruntime-gpu失败，将安装onnxruntime来代替")
-            print(f"安装onnxruntime-gpu失败，将安装onnxruntime来代替", flush=True)
+    glibc_info = platform.libc_ver()
+    if glibc_info[0] != 'glibc':
+        raise ValueError(f"Unsupported libc: {glibc_info[0]}, 请确认系统是否为Linux系统。")
+    glibc_version = tuple(int(part) for part in glibc_info[1].split('.')[:2])  # ints, not float: float('2.9') > 2.28
+    if glibc_version < (2, 28):
+        if not check_package_version("onnxruntime", "1.16.3"):
+            print(f"当前系统glibc版本为{glibc_info[1]}<2.28，无法使用onnxruntime-gpu(cuda12.x)，将安装onnxruntime来代替", flush=True)
             os.system("pip install onnxruntime")
+    else:
+        # The officially released 1.17.1 does not support CUDA 12+ systems, so the wheel is downloaded manually from the address given in the official docs: https://onnxruntime.ai/docs/install/
+        if not check_package_version("onnxruntime-gpu", "1.17.1"):
+            download_url = f"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_apis/packaging/feeds/9387c3aa-d9ad-4513-968c-383f6f7f53b8/pypi/packages/onnxruntime-gpu/versions/1.17.1/onnxruntime_gpu-1.17.1-cp3{python3_version}-cp3{python3_version}-{system_name}.whl/content"
+            debug_logger.info(f'开始从{download_url}下载onnxruntime，也可以手动下载并通过pip install *.whl安装')
+            whl_name = f'onnxruntime_gpu-1.17.1-cp3{python3_version}-cp3{python3_version}-{system_name}.whl'
+            download_file(download_url, whl_name)
+            exit_status = os.system(f"pip install {whl_name}")
+            if exit_status != 0:
+                # Installation failed: fall back to the plain onnxruntime package instead of raising.
+                debug_logger.warning(f"安装onnxruntime-gpu失败，将安装onnxruntime来代替")
+                print(f"安装onnxruntime-gpu失败，将安装onnxruntime来代替", flush=True)
+                os.system("pip install onnxruntime")
     if not args.use_openai_api:
         if not check_package_version("vllm", "0.2.7"):
             os.system(f"pip install vllm==0.2.7 -i https://pypi.mirrors.ustc.edu.cn/simple/ --trusted-host pypi.mirrors.ustc.edu.cn")
@@ -145,6 +154,15 @@
     debug_logger.info(f'{args.model}路径已存在，不再重复下载大模型（如果下载出错可手动删除此目录）')
     debug_logger.info(f"CUDA_DEVICE: {model_config.CUDA_DEVICE}")
 
+# Download the models required for PDF parsing and symlink them into PDF_MODEL_PATH/checkpoints
+pdf_models_path = os.path.join(PDF_MODEL_PATH, 'checkpoints')
+if not os.path.exists(pdf_models_path):
+    debug_logger.info('开始下载PDF解析相关模型：netease-youdao/QAnything-pdf-parser')
+    model_dir = snapshot_download('netease-youdao/QAnything-pdf-parser')
+    subprocess.check_output(['ln', '-s', model_dir, pdf_models_path], text=True)
+    debug_logger.info(f'PDF解析相关模型下载完毕！cache地址：{model_dir}, 软链接地址：{pdf_models_path}')
+
+
 WorkerManager.THRESHOLD = 6000
 
 app = Sanic("QAnything")