Skip to content

Commit

Permalink
Merge pull request #339 from netease-youdao/develop_for_v1.3.1
Browse files Browse the repository at this point in the history
Develop for v1.3.1
  • Loading branch information
xixihahaliu authored May 17, 2024
2 parents 961fe9e + a5c85be commit 5f2e9dc
Show file tree
Hide file tree
Showing 83 changed files with 67,512 additions and 25 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@
<a href="./README_zh.md">简体中文</a>
</p>

<h1><span style="color:red;">Important things should be said three times.</span></h1>

# [2024-05-17: Latest Installation and Usage Documentation](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md)
# [2024-05-17: Latest Installation and Usage Documentation](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md)
# [2024-05-17: Latest Installation and Usage Documentation](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md)

## Business contact information:
### 010-82558901
![](docs/images/business.jpeg)


## Installation

Requirements:
Expand Down
13 changes: 13 additions & 0 deletions README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@
<a href="./README.md">English</a> |
<a href="./README_zh.md">简体中文</a>
</p>


<h1><span style="color:red;">重要的事情说三遍!</span></h1>

# [2024-05-17:最新的安装和使用文档](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md)
# [2024-05-17:最新的安装和使用文档](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md)
# [2024-05-17:最新的安装和使用文档](https://github.com/netease-youdao/QAnything/blob/master/QAnything%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md)

## 商务问题联系方式:
### 010-82558901
![](docs/images/business.jpeg)


## 安装

要求:
Expand Down
Binary file added docs/images/business.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
15 changes: 13 additions & 2 deletions qanything_kernel/configs/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,20 @@

os_system = platform.system()

#### User configuration section (start) ####
# Default CUDA device
CUDA_DEVICE = '0'
# Whether to use the fast PDF parser. When set to False, the optimized PDF
# parser is used instead, at the cost of lower speed.
USE_FAST_PDF_PARSER = True
# Batch size for rerank: 8 is recommended with 16GB of memory, 16 with 32GB.
LOCAL_RERANK_BATCH = 8
# Number of worker threads for rerank; defaults to 4, tune to the machine.
LOCAL_RERANK_WORKERS = 4
# Batch size for embedding: 8 is recommended with 16GB of memory, 16 with 32GB.
LOCAL_EMBED_BATCH = 8
# Number of worker threads for embedding; defaults to 4, tune to the machine.
LOCAL_EMBED_WORKERS = 4
#### User configuration section (end) ####

# 获取项目根目录
# 获取当前脚本的绝对路径
Expand All @@ -18,6 +30,7 @@
# 如果不存在则创建
if not os.path.exists(UPLOAD_ROOT_PATH):
os.makedirs(UPLOAD_ROOT_PATH)
PDF_MODEL_PATH = os.path.join(root_path, "qanything_kernel/utils/loader/pdf_to_markdown")

nltk_data_path = os.path.join(root_path, 'qanything_kernel/nltk_data')

Expand Down Expand Up @@ -99,7 +112,6 @@
print('LOCAL_RERANK_REPO:', LOCAL_RERANK_REPO)
LOCAL_RERANK_MODEL_NAME = 'rerank'
LOCAL_RERANK_MAX_LENGTH = 512
LOCAL_RERANK_BATCH = 8

LOCAL_EMBED_PATH = os.path.join(root_path, 'qanything_kernel/connector/embedding', 'embedding_model_configs_v0.0.1')
if os_system == 'Darwin':
Expand All @@ -111,7 +123,6 @@
print('LOCAL_EMBED_REPO:', LOCAL_EMBED_REPO)
LOCAL_EMBED_MODEL_NAME = 'embed'
LOCAL_EMBED_MAX_LENGTH = 512
LOCAL_EMBED_BATCH = 8

# VLLM PARAMS
model_path = os.path.join(root_path, "assets", "custom_models")
Expand Down
5 changes: 3 additions & 2 deletions qanything_kernel/connector/embedding/embedding_backend.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Wrapper around YouDao embedding models."""
from typing import List
from qanything_kernel.configs.model_config import LOCAL_EMBED_MODEL_PATH, LOCAL_EMBED_MAX_LENGTH, LOCAL_EMBED_BATCH, \
LOCAL_EMBED_PATH, LOCAL_EMBED_REPO
LOCAL_EMBED_PATH, LOCAL_EMBED_REPO, LOCAL_EMBED_WORKERS
from qanything_kernel.utils.general_utils import get_time
from qanything_kernel.utils.custom_log import debug_logger
from langchain_core.embeddings import Embeddings
Expand Down Expand Up @@ -31,6 +31,7 @@ class EmbeddingBackend(Embeddings):
def __init__(self, use_cpu):
    """Initialise the state shared by embedding backends.

    Args:
        use_cpu: Stored unchanged as ``self.use_cpu``.
    """
    self.use_cpu = use_cpu
    # The tokenizer is loaded from the directory named by LOCAL_EMBED_PATH.
    self._tokenizer = AutoTokenizer.from_pretrained(LOCAL_EMBED_PATH)
    # Thread-pool size used when embedding batches; comes from the
    # user-configurable LOCAL_EMBED_WORKERS instead of a hard-coded value.
    self.workers = LOCAL_EMBED_WORKERS

@abstractmethod
def get_embedding(self, sentences, max_length) -> List:
Expand All @@ -41,7 +42,7 @@ def get_len_safe_embeddings(self, texts: List[str]) -> List[List[float]]:
all_embeddings = []
batch_size = LOCAL_EMBED_BATCH

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=self.workers) as executor:
futures = []
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
Expand Down
5 changes: 3 additions & 2 deletions qanything_kernel/connector/rerank/rerank_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import List
from qanything_kernel.configs.model_config import LOCAL_RERANK_MODEL_PATH, LOCAL_RERANK_MAX_LENGTH, \
LOCAL_RERANK_MODEL_NAME, \
LOCAL_RERANK_BATCH, LOCAL_RERANK_PATH, LOCAL_RERANK_REPO
LOCAL_RERANK_BATCH, LOCAL_RERANK_PATH, LOCAL_RERANK_REPO, LOCAL_RERANK_WORKERS
from qanything_kernel.utils.custom_log import debug_logger
from qanything_kernel.utils.general_utils import get_time
from modelscope import snapshot_download
Expand Down Expand Up @@ -32,6 +32,7 @@ def __init__(self, use_cpu):
self.batch_size = LOCAL_RERANK_BATCH
self.max_length = LOCAL_RERANK_MAX_LENGTH
self.return_tensors = None
self.workers = LOCAL_RERANK_WORKERS

@abstractmethod
def inference(self, batch) -> List:
Expand Down Expand Up @@ -90,7 +91,7 @@ def predict(self,
tot_batches, merge_inputs_idxs_sort = self.tokenize_preproc(query, passages)

tot_scores = []
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=self.workers) as executor:
futures = []
for k in range(0, len(tot_batches), self.batch_size):
batch = self._tokenizer.pad(
Expand Down
44 changes: 37 additions & 7 deletions qanything_kernel/core/local_file.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
from qanything_kernel.utils.general_utils import *
from typing import List, Union, Callable
from qanything_kernel.configs.model_config import UPLOAD_ROOT_PATH, SENTENCE_SIZE, ZH_TITLE_ENHANCE
from qanything_kernel.configs.model_config import UPLOAD_ROOT_PATH, SENTENCE_SIZE, ZH_TITLE_ENHANCE, USE_FAST_PDF_PARSER
from langchain.docstore.document import Document
from qanything_kernel.utils.loader.my_recursive_url_loader import MyRecursiveUrlLoader
from langchain_community.document_loaders import UnstructuredFileLoader, TextLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
import langchain_community.document_loaders.pdf
from langchain_community.document_loaders import UnstructuredEmailLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from qanything_kernel.utils.loader.csv_loader import CSVLoader
Expand All @@ -16,6 +15,8 @@
from qanything_kernel.utils.splitter import ChineseTextSplitter
from qanything_kernel.utils.loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader, UnstructuredPaddleAudioLoader
from qanything_kernel.utils.splitter import zh_title_enhance
from qanything_kernel.utils.loader.self_pdf_loader import PdfLoader
from qanything_kernel.utils.loader.markdown_parser import convert_markdown_to_langchaindoc
from sanic.request import File
import pandas as pd
import os
Expand Down Expand Up @@ -65,6 +66,27 @@ def __init__(self, user_id, kb_id, file: Union[File, str, dict], file_id, file_n
f.write(self.file_content)
debug_logger.info(f'success init localfile {self.file_name}')

@staticmethod
def pdf_process(dos: List[Document]):
    """Prefix parsed-PDF documents with their headings and split the non-table ones.

    Args:
        dos: Documents whose ``metadata`` contains ``'title_lst'`` (heading
            strings carrying leading ``#`` marks) and ``'has_table'``.

    Returns:
        A new list of documents. A table document is kept whole with its
        headings and a table marker prepended to ``page_content``. Any other
        document is replaced by the chunks ``pdf_text_splitter`` produces for
        it, each prefixed with the headings and a numbered segment marker.

    Raises:
        KeyError: If a document's metadata lacks ``'title_lst'`` or
            ``'has_table'``.
    """
    processed = []
    for doc in dos:
        # Example metadata: {'title_lst': ['#<name> resume', '##Education'], 'has_table': False}
        # Drop headings that consist of nothing but '#' characters.
        headings = [title for title in doc.metadata['title_lst'] if title.replace('#', '') != '']
        is_table = doc.metadata['has_table']
        heading_block = '\n'.join(headings)
        if is_table:
            # Tables are never split; the input document itself is updated in place.
            doc.page_content = heading_block + '\n本段为表格,内容如下:' + doc.page_content
            processed.append(doc)
        else:
            chunks = pdf_text_splitter.split_documents([doc])
            # Segment numbers shown to the reader start at 1.
            for number, chunk in enumerate(chunks, start=1):
                chunk.page_content = heading_block + f'\n######第{number}段内容如下:\n' + chunk.page_content
            processed.extend(chunks)
    return processed

@get_time
def split_file_to_docs(self, ocr_engine: Callable, sentence_size=SENTENCE_SIZE,
using_zh_title_enhance=ZH_TITLE_ENHANCE):
if self.url:
Expand All @@ -82,10 +104,16 @@ def split_file_to_docs(self, ocr_engine: Callable, sentence_size=SENTENCE_SIZE,
texts_splitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
docs = loader.load_and_split(texts_splitter)
elif self.file_path.lower().endswith(".pdf"):
loader = UnstructuredPaddlePDFLoader(self.file_path, ocr_engine, self.use_cpu)
# texts_splitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
# docs = loader.load_and_split(texts_splitter)
docs = loader.load()
if USE_FAST_PDF_PARSER:
loader = UnstructuredPaddlePDFLoader(self.file_path, ocr_engine, self.use_cpu)
texts_splitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
docs = loader.load_and_split(texts_splitter)
else:
loader = PdfLoader(filename=self.file_path, root_dir=os.path.dirname(self.file_path))
markdown_dir = loader.load_to_markdown()
docs = convert_markdown_to_langchaindoc(markdown_dir)
docs = self.pdf_process(docs)
# print(docs)
elif self.file_path.lower().endswith(".jpg") or self.file_path.lower().endswith(
".png") or self.file_path.lower().endswith(".jpeg"):
loader = UnstructuredPaddleImageLoader(self.file_path, ocr_engine, self.use_cpu)
Expand Down Expand Up @@ -121,6 +149,7 @@ def split_file_to_docs(self, ocr_engine: Callable, sentence_size=SENTENCE_SIZE,
debug_logger.info("using_zh_title_enhance %s", using_zh_title_enhance)
docs = zh_title_enhance(docs)
print('docs number:', len(docs))
# print(docs)
# 不是csv,xlsx和FAQ的文件,需要再次分割
if not self.file_path.lower().endswith(".csv") and not self.file_path.lower().endswith(".xlsx") and not self.file_path == 'FAQ':
new_docs = []
Expand All @@ -136,7 +165,8 @@ def split_file_to_docs(self, ocr_engine: Callable, sentence_size=SENTENCE_SIZE,
new_docs.append(doc)
debug_logger.info(f"before 2nd split doc lens: {len(new_docs)}")
if self.file_path.lower().endswith(".pdf"):
docs = pdf_text_splitter.split_documents(new_docs)
if USE_FAST_PDF_PARSER:
docs = pdf_text_splitter.split_documents(new_docs)
else:
docs = text_splitter.split_documents(new_docs)
debug_logger.info(f"after 2nd split doc lens: {len(docs)}")
Expand Down
42 changes: 30 additions & 12 deletions qanything_kernel/qanything_server/sanic_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
sys.path.append(root_dir)

from qanything_kernel.configs.model_config import DT_7B_MODEL_PATH, \
DT_7B_DOWNLOAD_PARAMS, DT_3B_MODEL_PATH, DT_3B_DOWNLOAD_PARAMS
DT_7B_DOWNLOAD_PARAMS, DT_3B_MODEL_PATH, DT_3B_DOWNLOAD_PARAMS, PDF_MODEL_PATH
import qanything_kernel.configs.model_config as model_config
from qanything_kernel.utils.custom_log import debug_logger
from qanything_kernel.utils.general_utils import download_file, get_gpu_memory_utilization, check_package_version
Expand Down Expand Up @@ -81,18 +81,27 @@
if os_system != "Linux":
raise ValueError(f"Unsupported system: {os_system}")
system_name = 'manylinux_2_28_x86_64'
# 官方发布的1.17.1不支持cuda12以上的系统,需要根据官方文档:https://onnxruntime.ai/docs/install/里提到的地址手动下载whl
if not check_package_version("onnxruntime-gpu", "1.17.1"):
download_url = f"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_apis/packaging/feeds/9387c3aa-d9ad-4513-968c-383f6f7f53b8/pypi/packages/onnxruntime-gpu/versions/1.17.1/onnxruntime_gpu-1.17.1-cp3{python3_version}-cp3{python3_version}-{system_name}.whl/content"
debug_logger.info(f'开始从{download_url}下载onnxruntime,也可以手动下载并通过pip install *.whl安装')
whl_name = f'onnxruntime_gpu-1.17.1-cp3{python3_version}-cp3{python3_version}-{system_name}.whl'
download_file(download_url, whl_name)
exit_status = os.system(f"pip install {whl_name}")
if exit_status != 0:
# raise ValueError(f"安装onnxruntime失败,请手动安装{whl_name}")
debug_logger.warning(f"安装onnxruntime-gpu失败,将安装onnxruntime来代替")
print(f"安装onnxruntime-gpu失败,将安装onnxruntime来代替", flush=True)
glibc_info = platform.libc_ver()
if glibc_info[0] != 'glibc':
raise ValueError(f"Unsupported libc: {glibc_info[0]}, 请确认系统是否为Linux系统。")
glibc_version = float(glibc_info[1])
if glibc_version < 2.28:
if not check_package_version("onnxruntime", "1.16.3"):
print(f"当前系统glibc版本为{glibc_version}<2.28,无法使用onnxruntime-gpu(cuda12.x),将安装onnxruntime来代替", flush=True)
os.system("pip install onnxruntime")
else:
# 官方发布的1.17.1不支持cuda12以上的系统,需要根据官方文档:https://onnxruntime.ai/docs/install/里提到的地址手动下载whl
if not check_package_version("onnxruntime-gpu", "1.17.1"):
download_url = f"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_apis/packaging/feeds/9387c3aa-d9ad-4513-968c-383f6f7f53b8/pypi/packages/onnxruntime-gpu/versions/1.17.1/onnxruntime_gpu-1.17.1-cp3{python3_version}-cp3{python3_version}-{system_name}.whl/content"
debug_logger.info(f'开始从{download_url}下载onnxruntime,也可以手动下载并通过pip install *.whl安装')
whl_name = f'onnxruntime_gpu-1.17.1-cp3{python3_version}-cp3{python3_version}-{system_name}.whl'
download_file(download_url, whl_name)
exit_status = os.system(f"pip install {whl_name}")
if exit_status != 0:
# raise ValueError(f"安装onnxruntime失败,请手动安装{whl_name}")
debug_logger.warning(f"安装onnxruntime-gpu失败,将安装onnxruntime来代替")
print(f"安装onnxruntime-gpu失败,将安装onnxruntime来代替", flush=True)
os.system("pip install onnxruntime")
if not args.use_openai_api:
if not check_package_version("vllm", "0.2.7"):
os.system(f"pip install vllm==0.2.7 -i https://pypi.mirrors.ustc.edu.cn/simple/ --trusted-host pypi.mirrors.ustc.edu.cn")
Expand Down Expand Up @@ -145,6 +154,15 @@
debug_logger.info(f'{args.model}路径已存在,不再重复下载大模型(如果下载出错可手动删除此目录)')
debug_logger.info(f"CUDA_DEVICE: {model_config.CUDA_DEVICE}")

# Download the models needed by the PDF parser, unless a previous run already
# linked them into place under PDF_MODEL_PATH/checkpoints.
pdf_models_path = os.path.join(PDF_MODEL_PATH, 'checkpoints')
if not os.path.exists(pdf_models_path):
    pdf_model_repo = 'netease-youdao/QAnything-pdf-parser'
    # Log the repository that is actually being fetched here; the previous
    # message interpolated `model_download_params`, which belongs to the LLM
    # download earlier in this script.
    debug_logger.info(f'开始下载PDF解析相关模型:{pdf_model_repo}')
    model_dir = snapshot_download(pdf_model_repo)
    # Expose the downloaded snapshot at the path the PDF parser expects via a
    # symbolic link, so the files stay in the download cache.
    subprocess.check_output(['ln', '-s', model_dir, pdf_models_path], text=True)
    debug_logger.info(f'PDF解析相关模型下载完毕!cache地址:{model_dir}, 软链接地址:{pdf_models_path}')


WorkerManager.THRESHOLD = 6000

app = Sanic("QAnything")
Expand Down
Loading

0 comments on commit 5f2e9dc

Please sign in to comment.