Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(vdb): add HNSW vector index for TiDB vector store with TiFlash #12043

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/expose_service_ports.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ yq eval '.services["pgvecto-rs"].ports += ["5431:5432"]' -i docker/docker-compos
yq eval '.services["elasticsearch"].ports += ["9200:9200"]' -i docker/docker-compose.yaml
yq eval '.services.couchbase-server.ports += ["8091-8096:8091-8096"]' -i docker/docker-compose.yaml
yq eval '.services.couchbase-server.ports += ["11210:11210"]' -i docker/docker-compose.yaml
yq eval '.services.tidb.ports += ["4000:4000"]' -i docker/docker-compose.yaml
yq eval '.services.tidb.ports += ["4000:4000"]' -i docker/tidb/docker-compose.yaml

echo "Ports exposed for sandbox, weaviate, tidb, qdrant, chroma, milvus, pgvector, pgvecto-rs, elasticsearch, couchbase"
14 changes: 12 additions & 2 deletions .github/workflows/vdb-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,15 @@ jobs:
- name: Expose Service Ports
run: sh .github/workflows/expose_service_ports.sh

- name: Set up Vector Stores (TiDB, Weaviate, Qdrant, PGVector, Milvus, PgVecto-RS, Chroma, MyScale, ElasticSearch, Couchbase)
- name: Set up Vector Store (TiDB)
uses: hoverkraft-tech/[email protected]
with:
compose-file: docker/tidb/docker-compose.yaml
services: |
tidb
tiflash

- name: Set up Vector Stores (Weaviate, Qdrant, PGVector, Milvus, PgVecto-RS, Chroma, MyScale, ElasticSearch, Couchbase)
uses: hoverkraft-tech/[email protected]
with:
compose-file: |
Expand All @@ -67,7 +75,9 @@ jobs:
pgvector
chroma
elasticsearch
tidb

- name: Check TiDB Ready
run: poetry run -C api python api/tests/integration_tests/vdb/tidb_vector/check_tiflash_ready.py

- name: Test Vector Stores
run: poetry run -C api bash dev/pytest/pytest_vdb.sh
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ docker/volumes/db/data/*
docker/volumes/redis/data/*
docker/volumes/weaviate/*
docker/volumes/qdrant/*
docker/tidb/volumes/*
docker/volumes/etcd/*
docker/volumes/minio/*
docker/volumes/milvus/*
Expand Down
43 changes: 29 additions & 14 deletions api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@

import sqlalchemy
from pydantic import BaseModel, model_validator
from sqlalchemy import JSON, TEXT, Column, DateTime, String, Table, create_engine, insert
from sqlalchemy import JSON, TEXT, Column, DateTime, Float, Integer, String, Table, bindparam, create_engine, insert
from sqlalchemy import text as sql_text
from sqlalchemy.orm import Session, declarative_base

from configs import dify_config
from core.rag.datasource.vdb.field import Field
from core.rag.datasource.vdb.vector_base import BaseVector
from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory
from core.rag.datasource.vdb.vector_type import VectorType
Expand Down Expand Up @@ -54,14 +55,14 @@ def _table(self, dim: int) -> Table:
return Table(
self._collection_name,
self._orm_base.metadata,
Column("id", String(36), primary_key=True, nullable=False),
Column(Field.PRIMARY_KEY.value, String(36), primary_key=True, nullable=False),
Column(
"vector",
Field.VECTOR.value,
VectorType(dim),
nullable=False,
comment="" if self._distance_func is None else f"hnsw(distance={self._distance_func})",
),
Column("text", TEXT, nullable=False),
Column(Field.TEXT_KEY.value, TEXT, nullable=False),
Column("meta", JSON, nullable=False),
Column("create_time", DateTime, server_default=sqlalchemy.text("CURRENT_TIMESTAMP")),
Column(
Expand Down Expand Up @@ -104,14 +105,14 @@ def _create_collection(self, dimension: int):
text TEXT NOT NULL,
meta JSON NOT NULL,
doc_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.doc_id'))) STORED,
KEY (doc_id),
vector VECTOR<FLOAT>({dimension}) NOT NULL COMMENT "hnsw(distance={self._distance_func})",
create_time DATETIME DEFAULT CURRENT_TIMESTAMP,
update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
KEY (doc_id),
VECTOR INDEX idx_vector ((VEC_COSINE_DISTANCE(vector))) USING HNSW
);
""")
session.execute(create_statement)
# tidb vector not support 'CREATE/ADD INDEX' now
session.commit()
redis_client.set(collection_exist_cache_key, 1, ex=3600)

Expand Down Expand Up @@ -195,22 +196,36 @@ def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Doc

docs = []
if self._distance_func == "l2":
tidb_func = "Vec_l2_distance"
tidb_dist_func = "VEC_L2_DISTANCE"
elif self._distance_func == "cosine":
tidb_func = "Vec_Cosine_distance"
tidb_dist_func = "VEC_COSINE_DISTANCE"
else:
tidb_func = "Vec_Cosine_distance"
tidb_dist_func = "VEC_COSINE_DISTANCE"

with Session(self._engine) as session:
select_statement = sql_text(
f"""SELECT meta, text, distance FROM (
SELECT meta, text, {tidb_func}(vector, "{query_vector_str}") as distance
SELECT
meta,
text,
{tidb_dist_func}(vector, :query_vector_str) AS distance
FROM {self._collection_name}
ORDER BY distance
LIMIT {top_k}
) t WHERE distance < {distance};"""
LIMIT :top_k
) t WHERE distance <= :distance;"""
).bindparams(
bindparam("query_vector_str", type_=String),
bindparam("distance", type_=Float),
bindparam("top_k", type_=Integer),
)
res = session.execute(
select_statement,
params={
"query_vector_str": query_vector_str,
"distance": distance,
"top_k": top_k,
},
)
res = session.execute(select_statement)
results = [(row[0], row[1], row[2]) for row in res]
for meta, text, distance in results:
metadata = json.loads(meta)
Expand Down
59 changes: 59 additions & 0 deletions api/tests/integration_tests/vdb/tidb_vector/check_tiflash_ready.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import time

import pymysql


def check_tiflash_ready() -> bool:
try:
connection = pymysql.connect(
host="localhost",
port=4000,
user="root",
password="",
)

with connection.cursor() as cursor:
# Doc reference:
# https://docs.pingcap.com/zh/tidb/stable/information-schema-cluster-hardware
select_tiflash_query = """
SELECT * FROM information_schema.cluster_hardware
WHERE TYPE='tiflash'
LIMIT 1;
"""
cursor.execute(select_tiflash_query)
result = cursor.fetchall()
return result is not None and len(result) > 0
except Exception as e:
print(f"TiFlash is not ready. Exception: {e}")
return False
finally:
if connection:
connection.close()


def main():
max_attempts = 30
retry_interval_seconds = 2
is_tiflash_ready = False
for attempt in range(max_attempts):
try:
is_tiflash_ready = check_tiflash_ready()
except Exception as e:
print(f"TiFlash is not ready. Exception: {e}")
is_tiflash_ready = False

if is_tiflash_ready:
break
else:
print(f"Attempt {attempt + 1} failed,retry in {retry_interval_seconds} seconds...")
time.sleep(retry_interval_seconds)

if is_tiflash_ready:
print("TiFlash is ready in TiDB.")
else:
print(f"TiFlash is not ready in TiDB after {max_attempts} attempting checks.")
exit(1)


if __name__ == "__main__":
main()
10 changes: 0 additions & 10 deletions docker/docker-compose-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -207,16 +207,6 @@ services:
- '${EXPOSE_NGINX_PORT:-80}:${NGINX_PORT:-80}'
- '${EXPOSE_NGINX_SSL_PORT:-443}:${NGINX_SSL_PORT:-443}'

# The TiDB vector store.
# For production use, please refer to https://github.com/pingcap/tidb-docker-compose
tidb:
image: pingcap/tidb:v8.4.0
profiles:
- tidb
command:
- --store=unistore
restart: always

# The Weaviate vector store.
weaviate:
image: semitechnologies/weaviate:1.19.0
Expand Down
10 changes: 0 additions & 10 deletions docker/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -595,16 +595,6 @@ services:
- '${EXPOSE_NGINX_PORT:-80}:${NGINX_PORT:-80}'
- '${EXPOSE_NGINX_SSL_PORT:-443}:${NGINX_SSL_PORT:-443}'

# The TiDB vector store.
# For production use, please refer to https://github.com/pingcap/tidb-docker-compose
tidb:
image: pingcap/tidb:v8.4.0
profiles:
- tidb
command:
- --store=unistore
restart: always

# The Weaviate vector store.
weaviate:
image: semitechnologies/weaviate:1.19.0
Expand Down
4 changes: 4 additions & 0 deletions docker/tidb/config/pd.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# PD Configuration File reference:
# https://docs.pingcap.com/tidb/stable/pd-configuration-file#pd-configuration-file
[replication]
max-replicas = 1
13 changes: 13 additions & 0 deletions docker/tidb/config/tiflash-learner.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# TiFlash tiflash-learner.toml Configuration File reference:
# https://docs.pingcap.com/tidb/stable/tiflash-configuration#configure-the-tiflash-learnertoml-file

log-file = "/logs/tiflash_tikv.log"

[server]
engine-addr = "tiflash:4030"
addr = "0.0.0.0:20280"
advertise-addr = "tiflash:20280"
status-addr = "tiflash:20292"

[storage]
data-dir = "/data/flash"
19 changes: 19 additions & 0 deletions docker/tidb/config/tiflash.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# TiFlash tiflash.toml Configuration File reference:
# https://docs.pingcap.com/tidb/stable/tiflash-configuration#configure-the-tiflashtoml-file

listen_host = "0.0.0.0"
path = "/data"

[flash]
tidb_status_addr = "tidb:10080"
service_addr = "tiflash:4030"

[flash.proxy]
config = "/tiflash-learner.toml"

[logger]
errorlog = "/logs/tiflash_error.log"
log = "/logs/tiflash.log"

[raft]
pd_addr = "pd0:2379"
62 changes: 62 additions & 0 deletions docker/tidb/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
services:
pd0:
image: pingcap/pd:v8.5.0
ports:
- "2379"
volumes:
- ./config/pd.toml:/pd.toml:ro
- ./volumes/data:/data
- ./volumes/logs:/logs
command:
- --name=pd0
- --client-urls=http://0.0.0.0:2379
- --peer-urls=http://0.0.0.0:2380
- --advertise-client-urls=http://pd0:2379
- --advertise-peer-urls=http://pd0:2380
- --initial-cluster=pd0=http://pd0:2380
- --data-dir=/data/pd
- --config=/pd.toml
- --log-file=/logs/pd.log
restart: on-failure
tikv:
image: pingcap/tikv:v8.5.0
volumes:
- ./volumes/data:/data
- ./volumes/logs:/logs
command:
- --addr=0.0.0.0:20160
- --advertise-addr=tikv:20160
- --status-addr=tikv:20180
- --data-dir=/data/tikv
- --pd=pd0:2379
- --log-file=/logs/tikv.log
depends_on:
- "pd0"
restart: on-failure
tidb:
image: pingcap/tidb:v8.5.0
ports:
- "4000:4000"
volumes:
- ./volumes/logs:/logs
command:
- --advertise-address=tidb
- --store=tikv
- --path=pd0:2379
- --log-file=/logs/tidb.log
depends_on:
- "tikv"
restart: on-failure
tiflash:
image: pingcap/tiflash:v8.5.0
volumes:
- ./config/tiflash.toml:/tiflash.toml:ro
- ./config/tiflash-learner.toml:/tiflash-learner.toml:ro
- ./volumes/data:/data
- ./volumes/logs:/logs
command:
- --config=/tiflash.toml
depends_on:
- "tikv"
- "tidb"
restart: on-failure
Loading