From 9366ee76714b6c41edf3590bcb76d6c26d44d36b Mon Sep 17 00:00:00 2001
From: Jyong <76649700+JohnJyong@users.noreply.github.com>
Date: Sat, 20 Jul 2024 01:29:25 +0800
Subject: [PATCH] update clean embedding cache query logic (#6483)

---
Review notes (commentary only: text placed between the three-dash separator
and the diffstat is not part of the commit message or of any hunk).

* Migration 6e957a32015b (revises fecff1c3da27) creates a non-unique index
  "created_at_idx" on embeddings.created_at; its downgrade drops that index.
  The same index is declared in Embedding.__table_args__ in
  api/models/dataset.py.
* api/models/model.py only renames the index declared on TraceAppConfig
  (tracing_app_config_app_id_idx -> trace_app_config_app_id_idx). No
  migration in this patch touches that index; presumably the new name
  matches what already exists in the database. TODO confirm.
* clean_embedding_cache_task now selects only the ids of at most 100 rows
  whose created_at is older than the cutoff, removes them with a single
  parameterised DELETE, commits, and leaves the loop once a batch comes back
  empty. In the lines shown, the removed code loaded full Embedding rows,
  deleted them one at a time, and had no loop exit other than the
  `except NotFound` branch.
* NOTE(review): the DELETE binds a Python tuple to ":embedding_ids" for the
  IN clause. Whether that expands correctly depends on the database driver;
  confirm, or consider SQLAlchemy's expanding bind parameters.
* NOTE(review): the batch query orders by created_at DESC, so each batch
  takes the newest of the expired rows first.

 ...5b_add_embedding_cache_created_at_index.py | 32 +++++++++++++++++++
 api/models/dataset.py                         |  3 +-
 api/models/model.py                           |  2 +-
 api/schedule/clean_embedding_cache_task.py    | 15 ++++++---
 4 files changed, 46 insertions(+), 6 deletions(-)
 create mode 100644 api/migrations/versions/6e957a32015b_add_embedding_cache_created_at_index.py

diff --git a/api/migrations/versions/6e957a32015b_add_embedding_cache_created_at_index.py b/api/migrations/versions/6e957a32015b_add_embedding_cache_created_at_index.py
new file mode 100644
index 00000000000000..7445f664cd75a1
--- /dev/null
+++ b/api/migrations/versions/6e957a32015b_add_embedding_cache_created_at_index.py
@@ -0,0 +1,32 @@
+"""add-embedding-cache-created_at_index
+
+Revision ID: 6e957a32015b
+Revises: fecff1c3da27
+Create Date: 2024-07-19 17:21:34.414705
+
+"""
+from alembic import op
+
+import models as models
+
+# revision identifiers, used by Alembic.
+revision = '6e957a32015b'
+down_revision = 'fecff1c3da27'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('embeddings', schema=None) as batch_op:
+        batch_op.create_index('created_at_idx', ['created_at'], unique=False)
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('embeddings', schema=None) as batch_op:
+        batch_op.drop_index('created_at_idx')
+
+    # ### end Alembic commands ###
diff --git a/api/models/dataset.py b/api/models/dataset.py
index d0be005a154878..34dde2dcef737f 100644
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -630,7 +630,8 @@ class Embedding(db.Model):
     __tablename__ = 'embeddings'
     __table_args__ = (
         db.PrimaryKeyConstraint('id', name='embedding_pkey'),
-        db.UniqueConstraint('model_name', 'hash', 'provider_name', name='embedding_hash_idx')
+        db.UniqueConstraint('model_name', 'hash', 'provider_name', name='embedding_hash_idx'),
+        db.Index('created_at_idx', 'created_at')
     )
 
     id = db.Column(StringUUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
diff --git a/api/models/model.py b/api/models/model.py
index 331bb91c29819f..396cd7ec6382c3 100644
--- a/api/models/model.py
+++ b/api/models/model.py
@@ -1383,7 +1383,7 @@ class TraceAppConfig(db.Model):
     __tablename__ = 'trace_app_config'
     __table_args__ = (
         db.PrimaryKeyConstraint('id', name='tracing_app_config_pkey'),
-        db.Index('tracing_app_config_app_id_idx', 'app_id'),
+        db.Index('trace_app_config_app_id_idx', 'app_id'),
     )
 
     id = db.Column(StringUUID, server_default=db.text('uuid_generate_v4()'))
diff --git a/api/schedule/clean_embedding_cache_task.py b/api/schedule/clean_embedding_cache_task.py
index f68c54600a6012..49b1be3a339b8a 100644
--- a/api/schedule/clean_embedding_cache_task.py
+++ b/api/schedule/clean_embedding_cache_task.py
@@ -2,6 +2,7 @@
 import time
 
 import click
+from sqlalchemy import text
 from werkzeug.exceptions import NotFound
 
 import app
@@ -18,12 +19,18 @@ def clean_embedding_cache_task():
     thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=clean_days)
     while True:
         try:
-            embeddings = db.session.query(Embedding).filter(Embedding.created_at < thirty_days_ago) \
+            embedding_ids = db.session.query(Embedding.id).filter(Embedding.created_at < thirty_days_ago) \
                 .order_by(Embedding.created_at.desc()).limit(100).all()
+            embedding_ids = [embedding_id[0] for embedding_id in embedding_ids]
         except NotFound:
             break
-        for embedding in embeddings:
-            db.session.delete(embedding)
-        db.session.commit()
+        if embedding_ids:
+            db.session.execute(text(
+                "DELETE FROM embeddings WHERE id in :embedding_ids"
+            ), {'embedding_ids': tuple(embedding_ids)})
+
+            db.session.commit()
+        else:
+            break
     end_at = time.perf_counter()
     click.echo(click.style('Cleaned embedding cache from db success latency: {}'.format(end_at - start_at), fg='green'))