-
Notifications
You must be signed in to change notification settings - Fork 748
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add IVF bench for PgVector extension #512
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,3 +62,55 @@ def get_memory_usage(self): | |
|
||
def __str__(self):
    """Return a label listing the m, ef_construction and ef_search settings."""
    m = self._m
    ef_construction = self._ef_construction
    ef_search = self._ef_search
    return f"PGVector(m={m}, ef_construction={ef_construction}, ef_search={ef_search})"
|
||
|
||
class PGVectorIVF(BaseANN): | ||
Review comment: If this is to be added, I'd recommend using the original implementation, though adding the binary load found in #488. Reply: I wasn't aware it was part of the ann_benchmark workload. It makes sense to bring back the original implementation, but I don't know why it was removed in the first place; I am trying to learn the tradeoffs presented by both algorithm implementations and find it extremely useful. |
||
def __init__(self, metric, n_list):
    """Record the benchmark settings and pick the search statement.

    Args:
        metric: "angular" (ordered with the ``<=>`` operator) or
            "euclidean" (ordered with the ``<->`` operator).
        n_list: Stored as ``self._n_list`` for use when the index is built.

    Raises:
        RuntimeError: If ``metric`` is neither "angular" nor "euclidean".
    """
    self._metric = metric
    self._n_list = n_list
    self._cur = None  # no database cursor until one is assigned later

    # Both statements differ only in the distance operator used for ordering.
    if metric == "angular":
        distance_operator = "<=>"
    elif metric == "euclidean":
        distance_operator = "<->"
    else:
        raise RuntimeError(f"unknown metric {metric}")

    self._query = "SELECT id FROM items ORDER BY embedding " + distance_operator + " %s LIMIT %s"
|
||
def fit(self, X):
    """Load ``X`` into PostgreSQL and build an ivfflat index over it.

    Starts the local PostgreSQL service, recreates the ``items`` table,
    bulk-loads every row of ``X`` through a binary COPY, and creates an
    ivfflat index with ``self._n_list`` lists. The cursor is then stored on
    ``self._cur`` so later calls can reuse the same connection.

    Args:
        X: Two-dimensional array-like. ``X.shape[1]`` is used as the vector
            dimension and iterating ``X`` yields one embedding per row.

    Raises:
        RuntimeError: If ``self._metric`` is not "angular" or "euclidean".
        subprocess.CalledProcessError: If the service command exits non-zero.
    """
    # Validate the metric up front so an unsupported value fails before the
    # table is dropped and the (potentially large) dataset is copied.
    if self._metric == "angular":
        ops_class = "vector_cosine_ops"
    elif self._metric == "euclidean":
        ops_class = "vector_l2_ops"
    else:
        raise RuntimeError(f"unknown metric {self._metric}")

    # Argument list instead of a shell string: no shell is needed for a
    # fixed command, and it avoids shell parsing entirely.
    subprocess.run(["service", "postgresql", "start"], check=True, stdout=sys.stdout, stderr=sys.stderr)
    conn = psycopg.connect(user="ann", password="ann", dbname="ann", autocommit=True)
    pgvector.psycopg.register_vector(conn)
    cur = conn.cursor()
    cur.execute("DROP TABLE IF EXISTS items")
    # %d rejects non-integer values, so only a number can reach the DDL text.
    cur.execute("CREATE TABLE items (id int, embedding vector(%d))" % X.shape[1])
    cur.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN")
    print("copying data...")
    with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy:
        copy.set_types(["int4", "vector"])
        for i, embedding in enumerate(X):
            copy.write_row((i, embedding))
    print("creating index...")
    cur.execute(
        "CREATE INDEX ON items USING ivfflat (embedding %s) WITH (lists = %d)" % (ops_class, self._n_list)
    )
    print("done!")
    self._cur = cur
|
||
def query(self, v, n):
    """Run the stored search statement for ``v`` and return the matching ids.

    Args:
        v: Query vector, passed as the first statement parameter.
        n: Passed as the second statement parameter (the LIMIT value).

    Returns:
        A list with the single column of each fetched row, in result order.
    """
    self._cur.execute(self._query, (v, n), binary=True, prepare=True)
    rows = self._cur.fetchall()
    # Each row is a one-element tuple; unpack it to get the bare id.
    return [row_id for (row_id,) in rows]
|
||
def get_memory_usage(self):
    """Report the size of the ``items_embedding_idx`` relation.

    Returns:
        0 when no cursor has been stored on the instance; otherwise the
        result of ``pg_relation_size('items_embedding_idx')`` divided by 1024.
    """
    cursor = self._cur
    if cursor is not None:
        cursor.execute("SELECT pg_relation_size('items_embedding_idx')")
        row = cursor.fetchone()
        return row[0] / 1024
    return 0
|
||
def __str__(self):
    """Return a label identifying this configuration by its ``n_list`` value."""
    n_list = self._n_list
    return f"PGVectorIVF(n_list={n_list})"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
renamed only to make it clear while comparing with ivfflat.