-
Notifications
You must be signed in to change notification settings - Fork 57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Try to speed up queries with prepared statements #483
Comments
It looks like prepared statements at least in python are not faster than just running the queries over the index. It seems like DuckDB already figures out that it can cache. Maybe this query is just dominated by the runtime.
import duckdb
import time
def setup():
con = duckdb.connect(database='test.db')
con.execute("""
CREATE TABLE IF NOT EXISTS gaia AS -- compute u and v with natural earth projection
WITH prep AS (
SELECT
radians((-l + 540) % 360 - 180) AS lambda,
radians(b) AS phi,
asin(sqrt(3)/2 * sin(phi)) AS t,
t^2 AS t2,
t2^3 AS t6,
*
FROM 'https://idl.uw.edu/mosaic-datasets/data/gaia-5m.parquet'
WHERE parallax BETWEEN -5 AND 20 AND phot_g_mean_mag IS NOT NULL AND bp_rp IS NOT NULL
)
SELECT
(1.340264 * lambda * cos(t)) / (sqrt(3)/2 * (1.340264 + (-0.081106 * 3 * t2) + (t6 * (0.000893 * 7 + 0.003796 * 9 * t2)))) AS u,
t * (1.340264 + (-0.081106 * t2) + (t6 * (0.000893 + 0.003796 * t2))) AS v,
* EXCLUDE('t', 't2', 't6')
FROM prep
""")
con.execute("""
CREATE TEMP TABLE IF NOT EXISTS cube_index_ac93cfd0 AS SELECT FLOOR(("bp_rp"::DOUBLE - -3.38848876953125) * 8.342301453972835::DOUBLE)::INTEGER + FLOOR((21.31503677368164 - "phot_g_mean_mag"::DOUBLE) * 8.621847966401786::DOUBLE)::INTEGER * 102 AS "index", COUNT(*)::INTEGER AS "density", FLOOR(28.5328481685884::DOUBLE * ("u" - -3.6273854635156497::DOUBLE))::INTEGER AS "active0", FLOOR(37.95470756049873::DOUBLE * ("v" - -1.3173598738027117::DOUBLE))::INTEGER AS "active1" FROM "gaia" WHERE ("bp_rp" BETWEEN -3.38848876953125 AND 8.71848201751709) AND ("phot_g_mean_mag" BETWEEN 2.8735146522521973 AND 21.31503677368164) GROUP BY "index", "active0", "active1"
""")
return con
con = setup()
template = 'SELECT "index", SUM("density")::DOUBLE AS "density" FROM "cube_index_ac93cfd0" WHERE (("active0" BETWEEN FLOOR(28.5328481685884::DOUBLE * ({0} - -3.6273854635156497::DOUBLE))::INTEGER AND FLOOR(28.5328481685884::DOUBLE * ({1} - -3.6273854635156497::DOUBLE))::INTEGER) AND ("active1" BETWEEN FLOOR(37.95470756049873::DOUBLE * ({2} - -1.3173598738027117::DOUBLE))::INTEGER AND FLOOR(37.95470756049873::DOUBLE * ({3} - -1.3173598738027117::DOUBLE))::INTEGER)) GROUP BY "index"'
print("individual")
for i in range(10):
offset = float(i) / 100
values = [-1.9976848558072506 + offset, 2.3131361065181917 + offset, -0.8958047880601472 + offset, 0.974845904922482 + offset]
query = template.format(*values)
start_time = time.time()
con.execute(query).arrow()
print("--- %s ms ---" % ((time.time() - start_time) * 1000))
print("prepared")
for i in range(10):
offset = float(i) / 100
values = [-1.9976848558072506 + offset, 2.3131361065181917 + offset, -0.8958047880601472 + offset, 0.974845904922482 + offset]
query = template.format("?", "?", "?", "?")
start_time = time.time()
con.execute(query, values).arrow()
print("--- %s ms ---" % ((time.time() - start_time) * 1000)) |
Not much difference with a much smaller query
import duckdb
import time
def setup():
con = duckdb.connect(database='test.db')
con.execute("""
CREATE TABLE IF NOT EXISTS flights AS SELECT * FROM read_parquet('https://github.com/uwdata/mosaic/raw/main/data/flights-200k.parquet')
""")
con.execute("""
CREATE TABLE IF NOT EXISTS cube_index_6c97d391 AS SELECT 0 + 1::DOUBLE * (FLOOR("time" / 1::DOUBLE)) AS "x1", 0 + 1::DOUBLE * (1 + FLOOR("time" / 1::DOUBLE)) AS "x2", COUNT(*)::INTEGER AS "y", FLOOR(2.16::DOUBLE * ("delay" - -60::DOUBLE))::INTEGER AS "active0" FROM "flights" AS "source" GROUP BY "x1", "x2", "active0"
""")
return con
con = setup()
template = 'SELECT "x1", "x2", SUM("y")::DOUBLE AS "y" FROM "cube_index_6c97d391" WHERE ("active0" BETWEEN FLOOR(2.16::DOUBLE * ({0} - -60::DOUBLE))::INTEGER AND FLOOR(2.16::DOUBLE * ({1} - -60::DOUBLE))::INTEGER) GROUP BY "x1", "x2"'
print("individual")
for i in range(10):
offset = i
values = [-17.77777777777778 + offset, 87.68518518518519 + offset]
query = template.format(*values)
start_time = time.time()
con.execute(query).arrow()
print("--- %s ms ---" % ((time.time() - start_time) * 1000))
print("prepared")
for i in range(10):
offset = i
values = [-17.77777777777778 + offset, 87.68518518518519 + offset]
query = template.format("?", "?")
start_time = time.time()
con.execute(query, values).arrow()
print("--- %s ms ---" % ((time.time() - start_time) * 1000)) |
By using
(also pre-importing pyarrow removes the noise on the first query) Full code: import duckdb
import time
import pyarrow
def setup():
con = duckdb.connect(database='test.db')
con.execute("""
CREATE TABLE IF NOT EXISTS flights AS SELECT * FROM read_parquet('https://github.com/uwdata/mosaic/raw/main/data/flights-200k.parquet')
""")
con.execute("""
CREATE TABLE IF NOT EXISTS cube_index_6c97d391 AS SELECT 0 + 1::DOUBLE * (FLOOR("time" / 1::DOUBLE)) AS "x1", 0 + 1::DOUBLE * (1 + FLOOR("time" / 1::DOUBLE)) AS "x2", COUNT(*)::INTEGER AS "y", FLOOR(2.16::DOUBLE * ("delay" - -60::DOUBLE))::INTEGER AS "active0" FROM "flights" AS "source" GROUP BY "x1", "x2", "active0"
""")
return con
con = setup()
template = 'SELECT "x1", "x2", SUM("y")::DOUBLE AS "y" FROM "cube_index_6c97d391" WHERE ("active0" BETWEEN FLOOR(2.16::DOUBLE * ({0} - -60::DOUBLE))::INTEGER AND FLOOR(2.16::DOUBLE * ({1} - -60::DOUBLE))::INTEGER) GROUP BY "x1", "x2"'
print("individual")
for i in range(10):
offset = i
values = [-17.77777777777778 + offset, 87.68518518518519 + offset]
query = template.format(*values)
start_time = time.time()
con.execute(query).arrow()
print("--- %s ms ---" % ((time.time() - start_time) * 1000))
print("prepared")
query = template.format("?::DOUBLE", "?::DOUBLE")
con.execute('PREPARE prep AS ' + query)
for i in range(10):
offset = i
values = [-17.77777777777778 + offset, 87.68518518518519 + offset]
start_time = time.time()
con.execute('EXECUTE prep({0}, {1})'.format(*values)).arrow()
print("--- %s ms ---" % ((time.time() - start_time) * 1000)) |
Sweet. Thanks for noting my incorrect calls here. Details of the scriptRun with `uv run main_flights.py`# /// script
# dependencies = [
# "duckdb",
# "pyarrow",
# ]
# ///
import duckdb
import time
def setup():
con = duckdb.connect(database='test.db')
con.execute("""
CREATE TABLE IF NOT EXISTS flights AS SELECT * FROM read_parquet('https://github.com/uwdata/mosaic/raw/main/data/flights-200k.parquet')
""")
con.execute("""
CREATE TABLE IF NOT EXISTS cube_index_6c97d391 AS SELECT 0 + 1::DOUBLE * (FLOOR("time" / 1::DOUBLE)) AS "x1", 0 + 1::DOUBLE * (1 + FLOOR("time" / 1::DOUBLE)) AS "x2", COUNT(*)::INTEGER AS "y", FLOOR(2.16::DOUBLE * ("delay" - -60::DOUBLE))::INTEGER AS "active0" FROM "flights" AS "source" GROUP BY "x1", "x2", "active0"
""")
return con
con = setup()
template = 'SELECT "x1", "x2", SUM("y")::DOUBLE AS "y" FROM "cube_index_6c97d391" WHERE ("active0" BETWEEN FLOOR(2.16::DOUBLE * ({0} - -60::DOUBLE))::INTEGER AND FLOOR(2.16::DOUBLE * ({1} - -60::DOUBLE))::INTEGER) GROUP BY "x1", "x2"'
# run once to warm caches
con.execute(template.format(-17.789123, 88.8234988)).arrow()
print("individual")
times = []
for i in range(20):
offset = i
values = [-17.77777777777778 + offset, 87.68518518518519 + offset]
query = template.format(*values)
start_time = time.time()
con.execute(query).arrow()
runtime = (time.time() - start_time) * 1000
times.append(runtime)
print("--- %s ms ---" % (runtime))
print("average: %s ms" % (sum(times[1:]) / len(times[1:])))
print("min: %s ms" % (min(times)))
print()
print("prepared")
query = template.format("?::DOUBLE", "?::DOUBLE")
times = []
con.execute('PREPARE prep AS ' + query)
for i in range(20):
offset = i
values = [-17.77777777777778 + offset, 87.68518518518519 + offset]
query = template.format("?", "?")
start_time = time.time()
con.execute('EXECUTE prep({0}, {1})'.format(*values)).arrow()
runtime = (time.time() - start_time) * 1000
times.append(runtime)
print("--- %s ms ---" % (runtime))
print("average: %s ms" % (sum(times[1:]) / len(times[1:])))
print("min: %s ms" % (min(times))) Looks like prepared statements are worth it for many small queries
|
Mosaic executes a lot of very similar queries on the indexes. We could use prepared statements to speed up queries by avoiding the passing overhead of sql for every query.
The text was updated successfully, but these errors were encountered: