Try to speed up queries with prepared statements #483

domoritz · 2024-08-16T06:44:22Z

Mosaic executes a lot of very similar queries on the indexes. We could use prepared statements to speed up queries by avoiding the passing overhead of sql for every query.

domoritz · 2024-08-21T16:28:39Z

It looks like prepared statements at least in python are not faster than just running the queries over the index. It seems like DuckDB already figures out that it can cache. Maybe this query is just dominated by the runtime.

uv run --with duckdb --with pyarrow main.py
individual
--- 51.6200065612793 ms ---
--- 4.616975784301758 ms ---
--- 4.367828369140625 ms ---
--- 4.959821701049805 ms ---
--- 5.73420524597168 ms ---
--- 5.649328231811523 ms ---
--- 4.678964614868164 ms ---
--- 5.36799430847168 ms ---
--- 5.323886871337891 ms ---
--- 4.825115203857422 ms ---

uv run --with duckdb --with pyarrow main.py
prepared
--- 50.913095474243164 ms ---
--- 5.853891372680664 ms ---
--- 5.547046661376953 ms ---
--- 5.339860916137695 ms ---
--- 6.097078323364258 ms ---
--- 6.219625473022461 ms ---
--- 6.114959716796875 ms ---
--- 5.279064178466797 ms ---
--- 5.7220458984375 ms ---
--- 5.427837371826172 ms ---

import duckdb
import time

def setup():
    con = duckdb.connect(database='test.db')

    con.execute("""
    CREATE TABLE IF NOT EXISTS gaia AS -- compute u and v with natural earth projection
    WITH prep AS (
    SELECT
        radians((-l + 540) % 360 - 180) AS lambda,
        radians(b) AS phi,
        asin(sqrt(3)/2 * sin(phi)) AS t,
        t^2 AS t2,
        t2^3 AS t6,
        *
    FROM 'https://idl.uw.edu/mosaic-datasets/data/gaia-5m.parquet'
    WHERE parallax BETWEEN -5 AND 20 AND phot_g_mean_mag IS NOT NULL AND bp_rp IS NOT NULL
    )
    SELECT
    (1.340264 * lambda * cos(t)) / (sqrt(3)/2 * (1.340264 + (-0.081106 * 3 * t2) + (t6 * (0.000893 * 7 + 0.003796 * 9 * t2)))) AS u,
    t * (1.340264 + (-0.081106 * t2) + (t6 * (0.000893 + 0.003796 * t2))) AS v,
    * EXCLUDE('t', 't2', 't6')
    FROM prep
    """)

    con.execute("""
    CREATE TEMP TABLE IF NOT EXISTS cube_index_ac93cfd0 AS SELECT FLOOR(("bp_rp"::DOUBLE - -3.38848876953125) * 8.342301453972835::DOUBLE)::INTEGER + FLOOR((21.31503677368164 - "phot_g_mean_mag"::DOUBLE) * 8.621847966401786::DOUBLE)::INTEGER * 102 AS "index", COUNT(*)::INTEGER AS "density", FLOOR(28.5328481685884::DOUBLE * ("u" - -3.6273854635156497::DOUBLE))::INTEGER AS "active0", FLOOR(37.95470756049873::DOUBLE * ("v" - -1.3173598738027117::DOUBLE))::INTEGER AS "active1" FROM "gaia" WHERE ("bp_rp" BETWEEN -3.38848876953125 AND 8.71848201751709) AND ("phot_g_mean_mag" BETWEEN 2.8735146522521973 AND 21.31503677368164) GROUP BY "index", "active0", "active1"
    """)

    return con

con = setup()

template = 'SELECT "index", SUM("density")::DOUBLE AS "density" FROM "cube_index_ac93cfd0" WHERE (("active0" BETWEEN FLOOR(28.5328481685884::DOUBLE * ({0} - -3.6273854635156497::DOUBLE))::INTEGER AND FLOOR(28.5328481685884::DOUBLE * ({1} - -3.6273854635156497::DOUBLE))::INTEGER) AND ("active1" BETWEEN FLOOR(37.95470756049873::DOUBLE * ({2} - -1.3173598738027117::DOUBLE))::INTEGER AND FLOOR(37.95470756049873::DOUBLE * ({3} - -1.3173598738027117::DOUBLE))::INTEGER)) GROUP BY "index"'

print("individual")
for i in range(10):
    offset = float(i) / 100
    values = [-1.9976848558072506 + offset, 2.3131361065181917 + offset, -0.8958047880601472 + offset, 0.974845904922482 + offset]

    query = template.format(*values)
    start_time = time.time()
    con.execute(query).arrow()
    print("--- %s ms ---" % ((time.time() - start_time) * 1000))

print("prepared")
for i in range(10):
    offset = float(i) / 100
    values = [-1.9976848558072506 + offset, 2.3131361065181917 + offset, -0.8958047880601472 + offset, 0.974845904922482 + offset]

    query = template.format("?", "?", "?", "?")
    start_time = time.time()
    con.execute(query, values).arrow()
    print("--- %s ms ---" % ((time.time() - start_time) * 1000))

domoritz · 2024-08-21T16:37:38Z

Not much difference with a much smaller query

uv run --with duckdb --with pyarrow main_flights.py
individual
--- 52.39295959472656 ms ---
--- 1.6629695892333984 ms ---
--- 1.1830329895019531 ms ---
--- 1.3239383697509766 ms ---
--- 1.1157989501953125 ms ---
--- 1.262664794921875 ms ---
--- 1.2297630310058594 ms ---
--- 1.2099742889404297 ms ---
--- 1.1899471282958984 ms ---
--- 1.2128353118896484 ms ---

uv run --with duckdb --with pyarrow main_flights.py
prepared
--- 48.19607734680176 ms ---
--- 1.9650459289550781 ms ---
--- 1.8160343170166016 ms ---
--- 1.4908313751220703 ms ---
--- 1.4719963073730469 ms ---
--- 1.5170574188232422 ms ---
--- 1.592874526977539 ms ---
--- 1.3775825500488281 ms ---
--- 1.8668174743652344 ms ---
--- 1.6541481018066406 ms ---

import duckdb
import time

def setup():
    con = duckdb.connect(database='test.db')

    con.execute("""
    CREATE TABLE IF NOT EXISTS flights AS SELECT * FROM read_parquet('https://github.com/uwdata/mosaic/raw/main/data/flights-200k.parquet')
    """)

    con.execute("""
    CREATE TABLE IF NOT EXISTS cube_index_6c97d391 AS SELECT 0 + 1::DOUBLE * (FLOOR("time" / 1::DOUBLE)) AS "x1", 0 + 1::DOUBLE * (1 + FLOOR("time" / 1::DOUBLE)) AS "x2", COUNT(*)::INTEGER AS "y", FLOOR(2.16::DOUBLE * ("delay" - -60::DOUBLE))::INTEGER AS "active0" FROM "flights" AS "source" GROUP BY "x1", "x2", "active0"
    """)

    return con

con = setup()

template = 'SELECT "x1", "x2", SUM("y")::DOUBLE AS "y" FROM "cube_index_6c97d391" WHERE ("active0" BETWEEN FLOOR(2.16::DOUBLE * ({0} - -60::DOUBLE))::INTEGER AND FLOOR(2.16::DOUBLE * ({1} - -60::DOUBLE))::INTEGER) GROUP BY "x1", "x2"'

print("individual")
for i in range(10):
    offset = i
    values = [-17.77777777777778 + offset, 87.68518518518519 + offset]

    query = template.format(*values)
    start_time = time.time()
    con.execute(query).arrow()
    print("--- %s ms ---" % ((time.time() - start_time) * 1000))

print("prepared")
for i in range(10):
    offset = i
    values = [-17.77777777777778 + offset, 87.68518518518519 + offset]

    query = template.format("?", "?")
    start_time = time.time()
    con.execute(query, values).arrow()
    print("--- %s ms ---" % ((time.time() - start_time) * 1000))

Mytherin · 2024-08-22T08:46:44Z

By using con.execute you are not actually caching and re-using the prepared statement, so I would not expect any difference in performance. The query still needs to be parsed, bound and optimized fully for each query that is executed. You can use an explicit PREPARE in SQL, which gives me the following timings:

individual
--- 2.9261112213134766 ms ---
--- 2.0589828491210938 ms ---
--- 1.7421245574951172 ms ---
--- 1.5671253204345703 ms ---
--- 1.6980171203613281 ms ---
--- 1.6040802001953125 ms ---
--- 1.725912094116211 ms ---
--- 1.1591911315917969 ms ---
--- 1.5230178833007812 ms ---
--- 1.5270709991455078 ms ---
prepared
--- 0.9090900421142578 ms ---
--- 0.9322166442871094 ms ---
--- 0.8151531219482422 ms ---
--- 0.7300376892089844 ms ---
--- 0.720977783203125 ms ---
--- 0.7140636444091797 ms ---
--- 0.7319450378417969 ms ---
--- 0.701904296875 ms ---
--- 0.6818771362304688 ms ---
--- 0.7190704345703125 ms ---

(also pre-importing pyarrow removes the noise on the first query)

Full code:

import duckdb
import time
import pyarrow

def setup():
    con = duckdb.connect(database='test.db')

    con.execute("""
    CREATE TABLE IF NOT EXISTS flights AS SELECT * FROM read_parquet('https://github.com/uwdata/mosaic/raw/main/data/flights-200k.parquet')
    """)

    con.execute("""
    CREATE TABLE IF NOT EXISTS cube_index_6c97d391 AS SELECT 0 + 1::DOUBLE * (FLOOR("time" / 1::DOUBLE)) AS "x1", 0 + 1::DOUBLE * (1 + FLOOR("time" / 1::DOUBLE)) AS "x2", COUNT(*)::INTEGER AS "y", FLOOR(2.16::DOUBLE * ("delay" - -60::DOUBLE))::INTEGER AS "active0" FROM "flights" AS "source" GROUP BY "x1", "x2", "active0"
    """)

    return con

con = setup()

template = 'SELECT "x1", "x2", SUM("y")::DOUBLE AS "y" FROM "cube_index_6c97d391" WHERE ("active0" BETWEEN FLOOR(2.16::DOUBLE * ({0} - -60::DOUBLE))::INTEGER AND FLOOR(2.16::DOUBLE * ({1} - -60::DOUBLE))::INTEGER) GROUP BY "x1", "x2"'

print("individual")
for i in range(10):
    offset = i
    values = [-17.77777777777778 + offset, 87.68518518518519 + offset]

    query = template.format(*values)
    start_time = time.time()
    con.execute(query).arrow()
    print("--- %s ms ---" % ((time.time() - start_time) * 1000))

print("prepared")
query = template.format("?::DOUBLE", "?::DOUBLE")
con.execute('PREPARE prep AS ' + query)
for i in range(10):
    offset = i
    values = [-17.77777777777778 + offset, 87.68518518518519 + offset]

    start_time = time.time()
    con.execute('EXECUTE prep({0}, {1})'.format(*values)).arrow()
    print("--- %s ms ---" % ((time.time() - start_time) * 1000))

domoritz · 2024-09-01T00:50:59Z

Sweet. Thanks for noting my incorrect calls here.

Details of the script

Run with `uv run main_flights.py`

# /// script
# dependencies = [
#   "duckdb",
#   "pyarrow",
# ]
# ///

import duckdb
import time

def setup():
    con = duckdb.connect(database='test.db')

    con.execute("""
    CREATE TABLE IF NOT EXISTS flights AS SELECT * FROM read_parquet('https://github.com/uwdata/mosaic/raw/main/data/flights-200k.parquet')
    """)

    con.execute("""
    CREATE TABLE IF NOT EXISTS cube_index_6c97d391 AS SELECT 0 + 1::DOUBLE * (FLOOR("time" / 1::DOUBLE)) AS "x1", 0 + 1::DOUBLE * (1 + FLOOR("time" / 1::DOUBLE)) AS "x2", COUNT(*)::INTEGER AS "y", FLOOR(2.16::DOUBLE * ("delay" - -60::DOUBLE))::INTEGER AS "active0" FROM "flights" AS "source" GROUP BY "x1", "x2", "active0"
    """)

    return con

con = setup()

template = 'SELECT "x1", "x2", SUM("y")::DOUBLE AS "y" FROM "cube_index_6c97d391" WHERE ("active0" BETWEEN FLOOR(2.16::DOUBLE * ({0} - -60::DOUBLE))::INTEGER AND FLOOR(2.16::DOUBLE * ({1} - -60::DOUBLE))::INTEGER) GROUP BY "x1", "x2"'

# run once to warm caches
con.execute(template.format(-17.789123, 88.8234988)).arrow()


print("individual")
times = []
for i in range(20):
    offset = i
    values = [-17.77777777777778 + offset, 87.68518518518519 + offset]

    query = template.format(*values)
    start_time = time.time()
    con.execute(query).arrow()
    runtime = (time.time() - start_time) * 1000
    times.append(runtime)
    print("--- %s ms ---" % (runtime))

print("average: %s ms" % (sum(times[1:]) / len(times[1:])))
print("min: %s ms" % (min(times)))

print()
print("prepared")
query = template.format("?::DOUBLE", "?::DOUBLE")
times = []
con.execute('PREPARE prep AS ' + query)
for i in range(20):
    offset = i
    values = [-17.77777777777778 + offset, 87.68518518518519 + offset]

    query = template.format("?", "?")
    start_time = time.time()
    con.execute('EXECUTE prep({0}, {1})'.format(*values)).arrow()
    runtime = (time.time() - start_time) * 1000
    times.append(runtime)
    print("--- %s ms ---" % (runtime))

print("average: %s ms" % (sum(times[1:]) / len(times[1:])))
print("min: %s ms" % (min(times)))

Looks like prepared statements are worth it for many small queries

--- 1.5118122100830078 ms ---
--- 1.2290477752685547 ms ---
--- 1.241922378540039 ms ---
--- 1.1942386627197266 ms ---
--- 1.0569095611572266 ms ---
--- 1.125335693359375 ms ---
--- 1.0340213775634766 ms ---
--- 0.9911060333251953 ms ---
--- 1.0809898376464844 ms ---
--- 1.1119842529296875 ms ---
--- 1.165151596069336 ms ---
--- 1.1281967163085938 ms ---
--- 1.0540485382080078 ms ---
--- 1.135110855102539 ms ---
--- 1.3189315795898438 ms ---
--- 1.3990402221679688 ms ---
--- 1.2979507446289062 ms ---
--- 1.500844955444336 ms ---
--- 1.035928726196289 ms ---
--- 1.5711784362792969 ms ---
average: 1.1932598917107833 ms
min: 0.9911060333251953 ms

prepared
--- 1.1472702026367188 ms ---
--- 0.8311271667480469 ms ---
--- 0.8587837219238281 ms ---
--- 0.8690357208251953 ms ---
--- 0.9310245513916016 ms ---
--- 0.8649826049804688 ms ---
--- 1.1250972747802734 ms ---
--- 0.9782314300537109 ms ---
--- 1.3728141784667969 ms ---
--- 1.0807514190673828 ms ---
--- 0.8540153503417969 ms ---
--- 1.3549327850341797 ms ---
--- 1.1641979217529297 ms ---
--- 0.8919239044189453 ms ---
--- 1.049041748046875 ms ---
--- 0.8766651153564453 ms ---
--- 1.2028217315673828 ms ---
--- 0.9281635284423828 ms ---
--- 0.8931159973144531 ms ---
--- 0.865936279296875 ms ---
average: 0.9996138120952406 ms
min: 0.8311271667480469 ms

domoritz added the projects Project ideas for Mosaic label Aug 16, 2024

domoritz linked a pull request Aug 21, 2024 that will close this issue

Hacky implementation to test use of prepared statements #486

Draft

domoritz mentioned this issue Sep 16, 2024

Weekly Status Log: Query Interfaces cmudig/mosaic#3

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Try to speed up queries with prepared statements #483

Try to speed up queries with prepared statements #483

domoritz commented Aug 16, 2024

domoritz commented Aug 21, 2024 •

edited

Loading

domoritz commented Aug 21, 2024 •

edited

Loading

Mytherin commented Aug 22, 2024 •

edited

Loading

domoritz commented Sep 1, 2024

Try to speed up queries with prepared statements #483

Try to speed up queries with prepared statements #483

Comments

domoritz commented Aug 16, 2024

domoritz commented Aug 21, 2024 • edited Loading

domoritz commented Aug 21, 2024 • edited Loading

Mytherin commented Aug 22, 2024 • edited Loading

domoritz commented Sep 1, 2024

domoritz commented Aug 21, 2024 •

edited

Loading

domoritz commented Aug 21, 2024 •

edited

Loading

Mytherin commented Aug 22, 2024 •

edited

Loading