diff --git a/.cargo/config b/.cargo/config new file mode 100644 index 0000000..59c989e --- /dev/null +++ b/.cargo/config @@ -0,0 +1,11 @@ +[target.x86_64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..eba99f4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,48 @@ +--- +name: Bug report +about: An issue with Rust ConnectorX or Python ConnectorX +title: '' +labels: 'bug' +assignees: '' +--- + +#### What language are you using? + +Replace this text with **Rust** or **Python**. + +#### What version are you using? + +Replace this text with the version. + +#### What database are you using? + +e.g. PostgreSQL, MySQL + +#### What dataframe are you using? + +e.g. Pandas, Arrow + +#### Can you describe your bug? + +Give a high-level description of the bug. + +#### What are the steps to reproduce the behavior? + +If possible, please include a **minimal, simple** example including: + +##### Database setup if the error only happens on specific data or data type + +Table schema and example data + +##### Example query / code + +``` +your +code +goes +here +``` + +#### What is the error? + +Show the error result here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..2ef8f68 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,12 @@ +--- +name: Feature request +about: Suggest a new feature for connectorx +title: '' +labels: 'feature' +assignees: '' +--- + + +#### Describe your feature request + +Please describe the behavior you want and the motivation. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..a437c9b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,15 @@ +--- +name: Question +about: Ask a question about connectorx +title: '' +labels: '' +assignees: '' +--- + +#### Try Discussion? + +You can ask questions in [discussions](https://github.com/sfu-db/connector-x/discussions/categories/q-a). + +#### Other + +Feel free to ask here if you think it's more suitable. 
diff --git a/.github/config/db1.json b/.github/config/db1.json new file mode 100644 index 0000000..648955c --- /dev/null +++ b/.github/config/db1.json @@ -0,0 +1,15 @@ +{ + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://postgres:5432/postgres", + "username": "postgres", + "password": "postgres", + "costParams": { + "join": 10.0, + "agg": 20.0, + "sort": 20.0, + "trans": 3.0 + }, + "dialect": "postgres", + "cardEstType": "postgres", + "partitionType": "postgres" +} \ No newline at end of file diff --git a/.github/config/db2.json b/.github/config/db2.json new file mode 100644 index 0000000..648955c --- /dev/null +++ b/.github/config/db2.json @@ -0,0 +1,15 @@ +{ + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://postgres:5432/postgres", + "username": "postgres", + "password": "postgres", + "costParams": { + "join": 10.0, + "agg": 20.0, + "sort": 20.0, + "trans": 3.0 + }, + "dialect": "postgres", + "cardEstType": "postgres", + "partitionType": "postgres" +} \ No newline at end of file diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..ebd950d --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,77 @@ +name: benchmark + +on: + workflow_dispatch: + push: + branches: + - prerelease + - main + pull_request: + branches: + - main + +jobs: + benchmark: + runs-on: [self-hosted, linux, x64, connectorx-benchmark] + steps: + - uses: actions/checkout@v2 + + - name: Install tools + run: | + apt-get update + apt-get install -y curl postgresql-client build-essential python3-dev python3-pip pkg-config libssl-dev git sqlite3 libsqlite3-dev mysql-client libmysqlclient-dev libkrb5-dev libclang-dev + env: + DEBIAN_FRONTEND: noninteractive + + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: rustfmt + default: true + + - name: Install other dependent tools + run: | + pip3 install poetry + if [ ! 
-f "$HOME/.cargo/bin/just" ]; then curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to ~/.cargo/bin; fi + + - name: Install python dependencies + run: just bootstrap-python + + - name: Run benchmarks + run: just benchmark-report + env: + POSTGRES_URL: ${{ secrets.POSTGRES_URL }} + MYSQL_URL: ${{ secrets.MYSQL_URL }} + TPCH_TABLE: lineitem + + - name: Ignore git safe directory error + run: git config --global --add safe.directory /tmp/github-runner-connectorx/connector-x/connector-x + + - name: Show benchmark result for pull request + if: ${{ github.event_name == 'pull_request'}} + uses: rhysd/github-action-benchmark@v1 + with: + name: "ConnectorX TPC-H Scale@1 Benchmarks" + tool: "pytest" + output-file-path: benchmark.json + github-token: ${{ secrets.GITHUB_TOKEN }} + auto-push: false + save-data-file: false + fail-threshold: "200%" + comment-always: true + fail-on-alert: true + + - name: Store benchmark result for push event + if: ${{ github.event_name == 'push'}} + uses: rhysd/github-action-benchmark@v1 + with: + name: "ConnectorX TPC-H Scale@1 Benchmarks" + tool: "pytest" + output-file-path: benchmark.json + github-token: ${{ secrets.GITHUB_TOKEN }} + auto-push: true + alert-threshold: "100%" + fail-threshold: "200%" + comment-always: true + fail-on-alert: true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..a425409 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,259 @@ +name: ci + +on: + pull_request: + branches: + - main + - prerelease + push: + branches: + - main + +jobs: + # rust: + # runs-on: ubuntu-latest + # container: ubuntu:20.04 + # services: + # # Label used to access the service container + # postgres: + # # Docker Hub image + # image: postgres + # env: + # POSTGRES_PASSWORD: postgres + # # Set health checks to wait until postgres has started + # options: >- + # --health-cmd pg_isready + # --health-interval 10s + # --health-timeout 5s + # --health-retries 5 + # # mysql + # mysql: + # image: ghcr.io/wangxiaoying/mysql:latest + # env: + # MYSQL_DATABASE: mysql + # MYSQL_ROOT_PASSWORD: mysql + # LANG: C.UTF-8 + # ports: + # - 3306:3306 + # options: >- + # --health-cmd "mysqladmin ping" + # --health-interval 10s + # --health-timeout 10s + # --health-retries 5 + # mssql: + # image: mcr.microsoft.com/mssql/server:2019-latest + # env: + # ACCEPT_EULA: y + # SA_PASSWORD: mssql!Password + # ports: + # - 1433:1433 + # options: >- + # --health-cmd "/opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P \"$SA_PASSWORD\" -Q 'SELECT 1' || exit 1" + # --health-interval 10s + # --health-timeout 5s + # --health-retries 20 + # steps: + # - uses: actions/checkout@v2 + + # - name: Install tools + # run: | + # apt-get update + # apt-get install -y curl postgresql-client build-essential pkg-config libssl-dev git sqlite3 libsqlite3-dev mysql-client python3 python3-pip libicu66 libkrb5-dev libclang-dev + # pip3 install mssql-cli + # pip3 install cli-helpers==2.2.0 + # ln -s /usr/bin/python3 /usr/bin/python + # echo "Cache Version ${{ secrets.CACHE_VERSION }}" + # env: + # DEBIAN_FRONTEND: noninteractive + + # - name: Install Rust + # uses: actions-rs/toolchain@v1 + # with: + # toolchain: stable + # components: rustfmt + # default: true + + # - uses: actions/cache@v2 + # with: + # path: | + # ~/.cargo/bin/ + # ~/.cargo/registry/index/ + # ~/.cargo/registry/cache/ + # ~/.cargo/git/db/ + # target/ + # key: ${{ runner.os }}-cargo-${{ secrets.CACHE_VERSION }}-${{ hashFiles('**/Cargo.lock') }} + + # - name: 
Install other dependent tools + # run: | + # if [ ! -f "$HOME/.cargo/bin/just" ]; then curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to ~/.cargo/bin; fi + + # - name: Seed the database + # run: just seed-db + # env: + # POSTGRES_URL: "postgresql://postgres:postgres@postgres:5432/postgres" + # SQLITE_URL: "sqlite:///tmp/test.db" + # MYSQL_HOST: mysql + # MYSQL_PORT: 3306 + # MYSQL_DB: mysql + # MYSQL_USER: root + # MYSQL_PASSWORD: mysql + # MSSQL_HOST: mssql + # MSSQL_PORT: 1433 + # MSSQL_DB: tempdb + # MSSQL_USER: sa + # MSSQL_PASSWORD: mssql!Password + + # - name: Is the code formatted? + # uses: actions-rs/cargo@v1 + # with: + # command: fmt + # args: --all -- --check -q + + # - name: Clippy linting + # uses: actions-rs/cargo@v1 + # with: + # command: clippy + # args: --features all + + # - name: Check each feature gate + # run: just test-feature-gate + + # - name: Run tests + # run: cargo clean && just test + # env: + # POSTGRES_URL: "postgresql://postgres:postgres@postgres:5432/postgres" + # SQLITE_URL: "sqlite:///tmp/test.db" + # MYSQL_URL: "mysql://root:mysql@mysql:3306/mysql" + # MSSQL_URL: "mssql://sa:mssql!Password@mssql:1433/tempdb" + + # - name: Test build docs + # uses: actions-rs/cargo@v1 + # with: + # command: doc + # args: --no-deps --features all + + python: + runs-on: ubuntu-latest + container: ubuntu:20.04 + services: + # Label used to access the service container + postgres: + # Docker Hub image + image: postgres + env: + POSTGRES_PASSWORD: postgres + # Set health checks to wait until postgres has started + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + # mysql + mysql: + image: ghcr.io/wangxiaoying/mysql:latest + env: + MYSQL_DATABASE: mysql + MYSQL_ROOT_PASSWORD: mysql + LANG: C.UTF-8 + ports: + - 3306:3306 + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 10s + --health-retries 5 + mssql: + image: mcr.microsoft.com/mssql/server:2019-latest + env: + ACCEPT_EULA: y + SA_PASSWORD: mssql!Password + ports: + - 1433:1433 + options: >- + --health-cmd "/opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P \"$SA_PASSWORD\" -Q 'SELECT 1' || exit 1" + --health-interval 10s + --health-timeout 5s + --health-retries 20 + + steps: + - uses: actions/checkout@v2 + + - name: Install tools + run: | + apt-get update + apt-get install -y curl postgresql-client build-essential python3-dev python3-pip pkg-config libssl-dev git sqlite3 libsqlite3-dev mysql-client libmysqlclient-dev python3 python3-pip libicu66 libkrb5-dev libclang-dev + pip3 install mssql-cli + pip3 install cli-helpers==2.2.0 + ln -s /usr/bin/python3 /usr/bin/python + env: + DEBIAN_FRONTEND: noninteractive + + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: rustfmt + default: true + + - uses: actions/setup-java@v3 + with: + distribution: "temurin" + java-version: "17" + + - uses: actions/cache@v2 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-python-${{ secrets.CACHE_VERSION }}-${{ hashFiles('**/Cargo.lock') }} + + - name: Install other dependent tools + run: | + pip3 install poetry + if [ ! 
-f "$HOME/.cargo/bin/just" ]; then curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to ~/.cargo/bin; fi + + - name: Seed the database + run: just seed-db + env: + POSTGRES_URL: "postgresql://postgres:postgres@postgres:5432/postgres" + SQLITE_URL: "sqlite:///tmp/test.db" + MYSQL_HOST: mysql + MYSQL_PORT: 3306 + MYSQL_DB: mysql + MYSQL_USER: root + MYSQL_PASSWORD: mysql + MSSQL_HOST: mssql + MSSQL_PORT: 1433 + MSSQL_DB: tempdb + MSSQL_USER: sa + MSSQL_PASSWORD: mssql!Password + + - name: Clippy linting + uses: actions-rs/cargo@v1 + with: + command: clippy + args: --features all --all + + - name: Cache venv + uses: actions/cache@v2 + with: + path: ~/.cache/pypoetry/virtualenvs + key: ${{ runner.os }}-venv-${{ secrets.CACHE_VERSION }}-${{ hashFiles('connectorx-python/poetry.lock') }} + + - name: Install python dependencies + run: just bootstrap-python + + - name: Test python + run: just test-python + env: + POSTGRES_URL: "postgresql://postgres:postgres@postgres:5432/postgres" + SQLITE_URL: "sqlite:///tmp/test.db" + MYSQL_URL: "mysql://root:mysql@mysql:3306/mysql" + MSSQL_URL: "mssql://sa:mssql!Password@mssql:1433/tempdb" + DB1: "postgresql://postgres:postgres@postgres:5432/postgres" + DB2: "postgresql://postgres:postgres@postgres:5432/postgres" + FED_CONFIG_PATH: ${{ github.workspace }}/.github/config + SQLITE3_STATIC: 1 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..d475ee9 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,53 @@ +name: docs + +on: + push: + branches: + - main + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: rustfmt + default: true + + - name: Install Python + uses: actions/setup-python@v1 + with: + python-version: 3.7 + + - name: Install dependencies + run: | + pip install -r docs/requirements.txt + sudo apt-get update + sudo apt-get install -y libkrb5-dev libclang-dev + + - name: Build the book + run: | + jupyter-book build docs + + - name: Build Rust Docs + uses: actions-rs/cargo@v1 + with: + command: doc + args: --no-deps --features all + + - name: Move the rust doc into jupyter book + run: mv target/doc ./docs/_build/html/rust-docs + + - name: Add .nojekyll + run: touch ./docs/_build/html/.nojekyll + + - name: Deploy Docs 🚀 + uses: JamesIves/github-pages-deploy-action@4.0.0 + with: + branch: gh-pages # The branch the action should deploy to. + folder: ./docs/_build/html # The folder the action should deploy. 
+ clean-exclude: dev diff --git a/.github/workflows/import-test.yml b/.github/workflows/import-test.yml new file mode 100644 index 0000000..20aa1d7 --- /dev/null +++ b/.github/workflows/import-test.yml @@ -0,0 +1,61 @@ +name: import-test + +on: + workflow_dispatch: + inputs: + indexUrl: + description: "Index Url" + required: true + default: "https://test.pypi.org/simple/" + version: + description: "version" + required: false + default: "" + +jobs: + check: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-10.15, windows-latest] + python-version: ["3.7", "3.8", "3.9", "3.10"] + steps: + - uses: actions/checkout@v2 + + - name: Setup python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + + - name: Check which python we use + run: | + echo $(which python) + + - name: Install ConnectorX - POSIX + if: ${{ matrix.os != 'windows-latest' }} + run: | + pip install numpy importlib-metadata + + version='${{ github.event.inputs.version }}' + if [ -z $version ] + then + pip install --index-url ${{ github.event.inputs.indexUrl }} connectorx + else + pip install --index-url ${{ github.event.inputs.indexUrl }} connectorx==$version + fi + + - name: Install ConnectorX - Windows + if: ${{ matrix.os == 'windows-latest' }} + run: | + pip install numpy importlib-metadata + + $version = '${{ github.event.inputs.version }}' + if ($version) { + pip install --index-url ${{ github.event.inputs.indexUrl }} connectorx==$version + } else { + pip install --index-url ${{ github.event.inputs.indexUrl }} connectorx + } + + - name: Import + run: python -c "from connectorx import read_sql" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..3d7564d --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,263 @@ +name: release + +on: + push: + branches: + - prerelease + - release + +jobs: + linux: + runs-on: ubuntu-latest + container: quay.io/pypa/manylinux_2_28_x86_64 + strategy: + matrix: + python-version: [[38, "3.8"], [39, "3.9"], [310, "3.10"], [311, "3.11"]] + steps: + - uses: actions/checkout@v2 + + - name: Set python version + run: | + echo "/opt/python/cp${{ matrix.python-version[0] }}-cp${{ matrix.python-version[0] }}/bin" >> $GITHUB_PATH + + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: rustfmt + target: aarch64-unknown-linux-gnu + default: true + + - uses: extractions/setup-just@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - uses: Gr1N/setup-poetry@v8 + + - name: Install tools + run: | + yum install -y epel-release + yum install -y mysql-devel postgresql-devel freetds-devel krb5-libs clang-devel + + - name: Setup project + run: | + just bootstrap-python + + - uses: PyO3/maturin-action@v1 + with: + rust-toolchain: stable + maturin-version: v0.14.15 + command: build + args: -m connectorx-python/Cargo.toml -i python --release --manylinux 2_28 --features integrated-auth-gssapi + env: + SQLITE3_STATIC: 1 + + - name: Copy j4rs dependencies into dist + run: | + cp -rf connectorx-python/target/release/jassets connectorx-python/connectorx/dependencies + + # rebuild the wheel to incorporate j4rs dependencies + - uses: PyO3/maturin-action@v1 + with: + rust-toolchain: stable + maturin-version: v0.14.15 + command: build + args: -m connectorx-python/Cargo.toml -i python --release --manylinux 2_28 --features integrated-auth-gssapi + env: + SQLITE3_STATIC: 1 + + # - uses: PyO3/maturin-action@v1 + # with: + # maturin-version: v0.14.15 + # 
command: build + # args: -m connectorx-python/Cargo.toml --target aarch64-unknown-linux-gnu -i python --release --manylinux 2_28 --features integrated-auth-gssapi + # env: + # SQLITE3_STATIC: 1 + + - uses: actions/upload-artifact@v3 + with: + name: "ubuntu-latest-${{ matrix.python-version[1] }}" + path: connectorx-python/target/wheels/*.whl + + win-and-mac: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ["windows-latest", "macos-11"] + python-version: ["3.8", "3.9", "3.10", "3.11"] + include: + - os: "macos-11" + features: "--features integrated-auth-gssapi" + steps: + - uses: actions/checkout@v2 + + - uses: ankane/setup-mysql@v1 + with: + mysql-version: 8 + + - uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: rustfmt + default: true + + - uses: extractions/setup-just@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - uses: Gr1N/setup-poetry@v8 + + - name: Setup project + run: | + just bootstrap-python + + - uses: PyO3/maturin-action@v1 + with: + rust-toolchain: stable + maturin-version: v0.14.15 + command: build + args: -m connectorx-python/Cargo.toml -i python --release ${{ matrix.features }} + env: + SQLITE3_STATIC: 1 + + - name: Copy j4rs dependencies into dist + run: | + cp -r connectorx-python/target/release/jassets connectorx-python/connectorx/dependencies + + # rebuild the wheel to incorporate j4rs dependencies + - uses: PyO3/maturin-action@v1 + with: + rust-toolchain: stable + maturin-version: v0.14.15 + command: build + args: -m connectorx-python/Cargo.toml -i python --release ${{ matrix.features }} + env: + SQLITE3_STATIC: 1 + + - uses: actions/upload-artifact@v2 + with: + name: "${{ matrix.os }}-${{ matrix.python-version }}" + path: connectorx-python/target/wheels/*.whl + + apple-arm: + runs-on: macos-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v2 + + - uses: ankane/setup-mysql@v1 + with: + mysql-version: 8 + + - uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: rustfmt + target: aarch64-apple-darwin + default: true + + - uses: extractions/setup-just@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - uses: Gr1N/setup-poetry@v8 + + - name: Setup project + run: | + just bootstrap-python + + - uses: PyO3/maturin-action@v1 + with: + rust-toolchain: stable + maturin-version: v0.14.15 + command: build + args: -m connectorx-python/Cargo.toml --target aarch64-apple-darwin -i python --release --features integrated-auth-gssapi + env: + SQLITE3_STATIC: 1 + + - name: Copy j4rs dependencies into dist + run: | + cp -rf connectorx-python/target/aarch64-apple-darwin/release/jassets connectorx-python/connectorx/dependencies + + # rebuild the wheel to incorporate j4rs dependencies + - uses: PyO3/maturin-action@v1 + with: + rust-toolchain: stable + maturin-version: v0.14.15 + command: build + args: -m connectorx-python/Cargo.toml --target aarch64-apple-darwin -i python --release --features integrated-auth-gssapi + env: + SQLITE3_STATIC: 1 + + - uses: actions/upload-artifact@v2 + with: + name: "macos-${{ matrix.python-version }}" + path: connectorx-python/target/wheels/*.whl + + verify: + runs-on: ${{ matrix.os }} + needs: [win-and-mac, linux, apple-arm] + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + os: [macos-11, ubuntu-latest, 
windows-latest] + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + + - uses: actions/download-artifact@v3 + with: + name: "${{ matrix.os }}-${{ matrix.python-version }}" + + - run: | + pip install *.whl + python -c "import connectorx" + + upload: + runs-on: ubuntu-latest + needs: [verify] + steps: + - name: Download all artifacts + uses: actions/download-artifact@v3 + + - name: Setup environment + run: | + tree . + echo "/home/runner/.local/bin" >> $GITHUB_PATH + + - name: Install Twine + run: pip install twine + + - name: Upload to PyPI site + if: github.ref == 'refs/heads/release' + env: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + run: | + for file in $(ls **/*) + do + twine upload --non-interactive -u __token__ -p $PYPI_TOKEN $file || continue + done + + - name: Upload to PyPI test site + if: github.ref == 'refs/heads/prerelease' + env: + PYPI_TEST_TOKEN: ${{ secrets.PYPI_TEST_TOKEN }} + run: | + for file in $(ls **/*) + do + twine upload --non-interactive --repository-url https://test.pypi.org/legacy/ -u __token__ -p $PYPI_TEST_TOKEN $file --verbose || continue + done diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e9b003d --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +**/target +.vscode +connectorx-python/connectorx/*.so +__pycache__ +.python-version +.idea/ +flamegraph.svg +perf.data +.env +.venv +.pytest_cache +data.txt +profile.pb +dask-worker-space +dist +*.db +benchmark.json +docs/_build +connectorx/examples/test.rs +*.duckdb +federated-query/ diff --git a/Benchmark.md b/Benchmark.md new file mode 100644 index 0000000..1a57f1a --- /dev/null +++ b/Benchmark.md @@ -0,0 +1,158 @@ +# Benchmark Setup + +## Postgres (Docker) + +1. Pull the PostgreSQL image from Docker Hub +``` +docker pull postgres +``` + +2. Create a directory for the mount point (optional) +``` +mkdir -p $YOUR_DOCKER_DIR/docker/volumes/postgres +``` + +3. Run PostgreSQL: +``` +# With local mount point +docker run --rm --name pg-connector -e POSTGRES_USER=postgres -e POSTGRES_DB=tpch -e POSTGRES_PASSWORD=postgres -d -p 5432:5432 -v $YOUR_DOCKER_DIR/docker/volumes/postgres:/var/lib/postgresql/data postgres -c shared_buffers=1024MB + +# Without local mount point +docker run --rm --name pg-connector -e POSTGRES_USER=postgres -e POSTGRES_DB=tpch -e POSTGRES_PASSWORD=postgres -d -p 5432:5432 postgres -c shared_buffers=1024MB +``` + +## TPC-H + +1. Download the TPC-H toolkit and compile: +``` +git clone https://github.com/gregrahn/tpch-kit.git +cd tpch-kit/dbgen && make MACHINE=LINUX DATABASE=POSTGRESQL +``` + +2. Generate the `LINEITEM` table with scale factor 10 +``` +# Generate all tables +./dbgen -s 10 + +# Alternatively, generate only the LINEITEM table using the -T option +./dbgen -s 10 -T L +``` + +3. Create the database and load the schema +``` +createdb -h localhost -U postgres tpch +psql -h localhost -U postgres -d tpch < dss.ddl +``` + +4. Load data into PostgreSQL +``` +psql -h localhost -U postgres -d tpch -c "\copy LINEITEM FROM '$YOUR_TPCH_DIR/tpch-kit/dbgen/lineitem.tbl' DELIMITER '|' ENCODING 'LATIN1';" +``` + +5. Create an index for `LINEITEM` on `l_orderkey` +``` +psql -h localhost -U postgres -d tpch -c "CREATE INDEX lineitem_l_orderkey_idx ON LINEITEM USING btree (l_orderkey);" +``` + +## Redshift: Upload TPC-H +> Note: For Redshift, AWS has already hosted TPC-H data in public S3. 
We borrow the uploading script from [amazon-redshift-utils](https://github.com/awslabs/amazon-redshift-utils/blob/master/src/CloudDataWarehouseBenchmark/Cloud-DWB-Derived-from-TPCH/3TB/ddl.sql). We only modified `LINEITEM`'s sortkey from `(l_shipdate,l_orderkey)` to `(l_orderkey)`. + +1. Make the following changes in the COPY commands of `script/benchmarks/tpch-reshift.sql`: + + 1. Change `credentials` to your own Redshift credentials. + 2. (Optional) Change the TPC-H data size in the `from` S3 string. Currently it is 10GB (equivalent to TPC-H scale factor 10); it can be changed to 3TB. + +2. Run the modified `tpch-reshift.sql` for Redshift: +``` +psql -h <hostname> -U <username> -d <database> -p <port> -f tpch-reshift.sql +``` + +# Benchmark result on AWS r5.4xlarge + +We load the lineitem table of TPC-H @ scale=10 into an r5.4xlarge EC2 machine on AWS for each database, and then run ConnectorX to download data from the database +on another r5.4xlarge machine, with the following command: + +```python +import connectorx as cx + +cx.read_sql("connection string", "SELECT * FROM lineitem", partition_on="l_orderkey", partition_num=4) +``` + +Here are the baselines we compare against: +* Pandas +* Modin +* Dask +* Turbodbc + +Since Modin and Dask support parallel execution, we use the same number of cores (4) to run them. For Turbodbc, we use the result NumPy arrays to construct the final pandas DataFrame for a fair comparison (a sketch of this baseline is shown below).
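Below is a minimal sketch of the Turbodbc baseline, assuming a preconfigured ODBC DSN (the DSN name `tpch` and the query here are placeholders): the result set is fetched as NumPy arrays with `fetchallnumpy()`, and the final pandas DataFrame is constructed from those arrays.

```python
import pandas as pd
import turbodbc

# Connect through an ODBC DSN; "tpch" is a placeholder for your own DSN.
conn = turbodbc.connect(dsn="tpch")
cur = conn.cursor()
cur.execute("SELECT * FROM lineitem")

# fetchallnumpy() returns a dict mapping column names to NumPy (masked) arrays;
# constructing the DataFrame from them is counted as part of the baseline's cost.
arrays = cur.fetchallnumpy()
df = pd.DataFrame(arrays)
```

## Postgres (db.m6g.4xlarge RDS)

## Time chart, lower is better.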

*(time chart image)*

+ +## Memory consumption chart, lower is better. + +

*(memory chart image)*

+ +In conclusion, ConnectorX uses **3x** less memory and **13x** less time compared with Pandas. + +## MySQL (db.m6g.4xlarge RDS) + + +## Time chart, lower is better. + +

*(time chart image)*

+ +## Memory consumption chart, lower is better. + +

*(memory chart image)*

+ +In conclusion, ConnectorX uses **3x** less memory and **8x** less time compared with Pandas. + + +## SQLite (r5.4xlarge EC2 same instance) + +**Turbodbc does not support read_sql on SQLite** + +## Time chart, lower is better. + +

*(time chart image)*

+ +## Memory consumption chart, lower is better. + +

*(memory chart image)*

+ +In conclusion, ConnectorX uses **2x** less memory and **5x** less time compared with Pandas. + + +## Oracle (db.r5.4xlarge RDS) + +**Modin and Turbodbc do not support read_sql on Oracle** + +## Time chart, lower is better. + +

*(time chart image)*

+ +## Memory consumption chart, lower is better. + +

*(memory chart image)*

+ +In conclusion, ConnectorX uses **3x** less memory and **3x** less time compared with Pandas. + + + +## Mssql (r5.4xlarge docker in another EC2 instance) + +**Modin does not support read_sql on Mssql** + +## Time chart, lower is better. + +

*(time chart image)*

+ +## Memory consumption chart, lower is better. + +

*(memory chart image)*

+ +In conclusion, ConnectorX uses **3x** less memory and **14x** less time compared with Pandas. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..dba6963 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,81 @@ +# Developer's Guide + +This doc describes how you can get started developing ConnectorX. + +## Environment Setup + +### Install tools and dependencies + +Please check out [here](https://sfu-db.github.io/connector-x/install.html#build-from-source-code). + + +### Run tests + +* Set up environment variables by creating a `.env` file under the project directory. Here is an example: +``` +# postgres +POSTGRES_URL=postgresql://username:password@hostname:5432/db + +# mysql +MYSQL_HOST=hostname +MYSQL_PORT=3306 +MYSQL_DB=db +MYSQL_USER=username +MYSQL_PASSWORD=password +MYSQL_URL=mysql://${MYSQL_USER}:${MYSQL_PASSWORD}@${MYSQL_HOST}:${MYSQL_PORT}/${MYSQL_DB} + +# sqlite +SQLITE_URL=sqlite://db_dir + +# mssql +MSSQL_HOST=hostname +MSSQL_PORT=1433 +MSSQL_USER=username +MSSQL_PASSWORD=password +MSSQL_DB=db +MSSQL_URL=mssql://username:password@hostname:1433/db + +# log +RUST_LOG=connectorx=debug,connectorx_python=debug + +# benchmark related +TPCH_TABLE=lineitem +MODIN_ENGINE=dask + +``` + +* Seed the database: `just seed-db` +* Run Rust tests: `just test` +* Run Python tests: `just test-python [-k {test case keyword}]` + +### Other commands + +* Format the code: `cargo fmt` + +## How to Add a New Source + +* Implement source-related logic, including: + * Define the type system of the new source + * Implement data fetching and parsing logic + * Examples can be found [here](https://github.com/sfu-db/connector-x/blob/main/connectorx/src/sources) +* Define the conversion between the new source and existing destinations + * Examples can be found [here](https://github.com/sfu-db/connector-x/tree/main/connectorx/src/transports) and [here](https://github.com/sfu-db/connector-x/tree/main/connectorx-python/src/pandas/transports) +* Make the new source visible to destinations, including: + * Add the source to the [source_router](https://github.com/sfu-db/connector-x/blob/main/connectorx-python/src/source_router.rs) + * Add the source to writing functions of each destination. 
Here are examples for [pandas](https://github.com/sfu-db/connector-x/blob/main/connectorx-python/src/pandas/mod.rs) and [arrow](https://github.com/sfu-db/connector-x/blob/main/connectorx-python/src/arrow.rs) +* Add corresponding unit tests under `connectorx/tests` for Rust and `connectorx-python/connectorx/tests` for Python + +**Please check out [here](https://sfu-db.github.io/connector-x/connectorx/#extending-connectorx) for more detailed implementation instructions on how to extend ConnectorX.** + +## How to Add a New Destination + +* Implement destination-related logic, including: + * Define the type system of the new destination + * Implement data writing logic + * Implement the writing interface of the destination + * Here are examples for [arrow](https://github.com/sfu-db/connector-x/tree/main/connectorx/src/destinations/arrow) and [pandas](https://github.com/sfu-db/connector-x/tree/main/connectorx-python/src/pandas) +* Define the conversion between existing sources and the new destination + * Examples can be found [here](https://github.com/sfu-db/connector-x/tree/main/connectorx/src/transports) and [here](https://github.com/sfu-db/connector-x/tree/main/connectorx-python/src/pandas/transports) +* Add corresponding unit tests under `connectorx/tests` for Rust and `connectorx-python/connectorx/tests` for Python + +**Please check out [here](https://sfu-db.github.io/connector-x/connectorx/#extending-connectorx) for more detailed implementation instructions on how to extend ConnectorX.** diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..c640e98 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,5659 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom 0.2.10", + "once_cell", + "version_check", +] + +[[package]] +name = "ahash" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.2.10", + "once_cell", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + +[[package]] +name = "anyhow" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" + +[[package]] +name = "argminmax" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "202108b46429b765ef483f8a24d5c46f48c14acfdacc086dd4ab6dddf6bcdbd2" +dependencies = [ + "num-traits", +] + +[[package]] +name = "array-init-cursor" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" + +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + +[[package]] +name = "arrow" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04a8801ebb147ad240b2d978d3ab9f73c9ccd4557ba6a03e7800496770ed10e0" +dependencies = [ + "ahash 0.8.3", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "895263144bd4a69751cbe6a34a53f26626e19770b313a9fa792c415cd0e78f11" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half 2.3.1", + "num", +] + +[[package]] +name = "arrow-array" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "226fdc6c3a4ae154a74c24091d36a90b514f0ed7112f5b8322c1d8f354d8e20d" +dependencies = [ + "ahash 0.8.3", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half 2.3.1", + "hashbrown 0.14.0", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc4843af4dd679c2f35b69c572874da8fde33be53eb549a5fb128e7a4b763510" +dependencies = [ + "bytes", + "half 2.3.1", + "num", +] + +[[package]] +name = "arrow-cast" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35e8b9990733a9b635f656efda3c9b8308c7a19695c9ec2c7046dd154f9b144b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + 
"arrow-select", + "chrono", + "comfy-table", + "half 2.3.1", + "lexical-core", + "num", +] + +[[package]] +name = "arrow-csv" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "646fbb4e11dd0afb8083e883f53117713b8caadb4413b3c9e63e3f535da3683c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "lexical-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da900f31ff01a0a84da0572209be72b2b6f980f3ea58803635de47913191c188" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half 2.3.1", + "num", +] + +[[package]] +name = "arrow-format" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07884ea216994cdc32a2d5f8274a8bee979cfe90274b83f86f440866ee3132c7" +dependencies = [ + "planus", + "serde", +] + +[[package]] +name = "arrow-ipc" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2707a8d7ee2d345d045283ece3ae43416175873483e5d96319c929da542a0b1f" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d1b91a63c356d14eedc778b76d66a88f35ac8498426bb0799a769a49a74a8b4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half 2.3.1", + "indexmap 2.0.0", + "lexical-core", + "num", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-ord" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "584325c91293abbca7aaaabf8da9fe303245d641f5f4a18a6058dc68009c7ebf" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half 2.3.1", + "num", +] + +[[package]] +name = "arrow-row" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e32afc1329f7b372463b21c6ca502b07cf237e1ed420d87706c1770bb0ebd38" +dependencies = [ + "ahash 0.8.3", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half 2.3.1", + "hashbrown 0.14.0", +] + +[[package]] +name = "arrow-schema" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b104f5daa730f00fde22adc03a12aa5a2ae9ccbbf99cbd53d284119ddc90e03d" +dependencies = [ + "bitflags 2.4.0", +] + +[[package]] +name = "arrow-select" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b3ca55356d1eae07cf48808d8c462cea674393ae6ad1e0b120f40b422eb2b4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af1433ce02590cae68da0a18ed3a3ed868ffac2c6f24c533ddd2067f7ee04b4a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "num", + "regex", + "regex-syntax 0.7.5", +] + +[[package]] +name = "arrow2" +version = "0.17.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" +dependencies = [ + "ahash 0.8.3", + 
"arrow-format", + "bytemuck", + "chrono", + "dyn-clone", + "either", + "ethnum", + "foreign_vec", + "futures", + "getrandom 0.2.10", + "hash_hasher", + "lexical-core", + "lz4", + "multiversion", + "num-traits", + "regex", + "regex-syntax 0.6.29", + "rustc_version", + "simdutf8", + "strength_reduce", + "zstd", +] + +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener", + "futures-core", +] + +[[package]] +name = "async-compression" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb42b2197bf15ccb092b62c74515dbd8b86d0effd934795f6687c93b6e679a2c" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener", +] + +[[package]] +name = "async-native-tls" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e9e7a929bd34c68a82d58a4de7f86fffdaf97fb2af850162a7bb19dd7269b33" +dependencies = [ + "async-std", + "native-tls", + "thiserror", + "url", +] + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-channel", + "async-lock", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + "memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + +[[package]] +name = "async-stream" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22068c0c19514942eefcfd4daf8976ef1aad84e61539f95cd200c35202f80af5" +dependencies = [ + "async-stream-impl 0.2.1", + "futures-core", +] + +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl 0.3.5", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25f9db3b38af870bf7e5cc649167533b493928e50744e2c30ae350230b414670" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "async-trait" +version = "0.1.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "asynchronous-codec" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb4401f0a3622dad2e0763fa79e0eb328bc70fb7dccfdd645341f00d671247d6" +dependencies = [ + "bytes", + "futures-sink", + 
"futures-util", + "memchr", + "pin-project-lite", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi 0.1.19", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "base64" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" + +[[package]] +name = "bb8" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e9f4fa9768efd269499d8fba693260cfc670891cf6de3adc935588447a77cc8" +dependencies = [ + "async-trait", + "futures-channel", + "futures-util", + "parking_lot 0.11.2", + "tokio", +] + +[[package]] +name = "bb8-tiberius" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "648d5365b34a2a362d5b8790d3c1b230d263d2377e563c76cb79c10d326b917e" +dependencies = [ + "async-trait", + "bb8", + "futures", + "thiserror", + "tiberius", + "tokio", + "tokio-util 0.6.10", +] + +[[package]] +name = "bigdecimal" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6773ddc0eafc0e509fb60e48dff7f450f8e674a0686ae8605e8d9901bd5eefa" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "bindgen" +version = "0.59.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +dependencies = [ + "bitflags 1.3.2", + "cexpr", + "clang-sys", + "clap", + "env_logger", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "which", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" + +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "199c42ab6972d92c9f8995f086273d25c42fc0f7b2a1fcefba465c1352d25ba5" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "digest", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "borsh" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4114279215a005bc675e386011e594e1d9b800918cea18fcadadcce864a2046b" +dependencies = [ + "borsh-derive", + "hashbrown 0.13.2", +] + +[[package]] +name = "borsh-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0754613691538d51f329cce9af41d7b7ca150bc973056f1156611489475f54f7" +dependencies = [ + "borsh-derive-internal", + "borsh-schema-derive-internal", + "proc-macro-crate", + "proc-macro2", + "syn 1.0.109", +] + +[[package]] +name = "borsh-derive-internal" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afb438156919598d2c7bad7e1c0adf3d26ed3840dbc010db1a882a65583ca2fb" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "borsh-schema-derive-internal" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634205cc43f74a1b9046ef87c4540ebda95696ec0f315024860cad7c5b0f5ccd" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "brotli" +version = "3.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1a0b1dbcc8ae29329621f8d4f0d835787c1c38bb1401979b49d13b0b305ff68" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "2.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b6561fd3f895a11e8f72af2cb7d22e08366bebc2b6b57f7744c4bda27034744" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bufstream" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40e38929add23cdf8a366df9b0e088953150724bcbe5fc330b0d8eb3b328eec8" + +[[package]] +name = "bumpalo" +version = "3.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" + +[[package]] +name = "bytecheck" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6372023ac861f6e6dc89c8344a8f398fb42aaba2b5dbc649ca0c0e9dbcb627" +dependencies = [ + "bytecheck_derive", + "ptr_meta", + "simdutf8", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7ec4c6f261935ad534c0c22dbef2201b45918860eb1c574b972bd213a76af61" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "bytemuck" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" +dependencies = [ + "bytemuck_derive", +] + 
+[[package]] +name = "bytemuck_derive" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "965ab7eb5f8f97d2a083c799f3a1b994fc397b2fe2da5d1da1626ce15a39f2b1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "jobserver", + "libc", +] + +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "defd4e7873dbddba6c7c91e199c7fcb946abc4a6a4ac3195400bcfb01b5de877" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-targets", +] + +[[package]] +name = "chrono-tz" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1369bc6b9e9a7dfdae2055f6ec151fe9c554a9d23d357c0237cee2e25eaabb7" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2f5ebdc942f57ed96d560a6d1a459bae5851102a25d5bf89dc04ae453e31ecf" +dependencies = [ + "parse-zoneinfo", + "phf", + "phf_codegen", +] + +[[package]] +name = "clang-sys" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + 
"ansi_term", + "atty", + "bitflags 1.3.2", + "strsim 0.8.0", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = "comfy-table" +version = "7.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab77dbd8adecaf3f0db40581631b995f312a8a5ae3aa9993188bb8f23d83a5b" +dependencies = [ + "crossterm", + "strum 0.24.1", + "strum_macros 0.24.3", + "unicode-width", +] + +[[package]] +name = "concurrent-queue" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "connection-string" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4ecb0dc8c35d2c626e45ae70bbfcb1050b302f42bcdf025d913cc0c5a0b443" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "connectorx" +version = "0.3.3-alpha.1" +dependencies = [ + "anyhow", + "arrow", + "arrow2", + "bb8", + "bb8-tiberius", + "chrono", + "criterion", + "csv", + "datafusion", + "env_logger", + "fallible-streaming-iterator", + "fehler", + "futures", + "gcp-bigquery-client", + "hex", + "iai", + "itertools 0.10.5", + "j4rs", + "log", + "mysql_common", + "native-tls", + "ndarray", + "num-traits", + "openssl", + "oracle", + "owning_ref", + "polars", + "postgres", + "postgres-native-tls", + "postgres-openssl", + "pprof", + "r2d2", + "r2d2-oracle", + "r2d2_mysql", + "r2d2_postgres", + "r2d2_sqlite", + "rayon", + "regex", + "rusqlite", + "rust_decimal", + "rust_decimal_macros", + "serde_json", + "sqlparser 0.37.0", + "thiserror", + "tiberius", + "tokio", + "tokio-util 0.6.10", + "url", + "urlencoding", + "uuid 0.8.2", +] + +[[package]] +name = "connectorx-cpp" +version = "0.3.3-alpha.1" +dependencies = [ + "arrow", + "connectorx", + "libc", + "openssl", +] + +[[package]] +name = "const-random" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368a7a772ead6ce7e1de82bfb04c485f3db8ec744f72925af5735e29a22cc18e" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb" +dependencies = [ + "getrandom 0.2.10", + "once_cell", + "proc-macro-hack", + "tiny-keccak", +] + +[[package]] +name = "constant_time_eq" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" + +[[package]] +name = "core-foundation" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" + +[[package]] +name = "cpp_demangle" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "cpufeatures" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "criterion" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools 0.10.5", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" +dependencies = [ + "cast", + "itertools 0.10.5", +] + +[[package]] +name = "crossbeam" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" +dependencies = [ + "cfg-if", + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossterm" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a84cda67535339806297f1b331d6dd6320470d2a0fe65381e79ee9e156dd3d13" +dependencies = [ + "bitflags 1.3.2", + "crossterm_winapi", + "libc", + "mio", + "parking_lot 0.12.1", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies 
= [ + "winapi", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "darling" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.0", + "lock_api", + "once_cell", + "parking_lot_core 0.9.8", +] + +[[package]] +name = "datafusion" +version = "31.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a4e4fc25698a14c90b34dda647ba10a5a966dc04b036d22e77fb1048663375d" +dependencies = [ + "ahash 0.8.3", + "arrow", + "arrow-array", + "arrow-schema", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-sql", + "flate2", + "futures", + "glob", + "half 2.3.1", + "hashbrown 0.14.0", + "indexmap 2.0.0", + "itertools 0.11.0", + "log", + "num_cpus", + "object_store", + "parking_lot 0.12.1", + "parquet", + "percent-encoding", + "pin-project-lite", + "rand 0.8.5", + "sqlparser 0.37.0", + "tempfile", + "tokio", + "tokio-util 0.7.8", + "url", + "uuid 1.4.1", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-common" +version = "31.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c23ad0229ea4a85bf76b236d8e75edf539881fdb02ce4e2394f9a76de6055206" +dependencies = [ + "arrow", + "arrow-array", + "async-compression", + "bytes", + "bzip2", + "chrono", + "flate2", + "futures", + "num_cpus", + "object_store", + "parquet", + "sqlparser 0.37.0", + "tokio", + "tokio-util 0.7.8", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-execution" +version = "31.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b37d2fc1a213baf34e0a57c85b8e6648f1a95152798fd6738163ee96c19203f" +dependencies = [ + "arrow", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "hashbrown 0.14.0", + "log", + "object_store", + "parking_lot 0.12.1", + "rand 0.8.5", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "31.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6ea9844395f537730a145e5d87f61fecd37c2bc9d54e1dc89b35590d867345d" +dependencies = [ + "ahash 0.8.3", + "arrow", + "datafusion-common", + "sqlparser 0.37.0", + "strum 0.25.0", + "strum_macros 0.25.2", +] + +[[package]] +name = "datafusion-optimizer" +version = "31.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8a30e0f79c5d59ba14d3d70f2500e87e0ff70236ad5e47f9444428f054fd2be" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "hashbrown 0.14.0", + "itertools 0.11.0", + "log", + "regex-syntax 0.7.5", +] + +[[package]] +name = "datafusion-physical-expr" +version = "31.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "766c567082c9bbdcb784feec8fe40c7049cedaeb3a18d54f563f75fe0dc1932c" +dependencies = [ + "ahash 0.8.3", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "base64 0.21.4", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-expr", + "half 2.3.1", + "hashbrown 0.14.0", + "hex", + "indexmap 2.0.0", + "itertools 0.11.0", + "libc", + "log", + "md-5", + "paste", + "petgraph", + "rand 0.8.5", + "regex", + "sha2", + "unicode-segmentation", + "uuid 1.4.1", +] + +[[package]] +name = "datafusion-sql" +version = "31.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "811fd084cf2d78aa0c76b74320977c7084ad0383690612528b580795764b4dd0" +dependencies = [ + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-expr", + "log", + "sqlparser 0.37.0", +] + +[[package]] +name = "debugid" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730" +dependencies = [ + "uuid 0.8.2", +] + +[[package]] +name = "deranged" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946" +dependencies = [ + "serde", +] + +[[package]] +name = "derive_utils" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9abcad25e9720609ccb3dcdb795d845e37d8ce34183330a9f48b03a1a71c8e21" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "dirs" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" +dependencies = [ + "libc", + "redox_users", 
+ "winapi", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "dunce" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56ce8c6da7551ec6c462cbaf3bfbc75131ebbfa1c944aeaa9dab51ca1c5f0c3b" + +[[package]] +name = "dyn-clone" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfc4744c1b8f2a09adc0e55242f60b1af195d88596bd8700be74418c056c555" + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "enum_dispatch" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "enumflags2" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c041f5090df68b32bcd905365fd51769c8b9d553fe87fde0b683534f10c01bd2" +dependencies = [ + "enumflags2_derive", +] + +[[package]] +name = "enumflags2_derive" +version = "0.7.7" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e9a1f9f7d83e59740248a6e14ecf93929ade55027844dfcea78beafccc15745" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "env_logger" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "ethnum" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8ff382b2fa527fb7fb06eeebfc5bbb3f17e3cc6b9d70b006c41daa8824adac" + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "fast-float" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" + +[[package]] +name = "fastrand" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" + +[[package]] +name = "fehler" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5729fe49ba028cd550747b6e62cd3d841beccab5390aa398538c31a2d983635" +dependencies = [ + "fehler-macros", +] + +[[package]] +name = "fehler-macros" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccb5acb1045ebbfa222e2c50679e392a71dd77030b78fb0189f2d9c5974400f9" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flatbuffers" +version = "23.5.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" +dependencies = [ + 
"bitflags 1.3.2", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010" +dependencies = [ + "crc32fast", + "libz-sys", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "foreign_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" + +[[package]] +name = "form_urlencoded" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "frunk" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11a351b59e12f97b4176ee78497dff72e4276fb1ceb13e19056aca7fa0206287" +dependencies = [ + "frunk_core", + "frunk_derives", + "frunk_proc_macros", +] + +[[package]] +name = "frunk_core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af2469fab0bd07e64ccf0ad57a1438f63160c69b2e57f04a439653d68eb558d6" + +[[package]] +name = "frunk_derives" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fa992f1656e1707946bbba340ad244f0814009ef8c0118eb7b658395f19a2e" +dependencies = [ + "frunk_proc_macro_helpers", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "frunk_proc_macro_helpers" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35b54add839292b743aeda6ebedbd8b11e93404f902c56223e51b9ec18a13d2c" +dependencies = [ + "frunk_core", + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "frunk_proc_macros" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71b85a1d4a9a6b300b41c05e8e13ef2feca03e0334127f29eca9506a7fe13a93" +dependencies = [ + "frunk_core", + "frunk_proc_macro_helpers", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + +[[package]] +name = "futures" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = 
"0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" + +[[package]] +name = "futures-executor" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" + +[[package]] +name = "futures-macro" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "futures-sink" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" + +[[package]] +name = "futures-task" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" + +[[package]] +name = "futures-util" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gcp-bigquery-client" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ab5966c98f6d4e71e247cda6a6d8497bc8a1df3a4ba9ee548087842cffc21d" +dependencies = [ + "async-stream 0.3.5", + "hyper", + "hyper-rustls 0.23.2", + "log", + "reqwest", + "serde", + "serde_json", + "thiserror", + "time", + "tokio", + "tokio-stream", + "url", + "yup-oauth2", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "wasm-bindgen", +] + +[[package]] +name = "gimli" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "h2" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91fc23aa11be92976ef4729127f1a74adf36d8436f7816b185d18df956790833" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 1.9.3", + "slab", + "tokio", + "tokio-util 0.7.8", + "tracing", +] + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "half" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + +[[package]] +name = "hash_hasher" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash 0.7.6", +] + +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +dependencies = [ + "ahash 0.8.3", +] + +[[package]] +name = "hashbrown" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +dependencies = [ + "ahash 0.8.3", + "allocator-api2", + "rayon", +] + +[[package]] +name = "hashlink" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +dependencies = [ + "hashbrown 0.14.0", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hermit-abi" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "http" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "0.14.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb1cfd654a8219eaef89881fdb3bb3b1cdc5fa75ded05d6933b2b382e395468" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.4.9", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" +dependencies = [ + "http", + "hyper", + "log", + "rustls 0.20.9", + "rustls-native-certs", + "tokio", + "tokio-rustls 0.23.4", +] + +[[package]] +name = "hyper-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d78e1e73ec14cf7375674f74d7dde185c8206fd9dea6fb6295e8a98098aaa97" +dependencies = [ + "futures-util", + "http", + "hyper", + "rustls 0.21.7", + "tokio", + "tokio-rustls 0.24.1", +] + +[[package]] +name = "iai" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71a816c97c42258aa5834d07590b718b4c9a598944cd39a52dc25b351185d678" + +[[package]] +name = "iana-time-zone" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + 
"hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +dependencies = [ + "equivalent", + "hashbrown 0.14.0", +] + +[[package]] +name = "inferno" +version = "0.10.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f" +dependencies = [ + "ahash 0.7.6", + "atty", + "indexmap 1.9.3", + "itoa", + "lazy_static", + "log", + "num-format", + "quick-xml", + "rgb", + "str_stack", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "io-enum" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5305557fa27b460072ae15ce07617e999f5879f14d376c8449f0bfb9f9d8e91e" +dependencies = [ + "derive_utils", + "syn 2.0.33", +] + +[[package]] +name = "ipnet" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "j4rs" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76cc9c1648a1cc940ac10c19f56e50bee15344590e10f220899d955db5f87ac2" +dependencies = [ + "cesu8", + "dirs", + "dunce", + "fs_extra", + "glob", + "java-locator", + "jni-sys", + "lazy_static", + "libc", + "libloading", + "log", + "serde", + "serde_json", + "sha2", +] + +[[package]] +name = "java-locator" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90003f2fd9c52f212c21d8520f1128da0080bad6fff16b68fe6e7f2f0c3780c2" +dependencies = [ + "glob", + "lazy_static", +] + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + +[[package]] +name = "jobserver" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a38fc24e30fd564ce974c02bf1d337caddff65be6cc4735a1f7eab22a7440f04" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "lexical" +version = "6.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7aefb36fd43fef7003334742cbf77b243fcd36418a1d1bdd480d613a67968f6" +dependencies = [ + "lexical-core", +] + +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.148" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" + +[[package]] +name = "libgssapi" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "724dbcd1f871da9c67983537a47ac510c278656f6392418ad67c7a52720e54b2" +dependencies = [ + "bitflags 1.3.2", + "bytes", + "lazy_static", + "libgssapi-sys", + "parking_lot 0.11.2", +] + +[[package]] +name = "libgssapi-sys" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd7d65e409c889f6c9d81ff079371d0d8fd88d7dca702ff187ef96fb0450fb7" +dependencies = [ + "bindgen", +] + +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "libm" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" + +[[package]] +name = "libsqlite3-sys" +version = "0.26.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "afc22eff61b133b115c6e8c74e818c628d6d5e7a502afea6f64dee076dd94326" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "libz-sys" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128" + +[[package]] +name = "lock_api" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "lru" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6e8aaa3f231bb4bd57b84b2d5dc3ae7f350265df8aa96492e0bc394a1571909" +dependencies = [ + "hashbrown 0.12.3", +] + +[[package]] +name = "lz4" +version = "1.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "matrixmultiply" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "090126dc04f95dc0d1c1c91f61bdd474b3930ca064c1edc8a849da2c6cbe1e77" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "md-5" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +dependencies = [ + "digest", +] + +[[package]] +name = "md5" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e6bcd6433cff03a4bfc3d9834d504467db1f1cf6d0ea765d37d330249ed629d" + +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + +[[package]] +name = "memchr" +version = "2.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" + +[[package]] +name = "memmap2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +dependencies = [ + "libc", +] + +[[package]] +name = "memmap2" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" 
+dependencies = [ + "libc", +] + +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +dependencies = [ + "libc", + "log", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys", +] + +[[package]] +name = "multiversion" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2c7b9d7fe61760ce5ea19532ead98541f6b4c495d87247aff9826445cf6872a" +dependencies = [ + "multiversion-macros", + "target-features", +] + +[[package]] +name = "multiversion-macros" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26a83d8500ed06d68877e9de1dde76c1dbb83885dcdbda4ef44ccbc3fbda2ac8" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "target-features", +] + +[[package]] +name = "mysql" +version = "23.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f11339ca5c251941805d51362a07823605a80586ced92914ab7de84fba813f" +dependencies = [ + "bufstream", + "bytes", + "crossbeam", + "flate2", + "io-enum", + "libc", + "lru", + "mysql_common", + "named_pipe", + "native-tls", + "once_cell", + "pem", + "percent-encoding", + "serde", + "serde_json", + "socket2 0.4.9", + "twox-hash", + "url", +] + +[[package]] +name = "mysql_common" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9006c95034ccf7b903d955f210469119f6c3477fc9c9e7a7845ce38a3e665c2a" +dependencies = [ + "base64 0.13.1", + "bigdecimal", + "bindgen", + "bitflags 1.3.2", + "bitvec", + "byteorder", + "bytes", + "cc", + "chrono", + "cmake", + "crc32fast", + "flate2", + "frunk", + "lazy_static", + "lexical", + "num-bigint", + "num-traits", + "rand 0.8.5", + "regex", + "rust_decimal", + "saturating", + "serde", + "serde_json", + "sha1", + "sha2", + "smallvec", + "subprocess", + "thiserror", + "time", + "uuid 1.4.1", +] + +[[package]] +name = "named_pipe" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad9c443cce91fc3e12f017290db75dde490d685cdaaf508d7159d7cf41f0eb2b" +dependencies = [ + "winapi", +] + +[[package]] +name = "native-tls" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "nix" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa9b4819da1bc61c0ea48b63b7bc8604064dd43013e7cc325df098d49cd7c18a" +dependencies = [ + "bitflags 1.3.2", + "cc", + "cfg-if", + "libc", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "now" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89e9874397a1f0a52fc1f197a8effd9735223cb2390e9dcc83ac6cd02923d0" +dependencies = [ + "chrono", +] + +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + +[[package]] +name = "num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec", + "itoa", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi 
0.3.2", + "libc", +] + +[[package]] +name = "num_threads" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + +[[package]] +name = "object" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d359e231e5451f4f9fa889d56e3ce34f8724f1a61db2107739359717cf2bbf08" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "humantime", + "itertools 0.10.5", + "parking_lot 0.12.1", + "percent-encoding", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "openssl" +version = "0.10.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bac25ee399abb46215765b1cb35bc0212377e58a061560d8b29b024fd0430e7c" +dependencies = [ + "bitflags 2.4.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-src" +version = "300.1.3+3.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd2c101a165fff9935e34def4669595ab1c7847943c42be86e21503e482be107" +dependencies = [ + "cc", +] + +[[package]] +name = "openssl-sys" +version = "0.9.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db4d56a4c0478783083cfafcc42493dd4a981d41669da64b4572a2a089b51b1d" +dependencies = [ + "cc", + "libc", + "openssl-src", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "opentls" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f561874f8d6ecfb674fc08863414040c93cc90c0b6963fe679895fab8b65560" +dependencies = [ + "futures-util", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "url", +] + +[[package]] +name = "oracle" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe80334af1fbaea016fbef0af77f5fa32452362e29a039389b8c93737585003" +dependencies = [ + "cc", + "chrono", + "lazy_static", + "oracle_procmacro", + "paste", +] + +[[package]] +name = "oracle_procmacro" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad247f3421d57de56a0d0408d3249d4b1048a522be2013656d92f022c3d8af27" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "ordered-float" +version = 
"2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" +dependencies = [ + "num-traits", +] + +[[package]] +name = "owning_ref" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ff55baddef9e4ad00f88b6c743a2a8062d4c6ade126c2a528644b8e444d52ce" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.6", +] + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core 0.9.8", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall 0.2.16", + "smallvec", + "winapi", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.3.5", + "smallvec", + "windows-targets", +] + +[[package]] +name = "parquet" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad2cba786ae07da4d73371a88b9e0f9d3ffac1a9badc83922e0e15814f5c5fa" +dependencies = [ + "ahash 0.8.3", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64 0.21.4", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "hashbrown 0.14.0", + "lz4", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "parse-zoneinfo" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" +dependencies = [ + "regex", +] + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "pem" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8" +dependencies = [ + "base64 0.13.1", +] + +[[package]] +name = "percent-encoding" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" + +[[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset", + "indexmap 2.0.0", +] + +[[package]] +name = 
"phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand 0.8.5", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" + +[[package]] +name = "planus" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1691dd09e82f428ce8d6310bd6d5da2557c82ff17694d2a32cad7242aea89f" +dependencies = [ + "array-init-cursor", +] + +[[package]] +name = "plotters" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" + +[[package]] +name = "plotters-svg" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "polars" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1362d4a136c0ebacb40d88a37ba361738b222fd8a2ee9340a3d8642f698c52b" +dependencies = [ + "getrandom 0.2.10", + "polars-core", + "polars-io", + "polars-lazy", + "polars-ops", + "polars-sql", + "polars-time", + "version_check", +] + +[[package]] +name = "polars-arrow" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f967c901fa5da4ca7f64e813d1268488ba97e9b3004cefc579ff851c197a1138" +dependencies = [ + "arrow2", + "hashbrown 0.14.0", + "multiversion", + "num-traits", + "polars-error", + "thiserror", + "version_check", +] + +[[package]] +name = "polars-core" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b24f92fc5b167f668ff85ab9607dfa72e2c09664cacef59297ee8601dee60126" +dependencies = [ + "ahash 0.8.3", + "arrow2", + "bitflags 2.4.0", + "chrono", + 
"comfy-table", + "either", + "hashbrown 0.14.0", + "indexmap 2.0.0", + "num-traits", + "once_cell", + "polars-arrow", + "polars-error", + "polars-row", + "polars-utils", + "rand 0.8.5", + "rand_distr", + "rayon", + "regex", + "smartstring", + "thiserror", + "version_check", + "xxhash-rust", +] + +[[package]] +name = "polars-error" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40d09c3a7337e53b38c37b57999038440fa39c6801b9ba48afaecd8e16f7ac0a" +dependencies = [ + "arrow2", + "regex", + "thiserror", +] + +[[package]] +name = "polars-io" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92cab0df9f2a35702fa5aec99edfaabf9ae8e9cdd0acf69e143ad2d132f34f9c" +dependencies = [ + "ahash 0.8.3", + "arrow2", + "async-trait", + "bytes", + "chrono", + "fast-float", + "futures", + "home", + "lexical", + "lexical-core", + "memchr", + "memmap2 0.7.1", + "num-traits", + "once_cell", + "polars-arrow", + "polars-core", + "polars-error", + "polars-time", + "polars-utils", + "rayon", + "regex", + "simdutf8", + "tokio", +] + +[[package]] +name = "polars-lazy" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c33762ec2a55e01c9f8776b34db86257c70a0a3b3929bd4eb91a52aacf61456" +dependencies = [ + "ahash 0.8.3", + "bitflags 2.4.0", + "glob", + "once_cell", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-pipe", + "polars-plan", + "polars-time", + "polars-utils", + "rayon", + "smartstring", + "version_check", +] + +[[package]] +name = "polars-ops" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e825575c96302d2daedfc205a0062180033c92c55bcd6aafc4e109d4d8849ed0" +dependencies = [ + "argminmax", + "arrow2", + "either", + "indexmap 2.0.0", + "memchr", + "polars-arrow", + "polars-core", + "polars-utils", + "smartstring", + "version_check", +] + +[[package]] +name = "polars-pipe" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f2bc9a12da9ed043fb0cb51dbcb87b365e4845b7ab6399d7a81e838460c6974" +dependencies = [ + "enum_dispatch", + "hashbrown 0.14.0", + "num-traits", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-utils", + "rayon", + "smartstring", + "version_check", +] + +[[package]] +name = "polars-plan" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb67b014f0295e8e9dbb84404a91d666d477b3bc248a2ed51bc442833b16da35" +dependencies = [ + "ahash 0.8.3", + "arrow2", + "once_cell", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-time", + "polars-utils", + "rayon", + "regex", + "smartstring", + "strum_macros 0.25.2", + "version_check", +] + +[[package]] +name = "polars-row" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27f54c1956027bf6301948fb4f2837cf6d6b638d8dd1edf3aaeaa19906a986be" +dependencies = [ + "arrow2", + "polars-error", + "polars-utils", +] + +[[package]] +name = "polars-sql" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbfcb15cf8eebd25ea1724109d0153817cd484c6326290585f0736b4e7fcf2f4" +dependencies = [ + "polars-arrow", + "polars-core", + "polars-lazy", + "polars-plan", + "serde", + "serde_json", + "sqlparser 0.36.1", +] + +[[package]] +name = "polars-time" +version = "0.32.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "53f42d2632f5971c9575041d33cbcfb1f996900c40bbf58bc6eb0a0c5efbecea" +dependencies = [ + "arrow2", + "atoi", + "chrono", + "now", + "once_cell", + "polars-arrow", + "polars-core", + "polars-ops", + "polars-utils", + "regex", + "smartstring", +] + +[[package]] +name = "polars-utils" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c326708a370d71dc6e11a8f4bbc10a8479e1c314dc048ba73543b815cd0bf339" +dependencies = [ + "ahash 0.8.3", + "hashbrown 0.14.0", + "num-traits", + "once_cell", + "polars-error", + "rayon", + "smartstring", + "sysinfo", + "version_check", +] + +[[package]] +name = "postgres" +version = "0.19.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7915b33ed60abc46040cbcaa25ffa1c7ec240668e0477c4f3070786f5916d451" +dependencies = [ + "bytes", + "fallible-iterator", + "futures-util", + "log", + "tokio", + "tokio-postgres", +] + +[[package]] +name = "postgres-native-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d442770e2b1e244bb5eb03b31c79b65bb2568f413b899eaba850fa945a65954" +dependencies = [ + "futures", + "native-tls", + "tokio", + "tokio-native-tls", + "tokio-postgres", +] + +[[package]] +name = "postgres-openssl" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1de0ea6504e07ca78355a6fb88ad0f36cafe9e696cbc6717f16a207f3a60be72" +dependencies = [ + "futures", + "openssl", + "tokio", + "tokio-openssl", + "tokio-postgres", +] + +[[package]] +name = "postgres-protocol" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49b6c5ef183cd3ab4ba005f1ca64c21e8bd97ce4699cfea9e8d9a2c4958ca520" +dependencies = [ + "base64 0.21.4", + "byteorder", + "bytes", + "fallible-iterator", + "hmac", + "md-5", + "memchr", + "rand 0.8.5", + "sha2", + "stringprep", +] + +[[package]] +name = "postgres-types" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d2234cdee9408b523530a9b6d2d6b373d1db34f6a8e51dc03ded1828d7fb67c" +dependencies = [ + "bytes", + "chrono", + "fallible-iterator", + "postgres-protocol", + "serde", + "serde_json", + "uuid 0.8.2", +] + +[[package]] +name = "pprof" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc842ca3fb958643d1696cfdada75410482480c11a7129463924fff5ab18d405" +dependencies = [ + "backtrace", + "inferno", + "lazy_static", + "libc", + "log", + "nix", + "parking_lot 0.11.2", + "symbolic-demangle", + "tempfile", + "thiserror", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "pretty-hex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be91bcc43e73799dc46a6c194a55e7aae1d86cc867c860fd4a436019af21bd8c" + +[[package]] +name = "proc-macro-crate" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d6ea3c4595b96363c13943497db34af4460fb474a95c43f4446ad341b8c9785" +dependencies = [ + "toml", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + +[[package]] 
+name = "proc-macro2" +version = "1.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "quick-xml" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" +dependencies = [ + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r2d2" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot 0.12.1", + "scheduled-thread-pool", +] + +[[package]] +name = "r2d2-oracle" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e592c29a9d04b2eb9aa5adc8775087200343b486efa8a374cb43a02f4269d67f" +dependencies = [ + "oracle", + "r2d2", +] + +[[package]] +name = "r2d2_mysql" +version = "23.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9733d738ce65959a744f387bae69aa690a867e18d48e5486b171c47bc7b0c575" +dependencies = [ + "mysql", + "r2d2", +] + +[[package]] +name = "r2d2_postgres" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7029c56be658cb54f321e0bee597810ee16796b735fa2559d7056bf06b12230b" +dependencies = [ + "postgres", + "r2d2", +] + +[[package]] +name = "r2d2_sqlite" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99f31323d6161385f385046738df520e0e8694fa74852d35891fc0be08348ddc" +dependencies = [ + "r2d2", + "rusqlite", + "uuid 1.4.1", +] + +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version 
= "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.10", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_users" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" +dependencies = [ + "getrandom 0.2.10", + "redox_syscall 0.2.16", + "thiserror", +] + +[[package]] +name = "regex" +version = "1.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax 0.7.5", +] + +[[package]] +name = "regex-automata" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.7.5", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + +[[package]] +name = "rend" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581008d2099240d37fb08d77ad713bcaec2c4d89d50b5b21a8bb1996bbab68ab" +dependencies = [ + "bytecheck", +] + +[[package]] +name = "reqwest" +version = "0.11.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e9ad3fe7488d7e34558a2033d45a0c90b72d97b4f80705666fea71472e2e6a1" +dependencies = [ + "base64 0.21.4", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-rustls 0.24.1", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls 0.21.7", + "rustls-pemfile 1.0.3", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-rustls 0.24.1", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", + "winreg", +] + +[[package]] +name = "rgb" +version = "0.8.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20ec2d3e3fc7a92ced357df9cebd5a10b6fb2aa1ee797bf7e9ce2f17dffc8f59" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "ring" +version = "0.16.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = [ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi", +] + +[[package]] +name = "rkyv" +version = "0.7.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0200c8230b013893c0b2d6213d6ec64ed2b9be2e0e016682b7224ff82cff5c58" +dependencies = [ + "bitvec", + "bytecheck", + "hashbrown 0.12.3", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", + "tinyvec", + "uuid 1.4.1", +] + +[[package]] +name = "rkyv_derive" +version = "0.7.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2e06b915b5c230a17d7a736d1e2e63ee753c256a8614ef3f5147b13a4f5541d" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "rusqlite" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "549b9d036d571d42e6e85d1c1425e2ac83491075078ca9a15be021c56b1641f2" +dependencies = [ + "bitflags 2.4.0", + "chrono", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", +] + +[[package]] +name = "rust_decimal" +version = "1.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c4216490d5a413bc6d10fa4742bd7d4955941d062c0ef873141d6b0e7b30fd" +dependencies = [ + "arrayvec", + "borsh", + "bytes", + "num-traits", + "postgres", + "rand 0.8.5", + "rkyv", + "serde", + "serde_json", +] + +[[package]] +name = "rust_decimal_macros" +version = "1.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86444b802de0b10ac5e563b5ddb43b541b9705de4e01a50e82194d2b183c1835" +dependencies = [ + "quote", + "rust_decimal", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7db8590df6dfcd144d22afd1b83b36c21a18d7cbc1dc4bb5295a8712e9eb662" +dependencies = [ + "bitflags 2.4.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "rustls" +version = "0.20.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99" +dependencies = [ + "log", + "ring", + "sct", + "webpki", +] + +[[package]] +name = "rustls" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8d6c9f025a446bc4d18ad9632e69aec8f287aa84499ee335599fabd20c3fd8" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile 1.0.3", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee86d63972a7c661d1536fefe8c3c8407321c3df668891286de28abcd087360" +dependencies = [ + "base64 0.13.1", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3987094b1d07b653b7dfdc3f70ce9a1da9c51ac18c1b06b662e4f9a0e9f4b2" +dependencies = [ + "base64 0.21.4", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a27e3b59326c16e23d30aeb7a36a24cc0d29e71d68ff611cdfb4a01d013bed" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "ryu" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "saturating" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece8e78b2f38ec51c51f5d475df0a7187ba5111b2a28bdc761ee05b075d40a71" + +[[package]] +name = "schannel" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "scheduled-thread-pool" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +dependencies = [ + "parking_lot 0.12.1", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sct" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" + +[[package]] +name = "seq-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" + +[[package]] +name = "serde" +version = "1.0.188" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half 1.8.2", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.188" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "serde_json" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cc66a619ed80bf7a0f6b17dd063a84b88f6dea1813737cf469aef1d081142c2" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" + +[[package]] +name = "signal-hook" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "simdutf8" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" + +[[package]] +name = "smartstring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" +dependencies = [ + "autocfg", + "static_assertions", + "version_check", +] + +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "snap" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" + +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "socket2" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "sqlparser" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eaa1e88e78d2c2460d78b7dc3f0c08dbb606ab4222f9aff36f420d36e307d87" +dependencies = [ + 
"log", +] + +[[package]] +name = "sqlparser" +version = "0.37.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ae05a8250b968a3f7db93155a84d68b2e6cea1583949af5ca5b5170c76c075" +dependencies = [ + "log", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55fe75cb4a364c7f7ae06c7dbbc8d84bddd85d6cdf9975963c3935bc1991761e" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "str_stack" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" + +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + +[[package]] +name = "stringprep" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +dependencies = [ + "finl_unicode", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" + +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +dependencies = [ + "strum_macros 0.25.2", +] + +[[package]] +name = "strum_macros" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 1.0.109", +] + +[[package]] +name = "strum_macros" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad8d03b598d3d0fff69bf533ee3ef19b8eeb342729596df84bcc7e1f96ec4059" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.33", +] + +[[package]] +name = "subprocess" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2e86926081dda636c546d8c5e641661049d7562a68f5488be4a1f7f66f6086" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + +[[package]] +name = "symbolic-common" +version = 
"8.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540" +dependencies = [ + "debugid", + "memmap2 0.5.10", + "stable_deref_trait", + "uuid 0.8.2", +] + +[[package]] +name = "symbolic-demangle" +version = "8.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9caece70c63bfba29ec2fed841a09851b14a235c60010fa4de58089b6c025668" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sysinfo" +version = "0.29.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a18d114d420ada3a891e6bc8e96a2023402203296a47cdd65083377dad18ba5" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "winapi", +] + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + +[[package]] +name = "target-features" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06f6b473c37f9add4cf1df5b4d66a8ef58ab6c895f1a3b3f949cf3e21230140e" + +[[package]] +name = "tempfile" +version = "3.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall 0.3.5", + "rustix", + "windows-sys", +] + +[[package]] +name = "termcolor" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d6d7a740b8a666a7e828dd00da9c0dc290dff53154ea77ac109281de90589b7" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + +[[package]] +name = "tiberius" +version = "0.5.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08c782c165a53700c17e4b15a1f6facc21e40a6a80402c518e0f3a2c3fcedd4" +dependencies = [ + 
"async-native-tls", + "async-stream 0.2.1", + "async-trait", + "asynchronous-codec", + "byteorder", + "bytes", + "chrono", + "connection-string", + "encoding", + "enumflags2", + "futures", + "futures-sink", + "futures-util", + "libgssapi", + "num-traits", + "once_cell", + "opentls", + "pin-project-lite", + "pretty-hex", + "rust_decimal", + "thiserror", + "tracing", + "uuid 0.8.2", + "winauth", +] + +[[package]] +name = "time" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" +dependencies = [ + "deranged", + "itoa", + "libc", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a942f44339478ef67935ab2bbaec2fb0322496cf3cbe84b261e06ac3814c572" +dependencies = [ + "time-core", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot 0.12.1", + "pin-project-lite", + "socket2 0.5.4", + "tokio-macros", + "windows-sys", +] + +[[package]] +name = "tokio-macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-openssl" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08f9ffb7809f1b20c1b398d92acf4cc719874b3b2b2d9ea2f09b4a80350878a" +dependencies = [ + "futures-util", + "openssl", + "openssl-sys", + "tokio", +] + +[[package]] +name = "tokio-postgres" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d340244b32d920260ae7448cb72b6e238bddc3d4f7603394e7dd46ed8e48f5b8" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + 
"futures-channel", + "futures-util", + "log", + "parking_lot 0.12.1", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol", + "postgres-types", + "rand 0.8.5", + "socket2 0.5.4", + "tokio", + "tokio-util 0.7.8", + "whoami", +] + +[[package]] +name = "tokio-rustls" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" +dependencies = [ + "rustls 0.20.9", + "tokio", + "webpki", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls 0.21.7", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36943ee01a6d67977dd3f84a5a1d2efeb4ada3a1ae771cadfaa535d9d9fc6507" +dependencies = [ + "bytes", + "futures-core", + "futures-io", + "futures-sink", + "log", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", + "tracing", +] + +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +dependencies = [ + "cfg-if", + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.33", +] + +[[package]] +name = "tracing-core" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "rand 0.8.5", + "static_assertions", +] + +[[package]] +name = "typenum" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + +[[package]] +name = "unicode-bidi" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + +[[package]] +name = "url" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "143b538f18257fac9cad154828a57c6bf5157e1aa604d4816b5995bf6de87ae5" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom 0.2.10", + "md5 0.7.0", +] + +[[package]] +name = "uuid" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" +dependencies = [ + "getrandom 0.2.10", + "rand 0.8.5", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b21c0df030f5a177f3cba22e9bc4322695ec43e7257d865302900290bcdedca" +dependencies = [ + "bumpalo", + "lazy_static", + "log", + "proc-macro2", + "quote", + "syn 1.0.109", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb6ec270a31b1d3c7e266b999739109abce8b6c87e4b31fcfcd788b65267395" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f4203d69e40a52ee523b2529a773d5ffc1dc0071801c87b3d270b471b80ed01" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa8a30d46208db204854cadbb5d4baf5fcf8071ba5bf48190c3e59937962ebc" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d958d035c4438e28c70e4321a2911302f10135ce78a9c7834c0cab4123d06a2" + +[[package]] +name = "web-sys" +version = "0.3.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c060b319f29dd25724f09a2ba1418f142f539b2be99fbf4d2d5a8f7330afb8eb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "webpki-roots" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "whoami" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" +dependencies = [ + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "winauth" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f820cd208ce9c6b050812dc2d724ba98c6c1e9db5ce9b3f58d925ae5723a5e6" +dependencies = [ + "bitflags 1.3.2", + "byteorder", + "md5 0.6.1", + "rand 0.7.3", + "winapi", +] + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys", +] + +[[package]] +name = "wyz" +version = "0.5.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + +[[package]] +name = "xxhash-rust" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9828b178da53440fa9c766a3d2f73f7cf5d0ac1fe3980c1e5018d899fd19e07b" + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + +[[package]] +name = "yup-oauth2" +version = "7.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98748970d2ddf05253e6525810d989740334aa7509457864048a829902db76f3" +dependencies = [ + "anyhow", + "async-trait", + "base64 0.13.1", + "futures", + "http", + "hyper", + "hyper-rustls 0.23.2", + "itertools 0.10.5", + "log", + "percent-encoding", + "rustls 0.20.9", + "rustls-pemfile 0.3.0", + "seahash", + "serde", + "serde_json", + "time", + "tokio", + "tower-service", + "url", +] + +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.8+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +dependencies = [ + "cc", + "libc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..0bbd499 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[workspace] +default-members = ["connectorx"] +members = ["connectorx", "connectorx-cpp"] +resolver = "2" + +[profile.release] +debug = true +lto = true + +[workspace.dependencies] +arrow = {version = "46", features = ["prettyprint", "ffi"]} +arrow2 = {version = "0.17", default-features = false} diff --git a/Justfile b/Justfile new file mode 100644 index 0000000..07ac506 --- /dev/null +++ b/Justfile @@ -0,0 +1,130 @@ +set dotenv-load := true + +build-release: + cargo build --release --features all + +build-debug: + cargo build --features all + +build-cpp +ARGS="": + cd connectorx-cpp && cargo build {{ARGS}} + +build-cpp-release +ARGS="": + cd connectorx-cpp && cargo build --release {{ARGS}} + +test +ARGS="": + cargo test --features all {{ARGS}} -- --nocapture + +test-feature-gate: + cargo c --features src_postgres + cargo c --features src_mysql + cargo c --features src_mssql + cargo c --features src_sqlite + cargo c --features src_oracle + cargo c --features src_csv + cargo c --features src_dummy + cargo c --features dst_arrow + cargo c --features dst_arrow2 + +bootstrap-python: + cd connectorx-python && poetry install + +setup-java: + cd federated-query/rewriter && mvn package -Dmaven.test.skip=true + cp -f ./federated-query/rewriter/target/federated-rewriter-1.0-SNAPSHOT-jar-with-dependencies.jar connectorx-python/connectorx/dependencies/federated-rewriter.jar + +setup-python: + cd connectorx-python && poetry run maturin develop --release + +test-python +opts="": setup-python + cd connectorx-python && poetry run pytest connectorx/tests -v 
-s {{opts}} + +test-python-s +opts="": + cd connectorx-python && poetry run pytest connectorx/tests -v -s {{opts}} + +test-fed file="3.sql": + cd connectorx && cargo run --features src_postgres --features src_mysql --features dst_arrow --features federation --example federated_test "../federated-query/test-queries/{{file}}" + +test-datafusion: + cd connectorx && cargo run --features src_postgres --features src_mysql --features dst_arrow --features federation --example test + +seed-db: + #!/bin/bash + psql $POSTGRES_URL -f scripts/postgres.sql + sqlite3 ${SQLITE_URL#sqlite://} < scripts/sqlite.sql + mysql --protocol tcp -h$MYSQL_HOST -P$MYSQL_PORT -u$MYSQL_USER -p$MYSQL_PASSWORD $MYSQL_DB < scripts/mysql.sql + mssql-cli -S$MSSQL_HOST -U$MSSQL_USER -P$MSSQL_PASSWORD -d$MSSQL_DB -i scripts/mssql.sql + +# dbs not included in ci +seed-db-more: + #!/bin/bash + mysql --protocol tcp -h$CLICKHOUSE_HOST -P$CLICKHOUSE_PORT -u$CLICKHOUSE_USER -p$CLICKHOUSE_PASSWORD $CLICKHOUSE_DB < scripts/clickhouse.sql + psql $REDSHIFT_URL -f scripts/redshift.sql + ORACLE_URL_SCRIPT=`echo ${ORACLE_URL#oracle://} | sed "s/:/\//"` + cat scripts/oracle.sql | sqlplus $ORACLE_URL_SCRIPT + mysql --protocol tcp -h$MARIADB_HOST -P$MARIADB_PORT -u$MARIADB_USER -p$MARIADB_PASSWORD $MARIADB_DB < scripts/mysql.sql + +# benches +flame-tpch conn="POSTGRES_URL": + cd connectorx-python && PYO3_PYTHON=$HOME/.pyenv/versions/3.8.6/bin/python3.8 PYTHONPATH=$HOME/.pyenv/versions/conn/lib/python3.8/site-packages LD_LIBRARY_PATH=$HOME/.pyenv/versions/3.8.6/lib/ cargo run --no-default-features --features executable --features fptr --features nbstr --features dsts --features srcs --release --example flame_tpch {{conn}} + +build-tpch: + cd connectorx-python && cargo build --no-default-features --features executable --features fptr --release --example tpch + +cachegrind-tpch: build-tpch + valgrind --tool=cachegrind target/release/examples/tpch + +python-tpch name +ARGS="": setup-python + #!/bin/bash + export PYTHONPATH=$PWD/connectorx-python + cd connectorx-python && \ + poetry run python ../benchmarks/tpch-{{name}}.py {{ARGS}} + +python-tpch-ext name +ARGS="": + cd connectorx-python && poetry run python ../benchmarks/tpch-{{name}}.py {{ARGS}} + +python-ddos name +ARGS="": setup-python + #!/bin/bash + export PYTHONPATH=$PWD/connectorx-python + cd connectorx-python && \ + poetry run python ../benchmarks/ddos-{{name}}.py {{ARGS}} + +python-ddos-ext name +ARGS="": + cd connectorx-python && poetry run python ../benchmarks/ddos-{{name}}.py {{ARGS}} + + +python-shell: + cd connectorx-python && \ + poetry run ipython + +benchmark-report: setup-python + cd connectorx-python && \ + poetry run pytest connectorx/tests/benchmarks.py --benchmark-json ../benchmark.json + +# releases +build-python-wheel: + # need to get the j4rs dependency first + cd connectorx-python && maturin build --release -i python + # copy files + cp -rf connectorx-python/target/release/jassets connectorx-python/connectorx/dependencies + # build final wheel + cd connectorx-python && maturin build --release -i python + +bench-fed path: + just python-tpch fed --file {{path}}/q2.sql + just python-tpch-ext fed --file {{path}}/q3.sql + just python-tpch-ext fed --file {{path}}/q4.sql + just python-tpch-ext fed --file {{path}}/q5.sql + just python-tpch-ext fed --file {{path}}/q7.sql + just python-tpch-ext fed --file {{path}}/q8.sql + just python-tpch-ext fed --file {{path}}/q9.sql + just python-tpch-ext fed --file {{path}}/q10.sql + just python-tpch-ext fed --file {{path}}/q11.sql + just python-tpch-ext
fed --file {{path}}/q12.sql + just python-tpch-ext fed --file {{path}}/q13.sql + just python-tpch-ext fed --file {{path}}/q14.sql + just python-tpch-ext fed --file {{path}}/q16.sql + just python-tpch-ext fed --file {{path}}/q17.sql + just python-tpch-ext fed --file {{path}}/q18.sql + just python-tpch-ext fed --file {{path}}/q19.sql + just python-tpch-ext fed --file {{path}}/q20.sql + just python-tpch-ext fed --file {{path}}/q22.sql diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e8c7ee7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 SFU Database Group + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..bd760ff --- /dev/null +++ b/README.md @@ -0,0 +1,155 @@ +# ConnectorX [![status][ci_badge]][ci_page] [![discussions][discussion_badge]][discussion_page] [![Downloads][download_badge]][download_page] + +[ci_badge]: https://github.com/sfu-db/connector-x/workflows/ci/badge.svg +[ci_page]: https://github.com/sfu-db/connector-x/actions +[discussion_badge]: https://img.shields.io/badge/Forum-Github%20Discussions-blue +[discussion_page]: https://github.com/sfu-db/connector-x/discussions +[download_badge]: https://pepy.tech/badge/connectorx +[download_page]: https://pepy.tech/project/connectorx + +Load data from [sources] to [destinations], the fastest way. + +ConnectorX enables you to load data from databases into Python in the fastest and most memory-efficient way. + +What you need is one line of code: + +```python +import connectorx as cx + +cx.read_sql("postgresql://username:password@server:port/database", "SELECT * FROM lineitem") +``` + +Optionally, you can accelerate the data loading using parallelism by specifying a partition column. + +```python +import connectorx as cx + +cx.read_sql("postgresql://username:password@server:port/database", "SELECT * FROM lineitem", partition_on="l_orderkey", partition_num=10) +``` + +The function will partition the query by **evenly** splitting the range of the specified column into the requested number of partitions. +ConnectorX will assign one thread to each partition to load and write data in parallel. +Currently, we support partitioning on **numerical** columns (**cannot contain NULL**) for **SPJA** queries. + +# Installation + +```bash +pip install connectorx +``` + +Check out [here](https://sfu-db.github.io/connector-x/install.html#build-from-source-code) to see how to build the Python wheel from source.
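Beyond the default Pandas destination, `read_sql` also takes a `return_type` argument to materialize other dataframe types (the same parameter the benchmark scripts later in this diff exercise). A minimal sketch, assuming a reachable Postgres instance behind the placeholder connection string:

```python
import connectorx as cx

conn = "postgresql://username:password@server:port/database"  # placeholder credentials
query = "SELECT * FROM lineitem"

df = cx.read_sql(conn, query)                          # default: Pandas DataFrame
tbl = cx.read_sql(conn, query, return_type="arrow")    # PyArrow Table
pldf = cx.read_sql(conn, query, return_type="polars")  # Polars DataFrame
```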
+ +# Performance + +We compared different solutions in Python that provide the `read_sql` function, by loading a 10x TPC-H lineitem table (8.6 GB) from Postgres into a DataFrame, with 4-core parallelism. + +## Time chart, lower is better. + +[time chart]
+ +## Memory consumption chart, lower is better. + +[memory chart]
+ +In conclusion, ConnectorX uses up to **3x** less memory and **21x** less time (**3x** less memory and **13x** less time compared with Pandas). More details [here](https://github.com/sfu-db/connector-x/blob/main/Benchmark.md#benchmark-result-on-aws-r54xlarge). + +## How does ConnectorX achieve lightning speed while keeping the memory footprint low? + +We observe that existing solutions copy the data multiple times, to varying degrees, when downloading it. +Additionally, implementing a data-intensive application in Python incurs additional overhead. + +ConnectorX is written in Rust and follows the "zero-copy" principle. +This allows it to make full use of the CPU by being cache- and branch-predictor-friendly. Moreover, the architecture of ConnectorX ensures the data will be copied exactly once, directly from the source to the destination. + +## How does ConnectorX download the data? + +Upon receiving the query, e.g. `SELECT * FROM lineitem`, ConnectorX will first issue a `LIMIT 1` query `SELECT * FROM lineitem LIMIT 1` to get the schema of the result set. + +Then, if `partition_on` is specified, ConnectorX will issue `SELECT MIN($partition_on), MAX($partition_on) FROM (SELECT * FROM lineitem)` to determine the range of the partition column. +After that, the original query is split into partitions based on the min/max information, e.g. `SELECT * FROM (SELECT * FROM lineitem) WHERE $partition_on > 0 AND $partition_on < 10000`. +ConnectorX will then run a count query to get the partition size (e.g. `SELECT COUNT(*) FROM (SELECT * FROM lineitem) WHERE $partition_on > 0 AND $partition_on < 10000`). If `partition_on` +is not specified, the count query will be `SELECT COUNT(*) FROM (SELECT * FROM lineitem)`. + +Finally, ConnectorX will use the schema info as well as the count info to allocate memory and download data by executing the queries normally. + +Once the downloading begins, there will be one thread for each partition so that the data are downloaded in parallel at the partition level. Each thread will issue the query of the corresponding +partition to the database and then write the returned data to the destination row-wise or column-wise (depending on the database) in a streaming fashion. A concrete sketch of these generated queries appears further below. + + +# Supported Sources & Destinations + +Example connection string, supported protocols and data types for each data source can be found [here](https://sfu-db.github.io/connector-x/databases.html). + +For more planned data sources, please check out our [discussion](https://github.com/sfu-db/connector-x/discussions/61). + +## Sources +- [x] Postgres +- [x] MySQL +- [x] MariaDB (through mysql protocol) +- [x] SQLite +- [x] Redshift (through postgres protocol) +- [x] ClickHouse (through mysql protocol) +- [x] SQL Server +- [x] Azure SQL Database (through mssql protocol) +- [x] Oracle +- [x] BigQuery +- [ ] ODBC (WIP) +- [ ] ... + +## Destinations +- [x] Pandas +- [x] PyArrow +- [x] Modin (through Pandas) +- [x] Dask (through Pandas) +- [x] Polars (through PyArrow) + +# Documentation + +Doc: https://sfu-db.github.io/connector-x/intro.html +Rust docs: [stable](https://docs.rs/connectorx) [nightly](https://sfu-db.github.io/connector-x/connectorx/) + +# Next Plan + +Check out our [discussion][discussion_page] to participate in deciding our next plan! + +# Historical Benchmark Results + +https://sfu-db.github.io/connector-x/dev/bench/ + +# Developer's Guide +Please see the [Developer's Guide](https://github.com/sfu-db/connector-x/blob/main/CONTRIBUTING.md) for information about developing ConnectorX.
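To make the download flow described above concrete, here is a rough sketch that reconstructs the sequence of queries for a partitioned read. It is illustrative only: the exact SQL ConnectorX emits varies by database, and the even-splitting arithmetic is a simplification.

```python
# Reconstruction of the flow in "How does ConnectorX download the data?";
# not ConnectorX's actual implementation.
query = "SELECT * FROM lineitem"
partition_on, partition_num = "l_orderkey", 4

schema_query = f"SELECT * FROM ({query}) AS t LIMIT 1"  # fetch result-set schema
range_query = f"SELECT MIN({partition_on}), MAX({partition_on}) FROM ({query}) AS t"

min_val, max_val = 1, 6_000_000  # pretend these came back from range_query
step = (max_val - min_val) // partition_num + 1
for i in range(partition_num):
    lo = min_val + i * step
    hi = min(lo + step, max_val + 1)
    where = f"{partition_on} >= {lo} AND {partition_on} < {hi}"
    print(f"SELECT COUNT(*) FROM ({query}) AS t WHERE {where}")  # sizes the buffers
    print(f"SELECT * FROM ({query}) AS t WHERE {where}")         # one thread each
```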
+ +# Supports + +You are always welcome to: +1. Ask questions & propose new ideas in our GitHub [discussion][discussion_page]. +2. Ask questions on Stack Overflow. Make sure to tag your question with #connectorx. + +# Organizations and Projects using ConnectorX + +[Polars](https://github.com/pola-rs/polars) +[DataPrep](https://dataprep.ai/) +[Modin](https://modin.readthedocs.io) + +To add your project/organization here, reply to our post [here](https://github.com/sfu-db/connector-x/discussions/146) + +# Citing ConnectorX + +If you use ConnectorX, please consider citing the following paper: + +Xiaoying Wang, Weiyuan Wu, Jinze Wu, Yizhou Chen, Nick Zrymiak, Changbo Qu, Lampros Flokas, George Chow, Jiannan Wang, Tianzheng Wang, Eugene Wu, Qingqing Zhou. [ConnectorX: Accelerating Data Loading From Databases to Dataframes.](https://www.vldb.org/pvldb/vol15/p2994-wang.pdf) _VLDB 2022_. + +BibTeX entry: + +```bibtex +@article{connectorx2022, + author = {Xiaoying Wang and Weiyuan Wu and Jinze Wu and Yizhou Chen and Nick Zrymiak and Changbo Qu and Lampros Flokas and George Chow and Jiannan Wang and Tianzheng Wang and Eugene Wu and Qingqing Zhou}, + title = {ConnectorX: Accelerating Data Loading From Databases to Dataframes}, + journal = {Proc. {VLDB} Endow.}, + volume = {15}, + number = {11}, + pages = {2994--3003}, + year = {2022}, + url = {https://www.vldb.org/pvldb/vol15/p2994-wang.pdf}, +} +``` diff --git a/assets/Technical_Report__ConnectorX.pdf b/assets/Technical_Report__ConnectorX.pdf new file mode 100644 index 0000000..2734c9b Binary files /dev/null and b/assets/Technical_Report__ConnectorX.pdf differ diff --git a/assets/cache_overview.jpg b/assets/cache_overview.jpg new file mode 100644 index 0000000..4fd4330 Binary files /dev/null and b/assets/cache_overview.jpg differ diff --git a/assets/cache_workflow.jpeg b/assets/cache_workflow.jpeg new file mode 100644 index 0000000..5d702c1 Binary files /dev/null and b/assets/cache_workflow.jpeg differ diff --git a/assets/cx_cache.jpeg b/assets/cx_cache.jpeg new file mode 100644 index 0000000..7f302b2 Binary files /dev/null and b/assets/cx_cache.jpeg differ diff --git a/assets/destinations.gif b/assets/destinations.gif new file mode 100644 index 0000000..dbdf7d2 Binary files /dev/null and b/assets/destinations.gif differ diff --git a/assets/mssql-mem.png b/assets/mssql-mem.png new file mode 100644 index 0000000..2e026a2 Binary files /dev/null and b/assets/mssql-mem.png differ diff --git a/assets/mssql-time.png b/assets/mssql-time.png new file mode 100644 index 0000000..9aee47a Binary files /dev/null and b/assets/mssql-time.png differ diff --git a/assets/mysql-mem.png b/assets/mysql-mem.png new file mode 100644 index 0000000..6e82eb2 Binary files /dev/null and b/assets/mysql-mem.png differ diff --git a/assets/mysql-time.png b/assets/mysql-time.png new file mode 100644 index 0000000..426b135 Binary files /dev/null and b/assets/mysql-time.png differ diff --git a/assets/oracle-mem.png b/assets/oracle-mem.png new file mode 100644 index 0000000..6fd0dfa Binary files /dev/null and b/assets/oracle-mem.png differ diff --git a/assets/oracle-time.png b/assets/oracle-time.png new file mode 100644 index 0000000..9cc6f14 Binary files /dev/null and b/assets/oracle-time.png differ diff --git a/assets/pg-mem.png b/assets/pg-mem.png new file mode 100644 index 0000000..dc0d55a Binary files /dev/null and b/assets/pg-mem.png differ diff --git a/assets/pg-time.png b/assets/pg-time.png new file mode 100644 index 0000000..bad81d1 Binary files /dev/null and b/assets/pg-time.png differ diff --git
a/assets/sources.gif b/assets/sources.gif new file mode 100644 index 0000000..3fbc915 Binary files /dev/null and b/assets/sources.gif differ diff --git a/assets/sqlite-mem.png b/assets/sqlite-mem.png new file mode 100644 index 0000000..2be5714 Binary files /dev/null and b/assets/sqlite-mem.png differ diff --git a/assets/sqlite-time.png b/assets/sqlite-time.png new file mode 100644 index 0000000..ba0493b Binary files /dev/null and b/assets/sqlite-time.png differ diff --git a/benchmarks/ddos-cx.py b/benchmarks/ddos-cx.py new file mode 100644 index 0000000..a66d687 --- /dev/null +++ b/benchmarks/ddos-cx.py @@ -0,0 +1,51 @@ +""" +Usage: + ddos-cx.py <num> [--protocol=<protocol>] [--conn=<conn>] [--ret=<ret>] + +Options: + --protocol=<protocol> The protocol to use [default: binary]. + --conn=<conn> The connection url to use [default: POSTGRES_URL]. + --ret=<ret> The return type [default: pandas]. + -h --help Show this screen. + --version Show version. +""" +import os + +import connectorx as cx +from contexttimer import Timer +from docopt import docopt +import pandas as pd +import modin.pandas as mpd +import dask.dataframe as dd +import polars as pl +import pyarrow as pa + + +if __name__ == "__main__": + args = docopt(__doc__, version="Naval Fate 2.0") + conn = os.environ[args["--conn"]] + table = "DDOS" + part_num = int(args["<num>"]) + + with Timer() as timer: + if part_num > 1: + df = cx.read_sql( + conn, + f"""SELECT * FROM {table}""", + partition_on="ID", + partition_num=int(args["<num>"]), + protocol=args["--protocol"], + return_type=args["--ret"], + ) + else: + df = cx.read_sql( + conn, + f"""SELECT * FROM {table}""", + protocol=args["--protocol"], + return_type=args["--ret"], + ) + print("time in total:", timer.elapsed) + + print(df) + print([(c, df[c].dtype) for c in df.columns]) + print(df.info(memory_usage='deep')) diff --git a/benchmarks/ddos-dask.py b/benchmarks/ddos-dask.py new file mode 100644 index 0000000..b4b52be --- /dev/null +++ b/benchmarks/ddos-dask.py @@ -0,0 +1,59 @@ +""" +Usage: + ddos-dask.py <num> [--conn=<conn>] [--table=<table>] [--index=<index>] [--driver=<driver>] + +Options: + --conn=<conn> The connection url to use [default: POSTGRES_URL]. + --table=<table> The table to load [default: DDOS]. + --index=<index> The index column to partition on [default: id]. + --driver=<driver> The sqlalchemy driver to use: https://docs.sqlalchemy.org/en/14/core/engines.html. + -h --help Show this screen. + --version Show version. + +Drivers: + PostgreSQL: postgresql, postgresql+psycopg2 + MySQL: mysql, mysql+mysqldb, mysql+pymysql + Redshift: postgresql, redshift, redshift+psycopg2 +""" + +import os + +import dask.dataframe as dd +from contexttimer import Timer +from docopt import docopt +from dask.distributed import Client, LocalCluster +from sqlalchemy.engine.url import make_url + +if __name__ == "__main__": + args = docopt(__doc__, version="Naval Fate 2.0") + index_col = args["--index"] + conn = os.environ[args["--conn"]] + conn = make_url(conn) + table = args["--table"] + driver = args.get("--driver", None) + npartition = int(args["<num>"]) + + cluster = LocalCluster(n_workers=npartition, scheduler_port=0, memory_limit="230G") + client = Client(cluster) + + # https://docs.sqlalchemy.org/en/13/core/engines.html#sqlite + # 4 initial slashes are needed for Unix/Mac + if conn.drivername == "sqlite": + conn = f"sqlite:///{str(conn)[9:]}" + elif driver is not None: + conn = str(conn.set(drivername=driver)) + print(f"conn url: {conn}") + + with Timer() as timer: + df = dd.read_sql_table( + table, + str(conn), + index_col, + npartitions=npartition, + limits=(0, 7902474), + ).compute() + + print(f"[Total] {timer.elapsed:.2f}s") + + print(df) + print([(c, df[c].dtype) for c in df.columns]) diff --git a/benchmarks/ddos-modin.py b/benchmarks/ddos-modin.py new file mode 100644 index 0000000..df6b542 --- /dev/null +++ b/benchmarks/ddos-modin.py @@ -0,0 +1,57 @@ +""" +Usage: + ddos-modin.py <num> [--conn=<conn>] [--driver=<driver>] + +Options: + --conn=<conn> The connection url to use [default: POSTGRES_URL]. + --driver=<driver> The sqlalchemy driver to use: https://docs.sqlalchemy.org/en/14/core/engines.html. + -h --help Show this screen. + --version Show version.
+ +Drivers: + PostgreSQL: postgresql, postgresql+psycopg2 + MySQL: mysql, mysql+mysqldb, mysql+pymysql + Redshift: postgresql, redshift, redshift+psycopg2 +""" + +import os + +import modin.config as config +import modin.pandas as pd +from contexttimer import Timer +from docopt import docopt +from dask.distributed import Client, LocalCluster +from sqlalchemy.engine.url import make_url + +# modin adopts the fastest mysqlclient connector for mysql + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ[args["--conn"]] + conn = make_url(conn) + table = "DDOS" + driver = args.get("--driver", None) + + partitions = int(args["<num>"]) + config.NPartitions.put(partitions) + + cluster = LocalCluster(n_workers=partitions, scheduler_port=0, memory_limit="230G") + client = Client(cluster) + + # https://docs.sqlalchemy.org/en/13/core/engines.html#sqlite + # 4 initial slashes are needed for Unix/Mac + if conn.drivername == "sqlite": + conn = f"sqlite:///{str(conn)[9:]}" + elif driver is not None: + conn = str(conn.set(drivername=driver)) + print(f"conn url: {conn}") + + with Timer() as timer: + df = pd.read_sql( + f"SELECT * FROM {table}", + str(conn), + ) + print(f"[Total] {timer.elapsed:.2f}s") + + print(df) + print([(c, df[c].dtype) for c in df.columns]) diff --git a/benchmarks/ddos-pandas-chunk.py b/benchmarks/ddos-pandas-chunk.py new file mode 100644 index 0000000..3822ad1 --- /dev/null +++ b/benchmarks/ddos-pandas-chunk.py @@ -0,0 +1,57 @@ +""" +Usage: + ddos-pandas-chunk.py [--conn=<conn>] [--csize=<csize>] [--driver=<driver>] + +Options: + --conn=<conn> The connection url to use [default: POSTGRES]. + --csize=<csize> Chunk size [default: 1000]. + --driver=<driver> The sqlalchemy driver to use: https://docs.sqlalchemy.org/en/14/core/engines.html. + -h --help Show this screen. + --version Show version. +""" + +import os +from contexttimer import Timer +from docopt import docopt +import pandas as pd +from sqlalchemy import create_engine +from sqlalchemy.engine.url import make_url +import time + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ[args["--conn"]] + chunksize = int(args["--csize"]) + driver = args.get("--driver", None) + conn = make_url(conn) + if driver is not None: + conn = conn.set(drivername=driver) + if conn.drivername == "sqlite": + conn = conn.set(database="/" + conn.database) + + print(f"chunksize: {chunksize}, conn url: {str(conn)}") + + with Timer() as timer: + engine = create_engine(conn) + conn = engine.connect().execution_options( + stream_results=True, max_row_buffer=chunksize) + dfs = [] + with Timer() as stream_timer: + for df in pd.read_sql("SELECT * FROM DDOS", conn, chunksize=chunksize): + dfs.append(df) + print(f"time iterate batches: {stream_timer.elapsed}") + df = pd.concat(dfs) + print(f"time in total: {timer.elapsed}s") + time.sleep(3) # capture peak memory + + conn.close() + print(df) + print(df.info(memory_usage="deep")) + # print(df._data.blocks) + + # print("======") + # print(len(dfs)) + # for d in dfs: + # print(d.info(memory_usage="deep")) + # print(d._data.blocks) + # break diff --git a/benchmarks/ddos-pandas.py b/benchmarks/ddos-pandas.py new file mode 100644 index 0000000..9a80b95 --- /dev/null +++ b/benchmarks/ddos-pandas.py @@ -0,0 +1,57 @@ +""" +Usage: + ddos-pandas.py [--conn=<conn>] [--driver=<driver>] + +Options: + --conn=<conn> The connection url to use [default: POSTGRES_URL]. + --driver=<driver> The sqlalchemy driver to use: https://docs.sqlalchemy.org/en/14/core/engines.html. + -h --help Show this screen. + --version Show version.
+ +Drivers: + PostgreSQL: postgresql, postgresql+psycopg2 + MySQL: mysql, mysql+mysqldb, mysql+pymysql + Redshift: postgresql, redshift, redshift+psycopg2 + +""" + +import os + +from contexttimer import Timer +from sqlalchemy import create_engine +from docopt import docopt +import pandas as pd +import sqlite3 +from clickhouse_driver import connect +from sqlalchemy.engine.url import make_url + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + table = "DDOS" + driver = args.get("--driver", None) + conn = os.environ[args["--conn"]] + conn = make_url(conn) + + if conn.drivername == "sqlite": + conn = sqlite3.connect(str(conn)[9:]) + elif driver == "clickhouse": + # clickhouse-driver uses native protocol: 9000 + conn = conn.set(drivername=driver, port=9000) + conn = connect(str(conn)) + else: # go with sqlalchemy + if driver is not None: + conn = conn.set(drivername=driver) + print(f"conn url: {str(conn)}") + engine = create_engine(conn) + conn = engine.connect() + + with Timer() as timer: + df = pd.read_sql( + f"SELECT * FROM {table}", + conn, + ) + print(f"[Total] {timer.elapsed:.2f}s") + conn.close() + + print(df) + print([(c, df[c].dtype) for c in df.columns]) diff --git a/benchmarks/ddos-turbodbc.py b/benchmarks/ddos-turbodbc.py new file mode 100644 index 0000000..933a17f --- /dev/null +++ b/benchmarks/ddos-turbodbc.py @@ -0,0 +1,63 @@ +""" +Usage: + ddos-turbodbc.py [--driver=<driver>] [--ret=<ret>] + +Options: + --driver=<driver> ODBC driver to use [default: PostgreSQL]. + --ret=<ret> The return type [default: pandas-numpy]. + -h --help Show this screen. + --version Show version. + +""" + +import os + +from docopt import docopt +from turbodbc import connect, make_options +import pandas as pd +from contexttimer import Timer + +if __name__ == "__main__": + args = docopt(__doc__, version="Naval Fate 2.0") + table = "DDOS" + driver = args["--driver"] + ret = args["--ret"] + query = f"SELECT * FROM {table}" + + with Timer() as gtimer: + with Timer() as timer: + if driver == "MSSQL": + options = make_options(prefer_unicode=True) + connection = connect( + dsn=driver, uid=os.environ["MSSQL_USER"], pwd=os.environ["MSSQL_PASSWORD"], turbodbc_options=options) + else: + connection = connect(dsn=driver) + cursor = connection.cursor() + print(f"connect: {timer.elapsed}") + with Timer() as timer: + cursor.execute(query) + print(f"execute: {timer.elapsed}") + if ret == "pandas-numpy": + with Timer() as timer: + data = cursor.fetchallnumpy() + print(f"fetchallnumpy: {timer.elapsed}") + with Timer() as timer: + df = pd.DataFrame(data=data) + print(f"convert to pandas: {timer.elapsed}") + elif ret == "pandas-arrow": + with Timer() as timer: + data = cursor.fetchallarrow() + print(f"fetchallarrow: {timer.elapsed}") + with Timer() as timer: + # to be fair with other benchmarks, generate consolidated blocks and convert dates + df = data.to_pandas(split_blocks=False, date_as_object=False) + print(f"convert to pandas: {timer.elapsed}") + else: + assert ret == "arrow" + with Timer() as timer: + df = cursor.fetchallarrow() + print(f"fetchallarrow: {timer.elapsed}") + + print(f"time in total: {gtimer.elapsed}") + print(df) + print([(c, df[c].dtype) for c in df.columns]) diff --git a/benchmarks/tpch-cx-aw.py b/benchmarks/tpch-cx-aw.py new file mode 100644 index 0000000..2df5924 --- /dev/null +++ b/benchmarks/tpch-cx-aw.py @@ -0,0 +1,53 @@ +""" +Usage: + tpch-cx-aw.py <num> [--protocol=<protocol>] [--conn=<conn>] [--ret=<ret>] + +Options: + --protocol=<protocol> The protocol to use [default: binary]. + --conn=<conn> The connection url to use [default: POSTGRES_URL].
+ --ret=<ret> The return type [default: pandas]. + -h --help Show this screen. + --version Show version. +""" +import os + +import connectorx as cx +from contexttimer import Timer +from docopt import docopt + + +if __name__ == "__main__": + args = docopt(__doc__, version="Naval Fate 2.0") + conn = os.environ[args["--conn"]] + table = os.environ["TPCH_TABLE"] + part_num = int(args["<num>"]) + ret = args["--ret"] + + print(f"[CX-AW] conn: {conn}, part_num: {part_num}, return: {ret}") + + with Timer() as gtimer: + with Timer() as timer: + if part_num > 1: + data = cx.read_sql( + conn, + f"""SELECT * FROM {table}""", + partition_on="L_ORDERKEY", + partition_num=int(args["<num>"]), + protocol=args["--protocol"], + return_type="arrow", + ) + else: + data = cx.read_sql( + conn, + f"""SELECT * FROM {table}""", + protocol=args["--protocol"], + return_type="arrow", + ) + print("got arrow:", timer.elapsed) + if ret == "pandas": + with Timer() as timer: + df = data.to_pandas(split_blocks=False, date_as_object=False) + print("convert to pandas:", timer.elapsed) + else: + df = data # keep the Arrow table so the final print also works for non-pandas returns + + print(f"time in total: {gtimer.elapsed}") + print(df) diff --git a/benchmarks/tpch-cx.py b/benchmarks/tpch-cx.py new file mode 100644 index 0000000..fbd5a48 --- /dev/null +++ b/benchmarks/tpch-cx.py @@ -0,0 +1,65 @@ +""" +Usage: + tpch-cx.py <num> [--protocol=<protocol>] [--conn=<conn>] [--ret=<ret>] + +Options: + --protocol=<protocol> The protocol to use [default: binary]. + --conn=<conn> The connection url to use [default: POSTGRES_URL]. + --ret=<ret> The return type [default: pandas]. + -h --help Show this screen. + --version Show version. +""" +import os + +import connectorx as cx +from contexttimer import Timer +from docopt import docopt +import pandas as pd +import modin.pandas as mpd +import dask.dataframe as dd +import polars as pl +import pyarrow as pa + + +def describe(df): + if isinstance(df, pd.DataFrame): + print(df.head()) + elif isinstance(df, mpd.DataFrame): + print(df.head()) + elif isinstance(df, pl.DataFrame): + print(df.head()) + elif isinstance(df, dd.DataFrame): + print(df.head()) + elif isinstance(df, pa.Table): + print(df.slice(0, 10).to_pandas()) + else: + raise ValueError("unknown type") + + +if __name__ == "__main__": + args = docopt(__doc__, version="Naval Fate 2.0") + conn = os.environ[args["--conn"]] + table = os.environ["TPCH_TABLE"] + part_num = int(args["<num>"]) + + with Timer() as timer: + if part_num > 1: + df = cx.read_sql( + conn, + f"""SELECT * FROM {table}""", + partition_on="L_ORDERKEY", + partition_num=int(args["<num>"]), + protocol=args["--protocol"], + return_type=args["--ret"], + ) + else: + df = cx.read_sql( + conn, + f"""SELECT * FROM {table}""", + protocol=args["--protocol"], + return_type=args["--ret"], + ) + print("time in total:", timer.elapsed) + + print(type(df), len(df)) + describe(df) \ No newline at end of file diff --git a/benchmarks/tpch-dask.py b/benchmarks/tpch-dask.py new file mode 100644 index 0000000..af139a4 --- /dev/null +++ b/benchmarks/tpch-dask.py @@ -0,0 +1,67 @@ +""" +Usage: + tpch-dask.py <num> [--conn=<conn>] [--index=<index>] [--driver=<driver>] + +Options: + --conn=<conn> The connection url to use [default: POSTGRES_URL]. + --index=<index> The index column to partition on [default: l_orderkey]. + --driver=<driver> The sqlalchemy driver to use: https://docs.sqlalchemy.org/en/14/core/engines.html. + -h --help Show this screen. + --version Show version.
+ +Drivers: + PostgreSQL: postgresql, postgresql+psycopg2 + MySQL: mysql, mysql+mysqldb, mysql+pymysql + Redshift: postgresql, redshift, redshift+psycopg2 +""" + +import os + +import dask.dataframe as dd +from contexttimer import Timer +from docopt import docopt +from dask.distributed import Client, LocalCluster +from sqlalchemy.engine.url import make_url + +if __name__ == "__main__": + args = docopt(__doc__, version="Naval Fate 2.0") + index_col = args["--index"] + conn = os.environ[args["--conn"]] + conn = make_url(conn) + table = os.environ["TPCH_TABLE"] + driver = args.get("--driver", None) + npartition = int(args["<num>"]) + + cluster = LocalCluster(n_workers=npartition, scheduler_port=0, memory_limit="230G") + client = Client(cluster) + + # https://docs.sqlalchemy.org/en/13/core/engines.html#sqlite + # 4 initial slashes are needed for Unix/Mac + if conn.drivername == "sqlite": + conn = f"sqlite:///{str(conn)[9:]}" + elif driver is not None: + conn = str(conn.set(drivername=driver)) + print(f"conn url: {conn}") + + with Timer() as timer: + df = dd.read_sql_table( + table, + conn, + index_col, + npartitions=npartition, + limits=(0, 60000000), + parse_dates=[ + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + ], + ).compute() + + print(f"[Total] {timer.elapsed:.2f}s") + + print(df.head()) + print(len(df)) + print(df.dtypes) diff --git a/benchmarks/tpch-fed.py b/benchmarks/tpch-fed.py new file mode 100644 index 0000000..c4ccebc --- /dev/null +++ b/benchmarks/tpch-fed.py @@ -0,0 +1,69 @@ +""" +Usage: + tpch-fed.py [--file=<file>] [--dir=<dir>] [--runs=<runs>] [--print] + +Options: + --file=<file> Query file. + --dir=<dir> Query directory. + --runs=<runs> # runs [default: 1]. + --print Print query result. + -h --help Show this screen. + --version Show version. +""" + +import os +import sys +import time
import connectorx as cx +from contexttimer import Timer +from docopt import docopt +from pathlib import Path + +def run_query_from_file(query_file, doprint=False, ntries=0): + with open(query_file, "r") as f: + sql = f.read() + print(f"file: {query_file}") + + try: + with Timer() as timer: + df = cx.read_sql(db_map, sql, return_type="arrow") + print(f"time in total: {timer.elapsed:.2f}, {len(df)} rows, {len(df.columns)} cols") + if doprint: + print(df) + del df + # print(df.schema) + # print(df) + except RuntimeError as e: + print(e) + if ntries >= 5: + raise + print("retry in 10 seconds...") + sys.stdout.flush() + time.sleep(10) + run_query_from_file(query_file, doprint, ntries + 1) + + sys.stdout.flush() + +if __name__ == "__main__": + args = docopt(__doc__, version="Naval Fate 2.0") + query_file = args["--file"] + + db_map = {} + db_conns = os.environ["FED_CONN"] + for conn in db_conns.split(','): + db_map[conn.split('=', 1)[0]] = conn.split('=', 1)[1] + + print(f"dbs: {db_map}") + + for i in range(int(args["--runs"])): + print(f"=============== run {i} ================") + print() + sys.stdout.flush() + if args["--file"]: + filename = args["--file"] + run_query_from_file(filename, args["--print"]) + elif args["--dir"]: + for filename in sorted(Path(args["--dir"]).glob("q*.sql")): + run_query_from_file(filename, args["--print"]) + time.sleep(2) + diff --git a/benchmarks/tpch-modin-exp.py b/benchmarks/tpch-modin-exp.py new file mode 100644 index 0000000..a5ef407 --- /dev/null +++ b/benchmarks/tpch-modin-exp.py @@ -0,0 +1,46 @@ +""" +Usage: + tpch-modin-exp.py <num> + +Options: + -h --help Show this screen. + --version Show version.
+""" + +import os +os.environ["MODIN_ENGINE"] = "ray" +import ray +from contexttimer import Timer +from docopt import docopt + + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ["POSTGRES_URL"] + table = os.environ["POSTGRES_TABLE"] + + partitions = int(args[""]) + # ray.init(num_cpus=partitions, object_store_memory=10**10, _plasma_directory="/tmp") + ray.init(num_cpus=partitions, object_store_memory=10**10) + + import modin.config as config + import modin.experimental.pandas as pd + + config.NPartitions.put(partitions) + with Timer() as timer: + df = pd.read_sql( + f"{table}", # use table here, a bug exists in modin experimental read_sql for query + conn, + parse_dates=[ + "l_shipdate", + "l_commitdate", + "l_receiptdate", + ], + partition_column="l_orderkey", + lower_bound=0, + upper_bound=60000000, + max_sessions=partitions, + ) + print(f"[Total] {timer.elapsed:.2f}s") + + print(df.head()) diff --git a/benchmarks/tpch-modin.py b/benchmarks/tpch-modin.py new file mode 100644 index 0000000..69ac57f --- /dev/null +++ b/benchmarks/tpch-modin.py @@ -0,0 +1,66 @@ +""" +Usage: + tpch-modin.py [--conn=] [--driver=] + +Options: + --conn= The connection url to use [default: POSTGRES_URL]. + --driver= The driver to use using sqlalchemy: https://docs.sqlalchemy.org/en/14/core/engines.html. + -h --help Show this screen. + --version Show version. + +Drivers: + PostgreSQL: postgresql, postgresql+psycopg2 + MySQL: mysql, mysql+mysqldb, mysql+pymysql + Redshift: postgresql, redshift, redshift+psycopg2 +""" + +import os + +import modin.config as config +import modin.pandas as pd +from contexttimer import Timer +from docopt import docopt +from dask.distributed import Client, LocalCluster +from sqlalchemy.engine.url import make_url + +# modin adopts the fastest mysqlclient connector for mysql + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ[args["--conn"]] + conn = make_url(conn) + table = os.environ["TPCH_TABLE"] + driver = args.get("--driver", None) + + partitions = int(args[""]) + config.NPartitions.put(partitions) + + cluster = LocalCluster(n_workers=partitions, scheduler_port=0, memory_limit="230G") + client = Client(cluster) + + # https://docs.sqlalchemy.org/en/13/core/engines.html#sqlite + # 4 initial slashes is needed for Unix/Mac + if conn.drivername == "sqlite": + conn = f"sqlite:///{str(conn)[9:]}" + elif driver is not None: + conn = str(conn.set(drivername=driver)) + print(f"conn url: {conn}") + + with Timer() as timer: + df = pd.read_sql( + f"SELECT * FROM {table}", + str(conn), + parse_dates=[ + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + ], + ) + print(f"[Total] {timer.elapsed:.2f}s") + + print(df.head()) + print(len(df)) + print(df.dtypes) diff --git a/benchmarks/tpch-pandahouse.py b/benchmarks/tpch-pandahouse.py new file mode 100644 index 0000000..0a8ed6a --- /dev/null +++ b/benchmarks/tpch-pandahouse.py @@ -0,0 +1,37 @@ +""" +Usage: + tpch-pandahouse.py [--index=] + +Options: + --index= The connection url to use [default: L_ORDERKEY]. + -h --help Show this screen. + --version Show version. 
+""" + +import os + +from contexttimer import Timer +from docopt import docopt +import pandas as pd +from pandahouse import read_clickhouse + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + index_col = args["--index"] + table = os.environ["TPCH_TABLE"] + + conn = { + "host": f"http://{os.environ['CLICKHOUSE_HOST']}:8123", # 8123 is default clickhouse http port + "database": os.environ["CLICKHOUSE_DB"], + "user": os.environ["CLICKHOUSE_USER"], + "password": os.environ["CLICKHOUSE_PASSWORD"], + } + print(conn) + + with Timer() as timer: + df = read_clickhouse(f'SELECT * FROM {conn["database"]}.{table}', index_col=index_col, connection=conn) + print(f"[Total] {timer.elapsed:.2f}s") + + print(df.head()) + print(df.tail()) + print(len(df)) \ No newline at end of file diff --git a/benchmarks/tpch-pandas-chunk.py b/benchmarks/tpch-pandas-chunk.py new file mode 100644 index 0000000..d51c4e2 --- /dev/null +++ b/benchmarks/tpch-pandas-chunk.py @@ -0,0 +1,63 @@ +""" +Usage: + tpch-pandas-chunk.py [--conn=] [--csize=] [--driver=] + +Options: + --conn= The connection url to use [default: POSTGRES]. + --csize= Chunk size [default: 1000]. + --driver= The driver to use using sqlalchemy: https://docs.sqlalchemy.org/en/14/core/engines.html. + -h --help Show this screen. + --version Show version. +""" + +import os +from contexttimer import Timer +from docopt import docopt +import pandas as pd +from sqlalchemy import create_engine +from sqlalchemy.engine.url import make_url +import time + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ[args["--conn"]] + chunksize = int(args["--csize"]) + driver = args.get("--driver", None) + conn = make_url(conn) + if driver is not None: + conn = conn.set(drivername=driver) + if conn.drivername == "sqlite": + conn = conn.set(database="/" + conn.database) + print(f"chunksize: {chunksize}, conn url: {str(conn)}") + + with Timer() as timer: + engine = create_engine(conn) + conn = engine.connect().execution_options( + stream_results=True, max_row_buffer=chunksize) + dfs = [] + with Timer() as stream_timer: + for df in pd.read_sql("SELECT * FROM lineitem", + conn, parse_dates=[ + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE",], chunksize=chunksize): + dfs.append(df) + print(f"time iterate batches: {stream_timer.elapsed}") + df = pd.concat(dfs) + print(f"time in total: {timer.elapsed}s") + time.sleep(3) # capture peak memory + + conn.close() + print(df) + print(df.info(memory_usage="deep")) + # print(df._data.blocks) + + # print("======") + # print(len(dfs)) + # for d in dfs: + # print(d.info(memory_usage="deep")) + # print(d._data.blocks) + # break diff --git a/benchmarks/tpch-pandas.py b/benchmarks/tpch-pandas.py new file mode 100644 index 0000000..408790d --- /dev/null +++ b/benchmarks/tpch-pandas.py @@ -0,0 +1,67 @@ +""" +Usage: + tpch-pandas.py [--conn=] [--driver=] + +Options: + --conn= The connection url to use [default: POSTGRES_URL]. + --driver= The driver to use using sqlalchemy: https://docs.sqlalchemy.org/en/14/core/engines.html. + -h --help Show this screen. + --version Show version. 
+ +Drivers: + PostgreSQL: postgresql, postgresql+psycopg2 + MySQL: mysql, mysql+mysqldb, mysql+pymysql + Redshift: postgresql, redshift, redshift+psycopg2 + +""" + +import os + +from contexttimer import Timer +from sqlalchemy import create_engine +from docopt import docopt +import pandas as pd +import sqlite3 +from clickhouse_driver import connect +from sqlalchemy.engine.url import make_url + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + table = os.environ["TPCH_TABLE"] + driver = args.get("--driver", None) + conn = os.environ[args["--conn"]] + conn = make_url(conn) + + if conn.drivername == "sqlite": + conn = sqlite3.connect(str(conn)[9:]) + elif driver == "clickhouse": + # clickhouse-driver uses native protocol: 9000 + conn = conn.set(drivername=driver, port=9000) + conn = connect(str(conn)) + else: # go with sqlalchemy + if driver is not None: + conn = conn.set(drivername=driver) + print(f"conn url: {str(conn)}") + engine = create_engine(conn) + conn = engine.connect() + + with Timer() as timer: + df = pd.read_sql( + f"SELECT * FROM {table}", + conn, + parse_dates=[ + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + ], + ) + print(f"[Total] {timer.elapsed:.2f}s") + conn.close() + + print(df.head()) + print(df.tail()) + print(len(df)) + print(df.dtypes) diff --git a/benchmarks/tpch-presto.py b/benchmarks/tpch-presto.py new file mode 100644 index 0000000..df348ea --- /dev/null +++ b/benchmarks/tpch-presto.py @@ -0,0 +1,79 @@ +""" +Usage: + tpch-presto.py [--protocol=<protocol>] + +Options: + --protocol=<protocol> The protocol to use [default: prestodb]. + -h --help Show this screen. + --version Show version. +""" +import os + +from docopt import docopt +import prestodb +from pyhive import presto +from sqlalchemy.engine import create_engine +import pandas as pd +from contexttimer import Timer + +if __name__ == "__main__": + args = docopt(__doc__, version="Naval Fate 2.0") + proto = args["--protocol"] + table = os.environ["TPCH_TABLE"] + + if proto == "prestodb": + conn = prestodb.dbapi.connect( + host=os.environ["PRESTO_HOST"], + port=int(os.environ["PRESTO_PORT"]), + user=os.environ["PRESTO_USER"], + catalog=os.environ["PRESTO_CATALOG"], + schema=os.environ["PRESTO_SCHEMA"], + ) + cur = conn.cursor() + with Timer() as timer: + cur.execute(f'SELECT * FROM {table}') + rows = cur.fetchall() + print(f"fetch all: {timer.elapsed:.2f}") + + with Timer() as timer: + df = pd.DataFrame(rows) + print(f"to df: {timer.elapsed:.2f}") + + elif proto == "pyhive-pd": + connection = presto.connect( + host=os.environ["PRESTO_HOST"], + port=int(os.environ["PRESTO_PORT"]), + username=os.environ["PRESTO_USER"], + catalog=os.environ["PRESTO_CATALOG"], + schema=os.environ["PRESTO_SCHEMA"], + ) + + with Timer() as timer: + df = pd.read_sql("select * from lineitem", connection) + print(f"Time in total: {timer.elapsed:.2f}") + elif proto == "pyhive": + connection = presto.connect( + host=os.environ["PRESTO_HOST"], + port=int(os.environ["PRESTO_PORT"]), + username=os.environ["PRESTO_USER"], + catalog=os.environ["PRESTO_CATALOG"], + schema=os.environ["PRESTO_SCHEMA"], + ) + cur = connection.cursor() + with Timer() as timer: + cur.execute(f'SELECT * FROM {table}') + rows = cur.fetchall() + print(f"fetch all: {timer.elapsed:.2f}") + + with Timer() as timer: + df = pd.DataFrame(rows) + print(f"to df: {timer.elapsed:.2f}") + elif proto == "sqlalchemy": + engine = create_engine(f'presto://{os.environ["PRESTO_USER"]}@{os.environ["PRESTO_HOST"]}:{os.environ["PRESTO_PORT"]}/{os.environ["PRESTO_CATALOG"]}/{os.environ["PRESTO_SCHEMA"]}') + conn = engine.connect() + with Timer() as timer: + df = pd.read_sql(f"SELECT * FROM {table}", conn) + print(f"Time in total: {timer.elapsed:.2f}") + + print(df.head()) + print(len(df)) diff --git a/benchmarks/tpch-pyarrow-p.py b/benchmarks/tpch-pyarrow-p.py new file mode 100644 index 0000000..0578ff7 --- /dev/null +++ b/benchmarks/tpch-pyarrow-p.py @@ -0,0 +1,94 @@ +""" +Usage: + tpch-pyarrow-p.py <num> + +Options: + -h --help Show this screen. + --version Show version. +""" +import io +import itertools +import os +from multiprocessing import Pool +from typing import Any, List + +import numpy as np +import pyarrow as pa +from contexttimer import Timer +from docopt import docopt +from pyarrow import csv +from sqlalchemy import create_engine + + +def get_sqls(table: str, count: int) -> List[str]: + sqls = [] + split = np.linspace(0, 60000000, num=count + 1, endpoint=True, dtype=int) + for i in range(len(split) - 1): + + sqls.append( + f"""SELECT + l_orderkey, + l_partkey, + l_suppkey, + l_linenumber, + l_quantity::float8, + l_extendedprice::float8, + l_discount::float8, + l_tax::float8, + l_returnflag, + l_linestatus, + l_shipdate, + l_commitdate, + l_receiptdate, + l_shipinstruct, + l_shipmode, + l_comment + FROM {table} + WHERE l_orderkey > {split[i]} and l_orderkey <= {split[i+1]}""" + ) + return sqls + + +def func(id: int, conn: str, query: str) -> Any: + engine = create_engine(conn) + conn = engine.connect() + cur = conn.connection.cursor() + store = io.BytesIO() + + with Timer() as timer: + cur.copy_expert(f"COPY ({query}) TO STDOUT WITH CSV HEADER;", store) + print(f"[Copy {id}] {timer.elapsed:.2f}s") + + store.seek(0) + with Timer() as timer: + df = csv.read_csv(store, read_options=csv.ReadOptions(use_threads=False)) + print(f"[Read CSV {id}] {timer.elapsed:.2f}s") + + return df + + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ["POSTGRES_URL"] + table = os.environ["POSTGRES_TABLE"] + + queries = get_sqls(table, int(args["<num>"])) + + print(f"number of threads: {len(queries)}\nsqls: {queries}") + + with Timer() as timer, Pool(len(queries)) as pool: + dfs = pool.starmap( + func, zip(range(len(queries)), itertools.repeat(conn), queries) + ) + + print(f"[All Jobs] {timer.elapsed:.2f}s") + + with Timer() as timer: + df = pa.concat_tables(dfs) + print(f"[Concat] {timer.elapsed:.2f}s") + + with Timer() as timer: + df = df.to_pandas() + print(f"[To Pandas] {timer.elapsed:.2f}s") + + print(df.head()) diff --git a/benchmarks/tpch-pyarrow.py b/benchmarks/tpch-pyarrow.py new file mode 100644 index 0000000..66121da --- /dev/null +++ b/benchmarks/tpch-pyarrow.py @@ -0,0 +1,46 @@ +""" +Usage: + tpch-pyarrow.py + +Options: + -h --help Show this screen. + --version Show version.
+""" +import io +import os + +from contexttimer import Timer +from pyarrow import csv +from sqlalchemy import create_engine +from docopt import docopt + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ["POSTGRES_URL"] + table = os.environ["POSTGRES_TABLE"] + + engine = create_engine(conn) + conn = engine.connect() + + cur = conn.connection.cursor() + store = io.BytesIO() + with Timer() as timer: + cur.copy_expert( + f"COPY (SELECT * FROM {table}) TO STDOUT WITH CSV HEADER;", store + ) + print(f"[Copy] {timer.elapsed:.2f}s") + + store.seek(0) + + with Timer() as timer: + df = csv.read_csv(store, read_options=csv.ReadOptions(use_threads=False)) + print(f"[Read CSV] {timer.elapsed:.2f}s") + + with Timer() as timer: + df = df.to_pandas() + print(f"[To Pandas] {timer.elapsed:.2f}s") + + conn.close() + print(df.head()) + # _, peak = tracemalloc.get_traced_memory() + # print(f"memory peak: {peak/10**9:.2f}G") diff --git a/benchmarks/tpch-queries-cx.py b/benchmarks/tpch-queries-cx.py new file mode 100644 index 0000000..2c3f83d --- /dev/null +++ b/benchmarks/tpch-queries-cx.py @@ -0,0 +1,71 @@ +""" +Usage: + tpch-queries-cx.py [--conn=] [--ret=] [--part=] [--protocol=] [--force-parallel] + +Options: + --ret= The return type [default: pandas]. + --conn= The connection url to use [default: POSTGRES_URL]. + --part= The number of partitions to use [default: 1]. + --protocol= The protocol to use [default: binary]. + --force-parallel Force parallelism by setting variables + -h --help Show this screen. + --version Show version. + """ + +import os + +from pathlib import Path +from contexttimer import Timer +from docopt import docopt +import connectorx as cx + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ[args["--conn"]] + print(f"conn url: {conn}") + + ret = args["--ret"] + print(f"return type: {ret}") + + qid = args[""] + print(f"execute query id: {qid}") + + part = int(args["--part"]) + print(f"# partitions: {part}") + + # multi_access_plan = "force_parallel" if args["--force-parallel"] else "default" + # print(f"plan: {multi_access_plan}") + + if part > 1: + qdir = Path(f"{os.environ['TPCH_QUERIES']}_part", f"q{qid}.sql") + with open(qdir, "r") as f: + part_col = f.readline()[:-1] # first line is partition key, remove last '\n' + query = f.read() + else: + qdir = Path(os.environ["TPCH_QUERIES"], f"q{qid}.sql") + with open(qdir, "r") as f: + part_col = "" + query = f.read() + print(f"load query from: {qdir}") + print(f"query: {query}") + print(f"partition on : {part_col}") + query = query.replace("%", "%%") + + with Timer() as timer: + if ret == "pandas": + if part > 1: + # df = cx.read_sql(conn, query, partition_on=part_col, partition_num=part, protocol=args["--protocol"], multi_access_plan=multi_access_plan) + df = cx.read_sql(conn, query, partition_on=part_col, partition_num=part, protocol=args["--protocol"]) + else: + df = cx.read_sql(conn, query, protocol=args["--protocol"]) + elif ret == "arrow": + if part > 1: + # table = cx.read_sql(conn, query, return_type="arrow", partition_on=part_col, partition_num=part, protocol=args["--protocol"], multi_access_plan=multi_access_plan) + table = cx.read_sql(conn, query, return_type="arrow", partition_on=part_col, partition_num=part, protocol=args["--protocol"]) + else: + table = cx.read_sql(conn, query, return_type="arrow", protocol=args["--protocol"]) + print(f"get arrow table time: {timer.elapsed:.2f}s") + df = table.to_pandas(split_blocks=False, date_as_object=False) + 
print(f"[cx][QID: {qid} Total] {timer.elapsed:.2f}s") + + print(df) diff --git a/benchmarks/tpch-queries-pd.py b/benchmarks/tpch-queries-pd.py new file mode 100644 index 0000000..e018f5c --- /dev/null +++ b/benchmarks/tpch-queries-pd.py @@ -0,0 +1,43 @@ +""" +Usage: + tpch-queries-pd.py [--conn=] + +Options: + --conn= The connection url to use [default: POSTGRES_URL]. + -h --help Show this screen. + --version Show version. + """ + +import os + +from pathlib import Path +from contexttimer import Timer +from sqlalchemy import create_engine +from docopt import docopt +import pandas as pd + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ[args["--conn"]] + print(f"conn url: {conn}") + engine = create_engine(conn) + conn = engine.connect() + + qid = args[""] + print(f"execute query id: {qid}") + + qdir = Path(os.environ["TPCH_QUERIES"], f"q{qid}.sql") + print(f"load query from: {qdir}") + + with open(qdir, "r") as f: + query = f.read() + print(f"query: {query}") + query = query.replace("%", "%%") + + with Timer() as timer: + df = pd.read_sql(query, conn) + print(f"[pd][QID: {qid} Total] {timer.elapsed:.2f}s") + + conn.close() + print(df) + print(f"result size: {len(df)}x{len(df.columns)}") diff --git a/benchmarks/tpch-rust-arrow.py b/benchmarks/tpch-rust-arrow.py new file mode 100644 index 0000000..8d0a5d4 --- /dev/null +++ b/benchmarks/tpch-rust-arrow.py @@ -0,0 +1,155 @@ +""" +Usage: + tpch-rust-arrow.py + +Options: + -h --help Show this screen. + --version Show version. +""" +import json +import os +import time +from typing import List + +import numpy as np +import pyarrow as pa +from connectorx import read_pg +from docopt import docopt + + +def get_sqls(table: str, count: int) -> List[str]: + sqls = [] + split = np.linspace(0, 60000000, num=count + 1, endpoint=True, dtype=int) + for i in range(len(split) - 1): + + sqls.append( + f"""select l_orderkey, + l_partkey, + l_suppkey, + l_linenumber, + l_quantity::float8, + l_extendedprice::float8, + l_discount::float8, + l_tax::float8, + l_returnflag, + l_linestatus, + l_shipdate, + l_commitdate, + l_receiptdate, + l_shipinstruct, + l_shipmode, + l_comment from {table} where l_orderkey > {split[i]} and l_orderkey <= {split[i+1]}""" + ) + return sqls + + +def field_to_json(field): + json = { + "name": field.name, + "nullable": field.nullable, + } + if isinstance(field.type, pa.ListType): + json = { + **json, + "type": {"name": "list"}, + "children": [field_to_json(field.type.value_field)], + } + elif field.type == pa.float64(): + json = { + **json, + "type": {"name": "floatingpoint", "precision": "DOUBLE"}, + "children": [], + } + elif field.type == pa.uint64(): + json = { + **json, + "type": {"name": "int", "bitWidth": 64, "isSigned": False}, + "children": [], + } + elif field.type == pa.string(): + json = { + **json, + "type": {"name": "utf8"}, + "children": [], + } + elif field.type == pa.date32(): + json = { + **json, + "type": {"name": "date", "unit": "DAY"}, + "children": [], + } + elif isinstance(field.type, pa.StructType): + json = { + **json, + "type": {"name": "struct"}, + "children": [ + field_to_json(field.type[i]) for i in range(field.type.num_fields) + ], + } + else: + raise NotImplementedError(field.type) + + return json + + +def schema_to_json(schema): + return { + "fields": [field_to_json(schema.field(name)) for name in schema.names], + "metadata": {}, + } + + +SCHEMA = pa.schema( + [ + pa.field("l_orderkey", pa.uint64(), False), + pa.field("l_partkey", pa.uint64(), False), + 
pa.field("l_suppkey", pa.uint64(), False), + pa.field("l_linenumber", pa.uint64(), False), + pa.field("l_quantity", pa.float64(), False), + pa.field("l_extendedprice", pa.float64(), False), + pa.field("l_discount", pa.float64(), False), + pa.field("l_tax", pa.float64(), False), + pa.field("l_returnflag", pa.string(), False), + pa.field("l_linestatus", pa.string(), False), + # pa.field("l_shipdate", pa.date32(), False), + # pa.field("l_commitdate", pa.date32(), False), + # pa.field("l_receiptdate", pa.date32(), False), + pa.field("l_shipdate", pa.string(), False), + pa.field("l_commitdate", pa.string(), False), + pa.field("l_receiptdate", pa.string(), False), + pa.field("l_shipinstruct", pa.string(), False), + pa.field("l_shipmode", pa.string(), False), + pa.field("l_comment", pa.string(), False), + ] +) + + +if __name__ == "__main__": + args = docopt(__doc__, version="1.0") + conn = os.environ["POSTGRES_URL"] + table = os.environ["POSTGRES_TABLE"] + + queries = get_sqls(table, int(args[""])) + + print(f"numer of threads: {int(args[''])}\nsqls: {queries}") + + then = time.time() + table = read_pg( + conn, + queries, + json.dumps(schema_to_json(SCHEMA)), + ) + print(f"finish read_pg:", time.time() - then) + + tb = pa.Table.from_arrays( + [ + pa.chunked_array([pa.Array._import_from_c(*ptr) for ptr in ptrs]) + for ptrs in table.values() + ], + names=list(table.keys()), + ) + print("finish concat:", time.time() - then) + + df = tb.to_pandas() + print("finish to_pandas:", time.time() - then) + print(df) diff --git a/benchmarks/tpch-turbodbc.py b/benchmarks/tpch-turbodbc.py new file mode 100644 index 0000000..745a6f8 --- /dev/null +++ b/benchmarks/tpch-turbodbc.py @@ -0,0 +1,62 @@ +""" +Usage: + tpch-turbodbc.py [--driver=] [--ret=] + +Options: + --driver= ODBC driver to use [default: PostgreSQL]. + --ret= The return type [default: pandas-numpy]. + -h --help Show this screen. + --version Show version. 
+ +""" + +import os + +from docopt import docopt +from turbodbc import connect, make_options +import pandas as pd +from contexttimer import Timer + +if __name__ == "__main__": + args = docopt(__doc__, version="Naval Fate 2.0") + table = os.environ["TPCH_TABLE"] + driver = args["--driver"] + ret = args["--ret"] + query = f"SELECT * FROM {table}" + + with Timer() as gtimer: + with Timer() as timer: + if driver == "MSSQL": + options = make_options(prefer_unicode=True) + connection = connect( + dsn=driver, uid=os.environ["MSSQL_USER"], pwd=os.environ["MSSQL_PASSWORD"], turbodbc_options=options) + else: + connection = connect(dsn=driver) + cursor = connection.cursor() + print(f"connect: {timer.elapsed}") + with Timer() as timer: + cursor.execute(query) + print(f"execute: {timer.elapsed}") + if ret == "pandas-numpy": + with Timer() as timer: + data = cursor.fetchallnumpy() + print(f"fetchallnumpy: {timer.elapsed}") + with Timer() as timer: + df = pd.DataFrame(data=data) + print(f"convert to pandas: {timer.elapsed}") + elif ret == "pandas-arrow": + with Timer() as timer: + data = cursor.fetchallarrow() + print(f"fetchallarrow: {timer.elapsed}") + with Timer() as timer: + # to be fair with other benchmarks, generate consolidate blocks and convert date + df = data.to_pandas(split_blocks=False, date_as_object=False) + print(f"convert to pandas: {timer.elapsed}") + else: + assert ret == "arrow" + with Timer() as timer: + df = cursor.fetchallarrow() + print(f"fetchallarrow: {timer.elapsed}") + + print(f"time in total: {gtimer.elapsed}") + print(df) diff --git a/connectorx-cpp/Cargo.toml b/connectorx-cpp/Cargo.toml new file mode 100644 index 0000000..05944b4 --- /dev/null +++ b/connectorx-cpp/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "connectorx-cpp" +version = "0.3.3-alpha.1" +edition = "2021" +license = "MIT" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +libc = "0.2" +connectorx = {path = "../connectorx", default-features = false} +arrow = {workspace = true} +openssl = {version = "0.10", features = ["vendored"]} + +[lib] +crate-type = ["cdylib"] +name = "connectorx_cpp" + +[features] +default = ["fptr", "nbstr", "dsts", "srcs", "federation"] +light = ["fptr", "nbstr", "dsts_light" , "srcs_light", "federation"] +srcs_light = ["connectorx/src_postgres"] +dsts_light = ["connectorx/dst_arrow"] + +dsts = ["connectorx/dst_arrow", "connectorx/dst_arrow2"] +fptr = ["connectorx/fptr"] +branch = ["connectorx/branch"] +federation = ["connectorx/federation"] +nbstr = [] +srcs = [ + "connectorx/src_postgres", + "connectorx/src_mysql", + "connectorx/src_sqlite", + "connectorx/src_mssql", + "connectorx/src_oracle", + "connectorx/src_bigquery", +] diff --git a/connectorx-cpp/src/lib.rs b/connectorx-cpp/src/lib.rs new file mode 100644 index 0000000..1147a97 --- /dev/null +++ b/connectorx-cpp/src/lib.rs @@ -0,0 +1,365 @@ +mod plan; + +use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; +use connectorx::prelude::*; +use libc::c_char; +use std::collections::HashMap; +use std::convert::TryFrom; +use std::env; +use std::ffi::{CStr, CString}; +use std::sync::Arc; + +#[repr(C)] +pub struct CXSlice { + ptr: *const T, + len: usize, + capacity: usize, +} + +impl CXSlice { + pub fn new_from_vec(v: Vec) -> Self { + // If `Vec::into_raw_parts` becomes stable, can directly change to: + // let (ptr, len, capacity) = v.into_raw_parts(); + // Self {ptr, len, capacity} + + let slice = Self { + ptr: v.as_ptr(), + len: v.len(), + capacity: v.capacity(), + }; + 
std::mem::forget(v); + slice + } +} + +#[repr(C)] +pub struct CXTable { + name: *const c_char, + columns: CXSlice<*const c_char>, +} + +#[repr(C)] +pub struct CXConnectionInfo { + name: *const c_char, + conn: *const c_char, + schema: CXSlice<CXTable>, + is_local: bool, + jdbc_url: *const c_char, + jdbc_driver: *const c_char, +} + +#[repr(C)] +pub struct CXFederatedPlan { + db_name: *const c_char, + db_alias: *const c_char, + sql: *const c_char, + cardinality: usize, +} + +#[cfg(feature = "federation")] +#[no_mangle] +pub unsafe extern "C" fn free_plans(res: *const CXSlice<CXFederatedPlan>) { + let plans = get_vec::<_>((*res).ptr, (*res).len, (*res).capacity); + plans.into_iter().for_each(|plan| { + free_str(plan.db_name); + free_str(plan.db_alias); + free_str(plan.sql); + }); +} + +#[no_mangle] +pub unsafe extern "C" fn connectorx_rewrite( + conn_list: *const CXSlice<CXConnectionInfo>, + query: *const c_char, +) -> CXSlice<CXFederatedPlan> { + let mut db_map = HashMap::new(); + let conn_slice = unsafe { std::slice::from_raw_parts((*conn_list).ptr, (*conn_list).len) }; + for p in conn_slice { + let name = unsafe { CStr::from_ptr(p.name) }.to_str().unwrap(); + if p.conn.is_null() { + let mut table_map: HashMap<String, Vec<String>> = HashMap::new(); + let table_slice = unsafe { std::slice::from_raw_parts(p.schema.ptr, p.schema.len) }; + for t in table_slice { + let table_name = unsafe { CStr::from_ptr(t.name) }.to_str().unwrap(); + // println!("raw table name: {:?}", table_name); + let column_slice = + unsafe { std::slice::from_raw_parts(t.columns.ptr, t.columns.len) }; + + let mut column_names = vec![]; + for &c in column_slice { + let column_name = unsafe { CStr::from_ptr(c).to_str().unwrap() }; + column_names.push(column_name.to_string()); + } + table_map.insert(table_name.to_string(), column_names); + } + let source_info = + FederatedDataSourceInfo::new_from_manual_schema(table_map, p.is_local); + db_map.insert(name.to_string(), source_info); + } else { + let conn = unsafe { CStr::from_ptr(p.conn) }.to_str().unwrap(); + let jdbc_url = match p.jdbc_url.is_null() { + true => "", + false => unsafe { CStr::from_ptr(p.jdbc_url) }.to_str().unwrap(), + }; + let jdbc_driver = match p.jdbc_driver.is_null() { + true => "", + false => unsafe { CStr::from_ptr(p.jdbc_driver) }.to_str().unwrap(), + }; + // println!("name: {:?}, conn: {:?}", name, conn); + let source_info = FederatedDataSourceInfo::new_from_conn_str( + SourceConn::try_from(conn).unwrap(), + p.is_local, + jdbc_url, + jdbc_driver, + ); + db_map.insert(name.to_string(), source_info); + } + } + + let query_str = unsafe { CStr::from_ptr(query) }.to_str().unwrap(); + let j4rs_base = match env::var("CX_LIB_PATH") { + Ok(val) => Some(val), + Err(_) => None, + }; + // println!("j4rs_base: {:?}", j4rs_base); + let fed_plan: Vec<CXFederatedPlan> = rewrite_sql(query_str, &db_map, j4rs_base.as_deref()) + .unwrap() + .into_iter() + .map(|p| p.into()) + .collect(); + + CXSlice::<_>::new_from_vec(fed_plan) +} + +#[repr(C)] +pub struct CXArray { + array: *const FFI_ArrowArray, + schema: *const FFI_ArrowSchema, +} + +#[repr(C)] +pub struct CXResult { + data: CXSlice<CXSlice<CXArray>>, + header: CXSlice<*const c_char>, +} + +pub unsafe fn get_vec<T>(ptr: *const T, len: usize, capacity: usize) -> Vec<T> { + Vec::from_raw_parts(ptr as *mut T, len, capacity) +} + +pub unsafe fn free_str(ptr: *const c_char) { + let _ = CString::from_raw(ptr as *mut _); +} + +#[no_mangle] +pub unsafe extern "C" fn free_result(res: *const CXResult) { + let header = get_vec::<_>((*res).header.ptr, (*res).header.len, (*res).header.capacity); + header.into_iter().for_each(|col| free_str(col)); + +
+    let rbs = get_vec::<_>((*res).data.ptr, (*res).data.len, (*res).data.capacity);
+    rbs.into_iter().for_each(|rb| {
+        get_vec::<_>(rb.ptr, rb.len, rb.capacity)
+            .into_iter()
+            .for_each(|a| {
+                // Otherwise memory leak
+                std::sync::Arc::from_raw(a.array);
+                std::sync::Arc::from_raw(a.schema);
+            })
+    });
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn connectorx_scan(conn: *const c_char, query: *const c_char) -> CXResult {
+    let conn_str = unsafe { CStr::from_ptr(conn) }.to_str().unwrap();
+    let query_str = unsafe { CStr::from_ptr(query) }.to_str().unwrap();
+    let source_conn = SourceConn::try_from(conn_str).unwrap();
+    let record_batches = get_arrow(&source_conn, None, &[CXQuery::from(query_str)])
+        .unwrap()
+        .arrow()
+        .unwrap();
+
+    // arrow::util::pretty::print_batches(&record_batches[..]).unwrap();
+
+    let names: Vec<*const c_char> = record_batches[0]
+        .schema()
+        .fields()
+        .iter()
+        .map(|f| {
+            CString::new(f.name().as_str())
+                .expect("new CString error")
+                .into_raw() as *const c_char
+        })
+        .collect();
+
+    let mut result = vec![];
+    for rb in record_batches {
+        let mut cols = vec![];
+
+        for array in rb.columns() {
+            let data = array.to_data();
+            let array = Arc::new(FFI_ArrowArray::new(&data));
+            let schema = Arc::new(
+                arrow::ffi::FFI_ArrowSchema::try_from(data.data_type()).expect("export schema c"),
+            );
+            let array_ptr = Arc::into_raw(array);
+            let schema_ptr = Arc::into_raw(schema);
+
+            let cx_array = CXArray {
+                array: array_ptr,
+                schema: schema_ptr,
+            };
+            cols.push(cx_array);
+        }
+
+        let cx_rb = CXSlice::<CXArray>::new_from_vec(cols);
+        result.push(cx_rb);
+    }
+
+    let res = CXResult {
+        data: CXSlice::<_>::new_from_vec(result),
+        header: CXSlice::<_>::new_from_vec(names),
+    };
+
+    res
+}
+
+#[repr(C)]
+pub struct CXSchema {
+    types: CXSlice<CXArray>,
+    headers: CXSlice<*const c_char>,
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn free_iter(iter: *mut Box<dyn RecordBatchIterator>) {
+    let _ = Box::from_raw(iter);
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn free_schema(schema: *mut CXSchema) {
+    let res = Box::from_raw(schema);
+
+    let header = get_vec::<_>(res.headers.ptr, res.headers.len, res.headers.capacity);
+    header.into_iter().for_each(|col| free_str(col));
+
+    get_vec::<_>(res.types.ptr, res.types.len, res.types.capacity)
+        .into_iter()
+        .for_each(|a| {
+            std::sync::Arc::from_raw(a.array);
+            std::sync::Arc::from_raw(a.schema);
+        });
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn free_record_batch(rb: *mut CXSlice<CXArray>) {
+    let slice = Box::from_raw(rb);
+    get_vec::<_>(slice.ptr, slice.len, slice.capacity)
+        .into_iter()
+        .for_each(|a| {
+            std::sync::Arc::from_raw(a.array);
+            std::sync::Arc::from_raw(a.schema);
+        })
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn connectorx_scan_iter(
+    conn: *const c_char,
+    queries: *const CXSlice<*const c_char>,
+    batch_size: usize,
+) -> *mut Box<dyn RecordBatchIterator> {
+    let conn_str = unsafe { CStr::from_ptr(conn) }.to_str().unwrap();
+    let source_conn = SourceConn::try_from(conn_str).unwrap();
+
+    let query_slice = unsafe { std::slice::from_raw_parts((*queries).ptr, (*queries).len) };
+
+    let mut query_vec = vec![];
+    for &q in query_slice {
+        let query = unsafe { CStr::from_ptr(q).to_str().unwrap() };
+        query_vec.push(CXQuery::from(query));
+    }
+
+    let arrow_iter: Box<dyn RecordBatchIterator> =
+        new_record_batch_iter(&source_conn, None, query_vec.as_slice(), batch_size);
+
+    Box::into_raw(Box::new(arrow_iter))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn connectorx_get_schema(
+    iter: *mut Box<dyn RecordBatchIterator>,
+) -> *mut CXSchema {
+    let arrow_iter = unsafe { &*iter };
+    let (empty_batch, names) = arrow_iter.get_schema();
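+    // Export each (empty) column through the Arrow C Data Interface; the raw
+    // Arc pointers produced here stay alive until `free_schema` reclaims them.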
+    let mut cols = vec![];
+    for array in empty_batch.columns() {
+        let data = array.to_data();
+        let array = Arc::new(arrow::ffi::FFI_ArrowArray::new(&data));
+        let schema = Arc::new(
+            arrow::ffi::FFI_ArrowSchema::try_from(data.data_type()).expect("export schema c"),
+        );
+        let array_ptr = Arc::into_raw(array);
+        let schema_ptr = Arc::into_raw(schema);
+        let cx_array = CXArray {
+            array: array_ptr,
+            schema: schema_ptr,
+        };
+        cols.push(cx_array);
+    }
+
+    let names: Vec<*const c_char> = names
+        .iter()
+        .map(|name| {
+            CString::new(name.as_str())
+                .expect("new CString error")
+                .into_raw() as *const c_char
+        })
+        .collect();
+
+    let res = Box::new(CXSchema {
+        types: CXSlice::<_>::new_from_vec(cols),
+        headers: CXSlice::<_>::new_from_vec(names),
+    });
+
+    Box::into_raw(res)
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn connectorx_prepare(iter: *mut Box<dyn RecordBatchIterator>) {
+    let arrow_iter = unsafe { &mut *iter };
+    arrow_iter.prepare();
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn connectorx_iter_next(
+    iter: *mut Box<dyn RecordBatchIterator>,
+) -> *mut CXSlice<CXArray> {
+    let arrow_iter = unsafe { &mut *iter };
+    match arrow_iter.next_batch() {
+        Some(rb) => {
+            let mut cols = vec![];
+
+            for array in rb.columns() {
+                let data = array.to_data();
+                let array = Arc::new(arrow::ffi::FFI_ArrowArray::new(&data));
+                let schema =
+                    Arc::new(FFI_ArrowSchema::try_from(data.data_type()).expect("export schema c"));
+                let array_ptr = Arc::into_raw(array);
+                let schema_ptr = Arc::into_raw(schema);
+
+                let cx_array = CXArray {
+                    array: array_ptr,
+                    schema: schema_ptr,
+                };
+                cols.push(cx_array);
+            }
+
+            let cx_rb = Box::new(CXSlice::<CXArray>::new_from_vec(cols));
+            Box::into_raw(cx_rb)
+        }
+        None => std::ptr::null_mut(),
+    }
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn connectorx_set_thread_num(num: usize) {
+    set_global_num_thread(num);
+}
diff --git a/connectorx-cpp/src/plan.rs b/connectorx-cpp/src/plan.rs
new file mode 100644
index 0000000..b1baf33
--- /dev/null
+++ b/connectorx-cpp/src/plan.rs
@@ -0,0 +1,22 @@
+use crate::CXFederatedPlan;
+use connectorx::fed_rewriter::Plan;
+use libc::c_char;
+use std::convert::Into;
+use std::ffi::CString;
+
+impl Into<CXFederatedPlan> for Plan {
+    fn into(self) -> CXFederatedPlan {
+        CXFederatedPlan {
+            db_name: CString::new(self.db_name.as_str())
+                .expect("new CString error")
+                .into_raw() as *const c_char,
+            db_alias: CString::new(self.db_alias.as_str())
+                .expect("new CString error")
+                .into_raw() as *const c_char,
+            sql: CString::new(self.sql.as_str())
+                .expect("new CString error")
+                .into_raw() as *const c_char,
+            cardinality: self.cardinality,
+        }
+    }
+}
diff --git a/connectorx-python/.cargo/config b/connectorx-python/.cargo/config
new file mode 100644
index 0000000..59c989e
--- /dev/null
+++ b/connectorx-python/.cargo/config
@@ -0,0 +1,11 @@
+[target.x86_64-apple-darwin]
+rustflags = [
+    "-C", "link-arg=-undefined",
+    "-C", "link-arg=dynamic_lookup",
+]
+
+[target.aarch64-apple-darwin]
+rustflags = [
+    "-C", "link-arg=-undefined",
+    "-C", "link-arg=dynamic_lookup",
+]
\ No newline at end of file
diff --git a/connectorx-python/Cargo.lock b/connectorx-python/Cargo.lock
new file mode 100644
index 0000000..50171df
--- /dev/null
+++ b/connectorx-python/Cargo.lock
@@ -0,0 +1,5977 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3 + +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom 0.2.10", + "once_cell", + "version_check", +] + +[[package]] +name = "ahash" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +dependencies = [ + "cfg-if 1.0.0", + "const-random", + "getrandom 0.2.10", + "once_cell", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + +[[package]] +name = "anyhow" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" + +[[package]] +name = "argminmax" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "202108b46429b765ef483f8a24d5c46f48c14acfdacc086dd4ab6dddf6bcdbd2" +dependencies = [ + "num-traits", +] + +[[package]] +name = "array-init-cursor" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" + +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + +[[package]] +name = "arrow" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6619cab21a0cdd8c9b9f1d9e09bfaa9b1974e5ef809a6566aef0b998caf38ace" +dependencies = [ + "ahash 0.8.3", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0dc95485623a76e00929bda8caa40c1f838190952365c4f43a7b9ae86d03e94" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half 2.3.1", + "num", +] + +[[package]] +name = "arrow-array" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3267847f53d3042473cfd2c769afd8d74a6d7d201fc3a34f5cb84c0282ef47a7" +dependencies = [ + "ahash 0.8.3", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half 2.3.1", + "hashbrown 0.13.2", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f66553e66e120ac4b21570368ee9ebf35ff3f5399f872b0667699e145678f5" +dependencies = [ + "half 2.3.1", + "num", +] + +[[package]] +name = "arrow-cast" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65e6f3579dbf0d97c683d451b2550062b0f0e62a3169bf74238b5f59f44ad6d8" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "chrono", + "comfy-table 6.2.0", + "lexical-core", + "num", +] + +[[package]] +name = "arrow-csv" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373579c4c1a8f5307d3125b7a89c700fcf8caf85821c77eb4baab3855ae0aba5" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "lexical-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61bc8df9912cca6642665fdf989d6fa0de2570f18a7f709bcf59d29de96d2097" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half 2.3.1", + "num", +] + +[[package]] +name = "arrow-format" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07884ea216994cdc32a2d5f8274a8bee979cfe90274b83f86f440866ee3132c7" +dependencies = [ + "planus", + "serde", +] + +[[package]] +name = "arrow-ipc" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0105dcf5f91daa7182d87b713ee0b32b3bfc88e0c48e7dc3e9d6f1277a07d1ae" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e73134fb5b5ec8770f8cbb214c2c487b2d350081e403ca4eeeb6f8f5e19846ac" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half 2.3.1", + "indexmap 1.9.3", + "lexical-core", + "num", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-ord" +version = "40.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "89f25bc66e18d4c2aa1fe2f9bb03e2269da60e636213210385ae41a107f9965a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half 2.3.1", + "num", +] + +[[package]] +name = "arrow-row" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1095ff85ea4f5ff02d17b30b089de31b51a50be01c6b674f0a0509ab771232f1" +dependencies = [ + "ahash 0.8.3", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half 2.3.1", + "hashbrown 0.13.2", +] + +[[package]] +name = "arrow-schema" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25187bbef474151a2e4ddec67b9e34bda5cbfba292dc571392fa3a1f71ff5a82" +dependencies = [ + "bitflags 2.4.0", +] + +[[package]] +name = "arrow-select" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0d4ee884aec3aa05e41478e3cd312bf609de9babb5d187a43fb45931da4da4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6d71c3ffe4c07e66ce8fdc6aed5b00e0e60c5144911879b10546f5b72d8fa1c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "regex", + "regex-syntax 0.7.5", +] + +[[package]] +name = "arrow2" +version = "0.17.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" +dependencies = [ + "ahash 0.8.3", + "arrow-format", + "bytemuck", + "chrono", + "dyn-clone", + "either", + "ethnum", + "foreign_vec", + "futures", + "getrandom 0.2.10", + "hash_hasher", + "lexical-core", + "lz4", + "multiversion", + "num-traits", + "regex", + "regex-syntax 0.6.29", + "rustc_version", + "simdutf8", + "strength_reduce", + "zstd", +] + +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener", + "futures-core", +] + +[[package]] +name = "async-compression" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d495b6dc0184693324491a5ac05f559acc97bf937ab31d7a1c33dd0016be6d2b" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener", +] + +[[package]] +name = "async-native-tls" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e9e7a929bd34c68a82d58a4de7f86fffdaf97fb2af850162a7bb19dd7269b33" +dependencies = [ + "async-std", + "native-tls", + "thiserror", + "url", +] + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-channel", + "async-lock", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + 
"memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + +[[package]] +name = "async-stream" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22068c0c19514942eefcfd4daf8976ef1aad84e61539f95cd200c35202f80af5" +dependencies = [ + "async-stream-impl 0.2.1", + "futures-core", +] + +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl 0.3.5", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25f9db3b38af870bf7e5cc649167533b493928e50744e2c30ae350230b414670" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "async-trait" +version = "0.1.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "asynchronous-codec" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb4401f0a3622dad2e0763fa79e0eb328bc70fb7dccfdd645341f00d671247d6" +dependencies = [ + "bytes", + "futures-sink", + "futures-util", + "memchr", + "pin-project-lite", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi 0.1.19", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if 1.0.0", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "base64" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "414dcefbc63d77c526a76b3afcf6fbb9b5e2791c19c3aa2297733208750c6e53" + +[[package]] +name = "bb8" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e9f4fa9768efd269499d8fba693260cfc670891cf6de3adc935588447a77cc8" +dependencies = [ + "async-trait", + "futures-channel", + "futures-util", + "parking_lot 0.11.2", + "tokio", +] + +[[package]] +name = "bb8-tiberius" +version = "0.5.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "648d5365b34a2a362d5b8790d3c1b230d263d2377e563c76cb79c10d326b917e" +dependencies = [ + "async-trait", + "bb8", + "futures", + "thiserror", + "tiberius", + "tokio", + "tokio-util 0.6.10", +] + +[[package]] +name = "bigdecimal" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6773ddc0eafc0e509fb60e48dff7f450f8e674a0686ae8605e8d9901bd5eefa" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "bindgen" +version = "0.59.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +dependencies = [ + "bitflags 1.3.2", + "cexpr", + "clang-sys", + "clap", + "env_logger", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "which", +] + +[[package]] +name = "bitfield" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" + +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "199c42ab6972d92c9f8995f086273d25c42fc0f7b2a1fcefba465c1352d25ba5" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if 1.0.0", + "constant_time_eq", + "digest", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "borsh" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4114279215a005bc675e386011e594e1d9b800918cea18fcadadcce864a2046b" +dependencies = [ + "borsh-derive", + "hashbrown 0.13.2", +] + +[[package]] +name = "borsh-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0754613691538d51f329cce9af41d7b7ca150bc973056f1156611489475f54f7" +dependencies = [ + "borsh-derive-internal", + "borsh-schema-derive-internal", + "proc-macro-crate", + "proc-macro2", + "syn 1.0.109", +] + +[[package]] +name = "borsh-derive-internal" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afb438156919598d2c7bad7e1c0adf3d26ed3840dbc010db1a882a65583ca2fb" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "borsh-schema-derive-internal" +version = "0.10.3" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634205cc43f74a1b9046ef87c4540ebda95696ec0f315024860cad7c5b0f5ccd" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "brotli" +version = "3.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1a0b1dbcc8ae29329621f8d4f0d835787c1c38bb1401979b49d13b0b305ff68" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "2.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b6561fd3f895a11e8f72af2cb7d22e08366bebc2b6b57f7744c4bda27034744" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bufstream" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40e38929add23cdf8a366df9b0e088953150724bcbe5fc330b0d8eb3b328eec8" + +[[package]] +name = "built" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b9c056b9ed43aee5e064b683aa1ec783e19c6acec7559e3ae931b7490472fbe" +dependencies = [ + "cargo-lock", + "chrono", +] + +[[package]] +name = "bumpalo" +version = "3.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" + +[[package]] +name = "bytecheck" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6372023ac861f6e6dc89c8344a8f398fb42aaba2b5dbc649ca0c0e9dbcb627" +dependencies = [ + "bytecheck_derive", + "ptr_meta", + "simdutf8", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7ec4c6f261935ad534c0c22dbef2201b45918860eb1c574b972bd213a76af61" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "bytemuck" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "965ab7eb5f8f97d2a083c799f3a1b994fc397b2fe2da5d1da1626ce15a39f2b1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" + +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cargo-lock" +version = "8.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "031718ddb8f78aa5def78a09e90defe30151d1f6c672f937af4dd916429ed996" 
+dependencies = [ + "semver", + "serde", + "toml", + "url", +] + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "jobserver", + "libc", +] + +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87d9d13be47a5b7c3907137f1290b0459a7f80efb26be8c52afb11963bccb02" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "time 0.1.45", + "wasm-bindgen", + "windows-targets", +] + +[[package]] +name = "chrono-tz" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1369bc6b9e9a7dfdae2055f6ec151fe9c554a9d23d357c0237cee2e25eaabb7" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2f5ebdc942f57ed96d560a6d1a459bae5851102a25d5bf89dc04ae453e31ecf" +dependencies = [ + "parse-zoneinfo", + "phf", + "phf_codegen", +] + +[[package]] +name = "clang-sys" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "ansi_term", + "atty", + "bitflags 1.3.2", + "strsim 0.8.0", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = "comfy-table" +version = "6.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e959d788268e3bf9d35ace83e81b124190378e4c91c9067524675e33394b8ba" +dependencies = [ + "strum", + "strum_macros 0.24.3", + "unicode-width", +] + +[[package]] +name = "comfy-table" +version = "7.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab77dbd8adecaf3f0db40581631b995f312a8a5ae3aa9993188bb8f23d83a5b" +dependencies = [ + "crossterm", + "strum", + "strum_macros 0.24.3", + 
"unicode-width", +] + +[[package]] +name = "concurrent-queue" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "connection-string" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4ecb0dc8c35d2c626e45ae70bbfcb1050b302f42bcdf025d913cc0c5a0b443" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "connectorx" +version = "0.3.3-alpha.1" +dependencies = [ + "anyhow", + "arrow", + "arrow2", + "bb8", + "bb8-tiberius", + "chrono", + "csv", + "datafusion", + "fallible-streaming-iterator", + "fehler", + "futures", + "gcp-bigquery-client", + "hex", + "itertools", + "j4rs", + "log", + "mysql_common", + "native-tls", + "num-traits", + "openssl", + "oracle", + "owning_ref", + "polars", + "postgres", + "postgres-native-tls", + "postgres-openssl", + "r2d2", + "r2d2-oracle", + "r2d2_mysql", + "r2d2_postgres", + "r2d2_sqlite", + "rayon", + "rusqlite", + "rust_decimal", + "rust_decimal_macros", + "serde_json", + "sqlparser 0.11.0", + "thiserror", + "tiberius", + "tokio", + "tokio-util 0.6.10", + "url", + "urlencoding", + "uuid 0.8.2", +] + +[[package]] +name = "connectorx-python" +version = "0.3.3-alpha.1" +dependencies = [ + "anyhow", + "arrow", + "arrow2", + "bitfield", + "built", + "bytes", + "chrono", + "connectorx", + "criterion", + "criterion-macro", + "dict_derive", + "env_logger", + "fehler", + "iai", + "itertools", + "lazy_static", + "libc", + "log", + "ndarray", + "numpy", + "openssl", + "postgres", + "postgres-native-tls", + "postgres-openssl", + "pprof", + "pyo3", + "pyo3-built", + "rayon", + "rust_decimal", + "serde_json", + "sqlparser 0.11.0", + "thiserror", + "tokio", + "tokio-util 0.6.10", + "url", + "urlencoding", + "uuid 0.8.2", +] + +[[package]] +name = "const-random" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368a7a772ead6ce7e1de82bfb04c485f3db8ec744f72925af5735e29a22cc18e" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb" +dependencies = [ + "getrandom 0.2.10", + "once_cell", + "proc-macro-hack", + "tiny-keccak", +] + +[[package]] +name = "constant_time_eq" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" + +[[package]] +name = "core-foundation" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" + +[[package]] +name = "cpp_demangle" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "cpufeatures" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "criterion" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-macro" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8421c08c2e60050bb24ebfb7232bdd2fcf44fa74c5777b00a71daa7d332a8164" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "criterion-plot" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if 1.0.0", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "crossterm" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a84cda67535339806297f1b331d6dd6320470d2a0fe65381e79ee9e156dd3d13" +dependencies = [ + "bitflags 1.3.2", + "crossterm_winapi", + "libc", + "mio", + "parking_lot 0.12.1", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "darling" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if 1.0.0", + "hashbrown 0.14.0", + "lock_api", + "once_cell", + "parking_lot_core 0.9.8", +] + +[[package]] +name = "datafusion" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9992c267436551d40b52d65289b144712e7b0ebdc62c8c859fd1574e5f73efbb" +dependencies = [ + "ahash 0.8.3", + "arrow", + "arrow-array", + "arrow-schema", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-row", + "datafusion-sql", + "flate2", + "futures", + "glob", + "hashbrown 0.13.2", + "indexmap 1.9.3", + "itertools", + "lazy_static", + "log", + "num_cpus", + "object_store", + "parking_lot 0.12.1", + "parquet", + "percent-encoding", + "pin-project-lite", + "rand 0.8.5", + "smallvec", + "sqlparser 0.34.0", + "tempfile", + "tokio", + "tokio-stream", + "tokio-util 0.7.8", + "url", + "uuid 1.4.1", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-common" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3be97f7a7c720cdbb71e9eeabf814fa6ad8102b9022390f6cac74d3b4af6392" +dependencies = [ + "arrow", + "arrow-array", + "chrono", + "num_cpus", + "object_store", + "parquet", + "sqlparser 0.34.0", +] + +[[package]] +name = "datafusion-execution" +version = "26.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c77c4b14b809b0e4c5bb101b6834504f06cdbb0d3c643400c61d0d844b33264e" +dependencies = [ + "dashmap", + "datafusion-common", + "datafusion-expr", + "hashbrown 0.13.2", + "log", + "object_store", + "parking_lot 0.12.1", + "rand 0.8.5", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6ec7409bd45cf4fae6395d7d1024c8a97e543cadc88363e405d2aad5330e5e7" +dependencies = [ + "ahash 0.8.3", + "arrow", + "datafusion-common", + "lazy_static", + "sqlparser 0.34.0", + "strum", + "strum_macros 0.24.3", +] + +[[package]] +name = "datafusion-optimizer" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64b537c93f87989c212db92a448a0f5eb4f0995e27199bb7687ae94f8b64a7a8" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "hashbrown 0.13.2", + "itertools", + "log", + "regex-syntax 0.7.5", +] + +[[package]] +name = "datafusion-physical-expr" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60ee3f53340fdef36ee54d9e12d446ae2718b1d0196ac581f791d34808ec876" +dependencies = [ + "ahash 0.8.3", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-row", + "half 2.3.1", + "hashbrown 0.13.2", + "indexmap 1.9.3", + "itertools", + "lazy_static", + "libc", + "md-5", + "paste 1.0.14", + "petgraph 0.6.4", + "rand 0.8.5", + "regex", + "sha2", + "unicode-segmentation", + "uuid 1.4.1", +] + +[[package]] +name = "datafusion-row" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d58fc64058aa3bcb00077a0d19474a0d584d31dec8c7ac3406868f485f659af9" +dependencies = [ + "arrow", + "datafusion-common", + "paste 1.0.14", + "rand 0.8.5", +] + +[[package]] +name = "datafusion-sql" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1531f0314151a34bf6c0a83c7261525688b7c729876f53e7896b8f4ca8f57d07" +dependencies = [ + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-expr", + "log", + "sqlparser 0.34.0", +] + +[[package]] +name = "debugid" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730" +dependencies = [ + "uuid 0.8.2", +] + +[[package]] +name = "deranged" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946" +dependencies = [ + "serde", +] + +[[package]] +name = "derive_utils" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9abcad25e9720609ccb3dcdb795d845e37d8ce34183330a9f48b03a1a71c8e21" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "dict_derive" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6207f46b33b2bf00858b0edb03d188d31a46fedfde4aa53a27d69fe25acd80cf" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "dirs" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "dunce" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56ce8c6da7551ec6c462cbaf3bfbc75131ebbfa1c944aeaa9dab51ca1c5f0c3b" + +[[package]] +name = "dyn-clone" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfc4744c1b8f2a09adc0e55242f60b1af195d88596bd8700be74418c056c555" + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies 
= [ + "cfg-if 1.0.0", +] + +[[package]] +name = "enum_dispatch" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "enumflags2" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c041f5090df68b32bcd905365fd51769c8b9d553fe87fde0b683534f10c01bd2" +dependencies = [ + "enumflags2_derive", +] + +[[package]] +name = "enumflags2_derive" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e9a1f9f7d83e59740248a6e14ecf93929ade55027844dfcea78beafccc15745" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "env_logger" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "ethnum" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8ff382b2fa527fb7fb06eeebfc5bbb3f17e3cc6b9d70b006c41daa8824adac" + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "fast-float" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" + +[[package]] +name = "fastrand" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" + +[[package]] +name = "fehler" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5729fe49ba028cd550747b6e62cd3d841beccab5390aa398538c31a2d983635" +dependencies = [ + "fehler-macros", +] + +[[package]] +name = "fehler-macros" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccb5acb1045ebbfa222e2c50679e392a71dd77030b78fb0189f2d9c5974400f9" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + 
+[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + +[[package]] +name = "fixedbitset" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flatbuffers" +version = "23.5.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" +dependencies = [ + "bitflags 1.3.2", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010" +dependencies = [ + "crc32fast", + "libz-sys", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "foreign_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" + +[[package]] +name = "form_urlencoded" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "frunk" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11a351b59e12f97b4176ee78497dff72e4276fb1ceb13e19056aca7fa0206287" +dependencies = [ + "frunk_core", + "frunk_derives", + "frunk_proc_macros", +] + +[[package]] +name = "frunk_core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af2469fab0bd07e64ccf0ad57a1438f63160c69b2e57f04a439653d68eb558d6" + +[[package]] +name = "frunk_derives" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fa992f1656e1707946bbba340ad244f0814009ef8c0118eb7b658395f19a2e" +dependencies = [ + "frunk_proc_macro_helpers", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "frunk_proc_macro_helpers" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35b54add839292b743aeda6ebedbd8b11e93404f902c56223e51b9ec18a13d2c" +dependencies = [ + "frunk_core", + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "frunk_proc_macros" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71b85a1d4a9a6b300b41c05e8e13ef2feca03e0334127f29eca9506a7fe13a93" +dependencies = [ + "frunk_core", + "frunk_proc_macro_helpers", + 
"quote", + "syn 2.0.31", +] + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + +[[package]] +name = "futures" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" + +[[package]] +name = "futures-executor" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" + +[[package]] +name = "futures-macro" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "futures-sink" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" + +[[package]] +name = "futures-task" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" + +[[package]] +name = "futures-util" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gcp-bigquery-client" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ab5966c98f6d4e71e247cda6a6d8497bc8a1df3a4ba9ee548087842cffc21d" +dependencies = [ + "async-stream 0.3.5", + "hyper", + "hyper-rustls 0.23.2", + "log", + "reqwest", + "serde", + "serde_json", + "thiserror", + "time 0.3.28", + "tokio", + "tokio-stream", + "url", + "yup-oauth2", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if 1.0.0", + "js-sys", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "wasm-bindgen", +] + +[[package]] +name = "gimli" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "h2" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91fc23aa11be92976ef4729127f1a74adf36d8436f7816b185d18df956790833" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 1.9.3", + "slab", + "tokio", + "tokio-util 0.7.8", + "tracing", +] + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "half" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if 1.0.0", + "crunchy", + "num-traits", +] + +[[package]] +name = "hash_hasher" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +dependencies = [ + "ahash 0.7.6", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash 0.7.6", +] + +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +dependencies = [ + "ahash 0.8.3", +] + +[[package]] +name = "hashbrown" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +dependencies = [ + "ahash 0.8.3", + "allocator-api2", + "rayon", +] + +[[package]] +name = "hashlink" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7249a3129cbc1ffccd74857f81464a323a152173cdb134e0fd81bc803b29facf" +dependencies = [ + "hashbrown 0.11.2", +] + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hermit-abi" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "http" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "0.14.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb1cfd654a8219eaef89881fdb3bb3b1cdc5fa75ded05d6933b2b382e395468" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.4.9", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" +dependencies = [ + "http", + "hyper", + "log", + "rustls 0.20.9", + "rustls-native-certs", + "tokio", + "tokio-rustls 0.23.4", +] + +[[package]] +name = "hyper-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d78e1e73ec14cf7375674f74d7dde185c8206fd9dea6fb6295e8a98098aaa97" +dependencies = [ + "futures-util", + "http", + "hyper", + "rustls 0.21.7", + "tokio", + "tokio-rustls 0.24.1", +] + +[[package]] +name = "iai" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71a816c97c42258aa5834d07590b718b4c9a598944cd39a52dc25b351185d678" + 
+[[package]] +name = "iana-time-zone" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +dependencies = [ + "equivalent", + "hashbrown 0.14.0", +] + +[[package]] +name = "indoc" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47741a8bc60fb26eb8d6e0238bbb26d8575ff623fdc97b1a2c00c050b9684ed8" +dependencies = [ + "indoc-impl", + "proc-macro-hack", +] + +[[package]] +name = "indoc-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce046d161f000fffde5f432a0d034d0341dc152643b2598ed5bfce44c4f3a8f0" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "syn 1.0.109", + "unindent", +] + +[[package]] +name = "inferno" +version = "0.10.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f" +dependencies = [ + "ahash 0.7.6", + "atty", + "indexmap 1.9.3", + "itoa", + "lazy_static", + "log", + "num-format", + "quick-xml", + "rgb", + "str_stack", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "io-enum" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5305557fa27b460072ae15ce07617e999f5879f14d376c8449f0bfb9f9d8e91e" +dependencies = [ + "derive_utils", + "syn 2.0.31", +] + +[[package]] +name = "ipnet" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "j4rs" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76cc9c1648a1cc940ac10c19f56e50bee15344590e10f220899d955db5f87ac2" +dependencies = [ + "cesu8", + "dirs", + "dunce", + "fs_extra", + "glob", + "java-locator", + "jni-sys", + "lazy_static", + "libc", + "libloading", + "log", + "serde", + "serde_json", + "sha2", +] + +[[package]] +name = "java-locator" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90003f2fd9c52f212c21d8520f1128da0080bad6fff16b68fe6e7f2f0c3780c2" +dependencies = [ + "glob", + "lazy_static", +] + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + +[[package]] +name = "jobserver" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a38fc24e30fd564ce974c02bf1d337caddff65be6cc4735a1f7eab22a7440f04" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "lexical" +version = "6.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7aefb36fd43fef7003334742cbf77b243fcd36418a1d1bdd480d613a67968f6" +dependencies = [ + "lexical-core", +] + +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" + +[[package]] +name = "libgssapi" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "724dbcd1f871da9c67983537a47ac510c278656f6392418ad67c7a52720e54b2" +dependencies = [ + "bitflags 1.3.2", + "bytes", + "lazy_static", + "libgssapi-sys", + "parking_lot 0.11.2", +] + +[[package]] +name = "libgssapi-sys" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd7d65e409c889f6c9d81ff079371d0d8fd88d7dca702ff187ef96fb0450fb7" +dependencies = [ + "bindgen", +] + +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if 1.0.0", + "winapi", +] + +[[package]] +name = "libm" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" + +[[package]] +name = "libsqlite3-sys" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "898745e570c7d0453cc1fbc4a701eb6c662ed54e8fec8b7d14be137ebeeb9d14" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "libz-sys" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" + +[[package]] +name = "lock_api" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "lru" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6e8aaa3f231bb4bd57b84b2d5dc3ae7f350265df8aa96492e0bc394a1571909" +dependencies = [ + "hashbrown 0.12.3", +] + +[[package]] +name = "lz4" +version = "1.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "matrixmultiply" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "090126dc04f95dc0d1c1c91f61bdd474b3930ca064c1edc8a849da2c6cbe1e77" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "md-5" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +dependencies = [ + "digest", +] + +[[package]] +name = "md5" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e6bcd6433cff03a4bfc3d9834d504467db1f1cf6d0ea765d37d330249ed629d" + +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + +[[package]] +name = "memchr" +version = "2.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" + +[[package]] +name = "memmap2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +dependencies = [ + "libc", +] + +[[package]] +name = "memmap2" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +dependencies = [ + "libc", +] + +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +dependencies = [ + "libc", + "log", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "multiversion" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2c7b9d7fe61760ce5ea19532ead98541f6b4c495d87247aff9826445cf6872a" +dependencies = [ + "multiversion-macros", + "target-features", +] + +[[package]] +name = "multiversion-macros" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26a83d8500ed06d68877e9de1dde76c1dbb83885dcdbda4ef44ccbc3fbda2ac8" 
+dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "target-features", +] + +[[package]] +name = "mysql" +version = "23.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f11339ca5c251941805d51362a07823605a80586ced92914ab7de84fba813f" +dependencies = [ + "bufstream", + "bytes", + "crossbeam", + "flate2", + "io-enum", + "libc", + "lru", + "mysql_common", + "named_pipe", + "native-tls", + "once_cell", + "pem", + "percent-encoding", + "serde", + "serde_json", + "socket2 0.4.9", + "twox-hash", + "url", +] + +[[package]] +name = "mysql_common" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9006c95034ccf7b903d955f210469119f6c3477fc9c9e7a7845ce38a3e665c2a" +dependencies = [ + "base64 0.13.1", + "bigdecimal", + "bindgen", + "bitflags 1.3.2", + "bitvec", + "byteorder", + "bytes", + "cc", + "chrono", + "cmake", + "crc32fast", + "flate2", + "frunk", + "lazy_static", + "lexical", + "num-bigint", + "num-traits", + "rand 0.8.5", + "regex", + "rust_decimal", + "saturating", + "serde", + "serde_json", + "sha1", + "sha2", + "smallvec", + "subprocess", + "thiserror", + "time 0.3.28", + "uuid 1.4.1", +] + +[[package]] +name = "named_pipe" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad9c443cce91fc3e12f017290db75dde490d685cdaaf508d7159d7cf41f0eb2b" +dependencies = [ + "winapi", +] + +[[package]] +name = "native-tls" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "nix" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa9b4819da1bc61c0ea48b63b7bc8604064dd43013e7cc325df098d49cd7c18a" +dependencies = [ + "bitflags 1.3.2", + "cc", + "cfg-if 1.0.0", + "libc", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "now" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89e9874397a1f0a52fc1f197a8effd9735223cb2390e9dcc83ac6cd02923d0" +dependencies = [ + "chrono", +] + +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + +[[package]] +name = "num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec", + "itoa", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi 0.3.2", + "libc", +] + +[[package]] +name = "num_threads" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + +[[package]] +name = "numpy" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f3a190dd1aa88ee0de91e59e970d5b85cfa079a9ff6531b69f811ccd0c2a6e1" +dependencies = [ + "cfg-if 0.1.10", + "libc", + "ndarray", + "num-complex", + "num-traits", + "pyo3", +] + +[[package]] +name = "object" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec9cd6ca25e796a49fa242876d1c4de36a24a6da5258e9f0bc062dbf5e81c53b" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "itertools", + "parking_lot 0.12.1", + "percent-encoding", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "openssl" +version = "0.10.57" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bac25ee399abb46215765b1cb35bc0212377e58a061560d8b29b024fd0430e7c" +dependencies = [ + "bitflags 2.4.0", + "cfg-if 1.0.0", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-src" +version = "300.1.3+3.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd2c101a165fff9935e34def4669595ab1c7847943c42be86e21503e482be107" +dependencies = [ + "cc", +] + +[[package]] +name = "openssl-sys" +version = "0.9.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db4d56a4c0478783083cfafcc42493dd4a981d41669da64b4572a2a089b51b1d" +dependencies = [ + "cc", + "libc", + "openssl-src", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "opentls" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f561874f8d6ecfb674fc08863414040c93cc90c0b6963fe679895fab8b65560" +dependencies = [ + "futures-util", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "url", +] + +[[package]] +name = "oracle" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe80334af1fbaea016fbef0af77f5fa32452362e29a039389b8c93737585003" +dependencies = [ + "cc", + "chrono", + "lazy_static", + "oracle_procmacro", + "paste 1.0.14", +] + +[[package]] +name = "oracle_procmacro" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad247f3421d57de56a0d0408d3249d4b1048a522be2013656d92f022c3d8af27" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "ordered-float" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" +dependencies = [ + "num-traits", +] + +[[package]] +name = "owning_ref" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ff55baddef9e4ad00f88b6c743a2a8062d4c6ade126c2a528644b8e444d52ce" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.6", +] + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core 0.9.8", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall 0.2.16", + "smallvec", + "winapi", +] + +[[package]] +name = "parking_lot_core" +version = 
"0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "redox_syscall 0.3.5", + "smallvec", + "windows-targets", +] + +[[package]] +name = "parquet" +version = "40.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6a656fcc17e641657c955742c689732684e096f790ff30865d9f8dcc39f7c4a" +dependencies = [ + "ahash 0.8.3", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64 0.21.3", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "hashbrown 0.13.2", + "lz4", + "num", + "num-bigint", + "object_store", + "paste 1.0.14", + "seq-macro", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "parse-zoneinfo" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" +dependencies = [ + "regex", +] + +[[package]] +name = "paste" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45ca20c77d80be666aef2b45486da86238fabe33e38306bd3118fe4af33fa880" +dependencies = [ + "paste-impl", + "proc-macro-hack", +] + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "paste-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d95a7db200b97ef370c8e6de0088252f7e0dfff7d047a28528e47456c0fc98b6" +dependencies = [ + "proc-macro-hack", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "pem" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8" +dependencies = [ + "base64 0.13.1", +] + +[[package]] +name = "percent-encoding" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" + +[[package]] +name = "petgraph" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7" +dependencies = [ + "fixedbitset 0.2.0", + "indexmap 1.9.3", +] + +[[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset 0.4.2", + "indexmap 2.0.0", +] + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand 0.8.5", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" + +[[package]] +name = "planus" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1691dd09e82f428ce8d6310bd6d5da2557c82ff17694d2a32cad7242aea89f" +dependencies = [ + "array-init-cursor", +] + +[[package]] +name = "plotters" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" + +[[package]] +name = "plotters-svg" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "polars" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1362d4a136c0ebacb40d88a37ba361738b222fd8a2ee9340a3d8642f698c52b" +dependencies = [ + "getrandom 0.2.10", + "polars-core", + "polars-io", + "polars-lazy", + "polars-ops", + "polars-sql", + "polars-time", + "version_check", +] + +[[package]] +name = "polars-arrow" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f967c901fa5da4ca7f64e813d1268488ba97e9b3004cefc579ff851c197a1138" +dependencies = [ + "arrow2", + "hashbrown 0.14.0", + "multiversion", + "num-traits", + "polars-error", + "thiserror", + "version_check", +] + +[[package]] +name = "polars-core" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b24f92fc5b167f668ff85ab9607dfa72e2c09664cacef59297ee8601dee60126" +dependencies = [ + "ahash 0.8.3", + "arrow2", + "bitflags 2.4.0", + "chrono", + "comfy-table 7.0.1", + "either", + "hashbrown 0.14.0", + "indexmap 2.0.0", + "num-traits", + "once_cell", + "polars-arrow", + "polars-error", + "polars-row", + "polars-utils", + "rand 0.8.5", + "rand_distr", + "rayon", + "regex", + "smartstring", + "thiserror", + "version_check", + "xxhash-rust", +] + +[[package]] +name = "polars-error" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40d09c3a7337e53b38c37b57999038440fa39c6801b9ba48afaecd8e16f7ac0a" +dependencies = [ + "arrow2", + 
"regex", + "thiserror", +] + +[[package]] +name = "polars-io" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92cab0df9f2a35702fa5aec99edfaabf9ae8e9cdd0acf69e143ad2d132f34f9c" +dependencies = [ + "ahash 0.8.3", + "arrow2", + "async-trait", + "bytes", + "chrono", + "fast-float", + "futures", + "home", + "lexical", + "lexical-core", + "memchr", + "memmap2 0.7.1", + "num-traits", + "once_cell", + "polars-arrow", + "polars-core", + "polars-error", + "polars-time", + "polars-utils", + "rayon", + "regex", + "simdutf8", + "tokio", +] + +[[package]] +name = "polars-lazy" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c33762ec2a55e01c9f8776b34db86257c70a0a3b3929bd4eb91a52aacf61456" +dependencies = [ + "ahash 0.8.3", + "bitflags 2.4.0", + "glob", + "once_cell", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-pipe", + "polars-plan", + "polars-time", + "polars-utils", + "rayon", + "smartstring", + "version_check", +] + +[[package]] +name = "polars-ops" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e825575c96302d2daedfc205a0062180033c92c55bcd6aafc4e109d4d8849ed0" +dependencies = [ + "argminmax", + "arrow2", + "either", + "indexmap 2.0.0", + "memchr", + "polars-arrow", + "polars-core", + "polars-utils", + "smartstring", + "version_check", +] + +[[package]] +name = "polars-pipe" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f2bc9a12da9ed043fb0cb51dbcb87b365e4845b7ab6399d7a81e838460c6974" +dependencies = [ + "enum_dispatch", + "hashbrown 0.14.0", + "num-traits", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-utils", + "rayon", + "smartstring", + "version_check", +] + +[[package]] +name = "polars-plan" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb67b014f0295e8e9dbb84404a91d666d477b3bc248a2ed51bc442833b16da35" +dependencies = [ + "ahash 0.8.3", + "arrow2", + "once_cell", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-time", + "polars-utils", + "rayon", + "regex", + "smartstring", + "strum_macros 0.25.2", + "version_check", +] + +[[package]] +name = "polars-row" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27f54c1956027bf6301948fb4f2837cf6d6b638d8dd1edf3aaeaa19906a986be" +dependencies = [ + "arrow2", + "polars-error", + "polars-utils", +] + +[[package]] +name = "polars-sql" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbfcb15cf8eebd25ea1724109d0153817cd484c6326290585f0736b4e7fcf2f4" +dependencies = [ + "polars-arrow", + "polars-core", + "polars-lazy", + "polars-plan", + "serde", + "serde_json", + "sqlparser 0.36.1", +] + +[[package]] +name = "polars-time" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53f42d2632f5971c9575041d33cbcfb1f996900c40bbf58bc6eb0a0c5efbecea" +dependencies = [ + "arrow2", + "atoi", + "chrono", + "now", + "once_cell", + "polars-arrow", + "polars-core", + "polars-ops", + "polars-utils", + "regex", + "smartstring", +] + +[[package]] +name = "polars-utils" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c326708a370d71dc6e11a8f4bbc10a8479e1c314dc048ba73543b815cd0bf339" +dependencies = [ + "ahash 0.8.3", 
+ "hashbrown 0.14.0", + "num-traits", + "once_cell", + "polars-error", + "rayon", + "smartstring", + "sysinfo", + "version_check", +] + +[[package]] +name = "postgres" +version = "0.19.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7915b33ed60abc46040cbcaa25ffa1c7ec240668e0477c4f3070786f5916d451" +dependencies = [ + "bytes", + "fallible-iterator", + "futures-util", + "log", + "tokio", + "tokio-postgres", +] + +[[package]] +name = "postgres-native-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d442770e2b1e244bb5eb03b31c79b65bb2568f413b899eaba850fa945a65954" +dependencies = [ + "futures", + "native-tls", + "tokio", + "tokio-native-tls", + "tokio-postgres", +] + +[[package]] +name = "postgres-openssl" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1de0ea6504e07ca78355a6fb88ad0f36cafe9e696cbc6717f16a207f3a60be72" +dependencies = [ + "futures", + "openssl", + "tokio", + "tokio-openssl", + "tokio-postgres", +] + +[[package]] +name = "postgres-protocol" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49b6c5ef183cd3ab4ba005f1ca64c21e8bd97ce4699cfea9e8d9a2c4958ca520" +dependencies = [ + "base64 0.21.3", + "byteorder", + "bytes", + "fallible-iterator", + "hmac", + "md-5", + "memchr", + "rand 0.8.5", + "sha2", + "stringprep", +] + +[[package]] +name = "postgres-types" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d2234cdee9408b523530a9b6d2d6b373d1db34f6a8e51dc03ded1828d7fb67c" +dependencies = [ + "bytes", + "chrono", + "fallible-iterator", + "postgres-protocol", + "serde", + "serde_json", + "uuid 0.8.2", +] + +[[package]] +name = "pprof" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc842ca3fb958643d1696cfdada75410482480c11a7129463924fff5ab18d405" +dependencies = [ + "backtrace", + "criterion", + "inferno", + "lazy_static", + "libc", + "log", + "nix", + "parking_lot 0.11.2", + "prost", + "prost-build", + "prost-derive", + "symbolic-demangle", + "tempfile", + "thiserror", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "pretty-hex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be91bcc43e73799dc46a6c194a55e7aae1d86cc867c860fd4a436019af21bd8c" + +[[package]] +name = "proc-macro-crate" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d6ea3c4595b96363c13943497db34af4460fb474a95c43f4446ad341b8c9785" +dependencies = [ + "toml", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + +[[package]] +name = "proc-macro2" +version = "1.0.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de5e2533f59d08fcf364fd374ebda0692a70bd6d7e66ef97f306f45c6c5d8020" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] 
+name = "prost-build" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603" +dependencies = [ + "bytes", + "heck 0.3.3", + "itertools", + "log", + "multimap", + "petgraph 0.5.1", + "prost", + "prost-types", + "tempfile", + "which", +] + +[[package]] +name = "prost-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "600d2f334aa05acb02a755e217ef1ab6dea4d51b58b7846588b747edec04efba" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "prost-types" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "603bbd6394701d13f3f25aada59c7de9d35a6a5887cfc156181234a44002771b" +dependencies = [ + "bytes", + "prost", +] + +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "pyo3" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d41d50a7271e08c7c8a54cd24af5d62f73ee3a6f6a314215281ebdec421d5752" +dependencies = [ + "cfg-if 1.0.0", + "indoc", + "libc", + "parking_lot 0.11.2", + "paste 0.1.18", + "pyo3-build-config", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "779239fc40b8e18bc8416d3a37d280ca9b9fb04bda54b98037bb6748595c2410" +dependencies = [ + "once_cell", +] + +[[package]] +name = "pyo3-built" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be6d574e0f8cab2cdd1eeeb640cbf845c974519fa9e9b62fa9c08ecece0ca5de" + +[[package]] +name = "pyo3-macros" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b247e8c664be87998d8628e86f282c25066165f1f8dda66100c48202fdb93a" +dependencies = [ + "pyo3-macros-backend", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a8c2812c412e00e641d99eeb79dd478317d981d938aa60325dfa7157b607095" +dependencies = [ + "proc-macro2", + "pyo3-build-config", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "quick-xml" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" +dependencies = [ + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r2d2" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot 0.12.1", + "scheduled-thread-pool", +] + +[[package]] +name = "r2d2-oracle" +version 
= "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca5358dca54423e557b30e7b5a6d950d3a442ab4a56cc916965030cead8b02b" +dependencies = [ + "oracle", + "r2d2", +] + +[[package]] +name = "r2d2_mysql" +version = "23.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9733d738ce65959a744f387bae69aa690a867e18d48e5486b171c47bc7b0c575" +dependencies = [ + "mysql", + "r2d2", +] + +[[package]] +name = "r2d2_postgres" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7029c56be658cb54f321e0bee597810ee16796b735fa2559d7056bf06b12230b" +dependencies = [ + "postgres", + "r2d2", +] + +[[package]] +name = "r2d2_sqlite" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fdc8e4da70586127893be32b7adf21326a4c6b1aba907611edf467d13ffe895" +dependencies = [ + "r2d2", + "rusqlite", +] + +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.10", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_users" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" +dependencies = [ + "getrandom 0.2.10", + "redox_syscall 0.2.16", + "thiserror", +] + +[[package]] +name = "regex" +version = "1.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax 0.7.5", +] + +[[package]] +name = "regex-automata" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.7.5", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + +[[package]] +name = "rend" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581008d2099240d37fb08d77ad713bcaec2c4d89d50b5b21a8bb1996bbab68ab" +dependencies = [ + "bytecheck", +] + +[[package]] +name = "reqwest" +version = "0.11.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e9ad3fe7488d7e34558a2033d45a0c90b72d97b4f80705666fea71472e2e6a1" +dependencies = [ + "base64 0.21.3", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-rustls 0.24.1", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls 0.21.7", + "rustls-pemfile 1.0.3", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-rustls 0.24.1", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", + "winreg", +] + +[[package]] +name = "rgb" +version = "0.8.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20ec2d3e3fc7a92ced357df9cebd5a10b6fb2aa1ee797bf7e9ce2f17dffc8f59" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "ring" +version = "0.16.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = 
[ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi", +] + +[[package]] +name = "rkyv" +version = "0.7.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0200c8230b013893c0b2d6213d6ec64ed2b9be2e0e016682b7224ff82cff5c58" +dependencies = [ + "bitvec", + "bytecheck", + "hashbrown 0.12.3", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", + "tinyvec", + "uuid 1.4.1", +] + +[[package]] +name = "rkyv_derive" +version = "0.7.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2e06b915b5c230a17d7a736d1e2e63ee753c256a8614ef3f5147b13a4f5541d" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "rusqlite" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85127183a999f7db96d1a976a309eebbfb6ea3b0b400ddd8340190129de6eb7a" +dependencies = [ + "bitflags 1.3.2", + "chrono", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "memchr", + "smallvec", +] + +[[package]] +name = "rust_decimal" +version = "1.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c4216490d5a413bc6d10fa4742bd7d4955941d062c0ef873141d6b0e7b30fd" +dependencies = [ + "arrayvec", + "borsh", + "bytes", + "num-traits", + "postgres", + "rand 0.8.5", + "rkyv", + "serde", + "serde_json", +] + +[[package]] +name = "rust_decimal_macros" +version = "1.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86444b802de0b10ac5e563b5ddb43b541b9705de4e01a50e82194d2b183c1835" +dependencies = [ + "quote", + "rust_decimal", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0c3dde1fc030af041adc40e79c0e7fbcf431dd24870053d187d7c66e4b87453" +dependencies = [ + "bitflags 2.4.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "rustls" +version = "0.20.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99" +dependencies = [ + "log", + "ring", + "sct", + "webpki", +] + +[[package]] +name = "rustls" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8d6c9f025a446bc4d18ad9632e69aec8f287aa84499ee335599fabd20c3fd8" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile 1.0.3", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "0.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee86d63972a7c661d1536fefe8c3c8407321c3df668891286de28abcd087360" +dependencies = [ + "base64 0.13.1", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3987094b1d07b653b7dfdc3f70ce9a1da9c51ac18c1b06b662e4f9a0e9f4b2" +dependencies = [ + "base64 0.21.3", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d93931baf2d282fff8d3a532bbfd7653f734643161b87e3e01e59a04439bf0d" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "ryu" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "saturating" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece8e78b2f38ec51c51f5d475df0a7187ba5111b2a28bdc761ee05b075d40a71" + +[[package]] +name = "schannel" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "scheduled-thread-pool" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +dependencies = [ + "parking_lot 0.12.1", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sct" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" +dependencies = [ + "serde", +] + +[[package]] +name = "seq-macro" +version = "0.3.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" + +[[package]] +name = "serde" +version = "1.0.188" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half 1.8.2", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.188" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "serde_json" +version = "1.0.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "693151e1ac27563d6dbcec9dee9fbd5da8539b20fa14ad3752b2e6d363ace360" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +dependencies = [ + "cfg-if 1.0.0", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +dependencies = [ + "cfg-if 1.0.0", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" + +[[package]] +name = "signal-hook" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "simdutf8" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = 
"smallvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" + +[[package]] +name = "smartstring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" +dependencies = [ + "autocfg", + "static_assertions", + "version_check", +] + +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "snap" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" + +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "socket2" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "sqlparser" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10e1ce16b71375ad72d28d111131069ce0d5f8603f4f86d8acd3456b41b57a51" +dependencies = [ + "log", +] + +[[package]] +name = "sqlparser" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37d3706eefb17039056234df6b566b0014f303f867f2656108334a55b8096f59" +dependencies = [ + "log", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eaa1e88e78d2c2460d78b7dc3f0c08dbb606ab4222f9aff36f420d36e307d87" +dependencies = [ + "log", +] + +[[package]] +name = "sqlparser_derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55fe75cb4a364c7f7ae06c7dbbc8d84bddd85d6cdf9975963c3935bc1991761e" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "str_stack" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" + +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + +[[package]] +name = "stringprep" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +dependencies = [ + "finl_unicode", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +dependencies = [ + "strum_macros 0.24.3", +] + +[[package]] +name = "strum_macros" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 1.0.109", +] + +[[package]] +name = "strum_macros" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad8d03b598d3d0fff69bf533ee3ef19b8eeb342729596df84bcc7e1f96ec4059" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.31", +] + +[[package]] +name = "subprocess" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2e86926081dda636c546d8c5e641661049d7562a68f5488be4a1f7f66f6086" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + +[[package]] +name = "symbolic-common" +version = "8.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540" +dependencies = [ + "debugid", + "memmap2 0.5.10", + "stable_deref_trait", + "uuid 0.8.2", +] + +[[package]] +name = "symbolic-demangle" +version = "8.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sysinfo" +version = "0.29.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a18d114d420ada3a891e6bc8e96a2023402203296a47cdd65083377dad18ba5" +dependencies = [ + "cfg-if 1.0.0", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "winapi", +] + +[[package]] +name = "tap" 
+version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + +[[package]] +name = "target-features" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06f6b473c37f9add4cf1df5b4d66a8ef58ab6c895f1a3b3f949cf3e21230140e" + +[[package]] +name = "tempfile" +version = "3.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +dependencies = [ + "cfg-if 1.0.0", + "fastrand", + "redox_syscall 0.3.5", + "rustix", + "windows-sys", +] + +[[package]] +name = "termcolor" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d6d7a740b8a666a7e828dd00da9c0dc290dff53154ea77ac109281de90589b7" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + +[[package]] +name = "tiberius" +version = "0.5.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08c782c165a53700c17e4b15a1f6facc21e40a6a80402c518e0f3a2c3fcedd4" +dependencies = [ + "async-native-tls", + "async-stream 0.2.1", + "async-trait", + "asynchronous-codec", + "byteorder", + "bytes", + "chrono", + "connection-string", + "encoding", + "enumflags2", + "futures", + "futures-sink", + "futures-util", + "libgssapi", + "num-traits", + "once_cell", + "opentls", + "pin-project-lite", + "pretty-hex", + "rust_decimal", + "thiserror", + "tracing", + "uuid 0.8.2", + "winauth", +] + +[[package]] +name = "time" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +dependencies = [ + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" +dependencies = [ + "deranged", + "itoa", + "libc", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a942f44339478ef67935ab2bbaec2fb0322496cf3cbe84b261e06ac3814c572" 
+dependencies = [ + "time-core", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot 0.12.1", + "pin-project-lite", + "socket2 0.5.3", + "tokio-macros", + "windows-sys", +] + +[[package]] +name = "tokio-macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-openssl" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08f9ffb7809f1b20c1b398d92acf4cc719874b3b2b2d9ea2f09b4a80350878a" +dependencies = [ + "futures-util", + "openssl", + "openssl-sys", + "tokio", +] + +[[package]] +name = "tokio-postgres" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d340244b32d920260ae7448cb72b6e238bddc3d4f7603394e7dd46ed8e48f5b8" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures-channel", + "futures-util", + "log", + "parking_lot 0.12.1", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol", + "postgres-types", + "rand 0.8.5", + "socket2 0.5.3", + "tokio", + "tokio-util 0.7.8", + "whoami", +] + +[[package]] +name = "tokio-rustls" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" +dependencies = [ + "rustls 0.20.9", + "tokio", + "webpki", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls 0.21.7", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.6.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "36943ee01a6d67977dd3f84a5a1d2efeb4ada3a1ae771cadfaa535d9d9fc6507" +dependencies = [ + "bytes", + "futures-core", + "futures-io", + "futures-sink", + "log", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", + "tracing", +] + +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +dependencies = [ + "cfg-if 1.0.0", + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.31", +] + +[[package]] +name = "tracing-core" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if 1.0.0", + "rand 0.8.5", + "static_assertions", +] + +[[package]] +name = "typenum" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + +[[package]] +name = "unicode-bidi" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" + +[[package]] +name = "unicode-ident" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "unindent" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" + +[[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + +[[package]] +name = "url" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "143b538f18257fac9cad154828a57c6bf5157e1aa604d4816b5995bf6de87ae5" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom 0.2.10", + "md5 0.7.0", +] + +[[package]] +name = "uuid" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" +dependencies = [ + "getrandom 0.2.10", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" +dependencies = [ + "cfg-if 1.0.0", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "8b21c0df030f5a177f3cba22e9bc4322695ec43e7257d865302900290bcdedca" +dependencies = [ + "bumpalo", + "lazy_static", + "log", + "proc-macro2", + "quote", + "syn 1.0.109", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb6ec270a31b1d3c7e266b999739109abce8b6c87e4b31fcfcd788b65267395" +dependencies = [ + "cfg-if 1.0.0", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f4203d69e40a52ee523b2529a773d5ffc1dc0071801c87b3d270b471b80ed01" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa8a30d46208db204854cadbb5d4baf5fcf8071ba5bf48190c3e59937962ebc" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d958d035c4438e28c70e4321a2911302f10135ce78a9c7834c0cab4123d06a2" + +[[package]] +name = "web-sys" +version = "0.3.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c060b319f29dd25724f09a2ba1418f142f539b2be99fbf4d2d5a8f7330afb8eb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "webpki-roots" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "whoami" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" +dependencies = [ + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "winauth" +version = "0.0.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f820cd208ce9c6b050812dc2d724ba98c6c1e9db5ce9b3f58d925ae5723a5e6" +dependencies = [ + "bitflags 1.3.2", + "byteorder", + "md5 0.6.1", + "rand 0.7.3", + "winapi", +] + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if 1.0.0", + "windows-sys", +] + +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + +[[package]] +name = "xxhash-rust" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "735a71d46c4d68d71d4b24d03fdc2b98e38cea81730595801db779c04fe80d70" + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + +[[package]] +name = "yup-oauth2" +version = "7.0.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98748970d2ddf05253e6525810d989740334aa7509457864048a829902db76f3" +dependencies = [ + "anyhow", + "async-trait", + "base64 0.13.1", + "futures", + "http", + "hyper", + "hyper-rustls 0.23.2", + "itertools", + "log", + "percent-encoding", + "rustls 0.20.9", + "rustls-pemfile 0.3.0", + "seahash", + "serde", + "serde_json", + "time 0.3.28", + "tokio", + "tower-service", + "url", +] + +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.8+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +dependencies = [ + "cc", + "libc", + "pkg-config", +] diff --git a/connectorx-python/Cargo.toml b/connectorx-python/Cargo.toml new file mode 100644 index 0000000..0b010af --- /dev/null +++ b/connectorx-python/Cargo.toml @@ -0,0 +1,79 @@ +[package] +authors = ["Weiyuan Wu "] +edition = "2018" +name = "connectorx-python" +version = "0.3.3-alpha.1" +license = "MIT" +readme = "README.md" + +[workspace] +# prevents package from thinking it's in the workspace + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1" +arrow = { version = "46" } +arrow2 = {version = "0.17", default-features = false} +bitfield = "0.13" +bytes = "1.4" +chrono = "0.4" +connectorx = {path = "../connectorx", default-features = false} +dict_derive = "0.4" +env_logger = "0.9" +fehler = "1" +itertools = "0.10" +lazy_static = "1.4.0" +libc = "0.2" +log = "0.4" +ndarray = "0.15" +numpy = "0.15" +openssl = {version = "0.10", features = ["vendored"]} +postgres = {version = "0.19", features = ["with-chrono-0_4", "with-uuid-0_8", "with-serde_json-1"]} +postgres-native-tls = {version = "0.5"} +postgres-openssl = {version = "0.5.0"} +pyo3 = {version = "0.15", default-features = false, features = ["macros"]} +pyo3-built = "0.4" +rust_decimal = {version = "1", features = ["db-postgres"]} +serde_json = "1" +sqlparser = "0.37" +thiserror = "1" +tokio = {version = "1", features = ["rt", "rt-multi-thread", "net"]} +tokio-util = "0.6" +url = "2" +urlencoding = "2.1" +uuid = "0.8" + +[build-dependencies] +built = {version = "0.5", features = ["chrono"]} + +[dev-dependencies] +criterion = "0.3" +criterion-macro = "0.3" +iai = "0.1" +pprof = {version = "0.5", features = ["flamegraph", "criterion", "protobuf"]} +rayon = "1" + +[lib] +crate-type = ["cdylib"] +name = "connectorx" + +[features] +branch = ["connectorx/branch"] +default = ["extension", "fptr", "nbstr", "dsts", "srcs", "federation", "fed_exec"] +dsts = ["connectorx/dst_arrow", "connectorx/dst_arrow2"] +executable = ["pyo3/auto-initialize"] +extension = ["pyo3/extension-module"] +fptr = ["connectorx/fptr"] +federation = ["connectorx/federation"] +fed_exec = ["connectorx/fed_exec"] +nbstr = [] +srcs = [ + "connectorx/src_postgres", + "connectorx/src_mysql", + "connectorx/src_sqlite", + "connectorx/src_mssql", + "connectorx/src_oracle", + "connectorx/src_bigquery", +] 
+integrated-auth-gssapi = ["connectorx/integrated-auth-gssapi"] diff --git a/connectorx-python/LICENSE b/connectorx-python/LICENSE new file mode 120000 index 0000000..ea5b606 --- /dev/null +++ b/connectorx-python/LICENSE @@ -0,0 +1 @@ +../LICENSE \ No newline at end of file diff --git a/connectorx-python/README.md b/connectorx-python/README.md new file mode 120000 index 0000000..32d46ee --- /dev/null +++ b/connectorx-python/README.md @@ -0,0 +1 @@ +../README.md \ No newline at end of file diff --git a/connectorx-python/build.rs b/connectorx-python/build.rs new file mode 100644 index 0000000..648d8e1 --- /dev/null +++ b/connectorx-python/build.rs @@ -0,0 +1,13 @@ +// build-time metadata is disabled for now, see https://github.com/PyO3/pyo3-built/issues/21 + +fn main() { + // let src = std::env::var("CARGO_MANIFEST_DIR").unwrap(); + // println!("src: {}", src); + // let dst = std::path::Path::new(&std::env::var("OUT_DIR").unwrap()).join("built.rs"); + // let mut opts = built::Options::default(); + // println!("out: {:?}", dst); + // opts.set_dependencies(true).set_compiler(true).set_env(true); + + // built::write_built_file_with_opts(&opts, std::path::Path::new(&src), &dst) + // .expect("Failed to acquire build-time information"); +} diff --git a/connectorx-python/connectorx/__init__.py b/connectorx-python/connectorx/__init__.py new file mode 100644 index 0000000..64e82d6 --- /dev/null +++ b/connectorx-python/connectorx/__init__.py @@ -0,0 +1,379 @@ +from typing import Optional, Tuple, Union, List, Dict, Any + +from .connectorx import ( + read_sql as _read_sql, + partition_sql as _partition_sql, + read_sql2 as _read_sql2, + get_meta as _get_meta, +) + +try: + from importlib.metadata import version + + __version__ = version(__name__) +except Exception:  # importlib.metadata requires Python >= 3.8; fall back to the backport + try: + from importlib_metadata import version + + __version__ = version(__name__) + except Exception:  # version discovery is best-effort + pass + +import os + +dir_path = os.path.dirname(os.path.realpath(__file__)) +# check whether we are running from a development checkout or an installed package +if ( + os.path.basename(os.path.abspath(os.path.join(dir_path, ".."))) + != "connectorx-python" +): + if "J4RS_BASE_PATH" not in os.environ: + os.environ["J4RS_BASE_PATH"] = os.path.join(dir_path, "dependencies") +if "CX_REWRITER_PATH" not in os.environ: + os.environ["CX_REWRITER_PATH"] = os.path.join( + dir_path, "dependencies/federated-rewriter.jar" + ) + + +def rewrite_conn(conn: str, protocol: Optional[str] = None): + if not protocol: + # Note: Redshift/ClickHouse are not compatible with the 'binary' protocol and use other database + # drivers to connect, so set a compatible protocol and masquerade as the appropriate backend. + backend, connection_details = conn.split(":", 1) if conn else ("", "") + if "redshift" in backend: + conn = f"postgresql:{connection_details}" + protocol = "cursor" + elif "clickhouse" in backend: + conn = f"mysql:{connection_details}" + protocol = "text" + else: + protocol = "binary" + return conn, protocol + + +def get_meta( + conn: str, + query: str, + protocol: Optional[str] = None, +): + """ + Get the metadata (header) of the given query's result (pandas return type only) + + Parameters + ========== + conn + the connection string. + query + the SQL query to fetch the header for. + protocol + backend-specific transfer protocol directive; defaults to 'binary' (except for redshift + connection strings, where 'cursor' will be used instead).
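+ + Examples + ======== + Fetch just the header of a query result (illustrative placeholder URL, matching the `read_sql` examples below): + + >>> postgres_url = "postgresql://username:password@server:port/database" + >>> get_meta(postgres_url, "SELECT * FROM lineitem")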
+ + """ + conn, protocol = rewrite_conn(conn, protocol) + result = _get_meta(conn, protocol, query) + df = reconstruct_pandas(result) + return df + + +def partition_sql( + conn: str, + query: str, + partition_on: str, + partition_num: int, + partition_range: Optional[Tuple[int, int]] = None, +): + """ + Partition the sql query + + Parameters + ========== + conn + the connection string. + query + a SQL query or a list of SQL queries. + partition_on + the column on which to partition the result. + partition_num + how many partitions to generate. + partition_range + the value range of the partition column. + """ + partition_query = { + "query": query, + "column": partition_on, + "min": partition_range[0] if partition_range else None, + "max": partition_range[1] if partition_range else None, + "num": partition_num, + } + return _partition_sql(conn, partition_query) + + +def read_sql_pandas( + sql: Union[List[str], str], + con: Union[str, Dict[str, str]], + index_col: Optional[str] = None, + protocol: Optional[str] = None, + partition_on: Optional[str] = None, + partition_range: Optional[Tuple[int, int]] = None, + partition_num: Optional[int] = None, +): + """ + Run the SQL query, download the data from database into a dataframe. + First several parameters are in the same name and order with `pandas.read_sql`. + + Parameters + ========== + Please refer to `read_sql` + + Examples + ======== + Read a DataFrame from a SQL query using a single thread: + + >>> # from pandas import read_sql + >>> from connectorx import read_sql_pandas as read_sql + >>> postgres_url = "postgresql://username:password@server:port/database" + >>> query = "SELECT * FROM lineitem" + >>> read_sql(query, postgres_url) + + """ + return read_sql( + con, + sql, + return_type="pandas", + protocol=protocol, + partition_on=partition_on, + partition_range=partition_range, + partition_num=partition_num, + index_col=index_col, + ) + + +def read_sql( + conn: Union[str, Dict[str, str]], + query: Union[List[str], str], + *, + return_type: str = "pandas", + protocol: Optional[str] = None, + partition_on: Optional[str] = None, + partition_range: Optional[Tuple[int, int]] = None, + partition_num: Optional[int] = None, + index_col: Optional[str] = None, +): + """ + Run the SQL query, download the data from database into a dataframe. + + Parameters + ========== + conn + the connection string, or dict of connection string mapping for federated query. + query + a SQL query or a list of SQL queries. + return_type + the return type of this function; one of "arrow(2)", "pandas", "modin", "dask" or "polars(2)". + protocol + backend-specific transfer protocol directive; defaults to 'binary' (except for redshift + connection strings, where 'cursor' will be used instead). + partition_on + the column on which to partition the result. + partition_range + the value range of the partition column. + partition_num + how many partitions to generate. + index_col + the index column to set; only applicable for return type "pandas", "modin", "dask". 
+ + Examples + ======== + Read a DataFrame from a SQL query using a single thread: + + >>> postgres_url = "postgresql://username:password@server:port/database" + >>> query = "SELECT * FROM lineitem" + >>> read_sql(postgres_url, query) + + Read a DataFrame in parallel using 10 threads by automatically partitioning the provided SQL on the partition column: + + >>> postgres_url = "postgresql://username:password@server:port/database" + >>> query = "SELECT * FROM lineitem" + >>> read_sql(postgres_url, query, partition_on="partition_col", partition_num=10) + + Read a DataFrame in parallel using 2 threads by explicitly providing two SQL queries: + + >>> postgres_url = "postgresql://username:password@server:port/database" + >>> queries = ["SELECT * FROM lineitem WHERE partition_col <= 10", "SELECT * FROM lineitem WHERE partition_col > 10"] + >>> read_sql(postgres_url, queries) + + """ + if isinstance(query, list) and len(query) == 1: + query = query[0] + + if isinstance(conn, dict): + assert partition_on is None and isinstance( + query, str + ), "Federated query does not support query partitioning for now" + assert ( + protocol is None + ), "Federated query does not support specifying protocol for now" + result = _read_sql2(query, conn) + df = reconstruct_arrow(result) + if return_type == "pandas": + df = df.to_pandas(date_as_object=False, split_blocks=False) + if return_type == "polars": + try: + import polars as pl + except ModuleNotFoundError: + raise ValueError("You need to install polars first") + + try: + # api change for polars >= 0.8.* + df = pl.from_arrow(df) + except AttributeError: + df = pl.DataFrame.from_arrow(df) + return df + + if isinstance(query, str): + if partition_on is None: + queries = [query] + partition_query = None + else: + partition_query = { + "query": query, + "column": partition_on, + "min": partition_range[0] if partition_range else None, + "max": partition_range[1] if partition_range else None, + "num": partition_num, + } + queries = None + elif isinstance(query, list): + queries = query + partition_query = None + + if partition_on is not None: + raise ValueError("Partition on multiple queries is not supported.") + else: + raise ValueError("query must be either str or a list of str") + + conn, protocol = rewrite_conn(conn, protocol) + + if return_type in {"modin", "dask", "pandas"}: + try: + import pandas + except ModuleNotFoundError: + raise ValueError("You need to install pandas first") + + result = _read_sql( + conn, + "pandas", + queries=queries, + protocol=protocol, + partition_query=partition_query, + ) + df = reconstruct_pandas(result) + + if index_col is not None: + df.set_index(index_col, inplace=True) + + if return_type == "modin": + try: + import modin.pandas as mpd + except ModuleNotFoundError: + raise ValueError("You need to install modin first") + + df = mpd.DataFrame(df) + elif return_type == "dask": + try: + import dask.dataframe as dd + except ModuleNotFoundError: + raise ValueError("You need to install dask first") + + df = dd.from_pandas(df, npartitions=1) + + elif return_type in {"arrow", "arrow2", "polars", "polars2"}: + try: + import pyarrow + except ModuleNotFoundError: + raise ValueError("You need to install pyarrow first") + + result = _read_sql( + conn, + "arrow2" if return_type in {"arrow2", "polars", "polars2"} else "arrow", + queries=queries, + protocol=protocol, + partition_query=partition_query, + ) + df = reconstruct_arrow(result) + if return_type in {"polars", "polars2"}: + try: + import polars as pl + except ModuleNotFoundError: + 
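# polars is an optional dependency imported lazily; re-raise the missing module as an actionable install hint +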
raise ValueError("You need to install polars first") + + try: + df = pl.DataFrame.from_arrow(df) + except AttributeError: + # api change for polars >= 0.8.* + df = pl.from_arrow(df) + else: + raise ValueError(return_type) + + return df + + +def reconstruct_arrow(result: Tuple[List[str], List[List[Tuple[int, int]]]]): + import pyarrow as pa + + names, ptrs = result + if len(names) == 0: + return pa.Table.from_arrays([]) + + rbs = [] + for chunk in ptrs: + rb = pa.RecordBatch.from_arrays( + [pa.Array._import_from_c(*col_ptr) for col_ptr in chunk], names + ) + rbs.append(rb) + return pa.Table.from_batches(rbs) + + +def reconstruct_pandas(df_infos: Dict[str, Any]): + import pandas as pd + + data = df_infos["data"] + headers = df_infos["headers"] + block_infos = df_infos["block_infos"] + + nrows = data[0][0].shape[-1] if isinstance(data[0], tuple) else data[0].shape[-1] + blocks = [] + for binfo, block_data in zip(block_infos, data): + if binfo.dt == 0: # NumpyArray + blocks.append( + pd.core.internals.make_block(block_data, placement=binfo.cids) + ) + elif binfo.dt == 1: # IntegerArray + blocks.append( + pd.core.internals.make_block( + pd.core.arrays.IntegerArray(block_data[0], block_data[1]), + placement=binfo.cids[0], + ) + ) + elif binfo.dt == 2: # BooleanArray + blocks.append( + pd.core.internals.make_block( + pd.core.arrays.BooleanArray(block_data[0], block_data[1]), + placement=binfo.cids[0], + ) + ) + elif binfo.dt == 3: # DatetimeArray + blocks.append( + pd.core.internals.make_block( + pd.core.arrays.DatetimeArray(block_data), placement=binfo.cids + ) + ) + else: + raise ValueError(f"unknown dt: {binfo.dt}") + + block_manager = pd.core.internals.BlockManager( + blocks, [pd.Index(headers), pd.RangeIndex(start=0, stop=nrows, step=1)] + ) + df = pd.DataFrame(block_manager) + return df diff --git a/connectorx-python/connectorx/tests/__init__.py b/connectorx-python/connectorx/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/connectorx-python/connectorx/tests/benchmarks.py b/connectorx-python/connectorx/tests/benchmarks.py new file mode 100644 index 0000000..778d61e --- /dev/null +++ b/connectorx-python/connectorx/tests/benchmarks.py @@ -0,0 +1,24 @@ +""" +This file is skipped during normal test because the file name is not started with benchmarks +""" +import os + +from .. import read_sql + + +def read_sql_impl(conn: str, table: str): + read_sql( + conn, + f"""SELECT * FROM {table}""", + partition_on="L_ORDERKEY", + partition_num=10, + ) + + +def bench_mysql(benchmark): + benchmark(read_sql_impl, os.environ["MYSQL_URL"], os.environ["TPCH_TABLE"]) + + +def bench_postgres(benchmark): + benchmark(read_sql_impl, + os.environ["POSTGRES_URL"], os.environ["TPCH_TABLE"]) diff --git a/connectorx-python/connectorx/tests/test_arrow.py b/connectorx-python/connectorx/tests/test_arrow.py new file mode 100644 index 0000000..d784d4f --- /dev/null +++ b/connectorx-python/connectorx/tests/test_arrow.py @@ -0,0 +1,193 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal +import datetime + +from .. 
import read_sql + + +@pytest.fixture(scope="module") # type: ignore +def postgres_url() -> str: + conn = os.environ["POSTGRES_URL"] + return conn + + +def test_arrow(postgres_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + return_type="arrow", + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="float64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="object" + ), + }, + ) + + df = df.to_pandas() + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_arrow2(postgres_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + return_type="arrow2", + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="int32"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="float64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="object" + ), + }, + ) + + df = df.to_pandas() + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_arrow2_type(postgres_url: str) -> None: + query = "SELECT test_date, test_timestamp, test_timestamptz, test_int16, test_int64, test_float32, test_numeric, test_bpchar, test_char, test_varchar, test_uuid, test_time, test_bytea, test_json, test_jsonb, test_f4array, test_f8array, test_narray, test_i2array, test_i4array, test_i8array, test_enum, test_ltree, test_name FROM test_types" + df = read_sql(postgres_url, query, return_type="arrow2") + df = df.to_pandas(date_as_object=False) + df.sort_values(by="test_int16", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(4), + data={ + "test_date": pd.Series( + ["1970-01-01", "2000-02-28", "2038-01-18", None], dtype="datetime64[ns]" + ), + "test_timestamp": pd.Series( + [ + "1970-01-01 00:00:01", + "2000-02-28 12:00:10", + "2038-01-18 23:59:59", + None, + ], + dtype="datetime64[ns]", + ), + "test_timestamptz": pd.Series( + [ + "1970-01-01 00:00:01+00:00", + "2000-02-28 16:00:10+00:00", + "2038-01-18 15:59:59+00:00", + None, + ], + dtype="datetime64[ns, UTC]", + ), + "test_int16": pd.Series([0, 1, 2, 3], dtype="int32"), + "test_int64": pd.Series( + [-9223372036854775808, 0, 9223372036854775807, None], dtype="float64" + ), + "test_float32": pd.Series( + [None, 3.1415926535, 2.71, -1e-37], dtype="float32" + ), + "test_numeric": pd.Series([None, 521.34, 0.00, 0.00], dtype="float64"), + "test_bpchar": pd.Series(["a ", "bb ", "ccc ", None], dtype="object"), + "test_char": pd.Series(["a", "b", None, "d"], dtype="object"), + "test_varchar": pd.Series([None, "bb", "c", "defghijklm"], dtype="object"), + "test_uuid": pd.Series( + [ + "86b494cc-96b2-11eb-9298-3e22fbb9fe9d", + "86b49b84-96b2-11eb-9298-3e22fbb9fe9d", + "86b49c42-96b2-11eb-9298-3e22fbb9fe9d", + None, + ], + 
dtype="object", + ), + "test_time": pd.Series( + [ + datetime.time(8, 12, 40), + None, + datetime.time(23, 0, 10), + datetime.time(18, 30), + ], + dtype="object", + ), + "test_bytea": pd.Series( + [ + None, + b"\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xcc\x81\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5", + b"", + b"\xf0\x9f\x98\x9c", + ], + dtype="object", + ), + "test_json": pd.Series( + [ + '{"customer":"John Doe","items":{"product":"Beer","qty":6}}', + '{"customer":"Lily Bush","items":{"product":"Diaper","qty":24}}', + '{"customer":"Josh William","items":{"product":"Toy Car","qty":1}}', + None, + ], + dtype="object", + ), + "test_jsonb": pd.Series( + [ + '{"product":"Beer","qty":6}', + '{"product":"Diaper","qty":24}', + '{"product":"Toy Car","qty":1}', + None, + ], + dtype="object", + ), + "test_f4array": pd.Series( + [[], None, [123.123], [-1e-37, 1e37]], dtype="object" + ), + "test_f8array": pd.Series( + [[], None, [1e-307, 1e308], [0.000234, -12.987654321]], dtype="object" + ), + "test_narray": pd.Series( + [[], None, [521.34], [0.12, 333.33, 22.22]], dtype="object" + ), + "test_i2array": pd.Series( + [[-1, 0, 1], [], [-32768, 32767], None], dtype="object" + ), + "test_i4array": pd.Series( + [[-1, 0, 1123], [], [-2147483648, 2147483647], None], dtype="object" + ), + "test_i8array": pd.Series( + [[-9223372036854775808, 9223372036854775807], [], [0], None], + dtype="object", + ), + "test_enum": pd.Series( + ["happy", "very happy", "ecstatic", None], dtype="object" + ), + "test_ltree": pd.Series( + ["A.B.C.D", "A.B.E", "A", None], dtype="object" + ), + "test_name": pd.Series( + ["0", "21", "someName", "101203203-1212323-22131235"] + ) + + }, + ) + assert_frame_equal(df, expected, check_names=True) diff --git a/connectorx-python/connectorx/tests/test_bigquery.py b/connectorx-python/connectorx/tests/test_bigquery.py new file mode 100644 index 0000000..c5007a3 --- /dev/null +++ b/connectorx-python/connectorx/tests/test_bigquery.py @@ -0,0 +1,312 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from .. 
import read_sql + + +@pytest.fixture(scope="module") # type: ignore +def bigquery_url() -> str: + conn = os.environ["BIGQUERY_URL"] + return conn + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_without_partition(bigquery_url: str) -> None: + query = "select * from `dataprep-bigquery.dataprep.test_table` order by test_int" + df = read_sql(bigquery_url, query) + expected = pd.DataFrame( + index=range(5), + data={ + "test_int": pd.Series([1, 2, 4, 5, 2333], dtype="Int64"), + "test_string": pd.Series( + ["str1", "str2", None, "str05", None], dtype="object" + ), + "test_float": pd.Series([1.10, 2.20, -4.44, None, None], dtype="float64"), + "test_bool": pd.Series([True, False, False, None, True], dtype="boolean"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_with_partition(bigquery_url: str) -> None: + query = "select * from `dataprep-bigquery.dataprep.test_table` order by test_int" + df = read_sql( + bigquery_url, + query, + partition_on="test_int", + partition_num=3, + partition_range=[0, 2500], + ) + df = df.sort_values("test_int").reset_index(drop=True) + expected = pd.DataFrame( + index=range(5), + data={ + "test_int": pd.Series([1, 2, 4, 5, 2333], dtype="Int64"), + "test_string": pd.Series( + ["str1", "str2", None, "str05", None], dtype="object" + ), + "test_float": pd.Series([1.10, 2.20, -4.44, None, None], dtype="float64"), + "test_bool": pd.Series([True, False, False, None, True], dtype="boolean"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_with_partition_without_partition_range(bigquery_url: str) -> None: + query = "select * from `dataprep-bigquery.dataprep.test_table` order by test_int" + df = read_sql(bigquery_url, query, partition_on="test_int", partition_num=3) + df = df.sort_values("test_int").reset_index(drop=True) + expected = pd.DataFrame( + index=range(5), + data={ + "test_int": pd.Series([1, 2, 4, 5, 2333], dtype="Int64"), + "test_string": pd.Series( + ["str1", "str2", None, "str05", None], dtype="object" + ), + "test_float": pd.Series([1.10, 2.20, -4.44, None, None], dtype="float64"), + "test_bool": pd.Series([True, False, False, None, True], dtype="boolean"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_manual_partition(bigquery_url: str) -> None: + queries = [ + "select * from `dataprep-bigquery.dataprep.test_table` where test_int < 2 order by test_int", + "select * from `dataprep-bigquery.dataprep.test_table` where test_int >= 2 order by test_int", + ] + df = read_sql(bigquery_url, query=queries) + df = df.sort_values("test_int").reset_index(drop=True) + expected = pd.DataFrame( + index=range(5), + data={ + "test_int": pd.Series([1, 2, 4, 5, 2333], dtype="Int64"), + "test_string": pd.Series( + ["str1", "str2", None, "str05", None], dtype="object" + ), + "test_float": pd.Series([1.10, 2.20, -4.44, None, None], dtype="float64"), + "test_bool": pd.Series([True, False, False, None, True], dtype="boolean"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not 
os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_some_empty_partition(bigquery_url: str) -> None: + query = "select * from `dataprep-bigquery.dataprep.test_table` where test_int=1" + df = read_sql(bigquery_url, query, partition_on="test_int", partition_num=3) + df = df.sort_values("test_int").reset_index(drop=True) + expected = pd.DataFrame( + index=range(1), + data={ + "test_int": pd.Series([1], dtype="Int64"), + "test_string": pd.Series( + ["str1"], dtype="object" + ), + "test_float": pd.Series([1.10], dtype="float64"), + "test_bool": pd.Series([True], dtype="boolean"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_join(bigquery_url: str) -> None: + query = "SELECT T.test_int, T.test_string, S.test_str FROM `dataprep-bigquery.dataprep.test_table` T INNER JOIN `dataprep-bigquery.dataprep.test_types` S ON T.test_int = S.test_int" + df = read_sql( + bigquery_url, + query + ) + df = df.sort_values("test_int").reset_index(drop=True) + expected = pd.DataFrame( + index=range(2), + data={ + "test_int": pd.Series([1, 2], dtype="Int64"), + "test_string": pd.Series( + [ + "str1", + "str2", + ], + dtype="object" + ), + "test_str": pd.Series( + [ + "😁😂😜", + "こんにちはЗдра́в", + ], + dtype="object" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_join_with_partition(bigquery_url: str) -> None: + query = "SELECT T.test_int, T.test_string, S.test_str FROM `dataprep-bigquery.dataprep.test_table` T INNER JOIN `dataprep-bigquery.dataprep.test_types` S ON T.test_int = S.test_int" + df = read_sql( + bigquery_url, + query, + partition_on="test_int", + partition_num=3, + ) + df = df.sort_values("test_int").reset_index(drop=True) + expected = pd.DataFrame( + index=range(2), + data={ + "test_int": pd.Series([1, 2], dtype="Int64"), + "test_string": pd.Series( + [ + "str1", + "str2", + ], + dtype="object" + ), + "test_str": pd.Series( + [ + "😁😂😜", + "こんにちはЗдра́в", + ], + dtype="object" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_aggregation1(bigquery_url: str) -> None: + query = "SELECT test_bool, SUM(test_int) as sum_int, SUM(test_float) as sum_float FROM `dataprep-bigquery.dataprep.test_table` GROUP BY test_bool" + df = read_sql(bigquery_url, query) + df = df.sort_values("sum_int").reset_index(drop=True) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([None, False, True], dtype="boolean"), + "sum_int": pd.Series([5, 6, 2334], dtype="Int64"), + "sum_float": pd.Series([None, -2.24, 1.10], dtype="float64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_aggregation2(bigquery_url: str) -> None: + query = "select MAX(test_int) as max_int, MIN(test_int) min_int from `dataprep-bigquery.dataprep.test_table`" + df = read_sql(bigquery_url, query) + expected = 
pd.DataFrame( + index=range(1), + data={ + "max_int": pd.Series([2333], dtype="Int64"), + "min_int": pd.Series([1], dtype="Int64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_aggregation1_with_partition(bigquery_url: str) -> None: + query = "SELECT test_bool, SUM(test_int) as sum_int, SUM(test_float) as sum_float FROM `dataprep-bigquery.dataprep.test_table` GROUP BY test_bool" + df = read_sql(bigquery_url, query, partition_on="sum_int", partition_num=2) + df.sort_values(by="sum_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([None, False, True], dtype="boolean"), + "sum_int": pd.Series([5, 6, 2334], dtype="Int64"), + "sum_float": pd.Series([None, -2.24, 1.10], dtype="float64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_aggregation2_with_partition(bigquery_url: str) -> None: + query = "select MAX(test_int) as max_int, MIN(test_int) min_int from `dataprep-bigquery.dataprep.test_table`" + df = read_sql(bigquery_url, query, partition_on="max_int", partition_num=2) + expected = pd.DataFrame( + index=range(1), + data={ + "max_int": pd.Series([2333], dtype="Int64"), + "min_int": pd.Series([1], dtype="Int64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("BIGQUERY_URL"), + reason="Test bigquery only when `BIGQUERY_URL` is set", +) +def test_bigquery_types(bigquery_url: str) -> None: + query = "select * from `dataprep-bigquery.dataprep.test_types`" + df = read_sql(bigquery_url, query) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int": pd.Series([1, 2, None], dtype="Int64"), + "test_numeric": pd.Series([1.23, 234.56, None], dtype="float"), + "test_bool": pd.Series([True, None, False], dtype="boolean"), + "test_date": pd.Series( + ["1937-01-28", "2053-07-25", None], dtype="datetime64[ns]" + ), + "test_time": pd.Series(["00:00:00", "12:59:59", None], dtype="object"), + "test_datetime": pd.Series( + [None, "2053-07-25 12:59:59", "1937-01-28 00:00:00"], + dtype="datetime64[ns]", + ), + "test_timestamp": pd.Series( + ["1970-01-01 00:00:01.000", None, "2004-02-29 09:00:01.300"], + dtype="datetime64[ns]", + ), + "test_str": pd.Series(["😁😂😜", "こんにちはЗдра́в", None], dtype="object"), + "test_bytes": pd.Series( + ["8J+YgfCfmILwn5ic", "44GT44KT44Gr44Gh44Gv0JfQtNGA0LDMgdCy", None], + dtype="object", + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) diff --git a/connectorx-python/connectorx/tests/test_clickhouse.py b/connectorx-python/connectorx/tests/test_clickhouse.py new file mode 100644 index 0000000..630ef37 --- /dev/null +++ b/connectorx-python/connectorx/tests/test_clickhouse.py @@ -0,0 +1,83 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from .. 
import read_sql + + +@pytest.fixture(scope="module") # type: ignore +def clickhouse_url() -> str: + conn = os.environ["CLICKHOUSE_URL"] + return conn + + +@pytest.mark.skipif( + not os.environ.get("CLICKHOUSE_URL"), + reason="Do not test Clickhouse unless `CLICKHOUSE_URL` is set", +) +def test_clickhouse_without_partition(clickhouse_url: str) -> None: + query = "select * from test_table limit 3" + # clickhouse does not support binary protocol + df = read_sql(clickhouse_url, query, protocol="text") + # result from clickhouse might have different order each time + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int": pd.Series([1, 2, 3], dtype="float64"), + "test_str": pd.Series(["abc", "defg", "hijkl"], dtype="object"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("CLICKHOUSE_URL"), + reason="Do not test Clickhouse unless `CLICKHOUSE_URL` is set", +) +def test_clickhouse_with_partition(clickhouse_url: str) -> None: + query = "select * from test_table" + df = read_sql( + clickhouse_url, query, partition_on="test_int", partition_num=3, protocol="text" + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 3, 4, 5, 6], dtype="float64"), + "test_str": pd.Series( + ["abc", "defg", "hijkl", "mnopqr", "st", "u"], dtype="object" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("CLICKHOUSE_URL"), + reason="Do not test Clickhouse unless `CLICKHOUSE_URL` is set", +) +def test_clickhouse_types(clickhouse_url: str) -> None: + query = "select * from test_types" + df = read_sql(clickhouse_url, query, protocol="text") + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int": pd.Series([1, 2, 3], dtype="Int64"), + "test_float": pd.Series([2.3, 3.3, 4.3], dtype="float64"), + "test_date": pd.Series( + ["1999-07-25", "1979-04-07", "1999-09-22"], dtype="datetime64[ns]" + ), + "test_datetime": pd.Series( + ["1999-07-25 23:14:07", "1979-04-07 03:04:37", "1999-07-25 20:21:14"], + dtype="datetime64[ns]", + ), + "test_decimal": pd.Series(["2.22", "3.33", "4.44"], dtype="object"), + "test_varchar": pd.Series(["こんにちは", "Ha好ち😁ðy", "b"], dtype="object"), + "test_char": pd.Series(["0123456789", "abcdefghij", "321"], dtype="object"), + }, + ) + assert_frame_equal(df, expected, check_names=True) diff --git a/connectorx-python/connectorx/tests/test_dask.py b/connectorx-python/connectorx/tests/test_dask.py new file mode 100644 index 0000000..a36838f --- /dev/null +++ b/connectorx-python/connectorx/tests/test_dask.py @@ -0,0 +1,42 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from .. 
import read_sql
+
+
+@pytest.fixture(scope="module") # type: ignore
+def postgres_url() -> str:
+    conn = os.environ["POSTGRES_URL"]
+    return conn
+
+
+def test_dask(postgres_url: str) -> None:
+    query = "SELECT * FROM test_table"
+    df = read_sql(
+        postgres_url,
+        query,
+        partition_on="test_int",
+        partition_range=(0, 2000),
+        partition_num=3,
+        return_type="dask",
+    )
+    expected = pd.DataFrame(
+        index=range(6),
+        data={
+            "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"),
+            "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"),
+            "test_str": pd.Series(
+                ["a", "str1", "str2", "b", "c", None], dtype="object"
+            ),
+            "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"),
+            "test_bool": pd.Series(
+                [None, True, False, False, None, True], dtype="boolean"
+            ),
+        },
+    )
+    df = df.compute()
+    df.sort_values(by="test_int", inplace=True, ignore_index=True)
+    assert_frame_equal(df, expected, check_names=True)
diff --git a/connectorx-python/connectorx/tests/test_federation.py b/connectorx-python/connectorx/tests/test_federation.py
new file mode 100644
index 0000000..36203d7
--- /dev/null
+++ b/connectorx-python/connectorx/tests/test_federation.py
@@ -0,0 +1,59 @@
+import os
+
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from .. import read_sql
+
+
+@pytest.fixture(scope="module") # type: ignore
+def db1_url() -> str:
+    conn = os.environ["DB1"]
+    return conn
+
+
+@pytest.fixture(scope="module") # type: ignore
+def db2_url() -> str:
+    conn = os.environ["DB2"]
+    return conn
+
+
+@pytest.mark.skipif(
+    not (os.environ.get("DB1") and os.environ.get("DB2")),
+    reason="Do not test federated queries unless both `DB1` and `DB2` are set",
+)
+def test_fed_spj(db1_url: str, db2_url: str) -> None:
+    query = "SELECT T.test_int, T.test_bool, S.test_language FROM db1.test_table T INNER JOIN db2.test_str S ON T.test_int = S.id"
+    df = read_sql({"db1": db1_url, "db2": db2_url}, query)
+    expected = pd.DataFrame(
+        index=range(5),
+        data={
+            "TEST_INT": pd.Series([0, 1, 2, 3, 4], dtype="int64"),
+            "TEST_BOOL": pd.Series([None, True, False, False, None], dtype="object"),
+            "TEST_LANGUAGE": pd.Series(
+                ["English", "中文", "日本語", "русский", "Emoji"], dtype="object"
+            ),
+        },
+    )
+    df.sort_values(by="TEST_INT", inplace=True, ignore_index=True)
+    assert_frame_equal(df, expected, check_names=True)
+
+
+@pytest.mark.skipif(
+    not (os.environ.get("DB1") and os.environ.get("DB2")),
+    reason="Do not test federated queries unless both `DB1` and `DB2` are set",
+)
+def test_fed_spja(db1_url: str, db2_url: str) -> None:
+    query = "select test_bool, AVG(test_float) as avg_float, SUM(test_int) as sum_int from db1.test_table as a, db2.test_str as b where a.test_int = b.id AND test_nullint is not NULL GROUP BY test_bool ORDER BY sum_int"
+    df = read_sql({"db1": db1_url, "db2": db2_url}, query)
+    expected = pd.DataFrame(
+        index=range(3),
+        data={
+            "test_bool": pd.Series([True, False, None], dtype="object"),
+            "AVG_FLOAT": pd.Series([None, 3, 5.45], dtype="float64"),
+            "SUM_INT": pd.Series([1, 3, 4], dtype="int64"),
+        },
+    )
+    df.sort_values(by="SUM_INT", inplace=True, ignore_index=True)
+    assert_frame_equal(df, expected, check_names=True)
diff --git a/connectorx-python/connectorx/tests/test_meta.py b/connectorx-python/connectorx/tests/test_meta.py
new file mode 100644
index 0000000..5484845
--- /dev/null
+++ b/connectorx-python/connectorx/tests/test_meta.py
@@ -0,0 +1,34 @@
+import os
+
+import pandas as pd
+import pytest
+from
pandas.testing import assert_frame_equal + +from .. import get_meta + + +@pytest.fixture(scope="module") # type: ignore +def postgres_url() -> str: + conn = os.environ["POSTGRES_URL"] + return conn + +def test_get_meta(postgres_url: str) -> None: + query = "SELECT * FROM test_table limit 10" + df = get_meta( + postgres_url, + query, + ) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([], dtype="Int64"), + "test_nullint": pd.Series([], dtype="Int64"), + "test_str": pd.Series( + [], dtype="object" + ), + "test_float": pd.Series([], dtype="float64"), + "test_bool": pd.Series( + [], dtype="boolean" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) \ No newline at end of file diff --git a/connectorx-python/connectorx/tests/test_modin.py b/connectorx-python/connectorx/tests/test_modin.py new file mode 100644 index 0000000..285bab5 --- /dev/null +++ b/connectorx-python/connectorx/tests/test_modin.py @@ -0,0 +1,42 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from .. import read_sql + + +@pytest.fixture(scope="module") # type: ignore +def postgres_url() -> str: + conn = os.environ["POSTGRES_URL"] + return conn + + +def test_modin(postgres_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + return_type="modin", + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df = df._to_pandas() + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) diff --git a/connectorx-python/connectorx/tests/test_mssql.py b/connectorx-python/connectorx/tests/test_mssql.py new file mode 100644 index 0000000..ee04d00 --- /dev/null +++ b/connectorx-python/connectorx/tests/test_mssql.py @@ -0,0 +1,498 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from .. 
import read_sql + + +@pytest.fixture(scope="module") # type: ignore +def mssql_url() -> str: + conn = os.environ["MSSQL_URL"] + # conn = os.environ["AZURE_MSSQL_URL"] + return conn + + +@pytest.mark.xfail +def test_on_non_select(mssql_url: str) -> None: + query = "CREATE TABLE non_select(id INTEGER NOT NULL)" + df = read_sql(mssql_url, query) + + +def test_aggregation(mssql_url: str) -> None: + query = ( + "SELECT test_bool, SUM(test_float) as sum FROM test_table GROUP BY test_bool" + ) + df = read_sql(mssql_url, query) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([None, False, True], dtype="boolean"), + "sum": pd.Series([10.9, 5.2, -10.0], dtype="float64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_partition_on_aggregation(mssql_url: str) -> None: + query = ( + "SELECT test_bool, SUM(test_int) AS test_int FROM test_table GROUP BY test_bool" + ) + df = read_sql(mssql_url, query, partition_on="test_int", partition_num=2) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([None, False, True], dtype="boolean"), + "test_int": pd.Series([4, 5, 1315], dtype="Int64"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_aggregation2(mssql_url: str) -> None: + query = "select DISTINCT(test_bool) from test_table" + df = read_sql(mssql_url, query) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([None, False, True], dtype="boolean"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_partition_on_aggregation2(mssql_url: str) -> None: + query = "select MAX(test_int) as max, MIN(test_int) as min from test_table" + df = read_sql(mssql_url, query, partition_on="max", partition_num=2) + expected = pd.DataFrame( + index=range(1), + data={ + "max": pd.Series([1314], dtype="Int64"), + "min": pd.Series([0], dtype="Int64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_udf(mssql_url: str) -> None: + query = ( + "SELECT dbo.increment(test_int) AS test_int FROM test_table ORDER BY test_int" + ) + df = read_sql(mssql_url, query, partition_on="test_int", partition_num=2) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 3, 4, 5, 1315], dtype="Int64"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_manual_partition(mssql_url: str) -> None: + + queries = [ + "SELECT * FROM test_table WHERE test_int < 2", + "SELECT * FROM test_table WHERE test_int >= 2", + ] + + df = read_sql(mssql_url, query=queries) + + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_without_partition(mssql_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql(mssql_url, query) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 0, 3, 4, 1314], dtype="int64"), + "test_nullint": 
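+            # NULL-able integer columns come back as pandas' nullable "Int64"
+            # (capital I), while NOT NULL columns like test_int stay plain "int64".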
pd.Series([3, None, 5, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["str1", "str2", "a", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([None, 2.2, 3.1, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [True, False, None, False, None, True], dtype="boolean" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_limit_without_partition(mssql_url: str) -> None: + query = "SELECT top 3 * FROM test_table" + df = read_sql(mssql_url, query) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int": pd.Series([1, 2, 0], dtype="int64"), + "test_nullint": pd.Series([3, None, 5], dtype="Int64"), + "test_str": pd.Series(["str1", "str2", "a"], dtype="object"), + "test_float": pd.Series([None, 2.2, 3.1], dtype="float64"), + "test_bool": pd.Series([True, False, None], dtype="boolean"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_limit_large_without_partition(mssql_url: str) -> None: + query = "SELECT top 10 * FROM test_table" + df = read_sql(mssql_url, query) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 0, 3, 4, 1314], dtype="int64"), + "test_nullint": pd.Series([3, None, 5, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["str1", "str2", "a", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([None, 2.2, 3.1, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [True, False, None, False, None, True], dtype="boolean" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_with_partition(mssql_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + mssql_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_limit_with_partition(mssql_url: str) -> None: + query = "SELECT top 3 * FROM test_table" + df = read_sql( + mssql_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int": pd.Series([0, 1, 2], dtype="int64"), + "test_nullint": pd.Series([5, 3, None], dtype="Int64"), + "test_str": pd.Series(["a", "str1", "str2"], dtype="object"), + "test_float": pd.Series([3.1, None, 2.20], dtype="float64"), + "test_bool": pd.Series([None, True, False], dtype="boolean"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_limit_large_with_partition(mssql_url: str) -> None: + query = "SELECT top 10 * FROM test_table" + df = read_sql( + mssql_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" 
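+                # "top 10" asks for more rows than the 6 in test_table, so the
+                # whole table is expected back even under a partitioned read.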
+ ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_with_partition_without_partition_range(mssql_url: str) -> None: + query = "SELECT * FROM test_table where test_float > 3" + df = read_sql( + mssql_url, + query, + partition_on="test_int", + partition_num=3, + ) + + expected = pd.DataFrame( + index=range(2), + data={ + "test_int": pd.Series([0, 4], dtype="int64"), + "test_nullint": pd.Series([5, 9], dtype="Int64"), + "test_str": pd.Series(["a", "c"], dtype="object"), + "test_float": pd.Series([3.1, 7.8], dtype="float64"), + "test_bool": pd.Series([None, None], dtype="boolean"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_with_partition_and_selection(mssql_url: str) -> None: + query = "SELECT * FROM test_table WHERE 1 = 3 OR 2 = 2" + df = read_sql( + mssql_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_with_partition_and_projection(mssql_url: str) -> None: + query = "SELECT test_int, test_float, test_str FROM test_table" + df = read_sql( + mssql_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="int64"), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_with_partition_and_spja(mssql_url: str) -> None: + query = """ + SELECT test_bool, AVG(test_float) AS avg, SUM(test_int) AS sum + FROM test_table AS a, test_str AS b + WHERE a.test_int = b.id AND test_nullint IS NOT NULL + GROUP BY test_bool + ORDER BY sum + """ + df = read_sql(mssql_url, query, partition_on="sum", partition_num=2) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([True, False, None], dtype="boolean"), + "avg": pd.Series([None, 3, 5.45], dtype="float64"), + "sum": pd.Series([1, 3, 4], dtype="Int64"), + }, + ) + df = df.sort_values("sum").reset_index(drop=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result(mssql_url: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(mssql_url, query) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([], dtype="int64"), + "test_nullint": pd.Series([], dtype="Int64"), + "test_str": pd.Series([], dtype="object"), + "test_float": pd.Series([], dtype="float64"), + "test_bool": pd.Series([], dtype="boolean"), + } 
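+        # an empty result still carries the full schema: each Series above has
+        # zero rows but keeps the dtype inferred from the column's type.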
+ ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result_on_partition(mssql_url: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(mssql_url, query, partition_on="test_int", partition_num=3) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([], dtype="int64"), + "test_nullint": pd.Series([], dtype="Int64"), + "test_str": pd.Series([], dtype="object"), + "test_float": pd.Series([], dtype="float64"), + "test_bool": pd.Series([], dtype="boolean"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result_on_some_partition(mssql_url: str) -> None: + query = "SELECT * FROM test_table where test_int < 1" + df = read_sql(mssql_url, query, partition_on="test_int", partition_num=3) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([0], dtype="int64"), + "test_nullint": pd.Series([5], dtype="Int64"), + "test_str": pd.Series(["a"], dtype="object"), + "test_float": pd.Series([3.1], dtype="float"), + "test_bool": pd.Series([None], dtype="boolean"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_types(mssql_url: str) -> None: + query = "SELECT * FROM test_types" + df = read_sql(mssql_url, query) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int1": pd.Series([0, 255, None], dtype="Int64"), + "test_int2": pd.Series([-32768, 32767, None], dtype="Int64"), + "test_int4": pd.Series([-2147483648, 2147483647, None], dtype="Int64"), + "test_int8": pd.Series( + [-9223372036854775808, 9223372036854775807, None], dtype="Int64" + ), + "test_float24": pd.Series([None, 1.18e-38, 3.40e38], dtype="float"), + "test_float53": pd.Series([None, -2.23e-308, 1.79e308], dtype="float"), + "test_floatn": pd.Series([None, 0, 123.1234567], dtype="float"), + "test_date": pd.Series( + ["1999-07-25", None, "2021-01-28"], dtype="datetime64[ns]" + ), + "test_time": pd.Series(["00:00:00", "23:59:59", None], dtype="object"), + "test_datetime": pd.Series( + [None, "2020-12-31 23:59:59", "2021-01-28 10:30:30"], + dtype="datetime64[ns]", + ), + "test_smalldatetime": pd.Series( + ["1990-01-01 10:00:00", None, "2079-06-05 23:00:00"], + dtype="datetime64[ns]", + ), + "test_naivedatetime": pd.Series( + ["1753-01-01 12:00:00", "2038-12-31 01:00:00", None], + dtype="datetime64[ns]", + ), + "test_naivedatetime2": pd.Series( + ["1900-01-01 12:00:00.12345", None, "2027-03-18 14:30:30.54321"], + dtype="datetime64[ns]", + ), + "test_new_decimal": pd.Series([1.1, 2.2, None], dtype="float"), + "test_decimal": pd.Series([1, 2, None], dtype="float"), + "test_varchar": pd.Series([None, "varchar2", "varchar3"], dtype="object"), + "test_char": pd.Series([None, "char2 ", "char3 "], dtype="object"), + "test_varbinary": pd.Series([None, b"1234", b""], dtype="object"), + "test_binary": pd.Series( + [None, b"12\x00\x00\x00", b"\x00\x00\x00\x00\x00"], dtype="object" + ), + "test_nchar": pd.Series(["1234", None, "12 "], dtype="object"), + "test_text": pd.Series(["text", "t", None], dtype="object"), + "test_ntext": pd.Series(["ntext", "nt", None], dtype="object"), + "test_uuid": pd.Series( + [ + "86b494cc-96b2-11eb-9298-3e22fbb9fe9d", + None, + "86b49b84-96b2-11eb-9298-3e22fbb9fe9d", + ], + dtype="object", + ), + "test_money": pd.Series( + [None, 922337203685477.5807, -922337203685477.5808], dtype="float" + ), + "test_smallmoney": pd.Series( + [None, 214748.3647, -214748.3648], dtype="float" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_unicode(mssql_url: str) -> 
None: + query = "SELECT test_hello FROM test_str where 1 <= id and id <= 4" + df = read_sql(mssql_url, query) + expected = pd.DataFrame( + index=range(4), + data={ + "test_hello": pd.Series( + ["你好", "こんにちは", "Здра́вствуйте", "😁😂😜"], dtype="object" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_cte(mssql_url: str) -> None: + query = "with test_cte (test_int, test_str) as (select test_int, test_str from test_table where test_float > 0) select test_int, test_str from test_cte" + df = read_sql(mssql_url, query, partition_on="test_int", partition_num=3) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(4), + data={ + "test_int": pd.Series([0, 2, 3, 4], dtype="int64"), + "test_str": pd.Series(["a", "str2", "b", "c"], dtype="object"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mssql_offset(mssql_url: str) -> None: + query = "SELECT * FROM (SELECT * FROM test_table) AS _ ORDER BY(SELECT NULL) OFFSET 0 ROWS FETCH NEXT 1 ROWS ONLY" + df = read_sql(mssql_url, query) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([1], dtype="int64"), + "test_nullint": pd.Series([3], dtype="Int64"), + "test_str": pd.Series(["str1"], dtype="object"), + "test_float": pd.Series([None], dtype="float"), + "test_bool": pd.Series([True], dtype="boolean"), + } + ) + assert_frame_equal(df, expected, check_names=True) diff --git a/connectorx-python/connectorx/tests/test_mysql.py b/connectorx-python/connectorx/tests/test_mysql.py new file mode 100644 index 0000000..9376bf5 --- /dev/null +++ b/connectorx-python/connectorx/tests/test_mysql.py @@ -0,0 +1,470 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from .. 
import read_sql
+
+
+@pytest.fixture(scope="module") # type: ignore
+def mysql_url() -> str:
+    conn = os.environ["MYSQL_URL"]
+    # conn = os.environ["MARIADB_URL"]
+    return conn
+
+
+def test_mysql_with_partition_small_range(mysql_url: str) -> None:
+    query = "select * from test_table"
+    df = read_sql(
+        mysql_url,
+        query,
+        partition_on="test_int",
+        partition_range=(0, 10),
+        partition_num=6,
+    )
+    expected = pd.DataFrame(
+        index=range(6),
+        data={
+            "test_int": pd.Series([1, 2, 3, 4, 5, 6], dtype="Int64"),
+            "test_float": pd.Series([1.1, 2.2, 3.3, 4.4, 5.5, 6.6], dtype="float64"),
+            "test_enum": pd.Series(
+                ["odd", "even", "odd", "even", "odd", "even"], dtype="object"
+            ),
+            "test_null": pd.Series([None, None, None, None, None, None], dtype="Int64"),
+        },
+    )
+    df.sort_values(by="test_int", inplace=True, ignore_index=True)
+    assert_frame_equal(df, expected, check_names=True)
+
+
+def test_mysql_without_partition(mysql_url: str) -> None:
+    query = "SELECT * FROM test_table"
+    df = read_sql(mysql_url, query)
+    expected = pd.DataFrame(
+        index=range(6),
+        data={
+            "test_int": pd.Series([1, 2, 3, 4, 5, 6], dtype="Int64"),
+            "test_float": pd.Series([1.1, 2.2, 3.3, 4.4, 5.5, 6.6], dtype="float64"),
+            "test_enum": pd.Series(
+                ["odd", "even", "odd", "even", "odd", "even"], dtype="object"
+            ),
+            "test_null": pd.Series([None, None, None, None, None, None], dtype="Int64"),
+        },
+    )
+    assert_frame_equal(df, expected, check_names=True)
+
+
+def test_mysql_limit_without_partition(mysql_url: str) -> None:
+    query = "SELECT * FROM test_table limit 3"
+    df = read_sql(mysql_url, query)
+    expected = pd.DataFrame(
+        index=range(3),
+        data={
+            "test_int": pd.Series([1, 2, 3], dtype="Int64"),
+            "test_float": pd.Series([1.1, 2.2, 3.3], dtype="float64"),
+            "test_enum": pd.Series(["odd", "even", "odd"], dtype="object"),
+            "test_null": pd.Series([None, None, None], dtype="Int64"),
+        },
+    )
+    assert_frame_equal(df, expected, check_names=True)
+
+
+def test_mysql_limit_large_without_partition(mysql_url: str) -> None:
+    query = "SELECT * FROM test_table limit 10"
+    df = read_sql(mysql_url, query)
+    expected = pd.DataFrame(
+        index=range(6),
+        data={
+            "test_int": pd.Series([1, 2, 3, 4, 5, 6], dtype="Int64"),
+            "test_float": pd.Series([1.1, 2.2, 3.3, 4.4, 5.5, 6.6], dtype="float64"),
+            "test_enum": pd.Series(
+                ["odd", "even", "odd", "even", "odd", "even"], dtype="object"
+            ),
+            "test_null": pd.Series([None, None, None, None, None, None], dtype="Int64"),
+        },
+    )
+    assert_frame_equal(df, expected, check_names=True)
+
+
+def test_mysql_with_partition(mysql_url: str) -> None:
+    query = "SELECT * FROM test_table"
+    df = read_sql(
+        mysql_url,
+        query,
+        partition_on="test_int",
+        partition_range=(0, 2000),
+        partition_num=3,
+    )
+    expected = pd.DataFrame(
+        index=range(6),
+        data={
+            "test_int": pd.Series([1, 2, 3, 4, 5, 6], dtype="Int64"),
+            "test_float": pd.Series([1.1, 2.2, 3.3, 4.4, 5.5, 6.6], dtype="float64"),
+            "test_enum": pd.Series(
+                ["odd", "even", "odd", "even", "odd", "even"], dtype="object"
+            ),
+            "test_null": pd.Series([None,
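+                # test_null comes back all-NULL; the "Int64" dtype presumably
+                # reflects the column's declared integer type.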
None, None, None, None, None], dtype="Int64"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mysql_limit_with_partition(mysql_url: str) -> None: + query = "SELECT * FROM test_table limit 3" + df = read_sql( + mysql_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int": pd.Series([1, 2, 3], dtype="Int64"), + "test_float": pd.Series([1.1, 2.2, 3.3], dtype="float64"), + "test_enum": pd.Series(["odd", "even", "odd"], dtype="object"), + "test_null": pd.Series([None, None, None], dtype="Int64"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mysql_limit_large_with_partition(mysql_url: str) -> None: + query = "SELECT * FROM test_table limit 10" + df = read_sql( + mysql_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 3, 4, 5, 6], dtype="Int64"), + "test_float": pd.Series([1.1, 2.2, 3.3, 4.4, 5.5, 6.6], dtype="float64"), + "test_enum": pd.Series( + ["odd", "even", "odd", "even", "odd", "even"], dtype="object" + ), + "test_null": pd.Series([None, None, None, None, None, None], dtype="Int64"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mysql_with_partition_without_partition_range(mysql_url: str) -> None: + query = "SELECT * FROM test_table where test_float > 3" + df = read_sql( + mysql_url, + query, + partition_on="test_int", + partition_num=3, + ) + expected = pd.DataFrame( + index=range(4), + data={ + "test_int": pd.Series([3, 4, 5, 6], dtype="Int64"), + "test_float": pd.Series([3.3, 4.4, 5.5, 6.6], dtype="float64"), + "test_enum": pd.Series(["odd", "even", "odd", "even"], dtype="object"), + "test_null": pd.Series([None, None, None, None], dtype="Int64"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mysql_manual_partition(mysql_url: str) -> None: + queries = [ + "SELECT * FROM test_table WHERE test_int < 2", + "SELECT * FROM test_table WHERE test_int >= 2", + ] + df = read_sql(mysql_url, query=queries) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 3, 4, 5, 6], dtype="Int64"), + "test_float": pd.Series([1.1, 2.2, 3.3, 4.4, 5.5, 6.6], dtype="float64"), + "test_enum": pd.Series( + ["odd", "even", "odd", "even", "odd", "even"], dtype="object" + ), + "test_null": pd.Series([None, None, None, None, None, None], dtype="Int64"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mysql_selection_and_projection(mysql_url: str) -> None: + query = "SELECT test_int FROM test_table WHERE test_float < 5" + df = read_sql( + mysql_url, + query, + partition_on="test_int", + partition_num=3, + ) + expected = pd.DataFrame( + index=range(4), + data={ + "test_int": pd.Series([1, 2, 3, 4], dtype="Int64"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_mysql_join(mysql_url: str) -> None: + query = "SELECT T.test_int, T.test_float, S.test_str FROM test_table T INNER JOIN test_table_extra S ON 
T.test_int = S.test_int"
+    df = read_sql(
+        mysql_url,
+        query,
+        partition_on="test_int",
+        partition_num=3,
+    )
+    expected = pd.DataFrame(
+        index=range(3),
+        data={
+            "test_int": pd.Series([1, 2, 3], dtype="Int64"),
+            "test_float": pd.Series([1.1, 2.2, 3.3], dtype="float64"),
+            "test_str": pd.Series(
+                [
+                    "Ha好ち😁ðy̆",
+                    "こんにちは",
+                    "русский",
+                ],
+                dtype="object",
+            ),
+        },
+    )
+    df.sort_values(by="test_int", inplace=True, ignore_index=True)
+    assert_frame_equal(df, expected, check_names=True)
+
+
+def test_mysql_aggregate(mysql_url: str) -> None:
+    query = "select AVG(test_float) as avg_float, SUM(T.test_int) as sum_int, SUM(test_null) as sum_null from test_table as T INNER JOIN test_table_extra as S where T.test_int = S.test_int GROUP BY test_enum ORDER BY sum_int"
+    df = read_sql(mysql_url, query)
+    expected = pd.DataFrame(
+        index=range(2),
+        data={
+            "avg_float": pd.Series([2.2, 2.2], dtype="float64"),
+            "sum_int": pd.Series([2.0, 4.0], dtype="float64"),
+            "sum_null": pd.Series([None, None], dtype="float64"),
+        },
+    )
+    assert_frame_equal(df, expected, check_names=True)
+
+
+def test_mysql_types_binary(mysql_url: str) -> None:
+    query = "select * from test_types"
+    df = read_sql(mysql_url, query, protocol="binary")
+    expected = pd.DataFrame(
+        index=range(3),
+        data={
+            "test_timestamp": pd.Series(
+                ["1970-01-01 00:00:01", "2038-01-19 00:00:00", None],
+                dtype="datetime64[ns]",
+            ),
+            "test_date": pd.Series(
+                [None, "1970-01-01", "2038-01-19"], dtype="datetime64[ns]"
+            ),
+            "test_time": pd.Series(["00:00:00", None, "23:59:59"], dtype="object"),
+            "test_datetime": pd.Series(
+                ["1970-01-01 00:00:01", "2038-01-19 00:00:00", None],
+                dtype="datetime64[ns]",
+            ),
+            "test_new_decimal": pd.Series([1.1, None, 3.3], dtype="float"),
+            "test_decimal": pd.Series([1, 2, None], dtype="float"),
+            "test_varchar": pd.Series([None, "varchar2", "varchar3"], dtype="object"),
+            "test_char": pd.Series(["char1", None, "char3"], dtype="object"),
+            "test_tiny": pd.Series([-128, 127, None], dtype="Int64"),
+            "test_short": pd.Series([-32768, 32767, None], dtype="Int64"),
+            "test_int24": pd.Series([-8388608, 8388607, None], dtype="Int64"),
+            "test_long": pd.Series([-2147483648, 2147483647, None], dtype="Int64"),
+            "test_longlong": pd.Series(
+                [-9223372036854775808, 9223372036854775807, None], dtype="Int64"
+            ),
+            "test_tiny_unsigned": pd.Series([None, 255, 0], dtype="Int64"),
+            "test_short_unsigned": pd.Series([None, 65535, 0], dtype="Int64"),
+            "test_int24_unsigned": pd.Series([None, 16777215, 0], dtype="Int64"),
+            "test_long_unsigned": pd.Series([None, 4294967295, 0], dtype="Int64"),
+            "test_longlong_unsigned": pd.Series(
+                [None, 18446744070000001024.0, 0.0], dtype="float"
+            ),
+            "test_long_notnull": pd.Series([1, 2147483647, -2147483648], dtype="int64"),
+            "test_short_unsigned_notnull": pd.Series([1, 65535, 0], dtype="int64"),
+            "test_float": pd.Series([None, -1.1e-38, 3.4e38], dtype="float"),
+            "test_double": pd.Series([-2.2e-308, None, 1.7e308], dtype="float"),
+            "test_double_notnull": pd.Series([1.2345, -1.1e-3, 1.7e30], dtype="float"),
+            "test_year": pd.Series([1901, 2155, None], dtype="Int64"),
+            "test_tinyblob": pd.Series(
+                [None, b"tinyblob2", b"tinyblob3"], dtype="object"
+            ),
+            "test_blob": pd.Series(
+                [None, b"blobblobblobblob2", b"blobblobblobblob3"], dtype="object"
+            ),
+            "test_mediumblob": pd.Series(
+                [None, b"mediumblob2", b"mediumblob3"], dtype="object"
+            ),
+            "test_longblob": pd.Series(
+                [None, b"longblob2", b"longblob3"], dtype="object"
+            ),
+            "test_enum": pd.Series(["apple", None,
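+                # ENUM values decode to their string labels, not numeric positions.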
"mango"], dtype="object"), + "test_json": pd.Series( + ['{"age":1,"name":"piggy"}', '{"age":2,"name":"kitty"}', None], + # mariadb + # [b'{"name": "piggy", "age": 1}', b'{"name": "kitty", "age": 2}', None], + dtype="object", + ), + "test_mediumtext": pd.Series( + [None, b"", b"medium text!!!!"], dtype="object" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mysql_types_text(mysql_url: str) -> None: + query = "select * from test_types" + df = read_sql(mysql_url, query, protocol="text") + expected = pd.DataFrame( + index=range(3), + data={ + "test_timestamp": pd.Series( + ["1970-01-01 00:00:01", "2038-01-19 00:00:00", None], + dtype="datetime64[ns]", + ), + "test_date": pd.Series( + [None, "1970-01-01", "2038-01-19"], dtype="datetime64[ns]" + ), + "test_time": pd.Series(["00:00:00", None, "23:59:59"], dtype="object"), + "test_datetime": pd.Series( + ["1970-01-01 00:00:01", "2038-01-19 00:00:00", None], + dtype="datetime64[ns]", + ), + "test_new_decimal": pd.Series([1.1, None, 3.3], dtype="float"), + "test_decimal": pd.Series([1, 2, None], dtype="float"), + "test_varchar": pd.Series([None, "varchar2", "varchar3"], dtype="object"), + "test_char": pd.Series(["char1", None, "char3"], dtype="object"), + "test_tiny": pd.Series([-128, 127, None], dtype="Int64"), + "test_short": pd.Series([-32768, 32767, None], dtype="Int64"), + "test_int24": pd.Series([-8388608, 8388607, None], dtype="Int64"), + "test_long": pd.Series([-2147483648, 2147483647, None], dtype="Int64"), + "test_longlong": pd.Series( + [-9223372036854775808, 9223372036854775807, None], dtype="Int64" + ), + "test_tiny_unsigned": pd.Series([None, 255, 0], dtype="Int64"), + "test_short_unsigned": pd.Series([None, 65535, 0], dtype="Int64"), + "test_int24_unsigned": pd.Series([None, 16777215, 0], dtype="Int64"), + "test_long_unsigned": pd.Series([None, 4294967295, 0], dtype="Int64"), + "test_longlong_unsigned": pd.Series( + [None, 18446744070000001024.0, 0.0], dtype="float" + ), + "test_long_notnull": pd.Series([1, 2147483647, -2147483648], dtype="int64"), + "test_short_unsigned_notnull": pd.Series([1, 65535, 0], dtype="int64"), + "test_float": pd.Series([None, -1.1e-38, 3.4e38], dtype="float"), + "test_double": pd.Series([-2.2e-308, None, 1.7e308], dtype="float"), + "test_double_notnull": pd.Series([1.2345, -1.1e-3, 1.7e30], dtype="float"), + "test_year": pd.Series([1901, 2155, None], dtype="Int64"), + "test_tinyblob": pd.Series( + [None, b"tinyblob2", b"tinyblob3"], dtype="object" + ), + "test_blob": pd.Series( + [None, b"blobblobblobblob2", b"blobblobblobblob3"], dtype="object" + ), + "test_mediumblob": pd.Series( + [None, b"mediumblob2", b"mediumblob3"], dtype="object" + ), + "test_longblob": pd.Series( + [None, b"longblob2", b"longblob3"], dtype="object" + ), + "test_enum": pd.Series(["apple", None, "mango"], dtype="object"), + "test_json": pd.Series( + ['{"age":1,"name":"piggy"}', '{"age":2,"name":"kitty"}', None], + # mariadb + # [b'{"name": "piggy", "age": 1}', b'{"name": "kitty", "age": 2}', None], + dtype="object", + ), + "test_mediumtext": pd.Series( + [None, b"", b"medium text!!!!"], dtype="object" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result(mysql_url: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(mysql_url, query) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([], dtype="Int64"), + "test_float": pd.Series([], dtype="float64"), + "test_enum": pd.Series([], dtype="object"), + "test_null": 
pd.Series([], dtype="Int64"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result_on_partition(mysql_url: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(mysql_url, query, partition_on="test_int", partition_num=3) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([], dtype="Int64"), + "test_float": pd.Series([], dtype="float64"), + "test_enum": pd.Series([], dtype="object"), + "test_null": pd.Series([], dtype="Int64"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result_on_some_partition(mysql_url: str) -> None: + query = "SELECT * FROM test_table where test_int = 6" + df = read_sql(mysql_url, query, partition_on="test_int", partition_num=3) + expected = pd.DataFrame( + index=range(1), + data={ + "test_int": pd.Series([6], dtype="Int64"), + "test_float": pd.Series([6.6], dtype="float64"), + "test_enum": pd.Series(["even"], dtype="object"), + "test_null": pd.Series([None], dtype="Int64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_mysql_cte(mysql_url: str) -> None: + query = "with test_cte (test_int, test_enum) as (select test_int, test_enum from test_table where test_float > 2) select test_int, test_enum from test_cte" + df = read_sql(mysql_url, query, partition_on="test_int", partition_num=3) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(5), + data={ + "test_int": pd.Series([2, 3, 4, 5, 6], dtype="Int64"), + "test_enum": pd.Series( + ["even", "odd", "even", "odd", "even"], dtype="object" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) diff --git a/connectorx-python/connectorx/tests/test_oracle.py b/connectorx-python/connectorx/tests/test_oracle.py new file mode 100644 index 0000000..59e489c --- /dev/null +++ b/connectorx-python/connectorx/tests/test_oracle.py @@ -0,0 +1,443 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from .. 
import read_sql
+
+
+@pytest.fixture(scope="module") # type: ignore
+def oracle_url() -> str:
+    conn = os.environ["ORACLE_URL"]
+    return conn
+
+@pytest.mark.xfail
+@pytest.mark.skipif(
+    not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set"
+)
+def test_on_non_select(oracle_url: str) -> None:
+    query = "CREATE TABLE non_select(id INTEGER NOT NULL)"
+    read_sql(oracle_url, query)
+
+
+@pytest.mark.skipif(
+    not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set"
+)
+def test_oracle_complex_join(oracle_url: str) -> None:
+    query = "SELECT a.test_int, b.test_date, c.test_num_int FROM test_table a left join test_types b on a.test_int = b.test_num_int cross join (select test_num_int from test_types) c where c.test_num_int < 3"
+    df = read_sql(oracle_url, query)
+    df = df.sort_values("TEST_INT").reset_index(drop=True)
+    expected = pd.DataFrame(
+        data={
+            "TEST_INT": pd.Series([1, 2, 4, 5, 5, 2333], dtype="Int64"),
+            "TEST_DATE": pd.Series(
+                ["2019-05-21", None, None, "2020-05-21", "2020-05-21", None],
+                dtype="datetime64[ns]",
+            ),
+            "TEST_NUM_INT": pd.Series([1, 1, 1, 1, 1, 1], dtype="Int64"),
+        }
+    )
+    assert_frame_equal(df, expected, check_names=True)
+
+
+@pytest.mark.skipif(
+    not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set"
+)
+def test_oracle_aggregation(oracle_url: str) -> None:
+    query = "select avg(test_int), test_char from test_table group by test_char"
+    df = read_sql(oracle_url, query)
+    df = df.sort_values("AVG(TEST_INT)").reset_index(drop=True)
+    expected = pd.DataFrame(
+        data={
+            "AVG(TEST_INT)": pd.Series([1, 2, 5, 1168.5], dtype="float64"),
+            "TEST_CHAR": pd.Series(["str1 ", "str2 ", "str05", None], dtype="object"),
+        },
+    )
+    assert_frame_equal(df, expected, check_names=True)
+
+
+@pytest.mark.skipif(
+    not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set"
+)
+def test_oracle_partition_on_aggregation(oracle_url: str) -> None:
+    query = "select sum(test_int)
cid, test_char from test_table group by test_char" + df = read_sql(oracle_url, query, partition_on="cid", partition_num=3) + df = df.sort_values("CID").reset_index(drop=True) + expected = pd.DataFrame( + index=range(4), + data={ + "CID": pd.Series([1, 2, 5, 2337], dtype="float64"), + "TEST_CHAR": pd.Series(["str1 ", "str2 ", "str05", None], dtype="object"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_aggregation2(oracle_url: str) -> None: + query = "select DISTINCT(test_char) from test_table" + df = read_sql(oracle_url, query) + expected = pd.DataFrame( + data={ + "TEST_CHAR": pd.Series(["str05", "str1 ", "str2 ", None], dtype="object"), + }, + ) + df.sort_values(by="TEST_CHAR", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_partition_on_aggregation2(oracle_url: str) -> None: + query = "select MAX(test_int) MAX, MIN(test_int) MIN from test_table" + df = read_sql(oracle_url, query, partition_on="MAX", partition_num=2) + expected = pd.DataFrame( + index=range(1), + data={ + "MAX": pd.Series([2333], dtype="float64"), + "MIN": pd.Series([1], dtype="float64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_manual_partition(oracle_url: str) -> None: + queries = [ + "SELECT * FROM test_table WHERE test_int < 2", + "SELECT * FROM test_table WHERE test_int >= 2", + ] + df = read_sql(oracle_url, query=queries) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([1, 2, 4, 5, 2333], dtype="Int64"), + "TEST_CHAR": pd.Series( + ["str1 ", "str2 ", None, "str05", None], dtype="object" + ), + "TEST_FLOAT": pd.Series([1.1, 2.2, -4.44, None, None], dtype="float64"), + }, + ) + df.sort_values(by="TEST_INT", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_without_partition(oracle_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql(oracle_url, query) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([1, 2, 2333, 4, 5], dtype="Int64"), + "TEST_CHAR": pd.Series( + ["str1 ", "str2 ", None, None, "str05"], dtype="object" + ), + "TEST_FLOAT": pd.Series([1.1, 2.2, None, -4.44, None], dtype="float64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_limit_without_partition(oracle_url: str) -> None: + query = "SELECT * FROM test_table where rownum <= 3" + df = read_sql(oracle_url, query) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([1, 2, 2333], dtype="Int64"), + "TEST_CHAR": pd.Series(["str1 ", "str2 ", None], dtype="object"), + "TEST_FLOAT": pd.Series([1.1, 2.2, None], dtype="float64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_limit_large_without_partition(oracle_url: str) -> None: + query = "SELECT * FROM test_table where rownum < 10" + df = 
read_sql(oracle_url, query) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([1, 2, 2333, 4, 5], dtype="Int64"), + "TEST_CHAR": pd.Series( + ["str1 ", "str2 ", None, None, "str05"], dtype="object" + ), + "TEST_FLOAT": pd.Series([1.1, 2.2, None, -4.44, None], dtype="float64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_with_partition(oracle_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + oracle_url, + query, + partition_on="test_int", + partition_range=(0, 5001), + partition_num=3, + ) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([1, 2, 4, 5, 2333], dtype="Int64"), + "TEST_CHAR": pd.Series( + ["str1 ", "str2 ", None, "str05", None], dtype="object" + ), + "TEST_FLOAT": pd.Series([1.1, 2.2, -4.44, None, None], dtype="float64"), + }, + ) + df.sort_values(by="TEST_INT", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_with_partition_without_partition_range(oracle_url: str) -> None: + query = "SELECT * FROM test_table where test_float > 1" + df = read_sql( + oracle_url, + query, + partition_on="test_int", + partition_num=3, + ) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([1, 2], dtype="Int64"), + "TEST_CHAR": pd.Series(["str1 ", "str2 "], dtype="object"), + "TEST_FLOAT": pd.Series([1.1, 2.2], dtype="float64"), + }, + ) + df.sort_values(by="TEST_INT", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_with_partition_and_selection(oracle_url: str) -> None: + query = "SELECT * FROM test_table WHERE 1 = 3 OR 2 = 2" + df = read_sql( + oracle_url, + query, + partition_on="test_int", + partition_range=(1, 2333), + partition_num=3, + ) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([1, 2, 4, 5, 2333], dtype="Int64"), + "TEST_CHAR": pd.Series( + ["str1 ", "str2 ", None, "str05", None], dtype="object" + ), + "TEST_FLOAT": pd.Series([1.1, 2.2, -4.44, None, None], dtype="float64"), + }, + ) + df.sort_values(by="TEST_INT", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_with_partition_and_spja(oracle_url: str) -> None: + query = "select test_table.test_int cid, SUM(test_types.test_num_float) sfloat from test_table, test_types where test_table.test_int=test_types.test_num_int group by test_table.test_int" + df = read_sql(oracle_url, query, partition_on="cid", partition_num=2) + expected = pd.DataFrame( + data={ + "CID": pd.Series([1, 5], dtype="Int64"), + "SFLOAT": pd.Series([2.3, -0.2], dtype="float64"), + }, + ) + df.sort_values(by="CID", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_types(oracle_url: str) -> None: + query = "SELECT * FROM test_types" + df = read_sql(oracle_url, query) + print(df) + expected = pd.DataFrame( + data={ + "TEST_NUM_INT": pd.Series([1, 5, 5, None], dtype="Int64"), + "TEST_INT": 
pd.Series([-10, 22, 22, 100], dtype="Int64"), + "TEST_NUM_FLOAT": pd.Series([2.3, -0.1, -0.1, None], dtype="float64"), + "TEST_FLOAT": pd.Series([2.34, 123.455, 123.455, None], dtype="float64"), + "TEST_BINARY_FLOAT": pd.Series( + [-3.456, 3.1415926535, 3.1415926535, None], dtype="float64" + ), + "TEST_BINARY_DOUBLE": pd.Series( + [9999.99991, -111111.2345, -111111.2345, None], dtype="float64" + ), + "TEST_CHAR": pd.Series(["char1", "char2", "char2", None], dtype="object"), + "TEST_VARCHAR": pd.Series( + ["varchar1", "varchar222", "varchar222", None], dtype="object" + ), + "TEST_NCHAR": pd.Series( + ["y123 ", "aab123", "aab123", None], dtype="object" + ), + "TEST_NVARCHAR": pd.Series( + ["aK>?KJ@#$%", ")>KDS)(F*&%J", ")>KDS)(F*&%J", None], dtype="object" + ), + "TEST_DATE": pd.Series( + ["2019-05-21", "2020-05-21", "2020-05-21", None], dtype="datetime64[ns]" + ), + "TEST_TIMESTAMP": pd.Series( + [ + "2019-05-21 01:02:33", + "2020-05-21 01:02:33", + "2020-05-21 01:02:33", + None, + ], + dtype="datetime64[ns]", + ), + "TEST_TIMESTAMPTZ": pd.Series( + [ + "1999-12-01 11:00:00", + "1899-12-01 11:00:00", + "1899-12-01 11:00:00", + None, + ], + dtype="datetime64[ns]", + ), + "TEST_CLOB": pd.Series( + ["13ab", "13ab", "13ab", None], dtype="object" + ), + "TEST_BLOB": pd.Series( + [ b'9\xaf', b'9\xaf', b'9\xaf', None], dtype="object" + ), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_empty_result(oracle_url: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(oracle_url, query) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([], dtype="Int64"), + "TEST_CHAR": pd.Series([], dtype="object"), + "TEST_FLOAT": pd.Series([], dtype="float64"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_empty_result_on_partition(oracle_url: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(oracle_url, query, partition_on="test_int", partition_num=3) + print(df) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([], dtype="Int64"), + "TEST_CHAR": pd.Series([], dtype="object"), + "TEST_FLOAT": pd.Series([], dtype="float64"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_empty_result_on_some_partition(oracle_url: str) -> None: + query = "SELECT * FROM test_table where test_int < 2" + df = read_sql(oracle_url, query, partition_on="test_int", partition_num=3) + expected = pd.DataFrame( + data={ + "TEST_INT": pd.Series([1], dtype="Int64"), + "TEST_CHAR": pd.Series(["str1 "], dtype="object"), + "TEST_FLOAT": pd.Series([1.1], dtype="float64"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_cte(oracle_url: str) -> None: + query = "with test_cte (test_int, test_str) as (select test_int, test_char from test_table where test_float > 0) select test_int, test_str from test_cte" + df = read_sql(oracle_url, query, partition_on="test_int", partition_num=3) + df.sort_values(by="TEST_INT", inplace=True, ignore_index=True) + expected = pd.DataFrame( + 
index=range(2), + data={ + "TEST_INT": pd.Series([1, 2], dtype="Int64"), + "TEST_STR": pd.Series(["str1 ", "str2 "], dtype="object"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + +@pytest.mark.skipif( + not os.environ.get("ORACLE_URL"), reason="Test oracle only when `ORACLE_URL` is set" +) +def test_oracle_round_function(oracle_url: str) -> None: + query = "SELECT round(v,2) TEST_ROUND FROM test_issue" + df = read_sql(oracle_url, query) + expected = pd.DataFrame( + data={ + "TEST_ROUND": pd.Series([1.11, 2.22, 3.33, None], dtype="float64"), + } + ) + assert_frame_equal(df, expected, check_names=True) \ No newline at end of file diff --git a/connectorx-python/connectorx/tests/test_partition.py b/connectorx-python/connectorx/tests/test_partition.py new file mode 100644 index 0000000..27e5638 --- /dev/null +++ b/connectorx-python/connectorx/tests/test_partition.py @@ -0,0 +1,21 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from .. import partition_sql + + +@pytest.fixture(scope="module") # type: ignore +def postgres_url() -> str: + conn = os.environ["POSTGRES_URL"] + return conn + + +def test_partition_sql(postgres_url: str) -> None: + query = "SELECT * FROM test_table" + queries = partition_sql( + postgres_url, query, partition_on="test_int", partition_num=2 + ) + assert len(queries) == 2 diff --git a/connectorx-python/connectorx/tests/test_polars.py b/connectorx-python/connectorx/tests/test_polars.py new file mode 100644 index 0000000..f748fcb --- /dev/null +++ b/connectorx-python/connectorx/tests/test_polars.py @@ -0,0 +1,38 @@ +import os + +import pandas as pd +import pytest +import polars as pl + +from .. import read_sql + + +@pytest.fixture(scope="module") # type: ignore +def postgres_url() -> str: + conn = os.environ["POSTGRES_URL"] + return conn + + +def test_polars(postgres_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + return_type="polars", + ) + + expected = pl.DataFrame( + { + "test_int": [0, 1, 2, 3, 4, 1314], + "test_nullint": [5, 3, None, 7, 9, 2], + "test_str": ["a", "str1", "str2", "b", "c", None], + "test_float": [3.1, None, 2.2, 3, 7.8, -10], + "test_bool": [None, True, False, False, None, True], + }, + ) + + df = df.sort('test_int') + assert df.frame_equal(expected, null_equal=True) diff --git a/connectorx-python/connectorx/tests/test_postgres.py b/connectorx-python/connectorx/tests/test_postgres.py new file mode 100644 index 0000000..4f636fb --- /dev/null +++ b/connectorx-python/connectorx/tests/test_postgres.py @@ -0,0 +1,1141 @@ +import os + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from ..
import read_sql + + +@pytest.fixture(scope="module") # type: ignore +def postgres_url() -> str: + conn = os.environ["POSTGRES_URL"] + return conn + + +@pytest.fixture(scope="module") # type: ignore +def postgres_url_tls() -> str: + conn = os.environ["POSTGRES_URL_TLS"] + return conn + + +@pytest.fixture(scope="module") # type: ignore +def postgres_rootcert() -> str: + cert = os.environ["POSTGRES_ROOTCERT"] + return cert + + +@pytest.fixture(scope="module") # type: ignore +def postgres_sslcert() -> str: + cert = os.environ["POSTGRES_SSLCERT"] + return cert + + +@pytest.fixture(scope="module") # type: ignore +def postgres_sslkey() -> str: + key = os.environ["POSTGRES_SSLKEY"] + return key + + +@pytest.mark.xfail +def test_on_non_select(postgres_url: str) -> None: + query = "CREATE TABLE non_select(id INTEGER NOT NULL)" + df = read_sql(postgres_url, query) + + +def test_aggregation(postgres_url: str) -> None: + query = "SELECT test_bool, SUM(test_float) FROM test_table GROUP BY test_bool" + df = read_sql(postgres_url, query) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([None, False, True], dtype="boolean"), + "sum": pd.Series([10.9, 5.2, -10.0], dtype="float64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_partition_on_aggregation(postgres_url: str) -> None: + query = ( + "SELECT test_bool, SUM(test_int) AS test_int FROM test_table GROUP BY test_bool" + ) + df = read_sql(postgres_url, query, partition_on="test_int", partition_num=2) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([None, False, True], dtype="boolean"), + "test_int": pd.Series([4, 5, 1315], dtype="Int64"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_aggregation2(postgres_url: str) -> None: + query = "select DISTINCT(test_bool) from test_table" + df = read_sql(postgres_url, query) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([None, False, True], dtype="boolean"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_partition_on_aggregation2(postgres_url: str) -> None: + query = "select MAX(test_int), MIN(test_int) from test_table" + df = read_sql(postgres_url, query, partition_on="max", partition_num=2) + expected = pd.DataFrame( + index=range(1), + data={ + "max": pd.Series([1314], dtype="Int64"), + "min": pd.Series([0], dtype="Int64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_udf(postgres_url: str) -> None: + query = "select increment(test_int) as test_int from test_table ORDER BY test_int" + df = read_sql(postgres_url, query, partition_on="test_int", partition_num=2) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 3, 4, 5, 1315], dtype="Int64"), + }, + ) + df = df.sort_values("test_int").reset_index(drop=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_manual_partition(postgres_url: str) -> None: + + queries = [ + "SELECT * FROM test_table WHERE test_int < 2", + "SELECT * FROM test_table WHERE test_int >= 2", + ] + + df = read_sql(postgres_url, query=queries) + + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], 
dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_without_partition(postgres_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql(postgres_url, query) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 0, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([3, None, 5, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["str1", "str2", "a", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([None, 2.2, 3.1, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [True, False, None, False, None, True], dtype="boolean" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_limit(postgres_url: str) -> None: + query = "SELECT * FROM test_table limit 3" + df = read_sql( + postgres_url, + query, + ) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int": pd.Series([0, 1, 2], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None], dtype="Int64"), + "test_str": pd.Series(["a", "str1", "str2"], dtype="object"), + "test_float": pd.Series([3.1, None, 2.2], dtype="float64"), + "test_bool": pd.Series([None, True, False], dtype="boolean"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_limit_large(postgres_url: str) -> None: + query = "SELECT * FROM test_table limit 10" + df = read_sql( + postgres_url, + query, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_limit_with_partition(postgres_url: str) -> None: + query = "SELECT * FROM test_table limit 3" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int": pd.Series([0, 1, 2], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None], dtype="Int64"), + "test_str": pd.Series(["a", "str1", "str2"], dtype="object"), + "test_float": pd.Series([3.1, None, 2.2], dtype="float64"), + "test_bool": pd.Series([None, True, False], dtype="boolean"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_limit_large_with_partition(postgres_url: str) -> None: + query = "SELECT * FROM test_table limit 10" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + 
[None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_with_partition(postgres_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_with_partition_without_partition_range(postgres_url: str) -> None: + query = "SELECT * FROM test_table where test_float > 3" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_num=3, + ) + + expected = pd.DataFrame( + index=range(2), + data={ + "test_int": pd.Series([0, 4], dtype="Int64"), + "test_nullint": pd.Series([5, 9], dtype="Int64"), + "test_str": pd.Series(["a", "c"], dtype="object"), + "test_float": pd.Series([3.1, 7.8], dtype="float64"), + "test_bool": pd.Series([None, None], dtype="boolean"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_with_partition_and_selection(postgres_url: str) -> None: + query = "SELECT * FROM test_table WHERE 1 = 3 OR 2 = 2" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_with_partition_and_projection(postgres_url: str) -> None: + query = "SELECT test_int, test_nullint, test_str FROM test_table" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_with_partition_and_join(postgres_url: str) -> None: + query = "SELECT T.test_int, T.test_bool, S.test_language FROM test_table T INNER JOIN test_str S ON T.test_int = S.id" + df = read_sql( + postgres_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(5), + data={ + "test_int": pd.Series([0, 1, 2, 
3, 4], dtype="Int64"), + "test_bool": pd.Series([None, True, False, False, None], dtype="boolean"), + "test_language": pd.Series( + ["English", "中文", "日本語", "русский", "Emoji"], dtype="object" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_with_partition_and_spja(postgres_url: str) -> None: + query = "select test_bool, AVG(test_float) as avg, SUM(test_int) as sum from test_table as a, test_str as b where a.test_int = b.id AND test_nullint is not NULL GROUP BY test_bool ORDER BY sum" + df = read_sql(postgres_url, query, partition_on="sum", partition_num=2) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([True, False, None], dtype="boolean"), + "avg": pd.Series([None, 3, 5.45], dtype="float64"), + "sum": pd.Series([1, 3, 4], dtype="Int64"), + }, + ) + df.sort_values(by="sum", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_on_utf8(postgres_url: str) -> None: + query = "SELECT * FROM test_str" + df = read_sql(postgres_url, query) + expected = pd.DataFrame( + index=range(9), + data={ + "id": pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype="Int64"), + "test_language": pd.Series( + [ + "English", + "中文", + "日本語", + "русский", + "Emoji", + "Latin1", + "Extra", + "Mixed", + "", + ], + dtype="object", + ), + "test_hello": pd.Series( + [ + "Hello", + "你好", + "こんにちは", + "Здра́вствуйте", + "😁😂😜", + "¥§¤®ð", + "y̆", + "Ha好ち😁ðy̆", + None, + ], + dtype="object", + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_with_index_col(postgres_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql(postgres_url, query, index_col="test_int") + expected = pd.DataFrame( + data={ + "test_int": pd.Series([1, 2, 0, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([3, None, 5, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["str1", "str2", "a", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([None, 2.2, 3.1, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [True, False, None, False, None, True], dtype="boolean" + ), + }, + ) + expected.set_index("test_int", inplace=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_types_binary(postgres_url: str) -> None: + query = "SELECT test_date, test_timestamp, test_timestamptz, test_int16, test_int64, test_float32, test_numeric, test_bpchar, test_char, test_varchar, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum, test_f4array, test_f8array, test_narray, test_boolarray, test_i2array, test_i4array, test_i8array, test_citext, test_ltree, test_lquery, test_ltxtquery FROM test_types" + df = read_sql(postgres_url, query) + expected = pd.DataFrame( + index=range(4), + data={ + "test_date": pd.Series( + ["1970-01-01", "2000-02-28", "2038-01-18", None], dtype="datetime64[ns]" + ), + "test_timestamp": pd.Series( + [ + "1970-01-01 00:00:01", + "2000-02-28 12:00:10", + "2038-01-18 23:59:59", + None, + ], + dtype="datetime64[ns]", + ), + "test_timestamptz": pd.Series( + [ + "1970-01-01 00:00:01", + "2000-02-28 16:00:10", + "2038-01-18 15:59:59", + None, + ], + dtype="datetime64[ns]", + ), + "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"), + "test_int64": pd.Series( + [-9223372036854775808, 0, 9223372036854775807, None], dtype="Int64" + ), + "test_float32": pd.Series( + [None, 3.1415926535, 2.71, -1e-37], dtype="float64" + ), + "test_numeric": 
pd.Series([None, 521.34, 0.00, 0.00], dtype="float64"), + "test_bpchar": pd.Series(["a ", "bb ", "ccc ", None], dtype="object"), + "test_char": pd.Series(["a", "b", None, "d"], dtype="object"), + "test_varchar": pd.Series([None, "bb", "c", "defghijklm"], dtype="object"), + "test_uuid": pd.Series( + [ + "86b494cc-96b2-11eb-9298-3e22fbb9fe9d", + "86b49b84-96b2-11eb-9298-3e22fbb9fe9d", + "86b49c42-96b2-11eb-9298-3e22fbb9fe9d", + None, + ], + dtype="object", + ), + "test_time": pd.Series( + ["08:12:40", None, "23:00:10", "18:30:00"], dtype="object" + ), + "test_json": pd.Series( + [ + '{"customer":"John Doe","items":{"product":"Beer","qty":6}}', + '{"customer":"Lily Bush","items":{"product":"Diaper","qty":24}}', + '{"customer":"Josh William","items":{"product":"Toy Car","qty":1}}', + None, + ], + dtype="object", + ), + "test_jsonb": pd.Series( + [ + '{"product":"Beer","qty":6}', + '{"product":"Diaper","qty":24}', + '{"product":"Toy Car","qty":1}', + None, + ], + dtype="object", + ), + "test_bytea": pd.Series( + [ + None, + b"\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xcc\x81\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5", + b"", + b"\xf0\x9f\x98\x9c", + ], + dtype="object", + ), + "test_enum": pd.Series( + ["happy", "very happy", "ecstatic", None], dtype="object" + ), + "test_f4array": pd.Series( + [[], None, [123.123], [-1e-37, 1e37]], dtype="object" + ), + "test_f8array": pd.Series( + [[], None, [-1e-307, 1e308], [0.000234, -12.987654321]], dtype="object" + ), + "test_narray": pd.Series( + [[], None, [521.34], [0.12, 333.33, 22.22]], dtype="object" + ), + "test_boolarray": pd.Series( + [[True, False], [], [True], None], dtype="object" + ), + "test_i2array": pd.Series( + [[-1, 0, 1], [], [-32768, 32767], None], dtype="object" + ), + "test_i4array": pd.Series( + [[-1, 0, 1123], [], [-2147483648, 2147483647], None], dtype="object" + ), + "test_i8array": pd.Series( + [[-9223372036854775808, 9223372036854775807], [], [0], None], + dtype="object", + ), + "test_citext": pd.Series(["str_citext", "", "s", None], dtype="object"), + "test_ltree": pd.Series(["A.B.C.D", "A.B.E", "A", None], dtype="object"), + "test_lquery": pd.Series(["*.B.*", "A.*", "*", None], dtype="object"), + "test_ltxtquery": pd.Series( + ["A & B*", "A | B", "A@", None], dtype="object" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_types_csv(postgres_url: str) -> None: + query = "SELECT test_date, test_timestamp, test_timestamptz, test_int16, test_int64, test_float32, test_numeric, test_bpchar, test_char, test_varchar, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text, test_f4array, test_f8array, test_narray, test_boolarray, test_i2array, test_i4array, test_i8array, test_citext, test_ltree FROM test_types" + df = read_sql(postgres_url, query, protocol="csv") + expected = pd.DataFrame( + index=range(4), + data={ + "test_date": pd.Series( + ["1970-01-01", "2000-02-28", "2038-01-18", None], dtype="datetime64[ns]" + ), + "test_timestamp": pd.Series( + [ + "1970-01-01 00:00:01", + "2000-02-28 12:00:10", + "2038-01-18 23:59:59", + None, + ], + dtype="datetime64[ns]", + ), + "test_timestamptz": pd.Series( + [ + "1970-01-01 00:00:01", + "2000-02-28 16:00:10", + "2038-01-18 15:59:59", + None, + ], + dtype="datetime64[ns]", + ), + "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"), + "test_int64": pd.Series( + [-9223372036854775808, 0, 9223372036854775807, None], dtype="Int64" + ), + "test_float32": pd.Series( + [None, 3.1415926535, 2.71, -1e-37], dtype="float64" + 
), + "test_numeric": pd.Series([None, 521.34, 0.00, 0.00], dtype="float64"), + "test_bpchar": pd.Series(["a ", "bb ", "ccc ", None], dtype="object"), + "test_char": pd.Series(["a", "b", None, "d"], dtype="object"), + "test_varchar": pd.Series([None, "bb", "c", "defghijklm"], dtype="object"), + "test_uuid": pd.Series( + [ + "86b494cc-96b2-11eb-9298-3e22fbb9fe9d", + "86b49b84-96b2-11eb-9298-3e22fbb9fe9d", + "86b49c42-96b2-11eb-9298-3e22fbb9fe9d", + None, + ], + dtype="object", + ), + "test_time": pd.Series( + ["08:12:40", None, "23:00:10", "18:30:00"], dtype="object" + ), + "test_json": pd.Series( + [ + '{"customer":"John Doe","items":{"product":"Beer","qty":6}}', + '{"customer":"Lily Bush","items":{"product":"Diaper","qty":24}}', + '{"customer":"Josh William","items":{"product":"Toy Car","qty":1}}', + None, + ], + dtype="object", + ), + "test_jsonb": pd.Series( + [ + '{"product":"Beer","qty":6}', + '{"product":"Diaper","qty":24}', + '{"product":"Toy Car","qty":1}', + None, + ], + dtype="object", + ), + "test_bytea": pd.Series( + [ + None, + b"\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xcc\x81\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5", + b"", + b"\xf0\x9f\x98\x9c", + ], + dtype="object", + ), + "test_enum": pd.Series( + ["happy", "very happy", "ecstatic", None], dtype="object" + ), + "test_f4array": pd.Series( + [[], None, [123.123], [-1e-37, 1e37]], dtype="object" + ), + "test_f8array": pd.Series( + [[], None, [1e-307, 1e308], [0.000234, -12.987654321]], dtype="object" + ), + "test_narray": pd.Series( + [[], None, [521.34], [0.12, 333.33, 22.22]], dtype="object" + ), + "test_boolarray": pd.Series( + [[True, False], [], [True], None], dtype="object" + ), + "test_i2array": pd.Series( + [[-1, 0, 1], [], [-32768, 32767], None], dtype="object" + ), + "test_i4array": pd.Series( + [[-1, 0, 1123], [], [-2147483648, 2147483647], None], dtype="object" + ), + "test_i8array": pd.Series( + [[-9223372036854775808, 9223372036854775807], [], [0], None], + dtype="object", + ), + "test_citext": pd.Series(["str_citext", None, "s", None], dtype="object"), + "test_ltree": pd.Series(["A.B.C.D", "A.B.E", "A", None], dtype="object"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_types_cursor(postgres_url: str) -> None: + query = "SELECT test_date, test_timestamp, test_timestamptz, test_int16, test_int64, test_float32, test_numeric, test_bpchar, test_char, test_varchar, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text, test_f4array, test_f8array, test_narray, test_boolarray, test_i2array, test_i4array, test_i8array, test_citext, test_ltree FROM test_types" + df = read_sql(postgres_url, query, protocol="cursor") + expected = pd.DataFrame( + index=range(4), + data={ + "test_date": pd.Series( + ["1970-01-01", "2000-02-28", "2038-01-18", None], dtype="datetime64[ns]" + ), + "test_timestamp": pd.Series( + [ + "1970-01-01 00:00:01", + "2000-02-28 12:00:10", + "2038-01-18 23:59:59", + None, + ], + dtype="datetime64[ns]", + ), + "test_timestamptz": pd.Series( + [ + "1970-01-01 00:00:01", + "2000-02-28 16:00:10", + "2038-01-18 15:59:59", + None, + ], + dtype="datetime64[ns]", + ), + "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"), + "test_int64": pd.Series( + [-9223372036854775808, 0, 9223372036854775807, None], dtype="Int64" + ), + "test_float32": pd.Series( + [None, 3.1415926535, 2.71, -1e-37], dtype="float64" + ), + "test_numeric": pd.Series([None, 521.34, 0.00, 0.00], dtype="float64"), + "test_bpchar": pd.Series(["a ", "bb ", "ccc ", None], 
dtype="object"), + "test_char": pd.Series(["a", "b", None, "d"], dtype="object"), + "test_varchar": pd.Series([None, "bb", "c", "defghijklm"], dtype="object"), + "test_uuid": pd.Series( + [ + "86b494cc-96b2-11eb-9298-3e22fbb9fe9d", + "86b49b84-96b2-11eb-9298-3e22fbb9fe9d", + "86b49c42-96b2-11eb-9298-3e22fbb9fe9d", + None, + ], + dtype="object", + ), + "test_time": pd.Series( + ["08:12:40", None, "23:00:10", "18:30:00"], dtype="object" + ), + "test_json": pd.Series( + [ + '{"customer":"John Doe","items":{"product":"Beer","qty":6}}', + '{"customer":"Lily Bush","items":{"product":"Diaper","qty":24}}', + '{"customer":"Josh William","items":{"product":"Toy Car","qty":1}}', + None, + ], + dtype="object", + ), + "test_jsonb": pd.Series( + [ + '{"product":"Beer","qty":6}', + '{"product":"Diaper","qty":24}', + '{"product":"Toy Car","qty":1}', + None, + ], + dtype="object", + ), + "test_bytea": pd.Series( + [ + None, + b"\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xcc\x81\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5", + b"", + b"\xf0\x9f\x98\x9c", + ], + dtype="object", + ), + "test_enum": pd.Series( + ["happy", "very happy", "ecstatic", None], dtype="object" + ), + "test_f4array": pd.Series( + [[], None, [123.123], [-1e-37, 1e37]], dtype="object" + ), + "test_f8array": pd.Series( + [[], None, [1e-307, 1e308], [0.000234, -12.987654321]], dtype="object" + ), + "test_narray": pd.Series( + [[], None, [521.34], [0.12, 333.33, 22.22]], dtype="object" + ), + "test_boolarray": pd.Series( + [[True, False], [], [True], None], dtype="object" + ), + "test_i2array": pd.Series( + [[-1, 0, 1], [], [-32768, 32767], None], dtype="object" + ), + "test_i4array": pd.Series( + [[-1, 0, 1123], [], [-2147483648, 2147483647], None], dtype="object" + ), + "test_i8array": pd.Series( + [[-9223372036854775808, 9223372036854775807], [], [0], None], + dtype="object", + ), + "test_citext": pd.Series(["str_citext", "", "s", None], dtype="object"), + "test_ltree": pd.Series(["A.B.C.D", "A.B.E", "A", None], dtype="object"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_postgres_types_simple(postgres_url: str) -> None: + query = "SELECT test_date, test_timestamp, test_timestamptz, test_int16, test_int64, test_float32, test_numeric, test_bpchar, test_char, test_varchar, test_uuid, test_time, test_bytea, test_enum, test_f4array, test_f8array, test_narray, test_boolarray, test_i2array, test_i4array, test_i8array FROM test_types" + df = read_sql(postgres_url, query, protocol="simple") + expected = pd.DataFrame( + index=range(4), + data={ + "test_date": pd.Series( + ["1970-01-01", "2000-02-28", "2038-01-18", None], dtype="datetime64[ns]" + ), + "test_timestamp": pd.Series( + [ + "1970-01-01 00:00:01", + "2000-02-28 12:00:10", + "2038-01-18 23:59:59", + None, + ], + dtype="datetime64[ns]", + ), + "test_timestamptz": pd.Series( + [ + "1970-01-01 00:00:01", + "2000-02-28 16:00:10", + "2038-01-18 15:59:59", + None, + ], + dtype="datetime64[ns]", + ), + "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"), + "test_int64": pd.Series( + [-9223372036854775808, 0, 9223372036854775807, None], dtype="Int64" + ), + "test_float32": pd.Series( + [None, 3.1415926535, 2.71, -1e-37], dtype="float64" + ), + "test_numeric": pd.Series([None, 521.34, 0.00, 0.00], dtype="float64"), + "test_bpchar": pd.Series(["a ", "bb ", "ccc ", None], dtype="object"), + "test_char": pd.Series(["a", "b", None, "d"], dtype="object"), + "test_varchar": pd.Series([None, "bb", "c", "defghijklm"], dtype="object"), + "test_uuid": pd.Series( + 
[ + "86b494cc-96b2-11eb-9298-3e22fbb9fe9d", + "86b49b84-96b2-11eb-9298-3e22fbb9fe9d", + "86b49c42-96b2-11eb-9298-3e22fbb9fe9d", + None, + ], + dtype="object", + ), + "test_time": pd.Series( + ["08:12:40", None, "23:00:10", "18:30:00"], dtype="object" + ), + "test_bytea": pd.Series( + [ + None, + b"\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xcc\x81\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5", + b"", + b"\xf0\x9f\x98\x9c", + ], + dtype="object", + ), + "test_enum": pd.Series( + ["happy", "very happy", "ecstatic", None], dtype="object" + ), + "test_f4array": pd.Series( + [[], None, [123.123], [-1e-37, 1e37]], dtype="object" + ), + "test_f8array": pd.Series( + [[], None, [1e-307, 1e308], [0.000234, -12.987654321]], dtype="object" + ), + "test_narray": pd.Series( + [[], None, [521.34], [0.12, 333.33, 22.22]], dtype="object" + ), + "test_boolarray": pd.Series( + [[True, False], [], [True], None], dtype="object" + ), + "test_i2array": pd.Series( + [[-1, 0, 1], [], [-32768, 32767], None], dtype="object" + ), + "test_i4array": pd.Series( + [[-1, 0, 1123], [], [-2147483648, 2147483647], None], dtype="object" + ), + "test_i8array": pd.Series( + [[-9223372036854775808, 9223372036854775807], [], [0], None], + dtype="object", + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result(postgres_url: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(postgres_url, query) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([], dtype="Int64"), + "test_nullint": pd.Series([], dtype="Int64"), + "test_str": pd.Series([], dtype="object"), + "test_float": pd.Series([], dtype="float64"), + "test_bool": pd.Series([], dtype="boolean"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result_on_partition(postgres_url: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(postgres_url, query, partition_on="test_int", partition_num=3) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([], dtype="Int64"), + "test_nullint": pd.Series([], dtype="Int64"), + "test_str": pd.Series([], dtype="object"), + "test_float": pd.Series([], dtype="float64"), + "test_bool": pd.Series([], dtype="boolean"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result_on_some_partition(postgres_url: str) -> None: + query = "SELECT * FROM test_table where test_int < 1" + df = read_sql(postgres_url, query, partition_on="test_int", partition_num=3) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([0], dtype="Int64"), + "test_nullint": pd.Series([5], dtype="Int64"), + "test_str": pd.Series(["a"], dtype="object"), + "test_float": pd.Series([3.1], dtype="float64"), + "test_bool": pd.Series([None], dtype="boolean"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_posix_regex(postgres_url: str) -> None: + query = "select test_int, case when test_str ~* 'str.*' then 'convert_str' end as converted_str from test_table" + df = read_sql(postgres_url, query) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([1, 2, 0, 3, 4, 1314], dtype="Int64"), + "converted_str": pd.Series( + ["convert_str", "convert_str", None, None, None, None], dtype="object" + ), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_json(postgres_url: str) -> None: + query = "select test_json->>'customer' as customer from test_types" + df = read_sql(postgres_url, query) + expected = pd.DataFrame( + data={ + "customer": 
pd.Series( + ["John Doe", "Lily Bush", "Josh William", None], dtype="object" + ), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_partition_on_json(postgres_url: str) -> None: + query = "select test_int16, test_jsonb->>'qty' as qty from test_types" + df = read_sql(postgres_url, query, partition_on="test_int16", partition_num=3) + expected = pd.DataFrame( + data={ + "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"), + "qty": pd.Series(["6", "24", "1", None], dtype="object"), + } + ) + df.sort_values(by="test_int16", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_cte(postgres_url: str) -> None: + query = "with test_cte (test_int, test_str) as (select test_int, test_str from test_table where test_float > 0) select test_int, test_str from test_cte" + df = read_sql(postgres_url, query, partition_on="test_int", partition_num=3) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(4), + data={ + "test_int": pd.Series([0, 2, 3, 4], dtype="Int64"), + "test_str": pd.Series(["a", "str2", "b", "c"], dtype="object"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("POSTGRES_URL_TLS"), + reason="Do not test Postgres TLS unless `POSTGRES_URL_TLS` is set", +) +def test_postgres_tls(postgres_url_tls: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + f"{postgres_url_tls}?sslmode=require", + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_partition_on_decimal(postgres_url: str) -> None: + # partition column can not have None + query = "SELECT * FROM test_table where test_int<>1" + df = read_sql(postgres_url, query, partition_on="test_float", partition_num=3) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([0, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series(["a", "str2", "b", "c", None], dtype="object"), + "test_float": pd.Series([3.1, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series([None, False, False, None, True], dtype="boolean"), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("POSTGRES_URL_TLS"), + reason="Do not test Postgres TLS unless `POSTGRES_URL_TLS` is set", +) +def test_postgres_tls_with_cert(postgres_url_tls: str, postgres_rootcert: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + f"{postgres_url_tls}?sslmode=require&sslrootcert={postgres_rootcert}", + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( 
+ ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("POSTGRES_URL_TLS"), + reason="Do not test Postgres TLS unless `POSTGRES_URL_TLS` is set", +) +def test_postgres_tls_client_auth( + postgres_url_tls: str, + postgres_rootcert: str, + postgres_sslcert: str, + postgres_sslkey: str, +) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + f"{postgres_url_tls}?sslmode=require&sslrootcert={postgres_rootcert}&sslcert={postgres_sslcert}&sslkey={postgres_sslkey}", + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("POSTGRES_URL_TLS"), + reason="Do not test Postgres TLS unless `POSTGRES_URL_TLS` is set", +) +def test_postgres_tls_disable(postgres_url_tls: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + f"{postgres_url_tls}?sslmode=disable", + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif( + not os.environ.get("POSTGRES_URL_TLS"), + reason="Do not test Postgres TLS unless `POSTGRES_URL_TLS` is set", +) +@pytest.mark.xfail +def test_postgres_tls_fail(postgres_url_tls: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + f"{postgres_url_tls}?sslmode=require&sslrootcert=fake.cert", + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + ) + +def test_postgres_name_type(postgres_url: str) -> None: + # partition column can not have None + query = "SELECT test_name FROM test_types" + df = read_sql(postgres_url, query) + expected = pd.DataFrame( + data={ + "test_name": pd.Series(["0", "21", "someName", "101203203-1212323-22131235"]), + }, + ) + assert_frame_equal(df, expected, check_names=True) \ No newline at end of file diff --git a/connectorx-python/connectorx/tests/test_redshift.py b/connectorx-python/connectorx/tests/test_redshift.py new file mode 100644 index 0000000..7c17f41 --- /dev/null +++ b/connectorx-python/connectorx/tests/test_redshift.py @@ -0,0 +1,136 @@ +import os + +import numpy as np +import pandas as pd +import pytest 
+from pandas.testing import assert_frame_equal + +from .. import read_sql + + +@pytest.fixture(scope="module") # type: ignore +def redshift_url() -> str: + conn = os.environ["REDSHIFT_URL"] + return conn + + +@pytest.mark.skipif(not os.environ.get("REDSHIFT_URL"), reason="Do not test Redshift unless `REDSHIFT_URL` is set") +def test_redshift_without_partition(redshift_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql(redshift_url, query, protocol="cursor") + # result from redshift might have different order each time + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif(not os.environ.get("REDSHIFT_URL"), reason="Do not test Redshift unless `REDSHIFT_URL` is set") +def test_redshift_with_partition(redshift_url: str) -> None: + query = "SELECT * FROM test_table" + df = read_sql( + redshift_url, + query, + partition_on="test_int", + partition_range=(0, 2000), + partition_num=3, + protocol="cursor" + ) + # result from redshift might have different order each time + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["a", "str1", "str2", "b", "c", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif(not os.environ.get("REDSHIFT_URL"), reason="Do not test Redshift unless `REDSHIFT_URL` is set") +def test_redshift_types(redshift_url: str) -> None: + query = "SELECT test_int16, test_char, test_time, test_datetime FROM test_types" + df = read_sql(redshift_url, query, protocol="cursor") + # result from redshift might have different order each time + df.sort_values(by="test_int16", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(4), + data={ + "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"), + "test_char": pd.Series(["a", "b", "c", "d"], dtype="object"), + "test_time": pd.Series( + ["08:12:40", "10:03:00", "23:00:10", "18:30:00"], dtype="object" + ), + "test_datetime": pd.Series( + [ + np.datetime64("2007-01-01T10:00:19"), + np.datetime64("2005-01-01T22:03:00"), + None, + np.datetime64("1987-01-01T11:00:00"), + ], dtype="datetime64[ns]" + ), + + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +@pytest.mark.skipif(not os.environ.get("REDSHIFT_URL"), reason="Do not test Redshift unless `REDSHIFT_URL` is set") +def test_read_sql_on_utf8(redshift_url: str) -> None: + query = "SELECT * FROM test_str" + df = read_sql(redshift_url, query, protocol="cursor") + # result from redshift might have different order each time + df.sort_values(by="id", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(8), + data={ + "id": pd.Series([0, 1, 2, 3, 4, 5, 6, 7], 
dtype="Int64"), + "test_language": pd.Series( + [ + "English", + "中文", + "日本語", + "русский", + "Emoji", + "Latin1", + "Extra", + "Mixed", + ], + dtype="object", + ), + "test_hello": pd.Series( + [ + "Hello", + "你好", + "こんにちは", + "Здра́вствуйте", + "😁😂😜", + "¥§¤®ð", + "y̆", + "Ha好ち😁ðy̆", + ], + dtype="object", + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) diff --git a/connectorx-python/connectorx/tests/test_sqlite.py b/connectorx-python/connectorx/tests/test_sqlite.py new file mode 100644 index 0000000..f0e8a99 --- /dev/null +++ b/connectorx-python/connectorx/tests/test_sqlite.py @@ -0,0 +1,393 @@ +import os + +import numpy as np +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from .. import read_sql + + +@pytest.fixture(scope="module") # type: ignore +def sqlite_db() -> str: + conn = os.environ["SQLITE_URL"] + return conn + + +def test_sqlite_without_partition(sqlite_db: str) -> None: + query = "SELECT test_int, test_nullint, test_str, test_float, test_bool, test_date, test_time, test_datetime FROM test_table" + df = read_sql(sqlite_db, query) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 0, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([3, None, 5, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["str1", "str2", "こんにちは", "b", "Ha好ち😁ðy̆", None], dtype="object" + ), + "test_float": pd.Series([None, 2.2, 3.1, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [True, False, None, False, None, True], dtype="boolean" + ), + "test_date": pd.Series( + [ + np.datetime64("1996-03-13"), + np.datetime64("1996-01-30"), + np.datetime64("1996-02-28"), + np.datetime64("2020-01-12"), + np.datetime64("1996-04-20"), + None, + ], + dtype="datetime64[ns]", + ), + "test_time": pd.Series( + [ + "08:12:40", + "10:03:00", + "23:00:10", + "23:00:10", + "18:30:00", + "18:30:00", + ], + dtype="object", + ), + "test_datetime": pd.Series( + [ + np.datetime64("2007-01-01T10:00:19"), + np.datetime64("2005-01-01T22:03:00"), + None, + np.datetime64("1987-01-01T11:00:00"), + None, + np.datetime64("2007-10-01T10:32:00"), + ], + dtype="datetime64[ns]", + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_sqlite_limit_without_partition(sqlite_db: str) -> None: + query = "SELECT test_int, test_nullint, test_str, test_float, test_bool, test_date, test_time, test_datetime FROM test_table limit 3" + df = read_sql(sqlite_db, query) + expected = pd.DataFrame( + index=range(3), + data={ + "test_int": pd.Series([1, 2, 0], dtype="Int64"), + "test_nullint": pd.Series([3, None, 5], dtype="Int64"), + "test_str": pd.Series(["str1", "str2", "こんにちは"], dtype="object"), + "test_float": pd.Series([None, 2.2, 3.1], dtype="float64"), + "test_bool": pd.Series([True, False, None], dtype="boolean"), + "test_date": pd.Series( + [ + np.datetime64("1996-03-13"), + np.datetime64("1996-01-30"), + np.datetime64("1996-02-28"), + ], + dtype="datetime64[ns]", + ), + "test_time": pd.Series( + ["08:12:40", "10:03:00", "23:00:10"], dtype="object" + ), + "test_datetime": pd.Series( + [ + np.datetime64("2007-01-01T10:00:19"), + np.datetime64("2005-01-01T22:03:00"), + None, + ], + dtype="datetime64[ns]", + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_sqlite_limit_large_without_partition(sqlite_db: str) -> None: + query = "SELECT test_int, test_nullint, test_str, test_float, test_bool, test_date, test_time, test_datetime FROM test_table limit 10" + df = read_sql(sqlite_db, query) + 
expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([1, 2, 0, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([3, None, 5, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["str1", "str2", "こんにちは", "b", "Ha好ち😁ðy̆", None], dtype="object" + ), + "test_float": pd.Series([None, 2.2, 3.1, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [True, False, None, False, None, True], dtype="boolean" + ), + "test_date": pd.Series( + [ + np.datetime64("1996-03-13"), + np.datetime64("1996-01-30"), + np.datetime64("1996-02-28"), + np.datetime64("2020-01-12"), + np.datetime64("1996-04-20"), + None, + ], + dtype="datetime64[ns]", + ), + "test_time": pd.Series( + [ + "08:12:40", + "10:03:00", + "23:00:10", + "23:00:10", + "18:30:00", + "18:30:00", + ], + dtype="object", + ), + "test_datetime": pd.Series( + [ + np.datetime64("2007-01-01T10:00:19"), + np.datetime64("2005-01-01T22:03:00"), + None, + np.datetime64("1987-01-01T11:00:00"), + None, + np.datetime64("2007-10-01T10:32:00"), + ], + dtype="datetime64[ns]", + ), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_sqlite_with_partition(sqlite_db: str) -> None: + query = "SELECT test_int, test_nullint, test_str, test_float, test_bool, test_date, test_time, test_datetime FROM test_table" + df = read_sql( + sqlite_db, + query, + partition_on="test_int", + partition_num=3, + ) + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["こんにちは", "str1", "str2", "b", "Ha好ち😁ðy̆", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + "test_date": pd.Series( + [ + np.datetime64("1996-02-28"), + np.datetime64("1996-03-13"), + np.datetime64("1996-01-30"), + np.datetime64("2020-01-12"), + np.datetime64("1996-04-20"), + None, + ], + dtype="datetime64[ns]", + ), + "test_time": pd.Series( + [ + "23:00:10", + "08:12:40", + "10:03:00", + "23:00:10", + "18:30:00", + "18:30:00", + ], + dtype="object", + ), + "test_datetime": pd.Series( + [ + None, + np.datetime64("2007-01-01T10:00:19"), + np.datetime64("2005-01-01T22:03:00"), + np.datetime64("1987-01-01T11:00:00"), + None, + np.datetime64("2007-10-01T10:32:00"), + ], + dtype="datetime64[ns]", + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_manual_partition(sqlite_db: str) -> None: + + queries = [ + "SELECT test_int, test_nullint, test_str, test_float, test_bool, test_date, test_time, test_datetime FROM test_table WHERE test_int < 2", + "SELECT test_int, test_nullint, test_str, test_float, test_bool, test_date, test_time, test_datetime FROM test_table WHERE test_int >= 2", + ] + + df = read_sql(sqlite_db, query=queries) + + expected = pd.DataFrame( + index=range(6), + data={ + "test_int": pd.Series([0, 1, 2, 3, 4, 1314], dtype="Int64"), + "test_nullint": pd.Series([5, 3, None, 7, 9, 2], dtype="Int64"), + "test_str": pd.Series( + ["こんにちは", "str1", "str2", "b", "Ha好ち😁ðy̆", None], dtype="object" + ), + "test_float": pd.Series([3.1, None, 2.2, 3, 7.8, -10], dtype="float64"), + "test_bool": pd.Series( + [None, True, False, False, None, True], dtype="boolean" + ), + "test_date": pd.Series( + [ + np.datetime64("1996-02-28"), + np.datetime64("1996-03-13"), + 
np.datetime64("1996-01-30"), + np.datetime64("2020-01-12"), + np.datetime64("1996-04-20"), + None, + ], + dtype="datetime64[ns]", + ), + "test_time": pd.Series( + [ + "23:00:10", + "08:12:40", + "10:03:00", + "23:00:10", + "18:30:00", + "18:30:00", + ], + dtype="object", + ), + "test_datetime": pd.Series( + [ + None, + np.datetime64("2007-01-01T10:00:19"), + np.datetime64("2005-01-01T22:03:00"), + np.datetime64("1987-01-01T11:00:00"), + None, + np.datetime64("2007-10-01T10:32:00"), + ], + dtype="datetime64[ns]", + ), + }, + ) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_sqlite_without_partition_and_spa(sqlite_db: str) -> None: + query = """ + SELECT test_bool, AVG(test_float) AS avg, SUM(test_int) AS sum + FROM test_table + WHERE test_nullint IS NOT NULL + GROUP BY test_bool + ORDER BY sum + """ + df = read_sql(sqlite_db, query) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([False, None, True], dtype="boolean"), + "avg": pd.Series([3.00, 5.45, -10.00], dtype="float64"), + "sum": pd.Series([3, 4, 1315], dtype="Int64"), + }, + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_sqlite_with_partition_and_spa(sqlite_db: str) -> None: + query = """ + SELECT test_bool, AVG(test_float) AS avg, SUM(test_int) AS sum + FROM test_table + WHERE test_nullint IS NOT NULL + GROUP BY test_bool + ORDER BY sum + """ + df = read_sql(sqlite_db, query, partition_on="sum", partition_num=2) + expected = pd.DataFrame( + index=range(3), + data={ + "test_bool": pd.Series([False, None, True], dtype="boolean"), + "avg": pd.Series([3.00, 5.45, -10.00], dtype="float64"), + "sum": pd.Series([3, 4, 1315], dtype="Int64"), + }, + ) + df = df.sort_values("sum").reset_index(drop=True) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result(sqlite_db: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(sqlite_db, query) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([], dtype="object"), + "test_nullint": pd.Series([], dtype="object"), + "test_str": pd.Series([], dtype="object"), + "test_float": pd.Series([], dtype="object"), + "test_bool": pd.Series([], dtype="object"), + "test_date": pd.Series([], dtype="object"), + "test_time": pd.Series([], dtype="object"), + "test_datetime": pd.Series([], dtype="object"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result_on_partition(sqlite_db: str) -> None: + query = "SELECT * FROM test_table where test_int < -100" + df = read_sql(sqlite_db, query, partition_on="test_int", partition_num=3) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([], dtype="object"), + "test_nullint": pd.Series([], dtype="object"), + "test_str": pd.Series([], dtype="object"), + "test_float": pd.Series([], dtype="object"), + "test_bool": pd.Series([], dtype="object"), + "test_date": pd.Series([], dtype="object"), + "test_time": pd.Series([], dtype="object"), + "test_datetime": pd.Series([], dtype="object"), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_empty_result_on_some_partition(sqlite_db: str) -> None: + query = "SELECT * FROM test_table where test_int < 1" + df = read_sql(sqlite_db, query, partition_on="test_int", partition_num=3) + expected = pd.DataFrame( + data={ + "test_int": pd.Series([0], dtype="Int64"), + "test_nullint": pd.Series([5], dtype="Int64"), + "test_str": pd.Series(["こんにちは"], dtype="object"), + "test_float": 
pd.Series([3.1], dtype="float"), + "test_bool": pd.Series([None], dtype="boolean"), + "test_date": pd.Series( + [ + np.datetime64("1996-02-28"), + ], + dtype="datetime64[ns]", + ), + "test_time": pd.Series(["23:00:10"], dtype="object"), + "test_datetime": pd.Series( + [ + None, + ], + dtype="datetime64[ns]", + ), + } + ) + assert_frame_equal(df, expected, check_names=True) + + +def test_sqlite_cte(sqlite_db: str) -> None: + query = "with test_cte (test_int, test_str) as (select test_int, test_str from test_table where test_float > 0) select test_int, test_str from test_cte" + df = read_sql(sqlite_db, query, partition_on="test_int", partition_num=3) + df.sort_values(by="test_int", inplace=True, ignore_index=True) + expected = pd.DataFrame( + index=range(4), + data={ + "test_int": pd.Series([0, 2, 3, 4], dtype="Int64"), + "test_str": pd.Series(["こんにちは", "str2", "b", "Ha好ち😁ðy̆"], dtype="object"), + }, + ) + assert_frame_equal(df, expected, check_names=True) diff --git a/connectorx-python/examples/flame_tpch.rs b/connectorx-python/examples/flame_tpch.rs new file mode 100644 index 0000000..926df9a --- /dev/null +++ b/connectorx-python/examples/flame_tpch.rs @@ -0,0 +1,25 @@ +mod tpch; + +use pprof::protos::Message; +use std::env; +use std::fs::File; +use std::io::Write; + +fn main() { + let args: Vec = env::args().collect(); + let guard = pprof::ProfilerGuard::new(10).unwrap(); + + tpch::run(10, &args[1]); + + if let Ok(report) = guard.report().build() { + let file = File::create("flamegraph.svg").unwrap(); + report.flamegraph(file).unwrap(); + + let mut file = File::create("profile.pb").unwrap(); + let profile = report.pprof().unwrap(); + + let mut content = Vec::new(); + profile.encode(&mut content).unwrap(); + file.write_all(&content).unwrap(); + }; +} diff --git a/connectorx-python/examples/tpch.rs b/connectorx-python/examples/tpch.rs new file mode 100644 index 0000000..69c9afa --- /dev/null +++ b/connectorx-python/examples/tpch.rs @@ -0,0 +1,28 @@ +use connectorx_python::read_sql::{read_sql, PartitionQuery}; +use pyo3::Python; +use std::env; + +const QUERY: &'static str = r#" +SELECT + * +FROM LINEITEM"#; + +pub fn run(nq: usize, conn: &str) { + let conn = env::var(conn).unwrap(); + + Python::with_gil(|py| { + read_sql( + py, + &conn, + "pandas", + None, + None, + Some(PartitionQuery::new(QUERY, "L_ORDERKEY", None, None, nq)), + ) + .unwrap(); + }); +} + +fn main() { + run(1, "POSTGRES_URL"); +} diff --git a/connectorx-python/poetry.lock b/connectorx-python/poetry.lock new file mode 100644 index 0000000..6922983 --- /dev/null +++ b/connectorx-python/poetry.lock @@ -0,0 +1,1229 @@ +[[package]] +name = "appnope" +version = "0.1.3" +description = "Disable App Nap on macOS >= 10.9" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "atomicwrites" +version = "1.4.1" +description = "Atomic file writes." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "22.2.0" +description = "Classes Without Boilerplate" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +cov = ["attrs", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] +dev = ["attrs"] +docs = ["furo", "sphinx", "myst-parser", "zope.interface", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier"] +tests = ["attrs", "zope.interface"] +tests-no-zope = ["hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist", "cloudpickle", "mypy (>=0.971,<0.990)", "pytest-mypy-plugins"] +tests_no_zope = ["hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist", "cloudpickle", "mypy (>=0.971,<0.990)", "pytest-mypy-plugins"] + +[[package]] +name = "backcall" +version = "0.2.0" +description = "Specifications for callback functions passed in to an API" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "black" +version = "21.12b0" +description = "The uncompromising code formatter." +category = "dev" +optional = false +python-versions = ">=3.6.2" + +[package.dependencies] +click = ">=7.1.2" +mypy-extensions = ">=0.4.3" +pathspec = ">=0.9.0,<1" +platformdirs = ">=2" +tomli = ">=0.2.6,<2.0.0" +typing-extensions = [ + {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}, + {version = "!=3.10.0.1", markers = "python_version >= \"3.10\""}, +] + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +python2 = ["typed-ast (>=1.4.3)"] +uvloop = ["uvloop (>=0.15.2)"] + +[[package]] +name = "bleach" +version = "5.0.1" +description = "An easy safelist-based HTML-sanitizing tool." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +six = ">=1.9.0" +webencodings = "*" + +[package.extras] +css = ["tinycss2 (>=1.1.0,<1.2)"] +dev = ["build (==0.8.0)", "flake8 (==4.0.1)", "hashin (==0.17.0)", "pip-tools (==6.6.2)", "pytest (==7.1.2)", "Sphinx (==4.3.2)", "tox (==3.25.0)", "twine (==4.0.1)", "wheel (==0.37.1)", "black (==22.3.0)", "mypy (==0.961)"] + +[[package]] +name = "certifi" +version = "2022.12.7" +description = "Python package for providing Mozilla's CA Bundle." +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "cffi" +version = "1.15.1" +description = "Foreign Function Interface for Python calling C code." +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "charset-normalizer" +version = "2.1.1" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "dev" +optional = false +python-versions = ">=3.6.0" + +[package.extras] +unicode_backport = ["unicodedata2"] + +[[package]] +name = "click" +version = "8.1.3" +description = "Composable command line interface toolkit" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "cloudpickle" +version = "2.2.0" +description = "Extended pickling support for Python objects" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." 
+category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" + +[[package]] +name = "contexttimer" +version = "0.3.3" +description = "A timer context manager measuring the clock wall time of the code block it contains." +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "cryptography" +version = "38.0.4" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cffi = ">=1.12" + +[package.extras] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] +pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] +sdist = ["setuptools-rust (>=0.11.4)"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] + +[[package]] +name = "dask" +version = "2021.12.0" +description = "Parallel PyData with Task Scheduling" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +cloudpickle = ">=1.1.1" +fsspec = ">=0.6.0" +numpy = {version = ">=1.18", optional = true, markers = "extra == \"dataframe\""} +packaging = ">=20.0" +pandas = {version = ">=1.0", optional = true, markers = "extra == \"dataframe\""} +partd = ">=0.3.10" +pyyaml = "*" +toolz = ">=0.8.2" + +[package.extras] +array = ["numpy (>=1.18)"] +complete = ["bokeh (>=2.1.1)", "distributed (==2021.12.0)", "jinja2", "numpy (>=1.18)", "pandas (>=1.0)"] +dataframe = ["numpy (>=1.18)", "pandas (>=1.0)"] +diagnostics = ["bokeh (>=2.1.1)", "jinja2"] +distributed = ["distributed (==2021.12.0)"] +test = ["pytest", "pytest-rerunfailures", "pytest-xdist", "pre-commit"] + +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +category = "dev" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "distributed" +version = "2021.12.0" +description = "Distributed scheduler for Dask" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +click = ">=6.6" +cloudpickle = ">=1.5.0" +dask = "2021.12.0" +jinja2 = "*" +msgpack = ">=0.6.0" +psutil = ">=5.0" +pyyaml = "*" +sortedcontainers = "<2.0.0 || >2.0.0,<2.0.1 || >2.0.1" +tblib = ">=1.6.0" +toolz = ">=0.8.2" +tornado = {version = ">=6.0.3", markers = "python_version >= \"3.8\""} +zict = ">=0.1.3" + +[[package]] +name = "docopt" +version = "0.6.2" +description = "Pythonic argument parser, that will make you smile" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "docutils" +version = "0.19" +description = "Docutils -- Python Documentation Utilities" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "fsspec" +version = "2022.11.0" +description = "File-system specification" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +arrow = ["pyarrow (>=1)"] +dask = ["dask", "distributed"] +dropbox = ["dropboxdrivefs", "requests", "dropbox"] +entrypoints = ["importlib-metadata"] +fuse = ["fusepy"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +gui = ["panel"] +hdfs = ["pyarrow (>=1)"] +http = ["requests", "aiohttp (!=4.0.0a0,!=4.0.0a1)"] +libarchive = 
["libarchive-c"] +oci = ["ocifs"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] +tqdm = ["tqdm"] + +[[package]] +name = "heapdict" +version = "1.0.1" +description = "a heap with decrease-key and increase-key operations" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "dev" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "importlib-metadata" +version = "5.2.0" +description = "Read metadata from Python packages" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"] +perf = ["ipython"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8", "importlib-resources (>=1.3)"] + +[[package]] +name = "importlib-resources" +version = "5.10.1" +description = "Read resources from Python packages" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} + +[package.extras] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"] + +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "ipython" +version = "7.34.0" +description = "IPython: Productive Interactive Computing" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +appnope = {version = "*", markers = "sys_platform == \"darwin\""} +backcall = "*" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +decorator = "*" +jedi = ">=0.16" +matplotlib-inline = "*" +pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""} +pickleshare = "*" +prompt-toolkit = ">=2.0.0,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.1.0" +pygments = "*" +traitlets = ">=4.2" + +[package.extras] +all = ["Sphinx (>=1.3)", "ipykernel", "ipyparallel", "ipywidgets", "nbconvert", "nbformat", "nose (>=0.10.1)", "notebook", "numpy (>=1.17)", "pygments", "qtconsole", "requests", "testpath"] +doc = ["Sphinx (>=1.3)"] +kernel = ["ipykernel"] +nbconvert = ["nbconvert"] +nbformat = ["nbformat"] +notebook = ["notebook", "ipywidgets"] +parallel = ["ipyparallel"] +qtconsole = ["qtconsole"] +test = ["nose (>=0.10.1)", "requests", "testpath", "pygments", "nbformat", "ipykernel", "numpy (>=1.17)"] + +[[package]] +name = "jaraco.classes" +version = "3.2.3" +description = "Utility functions for Python class constructs" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +more-itertools = "*" + +[package.extras] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-black (>=0.3.7)", 
"pytest-mypy (>=0.9.1)"] + +[[package]] +name = "jedi" +version = "0.18.2" +description = "An autocompletion tool for Python that can be used for text editors." +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +parso = ">=0.8.0,<0.9.0" + +[package.extras] +docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx-rtd-theme (==0.4.3)", "sphinx (==1.8.5)", "sphinxcontrib-serializinghtml (==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] +qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] +testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] + +[[package]] +name = "jeepney" +version = "0.8.0" +description = "Low-level, pure Python DBus protocol wrapper." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pytest", "pytest-trio", "pytest-asyncio (>=0.17)", "testpath", "trio", "async-timeout"] +trio = ["trio", "async-generator"] + +[[package]] +name = "jinja2" +version = "3.1.2" +description = "A very fast and expressive template engine." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "keyring" +version = "23.13.1" +description = "Store and access your passwords safely." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +importlib-metadata = {version = ">=4.11.4", markers = "python_version < \"3.12\""} +importlib-resources = {version = "*", markers = "python_version < \"3.9\""} +"jaraco.classes" = "*" +jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""} +pywin32-ctypes = {version = ">=0.2.0", markers = "sys_platform == \"win32\""} +SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""} + +[package.extras] +completion = ["shtab"] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"] + +[[package]] +name = "locket" +version = "1.0.0" +description = "File-based locks for Python on Linux and Windows" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "markupsafe" +version = "2.1.1" +description = "Safely add untrusted strings to HTML/XML markup." 
+category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "matplotlib-inline" +version = "0.1.6" +description = "Inline Matplotlib backend for Jupyter" +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +traitlets = "*" + +[[package]] +name = "maturin" +version = "1.2.3" +description = "Build and publish crates with pyo3, rust-cpython and cffi bindings as well as rust binaries as python packages" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} + +[package.extras] +zig = ["ziglang (>=0.10.0,<0.11.0)"] +patchelf = ["patchelf"] + +[[package]] +name = "modin" +version = "0.18.0" +description = "Modin: Make your pandas code run faster by changing one line of code." +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +dask = {version = ">=2.22.0", optional = true, markers = "extra == \"dask\""} +distributed = {version = ">=2.22.0", optional = true, markers = "extra == \"dask\""} +fsspec = "*" +numpy = ">=1.18.5" +packaging = "*" +pandas = "1.5.2" +psutil = "*" + +[package.extras] +all = ["dask (>=2.22.0)", "distributed (>=2.22.0)", "ray[default] (>=1.13.0)", "pyarrow", "unidist[mpi] (>=0.2.1)", "rpyc (==4.1.5)", "cloudpickle", "boto3", "modin-spreadsheet (>=0.1.0)"] +dask = ["dask (>=2.22.0)", "distributed (>=2.22.0)"] +ray = ["ray[default] (>=1.13.0)", "pyarrow"] +remote = ["rpyc (==4.1.5)", "cloudpickle", "boto3"] +spreadsheet = ["modin-spreadsheet (>=0.1.0)"] +sql = ["dfsql (>=0.4.2)", "pyparsing (<=2.4.7)"] +unidist = ["unidist[mpi] (>=0.2.1)"] + +[[package]] +name = "more-itertools" +version = "9.0.0" +description = "More routines for operating on iterables, beyond itertools" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "msgpack" +version = "1.0.4" +description = "MessagePack serializer" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "mypy-extensions" +version = "0.4.3" +description = "Experimental type system extensions for programs checked with the mypy typechecker." 
+category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "numpy" +version = "1.24.1" +description = "Fundamental package for array computing in Python" +category = "main" +optional = false +python-versions = ">=3.8" + +[[package]] +name = "packaging" +version = "22.0" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "pandas" +version = "1.5.2" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = [ + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, +] +python-dateutil = ">=2.8.1" +pytz = ">=2020.1" + +[package.extras] +test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] + +[[package]] +name = "parso" +version = "0.8.3" +description = "A Python Parser" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] +testing = ["docopt", "pytest (<6.0.0)"] + +[[package]] +name = "partd" +version = "1.3.0" +description = "Appendable key-value storage" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +locket = "*" +toolz = "*" + +[package.extras] +complete = ["numpy (>=1.9.0)", "pandas (>=0.19.0)", "pyzmq", "blosc"] + +[[package]] +name = "pathspec" +version = "0.10.3" +description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "pexpect" +version = "4.8.0" +description = "Pexpect allows easy control of interactive console applications." +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +ptyprocess = ">=0.5" + +[[package]] +name = "pickleshare" +version = "0.7.5" +description = "Tiny 'shelve'-like database with concurrency support" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "pkginfo" +version = "1.9.2" +description = "Query metadatdata from sdists / bdists / installed packages." +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +testing = ["pytest", "pytest-cov"] + +[[package]] +name = "platformdirs" +version = "2.6.2" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx-autodoc-typehints (>=1.19.5)", "sphinx (>=5.3)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.2.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "pytest (>=7.2)"] + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "polars" +version = "0.15.8" +description = "Blazingly fast DataFrame library" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing_extensions = {version = ">=4.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +matplotlib = ["matplotlib"] +numpy = ["numpy (>=1.16.0)"] +pandas = ["pyarrow (>=4.0.0)", "pandas"] +fsspec = ["fsspec"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +pyarrow = ["pyarrow (>=4.0.0)"] +deltalake = ["deltalake"] +timezone = ["backports.zoneinfo", "tzdata"] +connectorx = ["connectorx"] +all = ["polars"] + +[[package]] +name = "prompt-toolkit" +version = "3.0.36" +description = "Library for building powerful interactive command lines in Python" +category = "dev" +optional = false +python-versions = ">=3.6.2" + +[package.dependencies] +wcwidth = "*" + +[[package]] +name = "psutil" +version = "5.9.4" +description = "Cross-platform lib for process and system monitoring in Python." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +test = ["ipaddress", "mock", "enum34", "pywin32", "wmi"] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "py-cpuinfo" +version = "9.0.0" +description = "Get CPU info with pure Python" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "pyarrow" +version = "10.0.1" +description = "Python library for Apache Arrow" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +numpy = ">=1.16.6" + +[[package]] +name = "pycparser" +version = "2.21" +description = "C parser in Python" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pygments" +version = "2.13.0" +description = "Pygments is a syntax highlighting package written in Python." 
+category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +plugins = ["importlib-metadata"] + +[[package]] +name = "pytest" +version = "6.2.5" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + +[[package]] +name = "pytest-benchmark" +version = "3.4.1" +description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.dependencies] +py-cpuinfo = "*" +pytest = ">=3.8" + +[package.extras] +aspect = ["aspectlib"] +elasticsearch = ["elasticsearch"] +histogram = ["pygal", "pygaljs"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2022.7" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "pywin32-ctypes" +version = "0.2.0" +description = "" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "pyyaml" +version = "6.0.1" +description = "YAML parser and emitter for Python" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "readme-renderer" +version = "37.3" +description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +bleach = ">=2.1.0" +docutils = ">=0.13.1" +Pygments = ">=2.5.1" + +[package.extras] +md = ["cmarkgfm (>=0.8.0)"] + +[[package]] +name = "requests" +version = "2.28.1" +description = "Python HTTP for Humans." 
+category = "dev" +optional = false +python-versions = ">=3.7, <4" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<3" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "requests-toolbelt" +version = "0.10.1" +description = "A utility belt for advanced users of python-requests" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +requests = ">=2.0.1,<3.0.0" + +[[package]] +name = "rfc3986" +version = "2.0.0" +description = "Validating URI References per RFC 3986" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +idna2008 = ["idna"] + +[[package]] +name = "secretstorage" +version = "3.3.3" +description = "Python bindings to FreeDesktop.org Secret Service API" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cryptography = ">=2.0" +jeepney = ">=0.6" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "tblib" +version = "1.7.0" +description = "Traceback serialization library." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "tomli" +version = "1.2.3" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "toolz" +version = "0.12.0" +description = "List processing tools and functional utilities" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "tornado" +version = "6.2" +description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
+category = "dev" +optional = false +python-versions = ">= 3.7" + +[[package]] +name = "tqdm" +version = "4.64.1" +description = "Fast, Extensible Progress Meter" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "traitlets" +version = "5.8.0" +description = "Traitlets Python configuration system" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] +test = ["argcomplete (>=2.0)", "pre-commit", "pytest", "pytest-mock"] + +[[package]] +name = "twine" +version = "3.8.0" +description = "Collection of utilities for publishing packages on PyPI" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +colorama = ">=0.4.3" +importlib-metadata = ">=3.6" +keyring = ">=15.1" +pkginfo = ">=1.8.1" +readme-renderer = ">=21.0" +requests = ">=2.20" +requests-toolbelt = ">=0.8.0,<0.9.0 || >0.9.0" +rfc3986 = ">=1.4.0" +tqdm = ">=4.14" +urllib3 = ">=1.26.0" + +[[package]] +name = "typing-extensions" +version = "4.4.0" +description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "urllib3" +version = "1.26.13" +description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" + +[package.extras] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[[package]] +name = "wcwidth" +version = "0.2.5" +description = "Measures the displayed width of unicode strings in a terminal" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "zict" +version = "2.2.0" +description = "Mutable mapping tools" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +heapdict = "*" + +[[package]] +name = "zipp" +version = "3.11.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", "jaraco.functools", "more-itertools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"] + +[extras] +all = ["dask", "pandas", "modin", "polars", "pyarrow"] +dask = ["dask", "pandas"] +modin = ["modin", "pandas"] +pandas = ["pandas"] +polars = ["pyarrow", "polars"] +pyarrow = ["pyarrow"] + +[metadata] +lock-version = "1.1" +python-versions = ">=3.8,<3.12" +content-hash = "88ecb2d7561052dfa32e9969765fb5972cd5f4c8eccea8b280debeda70ed7639" + +[metadata.files] +appnope = [] +atomicwrites = [] 
+attrs = [] +backcall = [ + {file = "backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"}, + {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, +] +black = [ + {file = "black-21.12b0-py3-none-any.whl", hash = "sha256:a615e69ae185e08fdd73e4715e260e2479c861b5740057fde6e8b4e3b7dd589f"}, + {file = "black-21.12b0.tar.gz", hash = "sha256:77b80f693a569e2e527958459634f18df9b0ba2625ba4e0c2d5da5be42e6f2b3"}, +] +bleach = [] +certifi = [] +cffi = [] +charset-normalizer = [] +click = [] +cloudpickle = [] +colorama = [] +contexttimer = [ + {file = "contexttimer-0.3.3.tar.gz", hash = "sha256:35a1efd389af3f1ca509f33ff23e17d98b66c8fde5ba2a4eb8a8b7fa456598a5"}, +] +cryptography = [] +dask = [ + {file = "dask-2021.12.0-py3-none-any.whl", hash = "sha256:47041fe1874e64c395e7be772173999e50b5c61a577084158083b9ef4b4175b2"}, + {file = "dask-2021.12.0.tar.gz", hash = "sha256:90614c9d162713e4849532c86f2854e8d53468521285413403b6c496344c0109"}, +] +decorator = [] +distributed = [ + {file = "distributed-2021.12.0-py3-none-any.whl", hash = "sha256:ea8cdb56ecbf1f999c4e28a5c848ce231cb90d6919e42c13e89ceb0d86366d41"}, + {file = "distributed-2021.12.0.tar.gz", hash = "sha256:c6119a2cf1fb2d8ac60337915bb9a790af6530afcb5d7a809a3308323b874714"}, +] +docopt = [ + {file = "docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"}, +] +docutils = [] +fsspec = [] +heapdict = [ + {file = "HeapDict-1.0.1-py3-none-any.whl", hash = "sha256:6065f90933ab1bb7e50db403b90cab653c853690c5992e69294c2de2b253fc92"}, + {file = "HeapDict-1.0.1.tar.gz", hash = "sha256:8495f57b3e03d8e46d5f1b2cc62ca881aca392fd5cc048dc0aa2e1a6d23ecdb6"}, +] +idna = [] +importlib-metadata = [] +importlib-resources = [] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +ipython = [] +"jaraco.classes" = [] +jedi = [] +jeepney = [] +jinja2 = [] +keyring = [] +locket = [] +markupsafe = [] +matplotlib-inline = [] +maturin = [] +modin = [] +more-itertools = [] +msgpack = [] +mypy-extensions = [ + {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, + {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, +] +numpy = [] +packaging = [] +pandas = [] +parso = [] +partd = [] +pathspec = [] +pexpect = [ + {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, + {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, +] +pickleshare = [ + {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, + {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, +] +pkginfo = [] +platformdirs = [] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] +polars = [] +prompt-toolkit = 
[] +psutil = [] +ptyprocess = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] +py = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] +py-cpuinfo = [] +pyarrow = [] +pycparser = [] +pygments = [] +pytest = [ + {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, + {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, +] +pytest-benchmark = [ + {file = "pytest-benchmark-3.4.1.tar.gz", hash = "sha256:40e263f912de5a81d891619032983557d62a3d85843f9a9f30b98baea0cd7b47"}, + {file = "pytest_benchmark-3.4.1-py2.py3-none-any.whl", hash = "sha256:36d2b08c4882f6f997fd3126a3d6dfd70f3249cde178ed8bbc0b73db7c20f809"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] +pytz = [] +pywin32-ctypes = [ + {file = "pywin32-ctypes-0.2.0.tar.gz", hash = "sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942"}, + {file = "pywin32_ctypes-0.2.0-py2.py3-none-any.whl", hash = "sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98"}, +] +pyyaml = [] +readme-renderer = [] +requests = [] +requests-toolbelt = [] +rfc3986 = [] +secretstorage = [] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +sortedcontainers = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] +tblib = [ + {file = "tblib-1.7.0-py2.py3-none-any.whl", hash = "sha256:289fa7359e580950e7d9743eab36b0691f0310fce64dee7d9c31065b8f723e23"}, + {file = "tblib-1.7.0.tar.gz", hash = "sha256:059bd77306ea7b419d4f76016aef6d7027cc8a0785579b5aad198803435f882c"}, +] +toml = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] +tomli = [ + {file = "tomli-1.2.3-py3-none-any.whl", hash = "sha256:e3069e4be3ead9668e21cb9b074cd948f7b3113fd9c8bba083f48247aab8b11c"}, + {file = "tomli-1.2.3.tar.gz", hash = "sha256:05b6166bff487dc068d322585c7ea4ef78deed501cc124060e0f238e89a9231f"}, +] +toolz = [] +tornado = [] +tqdm = [] +traitlets = [] +twine = [] +typing-extensions = [] +urllib3 = [] +wcwidth = [ + {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, + {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, +] +webencodings = [ + {file = 
"webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] +zict = [] +zipp = [] diff --git a/connectorx-python/pyproject.toml b/connectorx-python/pyproject.toml new file mode 100644 index 0000000..aebc2f8 --- /dev/null +++ b/connectorx-python/pyproject.toml @@ -0,0 +1,81 @@ +[tool.poetry] +authors = ["SFU Database System Lab "] +classifiers = [ + "Development Status :: 4 - Beta", + "Topic :: Software Development :: Build Tools", + "Environment :: Console", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Intended Audience :: Financial and Insurance Industry", + "Intended Audience :: Healthcare Industry", + "Topic :: Scientific/Engineering", + "Framework :: IPython", +] +description = "Load data from databases to dataframes, the fastest way." +keywords = ["read_sql"] +license = "MIT" +maintainers = ["Weiyuan Wu "] +name = "connectorx" +readme = "README.md" # Markdown files are supported +version = "0.3.3-alpha.1" + +[project] +name = "connectorx" # Target file name of maturin build +readme = "README.md" +license = { text = "MIT" } +requires-python = ">=3.8" + +[tool.poetry.dependencies] +dask = {version = "^2021", optional = true, extras = ["dataframe"]} +modin = {version = ">=0.10", optional = true} +numpy = ">=1.21.5" +pandas = {version = "^1", optional = true} +polars = {version = ">=0.8", optional = true} +pyarrow = {version = ">=4", optional = true} +python = ">=3.8,<3.12" + +[tool.poetry.extras] +all = ["dask", "pandas", "modin", "polars", "pyarrow"] +dask = ["dask", "pandas"] +modin = ["modin", "pandas"] +pandas = ["pandas"] +polars = ["pyarrow", "polars"] +pyarrow = ["pyarrow"] + +[tool.poetry.dev-dependencies] +black = "^21.4b0" +contexttimer = "^0.3.3" +dask = {extras = ["dataframe"], version = "^2021.7.0"} +docopt = "^0.6.2" +ipython = "^7.31.1" +maturin = ">=1.0,<2.0" +modin = {extras = ["dask"], version = ">=0.10.1"} +polars = ">=0.8" +pyarrow = ">=4" +pytest = "^6.2" +pytest-benchmark = "^3.4.1" +twine = "^3.4.1" + +[tool.pytest.ini_options] +minversion = "6.0" +python_functions = "test_* bench_*" +# python_files = check_*.py +# python_classes = Check +# addopts = "-ra -q" +# testpaths = [ +# "tests", +# "integration", +# ] + +[build-system] +build-backend = "maturin" +requires = ["maturin>=1.0,<2.0"] + +[tool.maturin] +include = [ + { path = "connectorx/*.so", format = "sdist"}, + { path = "connectorx/*.pyd", format = "sdist"}, + { path = "connectorx/dependencies/", format = "sdist"}, + { path = "LICENSE", format = "sdist"}, +] diff --git a/connectorx-python/src/arrow.rs b/connectorx-python/src/arrow.rs new file mode 100644 index 0000000..6521a2f --- /dev/null +++ b/connectorx-python/src/arrow.rs @@ -0,0 +1,57 @@ +use crate::errors::ConnectorXPythonError; +use arrow::record_batch::RecordBatch; +use connectorx::source_router::SourceConn; +use connectorx::{prelude::*, sql::CXQuery}; +use fehler::throws; +use libc::uintptr_t; +use pyo3::prelude::*; +use pyo3::{PyAny, Python}; +use std::convert::TryFrom; +use std::sync::Arc; + +#[throws(ConnectorXPythonError)] +pub fn write_arrow<'a>( + py: Python<'a>, + source_conn: &SourceConn, + origin_query: Option, + queries: &[CXQuery], +) -> &'a PyAny { + let destination = get_arrow(source_conn, origin_query, queries)?; + let rbs = destination.arrow()?; + let ptrs = 
to_ptrs(rbs); + let obj: PyObject = ptrs.into_py(py); + obj.into_ref(py) +} + +pub fn to_ptrs(rbs: Vec<RecordBatch>) -> (Vec<String>, Vec<Vec<(uintptr_t, uintptr_t)>>) { + if rbs.is_empty() { + return (vec![], vec![]); + } + + let mut result = vec![]; + let names = rbs[0] + .schema() + .fields() + .iter() + .map(|f| f.name().clone()) + .collect(); + + for rb in rbs.into_iter() { + let mut cols = vec![]; + + for array in rb.columns().into_iter() { + let data = array.to_data(); + let array_ptr = Arc::new(arrow::ffi::FFI_ArrowArray::new(&data)); + let schema_ptr = Arc::new( + arrow::ffi::FFI_ArrowSchema::try_from(data.data_type()).expect("export schema c"), + ); + cols.push(( + Arc::into_raw(array_ptr) as uintptr_t, + Arc::into_raw(schema_ptr) as uintptr_t, + )); + } + + result.push(cols); + } + (names, result) +} diff --git a/connectorx-python/src/arrow2.rs b/connectorx-python/src/arrow2.rs new file mode 100644 index 0000000..573c7c8 --- /dev/null +++ b/connectorx-python/src/arrow2.rs @@ -0,0 +1,57 @@ +use crate::errors::ConnectorXPythonError; +use arrow2::{ + array::Array, + chunk::Chunk, + datatypes::{Field, Schema}, + ffi, +}; +use connectorx::source_router::SourceConn; +use connectorx::{prelude::*, sql::CXQuery}; +use fehler::throws; +use libc::uintptr_t; +use pyo3::prelude::*; +use pyo3::{PyAny, Python}; +use std::sync::Arc; + +#[throws(ConnectorXPythonError)] +pub fn write_arrow<'a>( + py: Python<'a>, + source_conn: &SourceConn, + origin_query: Option<String>, + queries: &[CXQuery<String>], +) -> &'a PyAny { + let destination = get_arrow2(source_conn, origin_query, queries)?; + let (rbs, schema) = destination.arrow()?; + let ptrs = to_ptrs(rbs, schema); + let obj: PyObject = ptrs.into_py(py); + obj.into_ref(py) +} + +fn to_ptrs( + rbs: Vec<Chunk<Box<dyn Array>>>, + schema: Arc<Schema>, +) -> (Vec<String>, Vec<Vec<(uintptr_t, uintptr_t)>>) { + if rbs.is_empty() { + return (vec![], vec![]); + } + + let mut result = vec![]; + let names = schema.fields.iter().map(|f| f.name.clone()).collect(); + + for rb in rbs.into_iter() { + let mut cols = vec![]; + + for array in rb.into_arrays() { + let schema_ptr = + ffi::export_field_to_c(&Field::new("", array.data_type().clone(), true)); + let array_ptr = ffi::export_array_to_c(array); + let array_ptr = Box::into_raw(Box::new(array_ptr)); + let schema_ptr = Box::into_raw(Box::new(schema_ptr)); + + cols.push((array_ptr as uintptr_t, schema_ptr as uintptr_t)); + } + + result.push(cols); + } + (names, result) +} diff --git a/connectorx-python/src/constants.rs b/connectorx-python/src/constants.rs new file mode 100644 index 0000000..7e2cc84 --- /dev/null +++ b/connectorx-python/src/constants.rs @@ -0,0 +1,7 @@ +// PyString buffer size in MB +pub const PYSTRING_BUFFER_SIZE: usize = 4; + +#[cfg(not(debug_assertions))] +pub const J4RS_BASE_PATH: &str = "./target/release"; +#[cfg(debug_assertions)] +pub const J4RS_BASE_PATH: &str = "./target/debug"; diff --git a/connectorx-python/src/errors.rs b/connectorx-python/src/errors.rs new file mode 100644 index 0000000..a8754ef --- /dev/null +++ b/connectorx-python/src/errors.rs @@ -0,0 +1,66 @@ +use pyo3::exceptions::PyRuntimeError; +use pyo3::PyErr; +use thiserror::Error; + +#[allow(unused)] +pub type Result<T> = std::result::Result<T, ConnectorXPythonError>; + +/// Errors that can be raised from this library. +#[derive(Error, Debug)] +pub enum ConnectorXPythonError { + /// The required type does not match the schema defined.
+ #[error("Unknown pandas data type: {0}.")] + UnknownPandasType(String), + + #[error("Python: {0}.")] + PythonError(String), + + #[error(transparent)] + NdArrayShapeError(#[from] ndarray::ShapeError), + + #[error(transparent)] + ConnectorXError(#[from] connectorx::errors::ConnectorXError), + + #[error(transparent)] + ConnectorXOutError(#[from] connectorx::errors::ConnectorXOutError), + + #[error(transparent)] + MsSQLSourceError(#[from] connectorx::sources::mssql::MsSQLSourceError), + + #[error(transparent)] + PostgresSourceError(#[from] connectorx::sources::postgres::PostgresSourceError), + + #[error(transparent)] + MySQLSourceError(#[from] connectorx::sources::mysql::MySQLSourceError), + + #[error(transparent)] + SQLiteSourceError(#[from] connectorx::sources::sqlite::SQLiteSourceError), + + #[error(transparent)] + OracleSourceError(#[from] connectorx::sources::oracle::OracleSourceError), + + #[error(transparent)] + BigQuerySourceError(#[from] connectorx::sources::bigquery::BigQuerySourceError), + + #[error(transparent)] + ArrowDestinationError(#[from] connectorx::destinations::arrow::ArrowDestinationError), + + #[error(transparent)] + Arrow2DestinationError(#[from] connectorx::destinations::arrow2::Arrow2DestinationError), + + /// Any other errors that are too trivial to be put here explicitly. + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for PyErr { + fn from(e: ConnectorXPythonError) -> PyErr { + PyRuntimeError::new_err(format!("{}", e)) + } +} + +impl From for ConnectorXPythonError { + fn from(e: PyErr) -> ConnectorXPythonError { + ConnectorXPythonError::PythonError(format!("{}", e)) + } +} diff --git a/connectorx-python/src/lib.rs b/connectorx-python/src/lib.rs new file mode 100644 index 0000000..5487275 --- /dev/null +++ b/connectorx-python/src/lib.rs @@ -0,0 +1,96 @@ +pub mod arrow; +pub mod arrow2; +pub mod constants; +mod errors; +pub mod pandas; +pub mod read_sql; + +use crate::constants::J4RS_BASE_PATH; +use connectorx::fed_dispatcher::run; +use pyo3::exceptions::PyRuntimeError; +use pyo3::prelude::*; +use pyo3::{wrap_pyfunction, PyResult}; +use std::collections::HashMap; +use std::env; +use std::sync::Once; + +#[macro_use] +extern crate lazy_static; + +static START: Once = Once::new(); + +// https://github.com/PyO3/pyo3-built/issues/21 +// #[allow(dead_code)] +// mod build { +// include!(concat!(env!("OUT_DIR"), "/built.rs")); +// } + +#[pymodule] +fn connectorx(_: Python, m: &PyModule) -> PyResult<()> { + START.call_once(|| { + let _ = env_logger::try_init(); + }); + + m.add_wrapped(wrap_pyfunction!(read_sql))?; + m.add_wrapped(wrap_pyfunction!(read_sql2))?; + m.add_wrapped(wrap_pyfunction!(partition_sql))?; + m.add_wrapped(wrap_pyfunction!(get_meta))?; + m.add_class::()?; + Ok(()) +} + +#[pyfunction] +pub fn read_sql<'a>( + py: Python<'a>, + conn: &str, + return_type: &str, + protocol: Option<&str>, + queries: Option>, + partition_query: Option, +) -> PyResult<&'a PyAny> { + read_sql::read_sql(py, conn, return_type, protocol, queries, partition_query) +} + +#[pyfunction] +pub fn partition_sql( + conn: &str, + partition_query: read_sql::PyPartitionQuery, +) -> PyResult> { + let source_conn = connectorx::source_router::parse_source(conn, None) + .map_err(|e| crate::errors::ConnectorXPythonError::from(e))?; + let queries = connectorx::partition::partition(&partition_query.into(), &source_conn) + .map_err(|e| crate::errors::ConnectorXPythonError::from(e))?; + Ok(queries.into_iter().map(|q| q.to_string()).collect()) +} + +#[pyfunction] +pub fn 
read_sql2<'a>( + py: Python<'a>, + sql: &str, + db_map: HashMap<String, String>, +) -> PyResult<&'a PyAny> { + let rbs = run( + sql.to_string(), + db_map, + Some( + env::var("J4RS_BASE_PATH") + .unwrap_or(J4RS_BASE_PATH.to_string()) + .as_str(), + ), + ) + .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?; + let ptrs = arrow::to_ptrs(rbs); + let obj: PyObject = ptrs.into_py(py); + Ok(obj.into_ref(py)) +} + +#[pyfunction] +pub fn get_meta<'a>( + py: Python<'a>, + conn: &str, + protocol: Option<&str>, + query: String, +) -> PyResult<&'a PyAny> { + pandas::get_meta::get_meta(py, conn, protocol.unwrap_or("binary"), query) + .map_err(|e| From::from(e)) +} diff --git a/connectorx-python/src/pandas/destination.rs b/connectorx-python/src/pandas/destination.rs new file mode 100644 index 0000000..82426f3 --- /dev/null +++ b/connectorx-python/src/pandas/destination.rs @@ -0,0 +1,407 @@ +use super::{ + pandas_columns::{ + ArrayBlock, BooleanBlock, BytesBlock, DateTimeBlock, Float64Block, HasPandasColumn, + Int64Block, PandasColumn, PandasColumnObject, PyBytes, StringBlock, + }, + pystring::PyString, + typesystem::{PandasArrayType, PandasBlockType, PandasTypeSystem}, +}; +use crate::errors::{ConnectorXPythonError, Result}; +use anyhow::anyhow; +use connectorx::prelude::*; +use fehler::{throw, throws}; +use itertools::Itertools; +use numpy::{PyArray1, PyArray2}; +use pyo3::{ + prelude::{pyclass, pymethods, PyResult}, + types::{IntoPyDict, PyList, PyTuple}, + FromPyObject, IntoPy, PyAny, PyObject, Python, +}; +use std::{ + collections::HashMap, + mem::transmute, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; + +#[pyclass] +pub struct PandasBlockInfo { + dt: PandasBlockType, + #[pyo3(get, set)] + cids: Vec<usize>, // column ids +} + +#[pymethods] +impl PandasBlockInfo { + #[getter] + fn dt(&self) -> PyResult<u32> { + Ok(PandasArrayType::from(self.dt) as u32) + } +} + +pub struct PandasDestination<'py> { + py: Python<'py>, + nrow: usize, + schema: Vec<PandasTypeSystem>, + names: Vec<String>, + block_datas: Vec<&'py PyAny>, // either 2d array for normal blocks, or two 1d arrays for extension blocks + block_infos: Vec<PandasBlockInfo>, +} + +impl<'a> PandasDestination<'a> { + pub fn new(py: Python<'a>) -> Self { + PandasDestination { + py, + nrow: 0, + schema: vec![], + names: vec![], + block_datas: vec![], + block_infos: vec![], + } + } + + pub fn result(self) -> Result<&'a PyAny> { + #[throws(ConnectorXPythonError)] + fn to_list<T: IntoPy<PyObject>>(py: Python<'_>, arr: Vec<T>) -> &'_ PyList { + let list = PyList::empty(py); + for e in arr { + list.append(e.into_py(py))?; + } + list + } + let block_infos = to_list(self.py, self.block_infos)?; + let names = to_list(self.py, self.names)?; + let block_datas = to_list(self.py, self.block_datas)?; + let result = [ + ("data", block_datas), + ("headers", names), + ("block_infos", block_infos), + ] + .into_py_dict(self.py); + Ok(result) + } + + #[throws(ConnectorXPythonError)] + fn allocate_array<T: numpy::Element>( + &mut self, + dt: PandasBlockType, + placement: Vec<usize>, + ) { + // has to use `zeros` instead of `new` for String type initialization + let data = PyArray2::<T>::zeros(self.py, [placement.len(), self.nrow], false); + let block_info = PandasBlockInfo { + dt, + cids: placement, + }; + + self.block_datas.push(data.into()); + self.block_infos.push(block_info); + } + + #[throws(ConnectorXPythonError)] + fn allocate_masked_array<T: numpy::Element>( + &mut self, + dt: PandasBlockType, + placement: Vec<usize>, + ) { + for pos in placement { + let block_info = PandasBlockInfo { + dt, + cids: vec![pos], + }; + let data = PyArray1::<T>::zeros(self.py, self.nrow, false); + let mask = 
PyArray1::<bool>::zeros(self.py, self.nrow, false); + let obj = PyTuple::new(self.py, vec![data.as_ref(), mask.as_ref()]); + self.block_datas.push(obj.into()); + self.block_infos.push(block_info); + } + } +} + +impl<'a> Destination for PandasDestination<'a> { + const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::RowMajor]; + type TypeSystem = PandasTypeSystem; + type Partition<'b> = PandasPartitionDestination<'b> where 'a: 'b; + type Error = ConnectorXPythonError; + + fn needs_count(&self) -> bool { + true + } + + #[throws(ConnectorXPythonError)] + fn allocate<S: AsRef<str>>( + &mut self, + nrows: usize, + names: &[S], + schema: &[PandasTypeSystem], + data_order: DataOrder, + ) { + if !matches!(data_order, DataOrder::RowMajor) { + throw!(ConnectorXError::UnsupportedDataOrder(data_order)) + } + self.nrow = nrows; + self.schema = schema.to_vec(); + self.names + .extend(names.iter().map(AsRef::as_ref).map(ToString::to_string)); + + let mut block_indices = HashMap::<PandasBlockType, Vec<usize>>::new(); + schema + .iter() + .enumerate() + .for_each(|(i, dt)| block_indices.entry((*dt).into()).or_default().push(i)); + + for (dt, placement) in block_indices { + match dt { + PandasBlockType::Boolean(true) => { + self.allocate_masked_array::<bool>(dt, placement)?; + } + PandasBlockType::Boolean(false) => { + self.allocate_array::<bool>(dt, placement)?; + } + PandasBlockType::Int64(true) => { + self.allocate_masked_array::<i64>(dt, placement)?; + } + PandasBlockType::Int64(false) => { + self.allocate_array::<i64>(dt, placement)?; + } + PandasBlockType::Float64 => { + self.allocate_array::<f64>(dt, placement)?; + } + PandasBlockType::BooleanArray => { + self.allocate_array::<PyObject>(dt, placement)?; + } + PandasBlockType::Float64Array => { + self.allocate_array::<PyObject>(dt, placement)?; + } + PandasBlockType::Int64Array => { + self.allocate_array::<PyObject>(dt, placement)?; + } + PandasBlockType::String => { + self.allocate_array::<PyString>(dt, placement)?; + } + PandasBlockType::DateTime => { + self.allocate_array::<i64>(dt, placement)?; + } + PandasBlockType::Bytes => { + self.allocate_array::<PyBytes>(dt, placement)?; + } + }; + } + } + + #[throws(ConnectorXPythonError)] + fn partition(&mut self, counts: usize) -> Vec<Self::Partition<'_>> { + let mut partitioned_columns: Vec<Vec<Box<dyn PandasColumnObject>>> = + (0..self.schema.len()).map(|_| Vec::new()).collect(); + + for (idx, block) in self.block_infos.iter().enumerate() { + let buf = self.block_datas[idx]; + match block.dt { + PandasBlockType::Boolean(_) => { + let bblock = BooleanBlock::extract(buf)?; + + let bcols = bblock.split()?; + for (&cid, bcol) in block.cids.iter().zip_eq(bcols) { + partitioned_columns[cid] = bcol + .partition(counts) + .into_iter() + .map(|c| Box::new(c) as _) + .collect() + } + } + PandasBlockType::Float64 => { + let fblock = Float64Block::extract(buf)?; + let fcols = fblock.split()?; + for (&cid, fcol) in block.cids.iter().zip_eq(fcols) { + partitioned_columns[cid] = fcol + .partition(counts) + .into_iter() + .map(|c| Box::new(c) as _) + .collect() + } + } + PandasBlockType::BooleanArray => { + let bblock = ArrayBlock::<bool>::extract(buf)?; + let bcols = bblock.split()?; + for (&cid, bcol) in block.cids.iter().zip_eq(bcols) { + partitioned_columns[cid] = bcol + .partition(counts) + .into_iter() + .map(|c| Box::new(c) as _) + .collect() + } + } + PandasBlockType::Float64Array => { + let fblock = ArrayBlock::<f64>::extract(buf)?; + let fcols = fblock.split()?; + for (&cid, fcol) in block.cids.iter().zip_eq(fcols) { + partitioned_columns[cid] = fcol + .partition(counts) + .into_iter() + .map(|c| Box::new(c) as _) + .collect() + } + } + PandasBlockType::Int64Array => { + let fblock = 
+                PandasBlockType::Int64Array => {
+                    let fblock = ArrayBlock::<i64>::extract(buf)?;
+                    let fcols = fblock.split()?;
+                    for (&cid, fcol) in block.cids.iter().zip_eq(fcols) {
+                        partitioned_columns[cid] = fcol
+                            .partition(counts)
+                            .into_iter()
+                            .map(|c| Box::new(c) as _)
+                            .collect()
+                    }
+                }
+                PandasBlockType::Int64(_) => {
+                    let ublock = Int64Block::extract(buf)?;
+                    let ucols = ublock.split()?;
+                    for (&cid, ucol) in block.cids.iter().zip_eq(ucols) {
+                        partitioned_columns[cid] = ucol
+                            .partition(counts)
+                            .into_iter()
+                            .map(|c| Box::new(c) as _)
+                            .collect()
+                    }
+                }
+                PandasBlockType::String => {
+                    let sblock = StringBlock::extract(buf)?;
+                    let scols = sblock.split()?;
+                    for (&cid, scol) in block.cids.iter().zip_eq(scols) {
+                        partitioned_columns[cid] = scol
+                            .partition(counts)
+                            .into_iter()
+                            .map(|c| Box::new(c) as _)
+                            .collect()
+                    }
+                }
+                PandasBlockType::Bytes => {
+                    let bblock = BytesBlock::extract(buf)?;
+                    let bcols = bblock.split()?;
+                    for (&cid, bcol) in block.cids.iter().zip_eq(bcols) {
+                        partitioned_columns[cid] = bcol
+                            .partition(counts)
+                            .into_iter()
+                            .map(|c| Box::new(c) as _)
+                            .collect()
+                    }
+                }
+                PandasBlockType::DateTime => {
+                    let dblock = DateTimeBlock::extract(buf)?;
+                    let dcols = dblock.split()?;
+                    for (&cid, dcol) in block.cids.iter().zip_eq(dcols) {
+                        partitioned_columns[cid] = dcol
+                            .partition(counts)
+                            .into_iter()
+                            .map(|c| Box::new(c) as _)
+                            .collect()
+                    }
+                }
+            }
+        }
+
+        let mut par_destinations = vec![];
+        let glob_row = Arc::new(AtomicUsize::new(0));
+        for _ in 0..counts {
+            let mut columns = Vec::with_capacity(partitioned_columns.len());
+            for (i, partitions) in partitioned_columns.iter_mut().enumerate() {
+                columns.push(
+                    partitions
+                        .pop()
+                        .ok_or_else(|| anyhow!("empty partition for {}th column", i))?,
+                );
+            }
+
+            par_destinations.push(PandasPartitionDestination::new(
+                columns,
+                &self.schema[..],
+                Arc::clone(&glob_row),
+            ));
+        }
+
+        par_destinations
+    }
+
+    fn schema(&self) -> &[Self::TypeSystem] {
+        self.schema.as_ref()
+    }
+}
+pub struct PandasPartitionDestination<'a> {
+    columns: Vec<Box<dyn PandasColumnObject>>,
+    schema: &'a [PandasTypeSystem],
+    seq: usize,
+    glob_row: Arc<AtomicUsize>,
+    cur_row: usize,
+}
+
+impl<'a> PandasPartitionDestination<'a> {
+    fn new(
+        columns: Vec<Box<dyn PandasColumnObject>>,
+        schema: &'a [PandasTypeSystem],
+        glob_row: Arc<AtomicUsize>,
+    ) -> Self {
+        Self {
+            columns,
+            schema,
+            seq: 0,
+            glob_row,
+            cur_row: 0,
+        }
+    }
+
+    fn loc(&mut self) -> (usize, usize) {
+        let (row, col) = (
+            self.cur_row + self.seq / self.ncols(),
+            self.seq % self.ncols(),
+        );
+        self.seq += 1;
+        (row, col)
+    }
+}
+
+impl<'a> DestinationPartition<'a> for PandasPartitionDestination<'a> {
+    type TypeSystem = PandasTypeSystem;
+    type Error = ConnectorXPythonError;
+
+    fn ncols(&self) -> usize {
+        self.schema.len()
+    }
+
+    fn finalize(&mut self) -> Result<()> {
+        for col in &mut self.columns {
+            col.finalize()?;
+        }
+        Ok(())
+    }
+
+    #[throws(ConnectorXPythonError)]
+    fn aquire_row(&mut self, n: usize) -> usize {
+        if n == 0 {
+            return self.cur_row;
+        }
+        self.cur_row = self.glob_row.fetch_add(n, Ordering::Relaxed);
+        self.seq = 0;
+        self.cur_row
+    }
+}
+
+impl<'a, T> Consume<T> for PandasPartitionDestination<'a>
+where
+    T: HasPandasColumn + TypeAssoc<PandasTypeSystem> + std::fmt::Debug,
+{
+    type Error = ConnectorXPythonError;
+
+    fn consume(&mut self, value: T) -> Result<()> {
+        let (row, col) = self.loc();
+
+        self.schema[col].check::<T>()?;
+        // How do we check type id for borrowed types?
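+        // TypeId::of::<T>() requires T: 'static, so borrowed values like &str or
+        // &[u8] cannot be checked this way at runtime; the assert below is kept for
+        // reference only.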
+        // assert!(self.columns[col].typecheck(TypeId::of::<T>()));
+
+        let (column, _): (&mut T::PandasColumn<'a>, *const ()) =
+            unsafe { transmute(&*self.columns[col]) };
+        column.write(value, row)
+    }
+}
diff --git a/connectorx-python/src/pandas/get_meta.rs b/connectorx-python/src/pandas/get_meta.rs
new file mode 100644
index 0000000..bc5e7de
--- /dev/null
+++ b/connectorx-python/src/pandas/get_meta.rs
@@ -0,0 +1,230 @@
+use super::{
+    destination::PandasDestination,
+    transports::{
+        BigQueryPandasTransport, MsSQLPandasTransport, MysqlPandasTransport, OraclePandasTransport,
+        PostgresPandasTransport, SqlitePandasTransport,
+    },
+};
+use crate::errors::ConnectorXPythonError;
+use connectorx::source_router::{SourceConn, SourceType};
+use connectorx::{
+    prelude::*,
+    sources::{
+        bigquery::BigQuerySource,
+        mssql::MsSQLSource,
+        mysql::{BinaryProtocol as MySQLBinaryProtocol, MySQLSource, TextProtocol},
+        postgres::{
+            rewrite_tls_args, BinaryProtocol as PgBinaryProtocol, CSVProtocol, CursorProtocol,
+            PostgresSource, SimpleProtocol,
+        },
+        sqlite::SQLiteSource,
+    },
+    sql::CXQuery,
+};
+use fehler::throws;
+use log::debug;
+use postgres::NoTls;
+use postgres_openssl::MakeTlsConnector;
+use pyo3::prelude::*;
+use std::convert::TryFrom;
+use std::sync::Arc;
+
+#[throws(ConnectorXPythonError)]
+pub fn get_meta<'a>(py: Python<'a>, conn: &str, protocol: &str, query: String) -> &'a PyAny {
+    let source_conn = SourceConn::try_from(conn)?;
+    let mut destination = PandasDestination::new(py);
+    let queries = &[CXQuery::Naked(query)];
+
+    match source_conn.ty {
+        SourceType::Postgres => {
+            debug!("Protocol: {}", protocol);
+            let (config, tls) = rewrite_tls_args(&source_conn.conn)?;
+            match (protocol, tls) {
+                ("csv", Some(tls_conn)) => {
+                    let sb =
+                        PostgresSource::<CSVProtocol, MakeTlsConnector>::new(config, tls_conn, 1)?;
+                    let mut dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<CSVProtocol, MakeTlsConnector>,
+                    >::new(
+                        sb, &mut destination, queries, None
+                    );
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                ("csv", None) => {
+                    let sb = PostgresSource::<CSVProtocol, NoTls>::new(config, NoTls, 1)?;
+                    let mut dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<CSVProtocol, NoTls>,
+                    >::new(
+                        sb, &mut destination, queries, None
+                    );
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                ("binary", Some(tls_conn)) => {
+                    let sb = PostgresSource::<PgBinaryProtocol, MakeTlsConnector>::new(
+                        config, tls_conn, 1,
+                    )?;
+                    let mut dispatcher =
+                        Dispatcher::<
+                            _,
+                            _,
+                            PostgresPandasTransport<PgBinaryProtocol, MakeTlsConnector>,
+                        >::new(sb, &mut destination, queries, None);
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                ("binary", None) => {
+                    let sb = PostgresSource::<PgBinaryProtocol, NoTls>::new(config, NoTls, 1)?;
+                    let mut dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<PgBinaryProtocol, NoTls>,
+                    >::new(
+                        sb, &mut destination, queries, None
+                    );
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                ("cursor", Some(tls_conn)) => {
+                    let sb = PostgresSource::<CursorProtocol, MakeTlsConnector>::new(
+                        config, tls_conn, 1,
+                    )?;
+                    let mut dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<CursorProtocol, MakeTlsConnector>,
+                    >::new(
+                        sb, &mut destination, queries, None
+                    );
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                ("cursor", None) => {
+                    let sb = PostgresSource::<CursorProtocol, NoTls>::new(config, NoTls, 1)?;
+                    let mut dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<CursorProtocol, NoTls>,
+                    >::new(
+                        sb, &mut destination, queries, None
+                    );
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                ("simple", Some(tls_conn)) => {
+                    let sb = PostgresSource::<SimpleProtocol, MakeTlsConnector>::new(
+                        config, tls_conn, 1,
+                    )?;
+                    let mut dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<SimpleProtocol, MakeTlsConnector>,
+                    >::new(
+                        sb, &mut destination, queries, None
+                    );
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                ("simple", None) => {
+                    let sb = PostgresSource::<SimpleProtocol, NoTls>::new(config, NoTls, 1)?;
+                    let mut dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<SimpleProtocol, NoTls>,
+                    >::new(
+                        sb, &mut destination, queries, None
+                    );
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                _ => unimplemented!("{} protocol not supported", protocol),
+            }
+        }
+        SourceType::SQLite => {
+            // remove the first "sqlite://" manually since url.path is not correct for windows
+            let path = &source_conn.conn.as_str()[9..];
+            let source = SQLiteSource::new(path, 1)?;
+            let mut dispatcher = Dispatcher::<_, _, SqlitePandasTransport>::new(
+                source,
+                &mut destination,
+                queries,
+                None,
+            );
+            debug!("Running dispatcher");
+            dispatcher.get_meta()?;
+        }
+        SourceType::MySQL => {
+            debug!("Protocol: {}", protocol);
+            match protocol {
+                "binary" => {
+                    let source = MySQLSource::<MySQLBinaryProtocol>::new(&source_conn.conn[..], 1)?;
+                    let mut dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        MysqlPandasTransport<MySQLBinaryProtocol>,
+                    >::new(
+                        source, &mut destination, queries, None
+                    );
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                "text" => {
+                    let source = MySQLSource::<TextProtocol>::new(&source_conn.conn[..], 1)?;
+                    let mut dispatcher =
+                        Dispatcher::<_, _, MysqlPandasTransport<TextProtocol>>::new(
+                            source,
+                            &mut destination,
+                            queries,
+                            None,
+                        );
+                    debug!("Running dispatcher");
+                    dispatcher.get_meta()?;
+                }
+                _ => unimplemented!("{} protocol not supported", protocol),
+            }
+        }
+        SourceType::MsSQL => {
+            let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime"));
+            let source = MsSQLSource::new(rt, &source_conn.conn[..], 1)?;
+            let mut dispatcher = Dispatcher::<_, _, MsSQLPandasTransport>::new(
+                source,
+                &mut destination,
+                queries,
+                None,
+            );
+            debug!("Running dispatcher");
+            dispatcher.get_meta()?;
+        }
+        SourceType::Oracle => {
+            let source = OracleSource::new(&source_conn.conn[..], 1)?;
+            let mut dispatcher = Dispatcher::<_, _, OraclePandasTransport>::new(
+                source,
+                &mut destination,
+                queries,
+                None,
+            );
+            debug!("Running dispatcher");
+            dispatcher.get_meta()?;
+        }
+        SourceType::BigQuery => {
+            let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime"));
+            let source = BigQuerySource::new(rt, &source_conn.conn[..])?;
+            let mut dispatcher = Dispatcher::<_, _, BigQueryPandasTransport>::new(
+                source,
+                &mut destination,
+                queries,
+                None,
+            );
+            debug!("Running dispatcher");
+            dispatcher.get_meta()?;
+        }
+        _ => unimplemented!("{:?} not implemented!", source_conn.ty),
+    }
+
+    destination.result()?
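+    // What comes back is the {"data", "headers", "block_infos"} dict assembled by
+    // PandasDestination::result(); the Python wrapper is expected to rebuild the
+    // actual DataFrame from these raw blocks.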
+}
diff --git a/connectorx-python/src/pandas/mod.rs b/connectorx-python/src/pandas/mod.rs
new file mode 100644
index 0000000..be2e419
--- /dev/null
+++ b/connectorx-python/src/pandas/mod.rs
@@ -0,0 +1,237 @@
+mod destination;
+pub mod get_meta;
+mod pandas_columns;
+mod pystring;
+mod transports;
+mod typesystem;
+
+pub use self::destination::{PandasBlockInfo, PandasDestination, PandasPartitionDestination};
+pub use self::transports::{
+    BigQueryPandasTransport, MsSQLPandasTransport, MysqlPandasTransport, OraclePandasTransport,
+    PostgresPandasTransport, SqlitePandasTransport,
+};
+pub use self::typesystem::{PandasDType, PandasTypeSystem};
+use crate::errors::ConnectorXPythonError;
+use connectorx::source_router::{SourceConn, SourceType};
+use connectorx::sources::oracle::OracleSource;
+use connectorx::{
+    prelude::*,
+    sources::{
+        mysql::{BinaryProtocol as MySQLBinaryProtocol, TextProtocol},
+        postgres::{
+            rewrite_tls_args, BinaryProtocol as PgBinaryProtocol, CSVProtocol, CursorProtocol,
+            SimpleProtocol,
+        },
+    },
+    sql::CXQuery,
+};
+use fehler::throws;
+use log::debug;
+use postgres::NoTls;
+use postgres_openssl::MakeTlsConnector;
+use pyo3::{PyAny, Python};
+use std::sync::Arc;
+
+#[throws(ConnectorXPythonError)]
+pub fn write_pandas<'a>(
+    py: Python<'a>,
+    source_conn: &SourceConn,
+    origin_query: Option<String>,
+    queries: &[CXQuery<String>],
+) -> &'a PyAny {
+    let mut destination = PandasDestination::new(py);
+    let protocol = source_conn.proto.as_str();
+    debug!("Protocol: {}", protocol);
+
+    // TODO: unlock gil if possible
+    match source_conn.ty {
+        SourceType::Postgres => {
+            let (config, tls) = rewrite_tls_args(&source_conn.conn)?;
+            match (protocol, tls) {
+                ("csv", Some(tls_conn)) => {
+                    let sb = PostgresSource::<CSVProtocol, MakeTlsConnector>::new(
+                        config,
+                        tls_conn,
+                        queries.len(),
+                    )?;
+                    let dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<CSVProtocol, MakeTlsConnector>,
+                    >::new(
+                        sb, &mut destination, queries, origin_query
+                    );
+                    dispatcher.run()?;
+                }
+                ("csv", None) => {
+                    let sb =
+                        PostgresSource::<CSVProtocol, NoTls>::new(config, NoTls, queries.len())?;
+                    let dispatcher =
+                        Dispatcher::<_, _, PostgresPandasTransport<CSVProtocol, NoTls>>::new(
+                            sb,
+                            &mut destination,
+                            queries,
+                            origin_query,
+                        );
+                    dispatcher.run()?;
+                }
+                ("binary", Some(tls_conn)) => {
+                    let sb = PostgresSource::<PgBinaryProtocol, MakeTlsConnector>::new(
+                        config,
+                        tls_conn,
+                        queries.len(),
+                    )?;
+                    let dispatcher =
+                        Dispatcher::<
+                            _,
+                            _,
+                            PostgresPandasTransport<PgBinaryProtocol, MakeTlsConnector>,
+                        >::new(sb, &mut destination, queries, origin_query);
+                    dispatcher.run()?;
+                }
+                ("binary", None) => {
+                    let sb = PostgresSource::<PgBinaryProtocol, NoTls>::new(
+                        config,
+                        NoTls,
+                        queries.len(),
+                    )?;
+                    let dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<PgBinaryProtocol, NoTls>,
+                    >::new(
+                        sb, &mut destination, queries, origin_query
+                    );
+                    dispatcher.run()?;
+                }
+                ("cursor", Some(tls_conn)) => {
+                    let sb = PostgresSource::<CursorProtocol, MakeTlsConnector>::new(
+                        config,
+                        tls_conn,
+                        queries.len(),
+                    )?;
+                    let dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<CursorProtocol, MakeTlsConnector>,
+                    >::new(
+                        sb, &mut destination, queries, origin_query
+                    );
+                    dispatcher.run()?;
+                }
+                ("cursor", None) => {
+                    let sb =
+                        PostgresSource::<CursorProtocol, NoTls>::new(config, NoTls, queries.len())?;
+                    let dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<CursorProtocol, NoTls>,
+                    >::new(
+                        sb, &mut destination, queries, origin_query
+                    );
+                    dispatcher.run()?;
+                }
+                ("simple", Some(tls_conn)) => {
+                    let sb = PostgresSource::<SimpleProtocol, MakeTlsConnector>::new(
+                        config,
+                        tls_conn,
+                        queries.len(),
+                    )?;
+                    let dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<SimpleProtocol, MakeTlsConnector>,
+                    >::new(
+                        sb, &mut destination, queries, origin_query
+                    );
+                    dispatcher.run()?;
+                }
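+                // The postgres wire protocols handled in these arms trade generality
+                // for speed: "binary" and "csv" stream COPY output, "cursor" falls
+                // back to row-by-row fetching, and "simple" uses the simple query
+                // protocol for statements the extended protocol cannot prepare.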
+                ("simple", None) => {
+                    let sb =
+                        PostgresSource::<SimpleProtocol, NoTls>::new(config, NoTls, queries.len())?;
+                    let dispatcher = Dispatcher::<
+                        _,
+                        _,
+                        PostgresPandasTransport<SimpleProtocol, NoTls>,
+                    >::new(
+                        sb, &mut destination, queries, origin_query
+                    );
+                    dispatcher.run()?;
+                }
+                _ => unimplemented!("{} protocol not supported", protocol),
+            }
+        }
+        SourceType::SQLite => {
+            // remove the first "sqlite://" manually since url.path is not correct for windows
+            let path = &source_conn.conn.as_str()[9..];
+            let source = SQLiteSource::new(path, queries.len())?;
+            let dispatcher = Dispatcher::<_, _, SqlitePandasTransport>::new(
+                source,
+                &mut destination,
+                queries,
+                origin_query,
+            );
+            dispatcher.run()?;
+        }
+        SourceType::MySQL => match protocol {
+            "binary" => {
+                let source =
+                    MySQLSource::<MySQLBinaryProtocol>::new(&source_conn.conn[..], queries.len())?;
+                let dispatcher = Dispatcher::<_, _, MysqlPandasTransport<MySQLBinaryProtocol>>::new(
+                    source,
+                    &mut destination,
+                    queries,
+                    origin_query,
+                );
+                dispatcher.run()?;
+            }
+            "text" => {
+                let source =
+                    MySQLSource::<TextProtocol>::new(&source_conn.conn[..], queries.len())?;
+                let dispatcher = Dispatcher::<_, _, MysqlPandasTransport<TextProtocol>>::new(
+                    source,
+                    &mut destination,
+                    queries,
+                    origin_query,
+                );
+                dispatcher.run()?;
+            }
+            _ => unimplemented!("{} protocol not supported", protocol),
+        },
+        SourceType::MsSQL => {
+            let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime"));
+            let source = MsSQLSource::new(rt, &source_conn.conn[..], queries.len())?;
+            let dispatcher = Dispatcher::<_, _, MsSQLPandasTransport>::new(
+                source,
+                &mut destination,
+                queries,
+                origin_query,
+            );
+            dispatcher.run()?;
+        }
+        SourceType::Oracle => {
+            let source = OracleSource::new(&source_conn.conn[..], queries.len())?;
+            let dispatcher = Dispatcher::<_, _, OraclePandasTransport>::new(
+                source,
+                &mut destination,
+                queries,
+                origin_query,
+            );
+            dispatcher.run()?;
+        }
+        SourceType::BigQuery => {
+            let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime"));
+            let source = BigQuerySource::new(rt, &source_conn.conn[..])?;
+            let dispatcher = Dispatcher::<_, _, BigQueryPandasTransport>::new(
+                source,
+                &mut destination,
+                queries,
+                origin_query,
+            );
+            dispatcher.run()?;
+        }
+        _ => unimplemented!("{:?} not implemented!", source_conn.ty),
+    }
+
+    destination.result()?
+}
diff --git a/connectorx-python/src/pandas/pandas_columns/array.rs b/connectorx-python/src/pandas/pandas_columns/array.rs
new file mode 100644
index 0000000..f084a1a
--- /dev/null
+++ b/connectorx-python/src/pandas/pandas_columns/array.rs
@@ -0,0 +1,266 @@
+use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject, GIL_MUTEX};
+use crate::errors::ConnectorXPythonError;
+use anyhow::anyhow;
+use fehler::throws;
+use ndarray::{ArrayViewMut2, Axis, Ix2};
+use numpy::{npyffi::NPY_TYPES, Element, PyArray, PyArrayDescr};
+use pyo3::{FromPyObject, Py, PyAny, PyResult, Python, ToPyObject};
+use std::any::TypeId;
+use std::marker::PhantomData;
+
+#[derive(Clone)]
+#[repr(transparent)]
+pub struct PyList(Py<pyo3::types::PyList>);
+
+// In order to put it into a numpy array
+unsafe impl Element for PyList {
+    const DATA_TYPE: numpy::DataType = numpy::DataType::Object;
+    fn is_same_type(dtype: &PyArrayDescr) -> bool {
+        unsafe { *dtype.as_dtype_ptr() }.type_num == NPY_TYPES::NPY_OBJECT as i32
+    }
+}
+
+pub struct ArrayBlock<'a, V> {
+    data: ArrayViewMut2<'a, PyList>,
+    buf_size_mb: usize,
+    _value_type: PhantomData<V>,
+}
+
+impl<'a, V> FromPyObject<'a> for ArrayBlock<'a, V> {
+    fn extract(ob: &'a PyAny) -> PyResult<Self> {
+        check_dtype(ob, "object")?;
+        let array = ob.downcast::<PyArray<PyList, Ix2>>()?;
+        let data = unsafe { array.as_array_mut() };
+        Ok(ArrayBlock::<V> {
+            data,
+            buf_size_mb: 16, // in MB
+            _value_type: PhantomData,
+        })
+    }
+}
+
+impl<'a, V> ArrayBlock<'a, V> {
+    #[throws(ConnectorXPythonError)]
+    pub fn split(self) -> Vec<ArrayColumn<V>> {
+        let mut ret = vec![];
+        let mut view = self.data;
+
+        let nrows = view.ncols();
+        while view.nrows() > 0 {
+            let (col, rest) = view.split_at(Axis(0), 1);
+            view = rest;
+            ret.push(ArrayColumn::<V> {
+                data: col
+                    .into_shape(nrows)?
+                    .into_slice()
+                    .ok_or_else(|| anyhow!("get None for splitted FloatArray data"))?
+                    .as_mut_ptr(),
+                lengths: vec![],
+                row_idx: vec![],
+                buffer: Vec::with_capacity(self.buf_size_mb * (1 << 17) * 11 / 10), // allocate a little bit more memory to avoid Vec growth
+                buf_size: self.buf_size_mb * (1 << 17),
+            })
+        }
+        ret
+    }
+}
+
+pub struct ArrayColumn<V> {
+    data: *mut PyList,
+    buffer: Vec<V>,
+    lengths: Vec<usize>, // usize::MAX if the string is None
+    row_idx: Vec<usize>,
+    buf_size: usize,
+}
+
+unsafe impl<V> Send for ArrayColumn<V> {}
+unsafe impl<V> Sync for ArrayColumn<V> {}
+
+impl<V> PandasColumnObject for ArrayColumn<V>
+where
+    V: Send + ToPyObject,
+{
+    fn typecheck(&self, id: TypeId) -> bool {
+        id == TypeId::of::<V>() || id == TypeId::of::<Option<V>>()
+    }
+
+    fn typename(&self) -> &'static str {
+        std::any::type_name::<V>()
+    }
+
+    #[throws(ConnectorXPythonError)]
+    fn finalize(&mut self) {
+        self.flush()?;
+    }
+}
+
+impl PandasColumn<Vec<bool>> for ArrayColumn<bool> {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Vec<bool>, row: usize) {
+        self.lengths.push(val.len());
+        self.buffer.extend_from_slice(&val[..]);
+        self.row_idx.push(row);
+        self.try_flush()?;
+    }
+}
+
+impl PandasColumn<Option<Vec<bool>>> for ArrayColumn<bool> {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<Vec<bool>>, row: usize) {
+        match val {
+            Some(v) => {
+                self.lengths.push(v.len());
+                self.buffer.extend_from_slice(&v[..]);
+                self.row_idx.push(row);
+                self.try_flush()?;
+            }
+            None => {
+                self.lengths.push(usize::MAX);
+                self.row_idx.push(row);
+            }
+        }
+    }
+}
+impl PandasColumn<Vec<f64>> for ArrayColumn<f64> {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Vec<f64>, row: usize) {
+        self.lengths.push(val.len());
+        self.buffer.extend_from_slice(&val[..]);
+        self.row_idx.push(row);
+        self.try_flush()?;
+    }
+}
+
+impl PandasColumn<Option<Vec<f64>>> for ArrayColumn<f64> {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<Vec<f64>>, row: usize) {
+        match val {
+            Some(v) => {
+                self.lengths.push(v.len());
+                self.buffer.extend_from_slice(&v[..]);
+                self.row_idx.push(row);
+                self.try_flush()?;
+            }
+            None => {
+                self.lengths.push(usize::MAX);
+                self.row_idx.push(row);
+            }
+        }
+    }
+}
+
+impl PandasColumn<Vec<i64>> for ArrayColumn<i64> {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Vec<i64>, row: usize) {
+        self.lengths.push(val.len());
+        self.buffer.extend_from_slice(&val[..]);
+        self.row_idx.push(row);
+        self.try_flush()?;
+    }
+}
+
+impl PandasColumn<Option<Vec<i64>>> for ArrayColumn<i64> {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<Vec<i64>>, row: usize) {
+        match val {
+            Some(v) => {
+                self.lengths.push(v.len());
+                self.buffer.extend_from_slice(&v[..]);
+                self.row_idx.push(row);
+                self.try_flush()?;
+            }
+            None => {
+                self.lengths.push(usize::MAX);
+                self.row_idx.push(row);
+            }
+        }
+    }
+}
+
+impl HasPandasColumn for Vec<bool> {
+    type PandasColumn<'a> = ArrayColumn<bool>;
+}
+
+impl HasPandasColumn for Option<Vec<bool>> {
+    type PandasColumn<'a> = ArrayColumn<bool>;
+}
+
+impl HasPandasColumn for Vec<f64> {
+    type PandasColumn<'a> = ArrayColumn<f64>;
+}
+
+impl HasPandasColumn for Option<Vec<f64>> {
+    type PandasColumn<'a> = ArrayColumn<f64>;
+}
+
+impl HasPandasColumn for Vec<i64> {
+    type PandasColumn<'a> = ArrayColumn<i64>;
+}
+
+impl HasPandasColumn for Option<Vec<i64>> {
+    type PandasColumn<'a> = ArrayColumn<i64>;
+}
+impl<V> ArrayColumn<V>
+where
+    V: Send + ToPyObject,
+{
+    pub fn partition(self, counts: usize) -> Vec<ArrayColumn<V>> {
+        let mut partitions = vec![];
+
+        for _ in 0..counts {
+            partitions.push(ArrayColumn {
+                data: self.data,
+                lengths: vec![],
+                row_idx: vec![],
+                buffer: Vec::with_capacity(self.buf_size),
+                buf_size: self.buf_size,
+            });
+        }
+        partitions
+    }
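+
+    // flush() is where buffered rows become Python objects: write() above only
+    // appends to plain Rust buffers, and the batch below is materialized while
+    // GIL_MUTEX serializes access to the Python allocator across partitions.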
+    #[throws(ConnectorXPythonError)]
+    pub fn flush(&mut self) {
+        let nvecs = self.lengths.len();
+
+        if nvecs > 0 {
+            let py = unsafe { Python::assume_gil_acquired() };
+
+            {
+                // allocation in python is not thread safe
+                let _guard = GIL_MUTEX
+                    .lock()
+                    .map_err(|e| anyhow!("mutex poisoned {}", e))?;
+                let mut start = 0;
+                for (i, &len) in self.lengths.iter().enumerate() {
+                    if len != usize::MAX {
+                        let end = start + len;
+                        unsafe {
+                            // allocate and write in the same time
+                            *self.data.add(self.row_idx[i]) = PyList(
+                                pyo3::types::PyList::new(py, &self.buffer[start..end]).into(),
+                            );
+                        };
+                        start = end;
+                    } else {
+                        unsafe {
+                            let n = Py::from_borrowed_ptr(py, pyo3::ffi::Py_None());
+                            *self.data.add(self.row_idx[i]) = PyList(n);
+                        }
+                    }
+                }
+            }
+
+            self.buffer.truncate(0);
+            self.lengths.truncate(0);
+            self.row_idx.truncate(0);
+        }
+    }
+
+    #[throws(ConnectorXPythonError)]
+    pub fn try_flush(&mut self) {
+        if self.buffer.len() >= self.buf_size {
+            self.flush()?;
+        }
+    }
+}
diff --git a/connectorx-python/src/pandas/pandas_columns/boolean.rs b/connectorx-python/src/pandas/pandas_columns/boolean.rs
new file mode 100644
index 0000000..aab01a3
--- /dev/null
+++ b/connectorx-python/src/pandas/pandas_columns/boolean.rs
@@ -0,0 +1,143 @@
+use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject};
+use crate::errors::ConnectorXPythonError;
+use anyhow::anyhow;
+use fehler::throws;
+use ndarray::{ArrayViewMut1, ArrayViewMut2, Axis, Ix2};
+use numpy::{PyArray, PyArray1};
+use pyo3::{types::PyTuple, FromPyObject, PyAny, PyResult};
+use std::any::TypeId;
+
+// Boolean
+pub enum BooleanBlock<'a> {
+    NumPy(ArrayViewMut2<'a, bool>),
+    Extension(ArrayViewMut1<'a, bool>, ArrayViewMut1<'a, bool>),
+}
+impl<'a> FromPyObject<'a> for BooleanBlock<'a> {
+    fn extract(ob: &'a PyAny) -> PyResult<Self> {
+        if let Ok(array) = ob.downcast::<PyArray<bool, Ix2>>() {
+            // if numpy array
+            check_dtype(ob, "bool")?;
+            let data = unsafe { array.as_array_mut() };
+            Ok(BooleanBlock::NumPy(data))
+        } else {
+            // if extension array
+            let tuple = ob.downcast::<PyTuple>()?;
+            let data = tuple.get_item(0)?;
+            let mask = tuple.get_item(1)?;
+            check_dtype(data, "bool")?;
+            check_dtype(mask, "bool")?;
+
+            Ok(BooleanBlock::Extension(
+                unsafe { data.downcast::<PyArray1<bool>>()?.as_array_mut() },
+                unsafe { mask.downcast::<PyArray1<bool>>()?.as_array_mut() },
+            ))
+        }
+    }
+}
+
+impl<'a> BooleanBlock<'a> {
+    #[throws(ConnectorXPythonError)]
+    pub fn split(self) -> Vec<BooleanColumn> {
+        let mut ret = vec![];
+        match self {
+            BooleanBlock::Extension(data, mask) => ret.push(BooleanColumn {
+                data: data
+                    .into_slice()
+                    .ok_or_else(|| anyhow!("get None for Boolean data"))?
+                    .as_mut_ptr(),
+                mask: Some(
+                    mask.into_slice()
+                        .ok_or_else(|| anyhow!("get None for Boolean mask"))?
+                        .as_mut_ptr(),
+                ),
+            }),
+            BooleanBlock::NumPy(mut view) => {
+                let nrows = view.ncols();
+                while view.nrows() > 0 {
+                    let (col, rest) = view.split_at(Axis(0), 1);
+                    view = rest;
+                    ret.push(BooleanColumn {
+                        data: col
+                            .into_shape(nrows)?
+                            .into_slice()
+                            .ok_or_else(|| anyhow!("get None for splitted Boolean data"))?
+                            .as_mut_ptr(),
+                        mask: None,
+                    })
+                }
+            }
+        }
+        ret
+    }
+}
+
+pub struct BooleanColumn {
+    data: *mut bool,
+    mask: Option<*mut bool>,
+}
+
+unsafe impl Send for BooleanColumn {}
+unsafe impl Sync for BooleanColumn {}
+
+impl PandasColumnObject for BooleanColumn {
+    fn typecheck(&self, id: TypeId) -> bool {
+        id == TypeId::of::<bool>() || id == TypeId::of::<Option<bool>>()
+    }
+    fn typename(&self) -> &'static str {
+        std::any::type_name::<bool>()
+    }
+}
+
+impl PandasColumn<bool> for BooleanColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: bool, row: usize) {
+        unsafe { *self.data.add(row) = val };
+        if let Some(mask) = self.mask.as_mut() {
+            unsafe { *mask.add(row) = false };
+        }
+    }
+}
+
+impl PandasColumn<Option<bool>> for BooleanColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<bool>, row: usize) {
+        match val {
+            Some(val) => {
+                unsafe { *self.data.add(row) = val };
+                if let Some(mask) = self.mask.as_mut() {
+                    unsafe { *mask.add(row) = false };
+                }
+            }
+            None => {
+                if let Some(mask) = self.mask.as_mut() {
+                    unsafe { *mask.add(row) = true };
+                } else {
+                    panic!("Writing null bool to not null pandas array")
+                }
+            }
+        }
+    }
+}
+
+impl HasPandasColumn for bool {
+    type PandasColumn<'a> = BooleanColumn;
+}
+
+impl HasPandasColumn for Option<bool> {
+    type PandasColumn<'a> = BooleanColumn;
+}
+
+impl BooleanColumn {
+    pub fn partition(self, counts: usize) -> Vec<BooleanColumn> {
+        let mut partitions = vec![];
+
+        for _ in 0..counts {
+            partitions.push(BooleanColumn {
+                data: self.data,
+                mask: self.mask,
+            });
+        }
+
+        partitions
+    }
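+
+    // The partition clones share the same raw data/mask pointers; this is sound
+    // only because the destination's global row counter hands each partition a
+    // disjoint range of rows, so no two threads write the same slot.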
+}
diff --git a/connectorx-python/src/pandas/pandas_columns/bytes.rs b/connectorx-python/src/pandas/pandas_columns/bytes.rs
new file mode 100644
index 0000000..6890df9
--- /dev/null
+++ b/connectorx-python/src/pandas/pandas_columns/bytes.rs
@@ -0,0 +1,222 @@
+use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject, GIL_MUTEX};
+use crate::errors::ConnectorXPythonError;
+use anyhow::anyhow;
+use fehler::throws;
+use ndarray::{ArrayViewMut2, Axis, Ix2};
+use numpy::{npyffi::NPY_TYPES, Element, PyArray, PyArrayDescr};
+use pyo3::{FromPyObject, Py, PyAny, PyResult, Python};
+use std::any::TypeId;
+
+#[derive(Clone)]
+#[repr(transparent)]
+pub struct PyBytes(Py<pyo3::types::PyBytes>);
+
+// In order to put it into a numpy array
+unsafe impl Element for PyBytes {
+    const DATA_TYPE: numpy::DataType = numpy::DataType::Object;
+    fn is_same_type(dtype: &PyArrayDescr) -> bool {
+        unsafe { *dtype.as_dtype_ptr() }.type_num == NPY_TYPES::NPY_OBJECT as i32
+    }
+}
+
+pub struct BytesBlock<'a> {
+    data: ArrayViewMut2<'a, PyBytes>,
+    buf_size_mb: usize,
+}
+
+impl<'a> FromPyObject<'a> for BytesBlock<'a> {
+    fn extract(ob: &'a PyAny) -> PyResult<Self> {
+        check_dtype(ob, "object")?;
+        let array = ob.downcast::<PyArray<PyBytes, Ix2>>()?;
+        let data = unsafe { array.as_array_mut() };
+        Ok(BytesBlock {
+            data,
+            buf_size_mb: 16, // in MB
+        })
+    }
+}
+
+impl<'a> BytesBlock<'a> {
+    #[throws(ConnectorXPythonError)]
+    pub fn split(self) -> Vec<BytesColumn> {
+        let mut ret = vec![];
+        let mut view = self.data;
+
+        let nrows = view.ncols();
+        while view.nrows() > 0 {
+            let (col, rest) = view.split_at(Axis(0), 1);
+            view = rest;
+            ret.push(BytesColumn {
+                data: col
+                    .into_shape(nrows)?
+                    .into_slice()
+                    .ok_or_else(|| anyhow!("get None for splitted String data"))?
+                    .as_mut_ptr(),
+                bytes_lengths: vec![],
+                row_idx: vec![],
+                bytes_buf: Vec::with_capacity(self.buf_size_mb * (1 << 20) * 11 / 10), // allocate a little bit more memory to avoid Vec growth
+                buf_size: self.buf_size_mb * (1 << 20),
+            })
+        }
+        ret
+    }
+}
+
+pub struct BytesColumn {
+    data: *mut PyBytes,
+    bytes_buf: Vec<u8>,
+    bytes_lengths: Vec<usize>, // usize::MAX if the string is None
+    row_idx: Vec<usize>,
+    buf_size: usize,
+}
+
+unsafe impl Send for BytesColumn {}
+unsafe impl Sync for BytesColumn {}
+
+impl PandasColumnObject for BytesColumn {
+    fn typecheck(&self, id: TypeId) -> bool {
+        id == TypeId::of::<&'static [u8]>() || id == TypeId::of::<Option<&'static [u8]>>()
+    }
+    fn typename(&self) -> &'static str {
+        std::any::type_name::<&'static [u8]>()
+    }
+    #[throws(ConnectorXPythonError)]
+    fn finalize(&mut self) {
+        self.flush()?;
+    }
+}
+
+impl PandasColumn<Vec<u8>> for BytesColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Vec<u8>, row: usize) {
+        self.bytes_lengths.push(val.len());
+        self.bytes_buf.extend_from_slice(&val[..]);
+        self.row_idx.push(row);
+        self.try_flush()?;
+    }
+}
+
+impl<'r> PandasColumn<&'r [u8]> for BytesColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: &'r [u8], row: usize) {
+        self.bytes_lengths.push(val.len());
+        self.bytes_buf.extend_from_slice(val);
+        self.row_idx.push(row);
+        self.try_flush()?;
+    }
+}
+
+impl PandasColumn<Option<Vec<u8>>> for BytesColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<Vec<u8>>, row: usize) {
+        match val {
+            Some(b) => {
+                self.bytes_lengths.push(b.len());
+                self.bytes_buf.extend_from_slice(&b[..]);
+                self.row_idx.push(row);
+                self.try_flush()?;
+            }
+            None => {
+                self.bytes_lengths.push(usize::MAX);
+                self.row_idx.push(row);
+            }
+        }
+    }
+}
+
+impl<'r> PandasColumn<Option<&'r [u8]>> for BytesColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<&'r [u8]>, row: usize) {
+        match val {
+            Some(b) => {
+                self.bytes_lengths.push(b.len());
+                self.bytes_buf.extend_from_slice(b);
+                self.row_idx.push(row);
+                self.try_flush()?;
+            }
+            None => {
+                self.bytes_lengths.push(usize::MAX);
+                self.row_idx.push(row);
+            }
+        }
+    }
+}
+
+impl HasPandasColumn for Vec<u8> {
+    type PandasColumn<'a> = BytesColumn;
+}
+
+impl HasPandasColumn for Option<Vec<u8>> {
+    type PandasColumn<'a> = BytesColumn;
+}
+
+impl<'r> HasPandasColumn for &'r [u8] {
+    type PandasColumn<'a> = BytesColumn;
+}
+
+impl<'r> HasPandasColumn for Option<&'r [u8]> {
+    type PandasColumn<'a> = BytesColumn;
+}
+
+impl BytesColumn {
+    pub fn partition(self, counts: usize) -> Vec<BytesColumn> {
+        let mut partitions = vec![];
+
+        for _ in 0..counts {
+            partitions.push(BytesColumn {
+                data: self.data,
+                bytes_lengths: vec![],
+                row_idx: vec![],
+                bytes_buf: Vec::with_capacity(self.buf_size),
+                buf_size: self.buf_size,
+            });
+        }
+
+        partitions
+    }
+
+    #[throws(ConnectorXPythonError)]
+    pub fn flush(&mut self) {
+        let nstrings = self.bytes_lengths.len();
+
+        if nstrings > 0 {
+            let py = unsafe { Python::assume_gil_acquired() };
+
+            {
+                // allocation in python is not thread safe
+                let _guard = GIL_MUTEX
+                    .lock()
+                    .map_err(|e| anyhow!("mutex poisoned {}", e))?;
+                let mut start = 0;
+                for (i, &len) in self.bytes_lengths.iter().enumerate() {
+                    if len != usize::MAX {
+                        let end = start + len;
+                        unsafe {
+                            // allocate and write in the same time
+                            *self.data.add(self.row_idx[i]) = PyBytes(
+                                pyo3::types::PyBytes::new(py, &self.bytes_buf[start..end]).into(),
+                            );
+                        };
+                        start = end;
+                    } else {
+                        unsafe {
+                            let b = Py::from_borrowed_ptr(py, pyo3::ffi::Py_None());
+                            *self.data.add(self.row_idx[i]) = PyBytes(b);
+                        }
+                    }
+                }
+            }
+
+            self.bytes_buf.truncate(0);
+            self.bytes_lengths.truncate(0);
+            self.row_idx.truncate(0);
+        }
+    }
+
+    #[throws(ConnectorXPythonError)]
+    pub fn try_flush(&mut self) {
+        if self.bytes_buf.len() >= self.buf_size {
+            self.flush()?;
+        }
+    }
+}
diff --git a/connectorx-python/src/pandas/pandas_columns/datetime.rs b/connectorx-python/src/pandas/pandas_columns/datetime.rs
new file mode 100644
index 0000000..1c927d9
--- /dev/null
+++ b/connectorx-python/src/pandas/pandas_columns/datetime.rs
@@ -0,0 +1,100 @@
+use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject};
+use crate::errors::ConnectorXPythonError;
+use anyhow::anyhow;
+use chrono::{DateTime, Utc};
+use fehler::throws;
+use ndarray::{ArrayViewMut2, Axis, Ix2};
+use numpy::PyArray;
+use pyo3::{FromPyObject, PyAny, PyResult};
+use std::any::TypeId;
+
+// datetime64 is represented in int64 in numpy
+// https://github.com/numpy/numpy/blob/master/numpy/core/include/numpy/npy_common.h#L1104
+pub struct DateTimeBlock<'a> {
+    data: ArrayViewMut2<'a, i64>,
+}
+
+impl<'a> FromPyObject<'a> for DateTimeBlock<'a> {
+    fn extract(ob: &'a PyAny) -> PyResult<Self> {
+        check_dtype(ob, "int64")?;
+        let array = ob.downcast::<PyArray<i64, Ix2>>()?;
+        let data = unsafe { array.as_array_mut() };
+        Ok(DateTimeBlock { data })
+    }
+}
+
+impl<'a> DateTimeBlock<'a> {
+    #[throws(ConnectorXPythonError)]
+    pub fn split(self) -> Vec<DateTimeColumn> {
+        let mut ret = vec![];
+        let mut view = self.data;
+
+        let nrows = view.ncols();
+        while view.nrows() > 0 {
+            let (col, rest) = view.split_at(Axis(0), 1);
+            view = rest;
+            ret.push(DateTimeColumn {
+                data: col
+                    .into_shape(nrows)?
+                    .into_slice()
+                    .ok_or_else(|| anyhow!("get None for splitted DateTime data"))?
+                    .as_mut_ptr(),
+            })
+        }
+        ret
+    }
+}
+
+pub struct DateTimeColumn {
+    data: *mut i64,
+}
+
+unsafe impl Send for DateTimeColumn {}
+unsafe impl Sync for DateTimeColumn {}
+
+impl PandasColumnObject for DateTimeColumn {
+    fn typecheck(&self, id: TypeId) -> bool {
+        id == TypeId::of::<DateTime<Utc>>() || id == TypeId::of::<Option<DateTime<Utc>>>()
+    }
+
+    fn typename(&self) -> &'static str {
+        std::any::type_name::<DateTime<Utc>>()
+    }
+}
+
+impl PandasColumn<DateTime<Utc>> for DateTimeColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: DateTime<Utc>, row: usize) {
+        unsafe { *self.data.add(row) = val.timestamp_nanos() };
+    }
+}
+
+impl PandasColumn<Option<DateTime<Utc>>> for DateTimeColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<DateTime<Utc>>, row: usize) {
+        // numpy use i64::MIN as NaT
+        unsafe {
+            *self.data.add(row) = val.map(|t| t.timestamp_nanos()).unwrap_or(i64::MIN);
+        };
+    }
+}
+
+impl HasPandasColumn for DateTime<Utc> {
+    type PandasColumn<'a> = DateTimeColumn;
+}
+
+impl HasPandasColumn for Option<DateTime<Utc>> {
+    type PandasColumn<'a> = DateTimeColumn;
+}
+
+impl DateTimeColumn {
+    pub fn partition(self, counts: usize) -> Vec<DateTimeColumn> {
+        let mut partitions = vec![];
+
+        for _ in 0..counts {
+            partitions.push(DateTimeColumn { data: self.data });
+        }
+
+        partitions
+    }
+}
diff --git a/connectorx-python/src/pandas/pandas_columns/float64.rs b/connectorx-python/src/pandas/pandas_columns/float64.rs
new file mode 100644
index 0000000..7d437bd
--- /dev/null
+++ b/connectorx-python/src/pandas/pandas_columns/float64.rs
@@ -0,0 +1,96 @@
+use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject};
+use crate::errors::ConnectorXPythonError;
+use anyhow::anyhow;
+use fehler::throws;
+use ndarray::{ArrayViewMut2, Axis, Ix2};
+use numpy::PyArray;
+use pyo3::{FromPyObject, PyAny, PyResult};
+use std::any::TypeId;
+
+// Float
+pub struct Float64Block<'a> {
+    data: ArrayViewMut2<'a, f64>,
+}
+
+impl<'a> FromPyObject<'a> for Float64Block<'a> {
+    fn extract(ob: &'a PyAny) -> PyResult<Self> {
+        check_dtype(ob, "float64")?;
+        let array = ob.downcast::<PyArray<f64, Ix2>>()?;
+        let data = unsafe { array.as_array_mut() };
+        Ok(Float64Block { data })
+    }
+}
+
+impl<'a> Float64Block<'a> {
+    #[throws(ConnectorXPythonError)]
+    pub fn split(self) -> Vec<Float64Column> {
+        let mut ret = vec![];
+        let mut view = self.data;
+
+        let nrows = view.ncols();
+        while view.nrows() > 0 {
+            let (col, rest) = view.split_at(Axis(0), 1);
+            view = rest;
+            ret.push(Float64Column {
+                data: col
+                    .into_shape(nrows)?
+                    .into_slice()
+                    .ok_or_else(|| anyhow!("get None for splitted Float64 data"))?
+                    .as_mut_ptr(),
+            })
+        }
+        ret
+    }
+}
+
+pub struct Float64Column {
+    data: *mut f64,
+}
+
+unsafe impl Send for Float64Column {}
+unsafe impl Sync for Float64Column {}
+
+impl<'a> PandasColumnObject for Float64Column {
+    fn typecheck(&self, id: TypeId) -> bool {
+        id == TypeId::of::<f64>() || id == TypeId::of::<Option<f64>>()
+    }
+
+    fn typename(&self) -> &'static str {
+        std::any::type_name::<f64>()
+    }
+}
+
+impl<'a> PandasColumn<f64> for Float64Column {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: f64, row: usize) {
+        unsafe { *self.data.add(row) = val };
+    }
+}
+
+impl<'a> PandasColumn<Option<f64>> for Float64Column {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<f64>, row: usize) {
+        match val {
+            // float64 blocks carry no mask; NaN doubles as the null marker here
+            None => unsafe { *self.data.add(row) = f64::NAN },
+            Some(val) => unsafe { *self.data.add(row) = val },
+        }
+    }
+}
+
+impl HasPandasColumn for f64 {
+    type PandasColumn<'a> = Float64Column;
+}
+
+impl HasPandasColumn for Option<f64> {
+    type PandasColumn<'a> = Float64Column;
+}
+
+impl Float64Column {
+    pub fn partition(self, counts: usize) -> Vec<Float64Column> {
+        let mut partitions = vec![];
+        for _ in 0..counts {
+            partitions.push(Float64Column { data: self.data });
+        }
+        partitions
+    }
+}
diff --git a/connectorx-python/src/pandas/pandas_columns/int64.rs b/connectorx-python/src/pandas/pandas_columns/int64.rs
new file mode 100644
index 0000000..cfdbfd5
--- /dev/null
+++ b/connectorx-python/src/pandas/pandas_columns/int64.rs
@@ -0,0 +1,142 @@
+use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject};
+use crate::errors::ConnectorXPythonError;
+use anyhow::anyhow;
+use fehler::throws;
+use ndarray::{ArrayViewMut1, ArrayViewMut2, Axis, Ix2};
+use numpy::{PyArray, PyArray1};
+use pyo3::{types::PyTuple, FromPyObject, PyAny, PyResult};
+use std::any::TypeId;
+
+pub enum Int64Block<'a> {
+    NumPy(ArrayViewMut2<'a, i64>),
+    Extension(ArrayViewMut1<'a, i64>, ArrayViewMut1<'a, bool>),
+}
+impl<'a> FromPyObject<'a> for Int64Block<'a> {
+    fn extract(ob: &'a PyAny) -> PyResult<Self> {
+        if let Ok(array) = ob.downcast::<PyArray<i64, Ix2>>() {
+            check_dtype(ob, "int64")?;
+            let data = unsafe { array.as_array_mut() };
+            Ok(Int64Block::NumPy(data))
+        } else {
+            let tuple = ob.downcast::<PyTuple>()?;
+            let data = tuple.get_item(0)?;
+            let mask = tuple.get_item(1)?;
+            check_dtype(data, "int64")?;
+            check_dtype(mask, "bool")?;
+
+            Ok(Int64Block::Extension(
+                unsafe { data.downcast::<PyArray1<i64>>()?.as_array_mut() },
+                unsafe { mask.downcast::<PyArray1<bool>>()?.as_array_mut() },
+            ))
+        }
+    }
+}
+
+impl<'a> Int64Block<'a> {
+    #[throws(ConnectorXPythonError)]
+    pub fn split(self) -> Vec<Int64Column> {
+        let mut ret = vec![];
+        match self {
+            Int64Block::Extension(data, mask) => ret.push(Int64Column {
+                data: data
+                    .into_slice()
+                    .ok_or_else(|| anyhow!("get None for Int64 data"))?
+                    .as_mut_ptr(),
+                mask: Some(
+                    mask.into_slice()
+                        .ok_or_else(|| anyhow!("get None for Int64 mask"))?
+                        .as_mut_ptr(),
+                ),
+            }),
+            Int64Block::NumPy(mut view) => {
+                let nrows = view.ncols();
+                while view.nrows() > 0 {
+                    let (col, rest) = view.split_at(Axis(0), 1);
+                    view = rest;
+                    ret.push(Int64Column {
+                        data: col
+                            .into_shape(nrows)?
+                            .into_slice()
+                            .ok_or_else(|| anyhow!("get None for splitted Int64 data"))?
+                            .as_mut_ptr(),
+                        mask: None,
+                    })
+                }
+            }
+        }
+        ret
+    }
+}
+
+// for uint64 and Int64
+pub struct Int64Column {
+    data: *mut i64,
+    mask: Option<*mut bool>,
+}
+
+unsafe impl Send for Int64Column {}
+unsafe impl Sync for Int64Column {}
+
+impl PandasColumnObject for Int64Column {
+    fn typecheck(&self, id: TypeId) -> bool {
+        id == TypeId::of::<i64>() || id == TypeId::of::<Option<i64>>()
+    }
+
+    fn typename(&self) -> &'static str {
+        std::any::type_name::<i64>()
+    }
+}
+
+impl PandasColumn<i64> for Int64Column {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: i64, row: usize) {
+        unsafe { *self.data.add(row) = val };
+        if let Some(mask) = self.mask.as_mut() {
+            unsafe { *mask.add(row) = false };
+        }
+    }
+}
+
+impl PandasColumn<Option<i64>> for Int64Column {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<i64>, row: usize) {
+        match val {
+            Some(val) => {
+                unsafe { *self.data.add(row) = val };
+                if let Some(mask) = self.mask.as_mut() {
+                    unsafe { *mask.add(row) = false };
+                }
+            }
+            None => {
+                if let Some(mask) = self.mask.as_mut() {
+                    unsafe { *mask.add(row) = true };
+                } else {
+                    panic!("Writing null i64 to not null pandas array")
+                }
+            }
+        }
+    }
+}
+
+impl HasPandasColumn for i64 {
+    type PandasColumn<'a> = Int64Column;
+}
+
+impl HasPandasColumn for Option<i64> {
+    type PandasColumn<'a> = Int64Column;
+}
+
+impl Int64Column {
+    pub fn partition(self, counts: usize) -> Vec<Int64Column> {
+        let mut partitions = vec![];
+
+        for _ in 0..counts {
+            partitions.push(Int64Column {
+                data: self.data,
+                mask: self.mask,
+            });
+        }
+
+        partitions
+    }
+}
diff --git a/connectorx-python/src/pandas/pandas_columns/mod.rs b/connectorx-python/src/pandas/pandas_columns/mod.rs
new file mode 100644
index 0000000..82ababc
--- /dev/null
+++ b/connectorx-python/src/pandas/pandas_columns/mod.rs
@@ -0,0 +1,58 @@
+mod array;
+mod boolean;
+mod bytes;
+mod datetime;
+mod float64;
+mod int64;
+mod string;
+// TODO: use macro for integers
+
+use crate::errors::Result;
+pub use crate::pandas::pandas_columns::array::{ArrayBlock, ArrayColumn, PyList};
+pub use crate::pandas::pandas_columns::bytes::{BytesBlock, BytesColumn, PyBytes};
+pub use boolean::{BooleanBlock, BooleanColumn};
+pub use datetime::{DateTimeBlock, DateTimeColumn};
+use fehler::throw;
+pub use float64::{Float64Block, Float64Column};
+pub use int64::{Int64Block, Int64Column};
+use pyo3::{exceptions::PyRuntimeError, PyAny, PyResult};
+use std::any::TypeId;
+use std::sync::Mutex;
+pub use string::{StringBlock, StringColumn};
+
+// A global GIL lock for Python object allocations like string, bytes and list
+lazy_static! {
+    static ref GIL_MUTEX: Mutex<()> = Mutex::new(());
+}
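+
+// Worker threads fill the numpy buffers without holding the GIL; only the flush
+// paths in the column modules allocate Python objects, so one process-wide mutex
+// is (presumably) enough to keep those allocations serialized.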
+
+pub trait PandasColumnObject: Send {
+    fn typecheck(&self, _: TypeId) -> bool;
+    fn typename(&self) -> &'static str;
+    fn finalize(&mut self) -> Result<()> {
+        Ok(())
+    }
+}
+
+pub trait PandasColumn<V>: Sized + PandasColumnObject {
+    fn write(&mut self, val: V, row: usize) -> Result<()>;
+}
+
+// Indicates a type has an associated pandas column
+pub trait HasPandasColumn: Sized {
+    type PandasColumn<'a>: PandasColumn<Self>;
+}
+
+pub fn check_dtype(ob: &PyAny, expected_dtype: &str) -> PyResult<()> {
+    let dtype = ob.getattr("dtype")?.str()?;
+    let dtype = dtype.to_str()?;
+    if dtype != expected_dtype {
+        throw!(PyRuntimeError::new_err(format!(
+            "expecting ndarray to be '{}' found '{}' at {}:{}",
+            expected_dtype,
+            dtype,
+            file!(),
+            line!()
+        )));
+    }
+    Ok(())
+}
diff --git a/connectorx-python/src/pandas/pandas_columns/string.rs b/connectorx-python/src/pandas/pandas_columns/string.rs
new file mode 100644
index 0000000..5f38a00
--- /dev/null
+++ b/connectorx-python/src/pandas/pandas_columns/string.rs
@@ -0,0 +1,328 @@
+use super::super::pystring::{PyString, StringInfo};
+use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject, GIL_MUTEX};
+use crate::constants::PYSTRING_BUFFER_SIZE;
+use crate::errors::ConnectorXPythonError;
+use anyhow::anyhow;
+use fehler::throws;
+use itertools::Itertools;
+use ndarray::{ArrayViewMut2, Axis, Ix2};
+use numpy::PyArray;
+use pyo3::{FromPyObject, PyAny, PyResult, Python};
+use std::any::TypeId;
+
+pub struct StringBlock<'a> {
+    data: ArrayViewMut2<'a, PyString>,
+    buf_size_mb: usize,
+}
+
+impl<'a> FromPyObject<'a> for StringBlock<'a> {
+    fn extract(ob: &'a PyAny) -> PyResult<Self> {
+        check_dtype(ob, "object")?;
+        let array = ob.downcast::<PyArray<PyString, Ix2>>()?;
+        let data = unsafe { array.as_array_mut() };
+        Ok(StringBlock {
+            data,
+            buf_size_mb: PYSTRING_BUFFER_SIZE, // in MB
+        })
+    }
+}
+
+impl<'a> StringBlock<'a> {
+    #[throws(ConnectorXPythonError)]
+    pub fn split(self) -> Vec<StringColumn> {
+        let mut ret = vec![];
+        let mut view = self.data;
+
+        let nrows = view.ncols();
+        while view.nrows() > 0 {
+            let (col, rest) = view.split_at(Axis(0), 1);
+            view = rest;
+            ret.push(StringColumn {
+                data: col
+                    .into_shape(nrows)?
+                    .into_slice()
+                    .ok_or_else(|| anyhow!("get None for splitted String data"))?
+                    .as_mut_ptr(),
+                string_lengths: vec![],
+                row_idx: vec![],
+                string_buf: Vec::with_capacity(self.buf_size_mb * (1 << 20) * 11 / 10), // allocate a little bit more memory to avoid Vec growth
+                buf_size: self.buf_size_mb * (1 << 20),
+            })
+        }
+        ret
+    }
+}
+
+pub struct StringColumn {
+    data: *mut PyString,
+    string_buf: Vec<u8>,
+    string_lengths: Vec<usize>, // usize::MAX if the string is None
+    row_idx: Vec<usize>,
+    buf_size: usize,
+}
+
+unsafe impl Send for StringColumn {}
+unsafe impl Sync for StringColumn {}
+
+impl PandasColumnObject for StringColumn {
+    fn typecheck(&self, id: TypeId) -> bool {
+        id == TypeId::of::<&'static [u8]>() || id == TypeId::of::<Option<&'static [u8]>>()
+    }
+
+    fn typename(&self) -> &'static str {
+        std::any::type_name::<&'static [u8]>()
+    }
+    #[throws(ConnectorXPythonError)]
+    fn finalize(&mut self) {
+        self.flush(true)?;
+    }
+}
+
+impl<'r> PandasColumn<&'r str> for StringColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: &'r str, row: usize) {
+        let bytes = val.as_bytes();
+        self.string_lengths.push(bytes.len());
+        self.string_buf.extend_from_slice(bytes);
+        self.row_idx.push(row);
+        self.try_flush()?;
+    }
+}
+
+impl PandasColumn<Box<str>> for StringColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Box<str>, row: usize) {
+        let bytes = val.as_bytes();
+        self.string_lengths.push(bytes.len());
+        self.string_buf.extend_from_slice(bytes);
+        self.row_idx.push(row);
+        self.try_flush()?;
+    }
+}
+
+impl PandasColumn<String> for StringColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: String, row: usize) {
+        let bytes = val.as_bytes();
+        self.string_lengths.push(bytes.len());
+        self.string_buf.extend_from_slice(bytes);
+        self.row_idx.push(row);
+        self.try_flush()?;
+    }
+}
+
+impl PandasColumn<char> for StringColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: char, row: usize) {
+        let mut buffer = [0; 4]; // a char is max to 4 bytes
+        let bytes = val.encode_utf8(&mut buffer).as_bytes();
+        self.string_lengths.push(bytes.len());
+        self.string_buf.extend_from_slice(bytes);
+        self.row_idx.push(row);
+        self.try_flush()?;
+    }
+}
+
+impl<'r> PandasColumn<Option<&'r str>> for StringColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<&'r str>, row: usize) {
+        match val {
+            Some(b) => {
+                let bytes = b.as_bytes();
+                self.string_lengths.push(bytes.len());
+                self.string_buf.extend_from_slice(bytes);
+                self.row_idx.push(row);
+                self.try_flush()?;
+            }
+            None => {
+                self.string_lengths.push(usize::MAX);
+                self.row_idx.push(row);
+            }
+        }
+    }
+}
+
+impl PandasColumn<Option<Box<str>>> for StringColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<Box<str>>, row: usize) {
+        match val {
+            Some(b) => {
+                let bytes = b.as_bytes();
+                self.string_lengths.push(bytes.len());
+                self.string_buf.extend_from_slice(bytes);
+                self.row_idx.push(row);
+                self.try_flush()?;
+            }
+            None => {
+                self.string_lengths.push(usize::MAX);
+                self.row_idx.push(row);
+            }
+        }
+    }
+}
+impl PandasColumn<Option<String>> for StringColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<String>, row: usize) {
+        match val {
+            Some(b) => {
+                let bytes = b.as_bytes();
+                self.string_lengths.push(bytes.len());
+                self.string_buf.extend_from_slice(bytes);
+                self.row_idx.push(row);
+                self.try_flush()?;
+            }
+            None => {
+                self.string_lengths.push(usize::MAX);
+                self.row_idx.push(row);
+            }
+        }
+    }
+}
+
+impl PandasColumn<Option<char>> for StringColumn {
+    #[throws(ConnectorXPythonError)]
+    fn write(&mut self, val: Option<char>, row: usize) {
+        match val {
+            Some(b) => {
+                let mut buffer = [0; 4]; // a char is max to 4 bytes
+                let bytes = b.encode_utf8(&mut buffer).as_bytes();
+                self.string_lengths.push(bytes.len());
+                self.string_buf.extend_from_slice(bytes);
+                self.row_idx.push(row);
+                self.try_flush()?;
+            }
+            None => {
+                self.string_lengths.push(usize::MAX);
+                self.row_idx.push(row);
+            }
+        }
+    }
+}
+
+impl<'r> HasPandasColumn for &'r str {
+    type PandasColumn<'a> = StringColumn;
+}
+
+impl<'r> HasPandasColumn for Option<&'r str> {
+    type PandasColumn<'a> = StringColumn;
+}
+
+impl HasPandasColumn for String {
+    type PandasColumn<'a> = StringColumn;
+}
+
+impl HasPandasColumn for Option<String> {
+    type PandasColumn<'a> = StringColumn;
+}
+
+impl HasPandasColumn for char {
+    type PandasColumn<'a> = StringColumn;
+}
+
+impl HasPandasColumn for Option<char> {
+    type PandasColumn<'a> = StringColumn;
+}
+
+impl HasPandasColumn for Box<str> {
+    type PandasColumn<'a> = StringColumn;
+}
+
+impl HasPandasColumn for Option<Box<str>> {
+    type PandasColumn<'a> = StringColumn;
+}
+
+impl StringColumn {
+    pub fn partition(self, counts: usize) -> Vec<StringColumn> {
+        let mut partitions = vec![];
+
+        for _ in 0..counts {
+            partitions.push(StringColumn {
+                data: self.data,
+                string_lengths: vec![],
+                row_idx: vec![],
+                string_buf: Vec::with_capacity(self.buf_size),
+                buf_size: self.buf_size,
+            });
+        }
+
+        partitions
+    }
+
+    #[throws(ConnectorXPythonError)]
+    pub fn flush(&mut self, force: bool) {
+        let nstrings = self.string_lengths.len();
+        if nstrings == 0 {
+            return;
+        }
+
+        let guard = if force {
+            GIL_MUTEX
+                .lock()
+                .map_err(|e| anyhow!("mutex poisoned {}", e))?
+        } else {
+            match GIL_MUTEX.try_lock() {
+                Ok(guard) => guard,
+                Err(_) => return,
+            }
+        };
+        let py = unsafe { Python::assume_gil_acquired() };
+
+        let mut string_infos = Vec::with_capacity(self.string_lengths.len());
+        let mut start = 0;
+        for (i, &len) in self.string_lengths.iter().enumerate() {
+            if len != usize::MAX {
+                let end = start + len;
+
+                unsafe {
+                    let string_info = StringInfo::detect(&self.string_buf[start..end]);
+                    *self.data.add(self.row_idx[i]) = string_info.pystring(py);
+                    string_infos.push(Some(string_info));
+                };
+
+                start = end;
+            } else {
+                string_infos.push(None);
+
+                unsafe { *self.data.add(self.row_idx[i]) = PyString::none(py) };
+            }
+        }
+
+        // release the allocation lock: the raw copies below don't touch the Python allocator
+        std::mem::drop(guard);
+
+        if !string_infos.is_empty() {
+            let mut start = 0;
+            for (i, (len, info)) in self
+                .string_lengths
+                .drain(..)
+                .zip_eq(string_infos)
+                .enumerate()
+            {
+                if len != usize::MAX {
+                    let end = start + len;
+                    unsafe {
+                        (*self.data.add(self.row_idx[i]))
+                            .write(&self.string_buf[start..end], info.unwrap())
+                    };
+
+                    start = end;
+                }
+            }
+
+            self.string_buf.truncate(0);
+            self.row_idx.truncate(0);
+        }
+    }
+
+    #[throws(ConnectorXPythonError)]
+    pub fn try_flush(&mut self) {
+        if self.string_buf.len() >= self.buf_size {
+            self.flush(true)?;
+            return;
+        }
+        #[cfg(feature = "nbstr")]
+        if self.string_buf.len() >= self.buf_size / 2 {
+            self.flush(false)?;
+        }
+    }
+}
diff --git a/connectorx-python/src/pandas/pystring.rs b/connectorx-python/src/pandas/pystring.rs
new file mode 100644
index 0000000..84f39d5
--- /dev/null
+++ b/connectorx-python/src/pandas/pystring.rs
@@ -0,0 +1,165 @@
+use bitfield::bitfield;
+use numpy::{npyffi::NPY_TYPES, Element, PyArrayDescr};
+use pyo3::{ffi, Py, Python};
+use std::str::from_utf8_unchecked;
+
+#[derive(Clone, Debug)]
+#[repr(transparent)]
+pub struct PyString(Py<pyo3::types::PyString>);
+
+// In order to put it into a numpy array
+unsafe impl Element for PyString {
+    const DATA_TYPE: numpy::DataType = numpy::DataType::Object;
+    fn is_same_type(dtype: &PyArrayDescr) -> bool {
+        unsafe { *dtype.as_dtype_ptr() }.type_num == NPY_TYPES::NPY_OBJECT as i32
+    }
+}
+
+#[derive(Clone, Copy)]
+pub enum StringInfo {
+    ASCII(usize), // len of the string, not byte length
+    UCS1(usize),
+    UCS2(usize),
+    UCS4(usize),
+}
+
+impl StringInfo {
+    pub unsafe fn detect(s: &[u8]) -> StringInfo {
+        let s = from_utf8_unchecked(s);
+        let mut maxchar = 0;
+        let mut len = 0;
+
+        for ch in s.chars() {
+            if ch as u32 > maxchar {
+                maxchar = ch as u32;
+            }
+            len += 1;
+        }
+
+        if maxchar <= 0x7F {
+            StringInfo::ASCII(len)
+        } else if maxchar <= 0xFF {
+            StringInfo::UCS1(len)
+        } else if maxchar <= 0xFFFF {
+            StringInfo::UCS2(len)
+        } else {
+            StringInfo::UCS4(len)
+        }
+    }
+
+    pub fn pystring(&self, py: Python) -> PyString {
+        let objptr = unsafe {
+            match self {
+                StringInfo::ASCII(len) => ffi::PyUnicode_New(*len as ffi::Py_ssize_t, 0x7F),
+                StringInfo::UCS1(len) => ffi::PyUnicode_New(*len as ffi::Py_ssize_t, 0xFF),
+                StringInfo::UCS2(len) => ffi::PyUnicode_New(*len as ffi::Py_ssize_t, 0xFFFF),
+                StringInfo::UCS4(len) => ffi::PyUnicode_New(*len as ffi::Py_ssize_t, 0x10FFFF),
+            }
+        };
+
+        let s: Py<pyo3::types::PyString> = unsafe { Py::from_owned_ptr(py, objptr) };
+
+        PyString(s)
+    }
+}
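+
+// PyUnicode_New(len, maxchar) picks the most compact PEP 393 representation for
+// the given maximum code point: "abc" stays ASCII (1 byte/char), "café" becomes
+// UCS1, "日本語" UCS2, and anything above U+FFFF (e.g. emoji) UCS4.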
+
+impl PyString {
+    // get none string converted from none object, otherwise default strings are zeros
+    pub fn none(py: Python) -> PyString {
+        // This is very unsafe because Py_None is not a PyString from Rust's perspective,
+        // but it is fine because these values are only ever handed back to Python as
+        // generic objects.
+        let s = unsafe { Py::from_borrowed_ptr(py, ffi::Py_None()) };
+        PyString(s)
+    }
+
+    // the val should be same as the val used for new
+    pub unsafe fn write(&mut self, data: &[u8], info: StringInfo) {
+        match info {
+            StringInfo::ASCII(len) => {
+                let pyobj = PyASCIIObject::from_mut_ref(&mut self.0);
+                let buf = std::slice::from_raw_parts_mut(
+                    (pyobj as *mut PyASCIIObject).offset(1) as *mut u8,
+                    len as usize,
+                );
+
+                buf.copy_from_slice(data);
+            }
+            StringInfo::UCS1(len) => {
+                let pyobj = PyCompactUnicodeObject::from_mut_ref(&mut self.0);
+                let buf = std::slice::from_raw_parts_mut(
+                    (pyobj as *mut PyCompactUnicodeObject).offset(1) as *mut u8,
+                    len as usize,
+                );
+                let data: Vec<u8> = from_utf8_unchecked(data).chars().map(|c| c as u8).collect();
+                buf.copy_from_slice(&data);
+            }
+            StringInfo::UCS2(len) => {
+                let pyobj = PyCompactUnicodeObject::from_mut_ref(&mut self.0);
+                let buf = std::slice::from_raw_parts_mut(
+                    (pyobj as *mut PyCompactUnicodeObject).offset(1) as *mut u16,
+                    len as usize,
+                );
+                let data: Vec<u16> = from_utf8_unchecked(data)
+                    .chars()
+                    .map(|c| c as u16)
+                    .collect();
+                buf.copy_from_slice(&data);
+            }
+            StringInfo::UCS4(len) => {
+                let pyobj = PyCompactUnicodeObject::from_mut_ref(&mut self.0);
+                let buf = std::slice::from_raw_parts_mut(
+                    (pyobj as *mut PyCompactUnicodeObject).offset(1) as *mut u32,
+                    len as usize,
+                );
+                let data: Vec<u32> = from_utf8_unchecked(data)
+                    .chars()
+                    .map(|c| c as u32)
+                    .collect();
+                buf.copy_from_slice(&data);
+            }
+        }
+    }
+}
+
+bitfield! {
+    struct PyUnicodeState(u32);
+    u32;
+    interned, _: 1, 0;
+    kind, _: 4, 2;
+    compact, _: 5, 5;
+    ascii, _: 6, 6;
+    ready, _: 7, 7;
+}
+
+#[repr(C)]
+pub struct PyASCIIObject {
+    obj: ffi::PyObject,
+    length: ffi::Py_ssize_t,
+    hash: ffi::Py_hash_t,
+    state: PyUnicodeState,
+    wstr: *mut u8,
+    // python string stores data right after all the fields
+}
+
+impl PyASCIIObject {
+    pub unsafe fn from_mut_ref<'a>(obj: &'a mut Py<pyo3::types::PyString>) -> &'a mut Self {
+        let ascii: &mut &mut PyASCIIObject = std::mem::transmute(obj);
+        *ascii
+    }
+}
+
+#[repr(C)]
+pub struct PyCompactUnicodeObject {
+    base: PyASCIIObject,
+    utf8_length: ffi::Py_ssize_t,
+    utf8: *mut u8,
+    wstr_length: ffi::Py_ssize_t,
+    // python string stores data right after all the fields
+}
+
+impl PyCompactUnicodeObject {
+    pub unsafe fn from_mut_ref<'a>(obj: &'a mut Py<pyo3::types::PyString>) -> &'a mut Self {
+        let unicode: &mut &mut PyCompactUnicodeObject = std::mem::transmute(obj);
+        *unicode
+    }
+}
diff --git a/connectorx-python/src/pandas/transports/bigquery.rs b/connectorx-python/src/pandas/transports/bigquery.rs
new file mode 100644
index 0000000..2648662
--- /dev/null
+++ b/connectorx-python/src/pandas/transports/bigquery.rs
@@ -0,0 +1,56 @@
+use crate::errors::ConnectorXPythonError;
+use crate::pandas::destination::PandasDestination;
+use crate::pandas::typesystem::PandasTypeSystem;
+use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+use connectorx::{
+    impl_transport,
+    sources::bigquery::{BigQuerySource, BigQueryTypeSystem},
+    typesystem::TypeConversion,
+};
+
+pub struct BigQueryPandasTransport<'py>(&'py ());
+
+impl_transport!(
+    name = BigQueryPandasTransport<'tp>,
+    error = ConnectorXPythonError,
+    systems = BigQueryTypeSystem => PandasTypeSystem,
+    route = BigQuerySource => PandasDestination<'tp>,
+    mappings = {
+        { Bool[bool] => Bool[bool] | conversion auto }
+        { Boolean[bool] => Bool[bool] | conversion none }
+        { Int64[i64] => I64[i64] | conversion auto }
+        { Integer[i64] => I64[i64] | conversion none }
+        { Float64[f64] => F64[f64] | conversion auto }
+        { Float[f64] => F64[f64] | conversion none }
+        { Numeric[f64] => F64[f64] | conversion none }
+        { Bignumeric[f64] => F64[f64] | conversion none }
+        { String[String] => String[String] | conversion auto }
+        { Bytes[String] => String[String] | conversion none }
+        { Date[NaiveDate] => DateTime[DateTime<Utc>] | conversion option }
+        { Datetime[NaiveDateTime] => DateTime[DateTime<Utc>] | conversion option }
+        { Time[NaiveTime] => String[String] | conversion option }
+        { Timestamp[DateTime<Utc>] => DateTime[DateTime<Utc>] | conversion auto }
+    }
+);
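+
+// A note on the mapping DSL used by impl_transport!: each rule reads
+// { SrcVariant[RustType] => DstVariant[RustType] | conversion X }, where `auto`
+// lets the macro generate the Rust-level conversion itself, `option` marks a pair
+// whose TypeConversion impl is written out by hand (like the ones below), and
+// `none` adds no new conversion because the same Rust type pair is already covered.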
ByteSlice[&'r [u8]] | conversion none } + { Numeric[Decimal] => F64[f64] | conversion option } + { Decimal[Decimal] => F64[f64] | conversion none } + { Datetime[NaiveDateTime] => DateTime[DateTime] | conversion option } + { Datetime2[NaiveDateTime] => DateTime[DateTime] | conversion none } + { Smalldatetime[NaiveDateTime] => DateTime[DateTime] | conversion none } + { Date[NaiveDate] => DateTime[DateTime] | conversion option } + { Datetimeoffset[DateTime] => DateTime[DateTime] | conversion auto } + { Uniqueidentifier[Uuid] => String[String] | conversion option } + { Time[NaiveTime] => String[String] | conversion option } + { SmallMoney[f32] => F64[f64] | conversion none } + { Money[f64] => F64[f64] | conversion none } + } +); + +impl<'py> TypeConversion for MsSQLPandasTransport<'py> { + fn convert(val: IntN) -> i64 { + val.0 + } +} + +impl<'py> TypeConversion for MsSQLPandasTransport<'py> { + fn convert(val: FloatN) -> f64 { + val.0 + } +} + +impl<'py> TypeConversion> for MsSQLPandasTransport<'py> { + fn convert(val: NaiveDateTime) -> DateTime { + DateTime::from_naive_utc_and_offset(val, Utc) + } +} + +impl<'py> TypeConversion> for MsSQLPandasTransport<'py> { + fn convert(val: NaiveDate) -> DateTime { + DateTime::from_naive_utc_and_offset( + val.and_hms_opt(0, 0, 0) + .unwrap_or_else(|| panic!("and_hms_opt got None from {:?}", val)), + Utc, + ) + } +} + +impl<'py> TypeConversion for MsSQLPandasTransport<'py> { + fn convert(val: Uuid) -> String { + val.to_string() + } +} + +impl<'py> TypeConversion for MsSQLPandasTransport<'py> { + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} + +impl<'py> TypeConversion for MsSQLPandasTransport<'py> { + fn convert(val: NaiveTime) -> String { + val.to_string() + } +} diff --git a/connectorx-python/src/pandas/transports/mysql.rs b/connectorx-python/src/pandas/transports/mysql.rs new file mode 100644 index 0000000..9cd7213 --- /dev/null +++ b/connectorx-python/src/pandas/transports/mysql.rs @@ -0,0 +1,119 @@ +use crate::errors::ConnectorXPythonError; +use crate::pandas::destination::PandasDestination; +use crate::pandas::typesystem::PandasTypeSystem; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use connectorx::{ + impl_transport, + sources::mysql::{BinaryProtocol, MySQLSource, MySQLTypeSystem, TextProtocol}, + typesystem::TypeConversion, +}; +use rust_decimal::prelude::*; +use serde_json::{to_string, Value}; +use std::marker::PhantomData; + +pub struct MysqlPandasTransport<'py, P>(&'py (), PhantomData
<P>
); + +impl_transport!( + name = MysqlPandasTransport<'tp, BinaryProtocol>, + error = ConnectorXPythonError, + systems = MySQLTypeSystem => PandasTypeSystem, + route = MySQLSource => PandasDestination<'tp>, + mappings = { + { Float[f32] => F64[f64] | conversion auto } + { Double[f64] => F64[f64] | conversion auto } + { Tiny[i8] => I64[i64] | conversion auto } + { Short[i16] => I64[i64] | conversion auto } + { Long[i32] => I64[i64] | conversion auto } + { Int24[i32] => I64[i64] | conversion none } + { LongLong[i64] => I64[i64] | conversion auto } + { UTiny[u8] => I64[i64] | conversion auto } + { UShort[u16] => I64[i64] | conversion auto } + { ULong[u32] => I64[i64] | conversion auto } + { UInt24[u32] => I64[i64] | conversion none } + { ULongLong[u64] => F64[f64] | conversion auto } + { Date[NaiveDate] => DateTime[DateTime] | conversion option } + { Time[NaiveTime] => String[String] | conversion option } + { Year[i16] => I64[i64] | conversion none} + { Datetime[NaiveDateTime] => DateTime[DateTime] | conversion option } + { Timestamp[NaiveDateTime] => DateTime[DateTime] | conversion none } + { Decimal[Decimal] => F64[f64] | conversion option } + { VarChar[String] => String[String] | conversion auto } + { Char[String] => String[String] | conversion none } + { Enum[String] => Str[String] | conversion none } + { TinyBlob[Vec] => Bytes[Vec] | conversion auto } + { Blob[Vec] => Bytes[Vec] | conversion none } + { MediumBlob[Vec] => Bytes[Vec] | conversion none } + { LongBlob[Vec] => Bytes[Vec] | conversion none } + { Json[Value] => String[String] | conversion option } + } +); + +impl_transport!( + name = MysqlPandasTransport<'tp, TextProtocol>, + error = ConnectorXPythonError, + systems = MySQLTypeSystem => PandasTypeSystem, + route = MySQLSource => PandasDestination<'tp>, + mappings = { + { Float[f32] => F64[f64] | conversion auto } + { Double[f64] => F64[f64] | conversion auto } + { Tiny[i8] => I64[i64] | conversion auto } + { Short[i16] => I64[i64] | conversion auto } + { Long[i32] => I64[i64] | conversion auto } + { Int24[i32] => I64[i64] | conversion none } + { LongLong[i64] => I64[i64] | conversion auto } + { UTiny[u8] => I64[i64] | conversion auto } + { UShort[u16] => I64[i64] | conversion auto } + { ULong[u32] => I64[i64] | conversion auto } + { UInt24[u32] => I64[i64] | conversion none } + { ULongLong[u64] => F64[f64] | conversion auto } + { Date[NaiveDate] => DateTime[DateTime] | conversion option } + { Time[NaiveTime] => String[String] | conversion option } + { Datetime[NaiveDateTime] => DateTime[DateTime] | conversion option } + { Timestamp[NaiveDateTime] => DateTime[DateTime] | conversion none } + { Year[i16] => I64[i64] | conversion none} + { Decimal[Decimal] => F64[f64] | conversion option } + { VarChar[String] => String[String] | conversion auto } + { Char[String] => String[String] | conversion none } + { Enum[String] => Str[String] | conversion none } + { TinyBlob[Vec] => Bytes[Vec] | conversion auto } + { Blob[Vec] => Bytes[Vec] | conversion none } + { MediumBlob[Vec] => Bytes[Vec] | conversion none } + { LongBlob[Vec] => Bytes[Vec] | conversion none } + { Json[Value] => String[String] | conversion option } + } +); + +impl<'py, P> TypeConversion> for MysqlPandasTransport<'py, P> { + fn convert(val: NaiveDate) -> DateTime { + DateTime::from_naive_utc_and_offset( + val.and_hms_opt(0, 0, 0) + .unwrap_or_else(|| panic!("and_hms_opt got None from {:?}", val)), + Utc, + ) + } +} + +impl<'py, P> TypeConversion for MysqlPandasTransport<'py, P> { + fn convert(val: NaiveTime) -> String { + 
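        // chrono's Display impl renders NaiveTime as "HH:MM:SS", with fractional
        // seconds appended when present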
val.to_string() + } +} + +impl<'py, P> TypeConversion> for MysqlPandasTransport<'py, P> { + fn convert(val: NaiveDateTime) -> DateTime { + DateTime::from_naive_utc_and_offset(val, Utc) + } +} + +impl<'py, P> TypeConversion for MysqlPandasTransport<'py, P> { + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} + +impl<'py, P> TypeConversion for MysqlPandasTransport<'py, P> { + fn convert(val: Value) -> String { + to_string(&val).unwrap() + } +} diff --git a/connectorx-python/src/pandas/transports/oracle.rs b/connectorx-python/src/pandas/transports/oracle.rs new file mode 100644 index 0000000..5ff54fa --- /dev/null +++ b/connectorx-python/src/pandas/transports/oracle.rs @@ -0,0 +1,40 @@ +use crate::errors::ConnectorXPythonError; +use crate::pandas::destination::PandasDestination; +use crate::pandas::typesystem::PandasTypeSystem; +use chrono::{DateTime, NaiveDateTime, Utc}; +use connectorx::{ + impl_transport, + sources::oracle::{OracleSource, OracleTypeSystem}, + typesystem::TypeConversion, +}; + +pub struct OraclePandasTransport<'py>(&'py ()); + +impl_transport!( + name = OraclePandasTransport<'tp>, + error = ConnectorXPythonError, + systems = OracleTypeSystem => PandasTypeSystem, + route = OracleSource => PandasDestination<'tp>, + mappings = { + { NumFloat[f64] => F64[f64] | conversion auto } + { Float[f64] => F64[f64] | conversion none } + { BinaryFloat[f64] => F64[f64] | conversion none } + { BinaryDouble[f64] => F64[f64] | conversion none } + { NumInt[i64] => I64[i64] | conversion auto } + { Blob[Vec] => Bytes[Vec] | conversion auto } + { Clob[String] => String[String] | conversion none } + { VarChar[String] => String[String] | conversion auto } + { Char[String] => String[String] | conversion none } + { NVarChar[String] => String[String] | conversion none } + { NChar[String] => String[String] | conversion none } + { Date[NaiveDateTime] => DateTime[DateTime] | conversion option } + { Timestamp[NaiveDateTime] => DateTime[DateTime] | conversion none } + { TimestampTz[DateTime] => DateTime[DateTime] | conversion auto } + } +); + +impl<'py> TypeConversion> for OraclePandasTransport<'py> { + fn convert(val: NaiveDateTime) -> DateTime { + DateTime::from_naive_utc_and_offset(val, Utc) + } +} diff --git a/connectorx-python/src/pandas/transports/postgres.rs b/connectorx-python/src/pandas/transports/postgres.rs new file mode 100644 index 0000000..f4c6c21 --- /dev/null +++ b/connectorx-python/src/pandas/transports/postgres.rs @@ -0,0 +1,139 @@ +use crate::errors::ConnectorXPythonError; +use crate::pandas::{destination::PandasDestination, typesystem::PandasTypeSystem}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use connectorx::{ + impl_transport, + sources::postgres::{ + BinaryProtocol, CSVProtocol, CursorProtocol, PostgresSource, PostgresTypeSystem, + SimpleProtocol, + }, + typesystem::TypeConversion, +}; +use postgres::NoTls; +use postgres_openssl::MakeTlsConnector; +use rust_decimal::prelude::*; +use serde_json::{to_string, Value}; +use std::collections::HashMap; +use std::marker::PhantomData; +use uuid::Uuid; + +pub struct PostgresPandasTransport<'py, P, C>(&'py (), PhantomData
<P>
, PhantomData); + +macro_rules! impl_postgres_transport { + ($proto:ty, $tls:ty) => { + impl_transport!( + name = PostgresPandasTransport<'tp, $proto, $tls>, + error = ConnectorXPythonError, + systems = PostgresTypeSystem => PandasTypeSystem, + route = PostgresSource<$proto, $tls> => PandasDestination<'tp>, + mappings = { + { Float4[f32] => F64[f64] | conversion auto } + { Float8[f64] => F64[f64] | conversion auto } + { Numeric[Decimal] => F64[f64] | conversion option } + { Int2[i16] => I64[i64] | conversion auto } + { Int4[i32] => I64[i64] | conversion auto } + { Int8[i64] => I64[i64] | conversion auto } + { BoolArray[Vec] => BoolArray[Vec] | conversion auto_vec } + { Int2Array[Vec] => I64Array[Vec] | conversion auto_vec } + { Int4Array[Vec] => I64Array[Vec] | conversion auto_vec } + { Int8Array[Vec] => I64Array[Vec] | conversion auto } + { Float4Array[Vec] => F64Array[Vec] | conversion auto_vec } + { Float8Array[Vec] => F64Array[Vec] | conversion auto } + { NumericArray[Vec] => F64Array[Vec] | conversion option } + { Bool[bool] => Bool[bool] | conversion auto } + { Char[i8] => Char[char] | conversion option } + { Text[&'r str] => Str[&'r str] | conversion auto } + { BpChar[&'r str] => Str[&'r str] | conversion none } + { VarChar[&'r str] => Str[&'r str] | conversion none } + { Name[&'r str] => Str[&'r str] | conversion none } + { Timestamp[NaiveDateTime] => DateTime[DateTime] | conversion option } + { TimestampTz[DateTime] => DateTime[DateTime] | conversion auto } + { Date[NaiveDate] => DateTime[DateTime] | conversion option } + { UUID[Uuid] => String[String] | conversion option } + { JSON[Value] => String[String] | conversion option } + { JSONB[Value] => String[String] | conversion none } + { Time[NaiveTime] => String[String] | conversion option } + { ByteA[Vec] => Bytes[Vec] | conversion auto } + { Enum[&'r str] => Str[&'r str] | conversion none } + { HSTORE[HashMap>] => String[String] | conversion option } + } + ); + } +} + +impl_postgres_transport!(BinaryProtocol, NoTls); +impl_postgres_transport!(BinaryProtocol, MakeTlsConnector); +impl_postgres_transport!(CSVProtocol, NoTls); +impl_postgres_transport!(CSVProtocol, MakeTlsConnector); +impl_postgres_transport!(CursorProtocol, NoTls); +impl_postgres_transport!(CursorProtocol, MakeTlsConnector); +impl_postgres_transport!(SimpleProtocol, NoTls); +impl_postgres_transport!(SimpleProtocol, MakeTlsConnector); + +impl<'py, P, C> TypeConversion>, String> + for PostgresPandasTransport<'py, P, C> +{ + fn convert(val: HashMap>) -> String { + to_string(&val).unwrap() + } +} + +impl<'py, P, C> TypeConversion, Vec> for PostgresPandasTransport<'py, P, C> { + fn convert(val: Vec) -> Vec { + val.into_iter() + .map(|v| { + v.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", v)) + }) + .collect() + } +} + +impl<'py, P, C> TypeConversion for PostgresPandasTransport<'py, P, C> { + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} + +impl<'py, P, C> TypeConversion for PostgresPandasTransport<'py, P, C> { + fn convert(val: NaiveTime) -> String { + val.to_string() + } +} + +impl<'py, P, C> TypeConversion for PostgresPandasTransport<'py, P, C> { + fn convert(val: i8) -> char { + val as u8 as char + } +} + +impl<'py, P, C> TypeConversion> + for PostgresPandasTransport<'py, P, C> +{ + fn convert(val: NaiveDateTime) -> DateTime { + DateTime::from_naive_utc_and_offset(val, Utc) + } +} + +impl<'py, P, C> TypeConversion> for PostgresPandasTransport<'py, P, 
C> { + fn convert(val: NaiveDate) -> DateTime { + DateTime::from_naive_utc_and_offset( + val.and_hms_opt(0, 0, 0) + .unwrap_or_else(|| panic!("and_hms_opt got None from {:?}", val)), + Utc, + ) + } +} + +impl<'py, P, C> TypeConversion for PostgresPandasTransport<'py, P, C> { + fn convert(val: Uuid) -> String { + val.to_string() + } +} + +impl<'py, P, C> TypeConversion for PostgresPandasTransport<'py, P, C> { + fn convert(val: Value) -> String { + to_string(&val).unwrap() + } +} diff --git a/connectorx-python/src/pandas/transports/sqlite.rs b/connectorx-python/src/pandas/transports/sqlite.rs new file mode 100644 index 0000000..122ba29 --- /dev/null +++ b/connectorx-python/src/pandas/transports/sqlite.rs @@ -0,0 +1,52 @@ +use crate::errors::ConnectorXPythonError; +use crate::pandas::destination::PandasDestination; +use crate::pandas::typesystem::PandasTypeSystem; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use connectorx::{ + impl_transport, + sources::sqlite::{SQLiteSource, SQLiteTypeSystem}, + typesystem::TypeConversion, +}; + +pub struct SqlitePandasTransport<'py>(&'py ()); + +impl_transport!( + name = SqlitePandasTransport<'tp>, + error = ConnectorXPythonError, + systems = SQLiteTypeSystem => PandasTypeSystem, + route = SQLiteSource => PandasDestination<'tp>, + mappings = { + { Bool[bool] => Bool[bool] | conversion auto } + { Int8[i64] => I64[i64] | conversion auto } + { Int4[i32] => I64[i64] | conversion auto } + { Int2[i16] => I64[i64] | conversion auto } + { Real[f64] => F64[f64] | conversion auto } + { Text[Box] => BoxStr[Box] | conversion auto } + { Date[NaiveDate] => DateTime[DateTime] | conversion option } + { Time[NaiveTime] => String[String] | conversion option } + { Timestamp[NaiveDateTime] => DateTime[DateTime] | conversion option } + { Blob[Vec] => Bytes[Vec] | conversion auto } + } +); + +impl<'py> TypeConversion> for SqlitePandasTransport<'py> { + fn convert(val: NaiveDateTime) -> DateTime { + DateTime::from_naive_utc_and_offset(val, Utc) + } +} + +impl<'py> TypeConversion> for SqlitePandasTransport<'py> { + fn convert(val: NaiveDate) -> DateTime { + DateTime::from_naive_utc_and_offset( + val.and_hms_opt(0, 0, 0) + .unwrap_or_else(|| panic!("and_hms_opt got None from {:?}", val)), + Utc, + ) + } +} + +impl<'py> TypeConversion for SqlitePandasTransport<'py> { + fn convert(val: NaiveTime) -> String { + val.to_string() + } +} diff --git a/connectorx-python/src/pandas/typesystem.rs b/connectorx-python/src/pandas/typesystem.rs new file mode 100644 index 0000000..35a108b --- /dev/null +++ b/connectorx-python/src/pandas/typesystem.rs @@ -0,0 +1,113 @@ +// Unfortunately, due to the orphan rule, typesystem implementation should be in this crate. 
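+// (Orphan rule recap: the TypeSystem trait is defined in the connectorx crate, so its
+// impl must live in the same crate as the local PandasTypeSystem enum below.)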
+use chrono::{DateTime, Utc}; +use connectorx::impl_typesystem; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum PandasTypeSystem { + F64(bool), + I64(bool), + F64Array(bool), + I64Array(bool), + Bool(bool), + BoolArray(bool), + Char(bool), + Str(bool), + BoxStr(bool), + String(bool), + Bytes(bool), + ByteSlice(bool), + DateTime(bool), +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum PandasBlockType { + Boolean(bool), // bool indicates nullablity + Int64(bool), + Float64, + BooleanArray, + Int64Array, + Float64Array, + String, + DateTime, + Bytes, +} + +pub enum PandasArrayType { + NumpyArray, + IntegerArray, + BooleanArray, + DatetimeArray, +} + +impl From for PandasArrayType { + fn from(ty: PandasBlockType) -> PandasArrayType { + match ty { + PandasBlockType::Boolean(true) => PandasArrayType::BooleanArray, + PandasBlockType::Int64(true) => PandasArrayType::IntegerArray, + PandasBlockType::DateTime => PandasArrayType::DatetimeArray, + _ => PandasArrayType::NumpyArray, + } + } +} + +impl From for PandasBlockType { + fn from(ty: PandasTypeSystem) -> PandasBlockType { + match ty { + PandasTypeSystem::Bool(nullable) => PandasBlockType::Boolean(nullable), + PandasTypeSystem::I64(nullable) => PandasBlockType::Int64(nullable), + PandasTypeSystem::F64(_) => PandasBlockType::Float64, + PandasTypeSystem::BoolArray(_) => PandasBlockType::BooleanArray, + PandasTypeSystem::F64Array(_) => PandasBlockType::Float64Array, + PandasTypeSystem::I64Array(_) => PandasBlockType::Int64Array, + PandasTypeSystem::String(_) + | PandasTypeSystem::BoxStr(_) + | PandasTypeSystem::Str(_) + | PandasTypeSystem::Char(_) => PandasBlockType::String, + PandasTypeSystem::Bytes(_) | PandasTypeSystem::ByteSlice(_) => PandasBlockType::Bytes, + PandasTypeSystem::DateTime(_) => PandasBlockType::DateTime, + } + } +} + +impl_typesystem! 
{ + system = PandasTypeSystem, + mappings = { + { F64 => f64 } + { I64 => i64 } + { F64Array => Vec } + { I64Array => Vec } + { Bool => bool } + { BoolArray => Vec } + { Char => char } + { Str => &'r str } + { BoxStr => Box } + { String => String } + { Bytes => Vec } + { ByteSlice => &'r [u8] } + { DateTime => DateTime } + } +} + +pub trait PandasDType: Sized { + // For initialize a pandas array when creating the pandas dataframe + fn is_masked(&self) -> bool; + fn array_name(&self) -> &'static str; +} + +impl PandasDType for PandasBlockType { + fn is_masked(&self) -> bool { + matches!( + *self, + PandasBlockType::Boolean(true) | PandasBlockType::Int64(true) + ) + } + + fn array_name(&self) -> &'static str { + match *self { + PandasBlockType::Boolean(true) => "BooleanArray", + PandasBlockType::Int64(true) => "IntegerArray", + PandasBlockType::DateTime => "DatetimeArray", + _ => "", + } + } +} diff --git a/connectorx-python/src/read_sql.rs b/connectorx-python/src/read_sql.rs new file mode 100644 index 0000000..e91c5bc --- /dev/null +++ b/connectorx-python/src/read_sql.rs @@ -0,0 +1,83 @@ +use connectorx::{ + partition::{partition, PartitionQuery}, + source_router::parse_source, + sql::CXQuery, +}; +use dict_derive::FromPyObject; +use fehler::throw; +use pyo3::prelude::*; +use pyo3::{exceptions::PyValueError, PyResult}; + +use crate::errors::ConnectorXPythonError; + +#[derive(FromPyObject)] +pub struct PyPartitionQuery { + query: String, + column: String, + min: Option, + max: Option, + num: usize, +} + +impl Into for PyPartitionQuery { + fn into(self) -> PartitionQuery { + PartitionQuery::new( + self.query.as_str(), + self.column.as_str(), + self.min, + self.max, + self.num, + ) + } +} + +pub fn read_sql<'a>( + py: Python<'a>, + conn: &str, + return_type: &str, + protocol: Option<&str>, + queries: Option>, + partition_query: Option, +) -> PyResult<&'a PyAny> { + let source_conn = parse_source(conn, protocol).map_err(|e| ConnectorXPythonError::from(e))?; + let (queries, origin_query) = match (queries, partition_query) { + (Some(queries), None) => (queries.into_iter().map(CXQuery::Naked).collect(), None), + (None, Some(part)) => { + let origin_query = Some(part.query.clone()); + let queries = partition(&part.into(), &source_conn) + .map_err(|e| ConnectorXPythonError::from(e))?; + (queries, origin_query) + } + (Some(_), Some(_)) => throw!(PyValueError::new_err( + "partition_query and queries cannot be both specified", + )), + (None, None) => throw!(PyValueError::new_err( + "partition_query and queries cannot be both None", + )), + }; + + match return_type { + "pandas" => Ok(crate::pandas::write_pandas( + py, + &source_conn, + origin_query, + &queries, + )?), + "arrow" => Ok(crate::arrow::write_arrow( + py, + &source_conn, + origin_query, + &queries, + )?), + "arrow2" => Ok(crate::arrow2::write_arrow( + py, + &source_conn, + origin_query, + &queries, + )?), + _ => Err(PyValueError::new_err(format!( + "return type should be 'pandas' or 'arrow', got '{}'", + return_type + ))), + } +} diff --git a/connectorx/Cargo.toml b/connectorx/Cargo.toml new file mode 100644 index 0000000..e64f2d2 --- /dev/null +++ b/connectorx/Cargo.toml @@ -0,0 +1,105 @@ +[package] +authors = ["SFU Database System Lab "] +description = "Load data from databases to dataframes, the fastest way." 
+documentation = "https://docs.rs/connectorx" +edition = "2018" +license = "MIT" +name = "connectorx" +readme = "../README.md" +repository = "https://github.com/sfu-db/connector-x" +version = "0.3.3-alpha.1" + +[dependencies] +anyhow = "1" +fehler = "1" +itertools = "0.10" +log = "0.4" +rayon = "1" +sqlparser = "0.37" +thiserror = "1" +url = "2" +owning_ref = "0.4" +serde_json = "1" +chrono = "0.4" + +arrow = {workspace = true, optional = true} +arrow2 = {workspace = true, default-features = false, optional = true} +bb8 = {version = "0.7", optional = true} +bb8-tiberius = {version = "0.5", optional = true} +csv = {version = "1", optional = true} +fallible-streaming-iterator = {version = "0.1", optional = true} +futures = {version = "0.3", optional = true} +gcp-bigquery-client = {version = "0.13.0", optional = true} +hex = {version = "0.4", optional = true} +native-tls = {version = "0.2", optional = true} +ndarray = {version = "0.15", optional = true} +num-traits = {version = "0.2", optional = true} +openssl = {version = "0.10", optional = true} +oracle = {version = "0.5", optional = true} +polars = {version = "0.32", optional = true, features=["dtype-u8", "dtype-u16"]} +postgres = {version = "0.19", features = ["with-chrono-0_4", "with-uuid-0_8", "with-serde_json-1"], optional = true} +postgres-native-tls = {version = "0.5", optional = true} +postgres-openssl = {version = "0.5", optional = true} +mysql_common = {version = "0.29", features = ["chrono"], optional = true} +r2d2 = {version = "0.8", optional = true} +r2d2-oracle = {version = "0.6", features = ["chrono"], optional = true} +r2d2_mysql = {version = "23", optional = true} +r2d2_postgres = {version = "0.18.1", optional = true} +r2d2_sqlite = {version = "0.22.0", optional = true} +regex = {version = "1", optional = true} +rusqlite = {version = "0.29.0", features = ["column_decltype", "chrono", "bundled"], optional = true} +rust_decimal = {version = "1", features = ["db-postgres"], optional = true} +rust_decimal_macros = {version = "1", optional = true} +tiberius = {version = "0.5", features = ["rust_decimal", "chrono", "integrated-auth-gssapi"], optional = true} +tokio = {version = "1", features = ["rt", "rt-multi-thread", "net"], optional = true} +tokio-util = {version = "0.6", optional = true} +urlencoding = {version = "2.1", optional = true} +uuid = {version = "0.8", optional = true} +j4rs = {version = "0.15", optional = true} +datafusion = {version = "31", optional = true} + +[lib] +crate-type = ["cdylib", "rlib"] +name = "connectorx" + +[dev-dependencies] +criterion = "0.3" +env_logger = "0.9" +iai = "0.1" +pprof = {version = "0.5", features = ["flamegraph"]} + +[features] +all = ["src_sqlite", "src_postgres", "src_mysql", "src_mssql", "src_oracle", "src_bigquery", "src_csv", "src_dummy", "dst_arrow", "dst_arrow2", "federation", "fed_exec"] +branch = [] +default = ["fptr"] +dst_arrow = ["arrow"] +dst_arrow2 = ["polars", "arrow2"] +fptr = [] +src_bigquery = ["gcp-bigquery-client", "tokio"] +src_csv = ["csv", "regex"] +src_dummy = ["num-traits"] +src_mssql = ["rust_decimal", "num-traits", "tiberius", "bb8-tiberius", "bb8", "tokio", "tokio-util", "uuid", "futures", "urlencoding"] +src_mysql = ["r2d2_mysql", "mysql_common", "rust_decimal", "num-traits", "r2d2"] +src_oracle = ["oracle", "r2d2-oracle","r2d2", "urlencoding"] +src_postgres = [ + "postgres", + "r2d2_postgres", + "postgres-native-tls", + "csv", + "hex", + "uuid", + "rust_decimal", + "rust_decimal_macros", + "num-traits", + "r2d2", + "native-tls", + "openssl", + 
"postgres-openssl", +] +src_sqlite = ["rusqlite", "r2d2_sqlite", "fallible-streaming-iterator", "r2d2", "urlencoding"] +federation = ["j4rs"] +fed_exec = ["datafusion", "tokio"] +integrated-auth-gssapi = ["tiberius/integrated-auth-gssapi"] + +[package.metadata.docs.rs] +features = ["all"] diff --git a/connectorx/examples/batch_test.rs b/connectorx/examples/batch_test.rs new file mode 100644 index 0000000..aa8bb16 --- /dev/null +++ b/connectorx/examples/batch_test.rs @@ -0,0 +1,64 @@ +use connectorx::arrow_batch_iter::ArrowBatchIter; +use connectorx::prelude::*; +use connectorx::sources::postgres::{rewrite_tls_args, BinaryProtocol as PgBinaryProtocol}; +use postgres::NoTls; +use std::convert::TryFrom; +use std::time::Instant; + +fn main() { + // let queries = &[CXQuery::naked("select * from test_table")]; + // let queries = &[ + // CXQuery::naked("select * from test_table where test_int < 3"), + // CXQuery::naked("select * from test_table where test_int >= 3"), + // ]; + + let start = Instant::now(); + + let queries = &[ + CXQuery::naked("select * from lineitem where l_orderkey < 1000000"), + CXQuery::naked( + "select * from lineitem where l_orderkey >= 1000000 AND l_orderkey < 2000000", + ), + CXQuery::naked( + "select * from lineitem where l_orderkey >= 2000000 AND l_orderkey < 3000000", + ), + CXQuery::naked( + "select * from lineitem where l_orderkey >= 3000000 AND l_orderkey < 4000000", + ), + CXQuery::naked( + "select * from lineitem where l_orderkey >= 4000000 AND l_orderkey < 5000000", + ), + CXQuery::naked("select * from lineitem where l_orderkey >= 5000000"), + ]; + + let origin_query = None; + + let conn = "postgresql://postgres:postgres@localhost:5432/tpch"; + let source = SourceConn::try_from(conn).unwrap(); + let (config, _) = rewrite_tls_args(&source.conn).unwrap(); + let source = + PostgresSource::::new(config, NoTls, queries.len()).unwrap(); + + let destination = ArrowStreamDestination::new_with_batch_size(2048); + + let mut batch_iter: ArrowBatchIter<_, PostgresArrowStreamTransport> = + ArrowBatchIter::new(source, destination, origin_query, queries).unwrap(); + + batch_iter.prepare(); + + let mut num_rows = 0; + let mut num_batches = 0; + for record_batch in batch_iter { + let record_batch = record_batch; + println!("got 1 batch, with {} rows", record_batch.num_rows()); + num_rows += record_batch.num_rows(); + num_batches += 1; + // arrow::util::pretty::print_batches(&[record_batch]).unwrap(); + } + println!( + "got {} batches, {} rows in total, took {:?}", + num_batches, + num_rows, + start.elapsed() + ); +} diff --git a/connectorx/examples/jvm_test.rs b/connectorx/examples/jvm_test.rs new file mode 100644 index 0000000..7e294ab --- /dev/null +++ b/connectorx/examples/jvm_test.rs @@ -0,0 +1,57 @@ +use connectorx::{ + prelude::*, + sources::postgres::{rewrite_tls_args, BinaryProtocol, PostgresSource}, + sql::CXQuery, + transports::PostgresArrowTransport, +}; +use j4rs::{ClasspathEntry, InvocationArg, Jvm, JvmBuilder}; +use postgres::NoTls; +use std::convert::TryFrom; +use std::env; +use std::fs; +use std::iter::Iterator; +use url::Url; + +fn main() { + let path = fs::canonicalize("./federated-rewriter.jar").unwrap(); + println!("path: {:?}", path); + let entry = ClasspathEntry::new(path.to_str().unwrap()); + let jvm: Jvm = JvmBuilder::new().classpath_entry(entry).build().unwrap(); + + let args: Vec = env::args().collect(); + let file = &args[1]; + let sql = fs::read_to_string(file).unwrap(); + println!("input sql: {}", sql); + let sql = 
InvocationArg::try_from(sql).unwrap(); + let rewrite_sql = jvm + .invoke_static("ai.dataprep.federated.QueryRewriter", "rewrite", &[sql]) + .unwrap(); + + let rewrite_sql: String = jvm.to_rust(rewrite_sql).unwrap(); + + println!("rewrite sql: {}", rewrite_sql); + + let conn = env::var("POSTGRES_URL").unwrap(); + let url = Url::parse(&conn).unwrap(); + let (config, _) = rewrite_tls_args(&url).unwrap(); + + let sb = PostgresSource::::new(config, NoTls, 1).unwrap(); + let mut destination = ArrowDestination::new(); + let queries = [CXQuery::naked(rewrite_sql)]; + let dispatcher = Dispatcher::<_, _, PostgresArrowTransport>::new( + sb, + &mut destination, + &queries, + None, + ); + println!("run dispatcher"); + dispatcher.run().unwrap(); + let result = destination.arrow().unwrap(); + let counts = result + .iter() + .map(|rb| rb.num_rows()) + .collect::>(); + + println!("result rows: {}", counts.iter().sum::()); + println!("result columns: {}", result[0].schema()) +} diff --git a/connectorx/src/arrow_batch_iter.rs b/connectorx/src/arrow_batch_iter.rs new file mode 100644 index 0000000..1794a96 --- /dev/null +++ b/connectorx/src/arrow_batch_iter.rs @@ -0,0 +1,183 @@ +use crate::prelude::*; +use arrow::record_batch::RecordBatch; +use itertools::Itertools; +use log::debug; +use rayon::prelude::*; +use std::marker::PhantomData; + +pub fn set_global_num_thread(num: usize) { + rayon::ThreadPoolBuilder::new() + .num_threads(num) + .build_global() + .unwrap(); +} + +/// The iterator that returns arrow in `RecordBatch` +pub struct ArrowBatchIter +where + S: Source, + TP: Transport< + TSS = S::TypeSystem, + TSD = ArrowStreamTypeSystem, + S = S, + D = ArrowStreamDestination, + >, + ::Partition: 'static, + ::TypeSystem: 'static, + ::Error: 'static, +{ + dst: ArrowStreamDestination, + dst_parts: Option>, + src_parts: Option>, + dorder: DataOrder, + src_schema: Vec, + dst_schema: Vec, + _phantom: PhantomData, +} + +impl<'a, S, TP> ArrowBatchIter +where + S: Source + 'a, + TP: Transport< + TSS = S::TypeSystem, + TSD = ArrowStreamTypeSystem, + S = S, + D = ArrowStreamDestination, + >, +{ + pub fn new( + src: S, + mut dst: ArrowStreamDestination, + origin_query: Option, + queries: &[CXQuery], + ) -> Result { + let dispatcher = Dispatcher::<_, _, TP>::new(src, &mut dst, queries, origin_query); + let (dorder, src_parts, dst_parts, src_schema, dst_schema) = dispatcher.prepare()?; + + Ok(Self { + dst, + dst_parts: Some(dst_parts), + src_parts: Some(src_parts), + dorder, + src_schema, + dst_schema, + _phantom: PhantomData, + }) + } + + fn run(&mut self) { + let src_schema = self.src_schema.clone(); + let dst_schema = self.dst_schema.clone(); + let src_partitions = self.src_parts.take().unwrap(); + let dst_partitions = self.dst_parts.take().unwrap(); + let dorder = self.dorder; + + std::thread::spawn(move || -> Result<(), TP::Error> { + let schemas: Vec<_> = src_schema + .iter() + .zip_eq(&dst_schema) + .map(|(&src_ty, &dst_ty)| (src_ty, dst_ty)) + .collect(); + + debug!("Start writing"); + // parse and write + dst_partitions + .into_par_iter() + .zip_eq(src_partitions) + .enumerate() + .try_for_each(|(i, (mut dst, mut src))| -> Result<(), TP::Error> { + let mut parser = src.parser()?; + + match dorder { + DataOrder::RowMajor => loop { + let (n, is_last) = parser.fetch_next()?; + dst.aquire_row(n)?; + for _ in 0..n { + #[allow(clippy::needless_range_loop)] + for col in 0..dst.ncols() { + { + let (s1, s2) = schemas[col]; + TP::process(s1, s2, &mut parser, &mut dst)?; + } + } + } + if is_last { + break; + } + }, + 
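                        // Column-major: write an entire column for the fetched rows
                        // before advancing to the next column (inverse of the arm above)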
DataOrder::ColumnMajor => loop { + let (n, is_last) = parser.fetch_next()?; + dst.aquire_row(n)?; + #[allow(clippy::needless_range_loop)] + for col in 0..dst.ncols() { + for _ in 0..n { + { + let (s1, s2) = schemas[col]; + TP::process(s1, s2, &mut parser, &mut dst)?; + } + } + } + if is_last { + break; + } + }, + } + + debug!("Finalize partition {}", i); + dst.finalize()?; + debug!("Partition {} finished", i); + Ok(()) + })?; + + debug!("Writing finished"); + + Ok(()) + }); + } +} + +impl<'a, S, TP> Iterator for ArrowBatchIter +where + S: Source + 'a, + TP: Transport< + TSS = S::TypeSystem, + TSD = ArrowStreamTypeSystem, + S = S, + D = ArrowStreamDestination, + >, +{ + type Item = RecordBatch; + /// NOTE: not thread safe + fn next(&mut self) -> Option { + self.dst.record_batch().unwrap() + } +} + +pub trait RecordBatchIterator { + fn get_schema(&self) -> (RecordBatch, &[String]); + fn prepare(&mut self); + fn next_batch(&mut self) -> Option; +} + +impl<'a, S, TP> RecordBatchIterator for ArrowBatchIter +where + S: Source + 'a, + TP: Transport< + TSS = S::TypeSystem, + TSD = ArrowStreamTypeSystem, + S = S, + D = ArrowStreamDestination, + >, +{ + fn get_schema(&self) -> (RecordBatch, &[String]) { + (self.dst.empty_batch(), self.dst.names()) + } + + fn prepare(&mut self) { + self.run(); + } + + fn next_batch(&mut self) -> Option { + self.next() + } +} diff --git a/connectorx/src/constants.rs b/connectorx/src/constants.rs new file mode 100644 index 0000000..b65cca4 --- /dev/null +++ b/connectorx/src/constants.rs @@ -0,0 +1,40 @@ +#[cfg(any(feature = "dst_arrow", feature = "dst_arrow2"))] +pub(crate) const SECONDS_IN_DAY: i64 = 86_400; + +#[allow(dead_code)] +const KILO: usize = 1 << 10; + +#[cfg(any(feature = "dst_arrow", feature = "dst_arrow2"))] +pub const RECORD_BATCH_SIZE: usize = 64 * KILO; + +#[cfg(any( + feature = "src_postgres", + feature = "src_mysql", + feature = "src_oracle", + feature = "src_mssql" +))] +pub const DB_BUFFER_SIZE: usize = 32; + +#[cfg(any(feature = "src_oracle"))] +pub const ORACLE_ARRAY_SIZE: u32 = KILO as u32; + +#[cfg(all(not(debug_assertions), feature = "federation"))] +pub const J4RS_BASE_PATH: &str = "../target/release"; + +#[cfg(all(debug_assertions, feature = "federation"))] +pub const J4RS_BASE_PATH: &str = "../target/debug"; + +#[cfg(feature = "federation")] +pub const CX_REWRITER_PATH: &str = + "../connectorx-python/connectorx/dependencies/federated-rewriter.jar"; + +#[cfg(feature = "federation")] +pub const POSTGRES_JDBC_DRIVER: &str = "org.postgresql.Driver"; + +#[cfg(feature = "federation")] +pub const MYSQL_JDBC_DRIVER: &str = "com.mysql.cj.jdbc.Driver"; + +#[cfg(feature = "federation")] +pub const DUCKDB_JDBC_DRIVER: &str = "org.duckdb.DuckDBDriver"; + +pub const CONNECTORX_PROTOCOL: &str = "cxprotocol"; diff --git a/connectorx/src/data_order.rs b/connectorx/src/data_order.rs new file mode 100644 index 0000000..d865f3c --- /dev/null +++ b/connectorx/src/data_order.rs @@ -0,0 +1,27 @@ +//! This module provides two data orders: row-wise and column-wise for tabular data, +//! as well as a function to coordinate the data order between source and destination. + +use crate::errors::ConnectorXError; +use fehler::{throw, throws}; +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum DataOrder { + RowMajor, + ColumnMajor, +} + +/// Given the supported data order from source and destination, decide the optimal data order +/// for producing and writing. 
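+/// For example (illustrative): a source that only produces RowMajor paired with a
+/// destination accepting [ColumnMajor, RowMajor] resolves to RowMajor; when no shared
+/// order exists among the leading preferences, CannotResolveDataOrder is thrown.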
+#[throws(ConnectorXError)] +pub fn coordinate(src: &[DataOrder], dst: &[DataOrder]) -> DataOrder { + assert!(!src.is_empty() && !dst.is_empty()); + + match (src, dst) { + ([s, ..], [d, ..]) if s == d => *s, + ([s, ..], [_, d, ..]) if s == d => *s, + ([_, s, ..], [d, ..]) if s == d => *s, + _ => throw!(ConnectorXError::CannotResolveDataOrder( + src.to_vec(), + dst.to_vec() + )), + } +} diff --git a/connectorx/src/destinations/arrow/arrow_assoc.rs b/connectorx/src/destinations/arrow/arrow_assoc.rs new file mode 100644 index 0000000..cd6b01d --- /dev/null +++ b/connectorx/src/destinations/arrow/arrow_assoc.rs @@ -0,0 +1,338 @@ +use super::errors::{ArrowDestinationError, Result}; +use crate::constants::SECONDS_IN_DAY; +use arrow::array::{ + ArrayBuilder, BooleanBuilder, Date32Builder, Date64Builder, Float32Builder, Float64Builder, + Int32Builder, Int64Builder, LargeBinaryBuilder, StringBuilder, Time64NanosecondBuilder, + TimestampNanosecondBuilder, UInt32Builder, UInt64Builder, +}; +use arrow::datatypes::Field; +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc}; +use fehler::throws; + +/// Associate arrow builder with native type +pub trait ArrowAssoc { + type Builder: ArrayBuilder + Send; + + fn builder(nrows: usize) -> Self::Builder; + fn append(builder: &mut Self::Builder, value: Self) -> Result<()>; + fn field(header: &str) -> Field; +} + +macro_rules! impl_arrow_assoc { + ($T:ty, $AT:expr, $B:ty) => { + impl ArrowAssoc for $T { + type Builder = $B; + + fn builder(nrows: usize) -> Self::Builder { + Self::Builder::with_capacity(nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + builder.append_value(value); + } + + fn field(header: &str) -> Field { + Field::new(header, $AT, false) + } + } + + impl ArrowAssoc for Option<$T> { + type Builder = $B; + + fn builder(nrows: usize) -> Self::Builder { + Self::Builder::with_capacity(nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + builder.append_option(value); + } + + fn field(header: &str) -> Field { + Field::new(header, $AT, true) + } + } + }; +} + +impl_arrow_assoc!(u32, ArrowDataType::UInt32, UInt32Builder); +impl_arrow_assoc!(u64, ArrowDataType::UInt64, UInt64Builder); +impl_arrow_assoc!(i32, ArrowDataType::Int32, Int32Builder); +impl_arrow_assoc!(i64, ArrowDataType::Int64, Int64Builder); +impl_arrow_assoc!(f32, ArrowDataType::Float32, Float32Builder); +impl_arrow_assoc!(f64, ArrowDataType::Float64, Float64Builder); +impl_arrow_assoc!(bool, ArrowDataType::Boolean, BooleanBuilder); + +impl ArrowAssoc for &str { + type Builder = StringBuilder; + + fn builder(nrows: usize) -> Self::Builder { + StringBuilder::with_capacity(1024, nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + builder.append_value(value); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Utf8, false) + } +} + +impl ArrowAssoc for Option<&str> { + type Builder = StringBuilder; + + fn builder(nrows: usize) -> Self::Builder { + StringBuilder::with_capacity(1024, nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + match value { + Some(s) => builder.append_value(s), + None => builder.append_null(), + } + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Utf8, true) + } +} + +impl ArrowAssoc for String { + type Builder = 
StringBuilder; + + fn builder(nrows: usize) -> Self::Builder { + StringBuilder::with_capacity(1024, nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: String) { + builder.append_value(value.as_str()); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Utf8, false) + } +} + +impl ArrowAssoc for Option { + type Builder = StringBuilder; + + fn builder(nrows: usize) -> Self::Builder { + StringBuilder::with_capacity(1024, nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + match value { + Some(s) => builder.append_value(s.as_str()), + None => builder.append_null(), + } + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Utf8, true) + } +} + +impl ArrowAssoc for DateTime { + type Builder = TimestampNanosecondBuilder; + + fn builder(nrows: usize) -> Self::Builder { + TimestampNanosecondBuilder::with_capacity(nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: DateTime) { + builder.append_value(value.timestamp_nanos()) + } + + fn field(header: &str) -> Field { + Field::new( + header, + ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ) + } +} + +impl ArrowAssoc for Option> { + type Builder = TimestampNanosecondBuilder; + + fn builder(nrows: usize) -> Self::Builder { + TimestampNanosecondBuilder::with_capacity(nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Option>) { + builder.append_option(value.map(|x| x.timestamp_nanos())) + } + + fn field(header: &str) -> Field { + Field::new( + header, + ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + } +} + +fn naive_date_to_arrow(nd: NaiveDate) -> i32 { + match nd.and_hms_opt(0, 0, 0) { + Some(dt) => (dt.timestamp() / SECONDS_IN_DAY) as i32, + None => panic!("and_hms_opt got None from {:?}", nd), + } +} + +fn naive_datetime_to_arrow(nd: NaiveDateTime) -> i64 { + nd.timestamp_millis() +} + +impl ArrowAssoc for Option { + type Builder = Date32Builder; + + fn builder(nrows: usize) -> Self::Builder { + Date32Builder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: Option) -> Result<()> { + builder.append_option(value.map(naive_date_to_arrow)); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Date32, true) + } +} + +impl ArrowAssoc for NaiveDate { + type Builder = Date32Builder; + + fn builder(nrows: usize) -> Self::Builder { + Date32Builder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: NaiveDate) -> Result<()> { + builder.append_value(naive_date_to_arrow(value)); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Date32, false) + } +} + +impl ArrowAssoc for Option { + type Builder = Date64Builder; + + fn builder(nrows: usize) -> Self::Builder { + Date64Builder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: Option) -> Result<()> { + builder.append_option(value.map(naive_datetime_to_arrow)); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Date64, true) + } +} + +impl ArrowAssoc for NaiveDateTime { + type Builder = Date64Builder; + + fn builder(nrows: usize) -> Self::Builder { + Date64Builder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: NaiveDateTime) -> Result<()> { + builder.append_value(naive_datetime_to_arrow(value)); + Ok(()) + } + + fn field(header: &str) -> 
Field { + Field::new(header, ArrowDataType::Date64, false) + } +} + +impl ArrowAssoc for Option { + type Builder = Time64NanosecondBuilder; + + fn builder(nrows: usize) -> Self::Builder { + Time64NanosecondBuilder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: Option) -> Result<()> { + builder.append_option( + value.map(|t| { + t.num_seconds_from_midnight() as i64 * 1_000_000_000 + t.nanosecond() as i64 + }), + ); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Time64(TimeUnit::Nanosecond), true) + } +} + +impl ArrowAssoc for NaiveTime { + type Builder = Time64NanosecondBuilder; + + fn builder(nrows: usize) -> Self::Builder { + Time64NanosecondBuilder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: NaiveTime) -> Result<()> { + builder.append_value( + value.num_seconds_from_midnight() as i64 * 1_000_000_000 + value.nanosecond() as i64, + ); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Time64(TimeUnit::Nanosecond), false) + } +} + +impl ArrowAssoc for Option> { + type Builder = LargeBinaryBuilder; + + fn builder(nrows: usize) -> Self::Builder { + LargeBinaryBuilder::with_capacity(1024, nrows) + } + + fn append(builder: &mut Self::Builder, value: Self) -> Result<()> { + match value { + Some(v) => builder.append_value(v), + None => builder.append_null(), + }; + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeBinary, true) + } +} + +impl ArrowAssoc for Vec { + type Builder = LargeBinaryBuilder; + + fn builder(nrows: usize) -> Self::Builder { + LargeBinaryBuilder::with_capacity(1024, nrows) + } + + fn append(builder: &mut Self::Builder, value: Self) -> Result<()> { + builder.append_value(value); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeBinary, false) + } +} diff --git a/connectorx/src/destinations/arrow/errors.rs b/connectorx/src/destinations/arrow/errors.rs new file mode 100644 index 0000000..85d4177 --- /dev/null +++ b/connectorx/src/destinations/arrow/errors.rs @@ -0,0 +1,16 @@ +use thiserror::Error; + +pub type Result = std::result::Result; + +#[derive(Error, Debug)] +pub enum ArrowDestinationError { + #[error(transparent)] + ArrowError(#[from] arrow::error::ArrowError), + + #[error(transparent)] + ConnectorXError(#[from] crate::errors::ConnectorXError), + + /// Any other errors that are too trivial to be put here explicitly. 
+ #[error(transparent)] + Other(#[from] anyhow::Error), +} diff --git a/connectorx/src/destinations/arrow/funcs.rs b/connectorx/src/destinations/arrow/funcs.rs new file mode 100644 index 0000000..0bcd0f7 --- /dev/null +++ b/connectorx/src/destinations/arrow/funcs.rs @@ -0,0 +1,74 @@ +use super::arrow_assoc::ArrowAssoc; +use super::Builder; +use crate::errors::Result; +use crate::typesystem::{ParameterizedFunc, ParameterizedOn}; +use anyhow::anyhow; +use arrow::array::{ArrayBuilder, ArrayRef}; +use arrow::datatypes::Field; + +pub struct FNewBuilder; + +impl ParameterizedFunc for FNewBuilder { + type Function = fn(nrows: usize) -> Builder; +} + +impl ParameterizedOn for FNewBuilder +where + T: ArrowAssoc, +{ + fn parameterize() -> Self::Function { + fn imp(nrows: usize) -> Builder + where + T: ArrowAssoc, + { + Box::new(T::builder(nrows)) as Builder + } + imp:: + } +} + +pub struct FFinishBuilder; + +impl ParameterizedFunc for FFinishBuilder { + type Function = fn(Builder) -> Result; +} + +impl ParameterizedOn for FFinishBuilder +where + T: ArrowAssoc, +{ + fn parameterize() -> Self::Function { + fn imp(mut builder: Builder) -> Result + where + T: ArrowAssoc, + { + let t = builder + .downcast_mut::() + .ok_or_else(|| anyhow!("cannot cast arrow builder for finish"))?; + let a = ArrayBuilder::finish(t); + Ok(a) + } + imp:: + } +} + +pub struct FNewField; + +impl ParameterizedFunc for FNewField { + type Function = fn(header: &str) -> Field; +} + +impl ParameterizedOn for FNewField +where + T: ArrowAssoc, +{ + fn parameterize() -> Self::Function { + fn imp(header: &str) -> Field + where + T: ArrowAssoc, + { + T::field(header) + } + imp:: + } +} diff --git a/connectorx/src/destinations/arrow/mod.rs b/connectorx/src/destinations/arrow/mod.rs new file mode 100644 index 0000000..96f7102 --- /dev/null +++ b/connectorx/src/destinations/arrow/mod.rs @@ -0,0 +1,276 @@ +//! Destination implementation for Arrow and Polars. 
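+//!
+//! A minimal usage sketch (the connection string and query are hypothetical; it mirrors
+//! the examples added elsewhere in this diff and assumes the `src_postgres` and
+//! `dst_arrow` features):
+//!
+//! ```no_run
+//! use connectorx::prelude::*;
+//! use connectorx::sources::postgres::{rewrite_tls_args, BinaryProtocol, PostgresSource};
+//! use connectorx::sql::CXQuery;
+//! use connectorx::transports::PostgresArrowTransport;
+//! use postgres::NoTls;
+//! use url::Url;
+//!
+//! let url = Url::parse("postgresql://postgres:postgres@localhost:5432/db").unwrap();
+//! let (config, _) = rewrite_tls_args(&url).unwrap();
+//! let source = PostgresSource::<BinaryProtocol, NoTls>::new(config, NoTls, 1).unwrap();
+//! let mut destination = ArrowDestination::new();
+//! let queries = [CXQuery::naked("select * from some_table")];
+//! let dispatcher = Dispatcher::<_, _, PostgresArrowTransport<BinaryProtocol, NoTls>>::new(
+//!     source, &mut destination, &queries, None,
+//! );
+//! dispatcher.run().unwrap();
+//! let batches = destination.arrow().unwrap();
+//! ```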
+ +mod arrow_assoc; +mod errors; +mod funcs; +pub mod typesystem; + +pub use self::errors::{ArrowDestinationError, Result}; +pub use self::typesystem::ArrowTypeSystem; +use super::{Consume, Destination, DestinationPartition}; +use crate::constants::RECORD_BATCH_SIZE; +use crate::data_order::DataOrder; +use crate::typesystem::{Realize, TypeAssoc, TypeSystem}; +use anyhow::anyhow; +use arrow::{datatypes::Schema, record_batch::RecordBatch}; +use arrow_assoc::ArrowAssoc; +use fehler::{throw, throws}; +use funcs::{FFinishBuilder, FNewBuilder, FNewField}; +use itertools::Itertools; +use std::{ + any::Any, + sync::{Arc, Mutex}, +}; + +type Builder = Box; +type Builders = Vec; + +pub struct ArrowDestination { + schema: Vec, + names: Vec, + data: Arc>>, + arrow_schema: Arc, + batch_size: usize, +} + +impl Default for ArrowDestination { + fn default() -> Self { + ArrowDestination { + schema: vec![], + names: vec![], + data: Arc::new(Mutex::new(vec![])), + arrow_schema: Arc::new(Schema::empty()), + batch_size: RECORD_BATCH_SIZE, + } + } +} + +impl ArrowDestination { + pub fn new() -> Self { + Self::default() + } + + pub fn new_with_batch_size(batch_size: usize) -> Self { + ArrowDestination { + schema: vec![], + names: vec![], + data: Arc::new(Mutex::new(vec![])), + arrow_schema: Arc::new(Schema::empty()), + batch_size, + } + } +} + +impl Destination for ArrowDestination { + const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::ColumnMajor, DataOrder::RowMajor]; + type TypeSystem = ArrowTypeSystem; + type Partition<'a> = ArrowPartitionWriter; + type Error = ArrowDestinationError; + + fn needs_count(&self) -> bool { + false + } + + #[throws(ArrowDestinationError)] + fn allocate>( + &mut self, + _nrow: usize, + names: &[S], + schema: &[ArrowTypeSystem], + data_order: DataOrder, + ) { + // todo: support colmajor + if !matches!(data_order, DataOrder::RowMajor) { + throw!(crate::errors::ConnectorXError::UnsupportedDataOrder( + data_order + )) + } + + // parse the metadata + self.schema = schema.to_vec(); + self.names = names.iter().map(|n| n.as_ref().to_string()).collect(); + let fields = self + .schema + .iter() + .zip_eq(&self.names) + .map(|(&dt, h)| Ok(Realize::::realize(dt)?(h.as_str()))) + .collect::>>()?; + self.arrow_schema = Arc::new(Schema::new(fields)); + } + + #[throws(ArrowDestinationError)] + fn partition(&mut self, counts: usize) -> Vec> { + let mut partitions = vec![]; + for _ in 0..counts { + partitions.push(ArrowPartitionWriter::new( + self.schema.clone(), + Arc::clone(&self.data), + Arc::clone(&self.arrow_schema), + self.batch_size, + )?); + } + partitions + } + + fn schema(&self) -> &[ArrowTypeSystem] { + self.schema.as_slice() + } +} + +impl ArrowDestination { + #[throws(ArrowDestinationError)] + pub fn arrow(self) -> Vec { + let lock = Arc::try_unwrap(self.data).map_err(|_| anyhow!("Partitions are not freed"))?; + lock.into_inner() + .map_err(|e| anyhow!("mutex poisoned {}", e))? 
+ } + + #[throws(ArrowDestinationError)] + pub fn record_batch(&mut self) -> Option { + let mut guard = self + .data + .lock() + .map_err(|e| anyhow!("mutex poisoned {}", e))?; + (*guard).pop() + } + + pub fn empty_batch(&self) -> RecordBatch { + RecordBatch::new_empty(self.arrow_schema.clone()) + } + + pub fn arrow_schema(&self) -> Arc { + self.arrow_schema.clone() + } + + pub fn names(&self) -> &[String] { + self.names.as_slice() + } +} + +pub struct ArrowPartitionWriter { + schema: Vec, + builders: Option, + current_row: usize, + current_col: usize, + data: Arc>>, + arrow_schema: Arc, + batch_size: usize, +} + +// unsafe impl Sync for ArrowPartitionWriter {} + +impl ArrowPartitionWriter { + #[throws(ArrowDestinationError)] + fn new( + schema: Vec, + data: Arc>>, + arrow_schema: Arc, + batch_size: usize, + ) -> Self { + let mut pw = ArrowPartitionWriter { + schema, + builders: None, + current_row: 0, + current_col: 0, + data, + arrow_schema, + batch_size, + }; + pw.allocate()?; + pw + } + + #[throws(ArrowDestinationError)] + fn allocate(&mut self) { + let builders = self + .schema + .iter() + .map(|dt| Ok(Realize::::realize(*dt)?(self.batch_size))) + .collect::>>()?; + self.builders.replace(builders); + } + + #[throws(ArrowDestinationError)] + fn flush(&mut self) { + let builders = self + .builders + .take() + .unwrap_or_else(|| panic!("arrow builder is none when flush!")); + let columns = builders + .into_iter() + .zip(self.schema.iter()) + .map(|(builder, &dt)| Realize::::realize(dt)?(builder)) + .collect::, crate::errors::ConnectorXError>>()?; + let rb = RecordBatch::try_new(Arc::clone(&self.arrow_schema), columns)?; + { + let mut guard = self + .data + .lock() + .map_err(|e| anyhow!("mutex poisoned {}", e))?; + let inner_data = &mut *guard; + inner_data.push(rb); + } + + self.current_row = 0; + self.current_col = 0; + } +} + +impl<'a> DestinationPartition<'a> for ArrowPartitionWriter { + type TypeSystem = ArrowTypeSystem; + type Error = ArrowDestinationError; + + #[throws(ArrowDestinationError)] + fn finalize(&mut self) { + if self.builders.is_some() { + self.flush()?; + } + } + + #[throws(ArrowDestinationError)] + fn aquire_row(&mut self, _n: usize) -> usize { + self.current_row + } + + fn ncols(&self) -> usize { + self.schema.len() + } +} + +impl<'a, T> Consume for ArrowPartitionWriter +where + T: TypeAssoc<>::TypeSystem> + ArrowAssoc + 'static, +{ + type Error = ArrowDestinationError; + + #[throws(ArrowDestinationError)] + fn consume(&mut self, value: T) { + let col = self.current_col; + self.current_col = (self.current_col + 1) % self.ncols(); + self.schema[col].check::()?; + + loop { + match &mut self.builders { + Some(builders) => { + ::append( + builders[col] + .downcast_mut::() + .ok_or_else(|| anyhow!("cannot cast arrow builder for append"))?, + value, + )?; + break; + } + None => self.allocate()?, // allocate if builders are not initialized + } + } + + // flush if exceed batch_size + if self.current_col == 0 { + self.current_row += 1; + if self.current_row >= self.batch_size { + self.flush()?; + self.allocate()?; + } + } + } +} diff --git a/connectorx/src/destinations/arrow/typesystem.rs b/connectorx/src/destinations/arrow/typesystem.rs new file mode 100644 index 0000000..a6997a2 --- /dev/null +++ b/connectorx/src/destinations/arrow/typesystem.rs @@ -0,0 +1,38 @@ +use crate::impl_typesystem; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum ArrowTypeSystem { + Int32(bool), + 
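    // (the bool in each variant marks nullability, the same convention used by PandasTypeSystem)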
Int64(bool), + UInt32(bool), + UInt64(bool), + Float32(bool), + Float64(bool), + Boolean(bool), + LargeUtf8(bool), + LargeBinary(bool), + Date32(bool), + Date64(bool), + Time64(bool), + DateTimeTz(bool), +} + +impl_typesystem! { + system = ArrowTypeSystem, + mappings = { + { Int32 => i32 } + { Int64 => i64 } + { UInt32 => u32 } + { UInt64 => u64 } + { Float64 => f64 } + { Float32 => f32 } + { Boolean => bool } + { LargeUtf8 => String } + { LargeBinary => Vec } + { Date32 => NaiveDate } + { Date64 => NaiveDateTime } + { Time64 => NaiveTime } + { DateTimeTz => DateTime } + } +} diff --git a/connectorx/src/destinations/arrow2/arrow_assoc.rs b/connectorx/src/destinations/arrow2/arrow_assoc.rs new file mode 100644 index 0000000..db6d585 --- /dev/null +++ b/connectorx/src/destinations/arrow2/arrow_assoc.rs @@ -0,0 +1,451 @@ +use arrow2::{ + array::*, + datatypes::{DataType as ArrowDataType, Field, TimeUnit}, +}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc}; + +use crate::constants::SECONDS_IN_DAY; + +/// Associate arrow builder with native type +pub trait ArrowAssoc { + type Builder: MutableArray + 'static + Send; + + fn builder(nrows: usize) -> Self::Builder; + fn push(builder: &mut Self::Builder, value: Self); + fn field(header: &str) -> Field; +} + +macro_rules! impl_arrow_assoc { + ($T:ty, $AT:expr, $B:ty) => { + impl ArrowAssoc for $T { + type Builder = $B; + + fn builder(nrows: usize) -> Self::Builder { + Self::Builder::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Self) { + builder.push(Some(value)); + } + + fn field(header: &str) -> Field { + Field::new(header, $AT, false) + } + } + + impl ArrowAssoc for Option<$T> { + type Builder = $B; + + fn builder(nrows: usize) -> Self::Builder { + Self::Builder::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Self) { + builder.push(value); + } + + fn field(header: &str) -> Field { + Field::new(header, $AT, true) + } + } + }; +} + +impl_arrow_assoc!(u32, ArrowDataType::UInt32, MutablePrimitiveArray); +impl_arrow_assoc!(u64, ArrowDataType::UInt64, MutablePrimitiveArray); +impl_arrow_assoc!(i32, ArrowDataType::Int32, MutablePrimitiveArray); +impl_arrow_assoc!(i64, ArrowDataType::Int64, MutablePrimitiveArray); +impl_arrow_assoc!(f32, ArrowDataType::Float32, MutablePrimitiveArray); +impl_arrow_assoc!(f64, ArrowDataType::Float64, MutablePrimitiveArray); +impl_arrow_assoc!(bool, ArrowDataType::Boolean, MutableBooleanArray); + +macro_rules! 
impl_arrow_assoc_vec { + ($T:ty, $PT:ty, $AT:expr) => { + impl ArrowAssoc for Vec<$T> { + type Builder = MutableListArray; + + fn builder(nrows: usize) -> Self::Builder { + MutableListArray::::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Self) { + let val: Vec> = value.into_iter().map(|v| Some(v)).collect(); + builder.try_push(Some(val)).unwrap(); + } + + fn field(header: &str) -> Field { + Field::new( + header, + ArrowDataType::LargeList(Box::new(Field::new("", $AT, false))), + false, + ) + } + } + + impl ArrowAssoc for Option> { + type Builder = MutableListArray; + + fn builder(nrows: usize) -> Self::Builder { + MutableListArray::::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Self) { + match value { + Some(values) => { + let val: Vec> = values.into_iter().map(|v| Some(v)).collect(); + builder.try_push(Some(val)).unwrap(); + } + None => builder.push_null(), + } + } + + fn field(header: &str) -> Field { + Field::new( + header, + ArrowDataType::LargeList(Box::new(Field::new("", $AT, false))), + true, + ) + } + } + }; +} + +macro_rules! impl_arrow_assoc_primitive_vec { + ($T:ty, $AT:expr) => { + impl_arrow_assoc_vec!($T, MutablePrimitiveArray<$T>, $AT); + }; +} + +impl_arrow_assoc_vec!(bool, MutableBooleanArray, ArrowDataType::Boolean); +impl_arrow_assoc_primitive_vec!(i32, ArrowDataType::Int32); +impl_arrow_assoc_primitive_vec!(i64, ArrowDataType::Int64); +impl_arrow_assoc_primitive_vec!(u32, ArrowDataType::UInt32); +impl_arrow_assoc_primitive_vec!(u64, ArrowDataType::UInt64); +impl_arrow_assoc_primitive_vec!(f32, ArrowDataType::Float32); +impl_arrow_assoc_primitive_vec!(f64, ArrowDataType::Float64); + +impl ArrowAssoc for &str { + type Builder = MutableUtf8Array; + + fn builder(nrows: usize) -> Self::Builder { + MutableUtf8Array::::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Self) { + builder.push(Some(value)); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeUtf8, false) + } +} + +impl ArrowAssoc for Option<&str> { + type Builder = MutableUtf8Array; + + fn builder(nrows: usize) -> Self::Builder { + MutableUtf8Array::::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Self) { + builder.push(value); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeUtf8, true) + } +} + +impl ArrowAssoc for String { + type Builder = MutableUtf8Array; + + fn builder(nrows: usize) -> Self::Builder { + MutableUtf8Array::::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: String) { + builder.push(Some(value)); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeUtf8, false) + } +} + +impl ArrowAssoc for Option { + type Builder = MutableUtf8Array; + + fn builder(nrows: usize) -> Self::Builder { + MutableUtf8Array::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Self) { + builder.push(value); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeUtf8, true) + } +} + +impl ArrowAssoc for DateTime { + type Builder = MutablePrimitiveArray; + + fn builder(nrows: usize) -> Self::Builder { + MutablePrimitiveArray::with_capacity(nrows).to(ArrowDataType::Timestamp( + TimeUnit::Nanosecond, + Some("UTC".to_string()), + )) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: DateTime) { + builder.push(Some(value).map(|x| x.timestamp_nanos())); + } + + fn 
field(header: &str) -> Field { + Field::new( + header, + ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_string())), + true, + ) + } +} + +impl ArrowAssoc for Option> { + type Builder = MutablePrimitiveArray; + + fn builder(nrows: usize) -> Self::Builder { + MutablePrimitiveArray::with_capacity(nrows).to(ArrowDataType::Timestamp( + TimeUnit::Nanosecond, + Some("UTC".to_string()), + )) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Option>) { + builder.push(value.map(|x| x.timestamp_nanos())); + } + + fn field(header: &str) -> Field { + Field::new( + header, + ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_string())), + false, + ) + } +} + +fn naive_date_to_date32(nd: NaiveDate) -> i32 { + match nd.and_hms_opt(0, 0, 0) { + Some(dt) => (dt.timestamp() / SECONDS_IN_DAY) as i32, + None => panic!("and_hms_opt got None from {:?}", nd), + } +} + +fn naive_time_to_time64_nanos(nd: NaiveTime) -> i64 { + nd.num_seconds_from_midnight() as i64 * 1_000_000_000 + nd.nanosecond() as i64 +} + +impl ArrowAssoc for Option { + type Builder = MutablePrimitiveArray; + + fn builder(nrows: usize) -> Self::Builder { + MutablePrimitiveArray::with_capacity(nrows).to(ArrowDataType::Date32) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Option) { + builder.push(value.map(naive_date_to_date32)); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Date32, true) + } +} + +impl ArrowAssoc for NaiveDate { + type Builder = MutablePrimitiveArray; + + fn builder(nrows: usize) -> Self::Builder { + MutablePrimitiveArray::with_capacity(nrows).to(ArrowDataType::Date32) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: NaiveDate) { + builder.push(Some(naive_date_to_date32(value))); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Date32, false) + } +} + +impl ArrowAssoc for Option { + type Builder = MutablePrimitiveArray; + + fn builder(nrows: usize) -> Self::Builder { + // naive => None + MutablePrimitiveArray::with_capacity(nrows) + .to(ArrowDataType::Timestamp(TimeUnit::Nanosecond, None)) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Option) { + builder.push(value.map(|x| x.timestamp_nanos())); + } + + fn field(header: &str) -> Field { + // naive => None + Field::new( + header, + ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + } +} + +impl ArrowAssoc for NaiveDateTime { + type Builder = MutablePrimitiveArray; + + fn builder(nrows: usize) -> Self::Builder { + // naive => None + MutablePrimitiveArray::with_capacity(nrows) + .to(ArrowDataType::Timestamp(TimeUnit::Nanosecond, None)) + } + + fn push(builder: &mut Self::Builder, value: NaiveDateTime) { + builder.push(Some(value).map(|x| x.timestamp_nanos())); + } + + fn field(header: &str) -> Field { + // naive => None + Field::new( + header, + ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + } +} + +impl ArrowAssoc for Option { + type Builder = MutablePrimitiveArray; + + fn builder(nrows: usize) -> Self::Builder { + MutablePrimitiveArray::with_capacity(nrows).to(ArrowDataType::Time64(TimeUnit::Nanosecond)) + } + + fn push(builder: &mut Self::Builder, value: Option) { + builder.push(value.map(naive_time_to_time64_nanos)); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Time64(TimeUnit::Nanosecond), true) + } +} + +impl ArrowAssoc for NaiveTime { + type Builder = MutablePrimitiveArray; + + fn builder(nrows: usize) -> Self::Builder { + 
MutablePrimitiveArray::with_capacity(nrows).to(ArrowDataType::Time64(TimeUnit::Nanosecond)) + } + + fn push(builder: &mut Self::Builder, value: NaiveTime) { + builder.push(Some(value).map(naive_time_to_time64_nanos)); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Time64(TimeUnit::Nanosecond), false) + } +} + +impl ArrowAssoc for Option> { + type Builder = MutableBinaryArray; + + fn builder(nrows: usize) -> Self::Builder { + MutableBinaryArray::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Self) { + builder.push(value); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeBinary, true) + } +} + +impl ArrowAssoc for Vec { + type Builder = MutableBinaryArray; + + fn builder(nrows: usize) -> Self::Builder { + MutableBinaryArray::with_capacity(nrows) + } + + #[inline] + fn push(builder: &mut Self::Builder, value: Self) { + builder.push(Some(value)); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeBinary, false) + } +} + +impl ArrowAssoc for Option> { + type Builder = MutableListArray>; + + fn builder(nrows: usize) -> Self::Builder { + MutableListArray::with_capacity(nrows) + } + + fn push(builder: &mut Self::Builder, value: Self) { + let mut string_array: Vec> = vec![]; + match value { + Some(value) => { + for sub_value in value { + string_array.push(Some(sub_value)) + } + + builder.try_push(Some(string_array)).unwrap(); + } + None => { + builder.try_push(Some(string_array)).unwrap(); + } + }; + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeUtf8, true) + } +} + +impl ArrowAssoc for Vec { + type Builder = MutableListArray>; + + fn builder(nrows: usize) -> Self::Builder { + MutableListArray::with_capacity(nrows) + } + + fn push(builder: &mut Self::Builder, value: Self) { + let mut string_array: Vec> = vec![]; + for sub_value in value { + string_array.push(Some(sub_value)) + } + builder.try_push(Some(string_array)).unwrap(); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeUtf8, false) + } +} diff --git a/connectorx/src/destinations/arrow2/errors.rs b/connectorx/src/destinations/arrow2/errors.rs new file mode 100644 index 0000000..f9856b7 --- /dev/null +++ b/connectorx/src/destinations/arrow2/errors.rs @@ -0,0 +1,19 @@ +use thiserror::Error; + +pub type Result = std::result::Result; + +#[derive(Error, Debug)] +pub enum Arrow2DestinationError { + #[error(transparent)] + ArrowError(#[from] arrow2::error::Error), + + #[error(transparent)] + PolarsError(#[from] polars::error::PolarsError), + + #[error(transparent)] + ConnectorXError(#[from] crate::errors::ConnectorXError), + + /// Any other errors that are too trivial to be put here explicitly. 
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
diff --git a/connectorx/src/destinations/arrow2/funcs.rs b/connectorx/src/destinations/arrow2/funcs.rs
new file mode 100644
index 0000000..666f7bd
--- /dev/null
+++ b/connectorx/src/destinations/arrow2/funcs.rs
@@ -0,0 +1,76 @@
+use super::arrow_assoc::ArrowAssoc;
+use super::Builder;
+use crate::errors::Result;
+use crate::typesystem::{ParameterizedFunc, ParameterizedOn};
+use anyhow::anyhow;
+use arrow2::array::{Array, MutableArray};
+use arrow2::datatypes::Field;
+
+pub struct FNewBuilder;
+
+impl<T> ParameterizedFunc<T> for FNewBuilder {
+    type Function = fn(nrows: usize) -> Builder;
+}
+
+impl<T> ParameterizedOn<T> for FNewBuilder
+where
+    T: ArrowAssoc,
+{
+    fn parameterize() -> Self::Function {
+        fn imp<T>(nrows: usize) -> Builder
+        where
+            T: ArrowAssoc,
+        {
+            Box::new(T::builder(nrows)) as Builder
+        }
+        imp::<T>
+    }
+}
+
+pub struct FFinishBuilder;
+
+impl<T> ParameterizedFunc<T> for FFinishBuilder {
+    type Function = fn(Builder) -> Result<Box<dyn Array>>;
+}
+
+impl<T> ParameterizedOn<T> for FFinishBuilder
+where
+    T: ArrowAssoc,
+{
+    fn parameterize() -> Self::Function {
+        fn imp<T>(mut builder: Builder) -> Result<Box<dyn Array>>
+        where
+            T: ArrowAssoc,
+        {
+            builder.shrink_to_fit();
+            Ok(MutableArray::as_box(
+                builder
+                    .as_mut_any()
+                    .downcast_mut::<T::Builder>()
+                    .ok_or_else(|| anyhow!("cannot cast arrow builder for finish"))?,
+            ))
+        }
+        imp::<T>
+    }
+}
+
+pub struct FNewField;
+
+impl<T> ParameterizedFunc<T> for FNewField {
+    type Function = fn(header: &str) -> Field;
+}
+
+impl<T> ParameterizedOn<T> for FNewField
+where
+    T: ArrowAssoc,
+{
+    fn parameterize() -> Self::Function {
+        fn imp<T>(header: &str) -> Field
+        where
+            T: ArrowAssoc,
+        {
+            T::field(header)
+        }
+        imp::<T>
+    }
+}
diff --git a/connectorx/src/destinations/arrow2/mod.rs b/connectorx/src/destinations/arrow2/mod.rs
new file mode 100644
index 0000000..3545de2
--- /dev/null
+++ b/connectorx/src/destinations/arrow2/mod.rs
@@ -0,0 +1,284 @@
+//! Destination implementation for Arrow2.
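Before moving on to `mod.rs`: taken together, the three structs in `funcs.rs` above are thin, monomorphised entry points into `ArrowAssoc`. As a hedged illustration (the free function below is hypothetical, not part of the patch), this is roughly what `FNewBuilder` followed by `FFinishBuilder` reduces to for one concrete type:

```
fn build_i32_column(values: &[Option<i32>]) -> Box<dyn arrow2::array::Array> {
    use arrow2::array::{MutableArray, MutablePrimitiveArray};

    // FNewBuilder: allocate a mutable builder sized for the incoming rows
    let mut builder = MutablePrimitiveArray::<i32>::with_capacity(values.len());
    for v in values {
        // ArrowAssoc::push for Option<i32>: a None becomes a null slot
        builder.push(*v);
    }
    // FFinishBuilder: shrink, then freeze the builder into an immutable Array
    builder.shrink_to_fit();
    builder.as_box()
}
```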
+ +mod arrow_assoc; +mod errors; +mod funcs; +pub mod typesystem; + +use super::{Consume, Destination, DestinationPartition}; +use crate::constants::RECORD_BATCH_SIZE; +use crate::data_order::DataOrder; +use crate::typesystem::{Realize, TypeAssoc, TypeSystem}; +use anyhow::anyhow; +use arrow2::array::{Array, MutableArray}; +use arrow2::chunk::Chunk; +use arrow2::datatypes::{Field, Schema}; +use arrow_assoc::ArrowAssoc; +pub use errors::{Arrow2DestinationError, Result}; +use fehler::throw; +use fehler::throws; +use funcs::{FFinishBuilder, FNewBuilder, FNewField}; +use polars::prelude::{DataFrame, PolarsError, Series}; +use std::convert::TryFrom; +use std::sync::{Arc, Mutex}; +pub use typesystem::Arrow2TypeSystem; + +type Builder = Box; +type Builders = Vec; +type ChunkBuffer = Arc>>>>; + +pub struct Arrow2Destination { + schema: Vec, + names: Vec, + data: ChunkBuffer, + arrow_schema: Arc, +} + +impl Default for Arrow2Destination { + fn default() -> Self { + Arrow2Destination { + schema: vec![], + names: vec![], + data: Arc::new(Mutex::new(vec![])), + arrow_schema: Arc::new(Schema::default()), + } + } +} + +impl Arrow2Destination { + pub fn new() -> Self { + Self::default() + } +} + +impl Destination for Arrow2Destination { + const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::ColumnMajor, DataOrder::RowMajor]; + type TypeSystem = Arrow2TypeSystem; + type Partition<'a> = ArrowPartitionWriter; + type Error = Arrow2DestinationError; + + fn needs_count(&self) -> bool { + false + } + + #[throws(Arrow2DestinationError)] + fn allocate>( + &mut self, + _nrows: usize, + names: &[S], + schema: &[Arrow2TypeSystem], + data_order: DataOrder, + ) { + // todo: support colmajor + if !matches!(data_order, DataOrder::RowMajor) { + throw!(crate::errors::ConnectorXError::UnsupportedDataOrder( + data_order + )) + } + + // parse the metadata + self.schema = schema.to_vec(); + self.names = names.iter().map(|n| n.as_ref().to_string()).collect(); + let fields = self + .schema + .iter() + .zip(&self.names) + .map(|(&dt, h)| Ok(Realize::::realize(dt)?(h.as_str()))) + .collect::>>()?; + self.arrow_schema = Arc::new(Schema::from(fields)); + } + + #[throws(Arrow2DestinationError)] + fn partition(&mut self, counts: usize) -> Vec> { + let mut partitions = vec![]; + for _ in 0..counts { + partitions.push(ArrowPartitionWriter::new( + self.schema.clone(), + Arc::clone(&self.data), + )?); + } + partitions + } + + fn schema(&self) -> &[Arrow2TypeSystem] { + self.schema.as_slice() + } +} + +impl Arrow2Destination { + #[throws(Arrow2DestinationError)] + pub fn arrow(self) -> (Vec>>, Arc) { + let lock = Arc::try_unwrap(self.data).map_err(|_| anyhow!("Partitions are not freed"))?; + ( + lock.into_inner() + .map_err(|e| anyhow!("mutex poisoned {}", e))?, + self.arrow_schema, + ) + } + + #[throws(Arrow2DestinationError)] + pub fn polars(self) -> DataFrame { + let (rbs, schema): (Vec>>, Arc) = self.arrow()?; + //let fields = schema.fields.as_slice(); + let fields: &[Field] = schema.fields.as_slice(); + + // This should be in polars but their version needs updating. 
+    // We have placed it here in an inner function until the fix is merged upstream
+    fn try_from(
+        chunks: (Vec<Chunk<Box<dyn Array>>>, &[Field]),
+    ) -> std::result::Result<DataFrame, PolarsError> {
+        use polars::prelude::NamedFrom;
+
+        let mut series: Vec<Series> = vec![];
+
+        for chunk in chunks.0.into_iter() {
+            let columns_results: std::result::Result<Vec<Series>, PolarsError> = chunk
+                .into_arrays()
+                .into_iter()
+                .zip(chunks.1)
+                .map(|(arr, field)| {
+                    let a = Series::try_from((field.name.as_str(), arr)).map_err(|_| {
+                        PolarsError::ComputeError("Couldn't build Series from box".into())
+                    });
+                    a
+                })
+                .collect();
+
+            let columns = columns_results?;
+
+            if series.is_empty() {
+                for col in columns.iter() {
+                    let name = col.name().to_string();
+                    series.push(Series::new(&name, col));
+                }
+                continue;
+            }
+
+            for (i, col) in columns.into_iter().enumerate() {
+                series[i].append(&col)?;
+            }
+        }
+
+        DataFrame::new(series)
+    }
+
+    try_from((rbs, fields)).unwrap()
+    }
+}
+
+pub struct ArrowPartitionWriter {
+    schema: Vec<Arrow2TypeSystem>,
+    builders: Option<Builders>,
+    current_row: usize,
+    current_col: usize,
+    data: ChunkBuffer,
+}
+
+impl ArrowPartitionWriter {
+    #[throws(Arrow2DestinationError)]
+    fn new(schema: Vec<Arrow2TypeSystem>, data: ChunkBuffer) -> Self {
+        let mut pw = ArrowPartitionWriter {
+            schema,
+            builders: None,
+            current_row: 0,
+            current_col: 0,
+            data,
+        };
+        pw.allocate()?;
+        pw
+    }
+
+    #[throws(Arrow2DestinationError)]
+    fn allocate(&mut self) {
+        let builders = self
+            .schema
+            .iter()
+            .map(|&dt| Ok(Realize::<FNewBuilder>::realize(dt)?(RECORD_BATCH_SIZE)))
+            .collect::<Result<Vec<_>>>()?;
+        self.builders.replace(builders);
+    }
+
+    #[throws(Arrow2DestinationError)]
+    fn flush(&mut self) {
+        let builders = self
+            .builders
+            .take()
+            .unwrap_or_else(|| panic!("arrow builder is none when flush!"));
+
+        let columns = builders
+            .into_iter()
+            .zip(self.schema.iter())
+            .map(|(builder, &dt)| Realize::<FFinishBuilder>::realize(dt)?(builder))
+            .collect::<std::result::Result<Vec<Box<dyn Array>>, crate::errors::ConnectorXError>>(
+            )?;
+
+        let rb = Chunk::try_new(columns)?;
+        {
+            let mut guard = self
+                .data
+                .lock()
+                .map_err(|e| anyhow!("mutex poisoned {}", e))?;
+            let inner_data = &mut *guard;
+            inner_data.push(rb);
+        }
+        self.current_row = 0;
+        self.current_col = 0;
+    }
+}
+
+impl<'a> DestinationPartition<'a> for ArrowPartitionWriter {
+    type TypeSystem = Arrow2TypeSystem;
+    type Error = Arrow2DestinationError;
+
+    fn ncols(&self) -> usize {
+        self.schema.len()
+    }
+
+    #[throws(Arrow2DestinationError)]
+    fn finalize(&mut self) {
+        if self.builders.is_some() {
+            self.flush()?;
+        }
+    }
+
+    #[throws(Arrow2DestinationError)]
+    fn aquire_row(&mut self, _n: usize) -> usize {
+        self.current_row
+    }
+}
+
+impl<'a, T> Consume<T> for ArrowPartitionWriter
+where
+    T: TypeAssoc<<ArrowPartitionWriter as DestinationPartition<'a>>::TypeSystem>
+        + ArrowAssoc
+        + 'static,
+{
+    type Error = Arrow2DestinationError;
+
+    #[throws(Arrow2DestinationError)]
+    fn consume(&mut self, value: T) {
+        let col = self.current_col;
+        self.current_col = (self.current_col + 1) % self.ncols();
+        self.schema[col].check::<T>()?;
+
+        match &mut self.builders {
+            Some(builders) => {
+                <T as ArrowAssoc>::push(
+                    builders[col]
+                        .as_mut_any()
+                        .downcast_mut::<T::Builder>()
+                        .ok_or_else(|| anyhow!("cannot cast arrow builder for append"))?,
+                    value,
+                );
+            }
+            None => throw!(anyhow!("arrow arrays are empty!")),
+        }
+
+        // flush if exceed batch_size
+        if self.current_col == 0 {
+            self.current_row += 1;
+            if self.current_row >= RECORD_BATCH_SIZE {
+                self.flush()?;
+                self.allocate()?;
+            }
+        }
+    }
+}
diff --git a/connectorx/src/destinations/arrow2/typesystem.rs b/connectorx/src/destinations/arrow2/typesystem.rs
new file mode 100644
index 0000000..ffb222b
--- /dev/null
+++ b/connectorx/src/destinations/arrow2/typesystem.rs @@ -0,0 +1,54 @@ +use crate::impl_typesystem; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum Arrow2TypeSystem { + Int32(bool), + Int64(bool), + UInt32(bool), + UInt64(bool), + Float32(bool), + Float64(bool), + Boolean(bool), + LargeUtf8(bool), + LargeBinary(bool), + Date32(bool), + Date64(bool), + Time64(bool), + DateTimeTz(bool), + BoolArray(bool), + Int32Array(bool), + Int64Array(bool), + UInt32Array(bool), + UInt64Array(bool), + Float32Array(bool), + Float64Array(bool), + Utf8Array(bool), +} + +impl_typesystem! { + system = Arrow2TypeSystem, + mappings = { + { Int32 => i32 } + { Int64 => i64 } + { UInt32 => u32 } + { UInt64 => u64 } + { Float64 => f64 } + { Float32 => f32 } + { Boolean => bool } + { LargeUtf8 => String } + { LargeBinary => Vec } + { Date32 => NaiveDate } + { Date64 => NaiveDateTime } + { Time64 => NaiveTime } + { DateTimeTz => DateTime } + { BoolArray => Vec } + { Int32Array => Vec } + { Int64Array => Vec } + { UInt32Array => Vec } + { UInt64Array => Vec } + { Float32Array => Vec } + { Float64Array => Vec } + { Utf8Array => Vec } + } +} diff --git a/connectorx/src/destinations/arrowstream/arrow_assoc.rs b/connectorx/src/destinations/arrowstream/arrow_assoc.rs new file mode 100644 index 0000000..cd6b01d --- /dev/null +++ b/connectorx/src/destinations/arrowstream/arrow_assoc.rs @@ -0,0 +1,338 @@ +use super::errors::{ArrowDestinationError, Result}; +use crate::constants::SECONDS_IN_DAY; +use arrow::array::{ + ArrayBuilder, BooleanBuilder, Date32Builder, Date64Builder, Float32Builder, Float64Builder, + Int32Builder, Int64Builder, LargeBinaryBuilder, StringBuilder, Time64NanosecondBuilder, + TimestampNanosecondBuilder, UInt32Builder, UInt64Builder, +}; +use arrow::datatypes::Field; +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc}; +use fehler::throws; + +/// Associate arrow builder with native type +pub trait ArrowAssoc { + type Builder: ArrayBuilder + Send; + + fn builder(nrows: usize) -> Self::Builder; + fn append(builder: &mut Self::Builder, value: Self) -> Result<()>; + fn field(header: &str) -> Field; +} + +macro_rules! 
impl_arrow_assoc { + ($T:ty, $AT:expr, $B:ty) => { + impl ArrowAssoc for $T { + type Builder = $B; + + fn builder(nrows: usize) -> Self::Builder { + Self::Builder::with_capacity(nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + builder.append_value(value); + } + + fn field(header: &str) -> Field { + Field::new(header, $AT, false) + } + } + + impl ArrowAssoc for Option<$T> { + type Builder = $B; + + fn builder(nrows: usize) -> Self::Builder { + Self::Builder::with_capacity(nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + builder.append_option(value); + } + + fn field(header: &str) -> Field { + Field::new(header, $AT, true) + } + } + }; +} + +impl_arrow_assoc!(u32, ArrowDataType::UInt32, UInt32Builder); +impl_arrow_assoc!(u64, ArrowDataType::UInt64, UInt64Builder); +impl_arrow_assoc!(i32, ArrowDataType::Int32, Int32Builder); +impl_arrow_assoc!(i64, ArrowDataType::Int64, Int64Builder); +impl_arrow_assoc!(f32, ArrowDataType::Float32, Float32Builder); +impl_arrow_assoc!(f64, ArrowDataType::Float64, Float64Builder); +impl_arrow_assoc!(bool, ArrowDataType::Boolean, BooleanBuilder); + +impl ArrowAssoc for &str { + type Builder = StringBuilder; + + fn builder(nrows: usize) -> Self::Builder { + StringBuilder::with_capacity(1024, nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + builder.append_value(value); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Utf8, false) + } +} + +impl ArrowAssoc for Option<&str> { + type Builder = StringBuilder; + + fn builder(nrows: usize) -> Self::Builder { + StringBuilder::with_capacity(1024, nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + match value { + Some(s) => builder.append_value(s), + None => builder.append_null(), + } + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Utf8, true) + } +} + +impl ArrowAssoc for String { + type Builder = StringBuilder; + + fn builder(nrows: usize) -> Self::Builder { + StringBuilder::with_capacity(1024, nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: String) { + builder.append_value(value.as_str()); + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Utf8, false) + } +} + +impl ArrowAssoc for Option { + type Builder = StringBuilder; + + fn builder(nrows: usize) -> Self::Builder { + StringBuilder::with_capacity(1024, nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Self) { + match value { + Some(s) => builder.append_value(s.as_str()), + None => builder.append_null(), + } + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Utf8, true) + } +} + +impl ArrowAssoc for DateTime { + type Builder = TimestampNanosecondBuilder; + + fn builder(nrows: usize) -> Self::Builder { + TimestampNanosecondBuilder::with_capacity(nrows) + } + + #[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: DateTime) { + builder.append_value(value.timestamp_nanos()) + } + + fn field(header: &str) -> Field { + Field::new( + header, + ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ) + } +} + +impl ArrowAssoc for Option> { + type Builder = TimestampNanosecondBuilder; + + fn builder(nrows: usize) -> Self::Builder { + TimestampNanosecondBuilder::with_capacity(nrows) + } + + 
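As a hedged aside on the encodings these impls rely on: Arrow's `Timestamp(TimeUnit::Nanosecond, _)` holds i64 nanoseconds since the Unix epoch, which is exactly what chrono's `timestamp_nanos` returns, and `Time64` holds i64 nanoseconds since midnight. A quick self-contained check (chrono 0.4-era API; `time_to_nanos` mirrors the `Time64` formula used below):

```
use chrono::{NaiveTime, TimeZone, Timelike, Utc};

fn time_to_nanos(t: NaiveTime) -> i64 {
    // same formula as the Time64 impls in this file
    t.num_seconds_from_midnight() as i64 * 1_000_000_000 + t.nanosecond() as i64
}

fn main() {
    let dt = Utc.ymd(2021, 1, 1).and_hms(0, 0, 0);
    assert_eq!(dt.timestamp_nanos(), 1_609_459_200_000_000_000);
    assert_eq!(
        time_to_nanos(NaiveTime::from_hms_milli(12, 34, 56, 789)),
        45_296_789_000_000
    );
}
```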
#[throws(ArrowDestinationError)] + fn append(builder: &mut Self::Builder, value: Option>) { + builder.append_option(value.map(|x| x.timestamp_nanos())) + } + + fn field(header: &str) -> Field { + Field::new( + header, + ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + } +} + +fn naive_date_to_arrow(nd: NaiveDate) -> i32 { + match nd.and_hms_opt(0, 0, 0) { + Some(dt) => (dt.timestamp() / SECONDS_IN_DAY) as i32, + None => panic!("and_hms_opt got None from {:?}", nd), + } +} + +fn naive_datetime_to_arrow(nd: NaiveDateTime) -> i64 { + nd.timestamp_millis() +} + +impl ArrowAssoc for Option { + type Builder = Date32Builder; + + fn builder(nrows: usize) -> Self::Builder { + Date32Builder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: Option) -> Result<()> { + builder.append_option(value.map(naive_date_to_arrow)); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Date32, true) + } +} + +impl ArrowAssoc for NaiveDate { + type Builder = Date32Builder; + + fn builder(nrows: usize) -> Self::Builder { + Date32Builder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: NaiveDate) -> Result<()> { + builder.append_value(naive_date_to_arrow(value)); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Date32, false) + } +} + +impl ArrowAssoc for Option { + type Builder = Date64Builder; + + fn builder(nrows: usize) -> Self::Builder { + Date64Builder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: Option) -> Result<()> { + builder.append_option(value.map(naive_datetime_to_arrow)); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Date64, true) + } +} + +impl ArrowAssoc for NaiveDateTime { + type Builder = Date64Builder; + + fn builder(nrows: usize) -> Self::Builder { + Date64Builder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: NaiveDateTime) -> Result<()> { + builder.append_value(naive_datetime_to_arrow(value)); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Date64, false) + } +} + +impl ArrowAssoc for Option { + type Builder = Time64NanosecondBuilder; + + fn builder(nrows: usize) -> Self::Builder { + Time64NanosecondBuilder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: Option) -> Result<()> { + builder.append_option( + value.map(|t| { + t.num_seconds_from_midnight() as i64 * 1_000_000_000 + t.nanosecond() as i64 + }), + ); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Time64(TimeUnit::Nanosecond), true) + } +} + +impl ArrowAssoc for NaiveTime { + type Builder = Time64NanosecondBuilder; + + fn builder(nrows: usize) -> Self::Builder { + Time64NanosecondBuilder::with_capacity(nrows) + } + + fn append(builder: &mut Self::Builder, value: NaiveTime) -> Result<()> { + builder.append_value( + value.num_seconds_from_midnight() as i64 * 1_000_000_000 + value.nanosecond() as i64, + ); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::Time64(TimeUnit::Nanosecond), false) + } +} + +impl ArrowAssoc for Option> { + type Builder = LargeBinaryBuilder; + + fn builder(nrows: usize) -> Self::Builder { + LargeBinaryBuilder::with_capacity(1024, nrows) + } + + fn append(builder: &mut Self::Builder, value: Self) -> Result<()> { + match value { + Some(v) => builder.append_value(v), + None => builder.append_null(), + }; + Ok(()) + } + + fn field(header: 
&str) -> Field { + Field::new(header, ArrowDataType::LargeBinary, true) + } +} + +impl ArrowAssoc for Vec { + type Builder = LargeBinaryBuilder; + + fn builder(nrows: usize) -> Self::Builder { + LargeBinaryBuilder::with_capacity(1024, nrows) + } + + fn append(builder: &mut Self::Builder, value: Self) -> Result<()> { + builder.append_value(value); + Ok(()) + } + + fn field(header: &str) -> Field { + Field::new(header, ArrowDataType::LargeBinary, false) + } +} diff --git a/connectorx/src/destinations/arrowstream/errors.rs b/connectorx/src/destinations/arrowstream/errors.rs new file mode 100644 index 0000000..85d4177 --- /dev/null +++ b/connectorx/src/destinations/arrowstream/errors.rs @@ -0,0 +1,16 @@ +use thiserror::Error; + +pub type Result = std::result::Result; + +#[derive(Error, Debug)] +pub enum ArrowDestinationError { + #[error(transparent)] + ArrowError(#[from] arrow::error::ArrowError), + + #[error(transparent)] + ConnectorXError(#[from] crate::errors::ConnectorXError), + + /// Any other errors that are too trivial to be put here explicitly. + #[error(transparent)] + Other(#[from] anyhow::Error), +} diff --git a/connectorx/src/destinations/arrowstream/funcs.rs b/connectorx/src/destinations/arrowstream/funcs.rs new file mode 100644 index 0000000..0bcd0f7 --- /dev/null +++ b/connectorx/src/destinations/arrowstream/funcs.rs @@ -0,0 +1,74 @@ +use super::arrow_assoc::ArrowAssoc; +use super::Builder; +use crate::errors::Result; +use crate::typesystem::{ParameterizedFunc, ParameterizedOn}; +use anyhow::anyhow; +use arrow::array::{ArrayBuilder, ArrayRef}; +use arrow::datatypes::Field; + +pub struct FNewBuilder; + +impl ParameterizedFunc for FNewBuilder { + type Function = fn(nrows: usize) -> Builder; +} + +impl ParameterizedOn for FNewBuilder +where + T: ArrowAssoc, +{ + fn parameterize() -> Self::Function { + fn imp(nrows: usize) -> Builder + where + T: ArrowAssoc, + { + Box::new(T::builder(nrows)) as Builder + } + imp:: + } +} + +pub struct FFinishBuilder; + +impl ParameterizedFunc for FFinishBuilder { + type Function = fn(Builder) -> Result; +} + +impl ParameterizedOn for FFinishBuilder +where + T: ArrowAssoc, +{ + fn parameterize() -> Self::Function { + fn imp(mut builder: Builder) -> Result + where + T: ArrowAssoc, + { + let t = builder + .downcast_mut::() + .ok_or_else(|| anyhow!("cannot cast arrow builder for finish"))?; + let a = ArrayBuilder::finish(t); + Ok(a) + } + imp:: + } +} + +pub struct FNewField; + +impl ParameterizedFunc for FNewField { + type Function = fn(header: &str) -> Field; +} + +impl ParameterizedOn for FNewField +where + T: ArrowAssoc, +{ + fn parameterize() -> Self::Function { + fn imp(header: &str) -> Field + where + T: ArrowAssoc, + { + T::field(header) + } + imp:: + } +} diff --git a/connectorx/src/destinations/arrowstream/mod.rs b/connectorx/src/destinations/arrowstream/mod.rs new file mode 100644 index 0000000..d8487a2 --- /dev/null +++ b/connectorx/src/destinations/arrowstream/mod.rs @@ -0,0 +1,290 @@ +//! Destination implementation for Arrow and Polars. 
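The streaming destination that follows is organized around a single `std::sync::mpsc` channel: each partition writer owns a clone of the `Sender` and ships every finished `RecordBatch` through it, while the consuming side blocks on `recv` until all senders have been dropped. A minimal, self-contained sketch of that shutdown protocol (plain integers stand in for record batches):

```
use std::sync::mpsc::channel;
use std::thread;

fn main() {
    let (tx, rx) = channel::<u32>(); // carries RecordBatch in the real code
    let workers: Vec<_> = (0..4)
        .map(|part| {
            let tx = tx.clone(); // one Sender clone per partition writer
            thread::spawn(move || tx.send(part).unwrap())
        })
        .collect();
    drop(tx); // the destination gives up its own Sender (cf. partition())
    // recv keeps yielding until every Sender is gone, then errs and ends the loop
    while let Ok(batch) = rx.recv() {
        println!("got batch {}", batch);
    }
    workers.into_iter().for_each(|h| h.join().unwrap());
}
```

On the real types this surfaces as a drain loop over `record_batch()`; a hedged usage sketch with the dispatcher construction elided:

```
let mut destination = ArrowDestination::new_with_batch_size(2048);
// ... build a Dispatcher over `&mut destination` and call run() ...
while let Some(rb) = destination.record_batch()? {
    println!("batch with {} rows", rb.num_rows());
}
```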
+ +mod arrow_assoc; +mod errors; +mod funcs; +pub mod typesystem; + +pub use self::errors::{ArrowDestinationError, Result}; +pub use self::typesystem::ArrowTypeSystem; +use super::{Consume, Destination, DestinationPartition}; +use crate::constants::RECORD_BATCH_SIZE; +use crate::data_order::DataOrder; +use crate::typesystem::{Realize, TypeAssoc, TypeSystem}; +use anyhow::anyhow; +use arrow::{datatypes::Schema, record_batch::RecordBatch}; +use arrow_assoc::ArrowAssoc; +use fehler::{throw, throws}; +use funcs::{FFinishBuilder, FNewBuilder, FNewField}; +use itertools::Itertools; +use std::{ + any::Any, + sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, + }, +}; + +type Builder = Box; +type Builders = Vec; + +pub struct ArrowDestination { + schema: Vec, + names: Vec, + arrow_schema: Arc, + batch_size: usize, + sender: Option>, + receiver: Receiver, +} + +impl Default for ArrowDestination { + fn default() -> Self { + let (tx, rx) = channel(); + ArrowDestination { + schema: vec![], + names: vec![], + arrow_schema: Arc::new(Schema::empty()), + batch_size: RECORD_BATCH_SIZE, + sender: Some(tx), + receiver: rx, + } + } +} + +impl ArrowDestination { + pub fn new() -> Self { + Self::default() + } + + pub fn new_with_batch_size(batch_size: usize) -> Self { + let (tx, rx) = channel(); + ArrowDestination { + schema: vec![], + names: vec![], + arrow_schema: Arc::new(Schema::empty()), + batch_size, + sender: Some(tx), + receiver: rx, + } + } +} + +impl Destination for ArrowDestination { + const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::ColumnMajor, DataOrder::RowMajor]; + type TypeSystem = ArrowTypeSystem; + type Partition<'a> = ArrowPartitionWriter; + type Error = ArrowDestinationError; + + fn needs_count(&self) -> bool { + false + } + + #[throws(ArrowDestinationError)] + fn allocate>( + &mut self, + _nrow: usize, + names: &[S], + schema: &[ArrowTypeSystem], + data_order: DataOrder, + ) { + // todo: support colmajor + if !matches!(data_order, DataOrder::RowMajor) { + throw!(crate::errors::ConnectorXError::UnsupportedDataOrder( + data_order + )) + } + + // parse the metadata + self.schema = schema.to_vec(); + self.names = names.iter().map(|n| n.as_ref().to_string()).collect(); + let fields = self + .schema + .iter() + .zip_eq(&self.names) + .map(|(&dt, h)| Ok(Realize::::realize(dt)?(h.as_str()))) + .collect::>>()?; + self.arrow_schema = Arc::new(Schema::new(fields)); + } + + #[throws(ArrowDestinationError)] + fn partition(&mut self, counts: usize) -> Vec> { + let mut partitions = vec![]; + let sender = self.sender.take().unwrap(); + for _ in 0..counts { + partitions.push(ArrowPartitionWriter::new( + self.schema.clone(), + Arc::clone(&self.arrow_schema), + self.batch_size, + sender.clone(), + )?); + } + partitions + // self.sender should be freed + } + + fn schema(&self) -> &[ArrowTypeSystem] { + self.schema.as_slice() + } +} + +impl ArrowDestination { + #[throws(ArrowDestinationError)] + pub fn arrow(self) -> Vec { + if self.sender.is_some() { + // should not happen since it is dropped after partition + // but need to make sure here otherwise recv will be blocked forever + std::mem::drop(self.sender); + } + let mut data = vec![]; + loop { + match self.receiver.recv() { + Ok(rb) => data.push(rb), + Err(_) => break, + } + } + data + } + + #[throws(ArrowDestinationError)] + pub fn record_batch(&mut self) -> Option { + match self.receiver.recv() { + Ok(rb) => Some(rb), + Err(_) => None, + } + } + + pub fn empty_batch(&self) -> RecordBatch { + RecordBatch::new_empty(self.arrow_schema.clone()) + } + + 
pub fn arrow_schema(&self) -> Arc<Schema> {
+        self.arrow_schema.clone()
+    }
+
+    pub fn names(&self) -> &[String] {
+        self.names.as_slice()
+    }
+}
+
+pub struct ArrowPartitionWriter {
+    schema: Vec<ArrowTypeSystem>,
+    builders: Option<Builders>,
+    current_row: usize,
+    current_col: usize,
+    arrow_schema: Arc<Schema>,
+    batch_size: usize,
+    sender: Option<Sender<RecordBatch>>,
+}
+
+// unsafe impl Sync for ArrowPartitionWriter {}
+
+impl ArrowPartitionWriter {
+    #[throws(ArrowDestinationError)]
+    fn new(
+        schema: Vec<ArrowTypeSystem>,
+        arrow_schema: Arc<Schema>,
+        batch_size: usize,
+        sender: Sender<RecordBatch>,
+    ) -> Self {
+        let mut pw = ArrowPartitionWriter {
+            schema,
+            builders: None,
+            current_row: 0,
+            current_col: 0,
+            arrow_schema,
+            batch_size,
+            sender: Some(sender),
+        };
+        pw.allocate()?;
+        pw
+    }
+
+    #[throws(ArrowDestinationError)]
+    fn allocate(&mut self) {
+        let builders = self
+            .schema
+            .iter()
+            .map(|dt| Ok(Realize::<FNewBuilder>::realize(*dt)?(self.batch_size)))
+            .collect::<Result<Vec<_>>>()?;
+        self.builders.replace(builders);
+    }
+
+    #[throws(ArrowDestinationError)]
+    fn flush(&mut self) {
+        let builders = self
+            .builders
+            .take()
+            .unwrap_or_else(|| panic!("arrow builder is none when flush!"));
+        let columns = builders
+            .into_iter()
+            .zip(self.schema.iter())
+            .map(|(builder, &dt)| Realize::<FFinishBuilder>::realize(dt)?(builder))
+            .collect::<std::result::Result<Vec<_>, crate::errors::ConnectorXError>>()?;
+        let rb = RecordBatch::try_new(Arc::clone(&self.arrow_schema), columns)?;
+        self.sender.as_ref().unwrap().send(rb).unwrap();
+
+        self.current_row = 0;
+        self.current_col = 0;
+    }
+}
+
+impl<'a> DestinationPartition<'a> for ArrowPartitionWriter {
+    type TypeSystem = ArrowTypeSystem;
+    type Error = ArrowDestinationError;
+
+    #[throws(ArrowDestinationError)]
+    fn finalize(&mut self) {
+        if self.builders.is_some() {
+            self.flush()?;
+        }
+        // need to release the sender so the receiver knows when the stream is exhausted
+        std::mem::drop(self.sender.take());
+    }
+
+    #[throws(ArrowDestinationError)]
+    fn aquire_row(&mut self, _n: usize) -> usize {
+        self.current_row
+    }
+
+    fn ncols(&self) -> usize {
+        self.schema.len()
+    }
+}
+
+impl<'a, T> Consume<T> for ArrowPartitionWriter
+where
+    T: TypeAssoc<<ArrowPartitionWriter as DestinationPartition<'a>>::TypeSystem>
+        + ArrowAssoc
+        + 'static,
+{
+    type Error = ArrowDestinationError;
+
+    #[throws(ArrowDestinationError)]
+    fn consume(&mut self, value: T) {
+        let col = self.current_col;
+        self.current_col = (self.current_col + 1) % self.ncols();
+        self.schema[col].check::<T>()?;
+
+        loop {
+            match &mut self.builders {
+                Some(builders) => {
+                    <T as ArrowAssoc>::append(
+                        builders[col]
+                            .downcast_mut::<T::Builder>()
+                            .ok_or_else(|| anyhow!("cannot cast arrow builder for append"))?,
+                        value,
+                    )?;
+                    break;
+                }
+                None => self.allocate()?, // allocate if builders are not initialized
+            }
+        }
+
+        // flush if exceed batch_size
+        if self.current_col == 0 {
+            self.current_row += 1;
+            if self.current_row >= self.batch_size {
+                self.flush()?;
+                self.allocate()?;
+            }
+        }
+    }
+}
diff --git a/connectorx/src/destinations/arrowstream/typesystem.rs b/connectorx/src/destinations/arrowstream/typesystem.rs
new file mode 100644
index 0000000..a6997a2
--- /dev/null
+++ b/connectorx/src/destinations/arrowstream/typesystem.rs
@@ -0,0 +1,38 @@
+use crate::impl_typesystem;
+use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum ArrowTypeSystem {
+    Int32(bool),
+    Int64(bool),
+    UInt32(bool),
+    UInt64(bool),
+    Float32(bool),
+    Float64(bool),
+    Boolean(bool),
+    LargeUtf8(bool),
+    LargeBinary(bool),
+    Date32(bool),
+    Date64(bool),
+    Time64(bool),
+    DateTimeTz(bool),
+}
+
+impl_typesystem! {
+    system = ArrowTypeSystem,
+    mappings = {
+        { Int32 => i32 }
+        { Int64 => i64 }
+        { UInt32 => u32 }
+        { UInt64 => u64 }
+        { Float64 => f64 }
+        { Float32 => f32 }
+        { Boolean => bool }
+        { LargeUtf8 => String }
+        { LargeBinary => Vec<u8> }
+        { Date32 => NaiveDate }
+        { Date64 => NaiveDateTime }
+        { Time64 => NaiveTime }
+        { DateTimeTz => DateTime<Utc> }
+    }
+}
diff --git a/connectorx/src/destinations/mod.rs b/connectorx/src/destinations/mod.rs
new file mode 100644
index 0000000..0082b9d
--- /dev/null
+++ b/connectorx/src/destinations/mod.rs
@@ -0,0 +1,78 @@
+//! This module defines three traits [`Destination`], [`DestinationPartition`], and [`Consume`] for defining a destination.
+//! It also contains destination implementations for various dataframes.
+
+#[cfg(feature = "dst_arrow")]
+pub mod arrow;
+#[cfg(feature = "dst_arrow")]
+pub mod arrowstream;
+
+#[cfg(feature = "dst_arrow2")]
+pub mod arrow2;
+
+use crate::data_order::DataOrder;
+use crate::errors::ConnectorXError;
+use crate::typesystem::{TypeAssoc, TypeSystem};
+
+/// A `Destination` is associated with a `TypeSystem` and a `DestinationPartition`.
+/// `DestinationPartition` allows multiple threads to write data into the buffer owned by the `Destination`.
+pub trait Destination: Sized {
+    const DATA_ORDERS: &'static [DataOrder];
+    type TypeSystem: TypeSystem;
+    type Partition<'a>: DestinationPartition<'a, TypeSystem = Self::TypeSystem, Error = Self::Error>
+    where
+        Self: 'a;
+    type Error: From<ConnectorXError> + Send;
+
+    /// Specify whether the destination needs the total number of rows in advance
+    /// in order to pre-allocate the buffer.
+    fn needs_count(&self) -> bool;
+
+    /// Construct the `Destination`.
+    /// This allocates the memory based on the type of each column
+    /// and the number of rows.
+    fn allocate<S: AsRef<str>>(
+        &mut self,
+        nrow: usize,
+        names: &[S],
+        schema: &[Self::TypeSystem],
+        data_order: DataOrder,
+    ) -> Result<(), Self::Error>;
+
+    /// Create a number of partition destinations, each of which writes `count` rows.
+    fn partition(&mut self, counts: usize) -> Result<Vec<Self::Partition<'_>>, Self::Error>;
+    /// Return the schema of the destination.
+    fn schema(&self) -> &[Self::TypeSystem];
+}
+
+/// A `DestinationPartition` writes values to its own region. `DestinationPartition` is parameterized
+/// on the lifetime `'a`, which is the lifetime of the parent `Destination`. Usually,
+/// a `DestinationPartition` can never outlive its parent.
+pub trait DestinationPartition<'a>: Send {
+    type TypeSystem: TypeSystem;
+    type Error: From<ConnectorXError> + Send;
+
+    /// Write a value of type T to the location (row, col). If T mismatches the
+    /// schema, `ConnectorXError::TypeCheckFailed` will be returned.
+    fn write<T>(&mut self, value: T) -> Result<(), <Self as DestinationPartition<'a>>::Error>
+    where
+        T: TypeAssoc<Self::TypeSystem>,
+        Self: Consume<T, Error = <Self as DestinationPartition<'a>>::Error>,
+    {
+        self.consume(value)
+    }
+
+    /// Number of columns this `DestinationPartition` controls.
+    fn ncols(&self) -> usize;
+
+    /// Final clean-ups.
+    fn finalize(&mut self) -> Result<(), Self::Error>;
+
+    /// Acquire n rows in the final destination.
+    fn aquire_row(&mut self, n: usize) -> Result<usize, Self::Error>;
+}
+
+/// A type implementing `Consume` can consume a value `T` by adding it to its own buffer.
+pub trait Consume<T> {
+    type Error: From<ConnectorXError> + Send;
+    fn consume(&mut self, value: T) -> Result<(), Self::Error>;
+}
diff --git a/connectorx/src/dispatcher.rs b/connectorx/src/dispatcher.rs
new file mode 100644
index 0000000..c8cf97c
--- /dev/null
+++ b/connectorx/src/dispatcher.rs
@@ -0,0 +1,217 @@
+//! This module provides [`dispatcher::Dispatcher`], the core struct in ConnectorX
+//! 
that drives the data loading from a source to a destination. +use crate::{ + data_order::{coordinate, DataOrder}, + destinations::{Destination, DestinationPartition}, + errors::{ConnectorXError, Result as CXResult}, + sources::{PartitionParser, Source, SourcePartition}, + sql::CXQuery, + typesystem::Transport, +}; +use itertools::Itertools; +use log::debug; +use rayon::prelude::*; +use std::marker::PhantomData; + +/// A dispatcher takes a `S: Source`, a `D: Destination`, a `TP: Transport` and a vector of `queries` as input to +/// load data from `S` to `D` using the queries. +pub struct Dispatcher<'a, S, D, TP> { + src: S, + dst: &'a mut D, + queries: Vec>, + origin_query: Option, + _phantom: PhantomData, +} + +impl<'w, S, D, TP> Dispatcher<'w, S, D, TP> +where + S: Source, + D: Destination, + TP: Transport, +{ + /// Create a new dispatcher by providing a source, a destination and the queries. + pub fn new(src: S, dst: &'w mut D, queries: &[Q], origin_query: Option) -> Self + where + for<'a> &'a Q: Into, + { + Self { + src, + dst, + queries: queries.iter().map(Into::into).collect(), + origin_query, + _phantom: PhantomData, + } + } + + pub fn prepare( + mut self, + ) -> Result< + ( + DataOrder, + Vec, + Vec>, + Vec, + Vec, + ), + TP::Error, + > { + debug!("Prepare"); + let dorder = coordinate(S::DATA_ORDERS, D::DATA_ORDERS)?; + self.src.set_data_order(dorder)?; + self.src.set_queries(self.queries.as_slice()); + self.src.set_origin_query(self.origin_query); + + debug!("Fetching metadata"); + self.src.fetch_metadata()?; + let src_schema = self.src.schema(); + let dst_schema = src_schema + .iter() + .map(|&s| TP::convert_typesystem(s)) + .collect::>>()?; + let names = self.src.names(); + + let mut total_rows = if self.dst.needs_count() { + // return None if cannot derive total count + debug!("Try get row rounts for entire result"); + self.src.result_rows()? + } else { + debug!("Do not need counts in advance"); + Some(0) + }; + let mut src_partitions: Vec = self.src.partition()?; + if self.dst.needs_count() && total_rows.is_none() { + debug!("Manually count rows of each partitioned query and sum up"); + // run queries + src_partitions + .par_iter_mut() + .try_for_each(|partition| -> Result<(), S::Error> { partition.result_rows() })?; + + // get number of row of each partition from the source + let part_rows: Vec = src_partitions + .iter() + .map(|partition| partition.nrows()) + .collect(); + total_rows = Some(part_rows.iter().sum()); + } + let total_rows = total_rows.ok_or_else(ConnectorXError::CountError)?; + + debug!( + "Allocate destination memory: {}x{}", + total_rows, + src_schema.len() + ); + self.dst.allocate(total_rows, &names, &dst_schema, dorder)?; + + debug!("Create destination partition"); + let dst_partitions = self.dst.partition(self.queries.len())?; + + Ok(( + dorder, + src_partitions, + dst_partitions, + src_schema, + dst_schema, + )) + } + + /// Start the data loading process. 
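Before the loop itself, a hedged sketch of how `run` is driven in practice, mirroring the Postgres binary-protocol arm of `get_arrow` later in this patch (`config` is assumed to come from `rewrite_tls_args`; the queries are illustrative):

```
let queries = [
    CXQuery::naked("SELECT * FROM t WHERE id % 2 = 0"),
    CXQuery::naked("SELECT * FROM t WHERE id % 2 = 1"),
];
let source = PostgresSource::<PgBinaryProtocol, NoTls>::new(config, NoTls, queries.len())?;
let mut destination = ArrowDestination::new();
Dispatcher::<_, _, PostgresArrowTransport<PgBinaryProtocol, NoTls>>::new(
    source,
    &mut destination,
    &queries,
    None, // no original, unpartitioned query
)
.run()?;
```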
+ pub fn run(self) -> Result<(), TP::Error> { + debug!("Run dispatcher"); + let (dorder, src_partitions, dst_partitions, src_schema, dst_schema) = self.prepare()?; + + #[cfg(all(not(feature = "branch"), not(feature = "fptr")))] + compile_error!("branch or fptr, pick one"); + + #[cfg(feature = "branch")] + let schemas: Vec<_> = src_schema + .iter() + .zip_eq(&dst_schema) + .map(|(&src_ty, &dst_ty)| (src_ty, dst_ty)) + .collect(); + + debug!("Start writing"); + // parse and write + dst_partitions + .into_par_iter() + .zip_eq(src_partitions) + .enumerate() + .try_for_each(|(i, (mut dst, mut src))| -> Result<(), TP::Error> { + #[cfg(feature = "fptr")] + let f: Vec<_> = src_schema + .iter() + .zip_eq(&dst_schema) + .map(|(&src_ty, &dst_ty)| TP::processor(src_ty, dst_ty)) + .collect::>>()?; + + let mut parser = src.parser()?; + + match dorder { + DataOrder::RowMajor => loop { + let (n, is_last) = parser.fetch_next()?; + dst.aquire_row(n)?; + for _ in 0..n { + #[allow(clippy::needless_range_loop)] + for col in 0..dst.ncols() { + #[cfg(feature = "fptr")] + f[col](&mut parser, &mut dst)?; + + #[cfg(feature = "branch")] + { + let (s1, s2) = schemas[col]; + TP::process(s1, s2, &mut parser, &mut dst)?; + } + } + } + if is_last { + break; + } + }, + DataOrder::ColumnMajor => loop { + let (n, is_last) = parser.fetch_next()?; + dst.aquire_row(n)?; + #[allow(clippy::needless_range_loop)] + for col in 0..dst.ncols() { + for _ in 0..n { + #[cfg(feature = "fptr")] + f[col](&mut parser, &mut dst)?; + #[cfg(feature = "branch")] + { + let (s1, s2) = schemas[col]; + TP::process(s1, s2, &mut parser, &mut dst)?; + } + } + } + if is_last { + break; + } + }, + } + + debug!("Finalize partition {}", i); + dst.finalize()?; + debug!("Partition {} finished", i); + Ok(()) + })?; + + debug!("Writing finished"); + + Ok(()) + } + + /// Only fetch the metadata (header) of the destination. 
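A hedged sketch of what `get_meta` is for: it runs only the metadata phase, leaving the destination with a zero-row allocation from which column names and types can be read (same hypothetical setup as the sketch above):

```
let mut destination = ArrowDestination::new();
let mut dispatcher = Dispatcher::<_, _, PostgresArrowTransport<PgBinaryProtocol, NoTls>>::new(
    source,
    &mut destination,
    &queries,
    None,
);
dispatcher.get_meta()?; // fetch and convert the schema, allocate zero rows
drop(dispatcher);
println!("{:?}", destination.schema());
```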
+ pub fn get_meta(&mut self) -> Result<(), TP::Error> { + let dorder = coordinate(S::DATA_ORDERS, D::DATA_ORDERS)?; + self.src.set_data_order(dorder)?; + self.src.set_queries(self.queries.as_slice()); + self.src.set_origin_query(self.origin_query.clone()); + self.src.fetch_metadata()?; + let src_schema = self.src.schema(); + let dst_schema = src_schema + .iter() + .map(|&s| TP::convert_typesystem(s)) + .collect::>>()?; + let names = self.src.names(); + self.dst.allocate(0, &names, &dst_schema, dorder)?; + Ok(()) + } +} diff --git a/connectorx/src/errors.rs b/connectorx/src/errors.rs new file mode 100644 index 0000000..6168207 --- /dev/null +++ b/connectorx/src/errors.rs @@ -0,0 +1,216 @@ +use crate::data_order::DataOrder; +use std::any::type_name; +use std::fmt; +use thiserror::Error; + +pub type Result = std::result::Result; +pub type OutResult = std::result::Result; + +#[derive(Error, Debug)] +pub enum ConnectorXOutError { + #[error("File {0} not found.")] + FileNotFoundError(String), + + #[error("Source {0} not supported.")] + SourceNotSupport(String), + + #[error(transparent)] + IOError(#[from] std::io::Error), + + #[error(transparent)] + JsonError(#[from] serde_json::Error), + + #[cfg(feature = "federation")] + #[error(transparent)] + J4RSError(#[from] j4rs::errors::J4RsError), + + #[cfg(feature = "fed_exec")] + #[error(transparent)] + DataFusionError(#[from] datafusion::error::DataFusionError), + + #[error(transparent)] + UrlParseError(#[from] url::ParseError), + + #[error(transparent)] + ConnectorXInternalError(#[from] ConnectorXError), + + #[cfg(feature = "src_postgres")] + #[error(transparent)] + PostgresSourceError(#[from] crate::sources::postgres::PostgresSourceError), + + #[cfg(feature = "src_postgres")] + #[error(transparent)] + PostgresError(#[from] postgres::Error), + + #[cfg(feature = "src_mysql")] + #[error(transparent)] + MySQLSourceError(#[from] crate::sources::mysql::MySQLSourceError), + + #[cfg(feature = "src_mysql")] + #[error(transparent)] + MysqlError(#[from] r2d2_mysql::mysql::Error), + + #[cfg(feature = "src_mssql")] + #[error(transparent)] + MsSQLSourceError(#[from] crate::sources::mssql::MsSQLSourceError), + + #[cfg(feature = "src_mssql")] + #[error(transparent)] + MsSQL(#[from] tiberius::error::Error), + + #[cfg(feature = "src_sqlite")] + #[error(transparent)] + SQLiteSourceError(#[from] crate::sources::sqlite::SQLiteSourceError), + + #[cfg(feature = "src_sqlite")] + #[error(transparent)] + SQLiteError(#[from] rusqlite::Error), + + #[cfg(feature = "src_oracle")] + #[error(transparent)] + OracleSourceError(#[from] crate::sources::oracle::OracleSourceError), + + #[cfg(feature = "src_oracle")] + #[error(transparent)] + OracleError(#[from] r2d2_oracle::oracle::Error), + + #[cfg(feature = "src_bigquery")] + #[error(transparent)] + BigQuerySourceError(#[from] crate::sources::bigquery::BigQuerySourceError), + + #[cfg(feature = "src_bigquery")] + #[error(transparent)] + BigQueryError(#[from] gcp_bigquery_client::error::BQError), + + #[cfg(feature = "dst_arrow")] + #[error(transparent)] + ArrowError(#[from] crate::destinations::arrow::ArrowDestinationError), + + #[cfg(feature = "dst_arrow")] + #[error(transparent)] + ArrowStreamError(#[from] crate::destinations::arrowstream::ArrowDestinationError), + + #[cfg(feature = "dst_arrow2")] + #[error(transparent)] + Arrow2Error(#[from] crate::destinations::arrow2::Arrow2DestinationError), + + #[cfg(all(feature = "src_postgres", feature = "dst_arrow"))] + #[error(transparent)] + PostgresArrowTransportError(#[from] 
crate::transports::PostgresArrowTransportError),
+
+    #[cfg(all(feature = "src_postgres", feature = "dst_arrow2"))]
+    #[error(transparent)]
+    PostgresArrow2TransportError(#[from] crate::transports::PostgresArrow2TransportError),
+
+    #[cfg(all(feature = "src_mysql", feature = "dst_arrow"))]
+    #[error(transparent)]
+    MySQLArrowTransportError(#[from] crate::transports::MySQLArrowTransportError),
+
+    #[cfg(all(feature = "src_mysql", feature = "dst_arrow2"))]
+    #[error(transparent)]
+    MySQLArrow2TransportError(#[from] crate::transports::MySQLArrow2TransportError),
+
+    #[cfg(all(feature = "src_sqlite", feature = "dst_arrow"))]
+    #[error(transparent)]
+    SQLiteArrowTransportError(#[from] crate::transports::SQLiteArrowTransportError),
+
+    #[cfg(all(feature = "src_sqlite", feature = "dst_arrow2"))]
+    #[error(transparent)]
+    SQLiteArrow2TransportError(#[from] crate::transports::SQLiteArrow2TransportError),
+
+    #[cfg(all(feature = "src_mssql", feature = "dst_arrow"))]
+    #[error(transparent)]
+    MsSQLArrowTransportError(#[from] crate::transports::MsSQLArrowTransportError),
+
+    #[cfg(all(feature = "src_mssql", feature = "dst_arrow2"))]
+    #[error(transparent)]
+    MsSQLArrow2TransportError(#[from] crate::transports::MsSQLArrow2TransportError),
+
+    #[cfg(all(feature = "src_oracle", feature = "dst_arrow"))]
+    #[error(transparent)]
+    OracleArrowTransportError(#[from] crate::transports::OracleArrowTransportError),
+
+    #[cfg(all(feature = "src_oracle", feature = "dst_arrow2"))]
+    #[error(transparent)]
+    OracleArrow2TransportError(#[from] crate::transports::OracleArrow2TransportError),
+
+    #[cfg(all(feature = "src_bigquery", feature = "dst_arrow"))]
+    #[error(transparent)]
+    BigqueryArrowTransportError(#[from] crate::transports::BigQueryArrowTransportError),
+
+    #[cfg(all(feature = "src_bigquery", feature = "dst_arrow2"))]
+    #[error(transparent)]
+    BigqueryArrow2TransportError(#[from] crate::transports::BigQueryArrow2TransportError),
+
+    /// Any other errors that are too trivial to be put here explicitly.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+/// Errors that can be raised from this library.
+#[derive(Error, Debug)]
+pub enum ConnectorXError {
+    /// The required type does not match the schema defined.
+ #[error("Data type unexpected: {0:?} expected, {1} found.")] + TypeCheckFailed(String, &'static str), + + #[error("Data order not supported {0:?}.")] + UnsupportedDataOrder(DataOrder), + + #[error("Cannot resolve data order: got {0:?} from source, {1:?} from destination.")] + CannotResolveDataOrder(Vec, Vec), + + #[error("Cannot produce a {0}, context: {1}.")] + CannotProduce(&'static str, ProduceContext), + + #[error("No conversion rule from {0} to {1}.")] + NoConversionRule(String, String), + + #[error("Only support single query with SELECT statement, got {0}.")] + SqlQueryNotSupported(String), + + #[error("Cannot get total number of rows in advance.")] + CountError(), + + #[error(transparent)] + SQLParserError(#[from] sqlparser::parser::ParserError), + + #[error(transparent)] + StdIOError(#[from] std::io::Error), + + #[error(transparent)] + StdVarError(#[from] std::env::VarError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl ConnectorXError { + pub fn cannot_produce(context: Option) -> Self { + ConnectorXError::CannotProduce(type_name::(), context.into()) + } +} + +#[derive(Debug)] +pub enum ProduceContext { + NoContext, + Context(String), +} + +impl From> for ProduceContext { + fn from(val: Option) -> Self { + match val { + Some(c) => ProduceContext::Context(c), + None => ProduceContext::NoContext, + } + } +} + +impl fmt::Display for ProduceContext { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ProduceContext::NoContext => write!(f, "No Context"), + ProduceContext::Context(s) => write!(f, "{}", s), + } + } +} diff --git a/connectorx/src/fed_dispatcher.rs b/connectorx/src/fed_dispatcher.rs new file mode 100644 index 0000000..875a9fa --- /dev/null +++ b/connectorx/src/fed_dispatcher.rs @@ -0,0 +1,92 @@ +use crate::{prelude::*, sql::CXQuery}; +use arrow::record_batch::RecordBatch; +use datafusion::datasource::MemTable; +use datafusion::prelude::*; +use fehler::throws; +use log::debug; +use rayon::prelude::*; +use std::collections::HashMap; +use std::convert::TryFrom; +use std::sync::{mpsc::channel, Arc}; + +#[throws(ConnectorXOutError)] +pub fn run( + sql: String, + db_map: HashMap, + j4rs_base: Option<&str>, +) -> Vec { + debug!("federated input sql: {}", sql); + let mut db_conn_map: HashMap = HashMap::new(); + for (k, v) in db_map.into_iter() { + db_conn_map.insert( + k, + FederatedDataSourceInfo::new_from_conn_str( + SourceConn::try_from(v.as_str())?, + false, + "", + "", + ), + ); + } + let fed_plan = rewrite_sql(sql.as_str(), &db_conn_map, j4rs_base)?; + + debug!("fetch queries from remote"); + let (sender, receiver) = channel(); + fed_plan.into_par_iter().enumerate().try_for_each_with( + sender, + |s, (i, p)| -> Result<(), ConnectorXOutError> { + match p.db_name.as_str() { + "LOCAL" => { + s.send((p.sql, None)).expect("send error local"); + } + _ => { + debug!("start query {}: {}", i, p.sql); + let mut queries = vec![]; + p.sql.split(';').for_each(|ss| { + queries.push(CXQuery::naked(ss)); + }); + let source_conn = &db_conn_map[p.db_name.as_str()] + .conn_str_info + .as_ref() + .unwrap(); + + let destination = get_arrow(source_conn, None, queries.as_slice())?; + let rbs = destination.arrow()?; + + let provider = MemTable::try_new(rbs[0].schema(), vec![rbs])?; + s.send((p.db_alias, Some(Arc::new(provider)))) + .expect(&format!("send error {}", i)); + debug!("query {} finished", i); + } + } + Ok(()) + }, + )?; + + let ctx = SessionContext::new(); + let mut alias_names: Vec = vec![]; + let mut local_sql = String::new(); + 
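The gather step that follows is plain DataFusion: each remote result set arrives as Arrow record batches, is wrapped in a `MemTable` under its plan alias, and the rewritten "LOCAL" SQL runs over the registered tables. A hedged, stripped-down sketch of that step (alias and query are illustrative):

```
let ctx = SessionContext::new();
let provider = MemTable::try_new(rbs[0].schema(), vec![rbs])?;
ctx.register_table("t1", Arc::new(provider))?;
let rt = tokio::runtime::Runtime::new()?;
let df = rt.block_on(ctx.sql("SELECT COUNT(*) FROM t1"))?;
let batches = rt.block_on(df.collect())?;
```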
receiver + .iter() + .try_for_each(|(alias, provider)| -> Result<(), ConnectorXOutError> { + match provider { + Some(p) => { + ctx.register_table(alias.as_str(), p)?; + alias_names.push(alias); + } + None => local_sql = alias, + } + + Ok(()) + })?; + + debug!("\nexecute query final...\n{}\n", local_sql); + let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime")); + // until datafusion fix the bug: https://github.com/apache/arrow-datafusion/issues/2147 + for alias in alias_names { + local_sql = local_sql.replace(format!("\"{}\"", alias).as_str(), alias.as_str()); + } + + let df = rt.block_on(ctx.sql(local_sql.as_str()))?; + rt.block_on(df.collect())? +} diff --git a/connectorx/src/fed_rewriter.rs b/connectorx/src/fed_rewriter.rs new file mode 100644 index 0000000..9d717c3 --- /dev/null +++ b/connectorx/src/fed_rewriter.rs @@ -0,0 +1,185 @@ +use crate::{ + constants::{CX_REWRITER_PATH, J4RS_BASE_PATH}, + prelude::*, +}; +use fehler::throws; +use j4rs::{ClasspathEntry, Instance, InvocationArg, Jvm, JvmBuilder}; +use log::debug; +use std::collections::HashMap; +use std::convert::TryFrom; +use std::{env, fs}; + +pub struct Plan { + pub db_name: String, + pub db_alias: String, + pub sql: String, + pub cardinality: usize, +} + +pub struct FederatedDataSourceInfo<'a> { + pub conn_str_info: Option, + pub manual_info: Option>>, + pub is_local: bool, + pub jdbc_url: &'a str, + pub jdbc_driver: &'a str, +} + +impl<'a> FederatedDataSourceInfo<'a> { + pub fn new_from_conn_str( + source_conn: SourceConn, + is_local: bool, + jdbc_url: &'a str, + jdbc_driver: &'a str, + ) -> Self { + Self { + conn_str_info: Some(source_conn), + manual_info: None, + is_local, + jdbc_url, + jdbc_driver, + } + } + pub fn new_from_manual_schema( + manual_schema: HashMap>, + is_local: bool, + ) -> Self { + Self { + conn_str_info: None, + manual_info: Some(manual_schema), + is_local, + jdbc_url: "", + jdbc_driver: "", + } + } +} + +#[throws(ConnectorXOutError)] +fn init_jvm(j4rs_base: Option<&str>) -> Jvm { + let base = match j4rs_base { + Some(path) => fs::canonicalize(path) + .map_err(|_| ConnectorXOutError::FileNotFoundError(path.to_string()))?, + None => fs::canonicalize(J4RS_BASE_PATH) + .map_err(|_| ConnectorXOutError::FileNotFoundError(J4RS_BASE_PATH.to_string()))?, + }; + debug!("j4rs base path: {:?}", base); + + let rewriter_path = env::var("CX_REWRITER_PATH").unwrap_or(CX_REWRITER_PATH.to_string()); + let path = fs::canonicalize(rewriter_path.as_str()) + .map_err(|_| ConnectorXOutError::FileNotFoundError(rewriter_path))?; + + debug!("rewriter path: {:?}", path); + + let entry = ClasspathEntry::new(path.to_str().unwrap()); + JvmBuilder::new() + .skip_setting_native_lib() + .classpath_entry(entry) + .with_base_path(base.to_str().unwrap()) + .build()? 
+} + +#[allow(dead_code)] +#[throws(ConnectorXOutError)] +fn create_sources( + jvm: &Jvm, + db_map: &HashMap, +) -> (Instance, Instance) { + let mut db_config = vec![]; + let db_manual = jvm.create_instance("java.util.HashMap", &[])?; + + for (db_name, db_info) in db_map.iter() { + if db_info.manual_info.is_some() { + let manual_info = db_info.manual_info.as_ref().unwrap(); + let schema_info = jvm.create_instance("java.util.HashMap", &[])?; + for (name, columns) in manual_info { + let arr_instance = jvm.java_list("java.lang.String", columns.to_vec())?; + jvm.invoke( + &schema_info, + "put", + &[ + InvocationArg::try_from(name).unwrap(), + InvocationArg::try_from(arr_instance).unwrap(), + ], + )?; + } + let fed_ds = jvm.create_instance( + "ai.dataprep.federated.FederatedDataSource", + &[ + InvocationArg::try_from(db_info.is_local).unwrap(), + InvocationArg::try_from(schema_info).unwrap(), + ], + )?; + jvm.invoke( + &db_manual, + "put", + &[ + InvocationArg::try_from(db_name).unwrap(), + InvocationArg::try_from(fed_ds).unwrap(), + ], + )?; + } else { + db_config.push(String::from(db_name)); + } + } + let db_config = jvm.java_list("java.lang.String", db_config)?; + (db_config, db_manual) +} + +#[allow(dead_code)] +#[throws(ConnectorXOutError)] +fn create_sources2(jvm: &Jvm, db_map: &HashMap) -> Instance { + let mut dbs = vec![]; + for db in db_map.keys() { + dbs.push(String::from(db)); + } + jvm.java_list("java.lang.String", dbs)? +} + +#[throws(ConnectorXOutError)] +pub fn rewrite_sql( + sql: &str, + db_map: &HashMap, + j4rs_base: Option<&str>, +) -> Vec { + let jvm = init_jvm(j4rs_base)?; + debug!("init jvm successfully!"); + + let sql = InvocationArg::try_from(sql).unwrap(); + let (db_config, db_manual) = create_sources(&jvm, db_map)?; + let rewriter = jvm.create_instance("ai.dataprep.federated.FederatedQueryRewriter", &[])?; + let db_config = InvocationArg::try_from(db_config).unwrap(); + let db_manual = InvocationArg::try_from(db_manual).unwrap(); + let plan = jvm.invoke(&rewriter, "rewrite3", &[sql, db_config, db_manual])?; + + let count = jvm.invoke(&plan, "getCount", &[])?; + let count: i32 = jvm.to_rust(count)?; + debug!("rewrite finished, got {} queries", count); + + let mut fed_plan = vec![]; + for i in 0..count { + let idx = [InvocationArg::try_from(i).unwrap().into_primitive()?]; + + let db = jvm.invoke(&plan, "getDBName", &idx)?; + let db: String = jvm.to_rust(db)?; + + let alias_db = jvm.invoke(&plan, "getAliasDBName", &idx)?; + let alias_db: String = jvm.to_rust(alias_db)?; + + let rewrite_sql = jvm.invoke(&plan, "getSql", &idx)?; + let rewrite_sql: String = jvm.to_rust(rewrite_sql)?; + + let cardinality = jvm.invoke(&plan, "getCardinality", &idx)?; + let cardinality: usize = jvm.to_rust(cardinality)?; + + debug!( + "{} - db: {}, alias: {}, cardinality: {}, rewrite sql: {}", + i, db, alias_db, cardinality, rewrite_sql + ); + fed_plan.push(Plan { + db_name: db, + db_alias: alias_db, + sql: rewrite_sql, + cardinality, + }); + } + fed_plan +} diff --git a/connectorx/src/get_arrow.rs b/connectorx/src/get_arrow.rs new file mode 100644 index 0000000..735eae2 --- /dev/null +++ b/connectorx/src/get_arrow.rs @@ -0,0 +1,428 @@ +#[cfg(feature = "src_mysql")] +use crate::sources::mysql::{BinaryProtocol as MySQLBinaryProtocol, TextProtocol}; +#[cfg(feature = "src_postgres")] +use crate::sources::postgres::{ + rewrite_tls_args, BinaryProtocol as PgBinaryProtocol, CSVProtocol, CursorProtocol, + SimpleProtocol, +}; +use crate::{ + arrow_batch_iter::{ArrowBatchIter, RecordBatchIterator}, + 
prelude::*, + sql::CXQuery, +}; +use fehler::{throw, throws}; +use log::debug; +#[cfg(feature = "src_postgres")] +use postgres::NoTls; +#[cfg(feature = "src_postgres")] +use postgres_openssl::MakeTlsConnector; +#[allow(unused_imports)] +use std::sync::Arc; + +#[allow(unreachable_code, unreachable_patterns, unused_variables, unused_mut)] +#[throws(ConnectorXOutError)] +pub fn get_arrow( + source_conn: &SourceConn, + origin_query: Option, + queries: &[CXQuery], +) -> ArrowDestination { + let mut destination = ArrowDestination::new(); + let protocol = source_conn.proto.as_str(); + debug!("Protocol: {}", protocol); + + match source_conn.ty { + #[cfg(feature = "src_postgres")] + SourceType::Postgres => { + let (config, tls) = rewrite_tls_args(&source_conn.conn)?; + match (protocol, tls) { + ("csv", Some(tls_conn)) => { + let source = PostgresSource::::new( + config, + tls_conn, + queries.len(), + )?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrowTransport, + >::new( + source, &mut destination, queries, origin_query + ); + dispatcher.run()?; + } + ("csv", None) => { + let source = + PostgresSource::::new(config, NoTls, queries.len())?; + let dispatcher = + Dispatcher::<_, _, PostgresArrowTransport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + ("binary", Some(tls_conn)) => { + let source = PostgresSource::::new( + config, + tls_conn, + queries.len(), + )?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrowTransport, + >::new( + source, &mut destination, queries, origin_query + ); + dispatcher.run()?; + } + ("binary", None) => { + let source = PostgresSource::::new( + config, + NoTls, + queries.len(), + )?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrowTransport, + >::new( + source, &mut destination, queries, origin_query + ); + dispatcher.run()?; + } + ("cursor", Some(tls_conn)) => { + let source = PostgresSource::::new( + config, + tls_conn, + queries.len(), + )?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrowTransport, + >::new( + source, &mut destination, queries, origin_query + ); + dispatcher.run()?; + } + ("cursor", None) => { + let source = + PostgresSource::::new(config, NoTls, queries.len())?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrowTransport, + >::new( + source, &mut destination, queries, origin_query + ); + dispatcher.run()?; + } + ("simple", Some(tls_conn)) => { + let sb = PostgresSource::::new( + config, + tls_conn, + queries.len(), + )?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrowTransport, + >::new( + sb, &mut destination, queries, origin_query + ); + debug!("Running dispatcher"); + dispatcher.run()?; + } + ("simple", None) => { + let sb = + PostgresSource::::new(config, NoTls, queries.len())?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrowTransport, + >::new( + sb, &mut destination, queries, origin_query + ); + debug!("Running dispatcher"); + dispatcher.run()?; + } + _ => unimplemented!("{} protocol not supported", protocol), + } + } + #[cfg(feature = "src_mysql")] + SourceType::MySQL => match protocol { + "binary" => { + let source = + MySQLSource::::new(&source_conn.conn[..], queries.len())?; + let dispatcher = Dispatcher::<_, _, MySQLArrowTransport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + "text" => { + let source = + MySQLSource::::new(&source_conn.conn[..], queries.len())?; + let dispatcher = Dispatcher::<_, _, MySQLArrowTransport>::new( + source, + &mut destination, + queries, + 
origin_query, + ); + dispatcher.run()?; + } + _ => unimplemented!("{} protocol not supported", protocol), + }, + #[cfg(feature = "src_sqlite")] + SourceType::SQLite => { + // remove the first "sqlite://" manually since url.path is not correct for windows + let path = &source_conn.conn.as_str()[9..]; + let source = SQLiteSource::new(path, queries.len())?; + let dispatcher = Dispatcher::<_, _, SQLiteArrowTransport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + #[cfg(feature = "src_mssql")] + SourceType::MsSQL => { + let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime")); + let source = MsSQLSource::new(rt, &source_conn.conn[..], queries.len())?; + let dispatcher = Dispatcher::<_, _, MsSQLArrowTransport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + #[cfg(feature = "src_oracle")] + SourceType::Oracle => { + let source = OracleSource::new(&source_conn.conn[..], queries.len())?; + let dispatcher = Dispatcher::<_, _, OracleArrowTransport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + #[cfg(feature = "src_bigquery")] + SourceType::BigQuery => { + let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime")); + let source = BigQuerySource::new(rt, &source_conn.conn[..])?; + let dispatcher = Dispatcher::<_, _, BigQueryArrowTransport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + _ => throw!(ConnectorXOutError::SourceNotSupport(format!( + "{:?}", + source_conn.ty + ))), + } + + destination +} + +#[allow(unreachable_code, unreachable_patterns, unused_variables, unused_mut)] +pub fn new_record_batch_iter( + source_conn: &SourceConn, + origin_query: Option, + queries: &[CXQuery], + batch_size: usize, +) -> Box { + let destination = ArrowStreamDestination::new_with_batch_size(batch_size); + let protocol = source_conn.proto.as_str(); + debug!("Protocol: {}", protocol); + + match source_conn.ty { + #[cfg(feature = "src_postgres")] + SourceType::Postgres => { + let (config, tls) = rewrite_tls_args(&source_conn.conn).unwrap(); + match (protocol, tls) { + ("csv", Some(tls_conn)) => { + let source = PostgresSource::::new( + config, + tls_conn, + queries.len(), + ) + .unwrap(); + let batch_iter = + ArrowBatchIter::< + _, + PostgresArrowStreamTransport, + >::new(source, destination, origin_query, queries) + .unwrap(); + return Box::new(batch_iter); + } + ("csv", None) => { + let source = + PostgresSource::::new(config, NoTls, queries.len()) + .unwrap(); + let batch_iter = ArrowBatchIter::< + _, + PostgresArrowStreamTransport, + >::new( + source, destination, origin_query, queries + ) + .unwrap(); + return Box::new(batch_iter); + } + ("binary", Some(tls_conn)) => { + let source = PostgresSource::::new( + config, + tls_conn, + queries.len(), + ) + .unwrap(); + let batch_iter = + ArrowBatchIter::< + _, + PostgresArrowStreamTransport, + >::new(source, destination, origin_query, queries) + .unwrap(); + return Box::new(batch_iter); + } + ("binary", None) => { + let source = PostgresSource::::new( + config, + NoTls, + queries.len(), + ) + .unwrap(); + let batch_iter = ArrowBatchIter::< + _, + PostgresArrowStreamTransport, + >::new( + source, destination, origin_query, queries + ) + .unwrap(); + return Box::new(batch_iter); + } + ("cursor", Some(tls_conn)) => { + let source = PostgresSource::::new( + config, + tls_conn, + queries.len(), + ) + .unwrap(); + let batch_iter = + 
ArrowBatchIter::< + _, + PostgresArrowStreamTransport, + >::new(source, destination, origin_query, queries) + .unwrap(); + return Box::new(batch_iter); + } + ("cursor", None) => { + let source = + PostgresSource::::new(config, NoTls, queries.len()) + .unwrap(); + let batch_iter = ArrowBatchIter::< + _, + PostgresArrowStreamTransport, + >::new( + source, destination, origin_query, queries + ) + .unwrap(); + return Box::new(batch_iter); + } + _ => unimplemented!("{} protocol not supported", protocol), + } + } + #[cfg(feature = "src_mysql")] + SourceType::MySQL => match protocol { + "binary" => { + let source = + MySQLSource::::new(&source_conn.conn[..], queries.len()) + .unwrap(); + let batch_iter = + ArrowBatchIter::<_, MySQLArrowStreamTransport>::new( + source, + destination, + origin_query, + queries, + ) + .unwrap(); + return Box::new(batch_iter); + } + "text" => { + let source = + MySQLSource::::new(&source_conn.conn[..], queries.len()).unwrap(); + let batch_iter = ArrowBatchIter::<_, MySQLArrowStreamTransport>::new( + source, + destination, + origin_query, + queries, + ) + .unwrap(); + return Box::new(batch_iter); + } + _ => unimplemented!("{} protocol not supported", protocol), + }, + #[cfg(feature = "src_sqlite")] + SourceType::SQLite => { + // remove the first "sqlite://" manually since url.path is not correct for windows + let path = &source_conn.conn.as_str()[9..]; + let source = SQLiteSource::new(path, queries.len()).unwrap(); + let batch_iter = ArrowBatchIter::<_, SQLiteArrowStreamTransport>::new( + source, + destination, + origin_query, + queries, + ) + .unwrap(); + return Box::new(batch_iter); + } + #[cfg(feature = "src_mssql")] + SourceType::MsSQL => { + let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime")); + let source = MsSQLSource::new(rt, &source_conn.conn[..], queries.len()).unwrap(); + let batch_iter = ArrowBatchIter::<_, MsSQLArrowStreamTransport>::new( + source, + destination, + origin_query, + queries, + ) + .unwrap(); + return Box::new(batch_iter); + } + #[cfg(feature = "src_oracle")] + SourceType::Oracle => { + let source = OracleSource::new(&source_conn.conn[..], queries.len()).unwrap(); + let batch_iter = ArrowBatchIter::<_, OracleArrowStreamTransport>::new( + source, + destination, + origin_query, + queries, + ) + .unwrap(); + return Box::new(batch_iter); + } + #[cfg(feature = "src_bigquery")] + SourceType::BigQuery => { + let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime")); + let source = BigQuerySource::new(rt, &source_conn.conn[..]).unwrap(); + let batch_iter = ArrowBatchIter::<_, BigQueryArrowStreamTransport>::new( + source, + destination, + origin_query, + queries, + ) + .unwrap(); + return Box::new(batch_iter); + } + _ => {} + } + panic!("not supported!"); +} diff --git a/connectorx/src/get_arrow2.rs b/connectorx/src/get_arrow2.rs new file mode 100644 index 0000000..98de815 --- /dev/null +++ b/connectorx/src/get_arrow2.rs @@ -0,0 +1,231 @@ +#[cfg(feature = "src_mysql")] +use crate::sources::mysql::{BinaryProtocol as MySQLBinaryProtocol, TextProtocol}; +#[cfg(feature = "src_postgres")] +use crate::sources::postgres::{ + rewrite_tls_args, BinaryProtocol as PgBinaryProtocol, CSVProtocol, CursorProtocol, + SimpleProtocol, +}; +use crate::{prelude::*, sql::CXQuery}; +use fehler::{throw, throws}; +use log::debug; +#[cfg(feature = "src_postgres")] +use postgres::NoTls; +#[cfg(feature = "src_postgres")] +use postgres_openssl::MakeTlsConnector; +#[allow(unused_imports)] +use std::sync::Arc; + 
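+// Note on protocol dispatch (applies to `get_arrow2` below exactly as to
+// `get_arrow`): the `protocol` string matched by each arm comes from the
+// `cxprotocol` query parameter of the connection URL; `SourceConn::try_from`
+// strips that parameter out and stores it in `source_conn.proto`, defaulting
+// to "binary". A minimal sketch, with a hypothetical Postgres URL (requires
+// `use std::convert::TryFrom`):
+//
+//     let mut source_conn =
+//         SourceConn::try_from("postgresql://user:pass@localhost:5432/db?cxprotocol=csv")?;
+//     assert_eq!(source_conn.proto, "csv");
+//     source_conn.set_protocol("binary"); // or override it after parsing
+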
+#[allow(unreachable_code, unreachable_patterns, unused_variables, unused_mut)] +#[throws(ConnectorXOutError)] +pub fn get_arrow2( + source_conn: &SourceConn, + origin_query: Option, + queries: &[CXQuery], +) -> Arrow2Destination { + let mut destination = Arrow2Destination::new(); + let protocol = source_conn.proto.as_str(); + debug!("Protocol: {}", protocol); + + match source_conn.ty { + #[cfg(feature = "src_postgres")] + SourceType::Postgres => { + let (config, tls) = rewrite_tls_args(&source_conn.conn)?; + match (protocol, tls) { + ("csv", Some(tls_conn)) => { + let sb = PostgresSource::::new( + config, + tls_conn, + queries.len(), + )?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrow2Transport, + >::new( + sb, &mut destination, queries, origin_query + ); + dispatcher.run()?; + } + ("csv", None) => { + let sb = + PostgresSource::::new(config, NoTls, queries.len())?; + let dispatcher = + Dispatcher::<_, _, PostgresArrow2Transport>::new( + sb, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + ("binary", Some(tls_conn)) => { + let sb = PostgresSource::::new( + config, + tls_conn, + queries.len(), + )?; + let dispatcher = + Dispatcher::< + _, + _, + PostgresArrow2Transport, + >::new(sb, &mut destination, queries, origin_query); + dispatcher.run()?; + } + ("binary", None) => { + let sb = PostgresSource::::new( + config, + NoTls, + queries.len(), + )?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrow2Transport, + >::new( + sb, &mut destination, queries, origin_query + ); + dispatcher.run()?; + } + ("cursor", Some(tls_conn)) => { + let sb = PostgresSource::::new( + config, + tls_conn, + queries.len(), + )?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrow2Transport, + >::new( + sb, &mut destination, queries, origin_query + ); + dispatcher.run()?; + } + ("cursor", None) => { + let sb = + PostgresSource::::new(config, NoTls, queries.len())?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrow2Transport, + >::new( + sb, &mut destination, queries, origin_query + ); + dispatcher.run()?; + } + ("simple", Some(tls_conn)) => { + let sb = PostgresSource::::new( + config, + tls_conn, + queries.len(), + )?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrow2Transport, + >::new( + sb, &mut destination, queries, origin_query + ); + debug!("Running dispatcher"); + dispatcher.run()?; + } + ("simple", None) => { + let sb = + PostgresSource::::new(config, NoTls, queries.len())?; + let dispatcher = Dispatcher::< + _, + _, + PostgresArrow2Transport, + >::new( + sb, &mut destination, queries, origin_query + ); + debug!("Running dispatcher"); + dispatcher.run()?; + } + + _ => unimplemented!("{} protocol not supported", protocol), + } + } + #[cfg(feature = "src_mysql")] + SourceType::MySQL => match protocol { + "binary" => { + let source = + MySQLSource::::new(&source_conn.conn[..], queries.len())?; + let dispatcher = Dispatcher::<_, _, MySQLArrow2Transport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + "text" => { + let source = + MySQLSource::::new(&source_conn.conn[..], queries.len())?; + let dispatcher = Dispatcher::<_, _, MySQLArrow2Transport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + _ => unimplemented!("{} protocol not supported", protocol), + }, + #[cfg(feature = "src_sqlite")] + SourceType::SQLite => { + // remove the first "sqlite://" manually since url.path is not correct for windows + let path = &source_conn.conn.as_str()[9..]; + 
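+            // A quick illustration of the slice above ("sqlite://" is 9 bytes;
+            // the path below is a hypothetical example):
+            //
+            //     let conn = "sqlite:///home/user/test.db";
+            //     assert_eq!(&conn[9..], "/home/user/test.db"); // what SQLiteSource::new receives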
let source = SQLiteSource::new(path, queries.len())?; + let dispatcher = Dispatcher::<_, _, SQLiteArrow2Transport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + #[cfg(feature = "src_mssql")] + SourceType::MsSQL => { + let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime")); + let source = MsSQLSource::new(rt, &source_conn.conn[..], queries.len())?; + let dispatcher = Dispatcher::<_, _, MsSQLArrow2Transport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + #[cfg(feature = "src_oracle")] + SourceType::Oracle => { + let source = OracleSource::new(&source_conn.conn[..], queries.len())?; + let dispatcher = Dispatcher::<_, _, OracleArrow2Transport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + #[cfg(feature = "src_bigquery")] + SourceType::BigQuery => { + let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create runtime")); + let source = BigQuerySource::new(rt, &source_conn.conn[..])?; + let dispatcher = Dispatcher::<_, _, BigQueryArrow2Transport>::new( + source, + &mut destination, + queries, + origin_query, + ); + dispatcher.run()?; + } + _ => throw!(ConnectorXOutError::SourceNotSupport(format!( + "{:?}", + source_conn.ty + ))), + } + + destination +} diff --git a/connectorx/src/lib.rs b/connectorx/src/lib.rs new file mode 100644 index 0000000..84b043b --- /dev/null +++ b/connectorx/src/lib.rs @@ -0,0 +1,218 @@ +#![allow(clippy::upper_case_acronyms)] + +//! # ConnectorX +//! +//! ConnectorX enables you to load data from databases into dataframes in the fastest and most memory efficient way by leveraging +//! zero-copy and partition-based parallelism. +//! +//! Currently, ConnectorX consists of a Rust core library and a python library. This is the documentation for the Rust crate. +//! For the documentation of the Python library, please refer to our [Github Readme](https://github.com/sfu-db/connector-x). +//! +//! # Design +//! +//! A data loading problem consists of three sub-problems: +//! 1. How to connect to the data source and read data. +//! 2. How to connect to the data destination and write data. +//! 3. How to map the types between the source and destination. +//! +//! Additionally, since ConnectorX will partition a query into partitions and execute them in parallel, we also have +//! 4. How to partition the query and run them in parallel. +//! +//! ConnectorX approaches these problems by defining abstractions on sources, destinations, and mapping rules. +//! For the partition-based parallelism, ConnectorX will partition the query as well as the source and the destination +//! together and put them into threads. +//! Each thread will own exactly 1 query, 1 partitioned source, and 1 partitioned destination. +//! +//! The following graph depicts the internal mechanism when ConnectorX is downloading the data. +//! +//! ```text +//! +------------------------------------------------------------+ +//! | Thread 1 | +//! | | +//! +---+ | +-----------------+ +-------------+ +-----------------+ | +---+ +//! | +-----------+>| Partitioned Src +-->| Type Mapper +->| Partitioned Dst +-+--------->| | +//! | | | +-----------------+ +-------------+ +-----------------+ | | | +//! | D | | | | D | +//! | a | +------------------------------------------------------------+ | a | +//! | t | . | t | +//! | a | . | a | +//! | b | . | f | +//! | a | +------------------------------------------------------------+ | r | +//! 
| s |   |                          Thread n                          |   | a |
+//! | e |   |                                                            |   | m |
+//! |   |   | +-----------------+   +-------------+   +-----------------+ |   | e |
+//! |   +-----------+>| Partitioned Src +-->| Type Mapper +->| Partitioned Dst +-+--------->|   |
+//! +---+   | +-----------------+   +-------------+   +-----------------+ |   +---+
+//!         |                                                            |
+//!         +------------------------------------------------------------+
+//!
+//! ```
+//! ## How does ConnectorX download the data?
+//!
+//! Upon receiving the query, e.g. `SELECT * FROM lineitem`, ConnectorX will first issue a `LIMIT 1` query `SELECT * FROM lineitem LIMIT 1` to get the schema of the result set.
+//!
+//! Then, if `partition_on` is specified, ConnectorX will issue `SELECT MIN($partition_on), MAX($partition_on) FROM (SELECT * FROM lineitem)` to get the range of the partition column.
+//! After that, the original query is split into partitions based on the min/max information, e.g. `SELECT * FROM (SELECT * FROM lineitem) WHERE $partition_on > 0 AND $partition_on < 10000`.
+//! ConnectorX will then run a count query to get the size of each partition (e.g. `SELECT COUNT(*) FROM (SELECT * FROM lineitem) WHERE $partition_on > 0 AND $partition_on < 10000`).
+//! If `partition_on` is not specified, the count query will be `SELECT COUNT(*) FROM (SELECT * FROM lineitem)`.
+//!
+//! Finally, ConnectorX will use the schema info as well as the count info to allocate memory and download data by executing the queries normally.
+//! Once the downloading begins, there will be one thread for each partition so that the data are downloaded in parallel at the partition level.
+//! Each thread will issue the query of the corresponding partition to the database and then write the returned data to the destination row-wise or column-wise (depending on the database) in a streaming fashion.
+//! This mechanism implies that having an index on the partition column is recommended to make full use of the parallel downloading power provided by ConnectorX.
+//!
+//! # Extending ConnectorX
+//! ## Adding a new source
+//!
+//! To add a new data source, you need to implement [`sources::Source`], [`sources::SourcePartition`], [`sources::PartitionParser`], and [`sources::Produce`] for the source.
+//! In detail, [`sources::Source`] describes how to connect to the database from a connection string, as well as how to partition the source to produce a list of [`sources::SourcePartition`].
+//! [`sources::SourcePartition`] describes how to get the row count for the specific partition so that the destination can preallocate the memory.
+//! Finally, [`sources::PartitionParser`] and [`sources::Produce`] abstract away the details of how each partition parses different types.
+//!
+//! ## Adding a new destination
+//!
+//! To add a new data destination, you need to implement [`destinations::Destination`], [`destinations::DestinationPartition`], and [`destinations::Consume`]. Similar to the sources,
+//! [`destinations::Destination`] describes how to allocate the memory of the data destination, as well as how to partition the destination to produce a list of [`destinations::DestinationPartition`].
+//! [`destinations::DestinationPartition`] and [`destinations::Consume`] abstract away the details of how each partition writes different types.
+//!
+//! ## Adding a new transport (type mapping)
+//!
+//! After having a source and a destination that describe how to read and write the data,
+//! ConnectorX also needs to know how to convert values of different types from the source to the destination.
+//! For example, Postgres can produce a `uuid` type but there is no uuid in Arrow. It is the transport's duty to convert
+//! the `uuid` into an Arrow-compatible type, e.g. string. You can use the [`impl_transport!`] macro to define a transport.
+//!
+//! ## Putting things together
+//!
+//! Say you decide to load data from SQLite to Arrow. ConnectorX already provides the SQLite source [`sources::sqlite::SQLiteSource`], the
+//! Arrow destination [`destinations::arrow::ArrowDestination`], and the transport [`transports::SQLiteArrowTransport`] between them.
+//! Given the source, destination and transport already implemented, you can use [`dispatcher::Dispatcher`] to load the data:
+//!
+//! ```no_run
+//! use connectorx::prelude::*;
+//!
+//! let mut destination = ArrowDestination::new();
+//! let source = SQLiteSource::new("/path/to/db", 10).expect("cannot create the source");
+//! let queries = &[CXQuery::from("SELECT * FROM db WHERE id < 100"), CXQuery::from("SELECT * FROM db WHERE id >= 100")];
+//! let dispatcher = Dispatcher::<_, _, SQLiteArrowTransport>::new(source, &mut destination, queries, None);
+//! dispatcher.run().expect("run failed");
+//!
+//! let data = destination.arrow();
+//! ```
+//!
+//! Or, more simply, you can directly use [`get_arrow::get_arrow`] or [`get_arrow2::get_arrow2`], which wrap the above procedure:
+//!
+//! ```no_run
+//! use connectorx::prelude::*;
+//! use std::convert::TryFrom;
+//!
+//! let source_conn = SourceConn::try_from("postgresql://username:password@host:port/db?cxprotocol=binary").expect("parse conn str failed");
+//! let queries = &[CXQuery::from("SELECT * FROM table WHERE id < 100"), CXQuery::from("SELECT * FROM table WHERE id >= 100")];
+//! let destination = get_arrow(&source_conn, None, queries).expect("run failed");
+//!
+//! let data = destination.arrow();
+//! ```
+//!
+//! NOTE: the pool size parameter `nconn` used when initializing the source should be greater than or equal to the number of partitioned queries passed in later.
+//!
+//! ## Need more examples?
+//! You can use the existing implementations as examples:
+//! [MySQL source](https://github.com/sfu-db/connector-x/tree/main/connectorx/src/sources/mysql),
+//! [Arrow destination](https://github.com/sfu-db/connector-x/tree/main/connectorx/src/destinations/arrow),
+//! [MySQL to Arrow transport](https://github.com/sfu-db/connector-x/blob/main/connectorx/src/transports/mysql_arrow.rs).
+//!
+//! # Source protocols & destinations implemented in the Rust core
+//!
+//! ## Sources
+//! - [x] Postgres
+//! - [x] MySQL
+//! - [x] SQLite
+//! - [x] SQL Server
+//! - [x] Oracle
+//! - [x] BigQuery
+//!
+//! ## Destinations
+//! - [x] Arrow
+//! - [x] Arrow2
+//!
+//! # Feature gates
+//! By default, ConnectorX does not enable any sources / destinations to keep the dependencies minimal.
+//! Instead, we provide the following features for you to opt in: `src_sqlite`, `src_postgres`, `src_mysql`, `src_mssql`, `src_oracle`, `dst_arrow`, `dst_arrow2`.
+//! For example, if you'd like to load data from Postgres to Arrow, you can enable `src_postgres` and `dst_arrow` in `Cargo.toml`.
+//! This will enable [`sources::postgres`], [`destinations::arrow`] and [`transports::PostgresArrowTransport`].
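+//!
+//! ## Partitioning helper
+//!
+//! Instead of writing the partitioned queries by hand as above, you can derive them
+//! from a single query with the helpers in [`partition`]. A minimal sketch (the
+//! connection string, table, and partition column are placeholders):
+//!
+//! ```no_run
+//! use connectorx::prelude::*;
+//! use connectorx::partition::{partition, PartitionQuery};
+//! use std::convert::TryFrom;
+//!
+//! let source_conn = SourceConn::try_from("postgresql://username:password@host:port/db").expect("parse conn str failed");
+//! // Split the query into 4 ranges over `id`; leaving min/max as None makes
+//! // ConnectorX discover them with an extra MIN/MAX probe query.
+//! let part = PartitionQuery::new("SELECT * FROM lineitem", "id", None, None, 4);
+//! let queries = partition(&part, &source_conn).expect("partition failed");
+//! let destination = get_arrow(&source_conn, None, &queries).expect("run failed");
+//! ```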
+ +pub mod typesystem; +#[macro_use] +mod macros; +#[cfg(feature = "dst_arrow")] +pub mod arrow_batch_iter; +pub mod constants; +pub mod data_order; +pub mod destinations; +mod dispatcher; +pub mod errors; +#[cfg(feature = "fed_exec")] +pub mod fed_dispatcher; +#[cfg(feature = "federation")] +pub mod fed_rewriter; +#[cfg(feature = "dst_arrow")] +pub mod get_arrow; +#[cfg(feature = "dst_arrow2")] +pub mod get_arrow2; +pub mod partition; +pub mod source_router; +pub mod sources; +#[doc(hidden)] +pub mod sql; +pub mod transports; +#[doc(hidden)] +pub mod utils; + +pub mod prelude { + #[cfg(feature = "dst_arrow")] + pub use crate::arrow_batch_iter::{set_global_num_thread, RecordBatchIterator}; + pub use crate::data_order::{coordinate, DataOrder}; + #[cfg(feature = "dst_arrow")] + pub use crate::destinations::arrow::{ArrowDestination, ArrowPartitionWriter, ArrowTypeSystem}; + #[cfg(feature = "dst_arrow2")] + pub use crate::destinations::arrow2::Arrow2Destination; + #[cfg(feature = "dst_arrow")] + pub use crate::destinations::arrowstream::{ + ArrowDestination as ArrowStreamDestination, + ArrowPartitionWriter as ArrowStreamPartitionWriter, + ArrowTypeSystem as ArrowStreamTypeSystem, + }; + pub use crate::destinations::{Consume, Destination, DestinationPartition}; + pub use crate::dispatcher::Dispatcher; + pub use crate::errors::{ConnectorXError, ConnectorXOutError}; + #[cfg(feature = "federation")] + pub use crate::fed_rewriter::{rewrite_sql, FederatedDataSourceInfo, Plan}; + #[cfg(feature = "dst_arrow")] + pub use crate::get_arrow::{get_arrow, new_record_batch_iter}; + #[cfg(feature = "dst_arrow2")] + pub use crate::get_arrow2::get_arrow2; + pub use crate::source_router::*; + #[cfg(feature = "src_bigquery")] + pub use crate::sources::bigquery::BigQuerySource; + #[cfg(feature = "src_csv")] + pub use crate::sources::csv::CSVSource; + #[cfg(feature = "src_dummy")] + pub use crate::sources::dummy::DummySource; + #[cfg(feature = "src_mssql")] + pub use crate::sources::mssql::MsSQLSource; + #[cfg(feature = "src_mysql")] + pub use crate::sources::mysql::MySQLSource; + #[cfg(feature = "src_oracle")] + pub use crate::sources::oracle::OracleSource; + #[cfg(feature = "src_postgres")] + pub use crate::sources::postgres::PostgresSource; + #[cfg(feature = "src_sqlite")] + pub use crate::sources::sqlite::SQLiteSource; + pub use crate::sources::{PartitionParser, Produce, Source, SourcePartition}; + pub use crate::sql::CXQuery; + pub use crate::transports::*; + pub use crate::typesystem::{ + ParameterizedFunc, ParameterizedOn, Realize, Transport, TypeAssoc, TypeConversion, + TypeSystem, + }; +} diff --git a/connectorx/src/macros.rs b/connectorx/src/macros.rs new file mode 100644 index 0000000..3d80d67 --- /dev/null +++ b/connectorx/src/macros.rs @@ -0,0 +1,321 @@ +/// Associate physical representations to a typesystem. +/// +/// # Example Usage +/// ```ignore +/// pub enum ArrowTypeSystem { +/// Int32(bool), +/// Int64(bool), +/// UInt32(bool), +/// UInt64(bool), +/// Float32(bool), +/// Float64(bool), +/// Boolean(bool), +/// LargeUtf8(bool), +/// LargeBinary(bool), +/// Date32(bool), +/// Date64(bool), +/// Time64(bool), +/// DateTimeTz(bool), +/// } +/// +/// impl_typesystem! 
{ +/// system = ArrowTypeSystem, +/// mappings = { +/// { Int32 => i32 } +/// { Int64 => i64 } +/// { UInt32 => u32 } +/// { UInt64 => u64 } +/// { Float64 => f64 } +/// { Float32 => f32 } +/// { Boolean => bool } +/// { LargeUtf8 => String } +/// { LargeBinary => Vec } +/// { Date32 => NaiveDate } +/// { Date64 => NaiveDateTime } +/// { Time64 => NaiveTime } +/// { DateTimeTz => DateTime } +/// } +/// } +/// ``` +/// This means for the type system `ArrowTypeSystem`, it's variant `ArrowTypeSystem::Int32(false)` is corresponding to the physical type `i32` and +/// `ArrowTypeSystem::Int32(true)` is corresponding to the physical type `Option`. +#[macro_export] +macro_rules! impl_typesystem { + ( + system = $TS:tt, + mappings = { + $( + { $($V:tt)|+ => $NT:ty } + )* + } + ) => { + impl $crate::typesystem::TypeSystem for $TS {} + + $( + impl_typesystem!(@typeassoc $TS [$($V)+], $NT); + )+ + + impl_typesystem!(@realize $TS $([ [$($V)+] => $NT ])+ ); + }; + + (@typeassoc $TS:tt [$($V:tt)+], $NT:ty) => { + impl<'r> $crate::typesystem::TypeAssoc<$TS> for $NT { + fn check(ts: $TS) -> $crate::errors::Result<()> { + match ts { + $( + $TS::$V(false) => Ok(()), + )+ + _ => fehler::throw!($crate::errors::ConnectorXError::TypeCheckFailed(format!("{:?}", ts), std::any::type_name::<$NT>())) + } + } + } + + impl<'r> $crate::typesystem::TypeAssoc<$TS> for Option<$NT> { + fn check(ts: $TS) -> $crate::errors::Result<()> { + match ts { + $( + $TS::$V(true) => Ok(()), + )+ + _ => fehler::throw!($crate::errors::ConnectorXError::TypeCheckFailed(format!("{:?}", ts), std::any::type_name::<$NT>())) + } + } + } + }; + + (@realize $TS:tt $([ [$($V:tt)+] => $NT:ty ])+) => { + impl<'r, F> $crate::typesystem::Realize for $TS + where + F: $crate::typesystem::ParameterizedFunc, + $(F: $crate::typesystem::ParameterizedOn<$NT>,)+ + $(F: $crate::typesystem::ParameterizedOn>,)+ + { + fn realize(self) -> $crate::errors::Result { + match self { + $( + $( + $TS::$V(false) => Ok(F::realize::<$NT>()), + )+ + $( + $TS::$V(true) => Ok(F::realize::>()), + )+ + )+ + } + } + } + }; +} + +/// A macro to help define a Transport. 
+/// +/// # Example Usage +/// ```ignore +/// impl_transport!( +/// name = MsSQLArrowTransport, +/// error = MsSQLArrowTransportError, +/// systems = MsSQLTypeSystem => ArrowTypeSystem, +/// route = MsSQLSource => ArrowDestination, +/// mappings = { +/// { Tinyint[u8] => Int32[i32] | conversion auto } +/// { Smallint[i16] => Int32[i32] | conversion auto } +/// { Int[i32] => Int32[i32] | conversion auto } +/// { Bigint[i64] => Int64[i64] | conversion auto } +/// { Intn[IntN] => Int64[i64] | conversion option } +/// { Float24[f32] => Float32[f32] | conversion auto } +/// { Float53[f64] => Float64[f64] | conversion auto } +/// { Floatn[FloatN] => Float64[f64] | conversion option } +/// { Bit[bool] => Boolean[bool] | conversion auto } +/// { Nvarchar[&'r str] => LargeUtf8[String] | conversion owned } +/// { Varchar[&'r str] => LargeUtf8[String] | conversion none } +/// { Nchar[&'r str] => LargeUtf8[String] | conversion none } +/// { Char[&'r str] => LargeUtf8[String] | conversion none } +/// { Text[&'r str] => LargeUtf8[String] | conversion none } +/// { Ntext[&'r str] => LargeUtf8[String] | conversion none } +/// { Binary[&'r [u8]] => LargeBinary[Vec] | conversion owned } +/// { Varbinary[&'r [u8]] => LargeBinary[Vec] | conversion none } +/// { Image[&'r [u8]] => LargeBinary[Vec] | conversion none } +/// { Numeric[Decimal] => Float64[f64] | conversion option } +/// { Decimal[Decimal] => Float64[f64] | conversion none } +/// { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } +/// { Datetime2[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } +/// { Smalldatetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } +/// { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } +/// { Datetimeoffset[DateTime] => DateTimeTz[DateTime] | conversion auto } +/// { Uniqueidentifier[Uuid] => LargeUtf8[String] | conversion option } +/// } +/// ); +/// ``` +/// This implements a `Transport` called `MsSQLArrowTransport` that can convert types from MsSQL to Arrow. +#[macro_export] +macro_rules! 
impl_transport { + ( + name = $TP:ty, + error = $ET:ty, + systems = $TSS:tt => $TSD:tt, + route = $S:ty => $D:ty, + mappings = { + $( + { $($TOKENS:tt)+ } + )* + } + ) => { + $( + impl_transport!(@cvt $TP, $($TOKENS)+); + )* + + impl_transport!(@transport $TP, $ET [$TSS, $TSD] [$S, $D] $([ $($TOKENS)+ ])*); + }; + + // transport + (@transport $TP:ty, $ET:ty [$TSS:tt, $TSD:tt] [$S:ty, $D:ty] $([ $($TOKENS:tt)+ ])*) => { + impl <'tp> $crate::typesystem::Transport for $TP { + type TSS = $TSS; + type TSD = $TSD; + type S = $S; + type D = $D; + type Error = $ET; + + impl_transport!(@cvtts [$TSS, $TSD] $([ $($TOKENS)+ ])*); + impl_transport!(@process [$TSS, $TSD] $([ $($TOKENS)+ ])*); + impl_transport!(@processor [$TSS, $TSD] $([ $($TOKENS)+ ])*, $([ $($TOKENS)+ ])*); + } + }; + + (@cvtts [$TSS:tt, $TSD:tt] $( [$V1:tt [$T1:ty] => $V2:tt [$T2:ty] | conversion $HOW:ident] )*) => { + fn convert_typesystem(ts: Self::TSS) -> $crate::errors::Result { + match ts { + $( + $TSS::$V1(true) => Ok($TSD::$V2(true)), + $TSS::$V1(false) => Ok($TSD::$V2(false)), + )* + #[allow(unreachable_patterns)] + _ => fehler::throw!($crate::errors::ConnectorXError::NoConversionRule( + format!("{:?}", ts), format!("{}", std::any::type_name::()) + )) + } + } + }; + + (@process [$TSS:tt, $TSD:tt] $([ $V1:tt [$T1:ty] => $V2:tt [$T2:ty] | conversion $HOW:ident ])*) => { + fn process<'s, 'd, 'r>( + ts1: Self::TSS, + ts2: Self::TSD, + src: &'r mut <::Partition as $crate::sources::SourcePartition>::Parser<'s>, + dst: &'r mut ::Partition<'d>, + ) -> Result<(), Self::Error> where Self: 'd { + match (ts1, ts2) { + $( + ($TSS::$V1(true), $TSD::$V2(true)) => { + let val: Option<$T1> = $crate::sources::PartitionParser::parse(src)?; + let val: Option<$T2> = , _>>::convert(val); + $crate::destinations::DestinationPartition::write(dst, val)?; + Ok(()) + } + + ($TSS::$V1(false), $TSD::$V2(false)) => { + let val: $T1 = $crate::sources::PartitionParser::parse(src)?; + let val: $T2 = >::convert(val); + $crate::destinations::DestinationPartition::write(dst, val)?; + Ok(()) + } + )* + #[allow(unreachable_patterns)] + _ => fehler::throw!($crate::errors::ConnectorXError::NoConversionRule( + format!("{:?}", ts1), format!("{:?}", ts1)) + ) + } + + } + }; + + (@processor [$TSS:tt, $TSD:tt] $([ $V1:tt [$T1:ty] => $V2:tt [$T2:ty] | conversion $HOW:ident ])*, $([ $($TOKENS:tt)+ ])*) => { + fn processor<'s, 'd>( + ts1: Self::TSS, + ts2: Self::TSD, + ) -> $crate::errors::Result< + fn( + src: &mut <::Partition as $crate::sources::SourcePartition>::Parser<'s>, + dst: &mut ::Partition<'d>, + ) -> Result<(), Self::Error> + > where Self: 'd { + match (ts1, ts2) { + $( + ($TSS::$V1(true), $TSD::$V2(true)) => { + impl_transport!(@process_func_branch true [ $($TOKENS)+ ]) + } + + ($TSS::$V1(false), $TSD::$V2(false)) => { + impl_transport!(@process_func_branch false [ $($TOKENS)+ ]) + } + )* + #[allow(unreachable_patterns)] + _ => fehler::throw!($crate::errors::ConnectorXError::NoConversionRule( + format!("{:?}", ts1), format!("{:?}", ts1)) + ) + } + + } + }; + + (@process_func_branch $OPT:ident [ $V1:tt [&$L1:lifetime $T1:ty] => $V2:tt [&$L2:lifetime $T2:ty] | conversion $HOW:ident ]) => { + impl_transport!(@process_func_branch $OPT &$T1, &$T2) + }; + (@process_func_branch $OPT:ident [ $V1:tt [$T1:ty] => $V2:tt [&$L2:lifetime $T2:ty] | conversion $HOW:ident ]) => { + impl_transport!(@process_func_branch $OPT $T1, &$T2) + }; + (@process_func_branch $OPT:ident [ $V1:tt [&$L1:lifetime $T1:ty] => $V2:tt [$T2:ty] | conversion $HOW:ident ]) => { + 
impl_transport!(@process_func_branch $OPT &$T1, $T2) + }; + (@process_func_branch $OPT:ident [ $V1:tt [$T1:ty] => $V2:tt [$T2:ty] | conversion $HOW:ident ]) => { + impl_transport!(@process_func_branch $OPT $T1, $T2) + }; + (@process_func_branch true $T1:ty, $T2:ty) => { + Ok( + |s: &mut _, d: &mut _| $crate::typesystem::process::, Option<$T2>, Self, Self::S, Self::D, ::Error, ::Error, Self::Error>(s, d) + ) + }; + (@process_func_branch false $T1:ty, $T2:ty) => { + Ok( + |s: &mut _, d: &mut _| $crate::typesystem::process::<$T1, $T2, Self, Self::S, Self::D, ::Error, ::Error, Self::Error>(s, d) + ) + }; + + // TypeConversion + (@cvt $TP:ty, $V1:tt [$T1:ty] => $V2:tt [$T2:ty] | conversion $HOW:ident) => { + impl_transport!(@cvt $HOW $TP, $T1, $T2); + }; + (@cvt auto $TP:ty, $T1:ty, $T2:ty) => { + impl<'tp, 'r> $crate::typesystem::TypeConversion<$T1, $T2> for $TP { + fn convert(val: $T1) -> $T2 { + val as _ + } + } + + impl_transport!(@cvt option $TP, $T1, $T2); + }; + (@cvt auto_vec $TP:ty, $T1:ty, $T2:ty) => { + impl<'tp, 'r> $crate::typesystem::TypeConversion<$T1, $T2> for $TP { + fn convert(val: $T1) -> $T2 { + val.into_iter().map(|v| v as _).collect() + } + } + + impl_transport!(@cvt option $TP, $T1, $T2); + }; + (@cvt owned $TP:ty, $T1:ty, $T2:ty) => { + impl<'tp, 'r> $crate::typesystem::TypeConversion<$T1, $T2> for $TP { + fn convert(val: $T1) -> $T2 { + val.to_owned() + } + } + + impl_transport!(@cvt option $TP, $T1, $T2); + }; + (@cvt option $TP:ty, $T1:ty, $T2:ty) => { + impl<'tp, 'r> $crate::typesystem::TypeConversion, Option<$T2>> for $TP { + fn convert(val: Option<$T1>) -> Option<$T2> { + val.map(Self::convert) + } + } + }; + (@cvt none $TP:ty, $T1:ty, $T2:ty) => {}; +} diff --git a/connectorx/src/partition.rs b/connectorx/src/partition.rs new file mode 100644 index 0000000..370120e --- /dev/null +++ b/connectorx/src/partition.rs @@ -0,0 +1,483 @@ +use crate::errors::{ConnectorXOutError, OutResult}; +use crate::source_router::{SourceConn, SourceType}; +#[cfg(feature = "src_bigquery")] +use crate::sources::bigquery::BigQueryDialect; +#[cfg(feature = "src_mssql")] +use crate::sources::mssql::{mssql_config, FloatN, IntN, MsSQLTypeSystem}; +#[cfg(feature = "src_mysql")] +use crate::sources::mysql::{MySQLSourceError, MySQLTypeSystem}; +#[cfg(feature = "src_oracle")] +use crate::sources::oracle::{connect_oracle, OracleDialect}; +#[cfg(feature = "src_postgres")] +use crate::sources::postgres::{rewrite_tls_args, PostgresTypeSystem}; +#[cfg(feature = "src_sqlite")] +use crate::sql::get_partition_range_query_sep; +use crate::sql::{get_partition_range_query, single_col_partition_query, CXQuery}; +use anyhow::anyhow; +use fehler::{throw, throws}; +#[cfg(feature = "src_bigquery")] +use gcp_bigquery_client; +#[cfg(feature = "src_mysql")] +use r2d2_mysql::mysql::{prelude::Queryable, Opts, Pool, Row}; +#[cfg(feature = "src_sqlite")] +use rusqlite::{types::Type, Connection}; +#[cfg(feature = "src_postgres")] +use rust_decimal::{prelude::ToPrimitive, Decimal}; +#[cfg(feature = "src_postgres")] +use rust_decimal_macros::dec; +#[cfg(feature = "src_mssql")] +use sqlparser::dialect::MsSqlDialect; +#[cfg(feature = "src_mysql")] +use sqlparser::dialect::MySqlDialect; +#[cfg(feature = "src_postgres")] +use sqlparser::dialect::PostgreSqlDialect; +#[cfg(feature = "src_sqlite")] +use sqlparser::dialect::SQLiteDialect; +#[cfg(feature = "src_mssql")] +use tiberius::Client; +#[cfg(any(feature = "src_bigquery", feature = "src_mssql"))] +use tokio::{net::TcpStream, runtime::Runtime}; +#[cfg(feature = "src_mssql")] 
+use tokio_util::compat::TokioAsyncWriteCompatExt;
+use url::Url;
+
+pub struct PartitionQuery {
+    query: String,
+    column: String,
+    min: Option<i64>,
+    max: Option<i64>,
+    num: usize,
+}
+
+impl PartitionQuery {
+    pub fn new(query: &str, column: &str, min: Option<i64>, max: Option<i64>, num: usize) -> Self {
+        Self {
+            query: query.into(),
+            column: column.into(),
+            min,
+            max,
+            num,
+        }
+    }
+}
+
+pub fn partition(part: &PartitionQuery, source_conn: &SourceConn) -> OutResult<Vec<CXQuery<String>>> {
+    let mut queries = vec![];
+    let num = part.num as i64;
+    let (min, max) = match (part.min, part.max) {
+        (None, None) => get_col_range(source_conn, &part.query, &part.column)?,
+        (Some(min), Some(max)) => (min, max),
+        _ => throw!(anyhow!(
+            "partition_query range can not be partially specified",
+        )),
+    };
+
+    let partition_size = (max - min + 1) / num;
+
+    for i in 0..num {
+        let lower = min + i * partition_size;
+        let upper = match i == num - 1 {
+            true => max + 1,
+            false => min + (i + 1) * partition_size,
+        };
+        let partition_query = get_part_query(source_conn, &part.query, &part.column, lower, upper)?;
+        queries.push(partition_query);
+    }
+    Ok(queries)
+}
+
+pub fn get_col_range(source_conn: &SourceConn, query: &str, col: &str) -> OutResult<(i64, i64)> {
+    match source_conn.ty {
+        #[cfg(feature = "src_postgres")]
+        SourceType::Postgres => pg_get_partition_range(&source_conn.conn, query, col),
+        #[cfg(feature = "src_sqlite")]
+        SourceType::SQLite => sqlite_get_partition_range(&source_conn.conn, query, col),
+        #[cfg(feature = "src_mysql")]
+        SourceType::MySQL => mysql_get_partition_range(&source_conn.conn, query, col),
+        #[cfg(feature = "src_mssql")]
+        SourceType::MsSQL => mssql_get_partition_range(&source_conn.conn, query, col),
+        #[cfg(feature = "src_oracle")]
+        SourceType::Oracle => oracle_get_partition_range(&source_conn.conn, query, col),
+        #[cfg(feature = "src_bigquery")]
+        SourceType::BigQuery => bigquery_get_partition_range(&source_conn.conn, query, col),
+        _ => unimplemented!("{:?} not implemented!", source_conn.ty),
+    }
+}
+
+#[throws(ConnectorXOutError)]
+pub fn get_part_query(
+    source_conn: &SourceConn,
+    query: &str,
+    col: &str,
+    lower: i64,
+    upper: i64,
+) -> CXQuery {
+    let query = match source_conn.ty {
+        #[cfg(feature = "src_postgres")]
+        SourceType::Postgres => {
+            single_col_partition_query(query, col, lower, upper, &PostgreSqlDialect {})?
+        }
+        #[cfg(feature = "src_sqlite")]
+        SourceType::SQLite => {
+            single_col_partition_query(query, col, lower, upper, &SQLiteDialect {})?
+        }
+        #[cfg(feature = "src_mysql")]
+        SourceType::MySQL => {
+            single_col_partition_query(query, col, lower, upper, &MySqlDialect {})?
+        }
+        #[cfg(feature = "src_mssql")]
+        SourceType::MsSQL => {
+            single_col_partition_query(query, col, lower, upper, &MsSqlDialect {})?
+        }
+        #[cfg(feature = "src_oracle")]
+        SourceType::Oracle => {
+            single_col_partition_query(query, col, lower, upper, &OracleDialect {})?
+        }
+        #[cfg(feature = "src_bigquery")]
+        SourceType::BigQuery => {
+            single_col_partition_query(query, col, lower, upper, &BigQueryDialect {})?
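+            // Whatever the dialect, the wrapped query has the schematic shape
+            // described in the crate docs, e.g. for lower = 0 and upper = 10000
+            // (bounds shown schematically):
+            //
+            //     SELECT * FROM (<original query>) WHERE <col> >= 0 AND <col> < 10000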
+ } + _ => unimplemented!("{:?} not implemented!", source_conn.ty), + }; + CXQuery::Wrapped(query) +} + +#[cfg(feature = "src_postgres")] +#[throws(ConnectorXOutError)] +fn pg_get_partition_range(conn: &Url, query: &str, col: &str) -> (i64, i64) { + let (config, tls) = rewrite_tls_args(conn)?; + let mut client = match tls { + None => config.connect(postgres::NoTls)?, + Some(tls_conn) => config.connect(tls_conn)?, + }; + let range_query = get_partition_range_query(query, col, &PostgreSqlDialect {})?; + let row = client.query_one(range_query.as_str(), &[])?; + + let col_type = PostgresTypeSystem::from(row.columns()[0].type_()); + let (min_v, max_v) = match col_type { + PostgresTypeSystem::Int2(_) => { + let min_v: Option = row.get(0); + let max_v: Option = row.get(1); + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + PostgresTypeSystem::Int4(_) => { + let min_v: Option = row.get(0); + let max_v: Option = row.get(1); + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + PostgresTypeSystem::Int8(_) => { + let min_v: Option = row.get(0); + let max_v: Option = row.get(1); + (min_v.unwrap_or(0), max_v.unwrap_or(0)) + } + PostgresTypeSystem::Float4(_) => { + let min_v: Option = row.get(0); + let max_v: Option = row.get(1); + (min_v.unwrap_or(0.0) as i64, max_v.unwrap_or(0.0) as i64) + } + PostgresTypeSystem::Float8(_) => { + let min_v: Option = row.get(0); + let max_v: Option = row.get(1); + (min_v.unwrap_or(0.0) as i64, max_v.unwrap_or(0.0) as i64) + } + PostgresTypeSystem::Numeric(_) => { + let min_v: Option = row.get(0); + let max_v: Option = row.get(1); + ( + min_v.unwrap_or(dec!(0.0)).to_i64().unwrap_or(0), + max_v.unwrap_or(dec!(0.0)).to_i64().unwrap_or(0), + ) + } + _ => throw!(anyhow!( + "Partition can only be done on int or float columns" + )), + }; + + (min_v, max_v) +} + +#[cfg(feature = "src_sqlite")] +#[throws(ConnectorXOutError)] +fn sqlite_get_partition_range(conn: &Url, query: &str, col: &str) -> (i64, i64) { + // remove the first "sqlite://" manually since url.path is not correct for windows and for relative path + let conn = Connection::open(&conn.as_str()[9..])?; + // SQLite only optimize min max queries when there is only one aggregation + // https://www.sqlite.org/optoverview.html#minmax + let (min_query, max_query) = get_partition_range_query_sep(query, col, &SQLiteDialect {})?; + let mut error = None; + let min_v = conn.query_row(min_query.as_str(), [], |row| { + // declare type for count query will be None, only need to check the returned value type + let col_type = row.get_ref(0)?.data_type(); + match col_type { + Type::Integer => row.get(0), + Type::Real => { + let v: f64 = row.get(0)?; + Ok(v as i64) + } + Type::Null => Ok(0), + _ => { + error = Some(anyhow!("Partition can only be done on integer columns")); + Ok(0) + } + } + })?; + match error { + None => {} + Some(e) => throw!(e), + } + let max_v = conn.query_row(max_query.as_str(), [], |row| { + let col_type = row.get_ref(0)?.data_type(); + match col_type { + Type::Integer => row.get(0), + Type::Real => { + let v: f64 = row.get(0)?; + Ok(v as i64) + } + Type::Null => Ok(0), + _ => { + error = Some(anyhow!("Partition can only be done on integer columns")); + Ok(0) + } + } + })?; + match error { + None => {} + Some(e) => throw!(e), + } + + (min_v, max_v) +} + +#[cfg(feature = "src_mysql")] +#[throws(ConnectorXOutError)] +fn mysql_get_partition_range(conn: &Url, query: &str, col: &str) -> (i64, i64) { + let pool = Pool::new(Opts::from_url(conn.as_str()).map_err(MySQLSourceError::MySQLUrlError)?)?; + 
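+    // The probe below is the MIN/MAX range query described in the crate docs,
+    // schematically:
+    //
+    //     SELECT MIN(<col>), MAX(<col>) FROM (<original query>)
+    //
+    // Each numeric MySQL column type then gets its own branch below, because the
+    // row accessor is typed (e.g. Tiny reads an Option<i8>, ULongLong an
+    // Option<u64>) and every result is widened or cast to i64.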
let mut conn = pool.get_conn()?; + let range_query = get_partition_range_query(query, col, &MySqlDialect {})?; + let row: Row = conn + .query_first(range_query)? + .ok_or_else(|| anyhow!("mysql range: no row returns"))?; + + let col_type = + MySQLTypeSystem::from((&row.columns()[0].column_type(), &row.columns()[0].flags())); + + let (min_v, max_v) = match col_type { + MySQLTypeSystem::Tiny(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + MySQLTypeSystem::Short(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + MySQLTypeSystem::Int24(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + MySQLTypeSystem::Long(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0), max_v.unwrap_or(0)) + } + MySQLTypeSystem::LongLong(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0), max_v.unwrap_or(0)) + } + MySQLTypeSystem::UTiny(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + MySQLTypeSystem::UShort(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + MySQLTypeSystem::UInt24(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + MySQLTypeSystem::ULong(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + MySQLTypeSystem::ULongLong(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0) as i64, max_v.unwrap_or(0) as i64) + } + MySQLTypeSystem::Float(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + 
(min_v.unwrap_or(0.0) as i64, max_v.unwrap_or(0.0) as i64) + } + MySQLTypeSystem::Double(_) => { + let min_v: Option = row + .get(0) + .ok_or_else(|| anyhow!("mysql range: cannot get min value"))?; + let max_v: Option = row + .get(1) + .ok_or_else(|| anyhow!("mysql range: cannot get max value"))?; + (min_v.unwrap_or(0.0) as i64, max_v.unwrap_or(0.0) as i64) + } + _ => throw!(anyhow!("Partition can only be done on int columns")), + }; + + (min_v, max_v) +} + +#[cfg(feature = "src_mssql")] +#[throws(ConnectorXOutError)] +fn mssql_get_partition_range(conn: &Url, query: &str, col: &str) -> (i64, i64) { + let rt = Runtime::new().expect("Failed to create runtime"); + let config = mssql_config(conn)?; + let tcp = rt.block_on(TcpStream::connect(config.get_addr()))?; + tcp.set_nodelay(true)?; + + let mut client = rt.block_on(Client::connect(config, tcp.compat_write()))?; + + let range_query = get_partition_range_query(query, col, &MsSqlDialect {})?; + let query_result = rt.block_on(client.query(range_query.as_str(), &[]))?; + let row = rt.block_on(query_result.into_row())?.unwrap(); + + let col_type = MsSQLTypeSystem::from(&row.columns()[0].column_type()); + let (min_v, max_v) = match col_type { + MsSQLTypeSystem::Tinyint(_) => { + let min_v: u8 = row.get(0).unwrap_or(0); + let max_v: u8 = row.get(1).unwrap_or(0); + (min_v as i64, max_v as i64) + } + MsSQLTypeSystem::Smallint(_) => { + let min_v: i16 = row.get(0).unwrap_or(0); + let max_v: i16 = row.get(1).unwrap_or(0); + (min_v as i64, max_v as i64) + } + MsSQLTypeSystem::Int(_) => { + let min_v: i32 = row.get(0).unwrap_or(0); + let max_v: i32 = row.get(1).unwrap_or(0); + (min_v as i64, max_v as i64) + } + MsSQLTypeSystem::Bigint(_) => { + let min_v: i64 = row.get(0).unwrap_or(0); + let max_v: i64 = row.get(1).unwrap_or(0); + (min_v, max_v) + } + MsSQLTypeSystem::Intn(_) => { + let min_v: IntN = row.get(0).unwrap_or(IntN(0)); + let max_v: IntN = row.get(1).unwrap_or(IntN(0)); + (min_v.0, max_v.0) + } + MsSQLTypeSystem::Float24(_) => { + let min_v: f32 = row.get(0).unwrap_or(0.0); + let max_v: f32 = row.get(1).unwrap_or(0.0); + (min_v as i64, max_v as i64) + } + MsSQLTypeSystem::Float53(_) => { + let min_v: f64 = row.get(0).unwrap_or(0.0); + let max_v: f64 = row.get(1).unwrap_or(0.0); + (min_v as i64, max_v as i64) + } + MsSQLTypeSystem::Floatn(_) => { + let min_v: FloatN = row.get(0).unwrap_or(FloatN(0.0)); + let max_v: FloatN = row.get(1).unwrap_or(FloatN(0.0)); + (min_v.0 as i64, max_v.0 as i64) + } + _ => throw!(anyhow!( + "Partition can only be done on int or float columns" + )), + }; + + (min_v, max_v) +} + +#[cfg(feature = "src_oracle")] +#[throws(ConnectorXOutError)] +fn oracle_get_partition_range(conn: &Url, query: &str, col: &str) -> (i64, i64) { + let connector = connect_oracle(conn)?; + let conn = connector.connect()?; + let range_query = get_partition_range_query(query, col, &OracleDialect {})?; + let row = conn.query_row(range_query.as_str(), &[])?; + let min_v: i64 = row.get(0).unwrap_or(0); + let max_v: i64 = row.get(1).unwrap_or(0); + (min_v, max_v) +} + +#[cfg(feature = "src_bigquery")] +#[throws(ConnectorXOutError)] // TODO +fn bigquery_get_partition_range(conn: &Url, query: &str, col: &str) -> (i64, i64) { + let rt = Runtime::new().expect("Failed to create runtime"); + let url = Url::parse(conn.as_str())?; + let sa_key_path = url.path(); + let client = rt.block_on(gcp_bigquery_client::Client::from_service_account_key_file( + sa_key_path, + )); + + let auth_data = std::fs::read_to_string(sa_key_path)?; + let auth_json: 
serde_json::Value = serde_json::from_str(&auth_data)?; + let project_id = auth_json + .get("project_id") + .ok_or_else(|| anyhow!("Cannot get project_id from auth file"))? + .as_str() + .ok_or_else(|| anyhow!("Cannot get project_id as string from auth file"))?; + let range_query = get_partition_range_query(query, col, &BigQueryDialect {})?; + + let mut query_result = rt.block_on(client.job().query( + project_id, + gcp_bigquery_client::model::query_request::QueryRequest::new(range_query.as_str()), + ))?; + query_result.next_row(); + let min_v = query_result.get_i64(0)?.unwrap_or(0); + let max_v = query_result.get_i64(1)?.unwrap_or(0); + + (min_v, max_v) +} diff --git a/connectorx/src/source_router.rs b/connectorx/src/source_router.rs new file mode 100644 index 0000000..d307967 --- /dev/null +++ b/connectorx/src/source_router.rs @@ -0,0 +1,83 @@ +use crate::constants::CONNECTORX_PROTOCOL; +use crate::errors::{ConnectorXError, Result}; +use anyhow::anyhow; +use fehler::throws; +use std::convert::TryFrom; +use url::Url; + +#[derive(Debug, Clone)] +pub enum SourceType { + Postgres, + SQLite, + MySQL, + MsSQL, + Oracle, + BigQuery, + DuckDB, + Unknown, +} + +#[derive(Debug, Clone)] +pub struct SourceConn { + pub ty: SourceType, + pub conn: Url, + pub proto: String, +} + +impl TryFrom<&str> for SourceConn { + type Error = ConnectorXError; + + fn try_from(conn: &str) -> Result { + let old_url = Url::parse(conn).map_err(|e| anyhow!("parse error: {}", e))?; + + // parse connectorx protocol + let proto = match old_url.query_pairs().find(|p| p.0 == CONNECTORX_PROTOCOL) { + Some((_, proto)) => proto.to_owned().to_string(), + None => "binary".to_string(), + }; + + // create url by removing connectorx protocol + let stripped_query: Vec<(_, _)> = old_url + .query_pairs() + .filter(|p| &*p.0 != CONNECTORX_PROTOCOL) + .collect(); + let mut url = old_url.clone(); + url.set_query(None); + for pair in stripped_query { + url.query_pairs_mut() + .append_pair(&pair.0.to_string()[..], &pair.1.to_string()[..]); + } + + // users from sqlalchemy may set engine in connection url (e.g. mssql+pymssql://...) 
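+        // The scheme is split on '+' and only the first component is used for
+        // routing, so e.g. "mssql+pymssql://..." and "mssql://..." resolve to
+        // the same SourceType; a tiny sketch:
+        //
+        //     assert_eq!("mssql+pymssql".split('+').next(), Some("mssql"));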
+        // only for compatibility, we don't use the same engine
+        match url.scheme().split('+').collect::<Vec<&str>>()[0] {
+            "postgres" | "postgresql" => Ok(SourceConn::new(SourceType::Postgres, url, proto)),
+            "sqlite" => Ok(SourceConn::new(SourceType::SQLite, url, proto)),
+            "mysql" => Ok(SourceConn::new(SourceType::MySQL, url, proto)),
+            "mssql" => Ok(SourceConn::new(SourceType::MsSQL, url, proto)),
+            "oracle" => Ok(SourceConn::new(SourceType::Oracle, url, proto)),
+            "bigquery" => Ok(SourceConn::new(SourceType::BigQuery, url, proto)),
+            "duckdb" => Ok(SourceConn::new(SourceType::DuckDB, url, proto)),
+            _ => Ok(SourceConn::new(SourceType::Unknown, url, proto)),
+        }
+    }
+}
+
+impl SourceConn {
+    pub fn new(ty: SourceType, conn: Url, proto: String) -> Self {
+        Self { ty, conn, proto }
+    }
+    pub fn set_protocol(&mut self, protocol: &str) {
+        self.proto = protocol.to_string();
+    }
+}
+
+#[throws(ConnectorXError)]
+pub fn parse_source(conn: &str, protocol: Option<&str>) -> SourceConn {
+    let mut source_conn = SourceConn::try_from(conn)?;
+    match protocol {
+        Some(p) => source_conn.set_protocol(p),
+        None => {}
+    }
+    source_conn
+}
diff --git a/connectorx/src/sources/bigquery/errors.rs b/connectorx/src/sources/bigquery/errors.rs
new file mode 100644
index 0000000..262b4f0
--- /dev/null
+++ b/connectorx/src/sources/bigquery/errors.rs
@@ -0,0 +1,31 @@
+use gcp_bigquery_client::error::BQError;
+use thiserror::Error;
+use url;
+
+#[derive(Error, Debug)]
+pub enum BigQuerySourceError {
+    #[error(transparent)]
+    ConnectorXError(#[from] crate::errors::ConnectorXError),
+
+    #[error(transparent)]
+    BQError(#[from] BQError),
+
+    #[error(transparent)]
+    BigQueryUrlError(#[from] url::ParseError),
+
+    #[error(transparent)]
+    BigQueryStdError(#[from] std::io::Error),
+
+    #[error(transparent)]
+    BigQueryJsonError(#[from] serde_json::Error),
+
+    #[error(transparent)]
+    BigQueryParseFloatError(#[from] std::num::ParseFloatError),
+
+    #[error(transparent)]
+    BigQueryParseIntError(#[from] std::num::ParseIntError),
+
+    /// Any other errors that are too trivial to be put here explicitly.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
diff --git a/connectorx/src/sources/bigquery/mod.rs b/connectorx/src/sources/bigquery/mod.rs
new file mode 100644
index 0000000..d7bbe3d
--- /dev/null
+++ b/connectorx/src/sources/bigquery/mod.rs
@@ -0,0 +1,1143 @@
+//!
diff --git a/connectorx/src/sources/bigquery/mod.rs b/connectorx/src/sources/bigquery/mod.rs
new file mode 100644
index 0000000..d7bbe3d
--- /dev/null
+++ b/connectorx/src/sources/bigquery/mod.rs
@@ -0,0 +1,1143 @@
+//! Source implementation for Google BigQuery
+
+mod errors;
+mod typesystem;
+
+pub use self::errors::BigQuerySourceError;
+use crate::{
+    data_order::DataOrder,
+    errors::ConnectorXError,
+    sources::{PartitionParser, Produce, Source, SourcePartition},
+    sql::{count_query, limit1_query, CXQuery},
+};
+use anyhow::anyhow;
+use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+use fehler::{throw, throws};
+use gcp_bigquery_client::{
+    model::{
+        get_query_results_parameters::GetQueryResultsParameters,
+        get_query_results_response::GetQueryResultsResponse, query_request::QueryRequest,
+    },
+    Client,
+};
+use sqlparser::dialect::Dialect;
+use std::sync::Arc;
+use tokio::runtime::Runtime;
+pub use typesystem::BigQueryTypeSystem;
+use url::Url;
+
+#[derive(Debug)]
+pub struct BigQueryDialect {}
+
+impl Dialect for BigQueryDialect {
+    // See https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical
+    fn is_delimited_identifier_start(&self, ch: char) -> bool {
+        ch == '`'
+    }
+
+    fn is_identifier_start(&self, ch: char) -> bool {
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_' || ch == '-'
+    }
+
+    fn is_identifier_part(&self, ch: char) -> bool {
+        self.is_identifier_start(ch) || ch.is_ascii_digit()
+    }
+}
+
+pub struct BigQuerySource {
+    rt: Arc<Runtime>,
+    client: Arc<Client>,
+    project_id: String,
+    origin_query: Option<String>,
+    queries: Vec<CXQuery<String>>,
+    names: Vec<String>,
+    schema: Vec<BigQueryTypeSystem>,
+}
+
+impl BigQuerySource {
+    #[throws(BigQuerySourceError)]
+    pub fn new(rt: Arc<Runtime>, conn: &str) -> Self {
+        let url = Url::parse(conn)?;
+        let sa_key_path = url.path();
+        let client = Arc::new(rt.block_on(
+            gcp_bigquery_client::Client::from_service_account_key_file(sa_key_path),
+        ));
+        let auth_data = std::fs::read_to_string(sa_key_path)?;
+        let auth_json: serde_json::Value = serde_json::from_str(&auth_data)?;
+        let project_id = auth_json
+            .get("project_id")
+            .ok_or_else(|| anyhow!("Cannot get project_id from auth file"))?
+            .as_str()
+            .ok_or_else(|| anyhow!("Cannot get project_id as string from auth file"))?
+            .to_string();
+        Self {
+            rt,
+            client,
+            project_id,
+            origin_query: None,
+            queries: vec![],
+            names: vec![],
+            schema: vec![],
+        }
+    }
+}
+
+impl Source for BigQuerySource
+where
+    BigQuerySourcePartition:
+        SourcePartition<TypeSystem = BigQueryTypeSystem, Error = BigQuerySourceError>,
+{
+    const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::RowMajor];
+    type Partition = BigQuerySourcePartition;
+    type TypeSystem = BigQueryTypeSystem;
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn set_data_order(&mut self, data_order: DataOrder) {
+        if !matches!(data_order, DataOrder::RowMajor) {
+            throw!(ConnectorXError::UnsupportedDataOrder(data_order));
+        }
+    }
+
+    fn set_queries<Q: ToString>(&mut self, queries: &[CXQuery<Q>]) {
+        self.queries = queries.iter().map(|q| q.map(Q::to_string)).collect();
+    }
+
+    fn set_origin_query(&mut self, query: Option<String>) {
+        self.origin_query = query;
+    }
+
+    #[throws(BigQuerySourceError)]
+    fn fetch_metadata(&mut self) {
+        assert!(!self.queries.is_empty());
+        let job = self.client.job();
+        for query in self.queries.iter() {
+            let l1query = limit1_query(query, &BigQueryDialect {})?;
+            let rs = self.rt.block_on(job.query(
+                self.project_id.as_str(),
+                QueryRequest::new(l1query.as_str()),
+            ))?;
+            let (names, types) = rs
+                .query_response()
+                .schema
+                .as_ref()
+                .ok_or_else(|| anyhow!("TableSchema is none"))?
+                .fields
+                .as_ref()
+                .ok_or_else(|| anyhow!("TableFieldSchema is none"))?
+                .iter()
+                .map(|col| {
+                    (
+                        col.clone().name,
+                        BigQueryTypeSystem::from(&col.clone().r#type),
+                    )
+                })
+                .unzip();
+            self.names = names;
+            self.schema = types;
+        }
+    }
+
+    #[throws(BigQuerySourceError)]
+    fn result_rows(&mut self) -> Option<usize> {
+        match &self.origin_query {
+            Some(q) => {
+                let cxq = CXQuery::Naked(q.clone());
+                let cquery = count_query(&cxq, &BigQueryDialect {})?;
+                let job = self.client.job();
+                let mut rs = self.rt.block_on(
+                    job.query(self.project_id.as_str(), QueryRequest::new(cquery.as_str())),
+                )?;
+                rs.next_row();
+                let nrows = rs
+                    .get_i64(0)?
+                    .ok_or_else(|| anyhow!("cannot get row number"))?;
+                Some(nrows as usize)
+            }
+            None => None,
+        }
+    }
+
+    fn names(&self) -> Vec<String> {
+        self.names.clone()
+    }
+
+    fn schema(&self) -> Vec<BigQueryTypeSystem> {
+        self.schema.clone()
+    }
+
+    #[throws(BigQuerySourceError)]
+    fn partition(self) -> Vec<Self::Partition> {
+        let mut ret = vec![];
+        for query in self.queries {
+            ret.push(BigQuerySourcePartition::new(
+                self.rt.clone(),
+                self.client.clone(),
+                self.project_id.clone(),
+                &query,
+                &self.schema,
+            ));
+        }
+        ret
+    }
+}
+
+pub struct BigQuerySourcePartition {
+    rt: Arc<Runtime>,
+    client: Arc<Client>,
+    project_id: String,
+    query: CXQuery<String>,
+    schema: Vec<BigQueryTypeSystem>,
+    nrows: usize,
+    ncols: usize,
+}
+
+impl BigQuerySourcePartition {
+    pub fn new(
+        handle: Arc<Runtime>,
+        client: Arc<Client>,
+        project_id: String,
+        query: &CXQuery<String>,
+        schema: &[BigQueryTypeSystem],
+    ) -> Self {
+        Self {
+            rt: handle,
+            client,
+            project_id: project_id.clone(),
+            query: query.clone(),
+            schema: schema.to_vec(),
+            nrows: 0,
+            ncols: schema.len(),
+        }
+    }
+}
+
+impl SourcePartition for BigQuerySourcePartition {
+    type TypeSystem = BigQueryTypeSystem;
+    type Parser<'a> = BigQuerySourceParser;
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn result_rows(&mut self) {
+        let cquery = count_query(&self.query, &BigQueryDialect {})?;
+        let job = self.client.job();
+        let mut rs = self
+            .rt
+            .block_on(job.query(self.project_id.as_str(), QueryRequest::new(cquery.as_str())))?;
+        rs.next_row();
+        let nrows = rs
+            .get_i64(0)?
+            .ok_or_else(|| anyhow!("cannot get row number"))?;
+        self.nrows = nrows as usize;
+    }
+
+    #[throws(BigQuerySourceError)]
+    fn parser(&mut self) -> Self::Parser<'_> {
+        let job = self.client.job();
+        let qry = self.rt.block_on(job.query(
+            self.project_id.as_str(),
+            QueryRequest::new(self.query.as_str()),
+        ))?;
+        let job_info = qry
+            .query_response()
+            .job_reference
+            .as_ref()
+            .ok_or_else(|| anyhow!("job_reference is none"))?;
+        let params = GetQueryResultsParameters {
+            format_options: None,
+            location: job_info.location.clone(),
+            max_results: None,
+            page_token: None,
+            start_index: None,
+            timeout_ms: None,
+        };
+        let rs = self.rt.block_on(
+            job.get_query_results(
+                self.project_id.as_str(),
+                job_info
+                    .job_id
+                    .as_ref()
+                    .ok_or_else(|| anyhow!("job_id is none"))?
+                    .as_str(),
+                params,
+            ),
+        )?;
+        BigQuerySourceParser::new(self.rt.clone(), self.client.clone(), rs, &self.schema)
+    }
+
+    fn nrows(&self) -> usize {
+        self.nrows
+    }
+
+    fn ncols(&self) -> usize {
+        self.ncols
+    }
+}
+
+pub struct BigQuerySourceParser {
+    rt: Arc<Runtime>,
+    client: Arc<Client>,
+    response: GetQueryResultsResponse,
+    ncols: usize,
+    current_col: usize,
+    current_row: usize,
+    nrows: Option<usize>,
+}
+
+impl<'a> BigQuerySourceParser {
+    fn new(
+        rt: Arc<Runtime>,
+        client: Arc<Client>,
+        response: GetQueryResultsResponse,
+        schema: &[BigQueryTypeSystem],
+    ) -> Self {
+        Self {
+            rt,
+            client,
+            response,
+            ncols: schema.len(),
+            current_row: 0,
+            current_col: 0,
+            nrows: None,
+        }
+    }
+
+    #[throws(BigQuerySourceError)]
+    fn next_loc(&mut self) -> (usize, usize) {
+        let ret = (self.current_row, self.current_col);
+        self.current_row += (self.current_col + 1) / self.ncols;
+        self.current_col = (self.current_col + 1) % self.ncols;
+        ret
+    }
+}
+
+impl<'a> PartitionParser<'a> for BigQuerySourceParser {
+    type TypeSystem = BigQueryTypeSystem;
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn fetch_next(&mut self) -> (usize, bool) {
+        assert!(self.current_col == 0);
+        match self.nrows {
+            Some(total_rows) => (total_rows - self.current_row, true),
+            None => {
+                // Get the total number of rows
+                let total_rows = self
+                    .response
+                    .total_rows
+                    .as_ref()
+                    .ok_or_else(|| anyhow!("total_rows is none"))?
+                    .parse::<usize>()?;
+                self.nrows = Some(total_rows);
+                (total_rows, true)
+            }
+        }
+    }
+}
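To make the cursor arithmetic in `next_loc` concrete, a small self-contained sketch (the three-column width is made up): the integer division advances the row exactly when the column index wraps around.

```rust
// Replaying next_loc's arithmetic for a hypothetical ncols = 3.
let ncols = 3;
let (mut row, mut col) = (0usize, 0usize);
let mut locs = vec![];
for _ in 0..5 {
    locs.push((row, col));
    row += (col + 1) / ncols; // advances by 1 only when col wraps back to 0
    col = (col + 1) % ncols;
}
assert_eq!(locs, [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)]);
```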
+
+impl BigQuerySourceParser {
+    /// Advance the (row, col) cursor; when the row index runs past the rows buffered in
+    /// the current response, pull the next result page (following `page_token`) and
+    /// rebase the cursor onto the fresh buffer.
+    #[throws(BigQuerySourceError)]
+    fn next_cell_loc(&mut self) -> (usize, usize) {
+        let (mut ridx, cidx) = self.next_loc()?;
+        let buffered = self
+            .response
+            .rows
+            .as_ref()
+            .ok_or_else(|| anyhow!("rows is none"))?
+            .len();
+        if ridx == buffered {
+            let job = self.client.job();
+            let job_info = self
+                .response
+                .job_reference
+                .as_ref()
+                .ok_or_else(|| anyhow!("job_reference is none"))?;
+            let params = GetQueryResultsParameters {
+                format_options: None,
+                location: job_info.location.clone(),
+                max_results: None,
+                page_token: self.response.page_token.clone(),
+                start_index: None,
+                timeout_ms: None,
+            };
+            self.response = self.rt.block_on(
+                job.get_query_results(
+                    job_info
+                        .project_id
+                        .as_ref()
+                        .ok_or_else(|| anyhow!("project_id is none"))?
+                        .as_str(),
+                    job_info
+                        .job_id
+                        .as_ref()
+                        .ok_or_else(|| anyhow!("job_id is none"))?
+                        .as_str(),
+                    params,
+                ),
+            )?;
+            self.current_row = 0;
+            ridx = 0;
+        }
+        (ridx, cidx)
+    }
+
+    /// Borrow the JSON value of the cell at (ridx, cidx); `None` encodes SQL NULL.
+    #[throws(BigQuerySourceError)]
+    fn cell_value(&self, ridx: usize, cidx: usize) -> Option<&serde_json::Value> {
+        let rows = self
+            .response
+            .rows
+            .as_ref()
+            .ok_or_else(|| anyhow!("rows is none"))?;
+        let columns = rows[ridx]
+            .columns
+            .as_ref()
+            .ok_or_else(|| anyhow!("columns is none"))?;
+        columns
+            .get(cidx)
+            .ok_or_else(|| anyhow!("Table Cell is none"))?
+            .value
+            .as_ref()
+    }
+}
+
+macro_rules! impl_produce {
+    ($($t: ty,)+) => {
+        $(
+            impl<'r> Produce<'r, $t> for BigQuerySourceParser {
+                type Error = BigQuerySourceError;
+
+                #[throws(BigQuerySourceError)]
+                fn produce(&'r mut self) -> $t {
+                    let (ridx, cidx) = self.next_cell_loc()?;
+                    let v = self
+                        .cell_value(ridx, cidx)?
+                        .ok_or_else(|| anyhow!("value is none"))?;
+                    let s = v
+                        .as_str()
+                        .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+                    s.parse()
+                        .map_err(|_| ConnectorXError::cannot_produce::<$t>(Some(s.into())))?
+                }
+            }
+
+            impl<'r> Produce<'r, Option<$t>> for BigQuerySourceParser {
+                type Error = BigQuerySourceError;
+
+                #[throws(BigQuerySourceError)]
+                fn produce(&'r mut self) -> Option<$t> {
+                    let (ridx, cidx) = self.next_cell_loc()?;
+                    match self.cell_value(ridx, cidx)? {
+                        None => None,
+                        Some(v) => {
+                            let s = v
+                                .as_str()
+                                .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+                            Some(s.parse().map_err(|_| {
+                                ConnectorXError::cannot_produce::<$t>(Some(s.into()))
+                            })?)
+                        }
+                    }
+                }
+            }
+        )+
+    };
+}
+
+impl_produce!(i64, f64, String,);
+
+impl<'r> Produce<'r, bool> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> bool {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        let v = self
+            .cell_value(ridx, cidx)?
+            .ok_or_else(|| anyhow!("value is none"))?;
+        let s = v
+            .as_str()
+            .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+        match s {
+            "true" => true,
+            "false" => false,
+            _ => throw!(ConnectorXError::cannot_produce::<bool>(Some(s.into()))),
+        }
+    }
+}
+
+impl<'r> Produce<'r, Option<bool>> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> Option<bool> {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        match self.cell_value(ridx, cidx)? {
+            None => None,
+            Some(v) => {
+                let s = v
+                    .as_str()
+                    .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+                match s {
+                    "true" => Some(true),
+                    "false" => Some(false),
+                    _ => throw!(ConnectorXError::cannot_produce::<bool>(Some(s.into()))),
+                }
+            }
+        }
+    }
+}
+
+impl<'r> Produce<'r, NaiveDate> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> NaiveDate {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        let v = self
+            .cell_value(ridx, cidx)?
+            .ok_or_else(|| anyhow!("value is none"))?;
+        let s = v
+            .as_str()
+            .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+        NaiveDate::parse_from_str(s, "%Y-%m-%d")
+            .map_err(|_| ConnectorXError::cannot_produce::<NaiveDate>(Some(s.into())))?
+    }
+}
+
+impl<'r> Produce<'r, Option<NaiveDate>> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> Option<NaiveDate> {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        match self.cell_value(ridx, cidx)? {
+            None => None,
+            Some(v) => {
+                let s = v
+                    .as_str()
+                    .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+                Some(NaiveDate::parse_from_str(s, "%Y-%m-%d").map_err(|_| {
+                    ConnectorXError::cannot_produce::<NaiveDate>(Some(s.into()))
+                })?)
+            }
+        }
+    }
+}
+
+impl<'r> Produce<'r, NaiveDateTime> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> NaiveDateTime {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        let v = self
+            .cell_value(ridx, cidx)?
+            .ok_or_else(|| anyhow!("value is none"))?;
+        let s = v
+            .as_str()
+            .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+        NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S")
+            .map_err(|_| ConnectorXError::cannot_produce::<NaiveDateTime>(Some(s.into())))?
+    }
+}
+
+impl<'r> Produce<'r, Option<NaiveDateTime>> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> Option<NaiveDateTime> {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        match self.cell_value(ridx, cidx)? {
+            None => None,
+            Some(v) => {
+                let s = v
+                    .as_str()
+                    .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+                Some(
+                    NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S").map_err(|_| {
+                        ConnectorXError::cannot_produce::<NaiveDateTime>(Some(s.into()))
+                    })?,
+                )
+            }
+        }
+    }
+}
+
+impl<'r> Produce<'r, NaiveTime> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> NaiveTime {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        let v = self
+            .cell_value(ridx, cidx)?
+            .ok_or_else(|| anyhow!("value is none"))?;
+        let s = v
+            .as_str()
+            .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+        NaiveTime::parse_from_str(s, "%H:%M:%S")
+            .map_err(|_| ConnectorXError::cannot_produce::<NaiveTime>(Some(s.into())))?
+    }
+}
+
+impl<'r> Produce<'r, Option<NaiveTime>> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> Option<NaiveTime> {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        match self.cell_value(ridx, cidx)? {
+            None => None,
+            Some(v) => {
+                let s = v
+                    .as_str()
+                    .ok_or_else(|| anyhow!("cannot get str from json value"))?;
+                Some(NaiveTime::parse_from_str(s, "%H:%M:%S").map_err(|_| {
+                    ConnectorXError::cannot_produce::<NaiveTime>(Some(s.into()))
+                })?)
+            }
+        }
+    }
+}
+
+impl<'r> Produce<'r, DateTime<Utc>> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> DateTime<Utc> {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        let v = self
+            .cell_value(ridx, cidx)?
+            .ok_or_else(|| anyhow!("value is none"))?;
+        let timestamp_ns = (v
+            .as_str()
+            .ok_or_else(|| anyhow!("cannot get str from json value"))?
+            .parse::<f64>()?
+            * 1e9) as i64;
+        let secs = timestamp_ns / 1000000000;
+        let nsecs = (timestamp_ns % 1000000000) as u32;
+        DateTime::<Utc>::from_naive_utc_and_offset(
+            NaiveDateTime::from_timestamp_opt(secs, nsecs)
+                .ok_or_else(|| anyhow!("from_timestamp_opt return None"))?,
+            Utc,
+        )
+    }
+}
+
+impl<'r> Produce<'r, Option<DateTime<Utc>>> for BigQuerySourceParser {
+    type Error = BigQuerySourceError;
+
+    #[throws(BigQuerySourceError)]
+    fn produce(&mut self) -> Option<DateTime<Utc>> {
+        let (ridx, cidx) = self.next_cell_loc()?;
+        match self.cell_value(ridx, cidx)? {
+            None => None,
+            Some(v) => {
+                let timestamp_ns = (v
+                    .as_str()
+                    .ok_or_else(|| anyhow!("cannot get str from json value"))?
+                    .parse::<f64>()?
+                    * 1e9) as i64;
+                let secs = timestamp_ns / 1000000000;
+                let nsecs = (timestamp_ns % 1000000000) as u32;
+                NaiveDateTime::from_timestamp_opt(secs, nsecs)
+                    .map(|ndt| DateTime::<Utc>::from_naive_utc_and_offset(ndt, Utc))
+            }
+        }
+    }
+}
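The `DateTime<Utc>` impls above read a BigQuery TIMESTAMP cell as fractional epoch seconds and split it into whole seconds plus nanoseconds. A self-contained sketch of that conversion (the literal is made up; sub-second precision is bounded by the `f64` round trip):

```rust
use chrono::{DateTime, NaiveDateTime, Utc};

let s = "1577836800"; // hypothetical cell value: 2020-01-01T00:00:00Z as epoch seconds
let timestamp_ns = (s.parse::<f64>().unwrap() * 1e9) as i64;
let (secs, nsecs) = (timestamp_ns / 1_000_000_000, (timestamp_ns % 1_000_000_000) as u32);
let dt = DateTime::<Utc>::from_naive_utc_and_offset(
    NaiveDateTime::from_timestamp_opt(secs, nsecs).unwrap(),
    Utc,
);
assert_eq!(dt.to_rfc3339(), "2020-01-01T00:00:00+00:00");
```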
diff --git a/connectorx/src/sources/bigquery/typesystem.rs b/connectorx/src/sources/bigquery/typesystem.rs
new file mode 100644
index 0000000..91c1505
--- /dev/null
+++ b/connectorx/src/sources/bigquery/typesystem.rs
@@ -0,0 +1,79 @@
+use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+use gcp_bigquery_client::model::field_type::FieldType;
+
+#[derive(Copy, Clone, Debug)]
+pub enum BigQueryTypeSystem {
+    Bool(bool),
+    Boolean(bool),
+    Int64(bool),
+    Integer(bool),
+    Float(bool),
+    Float64(bool),
+    Numeric(bool),
+    Bignumeric(bool),
+    String(bool),
+    Bytes(bool),
+    Date(bool),
+    Datetime(bool),
+    Time(bool),
+    Timestamp(bool),
+}
+
+impl_typesystem! {
+    system = BigQueryTypeSystem,
+    mappings = {
+        { Bool | Boolean => bool }
+        { Int64 | Integer => i64 }
+        { Float64 | Float | Numeric | Bignumeric => f64 }
+        { String | Bytes => String }
+        { Date => NaiveDate }
+        { Datetime => NaiveDateTime }
+        { Time => NaiveTime }
+        { Timestamp => DateTime<Utc> }
+    }
+}
+
+impl<'a> From<&'a FieldType> for BigQueryTypeSystem {
+    fn from(ty: &'a FieldType) -> BigQueryTypeSystem {
+        use BigQueryTypeSystem::*;
+        match ty {
+            FieldType::Bool => Bool(true),
+            FieldType::Boolean => Boolean(true),
+            FieldType::Int64 => Int64(true),
+            FieldType::Integer => Integer(true),
+            FieldType::Float => Float(true),
+            FieldType::Float64 => Float64(true),
+            FieldType::Numeric => Numeric(true),
+            FieldType::Bignumeric => Bignumeric(true),
+            FieldType::String => String(true),
+            FieldType::Bytes => Bytes(true),
+            FieldType::Date => Date(true),
+            FieldType::Datetime => Datetime(true),
+            FieldType::Time => Time(true),
+            FieldType::Timestamp => Timestamp(true),
+            _ => unimplemented!("{:?}", ty),
+        }
+    }
+}
+
+impl From<BigQueryTypeSystem> for FieldType {
+    fn from(ty: BigQueryTypeSystem) -> FieldType {
+        use BigQueryTypeSystem::*;
+        match ty {
+            Bool(_) => FieldType::Bool,
+            Boolean(_) => FieldType::Boolean,
+            Int64(_) => FieldType::Int64,
+            Integer(_) => FieldType::Integer,
+            Float64(_) => FieldType::Float64,
+            Float(_) => FieldType::Float,
+            Numeric(_) => FieldType::Numeric,
+            Bignumeric(_) => FieldType::Bignumeric,
+            String(_) => FieldType::String,
+            Bytes(_) => FieldType::Bytes,
+            Date(_) => FieldType::Date,
+            Datetime(_) => FieldType::Datetime,
+            Time(_) => FieldType::Time,
+            Timestamp(_) => FieldType::Timestamp,
+        }
+    }
+}
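The `impl_typesystem!` block ties each variant to the Rust type a parser must produce (a `Timestamp` column, for instance, obliges the source to produce `DateTime<Utc>`, or `Option<DateTime<Utc>>` when nullable), while the two `From` impls convert between the wire-level `FieldType` and the type system. A short sketch of that round trip:

```rust
use gcp_bigquery_client::model::field_type::FieldType;

// Schema field -> type-system variant; nullability defaults to true since
// the metadata inspected here does not distinguish it.
let ty = BigQueryTypeSystem::from(&FieldType::Timestamp);
assert!(matches!(ty, BigQueryTypeSystem::Timestamp(true)));
// ...and back again for query construction.
assert!(matches!(FieldType::from(ty), FieldType::Timestamp));
```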
diff --git a/connectorx/src/sources/csv/errors.rs b/connectorx/src/sources/csv/errors.rs
new file mode 100644
index 0000000..b971300
--- /dev/null
+++ b/connectorx/src/sources/csv/errors.rs
@@ -0,0 +1,20 @@
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum CSVSourceError {
+    #[error(transparent)]
+    ConnectorXError(#[from] crate::errors::ConnectorXError),
+
+    #[error(transparent)]
+    RegexError(#[from] regex::Error),
+
+    #[error(transparent)]
+    CSVError(#[from] csv::Error),
+
+    #[error(transparent)]
+    IOError(#[from] std::io::Error),
+
+    /// Any other errors that are too trivial to be put here explicitly.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
diff --git a/connectorx/src/sources/csv/mod.rs b/connectorx/src/sources/csv/mod.rs
new file mode 100644
index 0000000..66bde05
--- /dev/null
+++ b/connectorx/src/sources/csv/mod.rs
@@ -0,0 +1,410 @@
+//! Source implementation for CSV files.
+
+mod errors;
+mod typesystem;
+
+pub use self::errors::CSVSourceError;
+pub use self::typesystem::CSVTypeSystem;
+use super::{PartitionParser, Produce, Source, SourcePartition};
+use crate::{data_order::DataOrder, errors::ConnectorXError, sql::CXQuery};
+use anyhow::anyhow;
+use chrono::{DateTime, Utc};
+use fehler::{throw, throws};
+#[cfg(feature = "src_csv")]
+use regex::{Regex, RegexBuilder};
+use std::collections::HashSet;
+use std::fs::File;
+
+pub struct CSVSource {
+    schema: Vec<CSVTypeSystem>,
+    files: Vec<CXQuery<String>>,
+    names: Vec<String>,
+}
+
+impl CSVSource {
+    pub fn new(schema: &[CSVTypeSystem]) -> Self {
+        CSVSource {
+            schema: schema.to_vec(),
+            files: vec![],
+            names: vec![],
+        }
+    }
+
+    #[throws(CSVSourceError)]
+    pub fn infer_schema(&mut self) -> Vec<CSVTypeSystem> {
+        // regular expressions for inferring a CSVTypeSystem from a string
+        let decimal_re: Regex = Regex::new(r"^-?(\d+\.\d+)$")?;
+        let integer_re: Regex = Regex::new(r"^-?(\d+)$")?;
+        let boolean_re: Regex = RegexBuilder::new(r"^(true)$|^(false)$")
+            .case_insensitive(true)
+            .build()?;
+        let datetime_re: Regex = Regex::new(r"^\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d$")?;
+
+        // read max_records rows to infer possible CSVTypeSystems for each field
+        let mut reader = csv::ReaderBuilder::new()
+            .has_headers(true)
+            .from_reader(File::open(self.files[0].as_str())?);
+
+        let max_records_to_read = 50;
+        let num_cols = self.names.len();
+
+        let mut column_types: Vec<HashSet<CSVTypeSystem>> = vec![HashSet::new(); num_cols];
+        let mut nulls: Vec<bool> = vec![false; num_cols];
+
+        let mut record = csv::StringRecord::new();
+
+        for _record_counter in 0..max_records_to_read {
+            if !reader.read_record(&mut record)? {
+                break;
+            }
+            for field_counter in 0..num_cols {
+                if let Some(string) = record.get(field_counter) {
+                    if string.is_empty() {
+                        nulls[field_counter] = true;
+                    } else {
+                        let dt: CSVTypeSystem;
+
+                        if string.starts_with('"') {
+                            dt = CSVTypeSystem::String(false);
+                        } else if boolean_re.is_match(string) {
+                            dt = CSVTypeSystem::Bool(false);
+                        } else if decimal_re.is_match(string) {
+                            dt = CSVTypeSystem::F64(false);
+                        } else if integer_re.is_match(string) {
+                            dt = CSVTypeSystem::I64(false);
+                        } else if datetime_re.is_match(string) {
+                            dt = CSVTypeSystem::DateTime(false);
+                        } else {
+                            dt = CSVTypeSystem::String(false);
+                        }
+                        column_types[field_counter].insert(dt);
+                    }
+                }
+            }
+        }
+
+        // determine CSVTypeSystem based on possible candidates
+        let mut schema = vec![];
+
+        for field_counter in 0..num_cols {
+            let possibilities = &column_types[field_counter];
+            let has_nulls = nulls[field_counter];
+
+            match possibilities.len() {
+                1 => {
+                    for dt in possibilities.iter() {
+                        match *dt {
+                            CSVTypeSystem::I64(false) => {
+                                schema.push(CSVTypeSystem::I64(has_nulls));
+                            }
+                            CSVTypeSystem::F64(false) => {
+                                schema.push(CSVTypeSystem::F64(has_nulls));
+                            }
+                            CSVTypeSystem::Bool(false) => {
+                                schema.push(CSVTypeSystem::Bool(has_nulls));
+                            }
+                            CSVTypeSystem::String(false) => {
+                                schema.push(CSVTypeSystem::String(has_nulls));
+                            }
+                            CSVTypeSystem::DateTime(false) => {
+                                schema.push(CSVTypeSystem::DateTime(has_nulls));
+                            }
+                            _ => {}
+                        }
+                    }
+                }
+                2 => {
+                    if possibilities.contains(&CSVTypeSystem::I64(false))
+                        && possibilities.contains(&CSVTypeSystem::F64(false))
+                    {
+                        // Integer && Float -> Float
+                        schema.push(CSVTypeSystem::F64(has_nulls));
+                    } else {
+                        // Conflicting CSVTypeSystems -> String
+                        schema.push(CSVTypeSystem::String(has_nulls));
+                    }
+                }
+                _ => {
+                    // Conflicting CSVTypeSystems -> String
+                    schema.push(CSVTypeSystem::String(has_nulls));
+                }
+            }
+        }
+        schema
+    }
+}
+
+impl Source for CSVSource {
+    const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::RowMajor];
+    type Partition = CSVSourcePartition;
+    type TypeSystem = CSVTypeSystem;
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn set_data_order(&mut self, data_order: DataOrder) {
+        if !matches!(data_order, DataOrder::RowMajor) {
+            throw!(ConnectorXError::UnsupportedDataOrder(data_order))
+        }
+    }
+
+    fn set_queries<Q: ToString>(&mut self, queries: &[CXQuery<Q>]) {
+        self.files = queries.iter().map(|q| q.map(Q::to_string)).collect();
+    }
+
+    fn set_origin_query(&mut self, _query: Option<String>) {}
+
+    #[throws(CSVSourceError)]
+    fn fetch_metadata(&mut self) {
+        let mut reader = csv::ReaderBuilder::new()
+            .has_headers(true)
+            .from_reader(File::open(self.files[0].as_str())?);
+        let header = reader.headers()?;
+
+        self.names = header.iter().map(|s| s.to_string()).collect();
+
+        if self.schema.is_empty() {
+            self.schema = self.infer_schema()?;
+        }
+
+        assert_eq!(header.len(), self.schema.len());
+    }
+
+    #[throws(CSVSourceError)]
+    fn result_rows(&mut self) -> Option<usize> {
+        None
+    }
+
+    fn names(&self) -> Vec<String> {
+        self.names.clone()
+    }
+
+    fn schema(&self) -> Vec<CSVTypeSystem> {
+        self.schema.clone()
+    }
+
+    #[throws(CSVSourceError)]
+    fn partition(self) -> Vec<Self::Partition> {
+        let mut partitions = vec![];
+        for file in self.files {
+            partitions.push(CSVSourcePartition::new(file)?);
+        }
+        partitions
+    }
+}
+
+pub struct CSVSourcePartition {
+    records: Vec<csv::StringRecord>,
+    counter: usize,
+    nrows: usize,
+    ncols: usize,
+}
+
+impl CSVSourcePartition {
+    #[throws(CSVSourceError)]
+    pub fn new(fname: CXQuery<String>) -> Self {
+        let reader = csv::ReaderBuilder::new()
+            .has_headers(true)
+            .from_reader(File::open(fname.as_str())?);
+        let mut records = vec![];
+        reader
+            .into_records()
+            .try_for_each(|v| -> Result<(), CSVSourceError> {
+                records.push(v.map_err(|e| anyhow!(e))?);
+                Ok(())
+            })?;
+
+        let nrows = records.len();
+        let ncols = if nrows > 0 { records[0].len() } else { 0 };
+
+        Self {
+            records,
+            counter: 0,
+            nrows,
+            ncols,
+        }
+    }
+}
+
+impl SourcePartition for CSVSourcePartition {
+    type TypeSystem = CSVTypeSystem;
+    type Parser<'a> = CSVSourcePartitionParser<'a>;
+    type Error = CSVSourceError;
+
+    /// The parameter `query` is the path of the CSV file
+    #[throws(CSVSourceError)]
+    fn result_rows(&mut self) {}
+
+    fn nrows(&self) -> usize {
+        self.nrows
+    }
+
+    fn ncols(&self) -> usize {
+        self.ncols
+    }
+
+    #[throws(CSVSourceError)]
+    fn parser(&mut self) -> Self::Parser<'_> {
+        CSVSourcePartitionParser {
+            records: &mut self.records,
+            counter: &mut self.counter,
+            ncols: self.ncols,
+        }
+    }
+}
+
+pub struct CSVSourcePartitionParser<'a> {
+    records: &'a mut [csv::StringRecord],
+    counter: &'a mut usize,
+    ncols: usize,
+}
+
+impl<'a> CSVSourcePartitionParser<'a> {
+    fn next_val(&mut self) -> &str {
+        let v: &str = self.records[*self.counter / self.ncols][*self.counter % self.ncols].as_ref();
+        *self.counter += 1;
+
+        v
+    }
+}
+
+impl<'a> PartitionParser<'a> for CSVSourcePartitionParser<'a> {
+    type TypeSystem = CSVTypeSystem;
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn fetch_next(&mut self) -> (usize, bool) {
+        (self.records.len(), true)
+    }
+}
+
+impl<'r, 'a> Produce<'r, i64> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&mut self) -> i64 {
+        let v = self.next_val();
+        v.parse()
+            .map_err(|_| ConnectorXError::cannot_produce::<i64>(Some(v.into())))?
+    }
+}
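The `Option<T>` impls that follow treat an empty field as NULL before attempting a parse. A tiny sketch of that convention in isolation (the helper name and values are made up; the real impls surface a `ConnectorXError` instead of discarding the parse failure):

```rust
fn parse_opt_i64(v: &str) -> Option<i64> {
    if v.is_empty() {
        return None; // empty CSV field -> NULL
    }
    v.parse().ok()
}

assert_eq!(parse_opt_i64(""), None);
assert_eq!(parse_opt_i64("42"), Some(42));
```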
+
+impl<'r, 'a> Produce<'r, Option<i64>> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&mut self) -> Option<i64> {
+        let v = self.next_val();
+        if v.is_empty() {
+            return None;
+        }
+        let v = v
+            .parse()
+            .map_err(|_| ConnectorXError::cannot_produce::<Option<i64>>(Some(v.into())))?;
+
+        Some(v)
+    }
+}
+
+impl<'r, 'a> Produce<'r, f64> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&mut self) -> f64 {
+        let v = self.next_val();
+        v.parse()
+            .map_err(|_| ConnectorXError::cannot_produce::<f64>(Some(v.into())))?
+    }
+}
+
+impl<'r, 'a> Produce<'r, Option<f64>> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&mut self) -> Option<f64> {
+        let v = self.next_val();
+        if v.is_empty() {
+            return None;
+        }
+        let v = v
+            .parse()
+            .map_err(|_| ConnectorXError::cannot_produce::<Option<f64>>(Some(v.into())))?;
+
+        Some(v)
+    }
+}
+
+impl<'r, 'a> Produce<'r, bool> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&mut self) -> bool {
+        let v = self.next_val();
+        v.parse()
+            .map_err(|_| ConnectorXError::cannot_produce::<bool>(Some(v.into())))?
+    }
+}
+
+impl<'r, 'a> Produce<'r, Option<bool>> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&mut self) -> Option<bool> {
+        let v = self.next_val();
+        if v.is_empty() {
+            return None;
+        }
+        let v = v
+            .parse()
+            .map_err(|_| ConnectorXError::cannot_produce::<Option<bool>>(Some(v.into())))?;
+
+        Some(v)
+    }
+}
+
+impl<'r, 'a> Produce<'r, String> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&mut self) -> String {
+        let v = self.next_val();
+        String::from(v)
+    }
+}
+
+impl<'a, 'r> Produce<'r, Option<String>> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&'r mut self) -> Option<String> {
+        let v = self.next_val();
+
+        Some(String::from(v))
+    }
+}
+
+impl<'r, 'a> Produce<'r, DateTime<Utc>> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&mut self) -> DateTime<Utc> {
+        let v = self.next_val();
+        v.parse()
+            .map_err(|_| ConnectorXError::cannot_produce::<DateTime<Utc>>(Some(v.into())))?
+    }
+}
+
+impl<'r, 'a> Produce<'r, Option<DateTime<Utc>>> for CSVSourcePartitionParser<'a> {
+    type Error = CSVSourceError;
+
+    #[throws(CSVSourceError)]
+    fn produce(&mut self) -> Option<DateTime<Utc>> {
+        let v = self.next_val();
+        if v.is_empty() {
+            return None;
+        }
+        let v = v
+            .parse()
+            .map_err(|_| ConnectorXError::cannot_produce::<Option<DateTime<Utc>>>(Some(v.into())))?;
+        Some(v)
+    }
+}
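A sketch of the inference rules above end to end, under an assumed file (path and contents hypothetical): `{I64, F64}` unifies to `F64`, any other conflict falls back to `String`, and an empty cell marks the column nullable.

```rust
// /tmp/example.csv (hypothetical):
//   a,b,c
//   1,x,
//   2.5,true,7
let mut source = CSVSource::new(&[]);
source.set_queries(&[CXQuery::Naked("/tmp/example.csv")]);
source.fetch_metadata()?; // no schema was given, so it is inferred
assert_eq!(
    source.schema(),
    vec![
        CSVTypeSystem::F64(false),    // {I64, F64} -> F64
        CSVTypeSystem::String(false), // {String, Bool} -> String
        CSVTypeSystem::I64(true),     // empty cell seen -> nullable
    ]
);
```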
diff --git a/connectorx/src/sources/csv/typesystem.rs b/connectorx/src/sources/csv/typesystem.rs
new file mode 100644
index 0000000..4921aeb
--- /dev/null
+++ b/connectorx/src/sources/csv/typesystem.rs
@@ -0,0 +1,31 @@
+// Each variant in DataType represents a type that connectorx currently
+// supports to read from a data source and write into a destination.
+// When adding a new supported type T and associating it with the native representation N, please do:
+// 1. Add a T variant to DataType.
+// 2. Add `DataType::T => N` to the macro impl_typesystem!.
+// 3. Add `DataType::T => N` to the macro impl_transmit!.
+//
+
+use chrono::{DateTime, Utc};
+/// The type system used by the CSV source.
+/// For all the sources, their output values must be one of the types defined by CSVTypeSystem.
+/// For all the destinations, they must support writing any value whose type is defined by CSVTypeSystem.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum CSVTypeSystem {
+    F64(bool),
+    I64(bool),
+    Bool(bool),
+    String(bool),
+    DateTime(bool),
+}
+
+impl_typesystem! {
+    system = CSVTypeSystem,
+    mappings = {
+        { F64 => f64 }
+        { I64 => i64 }
+        { Bool => bool }
+        { String => String }
+        { DateTime => DateTime<Utc> }
+    }
+}
diff --git a/connectorx/src/sources/dummy/mod.rs b/connectorx/src/sources/dummy/mod.rs
new file mode 100644
index 0000000..f880421
--- /dev/null
+++ b/connectorx/src/sources/dummy/mod.rs
@@ -0,0 +1,247 @@
+//! A dummy source that generates different values based on an internal counter.
+//! This source is for testing purposes.
+
+mod typesystem;
+
+pub use self::typesystem::DummyTypeSystem;
+use super::{PartitionParser, Produce, Source, SourcePartition};
+use crate::data_order::DataOrder;
+use crate::errors::{ConnectorXError, Result};
+use crate::sql::CXQuery;
+use chrono::{offset, DateTime, Utc};
+use fehler::{throw, throws};
+use num_traits::cast::FromPrimitive;
+
+pub struct DummySource {
+    names: Vec<String>,
+    schema: Vec<DummyTypeSystem>,
+    queries: Vec<CXQuery<String>>,
+}
+
+impl DummySource {
+    pub fn new<S: AsRef<str>>(names: &[S], schema: &[DummyTypeSystem]) -> Self {
+        assert_eq!(names.len(), schema.len());
+        DummySource {
+            names: names.iter().map(|s| s.as_ref().to_string()).collect(),
+            schema: schema.to_vec(),
+            queries: vec![],
+        }
+    }
+}
+
+impl Source for DummySource {
+    const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::RowMajor];
+    type TypeSystem = DummyTypeSystem;
+    type Partition = DummySourcePartition;
+    type Error = ConnectorXError;
+
+    #[throws(ConnectorXError)]
+    fn set_data_order(&mut self, data_order: DataOrder) {
+        if !matches!(data_order, DataOrder::RowMajor) {
+            throw!(ConnectorXError::UnsupportedDataOrder(data_order))
+        }
+    }
+
+    // query: nrows,ncols
+    fn set_queries<Q: ToString>(&mut self, queries: &[CXQuery<Q>]) {
+        self.queries = queries.iter().map(|q| q.map(Q::to_string)).collect();
+    }
+
+    fn set_origin_query(&mut self, _query: Option<String>) {}
+
+    fn fetch_metadata(&mut self) -> Result<()> {
+        Ok(())
+    }
+
+    fn result_rows(&mut self) -> Result<Option<usize>> {
+        Ok(None)
+    }
+
+    fn names(&self) -> Vec<String> {
+        self.names.clone()
+    }
+
+    fn schema(&self) -> Vec<DummyTypeSystem> {
+        self.schema.clone()
+    }
+
+    fn partition(self) -> Result<Vec<Self::Partition>> {
+        assert!(!self.queries.is_empty());
+        let queries = self.queries;
+        let schema = self.schema;
+
+        Ok(queries
+            .into_iter()
+            .map(|q| DummySourcePartition::new(&schema, &q))
+            .collect())
+    }
+}
+
+pub struct DummySourcePartition {
+    nrows: usize,
+    ncols: usize,
+    counter: usize,
+}
+
+impl DummySourcePartition {
+    pub fn new(_schema: &[DummyTypeSystem], q: &CXQuery<String>) -> Self {
+        let v: Vec<usize> = q.as_str().split(',').map(|s| s.parse().unwrap()).collect();
+
+        DummySourcePartition {
+            nrows: v[0],
+            ncols: v[1],
+            counter: 0,
+        }
+    }
+}
+
+impl SourcePartition for DummySourcePartition {
+    type TypeSystem = DummyTypeSystem;
+    type Parser<'a> = DummySourcePartitionParser<'a>;
+    type Error = ConnectorXError;
+
+    fn result_rows(&mut self) -> Result<()> {
+        Ok(())
+    }
+
+    fn parser(&mut self) -> Result<Self::Parser<'_>> {
+        Ok(DummySourcePartitionParser::new(
+            &mut self.counter,
+            self.nrows,
+            self.ncols,
+        ))
+    }
+
+    fn nrows(&self) -> usize {
+        self.nrows
+    }
+
+    fn ncols(&self) -> usize {
+        self.ncols
+    }
+}
+
+pub struct DummySourcePartitionParser<'a> {
+    counter: &'a mut usize,
+    #[allow(unused)]
+    nrows: usize,
+    ncols: usize,
+}
+
+impl<'a> DummySourcePartitionParser<'a> {
+    fn new(counter: &'a mut usize, nrows: usize, ncols: usize) -> Self {
+        DummySourcePartitionParser {
+            counter,
+            ncols,
+            nrows,
+        }
+    }
+
+    fn next_val(&mut self) -> usize {
+        let ret = *self.counter / self.ncols;
+        *self.counter += 1;
+        ret
+    }
+}
+
+impl<'a> PartitionParser<'a> for DummySourcePartitionParser<'a> {
+    type TypeSystem = DummyTypeSystem;
+    type Error = ConnectorXError;
+
+    fn fetch_next(&mut self) -> Result<(usize, bool)> {
+        Ok((self.nrows, true))
+    }
+}
+
+macro_rules! numeric_impl {
+    ($($t: ty),+) => {
+        $(
+            impl<'r, 'a> Produce<'r, $t> for DummySourcePartitionParser<'a> {
+                type Error = ConnectorXError;
+
+                fn produce(&mut self) -> Result<$t> {
+                    let ret = self.next_val();
+                    Ok(FromPrimitive::from_usize(ret).unwrap_or_default())
+                }
+            }
+
+            impl<'r, 'a> Produce<'r, Option<$t>> for DummySourcePartitionParser<'a> {
+                type Error = ConnectorXError;
+
+                fn produce(&mut self) -> Result<Option<$t>> {
+                    let ret = self.next_val();
+                    Ok(Some(FromPrimitive::from_usize(ret).unwrap_or_default()))
+                }
+            }
+        )+
+    };
+}
+
+numeric_impl!(u64, i32, i64, f64);
+
+impl<'r, 'a> Produce<'r, String> for DummySourcePartitionParser<'a> {
+    type Error = ConnectorXError;
+
+    fn produce(&mut self) -> Result<String> {
+        let ret = self.next_val().to_string();
+        Ok(ret)
+    }
+}
+
+impl<'r, 'a> Produce<'r, Option<String>> for DummySourcePartitionParser<'a> {
+    type Error = ConnectorXError;
+
+    fn produce(&mut self) -> Result<Option<String>> {
+        let ret = self.next_val().to_string();
+        Ok(Some(ret))
+    }
+}
+
+impl<'r, 'a> Produce<'r, bool> for DummySourcePartitionParser<'a> {
+    type Error = ConnectorXError;
+
+    fn produce(&mut self) -> Result<bool> {
+        let ret = self.next_val() % 2 == 0;
+        Ok(ret)
+    }
+}
+
+impl<'r, 'a> Produce<'r, Option<bool>> for DummySourcePartitionParser<'a> {
+    type Error = ConnectorXError;
+
+    fn produce(&mut self) -> Result<Option<bool>> {
+        let ret = match self.next_val() % 3 {
+            0 => Some(true),
+            1 => Some(false),
+            2 => None,
+            _ => unreachable!(),
+        };
+
+        Ok(ret)
+    }
+}
+
+impl<'r, 'a> Produce<'r, DateTime<Utc>> for DummySourcePartitionParser<'a> {
+    type Error = ConnectorXError;
+
+    fn produce(&mut self) -> Result<DateTime<Utc>> {
+        self.next_val();
+        let ret = offset::Utc::now();
+
+        Ok(ret)
+    }
+}
+
+impl<'r, 'a> Produce<'r, Option<DateTime<Utc>>> for DummySourcePartitionParser<'a> {
+    type Error = ConnectorXError;
+
+    fn produce(&mut self) -> Result<Option<DateTime<Utc>>> {
+        let ret = match self.next_val() % 2 {
+            0 => Some(offset::Utc::now()),
+            1 => None,
+            _ => unreachable!(),
+        };
+        Ok(ret)
+    }
+}
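A sketch of driving the dummy source by hand (the "2,3" query encodes nrows,ncols, per the comment on `set_queries`):

```rust
let mut source = DummySource::new(&["a", "b", "c"], &[DummyTypeSystem::I64(false); 3]);
source.set_queries(&[CXQuery::Naked("2,3")]); // 2 rows x 3 cols
let mut partition = source.partition()?.remove(0);
assert_eq!((partition.nrows(), partition.ncols()), (2, 3));
let mut parser = partition.parser()?;
let v: i64 = parser.parse()?; // counter 0 / ncols 3 -> 0
assert_eq!(v, 0);
```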
diff --git a/connectorx/src/sources/dummy/typesystem.rs b/connectorx/src/sources/dummy/typesystem.rs
new file mode 100644
index 0000000..24aed4c
--- /dev/null
+++ b/connectorx/src/sources/dummy/typesystem.rs
@@ -0,0 +1,31 @@
+// Each variant in DataType represents a type that connectorx currently
+// supports to read from a data source and write into a destination.
+// When adding a new supported type T and associating it with the native representation N, please do:
+// 1. Add a T variant to DataType.
+// 2. Add `DataType::T => N` to the macro impl_typesystem!.
+// 3. Add `DataType::T => N` to the macro impl_transmit!.
+//
+
+use chrono::{DateTime, Utc};
+/// This is a dummy type system used in this library.
+/// For all the sources, their output values must be one of the types defined by DummyTypeSystem.
+/// For all the destinations, they must support writing any value whose type is defined by DummyTypeSystem.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum DummyTypeSystem {
+    F64(bool),
+    I64(bool),
+    Bool(bool),
+    String(bool),
+    DateTime(bool),
+}
+
+impl_typesystem! {
+    system = DummyTypeSystem,
+    mappings = {
+        { F64 => f64 }
+        { I64 => i64 }
+        { Bool => bool }
+        { String => String }
+        { DateTime => DateTime<Utc> }
+    }
+}
diff --git a/connectorx/src/sources/mod.rs b/connectorx/src/sources/mod.rs
new file mode 100644
index 0000000..0afb641
--- /dev/null
+++ b/connectorx/src/sources/mod.rs
@@ -0,0 +1,100 @@
+//! This module defines the four traits [`Source`], [`SourcePartition`], [`PartitionParser`], and [`Produce`] that together define a source.
+//! This module also contains source implementations for various databases.
+
+#[cfg(feature = "src_bigquery")]
+pub mod bigquery;
+#[cfg(feature = "src_csv")]
+pub mod csv;
+#[cfg(feature = "src_dummy")]
+pub mod dummy;
+#[cfg(feature = "src_mssql")]
+pub mod mssql;
+#[cfg(feature = "src_mysql")]
+pub mod mysql;
+#[cfg(feature = "src_oracle")]
+pub mod oracle;
+#[cfg(feature = "src_postgres")]
+pub mod postgres;
+#[cfg(feature = "src_sqlite")]
+pub mod sqlite;
+
+use crate::data_order::DataOrder;
+use crate::errors::ConnectorXError;
+use crate::sql::CXQuery;
+use crate::typesystem::{TypeAssoc, TypeSystem};
+use std::fmt::Debug;
+
+pub trait Source {
+    /// Supported data orders, ordered by preference.
+    const DATA_ORDERS: &'static [DataOrder];
+    /// The type system this `Source` is associated with.
+    type TypeSystem: TypeSystem;
+    // Partition needs to be sent to different threads for parallel execution
+    type Partition: SourcePartition<TypeSystem = Self::TypeSystem, Error = Self::Error> + Send;
+    type Error: From<ConnectorXError> + Send + Debug;
+
+    fn set_data_order(&mut self, data_order: DataOrder) -> Result<(), Self::Error>;
+
+    fn set_queries<Q: ToString>(&mut self, queries: &[CXQuery<Q>]);
+
+    fn set_origin_query(&mut self, query: Option<String>);
+
+    fn fetch_metadata(&mut self) -> Result<(), Self::Error>;
+    /// Get the total number of rows if available
+    fn result_rows(&mut self) -> Result<Option<usize>, Self::Error>;
+
+    fn names(&self) -> Vec<String>;
+
+    fn schema(&self) -> Vec<Self::TypeSystem>;
+
+    fn partition(self) -> Result<Vec<Self::Partition>, Self::Error>;
+}
+
+/// In general, a `DataSource` abstracts the data source as a stream, which can produce
+/// a sequence of values of various types by repeatedly calling the function `produce`.
+pub trait SourcePartition {
+    type TypeSystem: TypeSystem;
+    type Parser<'a>: PartitionParser<'a, TypeSystem = Self::TypeSystem, Error = Self::Error>
+    where
+        Self: 'a;
+    type Error: From<ConnectorXError> + Send + Debug;
+
+    /// Count the total number of rows in the partition.
+    fn result_rows(&mut self) -> Result<(), Self::Error>;
+
+    fn parser(&mut self) -> Result<Self::Parser<'_>, Self::Error>;
+
+    /// Number of rows this `DataSource` got.
+    /// Sometimes it is not possible for the source to know how many rows it gets before reading the whole data.
+    fn nrows(&self) -> usize;
+
+    /// Number of cols this `DataSource` got.
+    fn ncols(&self) -> usize;
+}
+
+pub trait PartitionParser<'a>: Send {
+    type TypeSystem: TypeSystem;
+    type Error: From<ConnectorXError> + Send + Debug;
+
+    /// Read a value `T` by calling `Produce::produce`. Usually this function does not need to be
+    /// implemented.
+    fn parse<'r, T>(&'r mut self) -> Result<T, <Self as PartitionParser<'a>>::Error>
+    where
+        T: TypeAssoc<Self::TypeSystem>,
+        Self: Produce<'r, T, Error = <Self as PartitionParser<'a>>::Error>,
+    {
+        self.produce()
+    }
+
+    /// Fetch the next batch of rows from the database, returning (number of rows fetched locally, whether all rows have been fetched from the database).
+    /// There might be rows that are not yet consumed when the next fetch_next is called.
+    /// The function might be called even after the last batch is fetched.
+    fn fetch_next(&mut self) -> Result<(usize, bool), Self::Error>;
+}
+
+/// A type implementing `Produce<'r, T>` can produce a value `T` by consuming part of its raw data buffer.
+pub trait Produce<'r, T> {
+    type Error: From<ConnectorXError> + Send;
+
+    fn produce(&'r mut self) -> Result<T, Self::Error>;
+}
diff --git a/connectorx/src/sources/mssql/errors.rs b/connectorx/src/sources/mssql/errors.rs
new file mode 100644
index 0000000..5e55bc3
--- /dev/null
+++ b/connectorx/src/sources/mssql/errors.rs
@@ -0,0 +1,30 @@
+use std::string::FromUtf8Error;
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum MsSQLSourceError {
+    #[error("Cannot get # of rows in the partition")]
+    GetNRowsFailed,
+
+    #[error(transparent)]
+    ConnectorXError(#[from] crate::errors::ConnectorXError),
+
+    #[error(transparent)]
+    MsSQLError(#[from] tiberius::error::Error),
+
+    #[error(transparent)]
+    MsSQLRuntimeError(#[from] bb8::RunError<bb8_tiberius::Error>),
+
+    #[error(transparent)]
+    MsSQLPoolError(#[from] bb8_tiberius::Error),
+
+    #[error(transparent)]
+    MsSQLUrlError(#[from] url::ParseError),
+
+    #[error(transparent)]
+    MsSQLUrlDecodeError(#[from] FromUtf8Error),
+
+    /// Any other errors that are too trivial to be put here explicitly.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
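Taken together, the four traits in `sources/mod.rs` define the call sequence a destination driver is expected to follow. A sketch of that loop, generic over any source (`load` and its bounds are illustrative, not an API of the crate):

```rust
fn load<S: Source>(mut source: S, queries: &[CXQuery<String>]) -> Result<(), S::Error> {
    source.set_data_order(DataOrder::RowMajor)?;
    source.set_queries(queries);
    source.fetch_metadata()?; // names()/schema() are meaningful from here on
    for mut partition in source.partition()? {
        let mut parser = partition.parser()?;
        loop {
            let (n, is_last) = parser.fetch_next()?; // n rows now buffered locally
            for _row in 0..n {
                // one parser.parse::<T>() per column, in schema order
            }
            if is_last {
                break;
            }
        }
    }
    Ok(())
}
```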
diff --git a/connectorx/src/sources/mssql/mod.rs b/connectorx/src/sources/mssql/mod.rs
new file mode 100644
index 0000000..2a88f24
--- /dev/null
+++ b/connectorx/src/sources/mssql/mod.rs
@@ -0,0 +1,409 @@
+//! Source implementation for SQL Server.
+
+mod errors;
+mod typesystem;
+
+pub use self::errors::MsSQLSourceError;
+pub use self::typesystem::{FloatN, IntN, MsSQLTypeSystem};
+use crate::constants::DB_BUFFER_SIZE;
+use crate::{
+    data_order::DataOrder,
+    errors::ConnectorXError,
+    sources::{PartitionParser, Produce, Source, SourcePartition},
+    sql::{count_query, CXQuery},
+    utils::DummyBox,
+};
+use anyhow::anyhow;
+use bb8::{Pool, PooledConnection};
+use bb8_tiberius::ConnectionManager;
+use chrono::{DateTime, Utc};
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use fehler::{throw, throws};
+use futures::StreamExt;
+use log::debug;
+use owning_ref::OwningHandle;
+use rust_decimal::Decimal;
+use sqlparser::dialect::MsSqlDialect;
+use std::collections::HashMap;
+use std::sync::Arc;
+use tiberius::{AuthMethod, Config, EncryptionLevel, QueryResult, Row};
+use tokio::runtime::{Handle, Runtime};
+use url::Url;
+use urlencoding::decode;
+use uuid::Uuid;
+
+type Conn<'a> = PooledConnection<'a, ConnectionManager>;
+pub struct MsSQLSource {
+    rt: Arc<Runtime>,
+    pool: Pool<ConnectionManager>,
+    origin_query: Option<String>,
+    queries: Vec<CXQuery<String>>,
+    names: Vec<String>,
+    schema: Vec<MsSQLTypeSystem>,
+}
+
+#[throws(MsSQLSourceError)]
+pub fn mssql_config(url: &Url) -> Config {
+    let mut config = Config::new();
+
+    let host = decode(url.host_str().unwrap_or("localhost"))?.into_owned();
+    let hosts: Vec<&str> = host.split('\\').collect();
+    match hosts.len() {
+        1 => config.host(host),
+        2 => {
+            // SQL Server supports instance names: `server\instance:port`
+            config.host(hosts[0]);
+            config.instance_name(hosts[1]);
+        }
+        _ => throw!(anyhow!("MsSQL hostname parse error: {}", host)),
+    }
+    config.port(url.port().unwrap_or(1433));
+    // remove the leading "/"
+    config.database(&url.path()[1..]);
+    // Using SQL Server authentication.
+    #[allow(unused)]
+    let params: HashMap<String, String> = url.query_pairs().into_owned().collect();
+    #[cfg(any(windows, feature = "integrated-auth-gssapi"))]
+    match params.get("trusted_connection") {
+        // prefer trusted_connection if set to true
+        Some(v) if v == "true" => {
+            debug!("mssql auth through trusted connection!");
+            config.authentication(AuthMethod::Integrated);
+        }
+        _ => {
+            debug!("mssql auth through sqlserver authentication");
+            config.authentication(AuthMethod::sql_server(
+                decode(url.username())?.to_owned(),
+                decode(url.password().unwrap_or(""))?.to_owned(),
+            ));
+        }
+    };
+    #[cfg(all(not(windows), not(feature = "integrated-auth-gssapi")))]
+    config.authentication(AuthMethod::sql_server(
+        decode(url.username())?.to_owned(),
+        decode(url.password().unwrap_or(""))?.to_owned(),
+    ));
+
+    match params.get("encrypt") {
+        Some(v) if v.to_lowercase() == "true" => config.encryption(EncryptionLevel::Required),
+        _ => config.encryption(EncryptionLevel::NotSupported),
+    };
+
+    match params.get("appname") {
+        Some(appname) => config.application_name(decode(appname)?.to_owned()),
+        _ => {}
+    };
+
+    config
+}
+
+impl MsSQLSource {
+    #[throws(MsSQLSourceError)]
+    pub fn new(rt: Arc<Runtime>, conn: &str, nconn: usize) -> Self {
+        let url = Url::parse(conn)?;
+        let config = mssql_config(&url)?;
+        let manager = bb8_tiberius::ConnectionManager::new(config);
+        let pool = rt.block_on(Pool::builder().max_size(nconn as u32).build(manager))?;
+
+        Self {
+            rt,
+            pool,
+            origin_query: None,
+            queries: vec![],
+            names: vec![],
+            schema: vec![],
+        }
+    }
+}
+
+impl Source for MsSQLSource
+where
+    MsSQLSourcePartition: SourcePartition<TypeSystem = MsSQLTypeSystem, Error = MsSQLSourceError>,
+{
+    const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::RowMajor];
+    type Partition = MsSQLSourcePartition;
+    type TypeSystem = MsSQLTypeSystem;
+    type Error = MsSQLSourceError;
+
+    #[throws(MsSQLSourceError)]
+    fn set_data_order(&mut self, data_order: DataOrder) {
+        if !matches!(data_order, DataOrder::RowMajor) {
+            throw!(ConnectorXError::UnsupportedDataOrder(data_order));
+        }
+    }
+
+    fn set_queries<Q: ToString>(&mut self, queries: &[CXQuery<Q>]) {
+        self.queries = queries.iter().map(|q| q.map(Q::to_string)).collect();
+    }
+
+    fn set_origin_query(&mut self, query: Option<String>) {
+        self.origin_query = query;
+    }
+
+    #[throws(MsSQLSourceError)]
+    fn fetch_metadata(&mut self) {
+        assert!(!self.queries.is_empty());
+
+        let mut conn = self.rt.block_on(self.pool.get())?;
+        let first_query = &self.queries[0];
+        let (names, types) = match self.rt.block_on(conn.query(first_query.as_str(), &[])) {
+            Ok(stream) => {
+                let columns = stream.columns().ok_or_else(|| {
+                    anyhow!("MsSQL failed to get the columns of query: {}", first_query)
+                })?;
+                columns
+                    .iter()
+                    .map(|col| {
+                        (
+                            col.name().to_string(),
+                            MsSQLTypeSystem::from(&col.column_type()),
+                        )
+                    })
+                    .unzip()
+            }
+            Err(e) => {
+                // the query failed; report the error and give up
+                debug!("cannot get metadata for '{}': {}", first_query, e);
+                throw!(e);
+            }
+        };
+
+        self.names = names;
+        self.schema = types;
+    }
+
+    #[throws(MsSQLSourceError)]
+    fn result_rows(&mut self) -> Option<usize> {
+        match &self.origin_query {
+            Some(q) => {
+                let cxq = CXQuery::Naked(q.clone());
+                let cquery = count_query(&cxq, &MsSqlDialect {})?;
+                let mut conn = self.rt.block_on(self.pool.get())?;
+
+                let stream = self.rt.block_on(conn.query(cquery.as_str(), &[]))?;
+                let row = self
+                    .rt
+                    .block_on(stream.into_row())?
+ .ok_or_else(|| anyhow!("MsSQL failed to get the count of query: {}", q))?; + + let row: i32 = row.get(0).ok_or(MsSQLSourceError::GetNRowsFailed)?; // the count in mssql is i32 + Some(row as usize) + } + None => None, + } + } + + fn names(&self) -> Vec { + self.names.clone() + } + + fn schema(&self) -> Vec { + self.schema.clone() + } + + #[throws(MsSQLSourceError)] + fn partition(self) -> Vec { + let mut ret = vec![]; + for query in self.queries { + ret.push(MsSQLSourcePartition::new( + self.pool.clone(), + self.rt.clone(), + &query, + &self.schema, + )); + } + ret + } +} + +pub struct MsSQLSourcePartition { + pool: Pool, + rt: Arc, + query: CXQuery, + schema: Vec, + nrows: usize, + ncols: usize, +} + +impl MsSQLSourcePartition { + pub fn new( + pool: Pool, + handle: Arc, + query: &CXQuery, + schema: &[MsSQLTypeSystem], + ) -> Self { + Self { + rt: handle, + pool, + query: query.clone(), + schema: schema.to_vec(), + nrows: 0, + ncols: schema.len(), + } + } +} + +impl SourcePartition for MsSQLSourcePartition { + type TypeSystem = MsSQLTypeSystem; + type Parser<'a> = MsSQLSourceParser<'a>; + type Error = MsSQLSourceError; + + #[throws(MsSQLSourceError)] + fn result_rows(&mut self) { + let cquery = count_query(&self.query, &MsSqlDialect {})?; + let mut conn = self.rt.block_on(self.pool.get())?; + + let stream = self.rt.block_on(conn.query(cquery.as_str(), &[]))?; + let row = self + .rt + .block_on(stream.into_row())? + .ok_or_else(|| anyhow!("MsSQL failed to get the count of query: {}", self.query))?; + + let row: i32 = row.get(0).ok_or(MsSQLSourceError::GetNRowsFailed)?; // the count in mssql is i32 + self.nrows = row as usize; + } + + #[throws(MsSQLSourceError)] + fn parser<'a>(&'a mut self) -> Self::Parser<'a> { + let conn = self.rt.block_on(self.pool.get())?; + let rows: OwningHandle>, DummyBox>> = + OwningHandle::new_with_fn(Box::new(conn), |conn: *const Conn<'a>| unsafe { + let conn = &mut *(conn as *mut Conn<'a>); + + DummyBox( + self.rt + .block_on(conn.query(self.query.as_str(), &[])) + .unwrap(), + ) + }); + + MsSQLSourceParser::new(self.rt.handle(), rows, &self.schema) + } + + fn nrows(&self) -> usize { + self.nrows + } + + fn ncols(&self) -> usize { + self.ncols + } +} + +pub struct MsSQLSourceParser<'a> { + rt: &'a Handle, + iter: OwningHandle>, DummyBox>>, + rowbuf: Vec, + ncols: usize, + current_col: usize, + current_row: usize, + is_finished: bool, +} + +impl<'a> MsSQLSourceParser<'a> { + fn new( + rt: &'a Handle, + iter: OwningHandle>, DummyBox>>, + schema: &[MsSQLTypeSystem], + ) -> Self { + Self { + rt, + iter, + rowbuf: Vec::with_capacity(DB_BUFFER_SIZE), + ncols: schema.len(), + current_row: 0, + current_col: 0, + is_finished: false, + } + } + + #[throws(MsSQLSourceError)] + fn next_loc(&mut self) -> (usize, usize) { + let ret = (self.current_row, self.current_col); + self.current_row += (self.current_col + 1) / self.ncols; + self.current_col = (self.current_col + 1) % self.ncols; + ret + } +} + +impl<'a> PartitionParser<'a> for MsSQLSourceParser<'a> { + type TypeSystem = MsSQLTypeSystem; + type Error = MsSQLSourceError; + + #[throws(MsSQLSourceError)] + fn fetch_next(&mut self) -> (usize, bool) { + assert!(self.current_col == 0); + let remaining_rows = self.rowbuf.len() - self.current_row; + if remaining_rows > 0 { + return (remaining_rows, self.is_finished); + } else if self.is_finished { + return (0, self.is_finished); + } + + if !self.rowbuf.is_empty() { + self.rowbuf.drain(..); + } + + for _ in 0..DB_BUFFER_SIZE { + if let Some(item) = 
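Taken together, the `Source` → `SourcePartition` → `PartitionParser` split above is driven roughly as follows. This is an illustrative sketch, not part of the patch: the connection string and partition queries are invented, error handling is minimal, and the per-cell `Produce::<T>::produce()` calls (one per cell of each fetched batch) are elided.

```rust
use std::sync::Arc;
use tokio::runtime::Runtime;

// Hypothetical driver; assumes the mssql items above are in scope.
fn run() -> Result<(), MsSQLSourceError> {
    let rt = Arc::new(Runtime::new().expect("tokio runtime"));
    let mut src = MsSQLSource::new(rt, "mssql://user:pass@localhost:1433/db", 2)?;
    src.set_queries(&[
        CXQuery::Naked("SELECT * FROM t WHERE id % 2 = 0".to_string()),
        CXQuery::Naked("SELECT * FROM t WHERE id % 2 = 1".to_string()),
    ]);
    src.fetch_metadata()?; // derives column names + MsSQLTypeSystem schema
    for mut part in src.partition()? {
        let mut parser = part.parser()?;
        loop {
            // Rows arrive in batches of at most DB_BUFFER_SIZE; a real caller
            // must consume n * ncols cells via Produce before the next batch.
            let (_n, is_last) = parser.fetch_next()?;
            if is_last {
                break;
            }
        }
    }
    Ok(())
}
```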
diff --git a/connectorx/src/sources/mssql/typesystem.rs b/connectorx/src/sources/mssql/typesystem.rs new file mode 100644 index 0000000..2da6aa4 --- /dev/null +++ b/connectorx/src/sources/mssql/typesystem.rs @@ -0,0 +1,135 @@
+use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+use rust_decimal::Decimal;
+use tiberius::{ColumnData, ColumnType, FromSql};
+use uuid::Uuid;
+
+// https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-tds/ce3183a6-9d89-47e8-a02f-de5a1a1303de
+#[derive(Copy, Clone, Debug)]
+pub enum MsSQLTypeSystem {
+    Tinyint(bool),
+    Smallint(bool),
+    Int(bool),
+    Bigint(bool),
+    Intn(bool),
+    Float24(bool),
+    Float53(bool),
+    Floatn(bool),
+    Bit(bool),
+    Nvarchar(bool),
+    Varchar(bool),
+    Nchar(bool),
+    Char(bool),
+    Ntext(bool),
+    Text(bool),
+    Binary(bool),
+    Varbinary(bool),
+    Image(bool),
+    Uniqueidentifier(bool),
+    Numeric(bool),
+    Decimal(bool),
+    Datetime(bool),
+    Datetime2(bool),
+    Smalldatetime(bool),
+    Date(bool),
+    Time(bool),
+    Datetimeoffset(bool),
+    Money(bool),
+    SmallMoney(bool),
+}
+
+impl_typesystem! {
+    system = MsSQLTypeSystem,
+    mappings = {
+        { Tinyint => u8 }
+        { Smallint => i16 }
+        { Int => i32 }
+        { Bigint => i64 }
+        { Intn => IntN }
+        { Float24 | SmallMoney => f32 }
+        { Float53 | Money => f64 }
+        { Floatn => FloatN }
+        { Bit => bool }
+        { Nvarchar | Varchar | Nchar | Char | Text | Ntext => &'r str }
+        { Binary | Varbinary | Image => &'r [u8] }
+        { Uniqueidentifier => Uuid }
+        { Numeric | Decimal => Decimal }
+        { Datetime | Datetime2 | Smalldatetime => NaiveDateTime }
+        { Date => NaiveDate }
+        { Time => NaiveTime }
+        { Datetimeoffset => DateTime<Utc> }
+    }
+}
+
+impl<'a> From<&'a ColumnType> for MsSQLTypeSystem {
+    fn from(ty: &'a ColumnType) -> MsSQLTypeSystem {
+        use MsSQLTypeSystem::*;
+
+        match ty {
+            ColumnType::Int1 => Tinyint(false),
+            ColumnType::Int2 => Smallint(false),
+            ColumnType::Int4 => Int(false),
+            ColumnType::Int8 => Bigint(false),
+            ColumnType::Intn => Intn(true),
+            ColumnType::Float4 => Float24(false),
+            ColumnType::Float8 => Float53(false),
+            ColumnType::Floatn => Floatn(true),
+            ColumnType::Bit => Bit(false),
+            ColumnType::Bitn => Bit(true), // nullable int, var-length
+            ColumnType::NVarchar => Nvarchar(true),
+            ColumnType::BigVarChar => Varchar(true),
+            ColumnType::NChar => Nchar(true),
+            ColumnType::BigChar => Char(true),
+            ColumnType::NText => Ntext(true),
+            ColumnType::Text => Text(true),
+            ColumnType::BigBinary => Binary(true),
+            ColumnType::BigVarBin => Varbinary(true),
+            ColumnType::Image => Image(true),
+            ColumnType::Guid => Uniqueidentifier(true),
+            ColumnType::Decimaln => Decimal(true),
+            ColumnType::Numericn => Numeric(true),
+            ColumnType::Datetime => Datetime(false),
+            ColumnType::Datetime2 => Datetime2(true),
+            ColumnType::Datetimen => Datetime(true),
+            ColumnType::Datetime4 => Datetime(false),
+            ColumnType::Daten => Date(true),
+            ColumnType::Timen => Time(true),
+            ColumnType::DatetimeOffsetn => Datetimeoffset(true),
+            ColumnType::Money => Money(true),
+            ColumnType::Money4 => SmallMoney(true),
+            _ => unimplemented!("{}", format!("{:?}", ty)),
+        }
+    }
+}
+
+pub struct IntN(pub i64);
+impl<'a> FromSql<'a> for IntN {
+    fn from_sql(value: &'a ColumnData<'static>) -> Result<Option<Self>, tiberius::error::Error> {
+        match value {
+            ColumnData::U8(None)
+            | ColumnData::I16(None)
+            | ColumnData::I32(None)
+            | ColumnData::I64(None) => Ok(None),
+            ColumnData::U8(Some(d)) => Ok(Some(IntN(*d as i64))),
+            ColumnData::I16(Some(d)) => Ok(Some(IntN(*d as i64))),
+            ColumnData::I32(Some(d)) => Ok(Some(IntN(*d as i64))),
+            ColumnData::I64(Some(d)) => Ok(Some(IntN(*d))),
+            v => Err(tiberius::error::Error::Conversion(
+                format!("cannot interpret {:?} as an intn value", v).into(),
+            )),
+        }
+    }
+}
+
+pub struct FloatN(pub f64);
+impl<'a> FromSql<'a> for FloatN {
+    fn from_sql(value: &'a ColumnData<'static>) -> Result<Option<Self>, tiberius::error::Error> {
+        match value {
+            ColumnData::F32(None) | ColumnData::F64(None) => Ok(None),
+            ColumnData::F32(Some(d)) => Ok(Some(FloatN(*d as f64))),
+            ColumnData::F64(Some(d)) => Ok(Some(FloatN(*d))),
+            v => Err(tiberius::error::Error::Conversion(
+                format!("cannot interpret {:?} as a floatn value", v).into(),
+            )),
+        }
+    }
+}
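Every variant above carries a `bool` nullability flag. A tiny illustration of the convention (a sketch, not part of the patch; the asserted mapping follows from the `From<&ColumnType>` impl above):

```rust
use tiberius::ColumnType;

// Sketch: tiberius reports NVARCHAR columns as nullable, so they map to
// Nvarchar(true); nullable cells are later produced as Option<&str> not &str.
fn nullability_sketch() {
    let ty = MsSQLTypeSystem::from(&ColumnType::NVarchar);
    assert!(matches!(ty, MsSQLTypeSystem::Nvarchar(true)));
}
```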
diff --git a/connectorx/src/sources/mysql/errors.rs b/connectorx/src/sources/mysql/errors.rs new file mode 100644 index 0000000..6c72466 --- /dev/null +++ b/connectorx/src/sources/mysql/errors.rs @@ -0,0 +1,20 @@
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum MySQLSourceError {
+    #[error(transparent)]
+    ConnectorXError(#[from] crate::errors::ConnectorXError),
+
+    #[error(transparent)]
+    MySQLError(#[from] r2d2_mysql::mysql::Error),
+
+    #[error(transparent)]
+    MySQLUrlError(#[from] r2d2_mysql::mysql::UrlError),
+
+    #[error(transparent)]
+    MySQLPoolError(#[from] r2d2::Error),
+
+    /// Any other errors that are too trivial to be put here explicitly.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
diff --git a/connectorx/src/sources/mysql/mod.rs b/connectorx/src/sources/mysql/mod.rs new file mode 100644 index 0000000..5b3d039 --- /dev/null +++ b/connectorx/src/sources/mysql/mod.rs @@ -0,0 +1,494 @@
+//! Source implementation for MySQL database.
+
+mod errors;
+mod typesystem;
+
+pub use self::errors::MySQLSourceError;
+use crate::constants::DB_BUFFER_SIZE;
+use crate::{
+    data_order::DataOrder,
+    errors::ConnectorXError,
+    sources::{PartitionParser, Produce, Source, SourcePartition},
+    sql::{count_query, limit1_query, CXQuery},
+};
+use anyhow::anyhow;
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use fehler::{throw, throws};
+use log::{debug, warn};
+use r2d2::{Pool, PooledConnection};
+use r2d2_mysql::{
+    mysql::{prelude::Queryable, Binary, Opts, OptsBuilder, QueryResult, Row, Text},
+    MySqlConnectionManager,
+};
+use rust_decimal::Decimal;
+use serde_json::Value;
+use sqlparser::dialect::MySqlDialect;
+use std::marker::PhantomData;
+pub use typesystem::MySQLTypeSystem;
+
+type MysqlConn = PooledConnection<MySqlConnectionManager>;
+
+pub enum BinaryProtocol {}
+pub enum TextProtocol {}
+
+#[throws(MySQLSourceError)]
+fn get_total_rows(conn: &mut MysqlConn, query: &CXQuery<String>) -> usize {
+    conn.query_first(&count_query(query, &MySqlDialect {})?)?
+        .ok_or_else(|| anyhow!("mysql failed to get the count of query: {}", query))?
+}
+
+pub struct MySQLSource<P> {
+    pool: Pool<MySqlConnectionManager>,
+    origin_query: Option<String>,
+    queries: Vec<CXQuery<String>>,
+    names: Vec<String>,
+    schema: Vec<MySQLTypeSystem>,
+    _protocol: PhantomData<P>,
+}
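+// Editor's note (annotation, not in the original patch): BinaryProtocol and
+// TextProtocol above are zero-sized marker types. The wire protocol is chosen
+// at the type level through the P parameter, and the two SourcePartition impls
+// further below select prepared-statement (binary) vs. plain-text result
+// parsing accordingly.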
+
+impl<P> MySQLSource<P>
{ + #[throws(MySQLSourceError)] + pub fn new(conn: &str, nconn: usize) -> Self { + let manager = MySqlConnectionManager::new(OptsBuilder::from_opts(Opts::from_url(conn)?)); + let pool = r2d2::Pool::builder() + .max_size(nconn as u32) + .build(manager)?; + + Self { + pool, + origin_query: None, + queries: vec![], + names: vec![], + schema: vec![], + _protocol: PhantomData, + } + } +} + +impl
<P> Source for MySQLSource<P>
+where
+    MySQLSourcePartition<P>:
+        SourcePartition<TypeSystem = MySQLTypeSystem, Error = MySQLSourceError>,
+    P: Send,
+{
+    const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::RowMajor];
+    type Partition = MySQLSourcePartition<P>;
+    type TypeSystem = MySQLTypeSystem;
+    type Error = MySQLSourceError;
+
+    #[throws(MySQLSourceError)]
+    fn set_data_order(&mut self, data_order: DataOrder) {
+        if !matches!(data_order, DataOrder::RowMajor) {
+            throw!(ConnectorXError::UnsupportedDataOrder(data_order));
+        }
+    }
+
+    fn set_queries<Q: ToString>(&mut self, queries: &[CXQuery<Q>]) {
+        self.queries = queries.iter().map(|q| q.map(Q::to_string)).collect();
+    }
+
+    fn set_origin_query(&mut self, query: Option<String>) {
+        self.origin_query = query;
+    }
+
+    #[throws(MySQLSourceError)]
+    fn fetch_metadata(&mut self) {
+        assert!(!self.queries.is_empty());
+
+        let mut conn = self.pool.get()?;
+        let first_query = &self.queries[0];
+
+        match conn.prep(first_query) {
+            Ok(stmt) => {
+                let (names, types) = stmt
+                    .columns()
+                    .iter()
+                    .map(|col| {
+                        (
+                            col.name_str().to_string(),
+                            MySQLTypeSystem::from((&col.column_type(), &col.flags())),
+                        )
+                    })
+                    .unzip();
+                self.names = names;
+                self.schema = types;
+            }
+            Err(e) => {
+                warn!(
+                    "mysql text prepared statement error: {:?}, switch to limit1 method",
+                    e
+                );
+                for (i, query) in self.queries.iter().enumerate() {
+                    // assuming all the partition queries yield the same schema
+                    match conn
+                        .query_first::<Row, _>(limit1_query(query, &MySqlDialect {})?.as_str())
+                    {
+                        Ok(Some(row)) => {
+                            let (names, types) = row
+                                .columns_ref()
+                                .iter()
+                                .map(|col| {
+                                    (
+                                        col.name_str().to_string(),
+                                        MySQLTypeSystem::from((&col.column_type(), &col.flags())),
+                                    )
+                                })
+                                .unzip();
+                            self.names = names;
+                            self.schema = types;
+                            return;
+                        }
+                        Ok(None) => {}
+                        Err(e) if i == self.queries.len() - 1 => {
+                            // tried the last query but still got an error
+                            debug!("cannot get metadata for '{}', try next query: {}", query, e);
+                            throw!(e)
+                        }
+                        Err(_) => {}
+                    }
+                }
+
+                // tried all queries but all returned an empty result set
+                let iter = conn.query_iter(self.queries[0].as_str())?;
+                let (names, types) = iter
+                    .columns()
+                    .as_ref()
+                    .iter()
+                    .map(|col| {
+                        (
+                            col.name_str().to_string(),
+                            MySQLTypeSystem::VarChar(false), // set all columns as string (align with pandas)
+                        )
+                    })
+                    .unzip();
+                self.names = names;
+                self.schema = types;
+            }
+        }
+    }
+
+    #[throws(MySQLSourceError)]
+    fn result_rows(&mut self) -> Option<usize> {
+        match &self.origin_query {
+            Some(q) => {
+                let cxq = CXQuery::Naked(q.clone());
+                let mut conn = self.pool.get()?;
+                let nrows = get_total_rows(&mut conn, &cxq)?;
+                Some(nrows)
+            }
+            None => None,
+        }
+    }
+
+    fn names(&self) -> Vec<String> {
+        self.names.clone()
+    }
+
+    fn schema(&self) -> Vec<Self::TypeSystem> {
+        self.schema.clone()
+    }
+
+    #[throws(MySQLSourceError)]
+    fn partition(self) -> Vec<Self::Partition> {
+        let mut ret = vec![];
+        for query in self.queries {
+            let conn = self.pool.get()?;
+            ret.push(MySQLSourcePartition::new(conn, &query, &self.schema));
+        }
+        ret
+    }
+}
+
+pub struct MySQLSourcePartition<P> {
+    conn: MysqlConn,
+    query: CXQuery<String>,
+    schema: Vec<MySQLTypeSystem>,
+    nrows: usize,
+    ncols: usize,
+    _protocol: PhantomData<P>,
+}
+
+impl<P> MySQLSourcePartition<P>
{
+    pub fn new(conn: MysqlConn, query: &CXQuery<String>, schema: &[MySQLTypeSystem]) -> Self {
+        Self {
+            conn,
+            query: query.clone(),
+            schema: schema.to_vec(),
+            nrows: 0,
+            ncols: schema.len(),
+            _protocol: PhantomData,
+        }
+    }
+}
+
+impl SourcePartition for MySQLSourcePartition<BinaryProtocol> {
+    type TypeSystem = MySQLTypeSystem;
+    type Parser<'a> = MySQLBinarySourceParser<'a>;
+    type Error = MySQLSourceError;
+
+    #[throws(MySQLSourceError)]
+    fn result_rows(&mut self) {
+        self.nrows = get_total_rows(&mut self.conn, &self.query)?;
+    }
+
+    #[throws(MySQLSourceError)]
+    fn parser(&mut self) -> Self::Parser<'_> {
+        let stmt = self.conn.prep(self.query.as_str())?;
+        let iter = self.conn.exec_iter(stmt, ())?;
+        MySQLBinarySourceParser::new(iter, &self.schema)
+    }
+
+    fn nrows(&self) -> usize {
+        self.nrows
+    }
+
+    fn ncols(&self) -> usize {
+        self.ncols
+    }
+}
+
+impl SourcePartition for MySQLSourcePartition<TextProtocol> {
+    type TypeSystem = MySQLTypeSystem;
+    type Parser<'a> = MySQLTextSourceParser<'a>;
+    type Error = MySQLSourceError;
+
+    #[throws(MySQLSourceError)]
+    fn result_rows(&mut self) {
+        self.nrows = get_total_rows(&mut self.conn, &self.query)?;
+    }
+
+    #[throws(MySQLSourceError)]
+    fn parser(&mut self) -> Self::Parser<'_> {
+        let query = self.query.clone();
+        let iter = self.conn.query_iter(query)?;
+        MySQLTextSourceParser::new(iter, &self.schema)
+    }
+
+    fn nrows(&self) -> usize {
+        self.nrows
+    }
+
+    fn ncols(&self) -> usize {
+        self.ncols
+    }
+}
+
+pub struct MySQLBinarySourceParser<'a> {
+    iter: QueryResult<'a, 'a, 'a, Binary>,
+    rowbuf: Vec<Row>,
+    ncols: usize,
+    current_col: usize,
+    current_row: usize,
+    is_finished: bool,
+}
+
+impl<'a> MySQLBinarySourceParser<'a> {
+    pub fn new(iter: QueryResult<'a, 'a, 'a, Binary>, schema: &[MySQLTypeSystem]) -> Self {
+        Self {
+            iter,
+            rowbuf: Vec::with_capacity(DB_BUFFER_SIZE),
+            ncols: schema.len(),
+            current_row: 0,
+            current_col: 0,
+            is_finished: false,
+        }
+    }
+
+    #[throws(MySQLSourceError)]
+    fn next_loc(&mut self) -> (usize, usize) {
+        let ret = (self.current_row, self.current_col);
+        self.current_row += (self.current_col + 1) / self.ncols;
+        self.current_col = (self.current_col + 1) % self.ncols;
+        ret
+    }
+}
+
+impl<'a> PartitionParser<'a> for MySQLBinarySourceParser<'a> {
+    type TypeSystem = MySQLTypeSystem;
+    type Error = MySQLSourceError;
+
+    #[throws(MySQLSourceError)]
+    fn fetch_next(&mut self) -> (usize, bool) {
+        assert!(self.current_col == 0);
+        let remaining_rows = self.rowbuf.len() - self.current_row;
+        if remaining_rows > 0 {
+            return (remaining_rows, self.is_finished);
+        } else if self.is_finished {
+            return (0, self.is_finished);
+        }
+
+        if !self.rowbuf.is_empty() {
+            self.rowbuf.drain(..);
+        }
+
+        for _ in 0..DB_BUFFER_SIZE {
+            if let Some(item) = self.iter.next() {
+                self.rowbuf.push(item?);
+            } else {
+                self.is_finished = true;
+                break;
+            }
+        }
+        self.current_row = 0;
+        self.current_col = 0;
+
+        (self.rowbuf.len(), self.is_finished)
+    }
+}
+
+macro_rules!
impl_produce_binary { + ($($t: ty,)+) => { + $( + impl<'r, 'a> Produce<'r, $t> for MySQLBinarySourceParser<'a> { + type Error = MySQLSourceError; + + #[throws(MySQLSourceError)] + fn produce(&'r mut self) -> $t { + let (ridx, cidx) = self.next_loc()?; + let res = self.rowbuf[ridx].take(cidx).ok_or_else(|| anyhow!("mysql cannot parse at position: ({}, {})", ridx, cidx))?; + res + } + } + + impl<'r, 'a> Produce<'r, Option<$t>> for MySQLBinarySourceParser<'a> { + type Error = MySQLSourceError; + + #[throws(MySQLSourceError)] + fn produce(&'r mut self) -> Option<$t> { + let (ridx, cidx) = self.next_loc()?; + let res = self.rowbuf[ridx].take(cidx).ok_or_else(|| anyhow!("mysql cannot parse at position: ({}, {})", ridx, cidx))?; + res + } + } + )+ + }; +} + +impl_produce_binary!( + i8, + i16, + i32, + i64, + u8, + u16, + u32, + u64, + f32, + f64, + NaiveDate, + NaiveTime, + NaiveDateTime, + Decimal, + String, + Vec, + Value, +); + +pub struct MySQLTextSourceParser<'a> { + iter: QueryResult<'a, 'a, 'a, Text>, + rowbuf: Vec, + ncols: usize, + current_col: usize, + current_row: usize, + is_finished: bool, +} + +impl<'a> MySQLTextSourceParser<'a> { + pub fn new(iter: QueryResult<'a, 'a, 'a, Text>, schema: &[MySQLTypeSystem]) -> Self { + Self { + iter, + rowbuf: Vec::with_capacity(DB_BUFFER_SIZE), + ncols: schema.len(), + current_row: 0, + current_col: 0, + is_finished: false, + } + } + + #[throws(MySQLSourceError)] + fn next_loc(&mut self) -> (usize, usize) { + let ret = (self.current_row, self.current_col); + self.current_row += (self.current_col + 1) / self.ncols; + self.current_col = (self.current_col + 1) % self.ncols; + ret + } +} + +impl<'a> PartitionParser<'a> for MySQLTextSourceParser<'a> { + type TypeSystem = MySQLTypeSystem; + type Error = MySQLSourceError; + + #[throws(MySQLSourceError)] + fn fetch_next(&mut self) -> (usize, bool) { + assert!(self.current_col == 0); + let remaining_rows = self.rowbuf.len() - self.current_row; + if remaining_rows > 0 { + return (remaining_rows, self.is_finished); + } else if self.is_finished { + return (0, self.is_finished); + } + + if !self.rowbuf.is_empty() { + self.rowbuf.drain(..); + } + for _ in 0..DB_BUFFER_SIZE { + if let Some(item) = self.iter.next() { + self.rowbuf.push(item?); + } else { + self.is_finished = true; + break; + } + } + self.current_row = 0; + self.current_col = 0; + (self.rowbuf.len(), self.is_finished) + } +} + +macro_rules! 
impl_produce_text { + ($($t: ty,)+) => { + $( + impl<'r, 'a> Produce<'r, $t> for MySQLTextSourceParser<'a> { + type Error = MySQLSourceError; + + #[throws(MySQLSourceError)] + fn produce(&'r mut self) -> $t { + let (ridx, cidx) = self.next_loc()?; + let res = self.rowbuf[ridx].take(cidx).ok_or_else(|| anyhow!("mysql cannot parse at position: ({}, {})", ridx, cidx))?; + res + } + } + + impl<'r, 'a> Produce<'r, Option<$t>> for MySQLTextSourceParser<'a> { + type Error = MySQLSourceError; + + #[throws(MySQLSourceError)] + fn produce(&'r mut self) -> Option<$t> { + let (ridx, cidx) = self.next_loc()?; + let res = self.rowbuf[ridx].take(cidx).ok_or_else(|| anyhow!("mysql cannot parse at position: ({}, {})", ridx, cidx))?; + res + } + } + )+ + }; +} + +impl_produce_text!( + i8, + i16, + i32, + i64, + u8, + u16, + u32, + u64, + f32, + f64, + NaiveDate, + NaiveTime, + NaiveDateTime, + Decimal, + String, + Vec, + Value, +); diff --git a/connectorx/src/sources/mysql/typesystem.rs b/connectorx/src/sources/mysql/typesystem.rs new file mode 100644 index 0000000..105b727 --- /dev/null +++ b/connectorx/src/sources/mysql/typesystem.rs @@ -0,0 +1,122 @@ +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use r2d2_mysql::mysql::consts::{ColumnFlags, ColumnType}; +use rust_decimal::Decimal; +use serde_json::Value; + +#[derive(Copy, Clone, Debug)] +pub enum MySQLTypeSystem { + Float(bool), + Double(bool), + Tiny(bool), + Short(bool), + Long(bool), + Int24(bool), + LongLong(bool), + UTiny(bool), + UShort(bool), + ULong(bool), + UInt24(bool), + ULongLong(bool), + Datetime(bool), + Date(bool), + Time(bool), + Decimal(bool), + Char(bool), + VarChar(bool), + Timestamp(bool), + Year(bool), + Enum(bool), + TinyBlob(bool), + Blob(bool), + MediumBlob(bool), + LongBlob(bool), + Json(bool), +} + +impl_typesystem! 
{ + system = MySQLTypeSystem, + mappings = { + { Tiny => i8 } + { Short | Year => i16 } + { Long | Int24 => i32} + { LongLong => i64 } + { Float => f32 } + { Double => f64 } + { UTiny => u8 } + { UShort => u16 } + { ULong | UInt24 => u32} + { ULongLong => u64 } + { Datetime | Timestamp => NaiveDateTime } + { Date => NaiveDate } + { Time => NaiveTime } + { Decimal => Decimal } + { Char | VarChar | Enum => String } + { TinyBlob | Blob | MediumBlob | LongBlob => Vec} + { Json => Value } + } +} + +impl<'a> From<(&'a ColumnType, &'a ColumnFlags)> for MySQLTypeSystem { + fn from(col: (&'a ColumnType, &'a ColumnFlags)) -> MySQLTypeSystem { + use MySQLTypeSystem::*; + let (ty, flag) = col; + let null_ok = !flag.contains(ColumnFlags::NOT_NULL_FLAG); + let unsigned = flag.contains(ColumnFlags::UNSIGNED_FLAG); + match ty { + ColumnType::MYSQL_TYPE_TINY => { + if unsigned { + UTiny(null_ok) + } else { + Tiny(null_ok) + } + } + ColumnType::MYSQL_TYPE_SHORT => { + if unsigned { + UShort(null_ok) + } else { + Short(null_ok) + } + } + ColumnType::MYSQL_TYPE_INT24 => { + if unsigned { + UInt24(null_ok) + } else { + Int24(null_ok) + } + } + ColumnType::MYSQL_TYPE_LONG => { + if unsigned { + ULong(null_ok) + } else { + Long(null_ok) + } + } + ColumnType::MYSQL_TYPE_LONGLONG => { + if unsigned { + ULongLong(null_ok) + } else { + LongLong(null_ok) + } + } + ColumnType::MYSQL_TYPE_FLOAT => Float(null_ok), + ColumnType::MYSQL_TYPE_DOUBLE => Double(null_ok), + ColumnType::MYSQL_TYPE_DATETIME => Datetime(null_ok), + ColumnType::MYSQL_TYPE_DATE => Date(null_ok), + ColumnType::MYSQL_TYPE_TIME => Time(null_ok), + ColumnType::MYSQL_TYPE_DECIMAL => Decimal(null_ok), + ColumnType::MYSQL_TYPE_NEWDECIMAL => Decimal(null_ok), + ColumnType::MYSQL_TYPE_STRING => Char(null_ok), + ColumnType::MYSQL_TYPE_VAR_STRING => VarChar(null_ok), + ColumnType::MYSQL_TYPE_TIMESTAMP => Timestamp(null_ok), + ColumnType::MYSQL_TYPE_YEAR => Year(null_ok), + ColumnType::MYSQL_TYPE_ENUM => Enum(null_ok), + ColumnType::MYSQL_TYPE_TINY_BLOB => TinyBlob(null_ok), + ColumnType::MYSQL_TYPE_BLOB => Blob(null_ok), + ColumnType::MYSQL_TYPE_MEDIUM_BLOB => MediumBlob(null_ok), + ColumnType::MYSQL_TYPE_LONG_BLOB => LongBlob(null_ok), + ColumnType::MYSQL_TYPE_JSON => Json(null_ok), + ColumnType::MYSQL_TYPE_VARCHAR => VarChar(null_ok), + _ => unimplemented!("{}", format!("{:?}", ty)), + } + } +} diff --git a/connectorx/src/sources/oracle/errors.rs b/connectorx/src/sources/oracle/errors.rs new file mode 100644 index 0000000..b42927c --- /dev/null +++ b/connectorx/src/sources/oracle/errors.rs @@ -0,0 +1,24 @@ +use std::string::FromUtf8Error; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum OracleSourceError { + #[error(transparent)] + ConnectorXError(#[from] crate::errors::ConnectorXError), + + #[error(transparent)] + OracleError(#[from] r2d2_oracle::oracle::Error), + + #[error(transparent)] + OraclePoolError(#[from] r2d2::Error), + + #[error(transparent)] + OracleUrlError(#[from] url::ParseError), + + #[error(transparent)] + OracleUrlDecodeError(#[from] FromUtf8Error), + + /// Any other errors that are too trivial to be put here explicitly. 
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
diff --git a/connectorx/src/sources/oracle/mod.rs b/connectorx/src/sources/oracle/mod.rs new file mode 100644 index 0000000..07d4c33 --- /dev/null +++ b/connectorx/src/sources/oracle/mod.rs @@ -0,0 +1,356 @@
+mod errors;
+mod typesystem;
+
+pub use self::errors::OracleSourceError;
+pub use self::typesystem::OracleTypeSystem;
+use crate::constants::{DB_BUFFER_SIZE, ORACLE_ARRAY_SIZE};
+use crate::{
+    data_order::DataOrder,
+    errors::ConnectorXError,
+    sources::{PartitionParser, Produce, Source, SourcePartition},
+    sql::{count_query, limit1_query_oracle, CXQuery},
+    utils::DummyBox,
+};
+use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
+use fehler::{throw, throws};
+use log::debug;
+use owning_ref::OwningHandle;
+use r2d2::{Pool, PooledConnection};
+use r2d2_oracle::oracle::ResultSet;
+use r2d2_oracle::{
+    oracle::{Connector, Row, Statement},
+    OracleConnectionManager,
+};
+use sqlparser::dialect::Dialect;
+use url::Url;
+use urlencoding::decode;
+
+type OracleManager = OracleConnectionManager;
+type OracleConn = PooledConnection<OracleManager>;
+
+#[derive(Debug)]
+pub struct OracleDialect {}
+
+// implementation copied from AnsiDialect
+impl Dialect for OracleDialect {
+    fn is_identifier_start(&self, ch: char) -> bool {
+        ch.is_ascii_lowercase() || ch.is_ascii_uppercase()
+    }
+
+    fn is_identifier_part(&self, ch: char) -> bool {
+        ch.is_ascii_lowercase()
+            || ch.is_ascii_uppercase()
+            || ch.is_ascii_digit()
+            || ch == '_'
+    }
+}
+
+pub struct OracleSource {
+    pool: Pool<OracleManager>,
+    origin_query: Option<String>,
+    queries: Vec<CXQuery<String>>,
+    names: Vec<String>,
+    schema: Vec<OracleTypeSystem>,
+}
+
+#[throws(OracleSourceError)]
+pub fn connect_oracle(conn: &Url) -> Connector {
+    let user = decode(conn.username())?.into_owned();
+    let password = decode(conn.password().unwrap_or(""))?.into_owned();
+    let host = decode(conn.host_str().unwrap_or("localhost"))?.into_owned();
+    let port = conn.port().unwrap_or(1521);
+    let path = decode(conn.path())?.into_owned();
+
+    let conn_str = format!("//{}:{}{}", host, port, path);
+    let mut connector = oracle::Connector::new(user.as_str(), password.as_str(), conn_str.as_str());
+    if user.is_empty() && password.is_empty() && host == "localhost" {
+        debug!("No username or password provided, assuming system auth.");
+        connector.external_auth(true);
+    }
+    connector
+}
+
+impl OracleSource {
+    #[throws(OracleSourceError)]
+    pub fn new(conn: &str, nconn: usize) -> Self {
+        let conn = Url::parse(conn)?;
+        let connector = connect_oracle(&conn)?;
+        let manager = OracleConnectionManager::from_connector(connector);
+        let pool = r2d2::Pool::builder()
+            .max_size(nconn as u32)
+            .build(manager)?;
+
+        Self {
+            pool,
+            origin_query: None,
+            queries: vec![],
+            names: vec![],
+            schema: vec![],
+        }
+    }
+}
+
+impl Source for OracleSource
+where
+    OracleSourcePartition:
+        SourcePartition<TypeSystem = OracleTypeSystem, Error = OracleSourceError>,
+{
+    const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::RowMajor];
+    type Partition = OracleSourcePartition;
+    type TypeSystem = OracleTypeSystem;
+    type Error = OracleSourceError;
+
+    #[throws(OracleSourceError)]
+    fn set_data_order(&mut self, data_order: DataOrder) {
+        if !matches!(data_order, DataOrder::RowMajor) {
+            throw!(ConnectorXError::UnsupportedDataOrder(data_order));
+        }
+    }
+
+    fn set_queries<Q: ToString>(&mut self, queries: &[CXQuery<Q>]) {
+        self.queries = queries.iter().map(|q| q.map(Q::to_string)).collect();
+    }
+
+    fn set_origin_query(&mut self, query: Option<String>) {
+        self.origin_query = query;
+    }
+
+    #[throws(OracleSourceError)]
+    fn fetch_metadata(&mut self) {
+        assert!(!self.queries.is_empty());
+
+        let conn = self.pool.get()?;
+        for (i, query) in self.queries.iter().enumerate() {
+            // assuming all the partition queries yield the same schema
+            // without rownum = 1, the derived type might be wrong
+            // example: select avg(test_int), test_char from test_table group by test_char
+            // -> (NumInt, Char) instead of (NumFloat, Char)
+            match conn.query(limit1_query_oracle(query)?.as_str(), &[]) {
+                Ok(rows) => {
+                    let (names, types) = rows
+                        .column_info()
+                        .iter()
+                        .map(|col| {
+                            (
+                                col.name().to_string(),
+                                OracleTypeSystem::from(col.oracle_type()),
+                            )
+                        })
+                        .unzip();
+                    self.names = names;
+                    self.schema = types;
+                    return;
+                }
+                Err(e) if i == self.queries.len() - 1 => {
+                    // tried the last query but still got an error
+                    debug!("cannot get metadata for '{}': {}", query, e);
+                    throw!(e);
+                }
+                Err(_) => {}
+            }
+        }
+        // tried all queries but all returned an empty result set
+        let iter = conn.query(self.queries[0].as_str(), &[])?;
+        let (names, types) = iter
+            .column_info()
+            .iter()
+            .map(|col| (col.name().to_string(), OracleTypeSystem::VarChar(false)))
+            .unzip();
+        self.names = names;
+        self.schema = types;
+    }
+
+    #[throws(OracleSourceError)]
+    fn result_rows(&mut self) -> Option<usize> {
+        match &self.origin_query {
+            Some(q) => {
+                let cxq = CXQuery::Naked(q.clone());
+                let conn = self.pool.get()?;
+
+                let nrows = conn
+                    .query_row_as::<usize>(count_query(&cxq, &OracleDialect {})?.as_str(), &[])?;
+                Some(nrows)
+            }
+            None => None,
+        }
+    }
+
+    fn names(&self) -> Vec<String> {
+        self.names.clone()
+    }
+
+    fn schema(&self) -> Vec<Self::TypeSystem> {
+        self.schema.clone()
+    }
+
+    #[throws(OracleSourceError)]
+    fn partition(self) -> Vec<Self::Partition> {
+        let mut ret = vec![];
+        for query in self.queries {
+            let conn = self.pool.get()?;
+            ret.push(OracleSourcePartition::new(conn, &query, &self.schema));
+        }
+        ret
+    }
+}
+
+pub struct OracleSourcePartition {
+    conn: OracleConn,
+    query: CXQuery<String>,
+    schema: Vec<OracleTypeSystem>,
+    nrows: usize,
+    ncols: usize,
+}
+
+impl OracleSourcePartition {
+    pub fn new(conn: OracleConn, query: &CXQuery<String>, schema: &[OracleTypeSystem]) -> Self {
+        Self {
+            conn,
+            query: query.clone(),
+            schema: schema.to_vec(),
+            nrows: 0,
+            ncols: schema.len(),
+        }
+    }
+}
+
+impl SourcePartition for OracleSourcePartition {
+    type TypeSystem = OracleTypeSystem;
+    type Parser<'a> = OracleTextSourceParser<'a>;
+    type Error = OracleSourceError;
+
+    #[throws(OracleSourceError)]
+    fn result_rows(&mut self) {
+        self.nrows = self
+            .conn
+            .query_row_as::<usize>(count_query(&self.query, &OracleDialect {})?.as_str(), &[])?;
+    }
+
+    #[throws(OracleSourceError)]
+    fn parser(&mut self) -> Self::Parser<'_> {
+        let query = self.query.clone();
+
+        // let iter = self.conn.query(query.as_str(), &[])?;
+        OracleTextSourceParser::new(&self.conn, query.as_str(), &self.schema)?
+    }
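+    // Editor's note (annotation, not in the original patch): the ResultSet
+    // returned by Statement::query borrows the Statement it came from.
+    // OwningHandle (plus the DummyBox wrapper from crate::utils) bundles the
+    // boxed Statement together with its ResultSet so this self-referential
+    // pair can be moved into the parser constructed below.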
+
+    fn nrows(&self) -> usize {
+        self.nrows
+    }
+
+    fn ncols(&self) -> usize {
+        self.ncols
+    }
+}
+
+unsafe impl<'a> Send for OracleTextSourceParser<'a> {}
+
+pub struct OracleTextSourceParser<'a> {
+    rows: OwningHandle<Box<Statement<'a>>, DummyBox<ResultSet<'a, Row>>>,
+    rowbuf: Vec<Row>,
+    ncols: usize,
+    current_col: usize,
+    current_row: usize,
+    is_finished: bool,
+}
+
+impl<'a> OracleTextSourceParser<'a> {
+    #[throws(OracleSourceError)]
+    pub fn new(conn: &'a OracleConn, query: &str, schema: &[OracleTypeSystem]) -> Self {
+        let stmt = conn
+            .statement(query)
+            .prefetch_rows(ORACLE_ARRAY_SIZE)
+            .fetch_array_size(ORACLE_ARRAY_SIZE)
+            .build()?;
+        let rows: OwningHandle<Box<Statement<'a>>, DummyBox<ResultSet<'a, Row>>> =
+            OwningHandle::new_with_fn(Box::new(stmt), |stmt: *const Statement<'a>| unsafe {
+                DummyBox((*(stmt as *mut Statement<'_>)).query(&[]).unwrap())
+            });
+
+        Self {
+            rows,
+            rowbuf: Vec::with_capacity(DB_BUFFER_SIZE),
+            ncols: schema.len(),
+            current_row: 0,
+            current_col: 0,
+            is_finished: false,
+        }
+    }
+
+    #[throws(OracleSourceError)]
+    fn next_loc(&mut self) -> (usize, usize) {
+        let ret = (self.current_row, self.current_col);
+        self.current_row += (self.current_col + 1) / self.ncols;
+        self.current_col = (self.current_col + 1) % self.ncols;
+        ret
+    }
+}
+
+impl<'a> PartitionParser<'a> for OracleTextSourceParser<'a> {
+    type TypeSystem = OracleTypeSystem;
+    type Error = OracleSourceError;
+
+    #[throws(OracleSourceError)]
+    fn fetch_next(&mut self) -> (usize, bool) {
+        assert!(self.current_col == 0);
+        let remaining_rows = self.rowbuf.len() - self.current_row;
+        if remaining_rows > 0 {
+            return (remaining_rows, self.is_finished);
+        } else if self.is_finished {
+            return (0, self.is_finished);
+        }
+
+        if !self.rowbuf.is_empty() {
+            self.rowbuf.drain(..);
+        }
+        for _ in 0..DB_BUFFER_SIZE {
+            if let Some(item) = (*self.rows).next() {
+                self.rowbuf.push(item?);
+            } else {
+                self.is_finished = true;
+                break;
+            }
+        }
+        self.current_row = 0;
+        self.current_col = 0;
+        (self.rowbuf.len(), self.is_finished)
+    }
+}
+
+macro_rules! impl_produce_text {
+    ($($t: ty,)+) => {
+        $(
+            impl<'r, 'a> Produce<'r, $t> for OracleTextSourceParser<'a> {
+                type Error = OracleSourceError;
+
+                #[throws(OracleSourceError)]
+                fn produce(&'r mut self) -> $t {
+                    let (ridx, cidx) = self.next_loc()?;
+                    let res = self.rowbuf[ridx].get(cidx)?;
+                    res
+                }
+            }
+
+            impl<'r, 'a> Produce<'r, Option<$t>> for OracleTextSourceParser<'a> {
+                type Error = OracleSourceError;
+
+                #[throws(OracleSourceError)]
+                fn produce(&'r mut self) -> Option<$t> {
+                    let (ridx, cidx) = self.next_loc()?;
+                    let res = self.rowbuf[ridx].get(cidx)?;
+                    res
+                }
+            }
+        )+
+    };
}
+
+impl_produce_text!(
+    i64,
+    f64,
+    String,
+    NaiveDate,
+    NaiveDateTime,
+    DateTime<Utc>,
+    Vec<u8>,
+);
diff --git a/connectorx/src/sources/oracle/typesystem.rs b/connectorx/src/sources/oracle/typesystem.rs new file mode 100644 index 0000000..4c7d15a --- /dev/null +++ b/connectorx/src/sources/oracle/typesystem.rs @@ -0,0 +1,56 @@
+use chrono::{DateTime, NaiveDateTime, Utc};
+use r2d2_oracle::oracle::sql_type::OracleType;
+
+#[derive(Copy, Clone, Debug)]
+pub enum OracleTypeSystem {
+    NumInt(bool),
+    Float(bool),
+    NumFloat(bool),
+    BinaryFloat(bool),
+    BinaryDouble(bool),
+    Blob(bool),
+    Clob(bool),
+    VarChar(bool),
+    Char(bool),
+    NVarChar(bool),
+    NChar(bool),
+    Date(bool),
+    Timestamp(bool),
+    TimestampTz(bool),
+}
+
+impl_typesystem! {
+    system = OracleTypeSystem,
+    mappings = {
+        { NumInt => i64 }
+        { Float | NumFloat | BinaryFloat | BinaryDouble => f64 }
+        { Blob => Vec<u8> }
+        { Clob | VarChar | Char | NVarChar | NChar => String }
+        { Date | Timestamp => NaiveDateTime }
+        { TimestampTz => DateTime<Utc> }
+    }
+}
+
+impl<'a> From<&'a OracleType> for OracleTypeSystem {
+    fn from(ty: &'a OracleType) -> OracleTypeSystem {
+        use OracleTypeSystem::*;
+        match ty {
+            OracleType::Number(0, 0) => NumFloat(true),
+            OracleType::Number(_, 0) => NumInt(true),
+            OracleType::Number(_, _) => NumFloat(true),
+            OracleType::Float(_) => Float(true),
+            OracleType::BinaryFloat => BinaryFloat(true),
+            OracleType::BinaryDouble => BinaryDouble(true),
+            OracleType::BLOB => Blob(true),
+            OracleType::CLOB => Clob(true),
+            OracleType::Char(_) | OracleType::Long => Char(true),
+            OracleType::NChar(_) => NChar(true),
+            OracleType::Varchar2(_) => VarChar(true),
+            OracleType::NVarchar2(_) => NVarChar(true),
+            OracleType::Date => Date(true),
+            OracleType::Timestamp(_) => Timestamp(true),
+            OracleType::TimestampTZ(_) => TimestampTz(true),
+            _ => unimplemented!("{}", format!("Type {:?} not implemented for oracle!", ty)),
+        }
+    }
+}
diff --git a/connectorx/src/sources/postgres/connection.rs b/connectorx/src/sources/postgres/connection.rs new file mode 100644 index 0000000..035fce1 --- /dev/null +++ b/connectorx/src/sources/postgres/connection.rs @@ -0,0 +1,128 @@
+use crate::sources::postgres::errors::PostgresSourceError;
+use openssl::ssl::{SslConnector, SslFiletype, SslMethod, SslVerifyMode};
+use postgres::{config::SslMode, Config};
+use postgres_openssl::MakeTlsConnector;
+use std::collections::HashMap;
+use std::convert::TryFrom;
+use std::path::PathBuf;
+use url::Url;
+
+#[derive(Clone, Debug)]
+pub struct TlsConfig {
+    /// Postgres config, including the SSL mode (`sslmode`).
+    pub pg_config: Config,
+    /// Location of the client cert and key (`sslcert`, `sslkey`).
+    pub client_cert: Option<(PathBuf, PathBuf)>,
+    /// Location of the root certificate (`sslrootcert`).
+    pub root_cert: Option<PathBuf>,
+}
+
+impl TryFrom<TlsConfig> for MakeTlsConnector {
+    type Error = PostgresSourceError;
+    // The logic of this function is adapted primarily from:
+    // https://github.com/sfackler/rust-postgres/pull/774
+    // We only support server-side authentication (`sslrootcert`) for now.
+    fn try_from(tls_config: TlsConfig) -> Result<Self, Self::Error> {
+        let mut builder = SslConnector::builder(SslMethod::tls_client())?;
+        let ssl_mode = tls_config.pg_config.get_ssl_mode();
+        let (verify_ca, verify_hostname) = match ssl_mode {
+            SslMode::Disable | SslMode::Prefer => (false, false),
+            SslMode::Require => match tls_config.root_cert {
+                // If a root CA file exists, the behavior of sslmode=require will be the same as
+                // that of verify-ca, meaning the server certificate is validated against the CA.
+                //
+                // For more details, check out the note about backwards compatibility in
+                // https://postgresql.org/docs/current/libpq-ssl.html#LIBQ-SSL-CERTIFICATES.
+                Some(_) => (true, false),
+                None => (false, false),
+            },
+            // These two modes will not work until upstream rust-postgres supports parsing
+            // them as part of the TLS config.
+            //
+            // SslMode::VerifyCa => (true, false),
+            // SslMode::VerifyFull => (true, true),
+            _ => panic!("unexpected sslmode {:?}", ssl_mode),
+        };
+
+        if let Some((cert, key)) = tls_config.client_cert {
+            builder.set_certificate_file(cert, SslFiletype::PEM)?;
+            builder.set_private_key_file(key, SslFiletype::PEM)?;
+        }
+
+        if let Some(root_cert) = tls_config.root_cert {
+            builder.set_ca_file(root_cert)?;
+        }
+
+        if !verify_ca {
+            builder.set_verify(SslVerifyMode::NONE); // do not verify CA
+        }
+
+        let mut tls_connector = MakeTlsConnector::new(builder.build());
+
+        if !verify_hostname {
+            tls_connector.set_callback(|connect, _| {
+                connect.set_verify_hostname(false);
+                Ok(())
+            });
+        }
+
+        Ok(tls_connector)
+    }
+}
+
+// Strip URL params not accepted by upstream rust-postgres
+fn strip_bad_opts(url: &Url) -> Url {
+    let stripped_query: Vec<(_, _)> = url
+        .query_pairs()
+        .filter(|p| match &*p.0 {
+            "sslkey" | "sslcert" | "sslrootcert" => false,
+            _ => true,
+        })
+        .collect();
+
+    let mut url2 = url.clone();
+    url2.set_query(None);
+
+    for pair in stripped_query {
+        url2.query_pairs_mut()
+            .append_pair(&pair.0.to_string()[..], &pair.1.to_string()[..]);
+    }
+
+    url2
+}
+
+pub fn rewrite_tls_args(
+    conn: &Url,
+) -> Result<(Config, Option<MakeTlsConnector>), PostgresSourceError> {
+    // We parse the config, then strip unsupported SSL opts and rewrite the URI
+    // before calling conn.parse().
+    //
+    // For more details on this approach, see the conversation here:
+    // https://github.com/sfackler/rust-postgres/pull/774#discussion_r641784774
+
+    let params: HashMap<String, String> = conn.query_pairs().into_owned().collect();
+
+    let sslcert = params.get("sslcert").map(PathBuf::from);
+    let sslkey = params.get("sslkey").map(PathBuf::from);
+    let root_cert = params.get("sslrootcert").map(PathBuf::from);
+    let client_cert = match (sslcert, sslkey) {
+        (Some(a), Some(b)) => Some((a, b)),
+        _ => None,
+    };
+
+    let stripped_url = strip_bad_opts(conn);
+    let pg_config: Config = stripped_url.as_str().parse().unwrap();
+
+    let tls_config = TlsConfig {
+        pg_config: pg_config.clone(),
+        client_cert,
+        root_cert,
+    };
+
+    let tls_connector = match pg_config.get_ssl_mode() {
+        SslMode::Disable => None,
+        _ => Some(MakeTlsConnector::try_from(tls_config)?),
+    };
+
+    Ok((pg_config, tls_connector))
+}
diff --git a/connectorx/src/sources/postgres/errors.rs b/connectorx/src/sources/postgres/errors.rs new file mode 100644 index 0000000..7351be7 --- /dev/null +++ b/connectorx/src/sources/postgres/errors.rs @@ -0,0 +1,29 @@
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum PostgresSourceError {
+    #[error(transparent)]
+    ConnectorXError(#[from] crate::errors::ConnectorXError),
+
+    #[error(transparent)]
+    PostgresPoolError(#[from] r2d2::Error),
+
+    #[error(transparent)]
+    PostgresError(#[from] postgres::Error),
+
+    #[error(transparent)]
+    CSVError(#[from] csv::Error),
+
+    #[error(transparent)]
+    HexError(#[from] hex::FromHexError),
+
+    #[error(transparent)]
+    IOError(#[from] std::io::Error),
+
+    #[error(transparent)]
+    TlsError(#[from] openssl::error::ErrorStack),
+
+    /// Any other errors that are too trivial to be put here explicitly.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
diff --git a/connectorx/src/sources/postgres/mod.rs b/connectorx/src/sources/postgres/mod.rs new file mode 100644 index 0000000..bf8892e --- /dev/null +++ b/connectorx/src/sources/postgres/mod.rs @@ -0,0 +1,1672 @@
+//! Source implementation for the Postgres database, including TLS support (client side only).
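As a usage illustration (not part of the patch; the URL, credentials, and certificate path are invented), `rewrite_tls_args` above takes the user's connection URL and hands back the stripped `Config` plus an optional TLS connector:

```rust
use url::Url;

// Hypothetical caller of rewrite_tls_args; values are illustrative only.
fn sketch() -> Result<(), PostgresSourceError> {
    let url = Url::parse(
        "postgresql://user:pass@db.example.com:5432/mydb\
         ?sslmode=require&sslrootcert=/etc/certs/root.pem",
    )
    .map_err(|e| anyhow::anyhow!(e))?;
    let (config, tls) = rewrite_tls_args(&url)?;
    // `config` no longer carries the sslcert/sslkey/sslrootcert params;
    // `tls` is Some(MakeTlsConnector) because sslmode != disable.
    assert!(tls.is_some());
    let _ = config;
    Ok(())
}
```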
+
+mod connection;
+mod errors;
+mod typesystem;
+
+pub use self::errors::PostgresSourceError;
+pub use connection::rewrite_tls_args;
+pub use typesystem::{PostgresTypePairs, PostgresTypeSystem};
+
+use crate::constants::DB_BUFFER_SIZE;
+use crate::{
+    data_order::DataOrder,
+    errors::ConnectorXError,
+    sources::{PartitionParser, Produce, Source, SourcePartition},
+    sql::{count_query, CXQuery},
+};
+use anyhow::anyhow;
+use chrono::{DateTime, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+use csv::{ReaderBuilder, StringRecord, StringRecordsIntoIter};
+use fehler::{throw, throws};
+use hex::decode;
+use postgres::{
+    binary_copy::{BinaryCopyOutIter, BinaryCopyOutRow},
+    fallible_iterator::FallibleIterator,
+    tls::{MakeTlsConnect, TlsConnect},
+    Config, CopyOutReader, Row, RowIter, SimpleQueryMessage, Socket,
+};
+use r2d2::{Pool, PooledConnection};
+use r2d2_postgres::PostgresConnectionManager;
+use rust_decimal::Decimal;
+use serde_json::{from_str, Value};
+use sqlparser::dialect::PostgreSqlDialect;
+use std::collections::HashMap;
+use std::convert::TryFrom;
+use std::marker::PhantomData;
+use uuid::Uuid;
+
+/// Protocol - Binary based bulk load
+pub enum BinaryProtocol {}
+
+/// Protocol - CSV based bulk load
+pub enum CSVProtocol {}
+
+/// Protocol - use Cursor
+pub enum CursorProtocol {}
+
+/// Protocol - use Simple Query
+pub enum SimpleProtocol {}
+
+type PgManager<C> = PostgresConnectionManager<C>;
+type PgConn<C> = PooledConnection<PgManager<C>>;
+
+// take a row and unwrap the interior field from column 0
+fn convert_row<'b, R: TryFrom<i64> + postgres::types::FromSql<'b> + Clone>(row: &'b Row) -> R {
+    let nrows: Option<R> = row.get(0);
+    nrows.expect("Could not parse int result from count_query")
+}
+
+#[throws(PostgresSourceError)]
+fn get_total_rows<C>(conn: &mut PgConn<C>, query: &CXQuery<String>) -> usize
+where
+    C: MakeTlsConnect<Socket> + Clone + 'static + Sync + Send,
+    C::TlsConnect: Send,
+    C::Stream: Send,
+    <C::TlsConnect as TlsConnect<Socket>>::Future: Send,
+{
+    let dialect = PostgreSqlDialect {};
+
+    let row = conn.query_one(count_query(query, &dialect)?.as_str(), &[])?;
+    let col_type = PostgresTypeSystem::from(row.columns()[0].type_());
+    match col_type {
+        PostgresTypeSystem::Int2(_) => convert_row::<i16>(&row) as usize,
+        PostgresTypeSystem::Int4(_) => convert_row::<i32>(&row) as usize,
+        PostgresTypeSystem::Int8(_) => convert_row::<i64>(&row) as usize,
+        _ => throw!(anyhow!(
+            "The result of the count query was not an int, aborting."
+        )),
+    }
+}
+
+pub struct PostgresSource<P, C>
+where
+    C: MakeTlsConnect<Socket> + Clone + 'static + Sync + Send,
+    C::TlsConnect: Send,
+    C::Stream: Send,
+    <C::TlsConnect as TlsConnect<Socket>>::Future: Send,
+{
+    pool: Pool<PgManager<C>>,
+    origin_query: Option<String>,
+    queries: Vec<CXQuery<String>>,
+    names: Vec<String>,
+    schema: Vec<PostgresTypeSystem>,
+    pg_schema: Vec<postgres::types::Type>,
+    _protocol: PhantomData<P>
, +} + +impl PostgresSource +where + C: MakeTlsConnect + Clone + 'static + Sync + Send, + C::TlsConnect: Send, + C::Stream: Send, + >::Future: Send, +{ + #[throws(PostgresSourceError)] + pub fn new(config: Config, tls: C, nconn: usize) -> Self { + let manager = PostgresConnectionManager::new(config, tls); + let pool = Pool::builder().max_size(nconn as u32).build(manager)?; + + Self { + pool, + origin_query: None, + queries: vec![], + names: vec![], + schema: vec![], + pg_schema: vec![], + _protocol: PhantomData, + } + } +} + +impl Source for PostgresSource +where + PostgresSourcePartition: + SourcePartition, + P: Send, + C: MakeTlsConnect + Clone + 'static + Sync + Send, + C::TlsConnect: Send, + C::Stream: Send, + >::Future: Send, +{ + const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::RowMajor]; + type Partition = PostgresSourcePartition; + type TypeSystem = PostgresTypeSystem; + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn set_data_order(&mut self, data_order: DataOrder) { + if !matches!(data_order, DataOrder::RowMajor) { + throw!(ConnectorXError::UnsupportedDataOrder(data_order)); + } + } + + fn set_queries(&mut self, queries: &[CXQuery]) { + self.queries = queries.iter().map(|q| q.map(Q::to_string)).collect(); + } + + fn set_origin_query(&mut self, query: Option) { + self.origin_query = query; + } + + #[throws(PostgresSourceError)] + fn fetch_metadata(&mut self) { + assert!(!self.queries.is_empty()); + + let mut conn = self.pool.get()?; + let first_query = &self.queries[0]; + + let stmt = conn.prepare(first_query.as_str())?; + + let (names, pg_types): (Vec, Vec) = stmt + .columns() + .iter() + .map(|col| (col.name().to_string(), col.type_().clone())) + .unzip(); + + self.names = names; + self.schema = pg_types + .iter() + .map(PostgresTypeSystem::from) + .collect(); + self.pg_schema = self + .schema + .iter() + .zip(pg_types.iter()) + .map(|(t1, t2)| PostgresTypePairs(t2, t1).into()) + .collect(); + } + + #[throws(PostgresSourceError)] + fn result_rows(&mut self) -> Option { + match &self.origin_query { + Some(q) => { + let cxq = CXQuery::Naked(q.clone()); + let mut conn = self.pool.get()?; + let nrows = get_total_rows(&mut conn, &cxq)?; + Some(nrows) + } + None => None, + } + } + + fn names(&self) -> Vec { + self.names.clone() + } + + fn schema(&self) -> Vec { + self.schema.clone() + } + + #[throws(PostgresSourceError)] + fn partition(self) -> Vec { + let mut ret = vec![]; + for query in self.queries { + let conn = self.pool.get()?; + + ret.push(PostgresSourcePartition::::new( + conn, + &query, + &self.schema, + &self.pg_schema, + )); + } + ret + } +} + +pub struct PostgresSourcePartition +where + C: MakeTlsConnect + Clone + 'static + Sync + Send, + C::TlsConnect: Send, + C::Stream: Send, + >::Future: Send, +{ + conn: PgConn, + query: CXQuery, + schema: Vec, + pg_schema: Vec, + nrows: usize, + ncols: usize, + _protocol: PhantomData
<P>
, +} + +impl PostgresSourcePartition +where + C: MakeTlsConnect + Clone + 'static + Sync + Send, + C::TlsConnect: Send, + C::Stream: Send, + >::Future: Send, +{ + pub fn new( + conn: PgConn, + query: &CXQuery, + schema: &[PostgresTypeSystem], + pg_schema: &[postgres::types::Type], + ) -> Self { + Self { + conn, + query: query.clone(), + schema: schema.to_vec(), + pg_schema: pg_schema.to_vec(), + nrows: 0, + ncols: schema.len(), + _protocol: PhantomData, + } + } +} + +impl SourcePartition for PostgresSourcePartition +where + C: MakeTlsConnect + Clone + 'static + Sync + Send, + C::TlsConnect: Send, + C::Stream: Send, + >::Future: Send, +{ + type TypeSystem = PostgresTypeSystem; + type Parser<'a> = PostgresBinarySourcePartitionParser<'a>; + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn result_rows(&mut self) -> () { + self.nrows = get_total_rows(&mut self.conn, &self.query)?; + } + + #[throws(PostgresSourceError)] + fn parser(&mut self) -> Self::Parser<'_> { + let query = format!("COPY ({}) TO STDOUT WITH BINARY", self.query); + let reader = self.conn.copy_out(&*query)?; // unless reading the data, it seems like issue the query is fast + let iter = BinaryCopyOutIter::new(reader, &self.pg_schema); + + PostgresBinarySourcePartitionParser::new(iter, &self.schema) + } + + fn nrows(&self) -> usize { + self.nrows + } + + fn ncols(&self) -> usize { + self.ncols + } +} + +impl SourcePartition for PostgresSourcePartition +where + C: MakeTlsConnect + Clone + 'static + Sync + Send, + C::TlsConnect: Send, + C::Stream: Send, + >::Future: Send, +{ + type TypeSystem = PostgresTypeSystem; + type Parser<'a> = PostgresCSVSourceParser<'a>; + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn result_rows(&mut self) { + self.nrows = get_total_rows(&mut self.conn, &self.query)?; + } + + #[throws(PostgresSourceError)] + fn parser(&mut self) -> Self::Parser<'_> { + let query = format!("COPY ({}) TO STDOUT WITH CSV", self.query); + let reader = self.conn.copy_out(&*query)?; // unless reading the data, it seems like issue the query is fast + let iter = ReaderBuilder::new() + .has_headers(false) + .from_reader(reader) + .into_records(); + + PostgresCSVSourceParser::new(iter, &self.schema) + } + + fn nrows(&self) -> usize { + self.nrows + } + + fn ncols(&self) -> usize { + self.ncols + } +} + +impl SourcePartition for PostgresSourcePartition +where + C: MakeTlsConnect + Clone + 'static + Sync + Send, + C::TlsConnect: Send, + C::Stream: Send, + >::Future: Send, +{ + type TypeSystem = PostgresTypeSystem; + type Parser<'a> = PostgresRawSourceParser<'a>; + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn result_rows(&mut self) { + self.nrows = get_total_rows(&mut self.conn, &self.query)?; + } + + #[throws(PostgresSourceError)] + fn parser(&mut self) -> Self::Parser<'_> { + let iter = self + .conn + .query_raw::<_, bool, _>(self.query.as_str(), vec![])?; // unless reading the data, it seems like issue the query is fast + PostgresRawSourceParser::new(iter, &self.schema) + } + + fn nrows(&self) -> usize { + self.nrows + } + + fn ncols(&self) -> usize { + self.ncols + } +} +pub struct PostgresBinarySourcePartitionParser<'a> { + iter: BinaryCopyOutIter<'a>, + rowbuf: Vec, + ncols: usize, + current_col: usize, + current_row: usize, + is_finished: bool, +} + +impl<'a> PostgresBinarySourcePartitionParser<'a> { + pub fn new(iter: BinaryCopyOutIter<'a>, schema: &[PostgresTypeSystem]) -> Self { + Self { + iter, + rowbuf: Vec::with_capacity(DB_BUFFER_SIZE), + 
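+ // Editor's note (annotation, not in the original patch): rowbuf is the batch
+ // buffer; fetch_next() below refills it with up to DB_BUFFER_SIZE rows at a time.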
ncols: schema.len(), + current_row: 0, + current_col: 0, + is_finished: false, + } + } + + #[throws(PostgresSourceError)] + fn next_loc(&mut self) -> (usize, usize) { + let ret = (self.current_row, self.current_col); + self.current_row += (self.current_col + 1) / self.ncols; + self.current_col = (self.current_col + 1) % self.ncols; + ret + } +} + +impl<'a> PartitionParser<'a> for PostgresBinarySourcePartitionParser<'a> { + type TypeSystem = PostgresTypeSystem; + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn fetch_next(&mut self) -> (usize, bool) { + assert!(self.current_col == 0); + let remaining_rows = self.rowbuf.len() - self.current_row; + if remaining_rows > 0 { + return (remaining_rows, self.is_finished); + } else if self.is_finished { + return (0, self.is_finished); + } + + // clear the buffer + if !self.rowbuf.is_empty() { + self.rowbuf.drain(..); + } + for _ in 0..DB_BUFFER_SIZE { + match self.iter.next()? { + Some(row) => { + self.rowbuf.push(row); + } + None => { + self.is_finished = true; + break; + } + } + } + + // reset current cursor positions + self.current_row = 0; + self.current_col = 0; + + (self.rowbuf.len(), self.is_finished) + } +} + +macro_rules! impl_produce { + ($($t: ty,)+) => { + $( + impl<'r, 'a> Produce<'r, $t> for PostgresBinarySourcePartitionParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> $t { + let (ridx, cidx) = self.next_loc()?; + let row = &self.rowbuf[ridx]; + let val = row.try_get(cidx)?; + val + } + } + + impl<'r, 'a> Produce<'r, Option<$t>> for PostgresBinarySourcePartitionParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option<$t> { + let (ridx, cidx) = self.next_loc()?; + let row = &self.rowbuf[ridx]; + let val = row.try_get(cidx)?; + val + } + } + )+ + }; +} + +impl_produce!( + i8, + i16, + i32, + i64, + f32, + f64, + Decimal, + Vec, + Vec, + Vec, + Vec, + Vec, + Vec, + bool, + Vec, + &'r str, + Vec, + NaiveTime, + NaiveDateTime, + DateTime, + NaiveDate, + Uuid, + Value, + Vec, +); + +impl<'r, 'a> Produce<'r, HashMap>> + for PostgresBinarySourcePartitionParser<'a> +{ + type Error = PostgresSourceError; + #[throws(PostgresSourceError)] + fn produce(&mut self) -> HashMap> { + unimplemented!("Please use `cursor` protocol for hstore type"); + } +} + +impl<'r, 'a> Produce<'r, Option>>> + for PostgresBinarySourcePartitionParser<'a> +{ + type Error = PostgresSourceError; + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Option>> { + unimplemented!("Please use `cursor` protocol for hstore type"); + } +} + +pub struct PostgresCSVSourceParser<'a> { + iter: StringRecordsIntoIter>, + rowbuf: Vec, + ncols: usize, + current_col: usize, + current_row: usize, + is_finished: bool, +} + +impl<'a> PostgresCSVSourceParser<'a> { + pub fn new( + iter: StringRecordsIntoIter>, + schema: &[PostgresTypeSystem], + ) -> Self { + Self { + iter, + rowbuf: Vec::with_capacity(DB_BUFFER_SIZE), + ncols: schema.len(), + current_row: 0, + current_col: 0, + is_finished: false, + } + } + + #[throws(PostgresSourceError)] + fn next_loc(&mut self) -> (usize, usize) { + let ret = (self.current_row, self.current_col); + self.current_row += (self.current_col + 1) / self.ncols; + self.current_col = (self.current_col + 1) % self.ncols; + ret + } +} + +impl<'a> PartitionParser<'a> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + type TypeSystem = PostgresTypeSystem; + + #[throws(PostgresSourceError)] + fn 
fetch_next(&mut self) -> (usize, bool) { + assert!(self.current_col == 0); + let remaining_rows = self.rowbuf.len() - self.current_row; + if remaining_rows > 0 { + return (remaining_rows, self.is_finished); + } else if self.is_finished { + return (0, self.is_finished); + } + + if !self.rowbuf.is_empty() { + self.rowbuf.drain(..); + } + for _ in 0..DB_BUFFER_SIZE { + if let Some(row) = self.iter.next() { + self.rowbuf.push(row?); + } else { + self.is_finished = true; + break; + } + } + self.current_row = 0; + self.current_col = 0; + (self.rowbuf.len(), self.is_finished) + } +} + +macro_rules! impl_csv_produce { + ($($t: ty,)+) => { + $( + impl<'r, 'a> Produce<'r, $t> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> $t { + let (ridx, cidx) = self.next_loc()?; + self.rowbuf[ridx][cidx].parse().map_err(|_| { + ConnectorXError::cannot_produce::<$t>(Some(self.rowbuf[ridx][cidx].into())) + })? + } + } + + impl<'r, 'a> Produce<'r, Option<$t>> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option<$t> { + let (ridx, cidx) = self.next_loc()?; + match &self.rowbuf[ridx][cidx][..] { + "" => None, + v => Some(v.parse().map_err(|_| { + ConnectorXError::cannot_produce::<$t>(Some(self.rowbuf[ridx][cidx].into())) + })?), + } + } + } + )+ + }; +} + +impl_csv_produce!(i8, i16, i32, i64, f32, f64, Decimal, Uuid,); + +macro_rules! impl_csv_vec_produce { + ($($t: ty,)+) => { + $( + impl<'r, 'a> Produce<'r, Vec<$t>> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Vec<$t> { + let (ridx, cidx) = self.next_loc()?; + let s = &self.rowbuf[ridx][cidx][..]; + match s { + "{}" => vec![], + _ if s.len() < 3 => throw!(ConnectorXError::cannot_produce::<$t>(Some(s.into()))), + s => s[1..s.len() - 1] + .split(",") + .map(|v| { + v.parse() + .map_err(|_| ConnectorXError::cannot_produce::<$t>(Some(s.into()))) + }) + .collect::, ConnectorXError>>()?, + } + } + } + + impl<'r, 'a> Produce<'r, Option>> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Option> { + let (ridx, cidx) = self.next_loc()?; + let s = &self.rowbuf[ridx][cidx][..]; + match s { + "" => None, + "{}" => Some(vec![]), + _ if s.len() < 3 => throw!(ConnectorXError::cannot_produce::<$t>(Some(s.into()))), + s => Some( + s[1..s.len() - 1] + .split(",") + .map(|v| { + v.parse() + .map_err(|_| ConnectorXError::cannot_produce::<$t>(Some(s.into()))) + }) + .collect::, ConnectorXError>>()?, + ), + } + } + } + )+ + }; +} + +impl_csv_vec_produce!(i8, i16, i32, i64, f32, f64, Decimal, String,); + +impl<'r, 'a> Produce<'r, HashMap>> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + #[throws(PostgresSourceError)] + fn produce(&mut self) -> HashMap> { + unimplemented!("Please use `cursor` protocol for hstore type"); + } +} + +impl<'r, 'a> Produce<'r, Option>>> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Option>> { + unimplemented!("Please use `cursor` protocol for hstore type"); + } +} + +impl<'r, 'a> Produce<'r, bool> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> bool { + let (ridx, cidx) = self.next_loc()?; + let ret = match 
&self.rowbuf[ridx][cidx][..] { + "t" => true, + "f" => false, + _ => throw!(ConnectorXError::cannot_produce::(Some( + self.rowbuf[ridx][cidx].into() + ))), + }; + ret + } +} + +impl<'r, 'a> Produce<'r, Option> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Option { + let (ridx, cidx) = self.next_loc()?; + let ret = match &self.rowbuf[ridx][cidx][..] { + "" => None, + "t" => Some(true), + "f" => Some(false), + _ => throw!(ConnectorXError::cannot_produce::(Some( + self.rowbuf[ridx][cidx].into() + ))), + }; + ret + } +} + +impl<'r, 'a> Produce<'r, Vec> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Vec { + let (ridx, cidx) = self.next_loc()?; + let s = &self.rowbuf[ridx][cidx][..]; + match s { + "{}" => vec![], + _ if s.len() < 3 => throw!(ConnectorXError::cannot_produce::(Some(s.into()))), + s => s[1..s.len() - 1] + .split(',') + .map(|v| match v { + "t" => Ok(true), + "f" => Ok(false), + _ => throw!(ConnectorXError::cannot_produce::(Some(s.into()))), + }) + .collect::, ConnectorXError>>()?, + } + } +} + +impl<'r, 'a> Produce<'r, Option>> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Option> { + let (ridx, cidx) = self.next_loc()?; + let s = &self.rowbuf[ridx][cidx][..]; + match s { + "" => None, + "{}" => Some(vec![]), + _ if s.len() < 3 => throw!(ConnectorXError::cannot_produce::(Some(s.into()))), + s => Some( + s[1..s.len() - 1] + .split(',') + .map(|v| match v { + "t" => Ok(true), + "f" => Ok(false), + _ => throw!(ConnectorXError::cannot_produce::(Some(s.into()))), + }) + .collect::, ConnectorXError>>()?, + ), + } + } +} + +impl<'r, 'a> Produce<'r, DateTime> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> DateTime { + let (ridx, cidx) = self.next_loc()?; + let s: &str = &self.rowbuf[ridx][cidx][..]; + // postgres csv return example: 1970-01-01 00:00:01+00 + format!("{}:00", s).parse().map_err(|_| { + ConnectorXError::cannot_produce::>(Some(self.rowbuf[ridx][cidx].into())) + })? + } +} + +impl<'r, 'a> Produce<'r, Option>> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Option> { + let (ridx, cidx) = self.next_loc()?; + match &self.rowbuf[ridx][cidx][..] { + "" => None, + v => { + // postgres csv return example: 1970-01-01 00:00:01+00 + Some(format!("{}:00", v).parse().map_err(|_| { + ConnectorXError::cannot_produce::>(Some(v.into())) + })?) + } + } + } +} + +impl<'r, 'a> Produce<'r, NaiveDate> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> NaiveDate { + let (ridx, cidx) = self.next_loc()?; + NaiveDate::parse_from_str(&self.rowbuf[ridx][cidx], "%Y-%m-%d").map_err(|_| { + ConnectorXError::cannot_produce::(Some(self.rowbuf[ridx][cidx].into())) + })? + } +} + +impl<'r, 'a> Produce<'r, Option> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Option { + let (ridx, cidx) = self.next_loc()?; + match &self.rowbuf[ridx][cidx][..] 
{ + "" => None, + v => Some( + NaiveDate::parse_from_str(v, "%Y-%m-%d") + .map_err(|_| ConnectorXError::cannot_produce::(Some(v.into())))?, + ), + } + } +} + +impl<'r, 'a> Produce<'r, NaiveDateTime> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> NaiveDateTime { + let (ridx, cidx) = self.next_loc()?; + NaiveDateTime::parse_from_str(&self.rowbuf[ridx][cidx], "%Y-%m-%d %H:%M:%S").map_err( + |_| { + ConnectorXError::cannot_produce::(Some( + self.rowbuf[ridx][cidx].into(), + )) + }, + )? + } +} + +impl<'r, 'a> Produce<'r, Option> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Option { + let (ridx, cidx) = self.next_loc()?; + match &self.rowbuf[ridx][cidx][..] { + "" => None, + v => Some( + NaiveDateTime::parse_from_str(v, "%Y-%m-%d %H:%M:%S").map_err(|_| { + ConnectorXError::cannot_produce::(Some(v.into())) + })?, + ), + } + } +} + +impl<'r, 'a> Produce<'r, NaiveTime> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> NaiveTime { + let (ridx, cidx) = self.next_loc()?; + NaiveTime::parse_from_str(&self.rowbuf[ridx][cidx], "%H:%M:%S").map_err(|_| { + ConnectorXError::cannot_produce::(Some(self.rowbuf[ridx][cidx].into())) + })? + } +} + +impl<'r, 'a> Produce<'r, Option> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&mut self) -> Option { + let (ridx, cidx) = self.next_loc()?; + match &self.rowbuf[ridx][cidx][..] { + "" => None, + v => Some( + NaiveTime::parse_from_str(v, "%H:%M:%S") + .map_err(|_| ConnectorXError::cannot_produce::(Some(v.into())))?, + ), + } + } +} + +impl<'r, 'a> Produce<'r, &'r str> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> &'r str { + let (ridx, cidx) = self.next_loc()?; + &self.rowbuf[ridx][cidx] + } +} + +impl<'r, 'a> Produce<'r, Option<&'r str>> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option<&'r str> { + let (ridx, cidx) = self.next_loc()?; + match &self.rowbuf[ridx][cidx][..] { + "" => None, + v => Some(v), + } + } +} + +impl<'r, 'a> Produce<'r, Vec> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Vec { + let (ridx, cidx) = self.next_loc()?; + decode(&self.rowbuf[ridx][cidx][2..])? // escape \x in the beginning + } +} + +impl<'r, 'a> Produce<'r, Option>> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option> { + let (ridx, cidx) = self.next_loc()?; + match &self.rowbuf[ridx][cidx] { + // escape \x in the beginning, empty if None + "" => None, + v => Some(decode(&v[2..])?), + } + } +} + +impl<'r, 'a> Produce<'r, Value> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Value { + let (ridx, cidx) = self.next_loc()?; + let v = &self.rowbuf[ridx][cidx]; + from_str(v).map_err(|_| ConnectorXError::cannot_produce::(Some(v.into())))? 
+ } +} + +impl<'r, 'a> Produce<'r, Option> for PostgresCSVSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option { + let (ridx, cidx) = self.next_loc()?; + + match &self.rowbuf[ridx][cidx][..] { + "" => None, + v => { + from_str(v).map_err(|_| ConnectorXError::cannot_produce::(Some(v.into())))? + } + } + } +} + +pub struct PostgresRawSourceParser<'a> { + iter: RowIter<'a>, + rowbuf: Vec, + ncols: usize, + current_col: usize, + current_row: usize, + is_finished: bool, +} + +impl<'a> PostgresRawSourceParser<'a> { + pub fn new(iter: RowIter<'a>, schema: &[PostgresTypeSystem]) -> Self { + Self { + iter, + rowbuf: Vec::with_capacity(DB_BUFFER_SIZE), + ncols: schema.len(), + current_row: 0, + current_col: 0, + is_finished: false, + } + } + + #[throws(PostgresSourceError)] + fn next_loc(&mut self) -> (usize, usize) { + let ret = (self.current_row, self.current_col); + self.current_row += (self.current_col + 1) / self.ncols; + self.current_col = (self.current_col + 1) % self.ncols; + ret + } +} + +impl<'a> PartitionParser<'a> for PostgresRawSourceParser<'a> { + type TypeSystem = PostgresTypeSystem; + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn fetch_next(&mut self) -> (usize, bool) { + assert!(self.current_col == 0); + let remaining_rows = self.rowbuf.len() - self.current_row; + if remaining_rows > 0 { + return (remaining_rows, self.is_finished); + } else if self.is_finished { + return (0, self.is_finished); + } + + if !self.rowbuf.is_empty() { + self.rowbuf.drain(..); + } + for _ in 0..DB_BUFFER_SIZE { + if let Some(row) = self.iter.next()? { + self.rowbuf.push(row); + } else { + self.is_finished = true; + break; + } + } + self.current_row = 0; + self.current_col = 0; + (self.rowbuf.len(), self.is_finished) + } +} + +macro_rules! 
impl_produce { + ($($t: ty,)+) => { + $( + impl<'r, 'a> Produce<'r, $t> for PostgresRawSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> $t { + let (ridx, cidx) = self.next_loc()?; + let row = &self.rowbuf[ridx]; + let val = row.try_get(cidx)?; + val + } + } + + impl<'r, 'a> Produce<'r, Option<$t>> for PostgresRawSourceParser<'a> { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option<$t> { + let (ridx, cidx) = self.next_loc()?; + let row = &self.rowbuf[ridx]; + let val = row.try_get(cidx)?; + val + } + } + )+ + }; +} + +impl_produce!( + i8, + i16, + i32, + i64, + f32, + f64, + Decimal, + Vec, + Vec, + Vec, + Vec, + Vec, + Vec, + bool, + Vec, + &'r str, + Vec, + NaiveTime, + NaiveDateTime, + DateTime, + NaiveDate, + Uuid, + Value, + HashMap>, + Vec, +); + +impl SourcePartition for PostgresSourcePartition +where + C: MakeTlsConnect + Clone + 'static + Sync + Send, + C::TlsConnect: Send, + C::Stream: Send, + >::Future: Send, +{ + type TypeSystem = PostgresTypeSystem; + type Parser<'a> = PostgresSimpleSourceParser; + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn result_rows(&mut self) { + self.nrows = get_total_rows(&mut self.conn, &self.query)?; + } + + #[throws(PostgresSourceError)] + fn parser(&mut self) -> Self::Parser<'_> { + let rows = self.conn.simple_query(self.query.as_str())?; // unless reading the data, it seems like issue the query is fast + PostgresSimpleSourceParser::new(rows, &self.schema) + } + + fn nrows(&self) -> usize { + self.nrows + } + + fn ncols(&self) -> usize { + self.ncols + } +} + +pub struct PostgresSimpleSourceParser { + rows: Vec, + ncols: usize, + current_col: usize, + current_row: usize, +} +impl<'a> PostgresSimpleSourceParser { + pub fn new(rows: Vec, schema: &[PostgresTypeSystem]) -> Self { + Self { + rows, + ncols: schema.len(), + current_row: 0, + current_col: 0, + } + } + + #[throws(PostgresSourceError)] + fn next_loc(&mut self) -> (usize, usize) { + let ret = (self.current_row, self.current_col); + self.current_row += (self.current_col + 1) / self.ncols; + self.current_col = (self.current_col + 1) % self.ncols; + ret + } +} + +impl<'a> PartitionParser<'a> for PostgresSimpleSourceParser { + type TypeSystem = PostgresTypeSystem; + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn fetch_next(&mut self) -> (usize, bool) { + self.current_row = 0; + self.current_col = 0; + (self.rows.len() - 1, true) // last message is command complete + } +} + +macro_rules! impl_simple_produce_unimplemented { + ($($t: ty,)+) => { + $( + impl<'r, 'a> Produce<'r, $t> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> $t { + unimplemented!("not implemented!"); + } + } + + impl<'r, 'a> Produce<'r, Option<$t>> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option<$t> { + unimplemented!("not implemented!"); + } + } + )+ + }; +} + +macro_rules! impl_simple_produce { + ($($t: ty,)+) => { + $( + impl<'r> Produce<'r, $t> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> $t { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? 
{ + Some(s) => s + .parse() + .map_err(|_| ConnectorXError::cannot_produce::<$t>(Some(s.into())))?, + None => throw!(anyhow!( + "Cannot parse NULL in NOT NULL column." + )), + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } + } + + impl<'r, 'a> Produce<'r, Option<$t>> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option<$t> { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => Some( + s.parse() + .map_err(|_| ConnectorXError::cannot_produce::<$t>(Some(s.into())))?, + ), + None => None, + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } + } + )+ + }; +} + +impl_simple_produce!(i8, i16, i32, i64, f32, f64, Decimal, Uuid, bool,); +impl_simple_produce_unimplemented!( + Value, + HashMap>,); + +impl<'r> Produce<'r, &'r str> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> &'r str { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => s, + None => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r, 'a> Produce<'r, Option<&'r str>> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option<&'r str> { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => row.try_get(cidx)?, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, Vec> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Vec { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => { + let mut res = s.chars(); + res.next(); + res.next(); + decode( + res.enumerate() + .fold(String::new(), |acc, (_i, c)| format!("{}{}", acc, c)) + .chars() + .map(|c| c as u8) + .collect::>(), + )? + } + None => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r, 'a> Produce<'r, Option>> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option> { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => { + let mut res = s.chars(); + res.next(); + res.next(); + Some(decode( + res.enumerate() + .fold(String::new(), |acc, (_i, c)| format!("{}{}", acc, c)) + .chars() + .map(|c| c as u8) + .collect::>(), + )?) 
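+ // The two `res.next()` calls above skip the leading `\x` prefix that the simple query protocol places before bytea hex text; the remainder is hex-decoded into raw bytes.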
+ } + None => None, + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +fn rem_first_and_last(value: &str) -> &str { + let mut chars = value.chars(); + chars.next(); + chars.next_back(); + chars.as_str() +} + +macro_rules! impl_simple_vec_produce { + ($($t: ty,)+) => { + $( + impl<'r> Produce<'r, Vec<$t>> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Vec<$t> { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => match s{ + "" => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + "{}" => vec![], + _ => rem_first_and_last(s).split(",").map(|token| token.parse().map_err(|_| ConnectorXError::cannot_produce::>(Some(s.into())))).collect::, ConnectorXError>>()? + }, + None => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } + } + + impl<'r, 'a> Produce<'r, Option>> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option> { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => match s{ + "" => None, + "{}" => Some(vec![]), + _ => Some(rem_first_and_last(s).split(",").map(|token| token.parse().map_err(|_| ConnectorXError::cannot_produce::>(Some(s.into())))).collect::, ConnectorXError>>()?) + }, + None => None, + }, + + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } + } + )+ + }; +} +impl_simple_vec_produce!(i16, i32, i64, f32, f64, Decimal, String,); + +impl<'r> Produce<'r, Vec> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Vec { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => match s { + "" => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + "{}" => vec![], + _ => rem_first_and_last(s) + .split(',') + .map(|token| match token { + "t" => Ok(true), + "f" => Ok(false), + _ => { + throw!(ConnectorXError::cannot_produce::>(Some(s.into()))) + } + }) + .collect::, ConnectorXError>>()?, + }, + None => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, Option>> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option> { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? 
{ + Some(s) => match s { + "" => None, + "{}" => Some(vec![]), + _ => Some( + rem_first_and_last(s) + .split(',') + .map(|token| match token { + "t" => Ok(true), + "f" => Ok(false), + _ => throw!(ConnectorXError::cannot_produce::>(Some( + s.into() + ))), + }) + .collect::, ConnectorXError>>()?, + ), + }, + None => None, + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, NaiveDate> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> NaiveDate { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => NaiveDate::parse_from_str(s, "%Y-%m-%d") + .map_err(|_| ConnectorXError::cannot_produce::(Some(s.into())))?, + None => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, Option> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => Some(NaiveDate::parse_from_str(s, "%Y-%m-%d").map_err(|_| { + ConnectorXError::cannot_produce::>(Some(s.into())) + })?), + None => None, + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, NaiveTime> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> NaiveTime { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => NaiveTime::parse_from_str(s, "%H:%M:%S") + .map_err(|_| ConnectorXError::cannot_produce::(Some(s.into())))?, + None => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, Option> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => Some(NaiveTime::parse_from_str(s, "%H:%M:%S").map_err(|_| { + ConnectorXError::cannot_produce::>(Some(s.into())) + })?), + None => None, + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, NaiveDateTime> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> NaiveDateTime { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? 
{ + Some(s) => NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S").map_err(|_| { + ConnectorXError::cannot_produce::(Some(s.into())) + })?, + None => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, Option> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => Some( + NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S").map_err(|_| { + ConnectorXError::cannot_produce::>(Some(s.into())) + })?, + ), + None => None, + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, DateTime> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> DateTime { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => { + let time_string = format!("{}:00", s).to_owned(); + let slice: &str = &time_string[..]; + let time: DateTime = + DateTime::parse_from_str(slice, "%Y-%m-%d %H:%M:%S%:z").unwrap(); + + time.with_timezone(&Utc) + } + None => throw!(anyhow!("Cannot parse NULL in non-NULL column.")), + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} + +impl<'r> Produce<'r, Option>> for PostgresSimpleSourceParser { + type Error = PostgresSourceError; + + #[throws(PostgresSourceError)] + fn produce(&'r mut self) -> Option> { + let (ridx, cidx) = self.next_loc()?; + let val = match &self.rows[ridx] { + SimpleQueryMessage::Row(row) => match row.try_get(cidx)? { + Some(s) => { + let time_string = format!("{}:00", s).to_owned(); + let slice: &str = &time_string[..]; + let time: DateTime = + DateTime::parse_from_str(slice, "%Y-%m-%d %H:%M:%S%:z").unwrap(); + + Some(time.with_timezone(&Utc)) + } + None => None, + }, + SimpleQueryMessage::CommandComplete(c) => { + panic!("get command: {}", c); + } + _ => { + panic!("what?"); + } + }; + val + } +} diff --git a/connectorx/src/sources/postgres/typesystem.rs b/connectorx/src/sources/postgres/typesystem.rs new file mode 100644 index 0000000..5119c8d --- /dev/null +++ b/connectorx/src/sources/postgres/typesystem.rs @@ -0,0 +1,126 @@ +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use postgres::types::Type; +use rust_decimal::Decimal; +use serde_json::Value; +use std::collections::HashMap; +use uuid::Uuid; + +#[derive(Copy, Clone, Debug)] +pub enum PostgresTypeSystem { + Bool(bool), + Float4(bool), + Float8(bool), + Numeric(bool), + Int2(bool), + Int4(bool), + Int8(bool), + Float4Array(bool), + Float8Array(bool), + NumericArray(bool), + BoolArray(bool), + Int2Array(bool), + Int4Array(bool), + Int8Array(bool), + VarcharArray(bool), + TextArray(bool), + Date(bool), + Char(bool), + BpChar(bool), + VarChar(bool), + Text(bool), + ByteA(bool), + Time(bool), + Timestamp(bool), + TimestampTz(bool), + UUID(bool), + JSON(bool), + JSONB(bool), + Enum(bool), + HSTORE(bool), + Name(bool), +} + +impl_typesystem! 
{ + system = PostgresTypeSystem, + mappings = { + { Int2 => i16 } + { Int4 => i32 } + { Int8 => i64 } + { Float4 => f32 } + { Float8 => f64 } + { Numeric => Decimal } + { BoolArray => Vec<bool> } + { Int2Array => Vec<i16> } + { Int4Array => Vec<i32> } + { Int8Array => Vec<i64> } + { Float4Array => Vec<f32> } + { Float8Array => Vec<f64> } + { NumericArray => Vec<Decimal> } + { VarcharArray | TextArray => Vec<String> } + { Bool => bool } + { Char => i8 } + { Text | BpChar | VarChar | Enum | Name => &'r str } + { ByteA => Vec<u8> } + { Time => NaiveTime } + { Timestamp => NaiveDateTime } + { TimestampTz => DateTime<Utc> } + { Date => NaiveDate } + { UUID => Uuid } + { JSON | JSONB => Value } + { HSTORE => HashMap<String, Option<String>> } + } +} + +impl<'a> From<&'a Type> for PostgresTypeSystem { + fn from(ty: &'a Type) -> PostgresTypeSystem { + use PostgresTypeSystem::*; + match ty.name() { + "int2" => Int2(true), + "int4" => Int4(true), + "int8" => Int8(true), + "float4" => Float4(true), + "float8" => Float8(true), + "numeric" => Numeric(true), + "_bool" => BoolArray(true), + "_int2" => Int2Array(true), + "_int4" => Int4Array(true), + "_int8" => Int8Array(true), + "_float4" => Float4Array(true), + "_float8" => Float8Array(true), + "_numeric" => NumericArray(true), + "_varchar" => VarcharArray(true), + "_text" => TextArray(true), + "bool" => Bool(true), + "char" => Char(true), + "text" | "citext" | "ltree" | "lquery" | "ltxtquery" | "name" => Text(true), + "bpchar" => BpChar(true), + "varchar" => VarChar(true), + "bytea" => ByteA(true), + "time" => Time(true), + "timestamp" => Timestamp(true), + "timestamptz" => TimestampTz(true), + "date" => Date(true), + "uuid" => UUID(true), + "json" => JSON(true), + "jsonb" => JSONB(true), + "hstore" => HSTORE(true), + _ => match ty.kind() { + postgres::types::Kind::Enum(_) => Enum(true), + _ => unimplemented!("{}", ty.name()), + }, + } + } +} + +pub struct PostgresTypePairs<'a>(pub &'a Type, pub &'a PostgresTypeSystem); + +// Link (postgres::Type, connectorx::PostgresTypeSystem) back to the one defined by the postgres crate. +impl<'a> From<PostgresTypePairs<'a>> for Type { + fn from(ty: PostgresTypePairs<'a>) -> Type { + use PostgresTypeSystem::*; + match ty.1 { + Enum(_) => Type::TEXT, + HSTORE(_) => Type::TEXT, // hstore is not supported in binary protocol (since no corresponding inner TYPE) + _ => ty.0.clone(), + } + } +} diff --git a/connectorx/src/sources/sqlite/errors.rs b/connectorx/src/sources/sqlite/errors.rs new file mode 100644 index 0000000..fda3a9b --- /dev/null +++ b/connectorx/src/sources/sqlite/errors.rs @@ -0,0 +1,24 @@ +use std::string::FromUtf8Error; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum SQLiteSourceError { + #[error("Cannot infer type from null for SQLite")] + InferTypeFromNull, + + #[error(transparent)] + ConnectorXError(#[from] crate::errors::ConnectorXError), + + #[error(transparent)] + SQLiteError(#[from] rusqlite::Error), + + #[error(transparent)] + SQLitePoolError(#[from] r2d2::Error), + + #[error(transparent)] + SQLiteUrlDecodeError(#[from] FromUtf8Error), + + /// Any other errors that are too trivial to be put here explicitly. + #[error(transparent)] + Other(#[from] anyhow::Error), +} diff --git a/connectorx/src/sources/sqlite/mod.rs b/connectorx/src/sources/sqlite/mod.rs new file mode 100644 index 0000000..6187a54 --- /dev/null +++ b/connectorx/src/sources/sqlite/mod.rs @@ -0,0 +1,357 @@ +//! Source implementation for SQLite embedded database.
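+//! +//! A minimal usage sketch (the file path and partition count are illustrative): `SQLiteSource::new("/path/to/db.sqlite", 2)?` builds a source whose pool holds one connection per partition query; `set_queries` then supplies the partitioned SQL and `fetch_metadata` infers column names and types from the first returned row.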
+ +mod errors; +mod typesystem; + +pub use self::errors::SQLiteSourceError; +use crate::{ + data_order::DataOrder, + errors::ConnectorXError, + sources::{PartitionParser, Produce, Source, SourcePartition}, + sql::{count_query, limit1_query, CXQuery}, + utils::DummyBox, +}; +use anyhow::anyhow; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use fallible_streaming_iterator::FallibleStreamingIterator; +use fehler::{throw, throws}; +use log::debug; +use owning_ref::OwningHandle; +use r2d2::{Pool, PooledConnection}; +use r2d2_sqlite::SqliteConnectionManager; +use rusqlite::{Row, Rows, Statement}; +use sqlparser::dialect::SQLiteDialect; +use std::convert::TryFrom; +pub use typesystem::SQLiteTypeSystem; +use urlencoding::decode; + +pub struct SQLiteSource { + pool: Pool, + origin_query: Option, + queries: Vec>, + names: Vec, + schema: Vec, +} + +impl SQLiteSource { + #[throws(SQLiteSourceError)] + pub fn new(conn: &str, nconn: usize) -> Self { + let decoded_conn = decode(conn)?.into_owned(); + debug!("decoded conn: {}", decoded_conn); + let manager = SqliteConnectionManager::file(decoded_conn); + let pool = r2d2::Pool::builder() + .max_size(nconn as u32) + .build(manager)?; + + Self { + pool, + origin_query: None, + queries: vec![], + names: vec![], + schema: vec![], + } + } +} + +impl Source for SQLiteSource +where + SQLiteSourcePartition: SourcePartition, +{ + const DATA_ORDERS: &'static [DataOrder] = &[DataOrder::RowMajor]; + type Partition = SQLiteSourcePartition; + type TypeSystem = SQLiteTypeSystem; + type Error = SQLiteSourceError; + + #[throws(SQLiteSourceError)] + fn set_data_order(&mut self, data_order: DataOrder) { + if !matches!(data_order, DataOrder::RowMajor) { + throw!(ConnectorXError::UnsupportedDataOrder(data_order)); + } + } + + fn set_queries(&mut self, queries: &[CXQuery]) { + self.queries = queries.iter().map(|q| q.map(Q::to_string)).collect(); + } + + fn set_origin_query(&mut self, query: Option) { + self.origin_query = query; + } + + #[throws(SQLiteSourceError)] + fn fetch_metadata(&mut self) { + assert!(!self.queries.is_empty()); + let conn = self.pool.get()?; + let mut names = vec![]; + let mut types = vec![]; + let mut num_empty = 0; + + // assuming all the partition queries yield same schema + for (i, query) in self.queries.iter().enumerate() { + let l1query = limit1_query(query, &SQLiteDialect {})?; + + let is_sucess = conn.query_row(l1query.as_str(), [], |row| { + for (j, col) in row.as_ref().columns().iter().enumerate() { + if j >= names.len() { + names.push(col.name().to_string()); + } + if j >= types.len() { + let vr = row.get_ref(j)?; + match SQLiteTypeSystem::try_from((col.decl_type(), vr.data_type())) { + Ok(t) => types.push(Some(t)), + Err(_) => { + types.push(None); + } + } + } else if types[j].is_none() { + // We didn't get the type in the previous round + let vr = row.get_ref(j)?; + if let Ok(t) = SQLiteTypeSystem::try_from((col.decl_type(), vr.data_type())) + { + types[j] = Some(t) + } + } + } + Ok(()) + }); + + match is_sucess { + Ok(()) => { + if !types.contains(&None) { + self.names = names; + self.schema = types.into_iter().map(|t| t.unwrap()).collect(); + return; + } else if i == self.queries.len() - 1 { + debug!( + "cannot get metadata for '{}' due to null value: {:?}", + query, types + ); + throw!(SQLiteSourceError::InferTypeFromNull); + } + } + Err(e) => { + if let rusqlite::Error::QueryReturnedNoRows = e { + num_empty += 1; // make sure when all partition results are empty, do not throw error + } + if i == self.queries.len() - 1 && num_empty 
< self.queries.len() { + // tried the last query but still get an error + debug!("cannot get metadata for '{}': {}", query, e); + throw!(e) + } + } + } + } + + // tried all queries but all get empty result set + let stmt = conn.prepare(self.queries[0].as_str())?; + + self.names = stmt + .column_names() + .into_iter() + .map(|s| s.to_string()) + .collect(); + // set all columns as string (align with pandas) + self.schema = vec![SQLiteTypeSystem::Text(false); self.names.len()]; + } + + #[throws(SQLiteSourceError)] + fn result_rows(&mut self) -> Option { + match &self.origin_query { + Some(q) => { + let cxq = CXQuery::Naked(q.clone()); + let conn = self.pool.get()?; + let nrows = + conn.query_row(count_query(&cxq, &SQLiteDialect {})?.as_str(), [], |row| { + Ok(row.get::<_, i64>(0)? as usize) + })?; + Some(nrows) + } + None => None, + } + } + + fn names(&self) -> Vec { + self.names.clone() + } + + fn schema(&self) -> Vec { + self.schema.clone() + } + + #[throws(SQLiteSourceError)] + fn partition(self) -> Vec { + let mut ret = vec![]; + for query in self.queries { + let conn = self.pool.get()?; + + ret.push(SQLiteSourcePartition::new(conn, &query, &self.schema)); + } + ret + } +} + +pub struct SQLiteSourcePartition { + conn: PooledConnection, + query: CXQuery, + schema: Vec, + nrows: usize, + ncols: usize, +} + +impl SQLiteSourcePartition { + pub fn new( + conn: PooledConnection, + query: &CXQuery, + schema: &[SQLiteTypeSystem], + ) -> Self { + Self { + conn, + query: query.clone(), + schema: schema.to_vec(), + nrows: 0, + ncols: schema.len(), + } + } +} + +impl SourcePartition for SQLiteSourcePartition { + type TypeSystem = SQLiteTypeSystem; + type Parser<'a> = SQLiteSourcePartitionParser<'a>; + type Error = SQLiteSourceError; + + #[throws(SQLiteSourceError)] + fn result_rows(&mut self) { + self.nrows = self.conn.query_row( + count_query(&self.query, &SQLiteDialect {})?.as_str(), + [], + |row| Ok(row.get::<_, i64>(0)? as usize), + )?; + } + + #[throws(SQLiteSourceError)] + fn parser(&mut self) -> Self::Parser<'_> { + SQLiteSourcePartitionParser::new(&self.conn, self.query.as_str(), &self.schema)? + } + + fn nrows(&self) -> usize { + self.nrows + } + + fn ncols(&self) -> usize { + self.ncols + } +} + +unsafe impl<'a> Send for SQLiteSourcePartitionParser<'a> {} + +pub struct SQLiteSourcePartitionParser<'a> { + rows: OwningHandle>, DummyBox>>, + ncols: usize, + current_col: usize, + current_consumed: bool, + is_finished: bool, +} + +impl<'a> SQLiteSourcePartitionParser<'a> { + #[throws(SQLiteSourceError)] + pub fn new( + conn: &'a PooledConnection, + query: &str, + schema: &[SQLiteTypeSystem], + ) -> Self { + let stmt: Statement<'a> = conn.prepare(query)?; + + // Safety: DummyBox borrows the on-heap stmt, which is owned by the OwningHandle. + // No matter how we move the owning handle (thus the Box), the Statement + // keeps its address static on the heap, thus the borrow of MyRows keeps valid. 
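+ // Put differently, `rows` below is a self-referential pair: the OwningHandle owns the boxed Statement and, through DummyBox, the Rows iterator that borrows from it, so both move together as one value.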
+ let rows: OwningHandle<Box<Statement<'a>>, DummyBox<Rows<'a>>> = + OwningHandle::new_with_fn(Box::new(stmt), |stmt: *const Statement<'a>| unsafe { + DummyBox((*(stmt as *mut Statement<'_>)).query([]).unwrap()) + }); + Self { + rows, + ncols: schema.len(), + current_col: 0, + current_consumed: true, + is_finished: false, + } + } + + #[throws(SQLiteSourceError)] + fn next_loc(&mut self) -> (&Row, usize) { + self.current_consumed = true; + let row: &Row = (*self.rows) + .get() + .ok_or_else(|| anyhow!("Sqlite empty current row"))?; + let col = self.current_col; + self.current_col = (self.current_col + 1) % self.ncols; + (row, col) + } +} + +impl<'a> PartitionParser<'a> for SQLiteSourcePartitionParser<'a> { + type TypeSystem = SQLiteTypeSystem; + type Error = SQLiteSourceError; + + #[throws(SQLiteSourceError)] + fn fetch_next(&mut self) -> (usize, bool) { + assert!(self.current_col == 0); + + if !self.current_consumed { + return (1, false); + } else if self.is_finished { + return (0, true); + } + + match (*self.rows).next()? { + Some(_) => { + self.current_consumed = false; + (1, false) + } + None => { + self.is_finished = true; + (0, true) + } + } + } +} + +macro_rules! impl_produce { + ($($t: ty,)+) => { + $( + impl<'r, 'a> Produce<'r, $t> for SQLiteSourcePartitionParser<'a> { + type Error = SQLiteSourceError; + + #[throws(SQLiteSourceError)] + fn produce(&'r mut self) -> $t { + let (row, col) = self.next_loc()?; + let val = row.get(col)?; + val + } + } + + impl<'r, 'a> Produce<'r, Option<$t>> for SQLiteSourcePartitionParser<'a> { + type Error = SQLiteSourceError; + + #[throws(SQLiteSourceError)] + fn produce(&'r mut self) -> Option<$t> { + let (row, col) = self.next_loc()?; + let val = row.get(col)?; + val + } + } + )+ + }; +} + +impl_produce!( + bool, + i64, + i32, + i16, + f64, + Box<str>, + NaiveDate, + NaiveTime, + NaiveDateTime, + Vec<u8>, +); diff --git a/connectorx/src/sources/sqlite/typesystem.rs b/connectorx/src/sources/sqlite/typesystem.rs new file mode 100644 index 0000000..a38c875 --- /dev/null +++ b/connectorx/src/sources/sqlite/typesystem.rs @@ -0,0 +1,92 @@ +use super::errors::SQLiteSourceError; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use fehler::{throw, throws}; +use rusqlite::types::Type; +use std::convert::TryFrom; + +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum SQLiteTypeSystem { + Bool(bool), + Int8(bool), + Int4(bool), + Int2(bool), + Real(bool), + Text(bool), + Date(bool), + Time(bool), + Timestamp(bool), + Blob(bool), +} + +impl_typesystem!
{ + system = SQLiteTypeSystem, + mappings = { + { Bool => bool } + { Int8 => i64 } + { Int4 => i32 } + { Int2 => i16 } + { Real => f64 } + { Text => Box<str> } + { Date => NaiveDate } + { Time => NaiveTime } + { Timestamp => NaiveDateTime } + { Blob => Vec<u8> } + } +} + +impl TryFrom<Type> for SQLiteTypeSystem { + type Error = SQLiteSourceError; + + #[throws(SQLiteSourceError)] + fn try_from(ty: Type) -> Self { + use SQLiteTypeSystem::*; + match ty { + Type::Integer => Int8(true), + Type::Real => Real(true), + Type::Text => Text(true), + Type::Blob => Blob(true), + Type::Null => throw!(SQLiteSourceError::InferTypeFromNull), + } + } +} + +impl TryFrom<(Option<&str>, Type)> for SQLiteTypeSystem { + type Error = SQLiteSourceError; + + #[throws(SQLiteSourceError)] + fn try_from(types: (Option<&str>, Type)) -> Self { + use SQLiteTypeSystem::*; + match types { + // derive from column's declare type, some rules refer to: + // https://www.sqlite.org/datatype3.html#affname + (Some(decl_type), ty) => { + let decl_type = decl_type.to_lowercase(); + match decl_type.as_str() { + "int4" => Int4(true), + "int2" => Int2(true), + "boolean" | "bool" => Bool(true), + "date" => Date(true), + "time" => Time(true), + "datetime" | "timestamp" => Timestamp(true), + _ if decl_type.contains("int") => Int8(true), + _ if decl_type.contains("char") + || decl_type.contains("clob") + || decl_type.contains("text") => + { + Text(true) + } + _ if decl_type.contains("real") + || decl_type.contains("floa") + || decl_type.contains("doub") => + { + Real(true) + } + _ if decl_type.contains("blob") => Blob(true), + _ => SQLiteTypeSystem::try_from(ty)?, + } + } + // derive from value type directly if no declare type available + (None, ty) => SQLiteTypeSystem::try_from(ty)?, + } + } +} diff --git a/connectorx/src/sql.rs b/connectorx/src/sql.rs new file mode 100644 index 0000000..cd22c06 --- /dev/null +++ b/connectorx/src/sql.rs @@ -0,0 +1,597 @@ +use crate::errors::ConnectorXError; +#[cfg(feature = "src_oracle")] +use crate::sources::oracle::OracleDialect; +use fehler::{throw, throws}; +use log::{debug, trace, warn}; +use sqlparser::ast::{ + BinaryOperator, Expr, Function, FunctionArg, FunctionArgExpr, Ident, ObjectName, Query, Select, + SelectItem, SetExpr, Statement, TableAlias, TableFactor, TableWithJoins, Value, + WildcardAdditionalOptions, +}; +use sqlparser::dialect::Dialect; +use sqlparser::parser::Parser; +#[cfg(feature = "src_oracle")] +use std::any::Any; + +#[derive(Debug, Clone)] +pub enum CXQuery<Q = String> { + Naked(Q), // The query directly comes from the user + Wrapped(Q), // The user query is already wrapped in a subquery +} + +impl<Q: std::fmt::Display> std::fmt::Display for CXQuery<Q> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CXQuery::Naked(q) => write!(f, "{}", q), + CXQuery::Wrapped(q) => write!(f, "{}", q), + } + } +} + +impl<Q: AsRef<str>> CXQuery<Q> { + pub fn as_str(&self) -> &str { + match self { + CXQuery::Naked(q) => q.as_ref(), + CXQuery::Wrapped(q) => q.as_ref(), + } + } +} + +impl From<&str> for CXQuery { + fn from(s: &str) -> CXQuery { + CXQuery::Naked(s.to_string()) + } +} + +impl From<&&str> for CXQuery { + fn from(s: &&str) -> CXQuery { + CXQuery::Naked(s.to_string()) + } +} + +impl From<&String> for CXQuery { + fn from(s: &String) -> CXQuery { + CXQuery::Naked(s.clone()) + } +} + +impl From<&CXQuery> for CXQuery { + fn from(q: &CXQuery) -> CXQuery { + q.clone() + } +} + +impl CXQuery { + pub fn naked<Q: AsRef<str>>(q: Q) -> Self { + CXQuery::Naked(q.as_ref().to_string()) + } +} + +impl<Q: AsRef<str>> AsRef<str> for CXQuery<Q> { + fn as_ref(&self) -> &str
{ + match self { + CXQuery::Naked(q) => q.as_ref(), + CXQuery::Wrapped(q) => q.as_ref(), + } + } +} + +impl CXQuery { + pub fn map(&self, f: F) -> CXQuery + where + F: Fn(&Q) -> U, + { + match self { + CXQuery::Naked(q) => CXQuery::Naked(f(q)), + CXQuery::Wrapped(q) => CXQuery::Wrapped(f(q)), + } + } +} + +impl CXQuery> { + pub fn result(self) -> Result, E> { + match self { + CXQuery::Naked(q) => q.map(CXQuery::Naked), + CXQuery::Wrapped(q) => q.map(CXQuery::Wrapped), + } + } +} + +// wrap a query into a derived table +fn wrap_query( + query: &mut Query, + projection: Vec, + selection: Option, + tmp_tab_name: &str, +) -> Statement { + let with = query.with.clone(); + query.with = None; + let alias = if tmp_tab_name.is_empty() { + None + } else { + Some(TableAlias { + name: Ident { + value: tmp_tab_name.into(), + quote_style: None, + }, + columns: vec![], + }) + }; + Statement::Query(Box::new(Query { + with, + locks: vec![], + body: Box::new(SetExpr::Select(Box::new(Select { + distinct: None, + top: None, + projection, + from: vec![TableWithJoins { + relation: TableFactor::Derived { + lateral: false, + subquery: Box::new(query.clone()), + alias, + }, + joins: vec![], + }], + lateral_views: vec![], + selection, + group_by: vec![], + cluster_by: vec![], + distribute_by: vec![], + sort_by: vec![], + having: None, + into: None, + named_window: vec![], + qualify: None, + }))), + order_by: vec![], + limit: None, + offset: None, + fetch: None, + })) +} + +trait StatementExt { + fn as_query(&self) -> Option<&Query>; +} + +impl StatementExt for Statement { + fn as_query(&self) -> Option<&Query> { + match self { + Statement::Query(q) => Some(q), + _ => None, + } + } +} + +trait QueryExt { + fn as_select_mut(&mut self) -> Option<&mut Select>; +} + +impl QueryExt for Query { + fn as_select_mut(&mut self) -> Option<&mut Select> { + match *self.body { + SetExpr::Select(ref mut select) => Some(select), + _ => None, + } + } +} + +#[throws(ConnectorXError)] +pub fn count_query(sql: &CXQuery, dialect: &T) -> CXQuery { + trace!("Incoming query: {}", sql); + + const COUNT_TMP_TAB_NAME: &str = "CXTMPTAB_COUNT"; + + #[allow(unused_mut)] + let mut table_alias = COUNT_TMP_TAB_NAME; + + // HACK: Some dialect (e.g. Oracle) does not support "AS" for alias + #[cfg(feature = "src_oracle")] + if dialect.type_id() == (OracleDialect {}.type_id()) { + // table_alias = ""; + return CXQuery::Wrapped(format!( + "SELECT COUNT(*) FROM ({}) {}", + sql.as_str(), + COUNT_TMP_TAB_NAME + )); + } + + let tsql = match sql.map(|sql| Parser::parse_sql(dialect, sql)).result() { + Ok(ast) => { + let projection = vec![SelectItem::UnnamedExpr(Expr::Function(Function { + name: ObjectName(vec![Ident { + value: "count".to_string(), + quote_style: None, + }]), + args: vec![FunctionArg::Unnamed(FunctionArgExpr::Wildcard)], + over: None, + distinct: false, + order_by: vec![], + special: false, + }))]; + let ast_count: Statement = match ast { + CXQuery::Naked(ast) => { + if ast.len() != 1 { + throw!(ConnectorXError::SqlQueryNotSupported(sql.to_string())); + } + let mut query = ast[0] + .as_query() + .ok_or_else(|| ConnectorXError::SqlQueryNotSupported(sql.to_string()))? 
+ .clone(); + if query.offset.is_none() { + query.order_by = vec![]; // mssql offset must appear with order by + } + let select = query + .as_select_mut() + .ok_or_else(|| ConnectorXError::SqlQueryNotSupported(sql.to_string()))?; + select.sort_by = vec![]; + wrap_query(&mut query, projection, None, table_alias) + } + CXQuery::Wrapped(ast) => { + if ast.len() != 1 { + throw!(ConnectorXError::SqlQueryNotSupported(sql.to_string())); + } + let mut query = ast[0] + .as_query() + .ok_or_else(|| ConnectorXError::SqlQueryNotSupported(sql.to_string()))? + .clone(); + let select = query + .as_select_mut() + .ok_or_else(|| ConnectorXError::SqlQueryNotSupported(sql.to_string()))?; + select.projection = projection; + Statement::Query(Box::new(query)) + } + }; + format!("{}", ast_count) + } + Err(e) => { + warn!("parser error: {:?}, manually compose query string", e); + format!( + "SELECT COUNT(*) FROM ({}) as {}", + sql.as_str(), + COUNT_TMP_TAB_NAME + ) + } + }; + + debug!("Transformed count query: {}", tsql); + CXQuery::Wrapped(tsql) +} + +#[throws(ConnectorXError)] +pub fn limit1_query(sql: &CXQuery, dialect: &T) -> CXQuery { + trace!("Incoming query: {}", sql); + + let sql = match Parser::parse_sql(dialect, sql.as_str()) { + Ok(mut ast) => { + if ast.len() != 1 { + throw!(ConnectorXError::SqlQueryNotSupported(sql.to_string())); + } + + match &mut ast[0] { + Statement::Query(q) => { + q.limit = Some(Expr::Value(Value::Number("1".to_string(), false))); + } + _ => throw!(ConnectorXError::SqlQueryNotSupported(sql.to_string())), + }; + + format!("{}", ast[0]) + } + Err(e) => { + warn!("parser error: {:?}, manually compose query string", e); + format!("{} LIMIT 1", sql.as_str()) + } + }; + + debug!("Transformed limit 1 query: {}", sql); + CXQuery::Wrapped(sql) +} + +#[throws(ConnectorXError)] +#[cfg(feature = "src_oracle")] +pub fn limit1_query_oracle(sql: &CXQuery) -> CXQuery { + trace!("Incoming oracle query: {}", sql); + + CXQuery::Wrapped(format!("SELECT * FROM ({}) WHERE rownum = 1", sql)) + + // let ast = Parser::parse_sql(&OracleDialect {}, sql.as_str())?; + // if ast.len() != 1 { + // throw!(ConnectorXError::SqlQueryNotSupported(sql.to_string())); + // } + // let ast_part: Statement; + // let mut query = ast[0] + // .as_query() + // .ok_or_else(|| ConnectorXError::SqlQueryNotSupported(sql.to_string()))? + // .clone(); + + // let selection = Expr::BinaryOp { + // left: Box::new(Expr::CompoundIdentifier(vec![Ident { + // value: "rownum".to_string(), + // quote_style: None, + // }])), + // op: BinaryOperator::Eq, + // right: Box::new(Expr::Value(Value::Number("1".to_string(), false))), + // }; + // ast_part = wrap_query(&mut query, vec![SelectItem::Wildcard], Some(selection), ""); + + // let tsql = format!("{}", ast_part); + // debug!("Transformed limit 1 query: {}", tsql); + // CXQuery::Wrapped(tsql) +} + +#[throws(ConnectorXError)] +pub fn single_col_partition_query( + sql: &str, + col: &str, + lower: i64, + upper: i64, + dialect: &T, +) -> String { + trace!("Incoming query: {}", sql); + const PART_TMP_TAB_NAME: &str = "CXTMPTAB_PART"; + + #[allow(unused_mut)] + let mut table_alias = PART_TMP_TAB_NAME; + #[allow(unused_mut)] + let mut cid = Box::new(Expr::CompoundIdentifier(vec![ + Ident { + value: PART_TMP_TAB_NAME.to_string(), + quote_style: None, + }, + Ident { + value: col.to_string(), + quote_style: None, + }, + ])); + + // HACK: Some dialect (e.g. 
Oracle) does not support "AS" for alias + #[cfg(feature = "src_oracle")] + if dialect.type_id() == (OracleDialect {}.type_id()) { + return format!("SELECT * FROM ({}) CXTMPTAB_PART WHERE CXTMPTAB_PART.{} >= {} AND CXTMPTAB_PART.{} < {}", sql, col, lower, col, upper); + // table_alias = ""; + // cid = Box::new(Expr::Identifier(Ident { + // value: col.to_string(), + // quote_style: None, + // })); + } + + let tsql = match Parser::parse_sql(dialect, sql) { + Ok(ast) => { + if ast.len() != 1 { + throw!(ConnectorXError::SqlQueryNotSupported(sql.to_string())); + } + + let mut query = ast[0] + .as_query() + .ok_or_else(|| ConnectorXError::SqlQueryNotSupported(sql.to_string()))? + .clone(); + + let select = query + .as_select_mut() + .ok_or_else(|| ConnectorXError::SqlQueryNotSupported(sql.to_string()))? + .clone(); + + let ast_part: Statement; + + let lb = Expr::BinaryOp { + left: Box::new(Expr::Value(Value::Number(lower.to_string(), false))), + op: BinaryOperator::LtEq, + right: cid.clone(), + }; + + let ub = Expr::BinaryOp { + left: cid, + op: BinaryOperator::Lt, + right: Box::new(Expr::Value(Value::Number(upper.to_string(), false))), + }; + + let selection = Expr::BinaryOp { + left: Box::new(lb), + op: BinaryOperator::And, + right: Box::new(ub), + }; + + if query.limit.is_none() && select.top.is_none() && !query.order_by.is_empty() { + // order by in a partition query does not make sense because partition is unordered. + // clear the order by beceause mssql does not support order by in a derived table. + // also order by in the derived table does not make any difference. + query.order_by.clear(); + } + + ast_part = wrap_query( + &mut query, + vec![SelectItem::Wildcard(WildcardAdditionalOptions::default())], + Some(selection), + table_alias, + ); + format!("{}", ast_part) + } + Err(e) => { + warn!("parser error: {:?}, manually compose query string", e); + format!("SELECT * FROM ({}) AS CXTMPTAB_PART WHERE CXTMPTAB_PART.{} >= {} AND CXTMPTAB_PART.{} < {}", sql, col, lower, col, upper) + } + }; + + debug!("Transformed single column partition query: {}", tsql); + tsql +} + +#[throws(ConnectorXError)] +pub fn get_partition_range_query(sql: &str, col: &str, dialect: &T) -> String { + trace!("Incoming query: {}", sql); + const RANGE_TMP_TAB_NAME: &str = "CXTMPTAB_RANGE"; + + #[allow(unused_mut)] + let mut table_alias = RANGE_TMP_TAB_NAME; + #[allow(unused_mut)] + let mut args = vec![FunctionArg::Unnamed(FunctionArgExpr::Expr( + Expr::CompoundIdentifier(vec![ + Ident { + value: RANGE_TMP_TAB_NAME.to_string(), + quote_style: None, + }, + Ident { + value: col.to_string(), + quote_style: None, + }, + ]), + ))]; + + // HACK: Some dialect (e.g. Oracle) does not support "AS" for alias + #[cfg(feature = "src_oracle")] + if dialect.type_id() == (OracleDialect {}.type_id()) { + return format!( + "SELECT MIN({}.{}) as min, MAX({}.{}) as max FROM ({}) {}", + RANGE_TMP_TAB_NAME, col, RANGE_TMP_TAB_NAME, col, sql, RANGE_TMP_TAB_NAME + ); + // table_alias = ""; + // args = vec![FunctionArg::Unnamed(Expr::Identifier(Ident { + // value: col.to_string(), + // quote_style: None, + // }))]; + } + + let tsql = match Parser::parse_sql(dialect, sql) { + Ok(ast) => { + if ast.len() != 1 { + throw!(ConnectorXError::SqlQueryNotSupported(sql.to_string())); + } + + let mut query = ast[0] + .as_query() + .ok_or_else(|| ConnectorXError::SqlQueryNotSupported(sql.to_string()))? 
+ .clone(); + let ast_range: Statement; + query.order_by = vec![]; + let projection = vec![ + SelectItem::UnnamedExpr(Expr::Function(Function { + name: ObjectName(vec![Ident { + value: "min".to_string(), + quote_style: None, + }]), + args: args.clone(), + over: None, + distinct: false, + order_by: vec![], + special: false, + })), + SelectItem::UnnamedExpr(Expr::Function(Function { + name: ObjectName(vec![Ident { + value: "max".to_string(), + quote_style: None, + }]), + args, + over: None, + distinct: false, + order_by: vec![], + special: false, + })), + ]; + ast_range = wrap_query(&mut query, projection, None, table_alias); + format!("{}", ast_range) + } + Err(e) => { + warn!("parser error: {:?}, manually compose query string", e); + format!( + "SELECT MIN({}.{}) as min, MAX({}.{}) as max FROM ({}) AS {}", + RANGE_TMP_TAB_NAME, col, RANGE_TMP_TAB_NAME, col, sql, RANGE_TMP_TAB_NAME + ) + } + }; + + debug!("Transformed partition range query: {}", tsql); + tsql +} + +#[throws(ConnectorXError)] +pub fn get_partition_range_query_sep( + sql: &str, + col: &str, + dialect: &T, +) -> (String, String) { + trace!("Incoming query: {}", sql); + const RANGE_TMP_TAB_NAME: &str = "CXTMPTAB_RANGE"; + + let (sql_min, sql_max) = match Parser::parse_sql(dialect, sql) { + Ok(ast) => { + if ast.len() != 1 { + throw!(ConnectorXError::SqlQueryNotSupported(sql.to_string())); + } + + let mut query = ast[0] + .as_query() + .ok_or_else(|| ConnectorXError::SqlQueryNotSupported(sql.to_string()))? + .clone(); + + let ast_range_min: Statement; + let ast_range_max: Statement; + + query.order_by = vec![]; + let min_proj = vec![SelectItem::UnnamedExpr(Expr::Function(Function { + name: ObjectName(vec![Ident { + value: "min".to_string(), + quote_style: None, + }]), + args: vec![FunctionArg::Unnamed(FunctionArgExpr::Expr( + Expr::CompoundIdentifier(vec![ + Ident { + value: RANGE_TMP_TAB_NAME.to_string(), + quote_style: None, + }, + Ident { + value: col.to_string(), + quote_style: None, + }, + ]), + ))], + over: None, + distinct: false, + order_by: vec![], + special: false, + }))]; + let max_proj = vec![SelectItem::UnnamedExpr(Expr::Function(Function { + name: ObjectName(vec![Ident { + value: "max".to_string(), + quote_style: None, + }]), + args: vec![FunctionArg::Unnamed(FunctionArgExpr::Expr( + Expr::CompoundIdentifier(vec![ + Ident { + value: RANGE_TMP_TAB_NAME.into(), + quote_style: None, + }, + Ident { + value: col.into(), + quote_style: None, + }, + ]), + ))], + over: None, + distinct: false, + order_by: vec![], + special: false, + }))]; + ast_range_min = wrap_query(&mut query.clone(), min_proj, None, RANGE_TMP_TAB_NAME); + ast_range_max = wrap_query(&mut query, max_proj, None, RANGE_TMP_TAB_NAME); + (format!("{}", ast_range_min), format!("{}", ast_range_max)) + } + Err(e) => { + warn!("parser error: {:?}, manually compose query string", e); + ( + format!( + "SELECT MIN({}.{}) as min FROM ({}) AS {}", + RANGE_TMP_TAB_NAME, col, sql, RANGE_TMP_TAB_NAME + ), + format!( + "SELECT MAX({}.{}) as max FROM ({}) AS {}", + RANGE_TMP_TAB_NAME, col, sql, RANGE_TMP_TAB_NAME + ), + ) + } + }; + debug!( + "Transformed separated partition range query: {}, {}", + sql_min, sql_max + ); + (sql_min, sql_max) +} diff --git a/connectorx/src/transports/bigquery_arrow.rs b/connectorx/src/transports/bigquery_arrow.rs new file mode 100644 index 0000000..24bffba --- /dev/null +++ b/connectorx/src/transports/bigquery_arrow.rs @@ -0,0 +1,48 @@ +//! Transport from BigQuery Source to Arrow Destination. 
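+//! +//! Each `mappings` entry below reads `{ SourceVariant[source Rust type] => ArrowVariant[destination Rust type] | conversion rule }`. Roughly (the exact semantics live in the `impl_transport!` macro, not in this file): `auto` derives the conversion for the type pair, while `none` reuses a conversion already generated for the same pair of Rust types.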
+ +use crate::{ + destinations::arrow::{typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError}, + impl_transport, + sources::bigquery::{BigQuerySource, BigQuerySourceError, BigQueryTypeSystem}, + typesystem::TypeConversion, +}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum BigQueryArrowTransportError { + #[error(transparent)] + Source(#[from] BigQuerySourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert BigQuery data types to Arrow data types. +pub struct BigQueryArrowTransport; + +impl_transport!( + name = BigQueryArrowTransport, + error = BigQueryArrowTransportError, + systems = BigQueryTypeSystem => ArrowTypeSystem, + route = BigQuerySource => ArrowDestination, + mappings = { + { Bool[bool] => Boolean[bool] | conversion auto } + { Boolean[bool] => Boolean[bool] | conversion none } + { Int64[i64] => Int64[i64] | conversion auto } + { Integer[i64] => Int64[i64] | conversion none } + { Float64[f64] => Float64[f64] | conversion auto } + { Float[f64] => Float64[f64] | conversion none } + { Numeric[f64] => Float64[f64] | conversion none } + { Bignumeric[f64] => Float64[f64] | conversion none } + { String[String] => LargeUtf8[String] | conversion auto } + { Bytes[String] => LargeUtf8[String] | conversion none } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { Timestamp[DateTime] => DateTimeTz[DateTime] | conversion auto } + } +); diff --git a/connectorx/src/transports/bigquery_arrow2.rs b/connectorx/src/transports/bigquery_arrow2.rs new file mode 100644 index 0000000..ef5308b --- /dev/null +++ b/connectorx/src/transports/bigquery_arrow2.rs @@ -0,0 +1,50 @@ +//! Transport from BigQuery Source to Arrow Destination. + +use crate::{ + destinations::arrow2::{ + typesystem::Arrow2TypeSystem, Arrow2Destination, Arrow2DestinationError, + }, + impl_transport, + sources::bigquery::{BigQuerySource, BigQuerySourceError, BigQueryTypeSystem}, + typesystem::TypeConversion, +}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum BigQueryArrow2TransportError { + #[error(transparent)] + Source(#[from] BigQuerySourceError), + + #[error(transparent)] + Destination(#[from] Arrow2DestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert BigQuery data types to Arrow data types. 
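+/// This is the `arrow2`-destination counterpart of `BigQueryArrowTransport` above; the type mappings are identical, only the destination differs.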
+pub struct BigQueryArrow2Transport; + +impl_transport!( + name = BigQueryArrow2Transport, + error = BigQueryArrow2TransportError, + systems = BigQueryTypeSystem => Arrow2TypeSystem, + route = BigQuerySource => Arrow2Destination, + mappings = { + { Bool[bool] => Boolean[bool] | conversion auto } + { Boolean[bool] => Boolean[bool] | conversion none } + { Int64[i64] => Int64[i64] | conversion auto } + { Integer[i64] => Int64[i64] | conversion none } + { Float64[f64] => Float64[f64] | conversion auto } + { Float[f64] => Float64[f64] | conversion none } + { Numeric[f64] => Float64[f64] | conversion none } + { Bignumeric[f64] => Float64[f64] | conversion none } + { String[String] => LargeUtf8[String] | conversion auto } + { Bytes[String] => LargeUtf8[String] | conversion none } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { Timestamp[DateTime] => DateTimeTz[DateTime] | conversion auto } + } +); diff --git a/connectorx/src/transports/bigquery_arrowstream.rs b/connectorx/src/transports/bigquery_arrowstream.rs new file mode 100644 index 0000000..c8350d5 --- /dev/null +++ b/connectorx/src/transports/bigquery_arrowstream.rs @@ -0,0 +1,50 @@ +//! Transport from BigQuery Source to Arrow Destination. + +use crate::{ + destinations::arrowstream::{ + typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError, + }, + impl_transport, + sources::bigquery::{BigQuerySource, BigQuerySourceError, BigQueryTypeSystem}, + typesystem::TypeConversion, +}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum BigQueryArrowTransportError { + #[error(transparent)] + Source(#[from] BigQuerySourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert BigQuery data types to Arrow data types. +pub struct BigQueryArrowTransport; + +impl_transport!( + name = BigQueryArrowTransport, + error = BigQueryArrowTransportError, + systems = BigQueryTypeSystem => ArrowTypeSystem, + route = BigQuerySource => ArrowDestination, + mappings = { + { Bool[bool] => Boolean[bool] | conversion auto } + { Boolean[bool] => Boolean[bool] | conversion none } + { Int64[i64] => Int64[i64] | conversion auto } + { Integer[i64] => Int64[i64] | conversion none } + { Float64[f64] => Float64[f64] | conversion auto } + { Float[f64] => Float64[f64] | conversion none } + { Numeric[f64] => Float64[f64] | conversion none } + { Bignumeric[f64] => Float64[f64] | conversion none } + { String[String] => LargeUtf8[String] | conversion auto } + { Bytes[String] => LargeUtf8[String] | conversion none } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { Timestamp[DateTime] => DateTimeTz[DateTime] | conversion auto } + } +); diff --git a/connectorx/src/transports/csv_arrow.rs b/connectorx/src/transports/csv_arrow.rs new file mode 100644 index 0000000..5383f0d --- /dev/null +++ b/connectorx/src/transports/csv_arrow.rs @@ -0,0 +1,36 @@ +//! Transport from CSV Source to Arrow Destination. 
+ +use crate::destinations::arrow::{ArrowDestination, ArrowDestinationError, ArrowTypeSystem}; +use crate::sources::csv::{CSVSource, CSVSourceError, CSVTypeSystem}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, Utc}; +use thiserror::Error; + +/// Convert CSV data types to Arrow data types. +pub struct CSVArrowTransport; + +#[derive(Error, Debug)] +pub enum CSVArrowTransportError { + #[error(transparent)] + Source(#[from] CSVSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +impl_transport!( + name = CSVArrowTransport, + error = CSVArrowTransportError, + systems = CSVTypeSystem => ArrowTypeSystem, + route = CSVSource => ArrowDestination, + mappings = { + { F64[f64] => Float64[f64] | conversion auto} + { I64[i64] => Int64[i64] | conversion auto} + { Bool[bool] => Boolean[bool] | conversion auto} + { String[String] => LargeUtf8[String] | conversion auto} + { DateTime[DateTime] => DateTimeTz[DateTime] | conversion auto} + } +); diff --git a/connectorx/src/transports/dummy_arrow.rs b/connectorx/src/transports/dummy_arrow.rs new file mode 100644 index 0000000..9ebad0f --- /dev/null +++ b/connectorx/src/transports/dummy_arrow.rs @@ -0,0 +1,56 @@ +//! Transport from Dummy Source to Arrow Destination. + +use crate::destinations::arrow::{ArrowDestination, ArrowDestinationError, ArrowTypeSystem}; +use crate::sources::dummy::{DummySource, DummyTypeSystem}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; +use thiserror::Error; + +/// Convert Dummy data types to Arrow data types. +pub struct DummyArrowTransport; + +#[derive(Error, Debug)] +pub enum DummyArrowTransportError { + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +impl_transport!( + name = DummyArrowTransport, + error = DummyArrowTransportError, + systems = DummyTypeSystem => ArrowTypeSystem, + route = DummySource => ArrowDestination, + mappings = { + { F64[f64] => Float64[f64] | conversion auto} + { I64[i64] => Int64[i64] | conversion auto} + { Bool[bool] => Boolean[bool] | conversion auto} + { String[String] => LargeUtf8[String] | conversion auto} + { DateTime[DateTime] => Date64[NaiveDateTime] | conversion option} + } +); + +impl TypeConversion, NaiveDateTime> for DummyArrowTransport { + fn convert(val: DateTime) -> NaiveDateTime { + NaiveDateTime::from_timestamp_opt(val.timestamp(), val.timestamp_subsec_nanos()) + .unwrap_or_else(|| panic!("from_timestamp_opt return None")) + } +} + +impl TypeConversion> for DummyArrowTransport { + fn convert(val: NaiveDateTime) -> DateTime { + DateTime::from_naive_utc_and_offset(val, Utc) + } +} + +impl TypeConversion> for DummyArrowTransport { + fn convert(val: NaiveDate) -> DateTime { + DateTime::from_naive_utc_and_offset( + val.and_hms_opt(0, 0, 0) + .unwrap_or_else(|| panic!("and_hms_opt return None")), + Utc, + ) + } +} diff --git a/connectorx/src/transports/dummy_arrow2.rs b/connectorx/src/transports/dummy_arrow2.rs new file mode 100644 index 0000000..89c582d --- /dev/null +++ b/connectorx/src/transports/dummy_arrow2.rs @@ -0,0 +1,56 @@ +//! Transport from Dummy Source to Arrow2 Destination. 
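The Dummy transports above bridge timezone-aware and naive timestamps by hand. A minimal self-contained sketch of that round trip with chrono 0.4, using the same calls as the `TypeConversion` impls above (the assertion is an illustration, not part of the diff):

```rust
use chrono::{DateTime, NaiveDateTime, Utc};

fn main() {
    let now: DateTime<Utc> = Utc::now();
    // Drop the timezone, as the DateTime<Utc> -> Date64 conversion does.
    let naive: NaiveDateTime =
        NaiveDateTime::from_timestamp_opt(now.timestamp(), now.timestamp_subsec_nanos())
            .expect("timestamp out of range");
    // Reattach UTC; this is lossless for UTC values.
    let back: DateTime<Utc> = DateTime::from_naive_utc_and_offset(naive, Utc);
    assert_eq!(now, back);
}
```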
+ +use crate::destinations::arrow2::{Arrow2Destination, Arrow2DestinationError, Arrow2TypeSystem}; +use crate::sources::dummy::{DummySource, DummyTypeSystem}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; +use thiserror::Error; + +/// Convert Dummy data types to Arrow2 data types. +pub struct DummyArrow2Transport; + +#[derive(Error, Debug)] +pub enum DummyArrow2TransportError { + #[error(transparent)] + Destination(#[from] Arrow2DestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +impl_transport!( + name = DummyArrow2Transport, + error = DummyArrow2TransportError, + systems = DummyTypeSystem => Arrow2TypeSystem, + route = DummySource => Arrow2Destination, + mappings = { + { F64[f64] => Float64[f64] | conversion auto} + { I64[i64] => Int64[i64] | conversion auto} + { Bool[bool] => Boolean[bool] | conversion auto} + { String[String] => LargeUtf8[String] | conversion auto} + { DateTime[DateTime] => Date64[NaiveDateTime] | conversion option} + } +); + +impl TypeConversion, NaiveDateTime> for DummyArrow2Transport { + fn convert(val: DateTime) -> NaiveDateTime { + NaiveDateTime::from_timestamp_opt(val.timestamp(), val.timestamp_subsec_nanos()) + .unwrap_or_else(|| panic!("from_timestamp_opt return None")) + } +} + +impl TypeConversion> for DummyArrow2Transport { + fn convert(val: NaiveDateTime) -> DateTime { + DateTime::from_naive_utc_and_offset(val, Utc) + } +} + +impl TypeConversion> for DummyArrow2Transport { + fn convert(val: NaiveDate) -> DateTime { + DateTime::from_naive_utc_and_offset( + val.and_hms_opt(0, 0, 0) + .unwrap_or_else(|| panic!("from_hms_opt return None")), + Utc, + ) + } +} diff --git a/connectorx/src/transports/dummy_arrowstream.rs b/connectorx/src/transports/dummy_arrowstream.rs new file mode 100644 index 0000000..19436b2 --- /dev/null +++ b/connectorx/src/transports/dummy_arrowstream.rs @@ -0,0 +1,56 @@ +//! Transport from Dummy Source to Arrow Destination. + +use crate::destinations::arrowstream::{ArrowDestination, ArrowDestinationError, ArrowTypeSystem}; +use crate::sources::dummy::{DummySource, DummyTypeSystem}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; +use thiserror::Error; + +/// Convert Dummy data types to Arrow data types. 
+pub struct DummyArrowTransport; + +#[derive(Error, Debug)] +pub enum DummyArrowTransportError { + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +impl_transport!( + name = DummyArrowTransport, + error = DummyArrowTransportError, + systems = DummyTypeSystem => ArrowTypeSystem, + route = DummySource => ArrowDestination, + mappings = { + { F64[f64] => Float64[f64] | conversion auto} + { I64[i64] => Int64[i64] | conversion auto} + { Bool[bool] => Boolean[bool] | conversion auto} + { String[String] => LargeUtf8[String] | conversion auto} + { DateTime[DateTime] => Date64[NaiveDateTime] | conversion option} + } +); + +impl TypeConversion, NaiveDateTime> for DummyArrowTransport { + fn convert(val: DateTime) -> NaiveDateTime { + NaiveDateTime::from_timestamp_opt(val.timestamp(), val.timestamp_subsec_nanos()) + .unwrap_or_else(|| panic!("from_timestamp_opt return None")) + } +} + +impl TypeConversion> for DummyArrowTransport { + fn convert(val: NaiveDateTime) -> DateTime { + DateTime::from_naive_utc_and_offset(val, Utc) + } +} + +impl TypeConversion> for DummyArrowTransport { + fn convert(val: NaiveDate) -> DateTime { + DateTime::from_naive_utc_and_offset( + val.and_hms_opt(0, 0, 0) + .unwrap_or_else(|| panic!("and_hms_opt return None")), + Utc, + ) + } +} diff --git a/connectorx/src/transports/mod.rs b/connectorx/src/transports/mod.rs new file mode 100644 index 0000000..8be61dc --- /dev/null +++ b/connectorx/src/transports/mod.rs @@ -0,0 +1,107 @@ +//! This module contains transport definitions for the sources and destinations implemented in ConnectorX. + +#[cfg(all(feature = "src_bigquery", feature = "dst_arrow"))] +mod bigquery_arrow; +#[cfg(all(feature = "src_bigquery", feature = "dst_arrow2"))] +mod bigquery_arrow2; +#[cfg(all(feature = "src_bigquery", feature = "dst_arrow"))] +mod bigquery_arrowstream; +#[cfg(all(feature = "src_csv", feature = "dst_arrow"))] +mod csv_arrow; +#[cfg(all(feature = "src_dummy", feature = "dst_arrow"))] +mod dummy_arrow; +#[cfg(all(feature = "src_dummy", feature = "dst_arrow2"))] +mod dummy_arrow2; +#[cfg(all(feature = "src_dummy", feature = "dst_arrow"))] +mod dummy_arrowstream; +#[cfg(all(feature = "src_mssql", feature = "dst_arrow"))] +mod mssql_arrow; +#[cfg(all(feature = "src_mssql", feature = "dst_arrow2"))] +mod mssql_arrow2; +#[cfg(all(feature = "src_mssql", feature = "dst_arrow"))] +mod mssql_arrowstream; +#[cfg(all(feature = "src_mysql", feature = "dst_arrow"))] +mod mysql_arrow; +#[cfg(all(feature = "src_mysql", feature = "dst_arrow2"))] +mod mysql_arrow2; +#[cfg(all(feature = "src_mysql", feature = "dst_arrow"))] +mod mysql_arrowstream; +#[cfg(all(feature = "src_oracle", feature = "dst_arrow"))] +mod oracle_arrow; +#[cfg(all(feature = "src_oracle", feature = "dst_arrow2"))] +mod oracle_arrow2; +#[cfg(all(feature = "src_oracle", feature = "dst_arrow"))] +mod oracle_arrowstream; +#[cfg(all(feature = "src_postgres", feature = "dst_arrow"))] +mod postgres_arrow; +#[cfg(all(feature = "src_postgres", feature = "dst_arrow2"))] +mod postgres_arrow2; +#[cfg(all(feature = "src_postgres", feature = "dst_arrow"))] +mod postgres_arrowstream; +#[cfg(all(feature = "src_sqlite", feature = "dst_arrow"))] +mod sqlite_arrow; +#[cfg(all(feature = "src_sqlite", feature = "dst_arrow2"))] +mod sqlite_arrow2; +#[cfg(all(feature = "src_sqlite", feature = "dst_arrow"))] +mod sqlite_arrowstream; + +#[cfg(all(feature = "src_bigquery", feature = "dst_arrow"))] +pub use 
bigquery_arrow::{BigQueryArrowTransport, BigQueryArrowTransportError}; +#[cfg(all(feature = "src_bigquery", feature = "dst_arrow2"))] +pub use bigquery_arrow2::{BigQueryArrow2Transport, BigQueryArrow2TransportError}; +#[cfg(all(feature = "src_bigquery", feature = "dst_arrow"))] +pub use bigquery_arrowstream::{ + BigQueryArrowTransport as BigQueryArrowStreamTransport, + BigQueryArrowTransportError as BigQueryArrowStreamTransportError, +}; +#[cfg(all(feature = "src_csv", feature = "dst_arrow"))] +pub use csv_arrow::CSVArrowTransport; +#[cfg(all(feature = "src_dummy", feature = "dst_arrow"))] +pub use dummy_arrow::DummyArrowTransport; +#[cfg(all(feature = "src_dummy", feature = "dst_arrow2"))] +pub use dummy_arrow2::DummyArrow2Transport; +#[cfg(all(feature = "src_mssql", feature = "dst_arrow"))] +pub use mssql_arrow::{MsSQLArrowTransport, MsSQLArrowTransportError}; +#[cfg(all(feature = "src_mssql", feature = "dst_arrow2"))] +pub use mssql_arrow2::{MsSQLArrow2Transport, MsSQLArrow2TransportError}; +#[cfg(all(feature = "src_mssql", feature = "dst_arrow"))] +pub use mssql_arrowstream::{ + MsSQLArrowTransport as MsSQLArrowStreamTransport, + MsSQLArrowTransportError as MsSQLArrowStreamTransportError, +}; +#[cfg(all(feature = "src_mysql", feature = "dst_arrow"))] +pub use mysql_arrow::{MySQLArrowTransport, MySQLArrowTransportError}; +#[cfg(all(feature = "src_mysql", feature = "dst_arrow2"))] +pub use mysql_arrow2::{MySQLArrow2Transport, MySQLArrow2TransportError}; +#[cfg(all(feature = "src_mysql", feature = "dst_arrow"))] +pub use mysql_arrowstream::{ + MySQLArrowTransport as MySQLArrowStreamTransport, + MySQLArrowTransportError as MySQLArrowStreamTransportError, +}; +#[cfg(all(feature = "src_oracle", feature = "dst_arrow"))] +pub use oracle_arrow::{OracleArrowTransport, OracleArrowTransportError}; +#[cfg(all(feature = "src_oracle", feature = "dst_arrow2"))] +pub use oracle_arrow2::{OracleArrow2Transport, OracleArrow2TransportError}; +#[cfg(all(feature = "src_oracle", feature = "dst_arrow"))] +pub use oracle_arrowstream::{ + OracleArrowTransport as OracleArrowStreamTransport, + OracleArrowTransportError as OracleArrowStreamTransportError, +}; +#[cfg(all(feature = "src_postgres", feature = "dst_arrow"))] +pub use postgres_arrow::{PostgresArrowTransport, PostgresArrowTransportError}; +#[cfg(all(feature = "src_postgres", feature = "dst_arrow2"))] +pub use postgres_arrow2::{PostgresArrow2Transport, PostgresArrow2TransportError}; +#[cfg(all(feature = "src_postgres", feature = "dst_arrow"))] +pub use postgres_arrowstream::{ + PostgresArrowTransport as PostgresArrowStreamTransport, + PostgresArrowTransportError as PostgresArrowStreamTransportError, +}; +#[cfg(all(feature = "src_sqlite", feature = "dst_arrow"))] +pub use sqlite_arrow::{SQLiteArrowTransport, SQLiteArrowTransportError}; +#[cfg(all(feature = "src_sqlite", feature = "dst_arrow2"))] +pub use sqlite_arrow2::{SQLiteArrow2Transport, SQLiteArrow2TransportError}; +#[cfg(all(feature = "src_sqlite", feature = "dst_arrow"))] +pub use sqlite_arrowstream::{ + SQLiteArrowTransport as SQLiteArrowStreamTransport, + SQLiteArrowTransportError as SQLiteArrowStreamTransportError, +}; diff --git a/connectorx/src/transports/mssql_arrow.rs b/connectorx/src/transports/mssql_arrow.rs new file mode 100644 index 0000000..9e3aa8e --- /dev/null +++ b/connectorx/src/transports/mssql_arrow.rs @@ -0,0 +1,88 @@ +//! Transport from MsSQL Source to Arrow Destination. 
+ +use crate::destinations::arrow::{ArrowDestination, ArrowDestinationError, ArrowTypeSystem}; +use crate::sources::mssql::{FloatN, IntN, MsSQLSource, MsSQLSourceError, MsSQLTypeSystem}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use num_traits::ToPrimitive; +use rust_decimal::Decimal; +use thiserror::Error; +use uuid::Uuid; + +/// Convert MsSQL data types to Arrow data types. +pub struct MsSQLArrowTransport; + +#[derive(Error, Debug)] +pub enum MsSQLArrowTransportError { + #[error(transparent)] + Source(#[from] MsSQLSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +impl_transport!( + name = MsSQLArrowTransport, + error = MsSQLArrowTransportError, + systems = MsSQLTypeSystem => ArrowTypeSystem, + route = MsSQLSource => ArrowDestination, + mappings = { + { Tinyint[u8] => Int64[i64] | conversion auto } + { Smallint[i16] => Int64[i64] | conversion auto } + { Int[i32] => Int64[i64] | conversion auto } + { Bigint[i64] => Int64[i64] | conversion auto } + { Intn[IntN] => Int64[i64] | conversion option } + { Float24[f32] => Float32[f32] | conversion auto } + { Float53[f64] => Float64[f64] | conversion auto } + { Floatn[FloatN] => Float64[f64] | conversion option } + { Bit[bool] => Boolean[bool] | conversion auto } + { Nvarchar[&'r str] => LargeUtf8[String] | conversion owned } + { Varchar[&'r str] => LargeUtf8[String] | conversion none } + { Nchar[&'r str] => LargeUtf8[String] | conversion none } + { Char[&'r str] => LargeUtf8[String] | conversion none } + { Text[&'r str] => LargeUtf8[String] | conversion none } + { Ntext[&'r str] => LargeUtf8[String] | conversion none } + { Binary[&'r [u8]] => LargeBinary[Vec] | conversion owned } + { Varbinary[&'r [u8]] => LargeBinary[Vec] | conversion none } + { Image[&'r [u8]] => LargeBinary[Vec] | conversion none } + { Numeric[Decimal] => Float64[f64] | conversion option } + { Decimal[Decimal] => Float64[f64] | conversion none } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Datetime2[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Smalldatetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Datetimeoffset[DateTime] => DateTimeTz[DateTime] | conversion auto } + { Uniqueidentifier[Uuid] => LargeUtf8[String] | conversion option } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { SmallMoney[f32] => Float32[f32] | conversion none } + { Money[f64] => Float64[f64] | conversion none } + } +); + +impl TypeConversion for MsSQLArrowTransport { + fn convert(val: Uuid) -> String { + val.to_string() + } +} + +impl TypeConversion for MsSQLArrowTransport { + fn convert(val: IntN) -> i64 { + val.0 + } +} + +impl TypeConversion for MsSQLArrowTransport { + fn convert(val: FloatN) -> f64 { + val.0 + } +} + +impl TypeConversion for MsSQLArrowTransport { + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} diff --git a/connectorx/src/transports/mssql_arrow2.rs b/connectorx/src/transports/mssql_arrow2.rs new file mode 100644 index 0000000..fa3370c --- /dev/null +++ b/connectorx/src/transports/mssql_arrow2.rs @@ -0,0 +1,88 @@ +//! Transport from MsSQL Source to Arrow2 Destination. 
+ +use crate::destinations::arrow2::{Arrow2Destination, Arrow2DestinationError, Arrow2TypeSystem}; +use crate::sources::mssql::{FloatN, IntN, MsSQLSource, MsSQLSourceError, MsSQLTypeSystem}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use num_traits::ToPrimitive; +use rust_decimal::Decimal; +use thiserror::Error; +use uuid::Uuid; + +/// Convert MsSQL data types to Arrow2 data types. +pub struct MsSQLArrow2Transport; + +#[derive(Error, Debug)] +pub enum MsSQLArrow2TransportError { + #[error(transparent)] + Source(#[from] MsSQLSourceError), + + #[error(transparent)] + Destination(#[from] Arrow2DestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +impl_transport!( + name = MsSQLArrow2Transport, + error = MsSQLArrow2TransportError, + systems = MsSQLTypeSystem => Arrow2TypeSystem, + route = MsSQLSource => Arrow2Destination, + mappings = { + { Tinyint[u8] => Int32[i32] | conversion auto } + { Smallint[i16] => Int32[i32] | conversion auto } + { Int[i32] => Int32[i32] | conversion auto } + { Bigint[i64] => Int64[i64] | conversion auto } + { Intn[IntN] => Int64[i64] | conversion option } + { Float24[f32] => Float32[f32] | conversion auto } + { Float53[f64] => Float64[f64] | conversion auto } + { Floatn[FloatN] => Float64[f64] | conversion option } + { Bit[bool] => Boolean[bool] | conversion auto } + { Nvarchar[&'r str] => LargeUtf8[String] | conversion owned } + { Varchar[&'r str] => LargeUtf8[String] | conversion none } + { Nchar[&'r str] => LargeUtf8[String] | conversion none } + { Char[&'r str] => LargeUtf8[String] | conversion none } + { Text[&'r str] => LargeUtf8[String] | conversion none } + { Ntext[&'r str] => LargeUtf8[String] | conversion none } + { Binary[&'r [u8]] => LargeBinary[Vec] | conversion owned } + { Varbinary[&'r [u8]] => LargeBinary[Vec] | conversion none } + { Image[&'r [u8]] => LargeBinary[Vec] | conversion none } + { Numeric[Decimal] => Float64[f64] | conversion option } + { Decimal[Decimal] => Float64[f64] | conversion none } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Datetime2[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Smalldatetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Datetimeoffset[DateTime] => DateTimeTz[DateTime] | conversion auto } + { Uniqueidentifier[Uuid] => LargeUtf8[String] | conversion option } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { SmallMoney[f32] => Float32[f32] | conversion none } + { Money[f64] => Float64[f64] | conversion none } + } +); + +impl TypeConversion for MsSQLArrow2Transport { + fn convert(val: Uuid) -> String { + val.to_string() + } +} + +impl TypeConversion for MsSQLArrow2Transport { + fn convert(val: IntN) -> i64 { + val.0 + } +} + +impl TypeConversion for MsSQLArrow2Transport { + fn convert(val: FloatN) -> f64 { + val.0 + } +} + +impl TypeConversion for MsSQLArrow2Transport { + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} diff --git a/connectorx/src/transports/mssql_arrowstream.rs b/connectorx/src/transports/mssql_arrowstream.rs new file mode 100644 index 0000000..9fa9ae9 --- /dev/null +++ b/connectorx/src/transports/mssql_arrowstream.rs @@ -0,0 +1,88 @@ +//! Transport from MsSQL Source to Arrow Destination. 
+ +use crate::destinations::arrowstream::{ArrowDestination, ArrowDestinationError, ArrowTypeSystem}; +use crate::sources::mssql::{FloatN, IntN, MsSQLSource, MsSQLSourceError, MsSQLTypeSystem}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use num_traits::ToPrimitive; +use rust_decimal::Decimal; +use thiserror::Error; +use uuid::Uuid; + +/// Convert MsSQL data types to Arrow data types. +pub struct MsSQLArrowTransport; + +#[derive(Error, Debug)] +pub enum MsSQLArrowTransportError { + #[error(transparent)] + Source(#[from] MsSQLSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +impl_transport!( + name = MsSQLArrowTransport, + error = MsSQLArrowTransportError, + systems = MsSQLTypeSystem => ArrowTypeSystem, + route = MsSQLSource => ArrowDestination, + mappings = { + { Tinyint[u8] => Int64[i64] | conversion auto } + { Smallint[i16] => Int64[i64] | conversion auto } + { Int[i32] => Int64[i64] | conversion auto } + { Bigint[i64] => Int64[i64] | conversion auto } + { Intn[IntN] => Int64[i64] | conversion option } + { Float24[f32] => Float32[f32] | conversion auto } + { Float53[f64] => Float64[f64] | conversion auto } + { Floatn[FloatN] => Float64[f64] | conversion option } + { Bit[bool] => Boolean[bool] | conversion auto } + { Nvarchar[&'r str] => LargeUtf8[String] | conversion owned } + { Varchar[&'r str] => LargeUtf8[String] | conversion none } + { Nchar[&'r str] => LargeUtf8[String] | conversion none } + { Char[&'r str] => LargeUtf8[String] | conversion none } + { Text[&'r str] => LargeUtf8[String] | conversion none } + { Ntext[&'r str] => LargeUtf8[String] | conversion none } + { Binary[&'r [u8]] => LargeBinary[Vec] | conversion owned } + { Varbinary[&'r [u8]] => LargeBinary[Vec] | conversion none } + { Image[&'r [u8]] => LargeBinary[Vec] | conversion none } + { Numeric[Decimal] => Float64[f64] | conversion option } + { Decimal[Decimal] => Float64[f64] | conversion none } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Datetime2[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Smalldatetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Datetimeoffset[DateTime] => DateTimeTz[DateTime] | conversion auto } + { Uniqueidentifier[Uuid] => LargeUtf8[String] | conversion option } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { SmallMoney[f32] => Float32[f32] | conversion none } + { Money[f64] => Float64[f64] | conversion none } + } +); + +impl TypeConversion for MsSQLArrowTransport { + fn convert(val: Uuid) -> String { + val.to_string() + } +} + +impl TypeConversion for MsSQLArrowTransport { + fn convert(val: IntN) -> i64 { + val.0 + } +} + +impl TypeConversion for MsSQLArrowTransport { + fn convert(val: FloatN) -> f64 { + val.0 + } +} + +impl TypeConversion for MsSQLArrowTransport { + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} diff --git a/connectorx/src/transports/mysql_arrow.rs b/connectorx/src/transports/mysql_arrow.rs new file mode 100644 index 0000000..1885c05 --- /dev/null +++ b/connectorx/src/transports/mysql_arrow.rs @@ -0,0 +1,120 @@ +//! Transport from MySQL Source to Arrow Destination. 
+ +use crate::{ + destinations::arrow::{typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError}, + impl_transport, + sources::mysql::{ + BinaryProtocol, MySQLSource, MySQLSourceError, MySQLTypeSystem, TextProtocol, + }, + typesystem::TypeConversion, +}; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use num_traits::ToPrimitive; +use rust_decimal::Decimal; +use serde_json::{to_string, Value}; +use std::marker::PhantomData; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum MySQLArrowTransportError { + #[error(transparent)] + Source(#[from] MySQLSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert MySQL data types to Arrow data types. +pub struct MySQLArrowTransport

<P>(PhantomData<P>
); + +impl_transport!( + name = MySQLArrowTransport, + error = MySQLArrowTransportError, + systems = MySQLTypeSystem => ArrowTypeSystem, + route = MySQLSource => ArrowDestination, + mappings = { + { Float[f32] => Float64[f64] | conversion auto } + { Double[f64] => Float64[f64] | conversion auto } + { Tiny[i8] => Boolean[bool] | conversion option } + { Short[i16] => Int64[i64] | conversion auto } + { Int24[i32] => Int64[i64] | conversion none } + { Long[i32] => Int64[i64] | conversion auto } + { LongLong[i64] => Int64[i64] | conversion auto } + { UTiny[u8] => Int64[i64] | conversion auto } + { UShort[u16] => Int64[i64] | conversion auto } + { ULong[u32] => Int64[i64] | conversion auto } + { UInt24[u32] => Int64[i64] | conversion none } + { ULongLong[u64] => Float64[f64] | conversion auto } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Year[i16] => Int64[i64] | conversion none} + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Decimal[Decimal] => Float64[f64] | conversion option } + { VarChar[String] => LargeUtf8[String] | conversion auto } + { Char[String] => LargeUtf8[String] | conversion none } + { Enum[String] => LargeUtf8[String] | conversion none } + { TinyBlob[Vec] => LargeBinary[Vec] | conversion auto } + { Blob[Vec] => LargeBinary[Vec] | conversion none } + { MediumBlob[Vec] => LargeBinary[Vec] | conversion none } + { LongBlob[Vec] => LargeBinary[Vec] | conversion none } + { Json[Value] => LargeUtf8[String] | conversion option } + } +); + +impl_transport!( + name = MySQLArrowTransport, + error = MySQLArrowTransportError, + systems = MySQLTypeSystem => ArrowTypeSystem, + route = MySQLSource => ArrowDestination, + mappings = { + { Float[f32] => Float64[f64] | conversion auto } + { Double[f64] => Float64[f64] | conversion auto } + { Tiny[i8] => Boolean[bool] | conversion option } + { Short[i16] => Int64[i64] | conversion auto } + { Int24[i32] => Int64[i64] | conversion none } + { Long[i32] => Int64[i64] | conversion auto } + { LongLong[i64] => Int64[i64] | conversion auto } + { UTiny[u8] => Int64[i64] | conversion auto } + { UShort[u16] => Int64[i64] | conversion auto } + { ULong[u32] => Int64[i64] | conversion auto } + { UInt24[u32] => Int64[i64] | conversion none } + { ULongLong[u64] => Float64[f64] | conversion auto } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Year[i16] => Int64[i64] | conversion none} + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Decimal[Decimal] => Float64[f64] | conversion option } + { VarChar[String] => LargeUtf8[String] | conversion auto } + { Char[String] => LargeUtf8[String] | conversion none } + { Enum[String] => LargeUtf8[String] | conversion none } + { TinyBlob[Vec] => LargeBinary[Vec] | conversion auto } + { Blob[Vec] => LargeBinary[Vec] | conversion none } + { MediumBlob[Vec] => LargeBinary[Vec] | conversion none } + { LongBlob[Vec] => LargeBinary[Vec] | conversion none } + { Json[Value] => LargeUtf8[String] | conversion option } + } +); + +impl

<P> TypeConversion<Decimal, f64> for MySQLArrowTransport<P>
{ + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} + +impl

<P> TypeConversion<Value, String> for MySQLArrowTransport<P>
{ + fn convert(val: Value) -> String { + to_string(&val).unwrap() + } +} + +impl

<P> TypeConversion<i8, bool> for MySQLArrowTransport<P>
{ + fn convert(val: i8) -> bool { + val != 0 + } +} diff --git a/connectorx/src/transports/mysql_arrow2.rs b/connectorx/src/transports/mysql_arrow2.rs new file mode 100644 index 0000000..23145be --- /dev/null +++ b/connectorx/src/transports/mysql_arrow2.rs @@ -0,0 +1,116 @@ +//! Transport from MySQL Source to Arrow2 Destination. + +use crate::{ + destinations::arrow2::{ + typesystem::Arrow2TypeSystem, Arrow2Destination, Arrow2DestinationError, + }, + impl_transport, + sources::mysql::{ + BinaryProtocol, MySQLSource, MySQLSourceError, MySQLTypeSystem, TextProtocol, + }, + typesystem::TypeConversion, +}; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use num_traits::ToPrimitive; +use rust_decimal::Decimal; +use serde_json::{to_string, Value}; +use std::marker::PhantomData; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum MySQLArrow2TransportError { + #[error(transparent)] + Source(#[from] MySQLSourceError), + + #[error(transparent)] + Destination(#[from] Arrow2DestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert MySQL data types to Arrow2 data types. +pub struct MySQLArrow2Transport

<P>(PhantomData<P>
); + +impl_transport!( + name = MySQLArrow2Transport, + error = MySQLArrow2TransportError, + systems = MySQLTypeSystem => Arrow2TypeSystem, + route = MySQLSource => Arrow2Destination, + mappings = { + { Float[f32] => Float64[f64] | conversion auto } + { Double[f64] => Float64[f64] | conversion auto } + { Tiny[i8] => Int64[i64] | conversion auto } + { Short[i16] => Int64[i64] | conversion auto } + { Int24[i32] => Int64[i64] | conversion none } + { Long[i32] => Int64[i64] | conversion auto } + { LongLong[i64] => Int64[i64] | conversion auto } + { UTiny[u8] => Int64[i64] | conversion auto } + { UShort[u16] => Int64[i64] | conversion auto } + { ULong[u32] => Int64[i64] | conversion auto } + { UInt24[u32] => Int64[i64] | conversion none } + { ULongLong[u64] => Float64[f64] | conversion auto } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Year[i16] => Int64[i64] | conversion none} + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Decimal[Decimal] => Float64[f64] | conversion option } + { VarChar[String] => LargeUtf8[String] | conversion auto } + { Char[String] => LargeUtf8[String] | conversion none } + { Enum[String] => LargeUtf8[String] | conversion none } + { Json[Value] => LargeUtf8[String] | conversion option } + { TinyBlob[Vec] => LargeBinary[Vec] | conversion auto } + { Blob[Vec] => LargeBinary[Vec] | conversion none } + { MediumBlob[Vec] => LargeBinary[Vec] | conversion none } + { LongBlob[Vec] => LargeBinary[Vec] | conversion none } + } +); + +impl_transport!( + name = MySQLArrow2Transport, + error = MySQLArrow2TransportError, + systems = MySQLTypeSystem => Arrow2TypeSystem, + route = MySQLSource => Arrow2Destination, + mappings = { + { Float[f32] => Float64[f64] | conversion auto } + { Double[f64] => Float64[f64] | conversion auto } + { Tiny[i8] => Int64[i64] | conversion auto } + { Short[i16] => Int64[i64] | conversion auto } + { Int24[i32] => Int64[i64] | conversion none } + { Long[i32] => Int64[i64] | conversion auto } + { LongLong[i64] => Int64[i64] | conversion auto } + { UTiny[u8] => Int64[i64] | conversion auto } + { UShort[u16] => Int64[i64] | conversion auto } + { ULong[u32] => Int64[i64] | conversion auto } + { UInt24[u32] => Int64[i64] | conversion none } + { ULongLong[u64] => Float64[f64] | conversion auto } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Year[i16] => Int64[i64] | conversion none} + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Decimal[Decimal] => Float64[f64] | conversion option } + { VarChar[String] => LargeUtf8[String] | conversion auto } + { Char[String] => LargeUtf8[String] | conversion none } + { Enum[String] => LargeUtf8[String] | conversion none } + { Json[Value] => LargeUtf8[String] | conversion option } + { TinyBlob[Vec] => LargeBinary[Vec] | conversion auto } + { Blob[Vec] => LargeBinary[Vec] | conversion none } + { MediumBlob[Vec] => LargeBinary[Vec] | conversion none } + { LongBlob[Vec] => LargeBinary[Vec] | conversion none } + } +); + +impl

<P> TypeConversion<Decimal, f64> for MySQLArrow2Transport<P>
{ + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} + +impl

<P> TypeConversion<Value, String> for MySQLArrow2Transport<P>
{ + fn convert(val: Value) -> String { + to_string(&val).unwrap() + } +} diff --git a/connectorx/src/transports/mysql_arrowstream.rs b/connectorx/src/transports/mysql_arrowstream.rs new file mode 100644 index 0000000..0d0bccd --- /dev/null +++ b/connectorx/src/transports/mysql_arrowstream.rs @@ -0,0 +1,122 @@ +//! Transport from MySQL Source to Arrow Destination. + +use crate::{ + destinations::arrowstream::{ + typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError, + }, + impl_transport, + sources::mysql::{ + BinaryProtocol, MySQLSource, MySQLSourceError, MySQLTypeSystem, TextProtocol, + }, + typesystem::TypeConversion, +}; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use num_traits::ToPrimitive; +use rust_decimal::Decimal; +use serde_json::{to_string, Value}; +use std::marker::PhantomData; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum MySQLArrowTransportError { + #[error(transparent)] + Source(#[from] MySQLSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert MySQL data types to Arrow data types. +pub struct MySQLArrowTransport

<P>(PhantomData<P>
); + +impl_transport!( + name = MySQLArrowTransport, + error = MySQLArrowTransportError, + systems = MySQLTypeSystem => ArrowTypeSystem, + route = MySQLSource => ArrowDestination, + mappings = { + { Float[f32] => Float64[f64] | conversion auto } + { Double[f64] => Float64[f64] | conversion auto } + { Tiny[i8] => Boolean[bool] | conversion option } + { Short[i16] => Int64[i64] | conversion auto } + { Int24[i32] => Int64[i64] | conversion none } + { Long[i32] => Int64[i64] | conversion auto } + { LongLong[i64] => Int64[i64] | conversion auto } + { UTiny[u8] => Int64[i64] | conversion auto } + { UShort[u16] => Int64[i64] | conversion auto } + { ULong[u32] => Int64[i64] | conversion auto } + { UInt24[u32] => Int64[i64] | conversion none } + { ULongLong[u64] => Float64[f64] | conversion auto } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Year[i16] => Int64[i64] | conversion none} + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Decimal[Decimal] => Float64[f64] | conversion option } + { VarChar[String] => LargeUtf8[String] | conversion auto } + { Char[String] => LargeUtf8[String] | conversion none } + { Enum[String] => LargeUtf8[String] | conversion none } + { TinyBlob[Vec] => LargeBinary[Vec] | conversion auto } + { Blob[Vec] => LargeBinary[Vec] | conversion none } + { MediumBlob[Vec] => LargeBinary[Vec] | conversion none } + { LongBlob[Vec] => LargeBinary[Vec] | conversion none } + { Json[Value] => LargeUtf8[String] | conversion option } + } +); + +impl_transport!( + name = MySQLArrowTransport, + error = MySQLArrowTransportError, + systems = MySQLTypeSystem => ArrowTypeSystem, + route = MySQLSource => ArrowDestination, + mappings = { + { Float[f32] => Float64[f64] | conversion auto } + { Double[f64] => Float64[f64] | conversion auto } + { Tiny[i8] => Boolean[bool] | conversion option } + { Short[i16] => Int64[i64] | conversion auto } + { Int24[i32] => Int64[i64] | conversion none } + { Long[i32] => Int64[i64] | conversion auto } + { LongLong[i64] => Int64[i64] | conversion auto } + { UTiny[u8] => Int64[i64] | conversion auto } + { UShort[u16] => Int64[i64] | conversion auto } + { ULong[u32] => Int64[i64] | conversion auto } + { UInt24[u32] => Int64[i64] | conversion none } + { ULongLong[u64] => Float64[f64] | conversion auto } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { Datetime[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Year[i16] => Int64[i64] | conversion none} + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { Decimal[Decimal] => Float64[f64] | conversion option } + { VarChar[String] => LargeUtf8[String] | conversion auto } + { Char[String] => LargeUtf8[String] | conversion none } + { Enum[String] => LargeUtf8[String] | conversion none } + { TinyBlob[Vec] => LargeBinary[Vec] | conversion auto } + { Blob[Vec] => LargeBinary[Vec] | conversion none } + { MediumBlob[Vec] => LargeBinary[Vec] | conversion none } + { LongBlob[Vec] => LargeBinary[Vec] | conversion none } + { Json[Value] => LargeUtf8[String] | conversion option } + } +); + +impl

<P> TypeConversion<Decimal, f64> for MySQLArrowTransport<P>
{ + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} + +impl

<P> TypeConversion<Value, String> for MySQLArrowTransport<P>
{ + fn convert(val: Value) -> String { + to_string(&val).unwrap() + } +} + +impl

<P> TypeConversion<i8, bool> for MySQLArrowTransport<P>
{ + fn convert(val: i8) -> bool { + val != 0 + } +} diff --git a/connectorx/src/transports/oracle_arrow.rs b/connectorx/src/transports/oracle_arrow.rs new file mode 100644 index 0000000..7f08297 --- /dev/null +++ b/connectorx/src/transports/oracle_arrow.rs @@ -0,0 +1,45 @@ +use crate::{ + destinations::arrow::{typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError}, + impl_transport, + sources::oracle::{OracleSource, OracleSourceError, OracleTypeSystem}, + typesystem::TypeConversion, +}; +use chrono::{DateTime, NaiveDateTime, Utc}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum OracleArrowTransportError { + #[error(transparent)] + Source(#[from] OracleSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +pub struct OracleArrowTransport; + +impl_transport!( + name = OracleArrowTransport, + error = OracleArrowTransportError, + systems = OracleTypeSystem => ArrowTypeSystem, + route = OracleSource => ArrowDestination, + mappings = { + { NumFloat[f64] => Float64[f64] | conversion auto } + { Float[f64] => Float64[f64] | conversion none } + { BinaryFloat[f64] => Float64[f64] | conversion none } + { BinaryDouble[f64] => Float64[f64] | conversion none } + { NumInt[i64] => Int64[i64] | conversion auto } + { Blob[Vec] => LargeBinary[Vec] | conversion auto } + { Clob[String] => LargeUtf8[String] | conversion none } + { VarChar[String] => LargeUtf8[String] | conversion auto } + { Char[String] => LargeUtf8[String] | conversion none } + { NVarChar[String] => LargeUtf8[String] | conversion none } + { NChar[String] => LargeUtf8[String] | conversion none } + { Date[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { TimestampTz[DateTime] => DateTimeTz[DateTime] | conversion auto } + } +); diff --git a/connectorx/src/transports/oracle_arrow2.rs b/connectorx/src/transports/oracle_arrow2.rs new file mode 100644 index 0000000..7e351fb --- /dev/null +++ b/connectorx/src/transports/oracle_arrow2.rs @@ -0,0 +1,47 @@ +use crate::{ + destinations::arrow2::{ + typesystem::Arrow2TypeSystem, Arrow2Destination, Arrow2DestinationError, + }, + impl_transport, + sources::oracle::{OracleSource, OracleSourceError, OracleTypeSystem}, + typesystem::TypeConversion, +}; +use chrono::{DateTime, NaiveDateTime, Utc}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum OracleArrow2TransportError { + #[error(transparent)] + Source(#[from] OracleSourceError), + + #[error(transparent)] + Destination(#[from] Arrow2DestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +pub struct OracleArrow2Transport; + +impl_transport!( + name = OracleArrow2Transport, + error = OracleArrow2TransportError, + systems = OracleTypeSystem => Arrow2TypeSystem, + route = OracleSource => Arrow2Destination, + mappings = { + { NumFloat[f64] => Float64[f64] | conversion auto } + { Float[f64] => Float64[f64] | conversion none } + { BinaryFloat[f64] => Float64[f64] | conversion none } + { BinaryDouble[f64] => Float64[f64] | conversion none } + { NumInt[i64] => Int64[i64] | conversion auto } + { Blob[Vec] => LargeBinary[Vec] | conversion auto } + { Clob[String] => LargeUtf8[String] | conversion none } + { VarChar[String] => LargeUtf8[String] | conversion auto } + { Char[String] => LargeUtf8[String] | conversion none } + { NVarChar[String] => LargeUtf8[String] | conversion none } + { 
NChar[String] => LargeUtf8[String] | conversion none } + { Date[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { TimestampTz[DateTime] => DateTimeTz[DateTime] | conversion auto } + } +); diff --git a/connectorx/src/transports/oracle_arrowstream.rs b/connectorx/src/transports/oracle_arrowstream.rs new file mode 100644 index 0000000..c6f53bc --- /dev/null +++ b/connectorx/src/transports/oracle_arrowstream.rs @@ -0,0 +1,47 @@ +use crate::{ + destinations::arrowstream::{ + typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError, + }, + impl_transport, + sources::oracle::{OracleSource, OracleSourceError, OracleTypeSystem}, + typesystem::TypeConversion, +}; +use chrono::{DateTime, NaiveDateTime, Utc}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum OracleArrowTransportError { + #[error(transparent)] + Source(#[from] OracleSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +pub struct OracleArrowTransport; + +impl_transport!( + name = OracleArrowTransport, + error = OracleArrowTransportError, + systems = OracleTypeSystem => ArrowTypeSystem, + route = OracleSource => ArrowDestination, + mappings = { + { NumFloat[f64] => Float64[f64] | conversion auto } + { Float[f64] => Float64[f64] | conversion none } + { BinaryFloat[f64] => Float64[f64] | conversion none } + { BinaryDouble[f64] => Float64[f64] | conversion none } + { NumInt[i64] => Int64[i64] | conversion auto } + { Blob[Vec] => LargeBinary[Vec] | conversion auto } + { Clob[String] => LargeUtf8[String] | conversion none } + { VarChar[String] => LargeUtf8[String] | conversion auto } + { Char[String] => LargeUtf8[String] | conversion none } + { NVarChar[String] => LargeUtf8[String] | conversion none } + { NChar[String] => LargeUtf8[String] | conversion none } + { Date[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion none } + { TimestampTz[DateTime] => DateTimeTz[DateTime] | conversion auto } + } +); diff --git a/connectorx/src/transports/postgres_arrow.rs b/connectorx/src/transports/postgres_arrow.rs new file mode 100644 index 0000000..73c076f --- /dev/null +++ b/connectorx/src/transports/postgres_arrow.rs @@ -0,0 +1,95 @@ +//! Transport from Postgres Source to Arrow Destination. + +use crate::destinations::arrow::{ + typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError, +}; +use crate::sources::postgres::{ + BinaryProtocol, CSVProtocol, CursorProtocol, PostgresSource, PostgresSourceError, + PostgresTypeSystem, SimpleProtocol, +}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use num_traits::ToPrimitive; +use postgres::NoTls; +use postgres_openssl::MakeTlsConnector; +use rust_decimal::Decimal; +use serde_json::Value; +use std::marker::PhantomData; +use thiserror::Error; +use uuid::Uuid; + +#[derive(Error, Debug)] +pub enum PostgresArrowTransportError { + #[error(transparent)] + Source(#[from] PostgresSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert Postgres data types to Arrow data types. +pub struct PostgresArrowTransport(PhantomData
<P>
, PhantomData); + +macro_rules! impl_postgres_transport { + ($proto:ty, $tls:ty) => { + impl_transport!( + name = PostgresArrowTransport<$proto, $tls>, + error = PostgresArrowTransportError, + systems = PostgresTypeSystem => ArrowTypeSystem, + route = PostgresSource<$proto, $tls> => ArrowDestination, + mappings = { + { Float4[f32] => Float64[f64] | conversion auto } + { Float8[f64] => Float64[f64] | conversion auto } + { Numeric[Decimal] => Float64[f64] | conversion option } + { Int2[i16] => Int64[i64] | conversion auto } + { Int4[i32] => Int64[i64] | conversion auto } + { Int8[i64] => Int64[i64] | conversion auto } + { Bool[bool] => Boolean[bool] | conversion auto } + { Text[&'r str] => LargeUtf8[String] | conversion owned } + { BpChar[&'r str] => LargeUtf8[String] | conversion none } + { VarChar[&'r str] => LargeUtf8[String] | conversion none } + { Name[&'r str] => LargeUtf8[String] | conversion none } + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { TimestampTz[DateTime] => DateTimeTz[DateTime] | conversion auto } + { UUID[Uuid] => LargeUtf8[String] | conversion option } + { Char[&'r str] => LargeUtf8[String] | conversion none } + { ByteA[Vec] => LargeBinary[Vec] | conversion auto } + { JSON[Value] => LargeUtf8[String] | conversion option } + { JSONB[Value] => LargeUtf8[String] | conversion none } + } + ); + } +} + +impl_postgres_transport!(BinaryProtocol, NoTls); +impl_postgres_transport!(BinaryProtocol, MakeTlsConnector); +impl_postgres_transport!(CSVProtocol, NoTls); +impl_postgres_transport!(CSVProtocol, MakeTlsConnector); +impl_postgres_transport!(CursorProtocol, NoTls); +impl_postgres_transport!(CursorProtocol, MakeTlsConnector); +impl_postgres_transport!(SimpleProtocol, NoTls); +impl_postgres_transport!(SimpleProtocol, MakeTlsConnector); + +impl TypeConversion for PostgresArrowTransport { + fn convert(val: Uuid) -> String { + val.to_string() + } +} + +impl TypeConversion for PostgresArrowTransport { + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} + +impl TypeConversion for PostgresArrowTransport { + fn convert(val: Value) -> String { + val.to_string() + } +} diff --git a/connectorx/src/transports/postgres_arrow2.rs b/connectorx/src/transports/postgres_arrow2.rs new file mode 100644 index 0000000..7d3f31a --- /dev/null +++ b/connectorx/src/transports/postgres_arrow2.rs @@ -0,0 +1,117 @@ +//! Transport from Postgres Source to Arrow2 Destination. 
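A note on the `conversion` tags in these mapping tables, as they read from this diff: `auto` derives the conversion inside `impl_transport!`, `none` reuses the conversion already defined for the same physical type pair (e.g. `BpChar[&'r str]` rides on the `Text[&'r str]` row above it), `owned` turns a borrowed value into an owned one, and `option` defers to a hand-written `TypeConversion` impl such as the `Decimal -> f64` one that follows each macro. A small sketch of the two hand-written flavors; the `Demo` type and local trait are invented, and this is not the macro's actual expansion:

```rust
use num_traits::ToPrimitive;
use rust_decimal::Decimal;

// Stand-in for connectorx's TypeConversion trait.
trait TypeConversion<T, U> {
    fn convert(val: T) -> U;
}

struct Demo;

// An `owned`-style conversion: borrowed value in, owned value out.
impl<'r> TypeConversion<&'r str, String> for Demo {
    fn convert(val: &'r str) -> String {
        val.to_string()
    }
}

// An `option`-style conversion: the narrowing can fail, so it panics on
// values that do not fit, mirroring the Decimal -> f64 impls above.
impl TypeConversion<Decimal, f64> for Demo {
    fn convert(val: Decimal) -> f64 {
        val.to_f64()
            .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val))
    }
}

fn main() {
    assert_eq!(<Demo as TypeConversion<&str, String>>::convert("abc"), "abc");
    let d = Decimal::new(25, 1); // 2.5
    assert_eq!(<Demo as TypeConversion<Decimal, f64>>::convert(d), 2.5);
}
```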
+ +use crate::destinations::arrow2::{ + typesystem::Arrow2TypeSystem, Arrow2Destination, Arrow2DestinationError, +}; +use crate::sources::postgres::{ + BinaryProtocol, CSVProtocol, CursorProtocol, PostgresSource, PostgresSourceError, + PostgresTypeSystem, SimpleProtocol, +}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use num_traits::ToPrimitive; +use postgres::NoTls; +use postgres_openssl::MakeTlsConnector; +use rust_decimal::Decimal; +use serde_json::Value; +use std::marker::PhantomData; +use thiserror::Error; +use uuid::Uuid; + +#[derive(Error, Debug)] +pub enum PostgresArrow2TransportError { + #[error(transparent)] + Source(#[from] PostgresSourceError), + + #[error(transparent)] + Destination(#[from] Arrow2DestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert Postgres data types to Arrow2 data types. +pub struct PostgresArrow2Transport(PhantomData
<P>
, PhantomData); + +macro_rules! impl_postgres_transport { + ($proto:ty, $tls:ty) => { + impl_transport!( + name = PostgresArrow2Transport<$proto, $tls>, + error = PostgresArrow2TransportError, + systems = PostgresTypeSystem => Arrow2TypeSystem, + route = PostgresSource<$proto, $tls> => Arrow2Destination, + mappings = { + { Float4[f32] => Float32[f32] | conversion auto } + { Float8[f64] => Float64[f64] | conversion auto } + { Numeric[Decimal] => Float64[f64] | conversion option } + { Int2[i16] => Int32[i32] | conversion auto } + { Int4[i32] => Int32[i32] | conversion auto } + { Int8[i64] => Int64[i64] | conversion auto } + { Bool[bool] => Boolean[bool] | conversion auto } + { Text[&'r str] => LargeUtf8[String] | conversion owned } + { BpChar[&'r str] => LargeUtf8[String] | conversion none } + { VarChar[&'r str] => LargeUtf8[String] | conversion none } + { Enum[&'r str] => LargeUtf8[String] | conversion none } + { Name[&'r str] => LargeUtf8[String] | conversion none } + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { TimestampTz[DateTime] => DateTimeTz[DateTime] | conversion auto } + { UUID[Uuid] => LargeUtf8[String] | conversion option } + { Char[&'r str] => LargeUtf8[String] | conversion none } + { ByteA[Vec] => LargeBinary[Vec] | conversion auto } + { JSON[Value] => LargeUtf8[String] | conversion option } + { JSONB[Value] => LargeUtf8[String] | conversion none } + { BoolArray[Vec] => BoolArray[Vec] | conversion auto_vec } + { Int2Array[Vec] => Int64Array[Vec] | conversion auto_vec } + { Int4Array[Vec] => Int64Array[Vec] | conversion auto_vec } + { Int8Array[Vec] => Int64Array[Vec] | conversion auto } + { Float4Array[Vec] => Float64Array[Vec] | conversion auto_vec } + { Float8Array[Vec] => Float64Array[Vec] | conversion auto } + { NumericArray[Vec] => Float64Array[Vec] | conversion option } + { VarcharArray[Vec] => Utf8Array[Vec] | conversion none } + { TextArray[Vec] => Utf8Array[Vec] | conversion auto } + + } + ); + } +} + +impl_postgres_transport!(BinaryProtocol, NoTls); +impl_postgres_transport!(BinaryProtocol, MakeTlsConnector); +impl_postgres_transport!(CSVProtocol, NoTls); +impl_postgres_transport!(CSVProtocol, MakeTlsConnector); +impl_postgres_transport!(CursorProtocol, NoTls); +impl_postgres_transport!(CursorProtocol, MakeTlsConnector); +impl_postgres_transport!(SimpleProtocol, NoTls); +impl_postgres_transport!(SimpleProtocol, MakeTlsConnector); + +impl TypeConversion for PostgresArrow2Transport { + fn convert(val: Uuid) -> String { + val.to_string() + } +} + +impl TypeConversion for PostgresArrow2Transport { + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} + +impl TypeConversion, Vec> for PostgresArrow2Transport { + fn convert(val: Vec) -> Vec { + val.into_iter() + .map(|v| { + v.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", v)) + }) + .collect() + } +} + +impl TypeConversion for PostgresArrow2Transport { + fn convert(val: Value) -> String { + val.to_string() + } +} diff --git a/connectorx/src/transports/postgres_arrowstream.rs b/connectorx/src/transports/postgres_arrowstream.rs new file mode 100644 index 0000000..7d1c20d --- /dev/null +++ b/connectorx/src/transports/postgres_arrowstream.rs @@ -0,0 +1,95 @@ +//! Transport from Postgres Source to Arrow Destination. 
+ +use crate::destinations::arrowstream::{ + typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError, +}; +use crate::sources::postgres::{ + BinaryProtocol, CSVProtocol, CursorProtocol, PostgresSource, PostgresSourceError, + PostgresTypeSystem, SimpleProtocol, +}; +use crate::typesystem::TypeConversion; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use num_traits::ToPrimitive; +use postgres::NoTls; +use postgres_openssl::MakeTlsConnector; +use rust_decimal::Decimal; +use serde_json::Value; +use std::marker::PhantomData; +use thiserror::Error; +use uuid::Uuid; + +#[derive(Error, Debug)] +pub enum PostgresArrowTransportError { + #[error(transparent)] + Source(#[from] PostgresSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert Postgres data types to Arrow data types. +pub struct PostgresArrowTransport(PhantomData
<P>
, PhantomData); + +macro_rules! impl_postgres_transport { + ($proto:ty, $tls:ty) => { + impl_transport!( + name = PostgresArrowTransport<$proto, $tls>, + error = PostgresArrowTransportError, + systems = PostgresTypeSystem => ArrowTypeSystem, + route = PostgresSource<$proto, $tls> => ArrowDestination, + mappings = { + { Float4[f32] => Float64[f64] | conversion auto } + { Float8[f64] => Float64[f64] | conversion auto } + { Numeric[Decimal] => Float64[f64] | conversion option } + { Int2[i16] => Int64[i64] | conversion auto } + { Int4[i32] => Int64[i64] | conversion auto } + { Int8[i64] => Int64[i64] | conversion auto } + { Bool[bool] => Boolean[bool] | conversion auto } + { Text[&'r str] => LargeUtf8[String] | conversion owned } + { BpChar[&'r str] => LargeUtf8[String] | conversion none } + { VarChar[&'r str] => LargeUtf8[String] | conversion none } + { Name[&'r str] => LargeUtf8[String] | conversion none } + { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto } + { Date[NaiveDate] => Date32[NaiveDate] | conversion auto } + { Time[NaiveTime] => Time64[NaiveTime] | conversion auto } + { TimestampTz[DateTime] => DateTimeTz[DateTime] | conversion auto } + { UUID[Uuid] => LargeUtf8[String] | conversion option } + { Char[&'r str] => LargeUtf8[String] | conversion none } + { ByteA[Vec] => LargeBinary[Vec] | conversion auto } + { JSON[Value] => LargeUtf8[String] | conversion option } + { JSONB[Value] => LargeUtf8[String] | conversion none } + } + ); + } +} + +impl_postgres_transport!(BinaryProtocol, NoTls); +impl_postgres_transport!(BinaryProtocol, MakeTlsConnector); +impl_postgres_transport!(CSVProtocol, NoTls); +impl_postgres_transport!(CSVProtocol, MakeTlsConnector); +impl_postgres_transport!(CursorProtocol, NoTls); +impl_postgres_transport!(CursorProtocol, MakeTlsConnector); +impl_postgres_transport!(SimpleProtocol, NoTls); +impl_postgres_transport!(SimpleProtocol, MakeTlsConnector); + +impl TypeConversion for PostgresArrowTransport { + fn convert(val: Uuid) -> String { + val.to_string() + } +} + +impl TypeConversion for PostgresArrowTransport { + fn convert(val: Decimal) -> f64 { + val.to_f64() + .unwrap_or_else(|| panic!("cannot convert decimal {:?} to float64", val)) + } +} + +impl TypeConversion for PostgresArrowTransport { + fn convert(val: Value) -> String { + val.to_string() + } +} diff --git a/connectorx/src/transports/sqlite_arrow.rs b/connectorx/src/transports/sqlite_arrow.rs new file mode 100644 index 0000000..d410738 --- /dev/null +++ b/connectorx/src/transports/sqlite_arrow.rs @@ -0,0 +1,50 @@ +//! Transport from SQLite Source to Arrow Destination. + +use crate::{ + destinations::arrow::{typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError}, + impl_transport, + sources::sqlite::{SQLiteSource, SQLiteSourceError, SQLiteTypeSystem}, + typesystem::TypeConversion, +}; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum SQLiteArrowTransportError { + #[error(transparent)] + Source(#[from] SQLiteSourceError), + + #[error(transparent)] + Destination(#[from] ArrowDestinationError), + + #[error(transparent)] + ConnectorX(#[from] crate::errors::ConnectorXError), +} + +/// Convert SQLite data types to Arrow data types. 
+pub struct SQLiteArrowTransport;
+
+impl_transport!(
+    name = SQLiteArrowTransport,
+    error = SQLiteArrowTransportError,
+    systems = SQLiteTypeSystem => ArrowTypeSystem,
+    route = SQLiteSource => ArrowDestination,
+    mappings = {
+        { Bool[bool] => Boolean[bool] | conversion auto }
+        { Int8[i64] => Int64[i64] | conversion auto }
+        { Int4[i32] => Int64[i64] | conversion auto }
+        { Int2[i16] => Int64[i64] | conversion auto }
+        { Real[f64] => Float64[f64] | conversion auto }
+        { Text[Box<str>] => LargeUtf8[String] | conversion option }
+        { Blob[Vec<u8>] => LargeBinary[Vec<u8>] | conversion auto }
+        { Date[NaiveDate] => Date32[NaiveDate] | conversion auto }
+        { Time[NaiveTime] => Time64[NaiveTime] | conversion auto }
+        { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto }
+    }
+);
+
+impl TypeConversion<Box<str>, String> for SQLiteArrowTransport {
+    fn convert(val: Box<str>) -> String {
+        val.to_string()
+    }
+}
diff --git a/connectorx/src/transports/sqlite_arrow2.rs b/connectorx/src/transports/sqlite_arrow2.rs
new file mode 100644
index 0000000..caeb1ed
--- /dev/null
+++ b/connectorx/src/transports/sqlite_arrow2.rs
@@ -0,0 +1,52 @@
+//! Transport from SQLite Source to Arrow2 Destination.
+
+use crate::{
+    destinations::arrow2::{
+        typesystem::Arrow2TypeSystem, Arrow2Destination, Arrow2DestinationError,
+    },
+    impl_transport,
+    sources::sqlite::{SQLiteSource, SQLiteSourceError, SQLiteTypeSystem},
+    typesystem::TypeConversion,
+};
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum SQLiteArrow2TransportError {
+    #[error(transparent)]
+    Source(#[from] SQLiteSourceError),
+
+    #[error(transparent)]
+    Destination(#[from] Arrow2DestinationError),
+
+    #[error(transparent)]
+    ConnectorX(#[from] crate::errors::ConnectorXError),
+}
+
+/// Convert SQLite data types to Arrow2 data types.
+pub struct SQLiteArrow2Transport;
+
+impl_transport!(
+    name = SQLiteArrow2Transport,
+    error = SQLiteArrow2TransportError,
+    systems = SQLiteTypeSystem => Arrow2TypeSystem,
+    route = SQLiteSource => Arrow2Destination,
+    mappings = {
+        { Bool[bool] => Boolean[bool] | conversion auto }
+        { Int8[i64] => Int64[i64] | conversion auto }
+        { Int4[i32] => Int64[i64] | conversion auto }
+        { Int2[i16] => Int64[i64] | conversion auto }
+        { Real[f64] => Float64[f64] | conversion auto }
+        { Text[Box<str>] => LargeUtf8[String] | conversion option }
+        { Blob[Vec<u8>] => LargeBinary[Vec<u8>] | conversion auto }
+        { Date[NaiveDate] => Date32[NaiveDate] | conversion auto }
+        { Time[NaiveTime] => Time64[NaiveTime] | conversion auto }
+        { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto }
+    }
+);
+
+impl TypeConversion<Box<str>, String> for SQLiteArrow2Transport {
+    fn convert(val: Box<str>) -> String {
+        val.to_string()
+    }
+}
diff --git a/connectorx/src/transports/sqlite_arrowstream.rs b/connectorx/src/transports/sqlite_arrowstream.rs
new file mode 100644
index 0000000..0c23db5
--- /dev/null
+++ b/connectorx/src/transports/sqlite_arrowstream.rs
@@ -0,0 +1,52 @@
+//! Transport from SQLite Source to Arrow Destination.
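+//!
+//! A minimal wiring sketch (illustrative only: the database path, the query, and
+//! the `SQLiteSource::new` partition-count argument are assumptions here; the
+//! `Dispatcher` usage mirrors how the integration tests drive the other sources):
+//!
+//! ```ignore
+//! use connectorx::prelude::*;
+//! use connectorx::destinations::arrowstream::ArrowDestination;
+//! use connectorx::sources::sqlite::SQLiteSource;
+//! use connectorx::sql::CXQuery;
+//!
+//! let source = SQLiteSource::new("/path/to/db.sqlite", 1)?; // assumed path
+//! let mut destination = ArrowDestination::new();
+//! let queries = [CXQuery::naked("select * from some_table")];
+//! let dispatcher =
+//!     Dispatcher::<_, _, SQLiteArrowTransport>::new(source, &mut destination, &queries, None);
+//! dispatcher.run()?; // record batches can now be pulled from `destination`
+//! ```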
+
+use crate::{
+    destinations::arrowstream::{
+        typesystem::ArrowTypeSystem, ArrowDestination, ArrowDestinationError,
+    },
+    impl_transport,
+    sources::sqlite::{SQLiteSource, SQLiteSourceError, SQLiteTypeSystem},
+    typesystem::TypeConversion,
+};
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum SQLiteArrowTransportError {
+    #[error(transparent)]
+    Source(#[from] SQLiteSourceError),
+
+    #[error(transparent)]
+    Destination(#[from] ArrowDestinationError),
+
+    #[error(transparent)]
+    ConnectorX(#[from] crate::errors::ConnectorXError),
+}
+
+/// Convert SQLite data types to Arrow data types.
+pub struct SQLiteArrowTransport;
+
+impl_transport!(
+    name = SQLiteArrowTransport,
+    error = SQLiteArrowTransportError,
+    systems = SQLiteTypeSystem => ArrowTypeSystem,
+    route = SQLiteSource => ArrowDestination,
+    mappings = {
+        { Bool[bool] => Boolean[bool] | conversion auto }
+        { Int8[i64] => Int64[i64] | conversion auto }
+        { Int4[i32] => Int64[i64] | conversion auto }
+        { Int2[i16] => Int64[i64] | conversion auto }
+        { Real[f64] => Float64[f64] | conversion auto }
+        { Text[Box<str>] => LargeUtf8[String] | conversion option }
+        { Blob[Vec<u8>] => LargeBinary[Vec<u8>] | conversion auto }
+        { Date[NaiveDate] => Date32[NaiveDate] | conversion auto }
+        { Time[NaiveTime] => Time64[NaiveTime] | conversion auto }
+        { Timestamp[NaiveDateTime] => Date64[NaiveDateTime] | conversion auto }
+    }
+);
+
+impl TypeConversion<Box<str>, String> for SQLiteArrowTransport {
+    fn convert(val: Box<str>) -> String {
+        val.to_string()
+    }
+}
diff --git a/connectorx/src/typesystem.rs b/connectorx/src/typesystem.rs
new file mode 100644
index 0000000..d1716ff
--- /dev/null
+++ b/connectorx/src/typesystem.rs
@@ -0,0 +1,143 @@
+//! This module defines the traits that are required to define a typesystem.
+//!
+//! A typesystem is an enum that describes what types can be produced by a source and accepted by a destination.
+//! A typesystem also needs to implement [`TypeAssoc`] to associate the enum variants to the physical representation
+//! of the types in the typesystem.
+
+use crate::destinations::{Consume, Destination, DestinationPartition};
+use crate::errors::{ConnectorXError, Result as CXResult};
+use crate::sources::{PartitionParser, Produce, Source, SourcePartition};
+
+#[doc(hidden)]
+/// `TypeSystem` describes all the types a source or destination supports
+/// using enum variants.
+/// A variant can be type-checked against a static type `T` through the `check` method.
+pub trait TypeSystem: Copy + Clone + Send + Sync {
+    /// Check whether T is the same type as defined by self.
+    fn check<T: TypeAssoc<Self>>(self) -> CXResult<()> {
+        T::check(self)
+    }
+}
+
+#[doc(hidden)]
+/// Associate a static type to a TypeSystem
+pub trait TypeAssoc<TS: TypeSystem> {
+    fn check(ts: TS) -> CXResult<()>;
+}
+
+#[doc(hidden)]
+/// Realize means that a TypeSystem can realize a parameterized func F, based on its current variants.
+pub trait Realize<F>
+where
+    F: ParameterizedFunc,
+{
+    /// realize a parameterized function with the type that self currently is.
+    fn realize(self) -> CXResult<F::Function>;
+}
+
+#[doc(hidden)]
+/// A ParameterizedFunc refers to a function that is parameterized on a type T,
+/// where type T will be dynamically determined by the variant of a TypeSystem.
+/// An example is the `transmit` function. When piping values from a source
+/// to the destination, its type `T` is determined by the schema at runtime.
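+/// Schematically (an illustration, not code from this crate): realizing
+/// `transmit` on the `Int64` variant of a typesystem hands back the function
+/// pointer monomorphized for `i64`.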
+pub trait ParameterizedFunc { + type Function; + fn realize() -> Self::Function + where + Self: ParameterizedOn, + { + Self::parameterize() + } +} + +#[doc(hidden)] +/// `ParameterizedOn` indicates a parameterized function `Self` +/// is parameterized on type `T` +pub trait ParameterizedOn: ParameterizedFunc { + fn parameterize() -> Self::Function; +} + +/// Defines a rule to convert a type `T` to a type `U`. +pub trait TypeConversion { + fn convert(val: T) -> U; +} + +/// Transport asks the source to produce a value, do type conversion and then write +/// the value to a destination. Do not manually implement this trait for types. +/// Use [`impl_transport!`] to create a struct that implements this trait instead. +pub trait Transport { + type TSS: TypeSystem; + type TSD: TypeSystem; + type S: Source; + type D: Destination; + type Error: From + + From<::Error> + + From<::Error> + + Send + + std::fmt::Debug; + + /// convert_typesystem convert the source type system TSS to the destination + /// type system TSD. + fn convert_typesystem(ts: Self::TSS) -> CXResult; + + /// convert_type convert the type T1 associated with the source type system + /// TSS to a type T2 which is associated with the destination type system TSD. + fn convert_type(val: T1) -> T2 + where + Self: TypeConversion, + { + >::convert(val) + } + + /// `process` will ask source to produce a value with type T1, based on TSS, and then do + /// type conversion using `convert_type` to get value with type T2, which is associated to + /// TSD. Finally, it will write the value with type T2 to the destination. + fn process<'s, 'd, 'r>( + ts1: Self::TSS, + ts2: Self::TSD, + src: &'r mut <::Partition as SourcePartition>::Parser<'s>, + dst: &'r mut ::Partition<'d>, + ) -> Result<(), Self::Error> + where + Self: 'd; + + #[allow(clippy::type_complexity)] + fn processor<'s, 'd>( + ts1: Self::TSS, + ts2: Self::TSD, + ) -> CXResult< + fn( + src: &mut <::Partition as SourcePartition>::Parser<'s>, + dst: &mut ::Partition<'d>, + ) -> Result<(), Self::Error>, + > + where + Self: 'd; +} + +#[doc(hidden)] +pub fn process<'s, 'd, 'r, T1, T2, TP, S, D, ES, ED, ET>( + src: &'r mut <::Partition as SourcePartition>::Parser<'s>, + dst: &'r mut ::Partition<'d>, +) -> Result<(), ET> +where + T1: TypeAssoc<::TypeSystem>, + S: Source, + ::Partition: SourcePartition, + + <::Partition as SourcePartition>::Parser<'s>: Produce<'r, T1, Error = ES>, + ES: From + Send, + + T2: TypeAssoc<::TypeSystem>, + D: Destination, + ::Partition<'d>: Consume, + ED: From + Send, + + TP: TypeConversion, + ET: From + From, +{ + let val: T1 = PartitionParser::parse(src)?; + let val: T2 = >::convert(val); + DestinationPartition::write(dst, val)?; + Ok(()) +} diff --git a/connectorx/src/utils.rs b/connectorx/src/utils.rs new file mode 100644 index 0000000..ec3919e --- /dev/null +++ b/connectorx/src/utils.rs @@ -0,0 +1,17 @@ +use std::ops::{Deref, DerefMut}; + +pub struct DummyBox(pub T); + +impl Deref for DummyBox { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DummyBox { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/connectorx/tests/data/empty.csv b/connectorx/tests/data/empty.csv new file mode 100644 index 0000000..e69de29 diff --git a/connectorx/tests/data/infer_0.csv b/connectorx/tests/data/infer_0.csv new file mode 100644 index 0000000..9f1a144 --- /dev/null +++ b/connectorx/tests/data/infer_0.csv @@ -0,0 +1,5 @@ +c0,c1,c2,c3,c4,c5,c6,c7 +0,1.1,true,Nick,4,true,2015-05-18,2020-03-12T19:39:49 
+1,2.2,,Charlie,5,6,2015/05/18,2019-10-12T01:39:49 +2,3.3,false,,6,,2015.05.18, +3,4.4,false,Jess,7.1,,2015May18,abc2020-03-12T19:39:49 \ No newline at end of file diff --git a/connectorx/tests/data/uint_0.csv b/connectorx/tests/data/uint_0.csv new file mode 100644 index 0000000..200c7b5 --- /dev/null +++ b/connectorx/tests/data/uint_0.csv @@ -0,0 +1,5 @@ +c1,c2,c3,c4,c5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 +15,16,17,18,19 \ No newline at end of file diff --git a/connectorx/tests/data/uint_1.csv b/connectorx/tests/data/uint_1.csv new file mode 100644 index 0000000..8436934 --- /dev/null +++ b/connectorx/tests/data/uint_1.csv @@ -0,0 +1,8 @@ +c1,c2,c3,c4,c5 +20,21,22,23,24 +25,26,27,28,29 +30,31,32,33,34 +35,36,37,38,39 +40,41,42,43,44 +45,46,47,48,49 +50,51,52,53,54 \ No newline at end of file diff --git a/connectorx/tests/data/uspop_0.csv b/connectorx/tests/data/uspop_0.csv new file mode 100644 index 0000000..a013347 --- /dev/null +++ b/connectorx/tests/data/uspop_0.csv @@ -0,0 +1,4 @@ +Location,State,Zip,Lat,Long +Kenai,AK,7610,60.5544444,-151.2583333 +Selma,AL,18980,32.4072222,-87.0211111 +El Mirage,AZ,32308,33.6130556,-112.3238889 \ No newline at end of file diff --git a/connectorx/tests/test_arrow.rs b/connectorx/tests/test_arrow.rs new file mode 100644 index 0000000..31c93fa --- /dev/null +++ b/connectorx/tests/test_arrow.rs @@ -0,0 +1,293 @@ +use arrow::{ + array::{BooleanArray, Float64Array, Int64Array, StringArray}, + record_batch::RecordBatch, +}; +use connectorx::{ + constants::RECORD_BATCH_SIZE, + destinations::arrow::{ArrowDestination, ArrowTypeSystem}, + prelude::*, + sources::{ + dummy::{DummySource, DummyTypeSystem}, + postgres::{rewrite_tls_args, BinaryProtocol, PostgresSource}, + }, + sql::CXQuery, + transports::{DummyArrowTransport, PostgresArrowTransport}, +}; +use postgres::NoTls; +use std::env; +use url::Url; + +#[test] +#[should_panic] +fn arrow_destination_col_major() { + let mut dw = ArrowDestination::new(); + let _ = dw + .allocate( + 11, + &["a", "b", "c"], + &[ + ArrowTypeSystem::Int64(false), + ArrowTypeSystem::Float64(true), + ArrowTypeSystem::LargeUtf8(true), + ], + DataOrder::ColumnMajor, + ) + .unwrap(); +} + +#[test] +fn test_arrow() { + let schema = [ + DummyTypeSystem::I64(true), + DummyTypeSystem::F64(true), + DummyTypeSystem::Bool(false), + DummyTypeSystem::String(true), + DummyTypeSystem::F64(false), + ]; + let nrows = vec![4, 7]; + let ncols = schema.len(); + let queries: Vec = nrows + .iter() + .map(|v| CXQuery::naked(format!("{},{}", v, ncols))) + .collect(); + let mut destination = ArrowDestination::new(); + + let dispatcher = Dispatcher::<_, _, DummyArrowTransport>::new( + DummySource::new(&["a", "b", "c", "d", "e"], &schema), + &mut destination, + &queries, + None, + ); + dispatcher.run().expect("run dispatcher"); + + let records: Vec = destination.arrow().unwrap(); + assert_eq!(2, records.len()); + + for r in records { + match r.num_rows() { + 4 => { + assert!(r + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![0, 1, 2, 3]))); + + assert!(r + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![0.0, 1.0, 2.0, 3.0]))); + assert!(r + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![true, false, true, false]))); + assert!(r + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec!["0", "1", "2", "3"]))); + assert!(r + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![0.0, 1.0, 
2.0, 3.0]))); + } + 7 => { + assert!(r + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![0, 1, 2, 3, 4, 5, 6]))); + assert!(r + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]))); + + assert!(r + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![ + true, false, true, false, true, false, true + ]))); + assert!(r + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec!["0", "1", "2", "3", "4", "5", "6"]))); + assert!(r + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]))); + } + _ => { + println!("got a batch record with {} rows", r.num_rows()); + unreachable!(); + } + } + } +} + +#[test] +fn test_arrow_large() { + let schema = [ + DummyTypeSystem::I64(true), + DummyTypeSystem::F64(true), + DummyTypeSystem::Bool(false), + DummyTypeSystem::String(true), + DummyTypeSystem::F64(false), + ]; + let nrows = vec![RECORD_BATCH_SIZE * 2 + 1, RECORD_BATCH_SIZE * 2 - 1]; + let ncols = schema.len(); + let queries: Vec = nrows + .iter() + .map(|v| CXQuery::naked(format!("{},{}", v, ncols))) + .collect(); + let mut destination = ArrowDestination::new(); + + let dispatcher = Dispatcher::<_, _, DummyArrowTransport>::new( + DummySource::new(&["a", "b", "c", "d", "e"], &schema), + &mut destination, + &queries, + None, + ); + dispatcher.run().expect("run dispatcher"); + + let records: Vec = destination.arrow().unwrap(); + assert_eq!(5, records.len()); + let mut rsizes = vec![]; + for r in records { + rsizes.push(r.num_rows()); + } + rsizes.sort(); + assert_eq!( + vec![ + 1, + RECORD_BATCH_SIZE - 1, + RECORD_BATCH_SIZE, + RECORD_BATCH_SIZE, + RECORD_BATCH_SIZE + ], + rsizes + ); +} + +#[test] +fn test_postgres_arrow() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + + let queries = [ + CXQuery::naked("select * from test_table where test_int < 2"), + CXQuery::naked("select * from test_table where test_int >= 2"), + ]; + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let builder = PostgresSource::::new(config, NoTls, 2).unwrap(); + let mut destination = ArrowDestination::new(); + let dispatcher = Dispatcher::<_, _, PostgresArrowTransport>::new( + builder, + &mut destination, + &queries, + Some(format!("select * from test_table")), + ); + + dispatcher.run().expect("run dispatcher"); + + let records: Vec = destination.arrow().unwrap(); + assert_eq!(2, records.len()); + + for r in records { + match r.num_rows() { + 2 => { + assert!(r + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![1, 0]))); + assert!(r + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![3, 5]))); + assert!(r + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec!["str1", "a"]))); + assert!(r + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![None, Some(3.1)]))); + assert!(r + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![Some(true), None]))); + } + 4 => { + assert!(r + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![2, 3, 4, 1314]))); + assert!(r + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![None, Some(7), Some(9), Some(2)]))); + 
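+                // the 4-row batch: the remaining string, float and bool columns
+                // must come through with their NULLs intact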
assert!(r + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec![ + Some("str2"), + Some("b"), + Some("c"), + None + ]))); + assert!(r + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![2.2, 3., 7.8, -10.]))); + assert!(r + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![ + Some(false), + Some(false), + None, + Some(true) + ]))); + } + _ => unreachable!(), + } + } +} diff --git a/connectorx/tests/test_bigquery.rs b/connectorx/tests/test_bigquery.rs new file mode 100644 index 0000000..a8eb04e --- /dev/null +++ b/connectorx/tests/test_bigquery.rs @@ -0,0 +1,37 @@ +use connectorx::{ + destinations::arrow::ArrowDestination, prelude::*, sources::bigquery::BigQuerySource, + sql::CXQuery, transports::BigQueryArrowTransport, +}; +use std::env; +use std::sync::Arc; +use tokio::runtime::Runtime; + +#[test] +#[ignore] +fn test_source() { + let dburl = env::var("BIGQUERY_URL").unwrap(); + let rt = Arc::new(Runtime::new().unwrap()); + let mut source = BigQuerySource::new(rt, &dburl).unwrap(); + source.set_queries(&[ + CXQuery::naked("SELECT * FROM (SELECT * FROM `dataprep-bigquery.dataprep.lineitem` LIMIT 1000) AS CXTMPTAB_PART WHERE 1281 <= CXTMPTAB_PART.L_ORDERKEY AND CXTMPTAB_PART.L_ORDERKEY < 19419500"), + CXQuery::naked("SELECT * FROM (SELECT * FROM `dataprep-bigquery.dataprep.lineitem` LIMIT 1000) AS CXTMPTAB_PART WHERE 19419500 <= CXTMPTAB_PART.L_ORDERKEY AND CXTMPTAB_PART.L_ORDERKEY < 38837719"), + CXQuery::naked("SELECT * FROM (SELECT * FROM `dataprep-bigquery.dataprep.lineitem` LIMIT 1000) AS CXTMPTAB_PART WHERE 38837719 <= CXTMPTAB_PART.L_ORDERKEY AND CXTMPTAB_PART.L_ORDERKEY < 58255940"),]); + source.fetch_metadata().unwrap(); +} + +#[test] +#[ignore] +fn test_bigquery_partition() { + let dburl = env::var("BIGQUERY_URL").unwrap(); + let rt = Arc::new(Runtime::new().unwrap()); + let source = BigQuerySource::new(rt, &dburl).unwrap(); + let queries = [ + CXQuery::naked("SELECT * FROM (SELECT * FROM `dataprep-bigquery.dataprep.lineitem` LIMIT 1000) AS CXTMPTAB_PART WHERE 1281 <= CXTMPTAB_PART.L_ORDERKEY AND CXTMPTAB_PART.L_ORDERKEY < 29128610"), + CXQuery::naked("SELECT * FROM (SELECT * FROM `dataprep-bigquery.dataprep.lineitem` LIMIT 1000) AS CXTMPTAB_PART WHERE 29128610 <= CXTMPTAB_PART.L_ORDERKEY AND CXTMPTAB_PART.L_ORDERKEY < 58255940"), + ]; + let mut destination = ArrowDestination::new(); + let dispatcher = + Dispatcher::<_, _, BigQueryArrowTransport>::new(source, &mut destination, &queries, None); + dispatcher.run().unwrap(); + let _result = destination.arrow().unwrap(); +} diff --git a/connectorx/tests/test_csv.rs b/connectorx/tests/test_csv.rs new file mode 100644 index 0000000..9da3749 --- /dev/null +++ b/connectorx/tests/test_csv.rs @@ -0,0 +1,165 @@ +use arrow::array::Int64Array; +use connectorx::prelude::*; +use connectorx::{ + destinations::arrow::{ArrowDestination, ArrowTypeSystem}, + sources::{ + csv::{CSVSource, CSVTypeSystem}, + PartitionParser, + }, + sql::CXQuery, + transports::CSVArrowTransport, +}; + +#[test] +#[should_panic] +fn no_file() { + let mut source = CSVSource::new(&[]); + source.set_queries(&[CXQuery::naked("./a_fake_file.csv")]); + let partitions = source.partition().unwrap(); + for mut p in partitions { + p.result_rows().expect("run query"); + } +} + +#[test] +#[should_panic] +fn empty_file() { + let mut source = CSVSource::new(&[]); + source.set_queries(&[CXQuery::naked("./tests/data/empty.csv")]); + let mut partitions = source.partition().unwrap(); + 
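+    // an empty CSV still "runs": the partition reports zero rows and columns,
+    // but producing a value from its parser should panic (hence #[should_panic])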
for p in &mut partitions { + p.result_rows().expect("run query"); + } + assert_eq!(0, partitions[0].nrows()); + assert_eq!(0, partitions[0].ncols()); + let mut parser = partitions[0].parser().unwrap(); + + parser.fetch_next().unwrap(); + + let _v: i64 = parser.produce().expect("produce from emtpy"); +} + +#[test] +fn load_and_parse() { + #[derive(Debug, PartialEq)] + enum Value { + City(String), + State(String), + Population(i64), + Longitude(f64), + Latitude(f64), + } + + let mut source = CSVSource::new(&[ + CSVTypeSystem::String(false), + CSVTypeSystem::String(false), + CSVTypeSystem::I64(false), + CSVTypeSystem::F64(false), + CSVTypeSystem::F64(false), + ]); + source.set_queries(&[CXQuery::naked("./tests/data/uspop_0.csv")]); + + let mut partitions = source.partition().unwrap(); + + let mut partition = partitions.remove(0); + partition.result_rows().expect("run query"); + + assert_eq!(3, partition.nrows()); + assert_eq!(5, partition.ncols()); + + let mut results: Vec = Vec::new(); + let mut parser = partition.parser().unwrap(); + loop { + let (n, is_last) = parser.fetch_next().unwrap(); + for _i in 0..n { + results.push(Value::City(parser.produce().expect("parse city"))); + results.push(Value::State(parser.produce().expect("parse state"))); + results.push(Value::Population( + parser.produce().expect("parse population"), + )); + results.push(Value::Longitude(parser.produce().expect("parse longitude"))); + results.push(Value::Latitude(parser.produce().expect("parse latitude"))); + } + if is_last { + break; + } + } + assert_eq!( + vec![ + Value::City(String::from("Kenai")), + Value::State(String::from("AK")), + Value::Population(7610), + Value::Longitude(60.5544444), + Value::Latitude(-151.2583333), + Value::City(String::from("Selma")), + Value::State(String::from("AL")), + Value::Population(18980), + Value::Longitude(32.4072222), + Value::Latitude(-87.0211111), + Value::City(String::from("El Mirage")), + Value::State(String::from("AZ")), + Value::Population(32308), + Value::Longitude(33.6130556), + Value::Latitude(-112.3238889) + ], + results + ); +} + +#[test] +fn test_csv() { + let schema = [CSVTypeSystem::I64(false); 5]; + let files = [ + CXQuery::naked("./tests/data/uint_0.csv"), + CXQuery::naked("./tests/data/uint_1.csv"), + ]; + let source = CSVSource::new(&schema); + + let mut destination = ArrowDestination::new(); + let dispatcher = + Dispatcher::<_, _, CSVArrowTransport>::new(source, &mut destination, &files, None); + + dispatcher.run().expect("run dispatcher"); + + let result = destination.arrow().unwrap(); + + println!("result len: {}", result.len()); + assert!(result.len() == 2); + + for rb in result { + for i in 0..5 { + let col = rb.column(i).as_any().downcast_ref::().unwrap(); + assert!( + col.eq(&Int64Array::from_iter_values( + (4i64..=10).map(|v| v * 5 + i as i64), + )) || col.eq(&Int64Array::from_iter_values( + (0i64..4).map(|v| v * 5 + i as i64), + )) + ); + } + } +} + +#[test] +fn test_csv_infer_schema() { + let files = [CXQuery::naked("./tests/data/infer_0.csv")]; + let source = CSVSource::new(&[]); + + let mut writer = ArrowDestination::new(); + let dispatcher = Dispatcher::<_, _, CSVArrowTransport>::new(source, &mut writer, &files, None); + + dispatcher.run().expect("run dispatcher"); + + let expected_schema = vec![ + ArrowTypeSystem::Int64(false), + ArrowTypeSystem::Float64(false), + ArrowTypeSystem::Boolean(true), + ArrowTypeSystem::LargeUtf8(true), + ArrowTypeSystem::Float64(false), + ArrowTypeSystem::LargeUtf8(true), + ArrowTypeSystem::LargeUtf8(false), + 
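+        // c6 infers as a non-nullable string (every row has a value), while c7
+        // below is nullable because one sample row leaves it empty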
ArrowTypeSystem::LargeUtf8(true), + ]; + + assert_eq!(expected_schema, writer.schema()); +} diff --git a/connectorx/tests/test_fed.rs b/connectorx/tests/test_fed.rs new file mode 100644 index 0000000..cfa200a --- /dev/null +++ b/connectorx/tests/test_fed.rs @@ -0,0 +1,21 @@ +use connectorx::fed_dispatcher::run; +use std::collections::HashMap; +use std::env; + +#[test] +#[ignore] +fn test_fed() { + let _ = env_logger::builder().is_test(true).try_init(); + + let sql = "select test_bool, AVG(test_float) as avg_float, SUM(test_int) as sum_int from db1.test_table as a, db2.test_str as b where a.test_int = b.id AND test_nullint is not NULL GROUP BY test_bool ORDER BY sum_int"; + let db_map = HashMap::from([ + (String::from("db1"), env::var("DB1").unwrap()), + (String::from("db2"), env::var("DB2").unwrap()), + ]); + + println!("db_map: {:?}", db_map); + + // make sure no error here + let rbs = run(sql.to_string(), db_map, None).unwrap(); + arrow::util::pretty::print_batches(&rbs).unwrap(); +} diff --git a/connectorx/tests/test_mssql.rs b/connectorx/tests/test_mssql.rs new file mode 100644 index 0000000..ba241af --- /dev/null +++ b/connectorx/tests/test_mssql.rs @@ -0,0 +1,178 @@ +use arrow::{ + array::{BooleanArray, Float64Array, Int64Array, StringArray}, + record_batch::RecordBatch, +}; +use connectorx::{ + destinations::arrow::ArrowDestination, prelude::*, sources::mssql::MsSQLSource, sql::CXQuery, + transports::MsSQLArrowTransport, +}; +use std::env; +use std::sync::Arc; +use tokio::runtime::Runtime; + +#[test] +fn test_mssql() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("MSSQL_URL").unwrap(); + + let queries = [ + CXQuery::naked("select * from test_table where test_int < 2"), + CXQuery::naked("select * from test_table where test_int >= 2"), + ]; + let rt = Arc::new(Runtime::new().unwrap()); + + let builder = MsSQLSource::new(rt, &dburl, 2).unwrap(); + let mut destination = ArrowDestination::new(); + let dispatcher = + Dispatcher::<_, _, MsSQLArrowTransport>::new(builder, &mut destination, &queries, None); + dispatcher.run().unwrap(); + + let result = destination.arrow().unwrap(); + verify_arrow_results(result); +} + +#[test] +fn test_mssql_agg() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("MSSQL_URL").unwrap(); + + let queries = [CXQuery::naked( + "SELECT test_bool, SUM(test_float) AS SUM FROM test_table GROUP BY test_bool", + )]; + let rt = Arc::new(Runtime::new().unwrap()); + + let builder = MsSQLSource::new(rt, &dburl, 1).unwrap(); + let mut destination = ArrowDestination::new(); + let dispatcher = Dispatcher::<_, _, MsSQLArrowTransport>::new( + builder, + &mut destination, + &queries, + Some(String::from( + "SELECT test_bool, SUM(test_float) AS SUM FROM test_table GROUP BY test_bool", + )), + ); + dispatcher.run().unwrap(); + + let mut result = destination.arrow().unwrap(); + assert!(result.len() == 1); + let rb = result.pop().unwrap(); + assert!(rb.columns().len() == 2); + + assert!(rb + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![None, Some(false), Some(true)]))); + + assert!(rb + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![ + Some(10.9), + Some(5.2), + Some(-10.0), + ]))); +} + +pub fn verify_arrow_results(result: Vec) { + assert!(result.len() == 2); + + for rb in result { + assert!(rb.columns().len() == 5); + match rb.num_rows() { + 2 => { + assert!(rb + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + 
.eq(&Int64Array::from(vec![1, 0]))); + + assert!(rb + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![Some(3), Some(5)]))); + + assert!(rb + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec![Some("str1"), Some("a"),]))); + + assert!(rb + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![None, Some(3.1_f64)]))); + + assert!(rb + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![Some(true), None]))); + } + 4 => { + assert!(rb + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![2, 3, 4, 1314]))); + + assert!(rb + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![None, Some(7), Some(9), Some(2)]))); + + assert!(rb + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec![ + Some("str2"), + Some("b"), + Some("c"), + None, + ]))); + + assert!(rb + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![ + Some(2.2_f64), + Some(3_f64), + Some(7.8_f64), + Some(-10_f64), + ]))); + + assert!(rb + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![ + Some(false), + Some(false), + None, + Some(true), + ]))); + } + _ => unreachable!(), + } + } +} diff --git a/connectorx/tests/test_mysql.rs b/connectorx/tests/test_mysql.rs new file mode 100644 index 0000000..4bc21bc --- /dev/null +++ b/connectorx/tests/test_mysql.rs @@ -0,0 +1,127 @@ +use arrow::{ + array::{Float64Array, Int64Array, StringArray}, + record_batch::RecordBatch, +}; +use connectorx::{ + destinations::arrow::ArrowDestination, + prelude::*, + sources::mysql::{BinaryProtocol, MySQLSource, TextProtocol}, + sql::CXQuery, + transports::MySQLArrowTransport, +}; +use std::env; + +#[test] +fn test_mysql() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("MYSQL_URL").unwrap(); + + let queries = [ + CXQuery::naked("select * from test_table where test_int <= 2"), + CXQuery::naked("select * from test_table where test_int > 2"), + ]; + + let builder = MySQLSource::::new(&dburl, 2).unwrap(); + let mut destination = ArrowDestination::new(); + let dispatcher = Dispatcher::<_, _, MySQLArrowTransport>::new( + builder, + &mut destination, + &queries, + Some(String::from("select * from test_table")), + ); + dispatcher.run().unwrap(); + + let result = destination.arrow().unwrap(); + verify_arrow_results(result); +} + +#[test] +fn test_mysql_text() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("MYSQL_URL").unwrap(); + + let queries = [ + CXQuery::naked("select * from test_table where test_int <= 2"), + CXQuery::naked("select * from test_table where test_int > 2"), + ]; + + let builder = MySQLSource::::new(&dburl, 2).unwrap(); + let mut destination = ArrowDestination::new(); + let dispatcher = Dispatcher::<_, _, MySQLArrowTransport>::new( + builder, + &mut destination, + &queries, + None, + ); + dispatcher.run().unwrap(); + + let result = destination.arrow().unwrap(); + verify_arrow_results(result); +} + +pub fn verify_arrow_results(result: Vec) { + assert!(result.len() == 2); + + for r in result { + match r.num_rows() { + 2 => { + assert!(r + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![1, 2]))); + assert!(r + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![1.1, 2.2]))); + assert!(r + .column(2) + .as_any() + 
.downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec!["odd", "even"]))); + assert!(r + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![None, None]))); + } + 4 => { + assert!(r + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![3, 4, 5, 6]))); + assert!(r + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![3.3, 4.4, 5.5, 6.6]))); + assert!(r + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec!["odd", "even", "odd", "even"]))); + assert!(r + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![None, None, None, None]))); + } + _ => { + println!("got {} rows in a record batch!", r.num_rows()); + unreachable!() + } + } + } +} diff --git a/connectorx/tests/test_oracle.rs b/connectorx/tests/test_oracle.rs new file mode 100644 index 0000000..05c46fd --- /dev/null +++ b/connectorx/tests/test_oracle.rs @@ -0,0 +1,81 @@ +use connectorx::prelude::*; +use connectorx::sources::oracle::OracleSource; +use connectorx::sql::CXQuery; +use std::env; + +#[test] +#[ignore] +fn test_types() { + let _ = env_logger::builder().is_test(true).try_init(); + let dburl = env::var("ORACLE_URL").unwrap(); + let mut source = OracleSource::new(&dburl, 1).unwrap(); + #[derive(Debug, PartialEq)] + struct Row(i64, i64, f64, f64, String, String, String, String); + + source.set_queries(&[CXQuery::naked("select * from admin.test_table")]); + source.fetch_metadata().unwrap(); + let mut partitions = source.partition().unwrap(); + assert!(partitions.len() == 1); + let mut partition = partitions.remove(0); + partition.result_rows().expect("run query"); + assert_eq!(3, partition.nrows()); + assert_eq!(8, partition.ncols()); + + let mut parser = partition.parser().unwrap(); + + let mut rows: Vec = Vec::new(); + loop { + let (n, is_last) = parser.fetch_next().unwrap(); + for _i in 0..n { + rows.push(Row( + parser.produce().unwrap(), + parser.produce().unwrap(), + parser.produce().unwrap(), + parser.produce().unwrap(), + parser.produce().unwrap(), + parser.produce().unwrap(), + parser.produce().unwrap(), + parser.produce().unwrap(), + )); + } + if is_last { + break; + } + } + + assert_eq!( + vec![ + Row( + 1, + 1, + 1.1, + 1.1, + "varchar1".to_string(), + "char1".to_string(), + "nvarchar1".to_string(), + "nchar1".to_string() + ), + Row( + 2, + 2, + 2.2, + 2.2, + "varchar2".to_string(), + "char2".to_string(), + "nvarchar2".to_string(), + "nchar2".to_string() + ), + Row( + 3, + 3, + 3.3, + 3.3, + "varchar3".to_string(), + "char3".to_string(), + "nvarchar3".to_string(), + "nchar3".to_string() + ) + ], + rows + ); +} diff --git a/connectorx/tests/test_polars.rs b/connectorx/tests/test_polars.rs new file mode 100644 index 0000000..8fc63eb --- /dev/null +++ b/connectorx/tests/test_polars.rs @@ -0,0 +1,299 @@ +use connectorx::{ + constants::RECORD_BATCH_SIZE, + destinations::arrow2::Arrow2Destination, + prelude::*, + sources::{ + dummy::{DummySource, DummyTypeSystem}, + postgres::{rewrite_tls_args, BinaryProtocol, PostgresSource}, + }, + sql::CXQuery, + transports::{DummyArrow2Transport, PostgresArrow2Transport}, +}; +use polars::{df, prelude::*}; +use postgres::NoTls; +use std::env; +use url::Url; + +#[test] +fn test_polars() { + let schema = [ + DummyTypeSystem::I64(true), + DummyTypeSystem::F64(true), + DummyTypeSystem::Bool(false), + DummyTypeSystem::String(true), + DummyTypeSystem::F64(false), + ]; + let nrows = vec![4, 7]; + let ncols = schema.len(); + let 
queries: Vec = nrows + .iter() + .map(|v| CXQuery::naked(format!("{},{}", v, ncols))) + .collect(); + let mut destination = Arrow2Destination::new(); + + let dispatcher = Dispatcher::<_, _, DummyArrow2Transport>::new( + DummySource::new(&["a", "b", "c", "d", "e"], &schema), + &mut destination, + &queries, + None, + ); + dispatcher.run().expect("run dispatcher"); + + let df: DataFrame = destination.polars().unwrap(); + let expected = df!( + "a" => &[0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6], + "b" => &[0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + "c" => &[true, false, true, false, true, false, true, false, true, false, true], + "d" => &["0", "1", "2", "3", "0", "1", "2", "3", "4", "5", "6"], + "e" => &[0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + ) + .unwrap(); + + // order of each batch is not guaranteed + let expected2 = df!( + "a" => &[0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3], + "b" => &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0, 1.0, 2.0, 3.0], + "c" => &[true, false, true, false, true, false, true, true, false, true, false], + "d" => &["0", "1", "2", "3", "4", "5", "6", "0", "1", "2", "3"], + "e" => &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0, 1.0, 2.0, 3.0] + ) + .unwrap(); + + assert!(df.frame_equal_missing(&expected) || df.frame_equal_missing(&expected2)); +} + +#[test] +fn test_polars_large() { + let schema = [ + DummyTypeSystem::I64(true), + DummyTypeSystem::F64(true), + DummyTypeSystem::Bool(false), + DummyTypeSystem::String(true), + DummyTypeSystem::F64(false), + ]; + let nrows = vec![RECORD_BATCH_SIZE * 2 - 1, RECORD_BATCH_SIZE * 2 + 10]; + let ncols = schema.len(); + let queries: Vec = nrows + .iter() + .map(|v| CXQuery::naked(format!("{},{}", v, ncols))) + .collect(); + let mut destination = Arrow2Destination::new(); + + let dispatcher = Dispatcher::<_, _, DummyArrow2Transport>::new( + DummySource::new(&["a", "b", "c", "d", "e"], &schema), + &mut destination, + &queries, + None, + ); + dispatcher.run().expect("run dispatcher"); + + let df: DataFrame = destination.polars().unwrap(); + assert_eq!(RECORD_BATCH_SIZE * 4 + 9, df.height()); + assert_eq!(5, df.width()); +} + +#[test] +fn test_postgres_arrow() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + + let queries = [ + CXQuery::naked("select * from test_table where test_int < 2"), + CXQuery::naked("select * from test_table where test_int >= 2"), + ]; + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let builder = PostgresSource::::new(config, NoTls, 2).unwrap(); + let mut destination = Arrow2Destination::new(); + let dispatcher = Dispatcher::<_, _, PostgresArrow2Transport>::new( + builder, + &mut destination, + &queries, + Some(format!("select * from test_table")), + ); + + dispatcher.run().expect("run dispatcher"); + + let df: DataFrame = destination.polars().unwrap(); + + let expected = df!( + "test_int" => &[1, 0, 2, 3, 4, 1314], + "test_nullint" => &[Some(3), Some(5), None, Some(7), Some(9), Some(2)], + "test_str" => &[Some("str1"), Some("a"), Some("str2"), Some("b"), Some("c"), None], + "test_float" => &[None, Some(3.1), Some(2.2), Some(3.), Some(7.8), Some(-10.)], + "test_bool" => &[Some(true), None, Some(false), Some(false), None, Some(true)] + ) + .unwrap(); + + let expected2 = df!( + "test_int" => &[2, 3, 4, 1314, 1, 0], + "test_nullint" => &[None, Some(7), Some(9), Some(2), Some(3), Some(5)], + "test_str" => &[Some("str2"), Some("b"), Some("c"), None, Some("str1"), Some("a")], + "test_float" => 
&[Some(2.2), Some(3.), Some(7.8), Some(-10.), None, Some(3.1)], + "test_bool" => &[Some(false), Some(false), None, Some(true), Some(true), None] + ) + .unwrap(); + + assert!(df.frame_equal_missing(&expected) || df.frame_equal_missing(&expected2)); +} + +#[test] +fn test_pg_pl_bool_array() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + + let queries = [CXQuery::naked( + "select test_boolarray from test_types where test_boolarray is not null", + )]; + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let builder = PostgresSource::::new(config, NoTls, 2).unwrap(); + let mut destination = Arrow2Destination::new(); + let dispatcher = Dispatcher::<_, _, PostgresArrow2Transport>::new( + builder, + &mut destination, + &queries, + Some(format!("select * from test_types")), + ); + + dispatcher.run().expect("run dispatcher"); + + let s1 = Series::new("a", [true, false]); + let empty_vec: Vec = vec![]; + let s2 = Series::new("b", empty_vec); + let s3 = Series::new("c", [true]); + + let df: DataFrame = destination.polars().unwrap(); + let test_df: DataFrame = df!( + "test_boolarray" => &[s1,s2,s3] + ) + .unwrap(); + + println!("{:?}", df); + assert_eq!(df, test_df); +} + +#[test] +fn test_pg_pl_varchar_array() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + + let queries = [CXQuery::naked("select test_varchararray from test_types")]; + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let builder = PostgresSource::::new(config, NoTls, 2).unwrap(); + let mut destination = Arrow2Destination::new(); + let dispatcher = Dispatcher::<_, _, PostgresArrow2Transport>::new( + builder, + &mut destination, + &queries, + Some(format!("select * from test_types")), + ); + + dispatcher.run().expect("run dispatcher"); + + let s1 = Series::new("a", ["str1", "str2"]); + let s2 = Series::new( + "b", + [ + "0123456789", + "abcdefghijklmnopqrstuvwxyz", + "!@#$%^&*()_-+=~`:;<>?/", + ], + ); + let s3 = Series::new("c", ["", " "]); + let empty_vec: Vec<&str> = vec![]; + let s4 = Series::new("d", empty_vec); + + let df: DataFrame = destination.polars().unwrap(); + let test_df: DataFrame = df!( + "test_varchararray" => &[s1,s2,s3,s4] + ) + .unwrap(); + + println!("{:?}", df); + // panic!("spurious"); + assert_eq!(df, test_df); +} + +#[test] +fn test_pg_pl_text_array() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + + let queries = [CXQuery::naked("select test_textarray from test_types")]; + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let builder = PostgresSource::::new(config, NoTls, 2).unwrap(); + let mut destination = Arrow2Destination::new(); + let dispatcher = Dispatcher::<_, _, PostgresArrow2Transport>::new( + builder, + &mut destination, + &queries, + Some(format!("select * from test_types")), + ); + + dispatcher.run().expect("run dispatcher"); + + let s1 = Series::new("a", ["text1", "text2"]); + let s2 = Series::new( + "b", + [ + "0123456789", + "abcdefghijklmnopqrstuvwxyz", + "!@#$%^&*()_-+=~`:;<>?/", + ], + ); + let s3 = Series::new("c", ["", " "]); + let empty_vec: Vec<&str> = vec![]; + let s4 = Series::new("d", empty_vec); + + let df: DataFrame = destination.polars().unwrap(); + let test_df: DataFrame = df!( + "test_textarray" => &[s1,s2,s3,s4] + ) + 
.unwrap(); + + println!("{:?}", df); + assert_eq!(df, test_df); +} + +#[test] + +fn test_pg_pl_name() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + + let queries = [CXQuery::naked("select test_name from test_types")]; + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let builder = PostgresSource::::new(config, NoTls, 2).unwrap(); + let mut destination = Arrow2Destination::new(); + let dispatcher = Dispatcher::<_, _, PostgresArrow2Transport>::new( + builder, + &mut destination, + &queries, + Some(format!("select * from test_types")), + ); + + dispatcher.run().expect("run dispatcher"); + + let s1 = "0"; + let s2 = "21"; + let s3 = "someName"; + let s4 = "101203203-1212323-22131235"; + + let df: DataFrame = destination.polars().unwrap(); + let test_df: DataFrame = df!( + "test_name" => &[s1,s2,s3,s4] + ) + .unwrap(); + + println!("{:?}", df); + assert_eq!(df, test_df); +} diff --git a/connectorx/tests/test_postgres.rs b/connectorx/tests/test_postgres.rs new file mode 100644 index 0000000..2beb6f1 --- /dev/null +++ b/connectorx/tests/test_postgres.rs @@ -0,0 +1,304 @@ +use arrow::{ + array::{BooleanArray, Float64Array, Int64Array, StringArray}, + record_batch::RecordBatch, +}; +use connectorx::{ + destinations::arrow::ArrowDestination, + prelude::*, + sources::postgres::{rewrite_tls_args, BinaryProtocol, CSVProtocol, PostgresSource}, + sources::PartitionParser, + sql::CXQuery, + transports::PostgresArrowTransport, +}; +use postgres::NoTls; +use std::env; +use url::Url; + +#[test] +fn load_and_parse() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + #[derive(Debug, PartialEq)] + struct Row(i32, Option, Option, Option, Option); + + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let mut source = PostgresSource::::new(config, NoTls, 1).unwrap(); + source.set_queries(&[CXQuery::naked("select * from test_table")]); + source.fetch_metadata().unwrap(); + + let mut partitions = source.partition().unwrap(); + assert!(partitions.len() == 1); + let mut partition = partitions.remove(0); + partition.result_rows().expect("run query"); + + let mut parser = partition.parser().unwrap(); + + let mut rows: Vec = Vec::new(); + loop { + let (n, is_last) = parser.fetch_next().unwrap(); + for _i in 0..n { + rows.push(Row( + parser.produce().unwrap(), + parser.produce().unwrap(), + Produce::>::produce(&mut parser) + .unwrap() + .map(ToString::to_string), + parser.produce().unwrap(), + parser.produce().unwrap(), + )); + } + if is_last { + break; + } + } + + assert_eq!( + vec![ + Row(1, Some(3), Some("str1".into()), None, Some(true)), + Row(2, None, Some("str2".into()), Some(2.2), Some(false)), + Row(0, Some(5), Some("a".into()), Some(3.1), None), + Row(3, Some(7), Some("b".into()), Some(3.), Some(false)), + Row(4, Some(9), Some("c".into()), Some(7.8), None), + Row(1314, Some(2), None, Some(-10.), Some(true)), + ], + rows + ); +} + +#[test] +fn load_and_parse_csv() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + #[derive(Debug, PartialEq)] + struct Row(i32, Option, Option, Option, Option); + + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let mut source = PostgresSource::::new(config, NoTls, 1).unwrap(); + source.set_queries(&[CXQuery::naked("select * from 
test_table")]); + source.fetch_metadata().unwrap(); + + let mut partitions = source.partition().unwrap(); + assert!(partitions.len() == 1); + let mut partition = partitions.remove(0); + partition.result_rows().expect("run query"); + + assert_eq!(6, partition.nrows()); + assert_eq!(5, partition.ncols()); + + let mut parser = partition.parser().unwrap(); + + let mut rows: Vec = Vec::new(); + loop { + let (n, is_last) = parser.fetch_next().unwrap(); + for _i in 0..n { + rows.push(Row( + parser.produce().unwrap(), + parser.produce().unwrap(), + Produce::>::produce(&mut parser) + .unwrap() + .map(ToString::to_string), + parser.produce().unwrap(), + parser.produce().unwrap(), + )); + } + if is_last { + break; + } + } + assert_eq!( + vec![ + Row(1, Some(3), Some("str1".into()), None, Some(true)), + Row(2, None, Some("str2".into()), Some(2.2), Some(false)), + Row(0, Some(5), Some("a".into()), Some(3.1), None), + Row(3, Some(7), Some("b".into()), Some(3.), Some(false)), + Row(4, Some(9), Some("c".into()), Some(7.8), None), + Row(1314, Some(2), None, Some(-10.), Some(true)), + ], + rows + ); +} + +#[test] +fn test_postgres() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + + let queries = [ + CXQuery::naked("select * from test_table where test_int < 2"), + CXQuery::naked("select * from test_table where test_int >= 2"), + ]; + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let builder = PostgresSource::::new(config, NoTls, 2).unwrap(); + let mut destination = ArrowDestination::new(); + let dispatcher = Dispatcher::<_, _, PostgresArrowTransport>::new( + builder, + &mut destination, + &queries, + Some(String::from("select * from test_table")), + ); + + dispatcher.run().expect("run dispatcher"); + + let result = destination.arrow().unwrap(); + verify_arrow_results(result); +} + +#[test] +fn test_postgres_csv() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + + let queries = [ + CXQuery::naked("select * from test_table where test_int < 2"), + CXQuery::naked("select * from test_table where test_int >= 2"), + ]; + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let builder = PostgresSource::::new(config, NoTls, 2).unwrap(); + let mut dst = ArrowDestination::new(); + let dispatcher = Dispatcher::<_, _, PostgresArrowTransport>::new( + builder, &mut dst, &queries, None, + ); + + dispatcher.run().expect("run dispatcher"); + let result = dst.arrow().unwrap(); + verify_arrow_results(result); +} + +#[test] +fn test_postgres_agg() { + let _ = env_logger::builder().is_test(true).try_init(); + + let dburl = env::var("POSTGRES_URL").unwrap(); + + let queries = [CXQuery::naked( + "SELECT test_bool, SUM(test_float) FROM test_table GROUP BY test_bool", + )]; + + let url = Url::parse(dburl.as_str()).unwrap(); + let (config, _tls) = rewrite_tls_args(&url).unwrap(); + let builder = PostgresSource::::new(config, NoTls, 1).unwrap(); + let mut destination = ArrowDestination::new(); + let dispatcher = Dispatcher::<_, _, PostgresArrowTransport>::new( + builder, + &mut destination, + &queries, + Some("SELECT test_bool, SUM(test_float) FROM test_table GROUP BY test_bool".to_string()), + ); + + dispatcher.run().expect("run dispatcher"); + + let mut result = destination.arrow().unwrap(); + assert!(result.len() == 1); + let rb = result.pop().unwrap(); + assert!(rb.columns().len() == 2); + + 
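+    // the GROUP BY collapses test_table into three groups (NULL, false, true),
+    // so a single two-column record batch is expected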
assert!(rb + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![None, Some(false), Some(true)]))); + + assert!(rb + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![ + Some(10.9), + Some(5.2), + Some(-10.0), + ]))); +} + +pub fn verify_arrow_results(result: Vec) { + assert!(result.len() == 2); + + for r in result { + match r.num_rows() { + 2 => { + assert!(r + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![1, 0]))); + assert!(r + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![3, 5]))); + assert!(r + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec!["str1", "a"]))); + assert!(r + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![None, Some(3.1)]))); + assert!(r + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![Some(true), None]))); + } + 4 => { + assert!(r + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![2, 3, 4, 1314]))); + assert!(r + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Int64Array::from(vec![None, Some(7), Some(9), Some(2)]))); + assert!(r + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&StringArray::from(vec![ + Some("str2"), + Some("b"), + Some("c"), + None + ]))); + assert!(r + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&Float64Array::from(vec![2.2, 3., 7.8, -10.]))); + assert!(r + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .eq(&BooleanArray::from(vec![ + Some(false), + Some(false), + None, + Some(true) + ]))); + } + _ => unreachable!(), + } + } +} diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..afd3af8 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,24 @@ +# Book settings +# Learn more at https://jupyterbook.org/customize/config.html + +title: ConnectorX +author: SFU-DB +logo: logo.png + +# Force re-execution of notebooks on each build. +# See https://jupyterbook.org/content/execute.html +execute: + execute_notebooks: force + + +# Information about where the book exists on the web +repository: + url: https://github.com/sfu-db/connector-x # Online location of your book + path_to_book: docs # Optional path to your book, relative to the repository root + branch: main # Which branch of the repository should be used when creating links (optional) + +# Add GitHub buttons to your book +# See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository +html: + use_issues_button: true + use_repository_button: true diff --git a/docs/_toc.yml b/docs/_toc.yml new file mode 100644 index 0000000..4f7274d --- /dev/null +++ b/docs/_toc.yml @@ -0,0 +1,18 @@ +# Table of contents +# Learn more at https://jupyterbook.org/customize/toc.html + +format: jb-book +root: intro + +chapters: + - file: install + - file: api + - file: databases + sections: + - file: databases/bigquery + - file: databases/mssql + - file: databases/mysql + - file: databases/oracle + - file: databases/postgres + - file: databases/sqlite + - file: freq_questions diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 0000000..50fa4da --- /dev/null +++ b/docs/api.md @@ -0,0 +1,96 @@ +# Basic usage +ConnectorX enables you to run the SQL query, load data from databases into a Pandas Dataframe in the fastest and most memory efficient way. 
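+
+A minimal example (the connection string is a placeholder):
+
+```python
+import connectorx as cx
+
+df = cx.read_sql("postgresql://username:password@server:port/database",
+                 "SELECT * FROM lineitem")
+```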
+ +## API +```python +connectorx.read_sql(conn: Union[str, Dict[str, str]], query: Union[List[str], str], *, return_type: str = "pandas", protocol: str = "binary", partition_on: Optional[str] = None, partition_range: Optional[Tuple[int, int]] = None, partition_num: Optional[int] = None) +``` + +## Parameters +- `conn: Union[str, Dict[str, str]]`: Connection string URI for querying single database or dict of database names (key) and connection string URIs (value) for querying multiple databases. + - Please check out [here](https://sfu-db.github.io/connector-x/databases.html) for connection string examples of each database +- `query: Union[str, List[str]]`: SQL query or list of partitioned SQL queries for fetching data. +- `return_type: str = "pandas"`: The return type of this function. It can be `arrow` (`arrow2`), `pandas`, `modin`, `dask` or `polars`. +- `protocol: str = "binary"`: The protocol used to fetch data from source, default is `binary`. Check out [here](./databases.md) to see more details. +- `partition_on: Optional[str]`: The column to partition the result. +- `partition_range: Optional[Tuple[int, int]]`: The value range of the partition column. +- `partition_num: Optional[int]`: The number of partitions to generate. +- `index_col: Optional[str]`: The index column to set for the result dataframe. Only applicable when `return_type` is `pandas`, `modin` or `dask`. + + +## Examples +- Read a DataFrame from a SQL using a single thread + + ```python + import connectorx as cx + + postgres_url = "postgresql://username:password@server:port/database" + query = "SELECT * FROM lineitem" + + cx.read_sql(postgres_url, query) + ``` + +- Read a DataFrame parallelly using 10 threads by automatically partitioning the provided SQL on the partition column (`partition_range` will be automatically queried if not given) + + ```python + import connectorx as cx + + postgres_url = "postgresql://username:password@server:port/database" + query = "SELECT * FROM lineitem" + + cx.read_sql(postgres_url, query, partition_on="l_orderkey", partition_num=10) + ``` + +- Read a DataFrame parallelly using 2 threads by manually providing two partition SQLs (the schemas of all the query results should be same) + + ```python + import connectorx as cx + + postgres_url = "postgresql://username:password@server:port/database" + queries = ["SELECT * FROM lineitem WHERE l_orderkey <= 30000000", "SELECT * FROM lineitem WHERE l_orderkey > 30000000"] + + cx.read_sql(postgres_url, queries) + + ``` + +- Read a DataFrame parallelly using 4 threads from a more complex query + + ```python + import connectorx as cx + + postgres_url = "postgresql://username:password@server:port/database" + query = f""" + SELECT l_orderkey, + SUM(l_extendedprice * ( 1 - l_discount )) AS revenue, + o_orderdate, + o_shippriority + FROM customer, + orders, + lineitem + WHERE c_mktsegment = 'BUILDING' + AND c_custkey = o_custkey + AND l_orderkey = o_orderkey + AND o_orderdate < DATE '1995-03-15' + AND l_shipdate > DATE '1995-03-15' + GROUP BY l_orderkey, + o_orderdate, + o_shippriority + """ + + cx.read_sql(postgres_url, query, partition_on="l_orderkey", partition_num=4) + + ``` + +- Read a DataFrame from a SQL joined from multiple databases (experimental, only support PostgreSQL for now) + + ```python + import connectorx as cx + + db1 = "postgresql://username1:password1@server1:port1/database1" + db2 = "postgresql://username2:password2@server2:port2/database2" + query = "SELECT * FROM db1.nation n, db2.region r where n.n_regionkey = r.r_regionkey" + + 
cx.read_sql({"db1": db1, "db2": db2}, query) + + ``` + diff --git a/docs/databases.md b/docs/databases.md new file mode 100644 index 0000000..6eb371f --- /dev/null +++ b/docs/databases.md @@ -0,0 +1,10 @@ +# Databases configuration and performance + +ConnectorX supports retrieving data from Postgres, MsSQL, MySQL, Oracle, SQLite, and BigQuery. This chapter introduces how to use ConnectorX to connect each database and the conversion between database types and Pandas types. + +* [BigQuery](./databases/bigquery.md) +* [MsSQL](./databases/mssql.md) +* [MySQL](./databases/mysql.md) +* [Oracle](./databases/oracle.md) +* [Postgres](./databases/postgres.md) +* [SQLite](./databases/sqlite.md) \ No newline at end of file diff --git a/docs/databases/bigquery.md b/docs/databases/bigquery.md new file mode 100644 index 0000000..f1b0b79 --- /dev/null +++ b/docs/databases/bigquery.md @@ -0,0 +1,37 @@ +# BigQuery + +```{note} +BigQuery does not need to specify protocol. +``` + +```{warning} +Currently, BigQuery does not support to apply paritition on Query with limit clause. +For example, `cx.read_sql(conn, 'select * from table limit 10', parition_num=3, partition_on='int')` will fail. +If you want to fetch result from query with limit clause, please do not use partitioning. +``` + +### BigQuery Connection + +**Authentication File:** BigQuery connection need an authentication json file from Google Cloud Platform. If you do not have an authentication json file, you can create your BigQuery authentication [here](https://cloud.google.com/docs/authentication/getting-started). + +```py +import connectorx as cx +authentication_file_path = '/home/user/path/auth.json' # path to your authentication json file +conn = 'bigquery://' + authentication_file_path # connection token +query = 'SELECT * FROM `database.dataset.table`' # query string +cx.read_sql(conn, query) # read data from BigQuery +``` + +### BigQuery-Pandas Type Mapping +| BigQuery Type | Pandas Type | Comment | +|:-------------------------:|:---------------------------:|:----------------------------------:| +| Bool, Boolean | bool, boolean(nullable) | | +| Int64, Integer | int64, Int64(nullable) | | +| Float64, Float | float64 | | +| Numeric | float64 | | +| String | object | | +| BYTES | object | | +| Time | object | | +| DATE | datetime64[ns] | | +| Datetime | datetime64[ns] | | +| TIMESTAMP | datetime64[ns] | UTC | \ No newline at end of file diff --git a/docs/databases/mssql.md b/docs/databases/mssql.md new file mode 100644 index 0000000..f637f4e --- /dev/null +++ b/docs/databases/mssql.md @@ -0,0 +1,63 @@ +# MsSQL + +```{note} +SQLServer does not need to specify protocol. +``` + +### MsSQL Connection +```{hint} +By adding `trusted_connection=true` to connection uri parameter, windows authentication will be enabled. Example: `mssql://host:port/db?trusted_connection=true` +By adding `encrypt=true` to connection uri parameter, SQLServer will use SSL encryption. Example: `mssql://host:port/db?encrypt=true&trusted_connection=true` +``` +```{hint} +if the user password has special characters, they need to be sanitized. 
+
+```py
+import connectorx as cx
+conn = 'mssql://username:password@server:port/database?encrypt=true&trusted_connection=true'  # connection token
+query = 'SELECT * FROM table'  # query string
+cx.read_sql(conn, query)  # read data from MsSQL
+```
+
+### SQLServer-Pandas Type Mapping
+| SQLServer Type | Pandas Type | Comment |
+|:---:|:---:|:---:|
+| TINYINT | int64, Int64(nullable) | |
+| SMALLINT | int64, Int64(nullable) | |
+| INT | int64, Int64(nullable) | |
+| BIGINT | int64, Int64(nullable) | |
+| FLOAT | float64 | |
+| NUMERIC | float64 | |
+| DECIMAL | float64 | precision larger than 28 is not supported |
+| BIT | bool, boolean(nullable) | |
+| VARCHAR | object | |
+| CHAR | object | |
+| TEXT | object | |
+| NVARCHAR | object | |
+| NCHAR | object | |
+| NTEXT | object | |
+| VARBINARY | object | |
+| BINARY | object | |
+| IMAGE | object | |
+| DATETIME | datetime64[ns] | |
+| DATETIME2 | datetime64[ns] | |
+| SMALLDATETIME | datetime64[ns] | |
+| DATE | datetime64[ns] | |
+| DATETIMEOFFSET | datetime64[ns] | |
+| TIME | object | |
+| UNIQUEIDENTIFIER | object | |
+
+### Performance (r5.4xlarge docker in another EC2 instance)
+
+**Modin does not support `read_sql` on MsSQL**
+
+- Time chart, lower is better.
+
+<div align="center">
+[time chart]
+ +- Memory consumption chart, lower is better. + +
+[memory chart]
+ +In conclusion, ConnectorX uses **3x** less memory and **14x** less time compared with Pandas. diff --git a/docs/databases/mysql.md b/docs/databases/mysql.md new file mode 100644 index 0000000..d486b3a --- /dev/null +++ b/docs/databases/mysql.md @@ -0,0 +1,51 @@ +# MySQL + +## Protocols +* `binary`: [MySQL Binary protocol](https://github.com/blackbeam/rust-mysql-simple), recommend to use in general. +* `text`: [MySQL Text protocol](https://github.com/blackbeam/rust-mysql-simple), slower than `binary`, recommend to use only when `binary` protocol is not supported by the source (e.g. Clickhouse). + +## MySQL Connection +```py +import connectorx as cx +conn = 'mysql://username:password@server:port/database' # connection token +query = 'SELECT * FROM table' # query string +cx.read_sql(conn, query) # read data from MySQL +``` + +## MySQL-Pandas Type Mapping +| MySQL Type | Pandas Type | Comment | +|:---------------:|:---------------------------:|:----------------------------------:| +| TINYINT | int64, Int64(nullable) | | +| SMALLINT | int64, Int64(nullable) | | +| MEDIUMINT | int64, Int64(nullable) | | +| INT | int64, Int64(nullable) | | +| BIGINT | int64, Int64(nullable) | | +| FLOAT | float64 | | +| DOUBLE | float64 | | +| DECIMAL | float64, object(Clickhouse) | Clickhouse return DECIMAL in string, cannot support precision larger than 28 | +| VARCHAR | object | | +| CHAR | object | | +| DATE | datetime64[ns] | only support date after year 1970 | +| TIME | object | | +| DATETIME | datetime64[ns] | only support date after year 1970 | +| TIMESTAMP | datetime64[ns] | | +| YEAR | int64, Int64(nullable) | | +| TINYBLOB | object | | +| BLOB | object | | +| MEDIUMBLOB | object | | +| LONGBLOB | object | | +| JSON | object | | +| ENUM | object | | + + +### Performance (db.m6g.4xlarge RDS) + +- Time chart, lower is better. + +
+[time chart]
+ +- Memory consumption chart, lower is better. + +
+[memory chart]
+ +In conclusion, ConnectorX uses **3x** less memory and **8x** less time compared with Pandas. diff --git a/docs/databases/oracle.md b/docs/databases/oracle.md new file mode 100644 index 0000000..f25e05e --- /dev/null +++ b/docs/databases/oracle.md @@ -0,0 +1,40 @@ +# Oracle + + +### Oracle Connection +```py +import connectorx as cx +conn = 'oracle://username:password@server:port/database' # connection token +query = 'SELECT * FROM table' # query string +cx.read_sql(conn, query) # read data from Oracle +``` + +### Oracle-Pandas Type Mapping +| Oracle Type | Pandas Type | Comment | +|:-------------------------:|:---------------------------:|:----------------------------------:| +| Number(\*,0) | int64, Int64(nullable) | | +| Number(\*,>0) | float64 | | +| Float | float64 | | +| BINARY_FLOAT | float64 | | +| BINARY_DOUBLE | float64 | | +| VARCHAR2 | object | | +| CHAR | object | | +| NCHAR | object | | +| NVarchar2 | object | | +| DATE | datetime64[ns] | | +| TIMESTAMP | datetime64[ns] | | +| TIMESTAMP WITH TIME ZONE | datetime64[ns] | | + +### Performance (db.r5.4xlarge RDS) + +**Modin and Turbodbc does not support read_sql on Oracle** + +- Time chart, lower is better. + +
+[time chart]
+ +- Memory consumption chart, lower is better. + +
+[memory chart]
+ +In conclusion, ConnectorX uses **3x** less memory and **3x** less time compared with Pandas. diff --git a/docs/databases/postgres.md b/docs/databases/postgres.md new file mode 100644 index 0000000..caa39cf --- /dev/null +++ b/docs/databases/postgres.md @@ -0,0 +1,66 @@ +# Postgres + +### Protocols +* `binary`: [Postgres Binary COPY protocol](https://www.postgresql.org/docs/current/sql-copy.html), recommend to use in general since fast data parsing speed. +* `csv`: [Postgres CSV COPY protocol](https://www.postgresql.org/docs/current/sql-copy.html), recommend to use when network is slow (`csv` usually results in smaller size than `binary`). +* `cursor`: Conventional wire protocol (slowest one), recommend to use only when `binary` and `csv` is not supported by the source (e.g. Redshift). + +## Postgres Connection +```{hint} +Adding `sslmode=require` to connection uri parameter force SSL connection. Example: `postgresql://username:password@host:port/db?sslmode=require`. `sslmode=disable` to disable SSL connection. + +To connect to redshift, replace `postgresql://` with `redshift://`. +``` + +```py +import connectorx as cx +conn = 'postgres://username:password@server:port/database' # connection token +query = "SELECT * FROM table" # query string +cx.read_sql(conn, query) # read data from Postgres +``` + +## Postgres-Pandas Type Mapping + +| Postgres Type | Pandas Type | Comment | +|:---------------:|:-------------------------:|:----------------------------------:| +| BOOL | bool, boolean(nullable) | | +| INT2 | int64, Int64(nullable) | | +| INT4 | int64, Int64(nullable) | | +| INT8 | int64, Int64(nullable) | | +| FLOAT4 | float64 | | +| FLOAT8 | float64 | | +| NUMERIC | float64 | cannot support precision larger than 28 | +| TEXT | object | | +| BPCHAR | object | | +| VARCHAR | object | | +| CHAR | object | | +| BYTEA | object | | +| DATE | datetime64[ns] | | +| TIME | object | | +| TIMESTAMP | datetime64[ns] | | +| TIMESTAMPZ | datetime64[ns] | | +| UUID | object | | +| JSON | object | | +| JSONB | object | | +| ENUM | object | need to convert enum column to text manually (`::text`) when using `csv` and `cursor` protocol | +| ltree | object | binary protocol supported only after Postgres version 13 | +| lquery | object | binary protocol supported only after Postgres version 13 | +| ltxtquery | object | binary protocol supported only after Postgres version 13 | +| INT2[] | object | list of i64 | +| INT4[] | object | list of i64 | +| INT8[] | object | list of i64 | +| FLOAT4[] | object | list of f64 | +| FLOAT8[] | object | list of f64 | +| NUMERIC[] | object | list of f64 | + +## Performance (db.m6g.4xlarge RDS) + +- Time chart, lower is better. + +
+[time chart]
+ +- Memory consumption chart, lower is better. + +
+[memory chart]
+ +In conclusion, ConnectorX uses **3x** less memory and **13x** less time compared with Pandas. diff --git a/docs/databases/sqlite.md b/docs/databases/sqlite.md new file mode 100644 index 0000000..2d8d5ab --- /dev/null +++ b/docs/databases/sqlite.md @@ -0,0 +1,50 @@ +# SQLite +Since SQLite adopts a [dynamic type system](https://www.sqlite.org/datatype3.html), we infer type as follow: +* If there is a declared type of the column, we derive the type using [column affinity rules](https://www.sqlite.org/datatype3.html#affname), code can be found [here](https://github.com/sfu-db/connector-x/blob/main/connectorx/src/sources/sqlite/typesystem.rs#L47). +* Otherwise we directly adopt the value's type in the first row of the result (in each partition), which results in INTEGER, REAL, TEXT and BLOB. +* If the first row of the result is NULL in the partition, try next partition. Throw an error if first rows of all partitions are NULL for a column. + +### SQLite Connection +```py +import connectorx as cx +db_path = '/home/user/path/test.db' # path to your SQLite database +conn = 'sqlite://' + db_path # connection token +query = 'SELECT * FROM `database.dataset.table`' # query string +cx.read_sql(conn, query) # read data from SQLite +``` + +Example on windows: +```py +import connectorx as cx +import urllib +db_path = urllib.parse.quote("C:\\user\\path\\test.db") # url encode the path to your SQLite database +conn = 'sqlite://' + db_path # connection token +query = 'SELECT * FROM `database.dataset.table`' # query string +cx.read_sql(conn, query) # read data from SQLite +``` + +### SQLite Type Mapping +| SQLite Type | Pandas Type | Comment | +|:----------------:|:---------------------------:|:----------------------------------:| +| INTEGER | int64, Int64(nullable) | declared type that contains substring "int" | +| BOOL | bool, boolean(nullable) | declared type is "boolean" or "bool" | +| REAL | float64 | declared type that contains substring "real", "floa", "doub" | +| TEXT | object | declared type that contains substring "char", "clob", "text" | +| BLOB | object | declared type that contains substring "blob" | +| DATE | datetime64[ns] | declared type is "date" | +| TIME | object | declared type is "time" | +| TIMESTAMP | datetime64[ns] | declared type is "datetime" or "timestamp", the format must follow `YYYY-MM-DD HH:MM:SS"/"YYYY-MM-DD HH:MM:SS.SSS`| + +## Performance (r5.4xlarge EC2 same instance) + +**Turbodbc does not support read_sql on SQLite** + +- Time chart, lower is better. + +
+[time chart]
+ +- Memory consumption chart, lower is better. + +
+[memory chart]
+ +In conclusion, ConnectorX uses **2x** less memory and **5x** less time compared with Pandas. diff --git a/docs/freq_questions.md b/docs/freq_questions.md new file mode 100644 index 0000000..71e9f0a --- /dev/null +++ b/docs/freq_questions.md @@ -0,0 +1,37 @@ +# Frequently asked questions + +## How to specify the partition number? + +`partition_num` will determine how many queries we are going to split from the original one and issue to the database. Underlying, we use [rayon](https://github.com/rayon-rs/rayon) as our parallel executor, which adopts a pool of threads to handle each partitioned query. The number of threads in the pool equals to the number of logical cores on the machine. It is recommended to set the `partition_num` to the number of available logical cores. + +## How to choose the partition column? + +`partition_on` specifies on which column we will partition the query. In order to achieve the best performance, it is ideal that each partitioned query will return the same number of rows. And since we partition the column evenly, it is recommended that the numerical `partition_on` column is evenly distributed. Whether a column has index or not might also affect the performance depends on the source database. You can give it a try if you have multiple candidates. Also, you can manually partition the query if our partition method cannot match your need. ConnectorX will still return a whole dataframe with all the results of the list of queries you input. + +## How to print log in Python? + +Set the environment variable `RUST_LOG` to have a detailed look at Rust log. +```python +import os +os.environ["RUST_LOG"]="connectorx=debug,connectorx_python=debug" +import connectorx as cx + +df = cx.read_sql(conn, query) // It will be more clear to test when no partitioning first +``` + +## Why is my query slow on ConnectorX? + +ConnectorX is mainly targeting on the large query result fetching scenario. It speeds up the process by optimizing the client-side execution and saturating both network and machine resource through parallelism. When query execution on the database server is the bottleneck (for example when the result size is small, and/or the query is very complex), there will be overhead coming from metadata fetching. In ConnectorX, there are up to three info that will be fetched before issue the query to database: + +* MIN, MAX query for partition range (if partition is enabled and `partition_range` is not given) +* COUNT query (if `return_type="pandas"`) +* schema fetching query, which gets type and name for each column in the result + +For users who want to have pandas.DataFrame as final result. In order to avoid the costly COUNT query, one workaround is to use Arrow as an intermediate destination from ConnectorX and convert it into Pandas using Arrow’s [to_pandas API](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html?pyarrow.Table.to_pandas). 
+
+## How to print log in Python?
+
+Set the environment variable `RUST_LOG` to get a detailed look at the Rust log.
+```python
+import os
+os.environ["RUST_LOG"] = "connectorx=debug,connectorx_python=debug"
+import connectorx as cx
+
+df = cx.read_sql(conn, query)  # it is clearer to test without partitioning first
+```
+
+## Why is my query slow on ConnectorX?
+
+ConnectorX mainly targets the scenario of fetching large query results. It speeds up the process by optimizing the client-side execution and saturating both the network and the machine resources through parallelism. When query execution on the database server is the bottleneck (for example when the result size is small, and/or the query is very complex), there will be overhead coming from metadata fetching. In ConnectorX, up to three pieces of information are fetched before the query is issued to the database:
+
+* a MIN/MAX query for the partition range (if partitioning is enabled and `partition_range` is not given)
+* a COUNT query (if `return_type="pandas"`)
+* a schema-fetching query, which gets the type and name of each column in the result
+
+For users who want a pandas.DataFrame as the final result, one workaround to avoid the costly COUNT query is to use Arrow as an intermediate destination from ConnectorX and convert it into Pandas using Arrow's [to_pandas API](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html?pyarrow.Table.to_pandas). For example:
+
+```Python
+import connectorx as cx
+
+table = cx.read_sql(db_uri, query, return_type="arrow")  # or arrow2 https://github.com/jorgecarleitao/arrow2
+df = table.to_pandas(split_blocks=False, date_as_object=False)
+```
\ No newline at end of file
diff --git a/docs/install.md b/docs/install.md
new file mode 100644
index 0000000..0a77c3e
--- /dev/null
+++ b/docs/install.md
@@ -0,0 +1,46 @@
+# Getting Started
+
+## Installation
+
+### Pip
+
+The easiest way to install ConnectorX is using pip, with the following command:
+
+```bash
+pip install connectorx
+```
+
+### Build from source code
+
+* Step 0: Install tools.
+  * Install Rust: `curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh`
+  * Install [just](https://github.com/casey/just): `cargo install just`
+  * Install [Poetry](https://python-poetry.org/docs/): `pip3 install poetry`
+
+* Step 1: Fresh clone of source.
+```bash
+git clone https://github.com/sfu-db/connector-x.git
+```
+
+* Step 2: Install and switch to the correct Rust version (please refer to [this file](https://github.com/sfu-db/connector-x/blob/main/.github/workflows/release.yml) and search for `rust` to find the version currently in use).
+```bash
+rustup install {version}
+rustup override set {version}
+```
+
+* Step 3: Install system dependencies. Please refer to [release.yml](https://github.com/sfu-db/connector-x/blob/main/.github/workflows/release.yml) for the dependencies needed on each OS.
+
+* Step 4: Install python dependencies.
+```bash
+just bootstrap-python
+```
+
+* Step 5: Build wheel.
+```bash
+just build-python-wheel
+```
+
+NOTES:
+* `OPENSSL_NO_VENDOR=1` might be required when compiling on Windows.
+* A dynamic library is required for the python installation. (e.g. If you are using `pyenv`, use the command `PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install {version}` to install python, since dylib is not enabled by default.)
+
diff --git a/docs/intro.md b/docs/intro.md
new file mode 100644
index 0000000..53fb8ea
--- /dev/null
+++ b/docs/intro.md
@@ -0,0 +1,115 @@
+# Introduction
+
+Load data from databases to dataframes, the fastest way.
+
+ConnectorX enables you to load data from databases into Python in the fastest and most memory-efficient way. It is a Python package that provides a high-level interface to the popular database connectors. Here is our Rust documentation: [rust-docs](https://sfu-db.github.io/connector-x/rust-docs/connectorx/).
+
+What you need is one line of code:
+
+```python
+import connectorx as cx
+
+cx.read_sql("postgresql://username:password@server:port/database", "SELECT * FROM lineitem")
+```
+
+Optionally, you can accelerate the data loading using parallelism by specifying a partition column.
+
+```python
+import connectorx as cx
+
+cx.read_sql("postgresql://username:password@server:port/database", "SELECT * FROM lineitem", partition_on="l_orderkey", partition_num=10)
+```
+
+The function will partition the query by **evenly** splitting the specified column into the requested number of partitions.
+ConnectorX will assign one thread to each partition to load and write data in parallel.
+Currently, we support partitioning on **numerical** columns (**cannot contain NULL**) for **SPJA** queries.
+
+**Experimental: We now provide federated query support (PostgreSQL only, and partitioning is not supported for now); you can write a single query to join tables from two or more databases!
+(JRE >= 1.8 is required)**
+```python
+import connectorx as cx
+
+db1 = "postgresql://username1:password1@server1:port1/database1"
+db2 = "postgresql://username2:password2@server2:port2/database2"
+
+cx.read_sql({"db1": db1, "db2": db2}, "SELECT * FROM db1.nation n, db2.region r where n.n_regionkey = r.r_regionkey")
+```
+
+Check out more detailed usage and examples [here](https://sfu-db.github.io/connector-x/api.html). A general introduction to the project can be found in this [blog post](https://towardsdatascience.com/connectorx-the-fastest-way-to-load-data-from-databases-a65d4d4062d5).
+
+# Performance
+
+We compared different solutions in Python that provide the `read_sql` function, by loading a 10x TPC-H lineitem table (8.6GB) from Postgres into a DataFrame, using 4 cores of parallelism.
+
+## Time chart, lower is better.
+
+<div align="center">
+[time chart]
+ +## Memory consumption chart, lower is better. + +
+[memory chart]
+ +In conclusion, ConnectorX uses up to **3x** less memory and **21x** less time (**3x** less memory and **13x** less time compared with Pandas.). More benchmark result can be found under each database pages [here](https://sfu-db.github.io/connector-x/databases.html). + +## How does ConnectorX achieve a lightening speed while keeping the memory footprint low? + +We observe that existing solutions more or less do data copy multiple times when downloading the data. +Additionally, implementing a data intensive application in Python brings additional cost. + +ConnectorX is written in Rust and follows "zero-copy" principle. +This allows it to make full use of the CPU by becoming cache and branch predictor friendly. Moreover, the architecture of ConnectorX ensures the data will be copied exactly once, directly from the source to the destination. + +## How does ConnectorX download the data? + +Upon receiving the query, e.g. `SELECT * FROM lineitem`, ConnectorX will first issue a `LIMIT 1` query `SELECT * FROM lineitem LIMIT 1` to get the schema of the result set. + +Then, if `partition_on` is specified, ConnectorX will issue `SELECT MIN($partition_on), MAX($partition_on) FROM (SELECT * FROM lineitem)` to know the range of the partition column. +After that, the original query is split into partitions based on the min/max information, e.g. `SELECT * FROM (SELECT * FROM lineitem) WHERE $partition_on > 0 AND $partition_on < 10000`. +ConnectorX will then run a count query to get the partition size (e.g. `SELECT COUNT(*) FROM (SELECT * FROM lineitem) WHERE $partition_on > 0 AND $partition_on < 10000`). If the partition +is not specified, the count query will be `SELECT COUNT(*) FROM (SELECT * FROM lineitem)`. + +Finally, ConnectorX will use the schema info as well as the count info to allocate memory and download data by executing the queries normally. + +Once the downloading begins, there will be one thread for each partition so that the data are downloaded in parallel at the partition level. The thread will issue the query of the corresponding +partition to the database and then write the returned data to the destination row-wise or column-wise (depends on the database) in a streaming fashion. + +# Supported Sources & Destinations + +Example connection string, supported protocols and data types for each data source can be found [here](https://sfu-db.github.io/connector-x/databases.html). + +For more planned data sources, please check out our [discussion](https://github.com/sfu-db/connector-x/discussions/61). + +## Sources +- [x] Postgres +- [x] Mysql +- [x] Mariadb (through mysql protocol) +- [x] Sqlite +- [x] Redshift (through postgres protocol) +- [x] Clickhouse (through mysql protocol) +- [x] SQL Server +- [x] Azure SQL Database (through mssql protocol) +- [x] Oracle +- [x] Big Query +- [ ] ODBC (WIP) +- [ ] ... + +## Destinations +- [x] Pandas +- [x] PyArrow +- [x] Modin (through Pandas) +- [x] Dask (through Pandas) +- [x] Polars (through PyArrow) + +# Supports + +You are always welcomed to: +1. Ask questions in stackoverflow. Make sure to have #connectorx attached. +2. Ask questions & propose new ideas in our [forum][discussion_page]. +3. Help us developing this project (adding databases and dataframes), please check out this [guide](https://github.com/sfu-db/connector-x/blob/main/CONTRIBUTING.md). 
+ +# Organizations and Projects using ConnectorX + +[](https://github.com/pola-rs/polars) +[](https://dataprep.ai/) +[](https://modin.readthedocs.io) + +To add your project/organization here, reply our post [here](https://github.com/sfu-db/connector-x/discussions/146) diff --git a/docs/logo.png b/docs/logo.png new file mode 100644 index 0000000..f788fda Binary files /dev/null and b/docs/logo.png differ diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..7e821e4 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +jupyter-book +matplotlib +numpy diff --git a/scripts/benchmarks/tpch-clickhouse.sql b/scripts/benchmarks/tpch-clickhouse.sql new file mode 100644 index 0000000..a63ed02 --- /dev/null +++ b/scripts/benchmarks/tpch-clickhouse.sql @@ -0,0 +1,21 @@ +-- mysql --local-infile --protocol tcp -h$CLICKHOUSE_HOST -P$CLICKHOUSE_PORT -u$CLICKHOUSE_USER -p$CLICKHOUSE_PASSWORD $CLICKHOUSE_DB < tpch-clickhouse.sql +-- clickhouse-client --user $CLICKHOUSE_USER --password $CLICKHOUSE_PASSWORD --database $CLICKHOUSE_DB --format_csv_delimiter="|" --query="INSERT INTO tpch.lineitem FORMAT CSV" < $TPCH_DIR/lineitem.tbl + +DROP TABLE IF EXISTS lineitem; +CREATE TABLE lineitem ( L_ORDERKEY INTEGER NOT NULL, + L_PARTKEY INTEGER NOT NULL, + L_SUPPKEY INTEGER NOT NULL, + L_LINENUMBER INTEGER NOT NULL, + L_QUANTITY DOUBLE NOT NULL, + L_EXTENDEDPRICE DOUBLE NOT NULL, + L_DISCOUNT DOUBLE NOT NULL, + L_TAX DOUBLE NOT NULL, + L_RETURNFLAG CHAR(1) NOT NULL, + L_LINESTATUS CHAR(1) NOT NULL, + L_SHIPDATE DATE NOT NULL, + L_COMMITDATE DATE NOT NULL, + L_RECEIPTDATE DATE NOT NULL, + L_SHIPINSTRUCT CHAR(25) NOT NULL, + L_SHIPMODE CHAR(10) NOT NULL, + L_COMMENT VARCHAR(44) NOT NULL + )Engine=MergeTree() ORDER BY L_ORDERKEY; diff --git a/scripts/benchmarks/tpch-mssql.sql b/scripts/benchmarks/tpch-mssql.sql new file mode 100644 index 0000000..5fa9e49 --- /dev/null +++ b/scripts/benchmarks/tpch-mssql.sql @@ -0,0 +1,35 @@ +-- mssql-cli -S$MSSQL_HOST -U$MSSQL_USER -P$MSSQL_PSWD -d$MSSQL_DB -i tpch-mssql.sql + +DROP TABLE IF EXISTS LINEITEM; +CREATE TABLE LINEITEM ( L_ORDERKEY INTEGER NOT NULL, + L_PARTKEY INTEGER NOT NULL, + L_SUPPKEY INTEGER NOT NULL, + L_LINENUMBER INTEGER NOT NULL, + L_QUANTITY DECIMAL(15,2) NOT NULL, + L_EXTENDEDPRICE DECIMAL(15,2) NOT NULL, + L_DISCOUNT DECIMAL(15,2) NOT NULL, + L_TAX DECIMAL(15,2) NOT NULL, + L_RETURNFLAG CHAR(1) NOT NULL, + L_LINESTATUS CHAR(1) NOT NULL, + L_SHIPDATE DATE NOT NULL, + L_COMMITDATE DATE NOT NULL, + L_RECEIPTDATE DATE NOT NULL, + L_SHIPINSTRUCT CHAR(25) NOT NULL, + L_SHIPMODE CHAR(10) NOT NULL, + L_COMMENT VARCHAR(44) NOT NULL); + +CREATE INDEX lineitem_l_orderkey_idx ON LINEITEM (l_orderkey); + +BULK INSERT LINEITEM +FROM '/tmp/lineitem.tbl' +WITH +( + FORMAT = 'CSV', + FIELDQUOTE = '"', + FIRSTROW = 1, + FIELDTERMINATOR = '|', --CSV field delimiter + ROWTERMINATOR = '\n', --Use to shift the control to next row + TABLOCK +) + +-- bcp tpch.dbo.lineitem in '$TPCH_DIR/lineitem.tbl' -f format.fmt diff --git a/scripts/benchmarks/tpch-mysql.sql b/scripts/benchmarks/tpch-mysql.sql new file mode 100644 index 0000000..289cd7e --- /dev/null +++ b/scripts/benchmarks/tpch-mysql.sql @@ -0,0 +1,25 @@ +-- mysql --local-infile --protocol tcp -h$MYSQL_HOST -P$MYSQL_PORT -u$MYSQL_USER -p$MYSQL_PASSWORD $MYSQL_DB < tpch-mysql.sql + +DROP TABLE IF EXISTS lineitem; +CREATE TABLE lineitem ( L_ORDERKEY INTEGER NOT NULL, + L_PARTKEY INTEGER NOT NULL, + L_SUPPKEY INTEGER NOT NULL, + L_LINENUMBER INTEGER NOT NULL, + L_QUANTITY DECIMAL(15,2) NOT 
NULL, + L_EXTENDEDPRICE DECIMAL(15,2) NOT NULL, + L_DISCOUNT DECIMAL(15,2) NOT NULL, + L_TAX DECIMAL(15,2) NOT NULL, + L_RETURNFLAG CHAR(1) NOT NULL, + L_LINESTATUS CHAR(1) NOT NULL, + L_SHIPDATE DATE NOT NULL, + L_COMMITDATE DATE NOT NULL, + L_RECEIPTDATE DATE NOT NULL, + L_SHIPINSTRUCT CHAR(25) NOT NULL, + L_SHIPMODE CHAR(10) NOT NULL, + L_COMMENT VARCHAR(44) NOT NULL); + +ALTER TABLE `lineitem` ADD INDEX `lineitem_orderkey_index` (`l_orderkey`); + +SET GLOBAL local_infile = 'ON'; +SHOW GLOBAL VARIABLES LIKE 'local_infile'; +LOAD DATA LOCAL INFILE '$TPCH_DIR/lineitem.tbl' INTO TABLE `lineitem` FIELDS TERMINATED BY '|' ENCLOSED BY '\"' LINES TERMINATED BY '\n'; diff --git a/scripts/benchmarks/tpch-postgres.sql b/scripts/benchmarks/tpch-postgres.sql new file mode 100644 index 0000000..5e3640b --- /dev/null +++ b/scripts/benchmarks/tpch-postgres.sql @@ -0,0 +1,23 @@ +-- psql $POSTGRES_URL -f tpch-postgres.sql + +DROP TABLE IF EXISTS LINEITEM; +CREATE TABLE LINEITEM ( L_ORDERKEY INTEGER NOT NULL, + L_PARTKEY INTEGER NOT NULL, + L_SUPPKEY INTEGER NOT NULL, + L_LINENUMBER INTEGER NOT NULL, + L_QUANTITY DECIMAL(15,2) NOT NULL, + L_EXTENDEDPRICE DECIMAL(15,2) NOT NULL, + L_DISCOUNT DECIMAL(15,2) NOT NULL, + L_TAX DECIMAL(15,2) NOT NULL, + L_RETURNFLAG CHAR(1) NOT NULL, + L_LINESTATUS CHAR(1) NOT NULL, + L_SHIPDATE DATE NOT NULL, + L_COMMITDATE DATE NOT NULL, + L_RECEIPTDATE DATE NOT NULL, + L_SHIPINSTRUCT CHAR(25) NOT NULL, + L_SHIPMODE CHAR(10) NOT NULL, + L_COMMENT VARCHAR(44) NOT NULL); + +CREATE INDEX lineitem_l_orderkey_idx ON LINEITEM USING btree (l_orderkey); + +\copy LINEITEM FROM '$TPCH_DIR/lineitem.tbl' DELIMITER '|' ENCODING 'LATIN1'; diff --git a/scripts/benchmarks/tpch_redshift.sql b/scripts/benchmarks/tpch_redshift.sql new file mode 100644 index 0000000..10954ef --- /dev/null +++ b/scripts/benchmarks/tpch_redshift.sql @@ -0,0 +1,127 @@ +DROP table if exists customer; +DROP table if exists lineitem; +DROP table if exists nation; +DROP table if exists orders; +DROP table if exists part; +DROP table if exists partsupp; +DROP table if exists region; +DROP table if exists supplier; + +create table customer ( + c_custkey int8 not null , + c_name varchar(25) not null, + c_address varchar(40) not null, + c_nationkey int4 not null, + c_phone char(15) not null, + c_acctbal numeric(12,2) not null, + c_mktsegment char(10) not null, + c_comment varchar(117) not null, + Primary Key(C_CUSTKEY) +) distkey(c_custkey) sortkey(c_custkey); + +create table lineitem ( + l_orderkey int8 not null , + l_partkey int8 not null, + l_suppkey int4 not null, + l_linenumber int4 not null, + l_quantity numeric(12,2) not null, + l_extendedprice numeric(12,2) not null, + l_discount numeric(12,2) not null, + l_tax numeric(12,2) not null, + l_returnflag char(1) not null, + l_linestatus char(1) not null, + l_shipdate date not null , + l_commitdate date not null, + l_receiptdate date not null, + l_shipinstruct char(25) not null, + l_shipmode char(10) not null, + l_comment varchar(44) not null, + Primary Key(L_ORDERKEY, L_LINENUMBER) +) distkey(l_orderkey) sortkey(l_orderkey) ; +/* distkey(l_orderkey) sortkey(l_shipdate,l_orderkey) ;*/ + +create table nation ( + n_nationkey int4 not null, + n_name char(25) not null , + n_regionkey int4 not null, + n_comment varchar(152) not null, + Primary Key(N_NATIONKEY) +) distkey(n_nationkey) sortkey(n_nationkey) ; + +create table orders ( + o_orderkey int8 not null, + o_custkey int8 not null, + o_orderstatus char(1) not null, + o_totalprice numeric(12,2) not null, + o_orderdate date 
not null, + o_orderpriority char(15) not null, + o_clerk char(15) not null, + o_shippriority int4 not null, + o_comment varchar(79) not null, + Primary Key(O_ORDERKEY) +) distkey(o_orderkey) sortkey(o_orderdate, o_orderkey) ; + +create table part ( + p_partkey int8 not null , + p_name varchar(55) not null, + p_mfgr char(25) not null, + p_brand char(10) not null, + p_type varchar(25) not null, + p_size int4 not null, + p_container char(10) not null, + p_retailprice numeric(12,2) not null, + p_comment varchar(23) not null, + PRIMARY KEY (P_PARTKEY) +) distkey(p_partkey) sortkey(p_partkey); + +create table partsupp ( + ps_partkey int8 not null, + ps_suppkey int4 not null, + ps_availqty int4 not null, + ps_supplycost numeric(12,2) not null, + ps_comment varchar(199) not null, + Primary Key(PS_PARTKEY, PS_SUPPKEY) +) distkey(ps_partkey) sortkey(ps_partkey); + +create table region ( + r_regionkey int4 not null, + r_name char(25) not null , + r_comment varchar(152) not null, + Primary Key(R_REGIONKEY) +) distkey(r_regionkey) sortkey(r_regionkey); + +create table supplier ( + s_suppkey int4 not null, + s_name char(25) not null, + s_address varchar(40) not null, + s_nationkey int4 not null, + s_phone char(15) not null, + s_acctbal numeric(12,2) not null, + s_comment varchar(101) not null, + Primary Key(S_SUPPKEY) +) distkey(s_suppkey) sortkey(s_suppkey) +; + +/* + To load the sample data, you must provide authentication for your cluster to access Amazon S3 on your behalf. + You can provide either role-based authentication or key-based authentication. + + Text files needed to load test data under s3://redshift-downloads/TPC-H/10GB are publicly available. + Any valid credentials should have read access. + + The COPY commands include a placeholder for the aws_access_key_id and aws_secret_access_key. + User must update the credentials clause below with valid credentials or the command will fail. + e.g. 
(1) aws_iam_role=arn:aws:iam::xxxxxxxxxxx:role/xxxxxxx + (2) aws_access_key_id= ;aws_secret_access_key= + + For more information check samples in https://docs.aws.amazon.com/redshift/latest/gsg/rs-gsg-create-sample-db.html +*/ + +copy region from 's3://redshift-downloads/TPC-H/10GB/region/' credentials 'aws_access_key_id= ;aws_secret_access_key=' gzip delimiter '|' region 'us-east-1'; +copy nation from 's3://redshift-downloads/TPC-H/10GB/nation/' credentials 'aws_access_key_id= ;aws_secret_access_key=' gzip delimiter '|' region 'us-east-1'; +copy lineitem from 's3://redshift-downloads/TPC-H/10GB/lineitem/' credentials 'aws_access_key_id= ;aws_secret_access_key=' gzip delimiter '|' region 'us-east-1'; +copy orders from 's3://redshift-downloads/TPC-H/10GB/orders/' credentials 'aws_access_key_id= ;aws_secret_access_key=' gzip delimiter '|' region 'us-east-1'; +copy part from 's3://redshift-downloads/TPC-H/10GB/part/' credentials 'aws_access_key_id= ;aws_secret_access_key=' gzip delimiter '|' region 'us-east-1'; +copy supplier from 's3://redshift-downloads/TPC-H/10GB/supplier/' credentials 'aws_access_key_id= ;aws_secret_access_key=' gzip delimiter '|' region 'us-east-1'; +copy partsupp from 's3://redshift-downloads/TPC-H/10GB/partsupp/' credentials 'aws_access_key_id= ;aws_secret_access_key=' gzip delimiter '|' region 'us-east-1'; +copy customer from 's3://redshift-downloads/TPC-H/10GB/customer/' credentials 'aws_access_key_id= ;aws_secret_access_key=' gzip delimiter '|' region 'us-east-1'; diff --git a/scripts/bigquery.sql b/scripts/bigquery.sql new file mode 100644 index 0000000..9d38911 --- /dev/null +++ b/scripts/bigquery.sql @@ -0,0 +1,33 @@ +DROP TABLE IF EXISTS `dataprep-bigquery.dataprep.test_table`; + +CREATE TABLE`dataprep-bigquery.dataprep.test_table`( + test_int INT64, + test_string STRING, + test_float FLOAT64, + test_bool BOOL, +); + +INSERT INTO `dataprep-bigquery.dataprep.test_table` VALUES (1, 'str1', 1.1, TRUE); +INSERT INTO `dataprep-bigquery.dataprep.test_table` VALUES (2, 'str2', 2.2, FALSE); +INSERT INTO `dataprep-bigquery.dataprep.test_table` VALUES (2333, NULL, NULL, TRUE); +INSERT INTO `dataprep-bigquery.dataprep.test_table` VALUES (4, NULL, -4.44, FALSE); +INSERT INTO `dataprep-bigquery.dataprep.test_table` VALUES (5, 'str05', NULL, NULL); + + +DROP TABLE IF EXISTS `dataprep-bigquery.dataprep.test_types`; + +CREATE TABLE IF NOT EXISTS `dataprep-bigquery.dataprep.test_types`( + test_int INTEGER, + test_numeric NUMERIC(5, 2), + test_bool BOOL, + test_date DATE, + test_time TIME, + test_datetime DATETIME, + test_timestamp TIMESTAMP, + test_str STRING, + test_bytes BYTES, +); + +INSERT INTO `dataprep-bigquery.dataprep.test_types` VALUES (1, 1.23, TRUE, '1937-01-28', '00:00:00', NULL, '1970-01-01 00:00:01.00Z', '😁😂😜', CAST('😁😂😜' AS BYTES)); +INSERT INTO `dataprep-bigquery.dataprep.test_types` VALUES (2, 234.56, NULL, '2053-07-25', '12:59:59', '2053-07-25 12:59:59', NULL, 'こんにちはЗдра́в', CAST('こんにちはЗдра́в' AS BYTES)); +INSERT INTO `dataprep-bigquery.dataprep.test_types` VALUES (NULL, NULL, FALSE, NULL, NULL, '1937-01-28 00:00:00', '2004-2-29 12:00:01.30+3:00', NULL, NULL); diff --git a/scripts/clickhouse.sql b/scripts/clickhouse.sql new file mode 100644 index 0000000..80e4b9c --- /dev/null +++ b/scripts/clickhouse.sql @@ -0,0 +1,31 @@ +DROP TABLE IF EXISTS test_table; + +CREATE TABLE IF NOT EXISTS test_table( + test_int UInt64, + test_str String +) ENGINE = MergeTree() +PRIMARY KEY test_int; + +INSERT INTO test_table VALUES (1, 'abc'); +INSERT INTO test_table VALUES (2, 
'defg'); +INSERT INTO test_table VALUES (3, 'hijkl'); +INSERT INTO test_table VALUES (4, 'mnopqr'); +INSERT INTO test_table VALUES (5, 'st'); +INSERT INTO test_table VALUES (6, 'u'); + +DROP TABLE IF EXISTS test_types; + +CREATE TABLE IF NOT EXISTS test_types( + test_int Int32, + test_float Float64, + test_date DATE, + test_datetime DATETIME, + test_decimal DECIMAL(15,2), + test_varchar VARCHAR(15), + test_char CHAR(10) +) ENGINE = MergeTree() +PRIMARY KEY test_int; + +INSERT INTO test_types VALUES (1, 2.3, '1999-07-25', '1999-07-25 23:14:07', 2.22, 'こんにちは', '0123456789'); +INSERT INTO test_types VALUES (2, 3.3, '1979-04-07', '1979-04-07 03:04:37', 3.33, 'Ha好ち😁ðy', 'abcdefghij'); +INSERT INTO test_types VALUES (3, 4.3, '1999-09-22', '1999-07-25 20:21:14', 4.44, 'b', '321'); \ No newline at end of file diff --git a/scripts/duckdb.sql b/scripts/duckdb.sql new file mode 100644 index 0000000..52d9da4 --- /dev/null +++ b/scripts/duckdb.sql @@ -0,0 +1,30 @@ +DROP TABLE IF EXISTS test_table_duckdb; +DROP TABLE IF EXISTS test_str_duckdb; + +CREATE TABLE IF NOT EXISTS test_table_duckdb( + test_int INTEGER NOT NULL, + test_nullint INTEGER, + test_str TEXT, + test_float DOUBLE PRECISION, + test_bool BOOLEAN +); +INSERT INTO test_table_duckdb VALUES (1, 3, 'str1', NULL, TRUE); +INSERT INTO test_table_duckdb VALUES (2, NULL, 'str2', 2.2, FALSE); +INSERT INTO test_table_duckdb VALUES (0, 5, 'a', 3.1, NULL); +INSERT INTO test_table_duckdb VALUES (3, 7, 'b', 3, FALSE); +INSERT INTO test_table_duckdb VALUES (4, 9, 'c', 7.8, NULL); +INSERT INTO test_table_duckdb VALUES (1314, 2, NULL, -10, TRUE); +CREATE TABLE IF NOT EXISTS test_str_duckdb( + id INTEGER NOT NULL, + test_language TEXT, + test_hello TEXT +); +INSERT INTO test_str_duckdb VALUES (0, 'English', 'Hello'); +INSERT INTO test_str_duckdb VALUES (1, '中文', '你好'); +INSERT INTO test_str_duckdb VALUES (2, '日本語', 'こんにちは'); +INSERT INTO test_str_duckdb VALUES (3, 'русский', 'Здра́вствуйте'); +INSERT INTO test_str_duckdb VALUES (4, 'Emoji', '😁😂😜'); +INSERT INTO test_str_duckdb VALUES (5, 'Latin1', '¥§¤®ð'); +INSERT INTO test_str_duckdb VALUES (6, 'Extra', 'y̆'); +INSERT INTO test_str_duckdb VALUES (7, 'Mixed', 'Ha好ち😁ðy̆'); +INSERT INTO test_str_duckdb VALUES (8, '', NULL); diff --git a/scripts/mem_monitor.sh b/scripts/mem_monitor.sh new file mode 100755 index 0000000..fc5b659 --- /dev/null +++ b/scripts/mem_monitor.sh @@ -0,0 +1,17 @@ +echo "user: $USER" +pgid=$(ps -o pgid,comm -u $USER | grep just | awk '{print $1}') +echo "pgid of command: $pgid" +max=0 + +for((i=0;i<1000000;i++)) +do + ps -o pid,ppid,pgid,comm,rss -u $USER | grep $pgid + sum=$(ps -o pid,ppid,pgid,comm,rss -u $USER | grep $pgid | awk '{sum += $NF} END {print sum}') + echo "current sum: $sum" + if (( sum > max )); then + max=$sum + fi + echo "current max: $max" + [ -z $sum ] && exit 0 || echo "continue..." 
+ sleep 2 +done diff --git a/scripts/mssql.sql b/scripts/mssql.sql new file mode 100644 index 0000000..8dbce28 --- /dev/null +++ b/scripts/mssql.sql @@ -0,0 +1,76 @@ +DROP TABLE IF EXISTS test_table; + +CREATE TABLE test_table( + test_int INTEGER NOT NULL, + test_nullint INTEGER, + test_str VARCHAR(128), + test_float FLOAT(53), + test_bool BIT +); + + +INSERT INTO test_table VALUES (1, 3, 'str1', NULL, 1); +INSERT INTO test_table VALUES (2, NULL, 'str2', 2.2, 0); +INSERT INTO test_table VALUES (0, 5, 'a', 3.1, NULL); +INSERT INTO test_table VALUES (3, 7, 'b', 3, 0); +INSERT INTO test_table VALUES (4, 9, 'c', 7.8, NULL); +INSERT INTO test_table VALUES (1314, 2, NULL, -10, 1); + +DROP TABLE IF EXISTS test_str; +CREATE TABLE test_str( + id INTEGER NOT NULL, + test_language NVARCHAR(max), + test_hello NVARCHAR(max), +); + +INSERT INTO test_str VALUES (0, N'English', N'Hello'); +INSERT INTO test_str VALUES (1, N'中文', N'你好'); +INSERT INTO test_str VALUES (2, N'日本語', N'こんにちは'); +INSERT INTO test_str VALUES (3, N'русский', N'Здра́вствуйте'); +INSERT INTO test_str VALUES (4, N'Emoji', N'😁😂😜'); +INSERT INTO test_str VALUES (5, N'Latin1', N'¥§¤®ð'); +INSERT INTO test_str VALUES (6, N'Extra', N'y̆'); +INSERT INTO test_str VALUES (7, N'Mixed', N'Ha好ち😁ðy̆'); +INSERT INTO test_str VALUES (8, N'', NULL); + + +DROP TABLE IF EXISTS test_types; + +CREATE TABLE test_types( + test_int1 TINYINT, +​ test_int2 SMALLINT, +​ test_int4 INT, +​ test_int8 BIGINT, +​ test_float24 REAL, +​ test_float53 DOUBLE PRECISION, +​ test_floatn FLOAT(18), + test_date DATE, + test_time TIME, + test_datetime DATETIMEOFFSET, + test_smalldatetime SMALLDATETIME, +​ test_naivedatetime DATETIME, +​ test_naivedatetime2 DATETIME2, + test_new_decimal NUMERIC(5, 2), + test_decimal DECIMAL, + test_varchar VARCHAR(15), + test_char CHAR(10), + test_varbinary VARBINARY(10), + test_binary BINARY(5), + test_nchar NCHAR(4), + test_text TEXT, + test_ntext NTEXT, + test_uuid UNIQUEIDENTIFIER, + test_money MONEY, + test_smallmoney SMALLMONEY +); + +INSERT INTO test_types VALUES (0, -32768, -2147483648, -9223372036854775808, NULL, NULL, NULL, '1999-07-25', '00:00:00', NULL, '1990-01-01 10:00:00', '1753-01-01 12:00:00', '1900-01-01 12:00:00.12345', 1.1, 1, NULL, NULL, NULL, NULL, '1234', 'text', 'ntext', '86b494cc-96b2-11eb-9298-3e22fbb9fe9d', NULL, NULL); +INSERT INTO test_types VALUES (255, 32767, 2147483647, 9223372036854775807, -1.18E-38, -2.23E-308, 0, NULL, '23:59:59', '2020-12-31 23:59:59 +00:00', NULL, '2038-12-31 01:00:00', NULl, 2.2, 2, 'varchar2', 'char2', CONVERT(VARBINARY(10), '1234'), CONVERT(BINARY(5), '12'), NULL, 't', 'nt', NULL, 922337203685477.5807, 214748.3647); +INSERT INTO test_types VALUES (NULL, NULL, NULL, NULL, 3.40E+38, 1.79E+308, 123.1234567, '2021-01-28', NULL, '2021-01-28 12:30:30 +01:00', '2079-06-05 23:00:00', NULL, '2027-03-18 14:30:30.54321', NULL, NULL, 'varchar3', 'char3', CONVERT(VARBINARY(10), ''), CONVERT(BINARY(5), ''), '12', NULL, NULL, '86b49b84-96b2-11eb-9298-3e22fbb9fe9d', -922337203685477.5808, -214748.3648); + +CREATE FUNCTION increment(@val int) +RETURNS int +AS +BEGIN + RETURN @val + 1; +END; \ No newline at end of file diff --git a/scripts/mysql.sql b/scripts/mysql.sql new file mode 100644 index 0000000..7906415 --- /dev/null +++ b/scripts/mysql.sql @@ -0,0 +1,69 @@ +show variables like 'char%'; + +DROP TABLE IF EXISTS test_table; + +CREATE TABLE IF NOT EXISTS test_table( + test_int INTEGER, + test_float DOUBLE, + test_enum ENUM('even', 'odd'), + test_null INTEGER +); + +INSERT INTO test_table VALUES (1, 
1.1, 'odd', NULL); +INSERT INTO test_table VALUES (2, 2.2,'even', NULL); +INSERT INTO test_table VALUES (3, 3.3, 'odd', NULL); +INSERT INTO test_table VALUES (4, 4.4, 'even', NULL); +INSERT INTO test_table VALUES (5, 5.5, 'odd', NULL); +INSERT INTO test_table VALUES (6, 6.6, 'even', NULL); + + +DROP TABLE IF EXISTS test_table_extra; + +CREATE TABLE IF NOT EXISTS test_table_extra( + test_int INTEGER, + test_str VARCHAR(30) +); + +INSERT INTO test_table_extra VALUES (1, 'Ha好ち😁ðy̆'); +INSERT INTO test_table_extra VALUES (2, 'こんにちは'); +INSERT INTO test_table_extra VALUES (3, 'русский'); + +DROP TABLE IF EXISTS test_types; + +CREATE TABLE IF NOT EXISTS test_types( + test_timestamp TIMESTAMP NULL, + test_date DATE, + test_time TIME, + test_datetime DATETIME, + test_new_decimal DECIMAL(15,2), + test_decimal DECIMAL, + test_varchar VARCHAR(15), + test_char CHAR(10), + test_tiny TINYINT, + test_short SMALLINT, + test_int24 MEDIUMINT, + test_long INT, + test_longlong BIGINT, + test_tiny_unsigned TINYINT UNSIGNED, + test_short_unsigned SMALLINT UNSIGNED, + test_int24_unsigned MEDIUMINT UNSIGNED, + test_long_unsigned INT UNSIGNED, + test_longlong_unsigned BIGINT UNSIGNED, + test_long_notnull INT NOT NULL, + test_short_unsigned_notnull SMALLINT UNSIGNED NOT NULL, + test_float FLOAT, + test_double DOUBLE, + test_double_notnull DOUBLE NOT NULL, + test_year YEAR, + test_tinyblob TINYBLOB, + test_blob BLOB, + test_mediumblob MEDIUMBLOB, + test_longblob LONGBLOB, + test_enum ENUM('apple', 'banana', 'orange', 'mango'), + test_json JSON, + test_mediumtext MEDIUMTEXT +); + +INSERT INTO test_types VALUES ('1970-01-01 00:00:01', NULL, '00:00:00', '1970-01-01 00:00:01', 1.1, 1, NULL, 'char1', -128, -32768, -8388608, -2147483648, -9223372036854775808, NULL, NULL, NULL, NULL, NULL, 1, 1, NULL, -2.2E-308, 1.2345, 1901, NULL, NULL, NULL, NULL, 'apple', '{"name": "piggy", "age": 1}', NULL); +INSERT INTO test_types VALUES ('2038-01-19 00:00:00', '1970-01-01', NULL, '2038-01-19 00:0:00', NULL, 2, 'varchar2', NULL, 127, 32767, 8388607, 2147483647, 9223372036854775807, 255, 65535, 16777215, 4294967295, 1.844674407E19, 2147483647, 65535, -1.1E-38, NULL, -1.1E-3, 2155, 'tinyblob2', 'blobblobblobblob2', 'mediumblob2', 'longblob2', NULL, '{"name": "kitty", "age": 2}', ''); +INSERT INTO test_types VALUES (NULL, '2038-01-19', '23:59:59', NULL, 3.3, NULL, 'varchar3', 'char3', NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0, 0, -2147483648, 0, 3.4E38, 1.7E308, 1.7E30, NULL, 'tinyblob3', 'blobblobblobblob3', 'mediumblob3', 'longblob3', 'mango', NULL, 'medium text!!!!'); \ No newline at end of file diff --git a/scripts/oracle.sql b/scripts/oracle.sql new file mode 100644 index 0000000..e70766e --- /dev/null +++ b/scripts/oracle.sql @@ -0,0 +1,52 @@ +DROP TABLE test_table; +DROP TABLE test_types; +DROP TABLE test_issue; + +CREATE TABLE test_table( + test_int NUMBER(7), + test_char CHAR(5), + test_float FLOAT(53) +); + +INSERT INTO test_table VALUES (1, 'str1', 1.1); +INSERT INTO test_table VALUES (2, 'str2', 2.2); +INSERT INTO test_table VALUES (2333, NULL, NULL); +INSERT INTO test_table VALUES (4, NULL, -4.44); +INSERT INTO test_table VALUES (5, 'str05', NULL); + +CREATE TABLE test_issue( + v BINARY_FLOAT +); + +INSERT INTO test_issue VALUES (1.111); +INSERT INTO test_issue VALUES (2.222); +INSERT INTO test_issue VALUES (3.333); +INSERT INTO test_issue VALUES (NULL); + + +CREATE TABLE test_types( + test_num_int NUMBER(8), + test_int INTEGER, + test_num_float NUMBER(10,1), + test_float FLOAT(38), + test_binary_float BINARY_FLOAT, + 
test_binary_double BINARY_DOUBLE, + test_char CHAR(5), + test_varchar VARCHAR2(10), + test_nchar NCHAR(6), + test_nvarchar NVARCHAR2(20), + test_date DATE, + test_timestamp TIMESTAMP, + test_timestamptz TIMESTAMP WITH TIME ZONE, + test_clob CLOB, + test_blob BLOB +); + + +INSERT INTO test_types VALUES (1, -10, 2.3, 2.34, -3.456, 9999.99991, 'char1', 'varchar1', 'y123', 'aK>?KJ@#$%', TO_DATE('2019-05-21', 'YYYY-MM-DD'), TO_TIMESTAMP('2019-05-21 01:02:33', 'YYYY-MM-DD HH24:MI:SS'), TO_TIMESTAMP_TZ('1999-12-01 11:00:00 -8:00', + 'YYYY-MM-DD HH:MI:SS TZH:TZM'), '13ab', '39af'); +INSERT INTO test_types VALUES (5, 22, -0.1, 123.455, 3.1415926535, -111111.2345, 'char2', 'varchar222', 'aab123', ')>KDS)(F*&%J', TO_DATE('2020-05-21', 'YYYY-MM-DD'), TO_TIMESTAMP('2020-05-21 01:02:33', 'YYYY-MM-DD HH24:MI:SS'), TO_TIMESTAMP_TZ('1899-12-01 11:00:00 +1:00', + 'YYYY-MM-DD HH:MI:SS TZH:TZM'), '13ab', '39af'); +INSERT INTO test_types VALUES (5, 22, -0.1, 123.455, 3.1415926535, -111111.2345, 'char2', 'varchar222', 'aab123', ')>KDS)(F*&%J', TO_DATE('2020-05-21', 'YYYY-MM-DD'), TO_TIMESTAMP('2020-05-21 01:02:33', 'YYYY-MM-DD HH24:MI:SS'), TO_TIMESTAMP_TZ('1899-12-01 11:00:00 +1:00', + 'YYYY-MM-DD HH:MI:SS TZH:TZM'), '13ab', '39af'); +INSERT INTO test_types VALUES (NULL, 100, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); diff --git a/scripts/postgres.sql b/scripts/postgres.sql new file mode 100644 index 0000000..559f222 --- /dev/null +++ b/scripts/postgres.sql @@ -0,0 +1,85 @@ +DROP TABLE IF EXISTS test_table; +DROP TABLE IF EXISTS test_str; +DROP TABLE IF EXISTS test_types; +DROP TYPE IF EXISTS happiness; +DROP EXTENSION IF EXISTS citext; +DROP EXTENSION IF EXISTS ltree; + +CREATE TABLE IF NOT EXISTS test_table( + test_int INTEGER NOT NULL, + test_nullint INTEGER, + test_str TEXT, + test_float DOUBLE PRECISION, + test_bool BOOLEAN +); + +INSERT INTO test_table VALUES (1, 3, 'str1', NULL, TRUE); +INSERT INTO test_table VALUES (2, NULL, 'str2', 2.2, FALSE); +INSERT INTO test_table VALUES (0, 5, 'a', 3.1, NULL); +INSERT INTO test_table VALUES (3, 7, 'b', 3, FALSE); +INSERT INTO test_table VALUES (4, 9, 'c', 7.8, NULL); +INSERT INTO test_table VALUES (1314, 2, NULL, -10, TRUE); + +CREATE TABLE IF NOT EXISTS test_str( + id INTEGER NOT NULL, + test_language TEXT, + test_hello TEXT +); + +INSERT INTO test_str VALUES (0, 'English', 'Hello'); +INSERT INTO test_str VALUES (1, '中文', '你好'); +INSERT INTO test_str VALUES (2, '日本語', 'こんにちは'); +INSERT INTO test_str VALUES (3, 'русский', 'Здра́вствуйте'); +INSERT INTO test_str VALUES (4, 'Emoji', '😁😂😜'); +INSERT INTO test_str VALUES (5, 'Latin1', '¥§¤®ð'); +INSERT INTO test_str VALUES (6, 'Extra', 'y̆'); +INSERT INTO test_str VALUES (7, 'Mixed', 'Ha好ち😁ðy̆'); +INSERT INTO test_str VALUES (8, '', NULL); + +CREATE TYPE happiness AS ENUM ('happy', 'very happy', 'ecstatic'); +CREATE EXTENSION citext; +CREATE EXTENSION ltree; +CREATE TABLE IF NOT EXISTS test_types( + test_date DATE, + test_timestamp TIMESTAMP, + test_timestamptz TIMESTAMPTZ, + test_int16 SMALLINT, + test_int64 BIGINT, + test_float32 REAL, + test_numeric NUMERIC(5,2), + test_bpchar BPCHAR(5), + test_char CHAR, + test_varchar VARCHAR(10), + test_uuid UUID, + test_time TIME, + test_interval INTERVAL, + test_json JSON, + test_jsonb JSONB, + test_bytea BYTEA, + test_enum happiness, + test_f4array REAL[], + test_f8array DOUBLE PRECISION[], + test_narray NUMERIC(5,2)[], + test_boolarray BOOLEAN[], + test_i2array SMALLINT[], + test_i4array Integer[], + test_i8array BIGINT[], + test_citext 
CITEXT, + test_ltree ltree, + test_lquery lquery, + test_ltxtquery ltxtquery, + test_varchararray VARCHAR[], + test_textarray TEXT[], + test_name NAME +); + +INSERT INTO test_types VALUES ('1970-01-01', '1970-01-01 00:00:01', '1970-01-01 00:00:01-00', 0, -9223372036854775808, NULL, NULL, 'a', 'a', NULL, '86b494cc-96b2-11eb-9298-3e22fbb9fe9d', '08:12:40', '1 year 2 months 3 days', '{"customer": "John Doe", "items": {"product": "Beer","qty": 6}}', '{"product": "Beer","qty": 6}', NULL, 'happy','{}', '{}', '{}', '{true, false}', '{-1, 0, 1}', '{-1, 0, 1123}', '{-9223372036854775808, 9223372036854775807}', 'str_citext', 'A.B.C.D', '*.B.*', 'A & B*',ARRAY['str1','str2'],ARRAY['text1','text2'],'0'); +INSERT INTO test_types VALUES ('2000-02-28', '2000-02-28 12:00:10', '2000-02-28 12:00:10-04', 1, 0, 3.1415926535, 521.34, 'bb', 'b', 'bb', '86b49b84-96b2-11eb-9298-3e22fbb9fe9d', NULL, '2 weeks ago', '{"customer": "Lily Bush", "items": {"product": "Diaper","qty": 24}}', '{"product": "Diaper","qty": 24}', 'Здра́вствуйте', 'very happy', NULL, NULL, NULL, '{}', '{}', '{}', '{}', '', 'A.B.E', 'A.*', 'A | B','{"0123456789","abcdefghijklmnopqrstuvwxyz","!@#$%^&*()_-+=~`:;<>?/"}','{"0123456789","abcdefghijklmnopqrstuvwxyz","!@#$%^&*()_-+=~`:;<>?/"}','21'); +INSERT INTO test_types VALUES ('2038-01-18', '2038-01-18 23:59:59', '2038-01-18 23:59:59+08', 2, 9223372036854775807, 2.71, '1e-130', 'ccc', NULL, 'c', '86b49c42-96b2-11eb-9298-3e22fbb9fe9d', '23:00:10', '3 months 2 days ago', '{"customer": "Josh William", "items": {"product": "Toy Car","qty": 1}}', '{"product": "Toy Car","qty": 1}', '', 'ecstatic', '{123.123}', '{-1e-307, 1e308}', '{521.34}', '{true}', '{-32768, 32767}', '{-2147483648, 2147483647}', '{0}', 's', 'A', '*', 'A@',ARRAY['',' '],ARRAY['',' '],'someName'); +INSERT INTO test_types VALUES (NULL, NULL, NULL, 3, NULL, 0.00, -1e-37, NULL, 'd', 'defghijklm', NULL, '18:30:00', '3 year', NULL, NULL, '😜', NULL, '{-1e-37, 1e37}', '{0.000234, -12.987654321}', '{0.12, 333.33, 22.22}', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,'{}','{}','101203203-1212323-22131235'); + +CREATE OR REPLACE FUNCTION increment(i integer) RETURNS integer AS $$ + BEGIN + RETURN i + 1; + END; +$$ LANGUAGE plpgsql; diff --git a/scripts/python-helper.py b/scripts/python-helper.py new file mode 100644 index 0000000..945530f --- /dev/null +++ b/scripts/python-helper.py @@ -0,0 +1,87 @@ +""" +Usage: + python-helper.py (copy-extension|rename-wheel) + +Options: + -h --help Show this screen. + --version Show version. 
+""" +import platform +import sys +import sysconfig +from shutil import copyfile +from pathlib import Path +import os +from docopt import docopt + +# copied from the maturin project +METADATA = { + "major": sys.version_info.major, + "minor": sys.version_info.minor, + "abiflags": sysconfig.get_config_var("ABIFLAGS"), + "interpreter": platform.python_implementation().lower(), + "ext_suffix": sysconfig.get_config_var("EXT_SUFFIX"), + "abi_tag": (sysconfig.get_config_var("SOABI") or "-").split("-")[1] or None, + "m": sysconfig.get_config_var("WITH_PYMALLOC") == 1, + "u": sysconfig.get_config_var("Py_UNICODE_SIZE") == 4, + "d": sysconfig.get_config_var("Py_DEBUG") == 1, + # This one isn't technically necessary, but still very useful for sanity checks + "platform": platform.system().lower(), + # We need this one for windows abi3 builds + "base_prefix": sys.base_prefix, +} + + + +def main() -> None: + args = docopt(__doc__) + if args["copy-extension"]: + if METADATA["platform"] == "windows": + suffix = ".dll" + src = Path("./target/release/connectorx") + elif METADATA["platform"] == "linux": + suffix = ".so" + src = Path("./target/release/libconnectorx") + elif METADATA["platform"] == "darwin": + suffix = ".dylib" + src = Path("./target/release/libconnectorx") + else: + raise NotImplementedError(f"platform '{METADATA['platform']}' not supported") + + dst = Path("./connectorx/connectorx") + copyfile(src.with_suffix(suffix), dst.with_suffix(METADATA["ext_suffix"])) + elif args["rename-wheel"]: + pyver = f"{METADATA['major']}{METADATA['minor']}" + + if METADATA["platform"] == "windows": + arch = "win_amd64" + # abitag = METADATA["abi_tag"] # this does not work on windows + if pyver == "37": + abitag = "37m" + else: + abitag = pyver + elif METADATA["platform"] == "linux": + arch = "manylinux_2_28_x86_64" + abitag = METADATA["abi_tag"] + elif METADATA["platform"] == "darwin": + arch = "macosx_10_15_intel" + abitag = METADATA["abi_tag"] + else: + raise NotImplementedError(f"platform '{platform}' not supported") + + for p in Path("./dist").iterdir(): + if p.suffix == ".whl": + pkgname, version, *rest = p.stem.split("-") + break + + + os.rename( + p, + f"./dist/{pkgname}-{version}-cp{pyver}-cp{abitag}-{arch}.whl", + ) + else: + raise ValueError(f"args not understand {args}") + +if __name__ == "__main__": + main() + diff --git a/scripts/redshift.sql b/scripts/redshift.sql new file mode 100644 index 0000000..5c1ab2a --- /dev/null +++ b/scripts/redshift.sql @@ -0,0 +1,45 @@ +DROP TABLE IF EXISTS test_table; +DROP TABLE IF EXISTS test_str; +DROP TABLE IF EXISTS test_types; + +CREATE TABLE IF NOT EXISTS test_table( + test_int INTEGER NOT NULL, + test_nullint INTEGER, + test_str TEXT, + test_float DOUBLE PRECISION, + test_bool BOOLEAN +); + +INSERT INTO test_table VALUES (1, 3, 'str1', NULL, TRUE); +INSERT INTO test_table VALUES (2, NULL, 'str2', 2.2, FALSE); +INSERT INTO test_table VALUES (0, 5, 'a', 3.1, NULL); +INSERT INTO test_table VALUES (3, 7, 'b', 3, FALSE); +INSERT INTO test_table VALUES (4, 9, 'c', 7.8, NULL); +INSERT INTO test_table VALUES (1314, 2, NULL, -10, TRUE); + +CREATE TABLE IF NOT EXISTS test_str( + id INTEGER NOT NULL, + test_language TEXT, + test_hello TEXT +); + +INSERT INTO test_str VALUES (0, 'English', 'Hello'); +INSERT INTO test_str VALUES (1, '中文', '你好'); +INSERT INTO test_str VALUES (2, '日本語', 'こんにちは'); +INSERT INTO test_str VALUES (3, 'русский', 'Здра́вствуйте'); +INSERT INTO test_str VALUES (4, 'Emoji', '😁😂😜'); +INSERT INTO test_str VALUES (5, 'Latin1', '¥§¤®ð'); +INSERT INTO 
test_str VALUES (6, 'Extra', 'y̆'); +INSERT INTO test_str VALUES (7, 'Mixed', 'Ha好ち😁ðy̆'); + +CREATE TABLE IF NOT EXISTS test_types( + test_int16 SMALLINT, + test_char CHAR, + test_time TIME, + test_datetime DATETIME +); + +INSERT INTO test_types VALUES (0, 'a', '08:12:40', '2007-01-01 10:00:19'); +INSERT INTO test_types VALUES (1, 'b', '10:03:00', '2005-01-01 22:03:00'); +INSERT INTO test_types VALUES (2, 'c', '23:00:10', NULL); +INSERT INTO test_types VALUES (3, 'd', '18:30:00', '1987-01-01 11:00:00'); diff --git a/scripts/sqlite.sql b/scripts/sqlite.sql new file mode 100644 index 0000000..d115430 --- /dev/null +++ b/scripts/sqlite.sql @@ -0,0 +1,19 @@ +DROP TABLE IF EXISTS test_table; + +CREATE TABLE IF NOT EXISTS test_table( + test_int INTEGER NOT NULL, + test_nullint INTEGER, + test_str TEXT, + test_float REAL, + test_bool BOOLEAN, + test_date DATE, + test_time TIME, + test_datetime DATETIME +); + +INSERT INTO test_table VALUES (1, 3, 'str1', NULL, True, '1996-03-13', '08:12:40', '2007-01-01 10:00:19'); +INSERT INTO test_table VALUES (2, NULL, 'str2', 2.2, False, '1996-01-30', '10:03:00', '2005-01-01 22:03:00'); +INSERT INTO test_table VALUES (0, 5, 'こんにちは', 3.1, NULL, '1996-02-28', '23:00:10', NULL); +INSERT INTO test_table VALUES (3, 7, 'b', 3, False, '2020-01-12', '23:00:10', '1987-01-01 11:00:00'); +INSERT INTO test_table VALUES (4, 9, 'Ha好ち😁ðy̆', 7.8, NULL, '1996-04-20', '18:30:00', NULL); +INSERT INTO test_table VALUES (1314, 2, NULL, -10, True, NULL, '18:30:00', '2007-10-01 10:32:00'); \ No newline at end of file
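+
+-- A quick sanity check for this seed script; a sketch assuming a local file
+-- named test.db and an installed connectorx:
+--   sqlite3 test.db < scripts/sqlite.sql
+--   python -c "import connectorx as cx, os; print(cx.read_sql('sqlite://' + os.path.abspath('test.db'), 'SELECT * FROM test_table'))"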