From 0c3084902fcb6efbff519320056b21d8bc1a9705 Mon Sep 17 00:00:00 2001
From: David Riordan
Date: Mon, 9 Sep 2024 17:21:45 -0400
Subject: [PATCH 1/2] a LLM wrote this for me; please dont kill me

---
 Cargo.lock      | 118 ++++++++++++++++++++++++++++++++++++++++++++----
 Cargo.toml      |   2 +-
 Makefile        |   9 +++-
 cli/src/main.rs | 117 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 234 insertions(+), 12 deletions(-)
 create mode 100644 cli/src/main.rs

diff --git a/Cargo.lock b/Cargo.lock
index 01a85e7..7a72b73 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -149,7 +149,7 @@ dependencies = [
  "arrow-schema",
  "chrono",
  "half 2.3.1",
- "hashbrown",
+ "hashbrown 0.14.0",
  "num",
 ]
 
@@ -239,7 +239,7 @@ dependencies = [
  "arrow-schema",
  "chrono",
  "half 2.3.1",
- "indexmap",
+ "indexmap 2.0.0",
  "lexical-core",
  "num",
  "serde",
@@ -273,7 +273,7 @@ dependencies = [
  "arrow-data",
  "arrow-schema",
  "half 2.3.1",
- "hashbrown",
+ "hashbrown 0.14.0",
 ]
 
 [[package]]
@@ -314,6 +314,17 @@ dependencies = [
  "regex-syntax",
 ]
 
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi 0.1.19",
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -382,9 +393,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
 
 [[package]]
 name = "bytes"
-version = "1.5.0"
+version = "1.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
+checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50"
 
 [[package]]
 name = "cast"
@@ -447,6 +458,21 @@ dependencies = [
  "half 1.8.2",
 ]
 
+[[package]]
+name = "clap"
+version = "3.2.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
+dependencies = [
+ "atty",
+ "bitflags 1.3.2",
+ "clap_lex 0.2.4",
+ "indexmap 1.9.3",
+ "strsim",
+ "termcolor",
+ "textwrap",
+]
+
 [[package]]
 name = "clap"
 version = "4.4.4"
@@ -463,7 +489,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5179bb514e4d7c2051749d8fcefa2ed6d06a9f4e6d69faf3805f5d80b8cf8d56"
 dependencies = [
  "anstyle",
- "clap_lex",
+ "clap_lex 0.5.1",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5"
+dependencies = [
+ "os_str_bytes",
 ]
 
 [[package]]
@@ -527,7 +562,7 @@ dependencies = [
  "anes",
  "cast",
  "ciborium",
- "clap",
+ "clap 4.4.4",
  "criterion-plot",
  "is-terminal",
  "itertools",
@@ -860,12 +895,27 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+
 [[package]]
 name = "hashbrown"
 version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
 
+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "hermit-abi"
 version = "0.3.3"
@@ -914,6 +964,16 @@ dependencies = [
  "unicode-normalization",
 ]
 
+[[package]]
+name = "indexmap"
+version = "1.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
+dependencies = [
+ "autocfg",
+ "hashbrown 0.12.3",
+]
+
 [[package]]
 name = "indexmap"
 version = "2.0.0"
@@ -921,7 +981,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d"
 dependencies = [
  "equivalent",
- "hashbrown",
+ "hashbrown 0.14.0",
 ]
 
 [[package]]
@@ -942,7 +1002,7 @@ version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.3.3",
  "rustix",
  "windows-sys",
 ]
@@ -1235,6 +1295,12 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "os_str_bytes"
+version = "6.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1"
+
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -1277,7 +1343,7 @@ dependencies = [
  "bytes",
  "chrono",
  "flate2",
- "hashbrown",
+ "hashbrown 0.14.0",
  "lz4",
  "num",
  "num-bigint",
@@ -1320,6 +1386,17 @@ dependencies = [
  "ureq",
 ]
 
+[[package]]
+name = "pgpq-cli"
+version = "0.1.0"
+dependencies = [
+ "arrow",
+ "bytes",
+ "clap 3.2.25",
+ "parquet",
+ "pgpq",
+]
+
 [[package]]
 name = "pin-project-lite"
 version = "0.2.13"
@@ -1807,6 +1884,12 @@ dependencies = [
  "unicode-normalization",
 ]
 
+[[package]]
+name = "strsim"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+
 [[package]]
 name = "subtle"
 version = "2.5.0"
@@ -1841,6 +1924,21 @@ version = "0.12.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9d0e916b1148c8e263850e1ebcbd046f333e0683c724876bb0da63ea4373dc8a"
 
+[[package]]
+name = "termcolor"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "textwrap"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9"
+
 [[package]]
 name = "thiserror"
 version = "1.0.48"
diff --git a/Cargo.toml b/Cargo.toml
index 994cced..b1fb619 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["core", "py", "json"]
+members = ["core", "py", "json", "cli"]
 resolver = "2"
 
 [profile.bench.package.pgpq]
diff --git a/Makefile b/Makefile
index 947f51d..179d9a9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-PHONY: init build test
+.PHONY: init build-develop build-cli install-cli test lint
 
 .init:
 	rm -rf .venv
@@ -15,6 +15,7 @@ init: .clean .init
 build-develop: .init
 	. ./.venv/bin/activate && maturin develop -m py/Cargo.toml
 	. ./.venv/bin/activate && maturin develop -m json/Cargo.toml
+	cargo build --package pgpq-cli
 
 test: build-develop
 	cargo test
@@ -22,3 +23,9 @@ test: build-develop
 
 lint: build-develop
 	./.venv/bin/pre-commit run --all-files
+
+build-cli:
+	cargo build --package pgpq-cli --release
+
+install-cli: build-cli
+	cp target/release/pgpq /usr/local/bin/
diff --git a/cli/src/main.rs b/cli/src/main.rs
new file mode 100644
index 0000000..4957250
--- /dev/null
+++ b/cli/src/main.rs
@@ -0,0 +1,117 @@
+//! Command-line front end for `pgpq`: reads a Parquet file and writes it to
+//! standard output in PostgreSQL's binary `COPY` format.
+
+use arrow::record_batch::RecordBatch;
+use bytes::{Bytes, BytesMut};
+use clap::{App, Arg};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
+use parquet::file::reader::ChunkReader;
+use pgpq::ArrowToPostgresBinaryEncoder;
+use std::error::Error;
+use std::fs::File;
+use std::io::{self, Read, Write};
+
+/// Number of rows requested per record batch when decoding Parquet.
+const BATCH_SIZE: usize = 8192;
+
+/// Error type shared by every fallible function in this binary.
+type BoxError = Box<dyn Error>;
+
+/// Parses the command line, decodes the input and streams the encoded header,
+/// batches and trailer to standard output.
+///
+/// # Errors
+///
+/// Fails if the input cannot be read or decoded, if it yields no record
+/// batches, if a batch cannot be encoded, or if writing to stdout fails.
+fn main() -> Result<(), BoxError> {
+    let matches = App::new("pgpq")
+        .version(env!("CARGO_PKG_VERSION"))
+        .about("Converts Parquet files to PostgreSQL binary format")
+        .arg(
+            Arg::new("input")
+                .short('i')
+                .long("input")
+                .value_name("FILE")
+                .help("Input Parquet file (use '-' for stdin)")
+                .takes_value(true)
+                .required(true),
+        )
+        .get_matches();
+
+    let input = matches
+        .value_of("input")
+        .expect("clap rejects invocations without the required `input` argument");
+
+    let record_batches = if input == "-" {
+        read_parquet_from_stdin()?
+    } else {
+        read_parquet_from_file(input)?
+    };
+
+    // The encoder is built from the first batch's schema. Report input without
+    // any batches as an error instead of panicking on an out-of-bounds index.
+    let schema = record_batches
+        .first()
+        .ok_or("input contains no record batches; nothing to encode")?
+        .schema();
+
+    let mut encoder = ArrowToPostgresBinaryEncoder::try_new(&schema)?;
+    let mut buffer = BytesMut::new();
+    // Lock stdout once rather than once per write.
+    let mut stdout = io::stdout().lock();
+
+    encoder.write_header(&mut buffer);
+    stdout.write_all(&buffer)?;
+    buffer.clear();
+
+    // One scratch buffer is reused for every batch.
+    for batch in &record_batches {
+        encoder.write_batch(batch, &mut buffer)?;
+        stdout.write_all(&buffer)?;
+        buffer.clear();
+    }
+
+    // Emit the end-of-data trailer that terminates the binary COPY stream.
+    encoder.finish(&mut buffer)?;
+    stdout.write_all(&buffer)?;
+    stdout.flush()?;
+
+    Ok(())
+}
+
+/// Decodes every record batch available from `source`.
+///
+/// # Errors
+///
+/// Fails if `source` is not valid Parquet or a batch cannot be decoded.
+fn read_record_batches<R>(source: R) -> Result<Vec<RecordBatch>, BoxError>
+where
+    R: ChunkReader + 'static,
+{
+    let reader = ParquetRecordBatchReader::try_new(source, BATCH_SIZE)?;
+    // Collecting into a `Result` stops at the first batch that fails to decode.
+    Ok(reader.collect::<Result<Vec<_>, _>>()?)
+}
+
+/// Reads all of standard input and decodes it as Parquet.
+///
+/// # Errors
+///
+/// Fails if stdin cannot be read or does not contain valid Parquet.
+fn read_parquet_from_stdin() -> Result<Vec<RecordBatch>, BoxError> {
+    // The Parquet reader needs random access to its input and stdin cannot
+    // seek, so the whole stream is buffered in memory first.
+    let mut buffer = Vec::new();
+    io::stdin().lock().read_to_end(&mut buffer)?;
+    read_record_batches(Bytes::from(buffer))
+}
+
+/// Opens the file at `path` and decodes it as Parquet.
+///
+/// # Errors
+///
+/// Fails if the file cannot be opened or does not contain valid Parquet.
+fn read_parquet_from_file(path: &str) -> Result<Vec<RecordBatch>, BoxError> {
+    read_record_batches(File::open(path)?)
+}

From 229f00e1be2befc10a529399761df94893d4664e Mon Sep 17 00:00:00 2001
From: David Riordan
Date: Mon, 9 Sep 2024 17:26:45 -0400
Subject: [PATCH 2/2] realized I was missing the cli folder

---
 .gitignore     |  1 +
 cli/Cargo.toml | 15 +++++++++++++++
 2 files changed, 16 insertions(+)
 create mode 100644 cli/Cargo.toml

diff --git a/.gitignore b/.gitignore
index bc33d05..18978c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@
 !/.github
 !/LICENSE.txt
 !/.gitattributes
+!/cli
 
 __pycache__
 *.so
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
new file mode 100644
index 0000000..d323baa
--- /dev/null
+++ b/cli/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "pgpq-cli"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+pgpq = { path = "../core" }
+clap = "3.0"
+arrow = "46.0.0"
+parquet = "46.0.0"
+bytes = "1.7.1"
+
+[[bin]]
+name = "pgpq"
+path = "src/main.rs"