From 935f9f028046616285944a45014c830096e48cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Kr=C3=B6tzsch?= Date: Thu, 5 Sep 2024 14:50:35 +0200 Subject: [PATCH 01/11] typos --- nemo-physical/src/management/bytesized.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nemo-physical/src/management/bytesized.rs b/nemo-physical/src/management/bytesized.rs index 57ce1f782..b4b0d5cc9 100644 --- a/nemo-physical/src/management/bytesized.rs +++ b/nemo-physical/src/management/bytesized.rs @@ -1,5 +1,6 @@ //! This module defines the trait [ByteSized], -//! which should be implemented by objects +//! which should be implemented by types that can +//! calculate their own size. use bytesize::ByteSize; @@ -9,8 +10,8 @@ pub trait ByteSized { fn size_bytes(&self) -> ByteSize; } -/// Helper method to sum a collection of [ByteSize], -/// since the `Sum`is not implemented. +/// Helper method to sum up a collection of [ByteSize], +/// since the `Sum` is not implemented. pub(crate) fn sum_bytes>( iterator: ByteIterator, ) -> ByteSize { From 015e6e65a772bacb3519e7c47cfcd567ded23820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Kr=C3=B6tzsch?= Date: Thu, 5 Sep 2024 17:22:58 +0200 Subject: [PATCH 02/11] provide basic byte counting methods for buffers --- nemo-physical/src/dictionary/bytes_buffer.rs | 36 +++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/nemo-physical/src/dictionary/bytes_buffer.rs b/nemo-physical/src/dictionary/bytes_buffer.rs index 2eee29dd4..f78545e64 100644 --- a/nemo-physical/src/dictionary/bytes_buffer.rs +++ b/nemo-physical/src/dictionary/bytes_buffer.rs @@ -19,7 +19,7 @@ const PAGE_SIZE: usize = 1 << PAGE_ADDR_BITS; /// Buffers might be dropped, upon which all of its pages will be freed. There is no other way /// of removing contents from a buffer. 
/// -/// Individual pages have a size of at most [PAGE_SIZE`] bytes, so that [`PAGE_ADDR_BITS] +/// Individual pages have a size of at most [PAGE_SIZE] bytes, so that [PAGE_ADDR_BITS] /// are needed to specify a position within a page. References to buffered strings are represented /// by [BytesRef], which stores a starting address and length of the slice. The `usize` starting /// address is global (uniform for all buffers), with the lower [PAGE_ADDR_BITS] bits encoding a position within a page, @@ -30,9 +30,9 @@ const PAGE_SIZE: usize = 1 << PAGE_ADDR_BITS; /// The number of bits reserved for length is [BYTESREF_BYTES_LENGTH_BITS], which should always be less /// than [PAGE_ADDR_BITS] since longer tuples would not fit any buffer page anyway. /// -/// The implementaion can be used in multiple parallel threads. +/// The implementation can be used in multiple parallel threads. /// -/// Note: The multi-thrading support is based on aggressive locking of all major operations. It might be +/// Note: The multi-threading support is based on aggressive locking of all major operations. It might be /// possible to reduce the amount of locking by designing more careful data structures. For example, locking /// could be limited to the rare page-writing operations if Vectors would not move existing entries on (some) /// writes, which causes races that may lead to reading errors unless all reads are also locked. @@ -121,7 +121,7 @@ impl BytesBuffer { } } - /// Acquire the lock that we use for operations that read or write any of the internal data + /// Acquires the lock that we use for operations that read or write any of the internal data /// structures that multiple buffers might use. fn acquire_page_lock(&self) { while self @@ -131,7 +131,7 @@ impl BytesBuffer { {} } - /// Release the lock. + /// Releases the lock. 
fn release_page_lock(&self) { self.lock.store(false, Ordering::Release); } @@ -142,6 +142,24 @@ impl BytesBuffer { self.release_page_lock(); result } + + /// Computes and returns the overall number of bytes that this [BytesBuffer] occupies. + fn size_bytes(&self) -> u64 { + (self.pages.len() * (size_of::<(usize, Vec)>() + PAGE_SIZE) + + self.cur_pages.len() * size_of::() + + size_of::()) as u64 + } + + /// Computes and returns the overall number of bytes that have been alocated for managing + /// a specific buffer. This includes management data for that buffer but no shared management + /// data for the [BytesBuffer] as such. + fn buffer_size_bytes(&self, buffer: usize) -> u64 { + let page_count: usize = + self.pages + .iter() + .fold(0, |acc, x| if x.0 == buffer { acc + 1 } else { acc }); + (page_count * (size_of::<(usize, Vec)>() + PAGE_SIZE) + size_of::()) as u64 + } } /// Trait to encapsulate (static) functions for accessing a single [BytesBuffer]. @@ -186,6 +204,14 @@ pub(crate) unsafe trait GlobalBytesBuffer: Debug + Sized { BytesBuffer::drop_buffer(&mut *Self::get(), buffer); } } + + /// Computes and returns the overall number of bytes that have been alocated for managing + /// a specific buffer. + fn buffer_size_bytes(buffer: usize) -> u64 { + unsafe { + BytesBuffer::buffer_size_bytes(&*Self::get(), buffer) + } + } } /// Number of bits reserved for encoding the length of referenced byte arrays. 
From 353cc53067838b4c296417d0e54c01a2a6901010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Kr=C3=B6tzsch?= Date: Thu, 5 Sep 2024 17:23:15 +0200 Subject: [PATCH 03/11] simplify ByteSized to use u64 to count bytes --- nemo-physical/src/columnar/column.rs | 6 ++---- nemo-physical/src/columnar/column/rle.rs | 6 ++---- nemo-physical/src/columnar/column/vector.rs | 8 +++----- nemo-physical/src/columnar/intervalcolumn.rs | 5 ++--- .../intervalcolumn/interval_lookup.rs | 2 +- .../interval_lookup/lookup_column.rs | 4 +--- .../src/datatypes/run_length_encodable.rs | 13 ++++++------ nemo-physical/src/management/bytesized.rs | 20 ++++--------------- nemo-physical/src/management/database.rs | 2 +- .../src/management/database/order.rs | 6 +++--- .../src/management/database/sources.rs | 16 ++++++--------- .../src/management/database/storage.rs | 10 ++++------ nemo-physical/src/tabular/trie.rs | 6 +++--- nemo/src/io/formats/dsv_reader.rs | 5 ++--- nemo/src/io/formats/json_reader.rs | 5 ++--- nemo/src/io/formats/rdf_reader.rs | 5 ++--- 16 files changed, 44 insertions(+), 75 deletions(-) diff --git a/nemo-physical/src/columnar/column.rs b/nemo-physical/src/columnar/column.rs index f5083fc8a..3a533cca6 100644 --- a/nemo-physical/src/columnar/column.rs +++ b/nemo-physical/src/columnar/column.rs @@ -7,8 +7,6 @@ pub(crate) mod vector; use std::{fmt::Debug, mem::size_of}; -use bytesize::ByteSize; - use crate::{ datatypes::{ColumnDataType, RunLengthEncodable}, generate_forwarder, @@ -82,8 +80,8 @@ where } impl ByteSized for ColumnEnum { - fn size_bytes(&self) -> ByteSize { + fn size_bytes(&self) -> u64 { let size_column = forward_to_column!(self, size_bytes); - ByteSize::b(size_of::() as u64) + size_column + size_of::() as u64 + size_column } } diff --git a/nemo-physical/src/columnar/column/rle.rs b/nemo-physical/src/columnar/column/rle.rs index cc49d7b35..8ec0715db 100644 --- a/nemo-physical/src/columnar/column/rle.rs +++ b/nemo-physical/src/columnar/column/rle.rs @@ -2,8 +2,6 @@ 
use std::{fmt::Debug, mem::size_of, num::NonZeroUsize, ops::Range}; -use bytesize::ByteSize; - use crate::{ columnar::{columnbuilder::rle::RleElement, columnscan::ColumnScan}, datatypes::{ColumnDataType, RunLengthEncodable}, @@ -134,13 +132,13 @@ where } impl ByteSized for ColumnRle { - fn size_bytes(&self) -> ByteSize { + fn size_bytes(&self) -> u64 { let size_values = size_of::() as u64 * self.values.capacity() as u64; let size_end_indices = size_of::() as u64 * self.end_indices.capacity() as u64; let size_increments = size_of::() as u64 * self.increments.capacity() as u64; - ByteSize::b(size_of::() as u64 + size_values + size_end_indices + size_increments) + size_of::() as u64 + size_values + size_end_indices + size_increments } } diff --git a/nemo-physical/src/columnar/column/vector.rs b/nemo-physical/src/columnar/column/vector.rs index c28a84d84..74b501977 100644 --- a/nemo-physical/src/columnar/column/vector.rs +++ b/nemo-physical/src/columnar/column/vector.rs @@ -6,8 +6,6 @@ use std::{ ops::{Index, Range}, }; -use bytesize::ByteSize; - use crate::{columnar::columnscan::ColumnScan, management::bytesized::ByteSized}; use super::Column; @@ -53,9 +51,9 @@ impl Index for ColumnVector { } impl ByteSized for ColumnVector { - fn size_bytes(&self) -> ByteSize { - // We cast everything to u64 separately to avoid overflows - ByteSize::b(size_of::() as u64 + self.data.capacity() as u64 * size_of::() as u64) + fn size_bytes(&self) -> u64 { + // cast everything to u64 separately to avoid overflows + size_of::() as u64 + self.data.capacity() as u64 * size_of::() as u64 } } diff --git a/nemo-physical/src/columnar/intervalcolumn.rs b/nemo-physical/src/columnar/intervalcolumn.rs index 2a84c1f44..a86a77963 100644 --- a/nemo-physical/src/columnar/intervalcolumn.rs +++ b/nemo-physical/src/columnar/intervalcolumn.rs @@ -7,7 +7,6 @@ pub(crate) mod interval_lookup; use std::ops::Range; -use bytesize::ByteSize; use delegate::delegate; use crate::{ @@ -93,7 +92,7 @@ where T: 
ColumnDataType, LookupMethod: IntervalLookup, { - fn size_bytes(&self) -> ByteSize { + fn size_bytes(&self) -> u64 { self.data.size_bytes() + self.intervals.size_bytes() + self.interval_lookup.size_bytes() } } @@ -199,7 +198,7 @@ impl ByteSized for IntervalColumnT where LookupMethod: IntervalLookup, { - fn size_bytes(&self) -> ByteSize { + fn size_bytes(&self) -> u64 { self.column_id32.size_bytes() + self.column_id64.size_bytes() + self.column_int64.size_bytes() diff --git a/nemo-physical/src/columnar/intervalcolumn/interval_lookup.rs b/nemo-physical/src/columnar/intervalcolumn/interval_lookup.rs index f77f86b39..8e2b9db30 100644 --- a/nemo-physical/src/columnar/intervalcolumn/interval_lookup.rs +++ b/nemo-physical/src/columnar/intervalcolumn/interval_lookup.rs @@ -59,7 +59,7 @@ impl ByteSized for IntervalLookupT where LookupMethod: IntervalLookup, { - fn size_bytes(&self) -> bytesize::ByteSize { + fn size_bytes(&self) -> u64 { self.lookup_id32.size_bytes() + self.lookup_id64.size_bytes() + self.lookup_int64.size_bytes() diff --git a/nemo-physical/src/columnar/intervalcolumn/interval_lookup/lookup_column.rs b/nemo-physical/src/columnar/intervalcolumn/interval_lookup/lookup_column.rs index 455a52eed..f90caa2eb 100644 --- a/nemo-physical/src/columnar/intervalcolumn/interval_lookup/lookup_column.rs +++ b/nemo-physical/src/columnar/intervalcolumn/interval_lookup/lookup_column.rs @@ -1,8 +1,6 @@ //! This module implements [IntervalLookupColumn] //! and the associated builder [IntervalLookupColumnBuilder]. 
-use bytesize::ByteSize; - use crate::{ columnar::{ column::{Column, ColumnEnum}, @@ -57,7 +55,7 @@ impl IntervalLookup for IntervalLookupColumn { } impl ByteSized for IntervalLookupColumn { - fn size_bytes(&self) -> ByteSize { + fn size_bytes(&self) -> u64 { self.lookup.size_bytes() } } diff --git a/nemo-physical/src/datatypes/run_length_encodable.rs b/nemo-physical/src/datatypes/run_length_encodable.rs index 90e67c26b..b5e256599 100644 --- a/nemo-physical/src/datatypes/run_length_encodable.rs +++ b/nemo-physical/src/datatypes/run_length_encodable.rs @@ -3,7 +3,6 @@ use std::{ ops::{Add, Sub}, }; -use bytesize::ByteSize; use num::Zero; use crate::management::bytesized::ByteSized; @@ -64,8 +63,8 @@ impl IntStep { } impl ByteSized for IntStep { - fn size_bytes(&self) -> ByteSize { - ByteSize::b(size_of::() as u64) + fn size_bytes(&self) -> u64 { + size_of::() as u64 } } @@ -166,8 +165,8 @@ impl RunLengthEncodable for i64 { pub(crate) struct SmallIntStep(i8); impl ByteSized for SmallIntStep { - fn size_bytes(&self) -> ByteSize { - ByteSize::b(size_of::() as u64) + fn size_bytes(&self) -> u64 { + size_of::() as u64 } } @@ -213,7 +212,7 @@ impl RunLengthEncodable for i8 { pub(crate) struct FloatingStep; impl ByteSized for FloatingStep { - fn size_bytes(&self) -> ByteSize { - ByteSize::b(0) + fn size_bytes(&self) -> u64 { + 0 } } diff --git a/nemo-physical/src/management/bytesized.rs b/nemo-physical/src/management/bytesized.rs index b4b0d5cc9..793ab1705 100644 --- a/nemo-physical/src/management/bytesized.rs +++ b/nemo-physical/src/management/bytesized.rs @@ -2,24 +2,12 @@ //! which should be implemented by types that can //! calculate their own size. -use bytesize::ByteSize; - -/// Objects that are able calculate their current size in bytes +/// Objects that are able calculate their current approximate size in bytes. +/// +/// We use `u64` rather than `usize` here to avoid overflows in case of overestimations. 
pub trait ByteSized { /// Return the number of bytes this object consumes - fn size_bytes(&self) -> ByteSize; + fn size_bytes(&self) -> u64; } -/// Helper method to sum up a collection of [ByteSize], -/// since the `Sum` is not implemented. -pub(crate) fn sum_bytes>( - iterator: ByteIterator, -) -> ByteSize { - let mut result = ByteSize::b(0); - - for byte_size in iterator { - result += byte_size; - } - result -} diff --git a/nemo-physical/src/management/database.rs b/nemo-physical/src/management/database.rs index 9362c28e3..1cae5423a 100644 --- a/nemo-physical/src/management/database.rs +++ b/nemo-physical/src/management/database.rs @@ -558,7 +558,7 @@ impl DatabaseInstance { } impl ByteSized for DatabaseInstance { - fn size_bytes(&self) -> ByteSize { + fn size_bytes(&self) -> u64 { // TODO: Add size of the dictionary self.reference_manager.size_bytes() } diff --git a/nemo-physical/src/management/database/order.rs b/nemo-physical/src/management/database/order.rs index 8d7bf7488..55c2f3bfc 100644 --- a/nemo-physical/src/management/database/order.rs +++ b/nemo-physical/src/management/database/order.rs @@ -10,7 +10,7 @@ use bytesize::ByteSize; use crate::{ error::Error, management::{ - bytesized::sum_bytes, bytesized::ByteSized, execution_plan::ColumnOrder, + bytesized::ByteSized, execution_plan::ColumnOrder, util::closest_order, }, meta::timing::TimedCode, @@ -326,8 +326,8 @@ impl OrderedReferenceManager { } impl ByteSized for OrderedReferenceManager { - fn size_bytes(&self) -> ByteSize { - sum_bytes(self.stored_tables.iter().map(|table| table.size_bytes())) + fn size_bytes(&self) -> u64 { + self.stored_tables.iter().fold(0, |acc, table| acc+table.size_bytes()) } } diff --git a/nemo-physical/src/management/database/sources.rs b/nemo-physical/src/management/database/sources.rs index 3e06a63a2..4f2db57fe 100644 --- a/nemo-physical/src/management/database/sources.rs +++ b/nemo-physical/src/management/database/sources.rs @@ -2,8 +2,6 @@ use std::{error::Error, 
fmt::Display, mem::size_of}; -use bytesize::ByteSize; - use crate::{ datasources::{table_providers::TableProvider, tuple_writer::TupleWriter}, datavalues::AnyDataValue, @@ -57,12 +55,10 @@ impl TableProvider for SimpleTable { } impl ByteSized for SimpleTable { - fn size_bytes(&self) -> ByteSize { - // We cast everything to u64 separately to avoid overflows - ByteSize::b( - size_of::() as u64 - + self.data.capacity() as u64 * size_of::() as u64, - ) + fn size_bytes(&self) -> u64 { + // cast everything to u64 separately to avoid overflows + size_of::() as u64 + + self.data.capacity() as u64 * size_of::() as u64 } } @@ -107,7 +103,7 @@ impl Display for TableSource { } impl ByteSized for TableSource { - fn size_bytes(&self) -> ByteSize { - ByteSize::b(size_of::() as u64) + self.provider.size_bytes() + fn size_bytes(&self) -> u64 { + size_of::() as u64 + self.provider.size_bytes() } } diff --git a/nemo-physical/src/management/database/storage.rs b/nemo-physical/src/management/database/storage.rs index cbfc650ce..0dab80586 100644 --- a/nemo-physical/src/management/database/storage.rs +++ b/nemo-physical/src/management/database/storage.rs @@ -4,12 +4,10 @@ use std::cell::RefCell; -use bytesize::ByteSize; - use crate::{ datasources::tuple_writer::TupleWriter, error::{Error, ReadingError}, - management::{bytesized::sum_bytes, bytesized::ByteSized}, + management::bytesized::ByteSized, tabular::trie::Trie, }; @@ -114,13 +112,13 @@ impl TableStorage { } impl ByteSized for TableStorage { - fn size_bytes(&self) -> ByteSize { + fn size_bytes(&self) -> u64 { match self { TableStorage::InMemory(trie) => trie.size_bytes(), TableStorage::FromSources(sources) => { - sum_bytes(sources.iter().map(|source| source.size_bytes())) + sources.iter().fold(0, |acc, source| acc+source.size_bytes()) } - TableStorage::Empty => ByteSize::b(0), + TableStorage::Empty => 0, } } } diff --git a/nemo-physical/src/tabular/trie.rs b/nemo-physical/src/tabular/trie.rs index cae49db27..5d75f9ae7 100644 --- 
a/nemo-physical/src/tabular/trie.rs +++ b/nemo-physical/src/tabular/trie.rs @@ -18,7 +18,7 @@ use crate::{ storage_type_name::{StorageTypeBitSet, STORAFE_TYPES}, StorageTypeName, StorageValueT, }, - management::bytesized::{sum_bytes, ByteSized}, + management::bytesized::ByteSized, tabular::{buffer::tuple_buffer::TupleBuffer, rowscan::RowScan}, util::bitset::BitSet, }; @@ -519,8 +519,8 @@ impl Trie { } impl ByteSized for Trie { - fn size_bytes(&self) -> bytesize::ByteSize { - sum_bytes(self.columns.iter().map(|column| column.size_bytes())) + fn size_bytes(&self) -> u64 { + self.columns.iter().fold(0, |acc, column| acc + column.size_bytes()) } } diff --git a/nemo/src/io/formats/dsv_reader.rs b/nemo/src/io/formats/dsv_reader.rs index 90e7d5186..091c39905 100644 --- a/nemo/src/io/formats/dsv_reader.rs +++ b/nemo/src/io/formats/dsv_reader.rs @@ -3,7 +3,6 @@ use std::io::BufRead; use std::mem::size_of; -use bytesize::ByteSize; use csv::{Reader, ReaderBuilder}; use nemo_physical::management::bytesized::ByteSized; @@ -139,8 +138,8 @@ impl std::fmt::Debug for DsvReader { } impl ByteSized for DsvReader { - fn size_bytes(&self) -> ByteSize { - ByteSize::b(size_of::() as u64) + fn size_bytes(&self) -> u64 { + size_of::() as u64 } } diff --git a/nemo/src/io/formats/json_reader.rs b/nemo/src/io/formats/json_reader.rs index fa97f5d4e..7179266eb 100644 --- a/nemo/src/io/formats/json_reader.rs +++ b/nemo/src/io/formats/json_reader.rs @@ -3,7 +3,6 @@ use std::{fmt::Debug, io::BufRead, mem::size_of}; -use bytesize::ByteSize; use nemo_physical::{ datasources::{table_providers::TableProvider, tuple_writer::TupleWriter}, datavalues::AnyDataValue, @@ -28,8 +27,8 @@ impl Debug for JsonReader { } impl ByteSized for JsonReader { - fn size_bytes(&self) -> ByteSize { - ByteSize::b(size_of::() as u64) + fn size_bytes(&self) -> u64 { + size_of::() as u64 } } diff --git a/nemo/src/io/formats/rdf_reader.rs b/nemo/src/io/formats/rdf_reader.rs index b9cfa9d8a..3791c6a90 100644 --- 
a/nemo/src/io/formats/rdf_reader.rs +++ b/nemo/src/io/formats/rdf_reader.rs @@ -1,6 +1,5 @@ //! Reader for various RDF formats, which supports triples files (N-Triples, Turtle, RDF/XML) and //! quads files (N-Quads, TriG). -use bytesize::ByteSize; use nemo_physical::{ datasources::{table_providers::TableProvider, tuple_writer::TupleWriter}, datavalues::{AnyDataValue, DataValueCreationError}, @@ -343,8 +342,8 @@ impl std::fmt::Debug for RdfReader { } impl ByteSized for RdfReader { - fn size_bytes(&self) -> ByteSize { - ByteSize::b(size_of::() as u64) + fn size_bytes(&self) -> u64 { + size_of::() as u64 } } From c3fb53d50279095acc7be11bdc0708848faabba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Kr=C3=B6tzsch?= Date: Thu, 5 Sep 2024 17:32:43 +0200 Subject: [PATCH 04/11] remove bytesize dependency --- Cargo.lock | 8 -------- nemo-physical/Cargo.toml | 1 - nemo-physical/src/management/database.rs | 8 +++----- nemo-physical/src/management/database/order.rs | 8 +++----- nemo/Cargo.toml | 1 - nemo/src/table_manager.rs | 12 +++++------- 6 files changed, 11 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3a2bf0955..1b4ec8c49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -267,12 +267,6 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" -[[package]] -name = "bytesize" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e368af43e418a04d52505cf3dbc23dda4e3407ae2fa99fd0e4f308ce546acc" - [[package]] name = "cc" version = "1.0.98" @@ -1157,7 +1151,6 @@ dependencies = [ "ascii_tree", "assert_fs", "bytecount", - "bytesize", "csv", "dyn-clone", "env_logger 0.11.3", @@ -1212,7 +1205,6 @@ dependencies = [ "arbitrary", "ascii_tree", "bitvec", - "bytesize", "delegate", "enum_dispatch", "env_logger 0.11.3", diff --git a/nemo-physical/Cargo.toml b/nemo-physical/Cargo.toml index 70700e0fb..8799072a3 
100644 --- a/nemo-physical/Cargo.toml +++ b/nemo-physical/Cargo.toml @@ -24,7 +24,6 @@ path = "src/benches/dict-bench.rs" [dependencies] enum_dispatch = "0.3.12" log = "0.4" -bytesize = "1.2" thiserror = "1.0" num = "0.4.0" ascii_tree = "0.1.1" diff --git a/nemo-physical/src/management/database.rs b/nemo-physical/src/management/database.rs index 1cae5423a..b143ca95c 100644 --- a/nemo-physical/src/management/database.rs +++ b/nemo-physical/src/management/database.rs @@ -16,8 +16,6 @@ use std::{ fmt::Debug, }; -use bytesize::ByteSize; - use crate::{ datasources::table_providers::TableProvider, datavalues::AnyDataValue, @@ -120,13 +118,13 @@ impl DatabaseInstance { self.dictionary.borrow_mut() } - /// Return the amount of memory consumed by the table under the given [PermanentTableId]. + /// Returns the approximate number of bytes of memory used by the table under the given [PermanentTableId]. /// This also includes additional index structures but excludes tables that are currently stored on disk. /// /// # Panics /// Panics if the given id does not exist. - pub fn memory_consumption(&self, id: PermanentTableId) -> ByteSize { - self.reference_manager.memory_consumption(id) + pub fn table_size_bytes(&self, id: PermanentTableId) -> u64 { + self.reference_manager.table_size_bytes(id) } /// Return the number of rows contained in this table. diff --git a/nemo-physical/src/management/database/order.rs b/nemo-physical/src/management/database/order.rs index 55c2f3bfc..6a0c5faad 100644 --- a/nemo-physical/src/management/database/order.rs +++ b/nemo-physical/src/management/database/order.rs @@ -5,8 +5,6 @@ use std::{ collections::{hash_map::Entry, HashMap}, }; -use bytesize::ByteSize; - use crate::{ error::Error, management::{ @@ -119,15 +117,15 @@ impl OrderedReferenceManager { } } - /// Return the amount of memory consumed by the table under the given [PermanentTableId]. 
+ /// Returns the approximate number of bytes of memory used by the table under the given [PermanentTableId]. /// This also includes additional index structures but excludes tables that are currently stored on disk. /// /// # Panics /// Panics if the given id does not exist. - pub(crate) fn memory_consumption(&self, id: PermanentTableId) -> ByteSize { + pub(crate) fn table_size_bytes(&self, id: PermanentTableId) -> u64 { let (id, _) = self.resolve_reference(id, ColumnOrder::default()); - let mut result = ByteSize::b(0); + let mut result = 0; for &storage_id in self .storage_map .get(&id) diff --git a/nemo/Cargo.toml b/nemo/Cargo.toml index 5ccc66f02..ae595011e 100644 --- a/nemo/Cargo.toml +++ b/nemo/Cargo.toml @@ -40,7 +40,6 @@ oxiri = "0.2.2" tokio = { version = "1.29.1", features = [ "rt" ] } reqwest = { version = "0.12.2" } num = "0.4.0" -bytesize = "1.2" ascii_tree = "0.1.1" serde_json = "1.0.108" serde = {version = "1.0.138", features = ["derive"] } diff --git a/nemo/src/table_manager.rs b/nemo/src/table_manager.rs index fa9b0409e..d60ae3c34 100644 --- a/nemo/src/table_manager.rs +++ b/nemo/src/table_manager.rs @@ -4,7 +4,6 @@ use crate::error::Error; use super::model::Identifier; -use bytesize::ByteSize; use nemo_physical::{ datavalues::any_datavalue::AnyDataValue, management::{ @@ -265,14 +264,13 @@ impl SubtableExecutionPlan { #[derive(Debug)] pub struct MemoryUsage { name: String, - memory: ByteSize, - + memory: u64, sub_blocks: Vec, } impl MemoryUsage { /// Create a new [MemoryUsage]. - pub fn new(name: &str, memory: ByteSize) -> Self { + pub fn new(name: &str, memory: u64) -> Self { Self { name: name.to_string(), memory, @@ -282,7 +280,7 @@ impl MemoryUsage { /// Create a new [MemoryUsage] block. pub fn new_block(name: &str) -> Self { - Self::new(name, ByteSize(0)) + Self::new(name, 0) } /// Add a sub-block. 
@@ -627,11 +625,11 @@ impl TableManager { let mut predicate_usage = MemoryUsage::new_block(&identifier.to_string()); for (step, id) in &subtable_handler.single { - let memory = self.database.memory_consumption(*id); + let memory = self.database.table_size_bytes(*id); predicate_usage.add_sub_block(MemoryUsage::new(&format!("Step {}", step), memory)); } for (steps, id) in &subtable_handler.combined { - let memory = self.database.memory_consumption(*id); + let memory = self.database.table_size_bytes(*id); predicate_usage.add_sub_block(MemoryUsage::new( &format!("Steps {}-{}", steps.start, steps.start + steps.len), memory, From 7a4e5e9ca010b20c7ee980a1b69e287663296e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Kr=C3=B6tzsch?= Date: Thu, 5 Sep 2024 17:35:54 +0200 Subject: [PATCH 05/11] remove unused dep bytecount --- Cargo.lock | 1 - nemo/Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1b4ec8c49..ac241f41b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1150,7 +1150,6 @@ version = "0.5.2-dev" dependencies = [ "ascii_tree", "assert_fs", - "bytecount", "csv", "dyn-clone", "env_logger 0.11.3", diff --git a/nemo/Cargo.toml b/nemo/Cargo.toml index ae595011e..da23c11a4 100644 --- a/nemo/Cargo.toml +++ b/nemo/Cargo.toml @@ -30,7 +30,6 @@ thiserror = "1.0" flate2 = "1" sanitise-file-name = "1.0.0" nom_locate = { version = "4.1.0", features = [ "runtime-dispatch-simd" ] } -bytecount = "0.6.7" getrandom = { version = "0.2.9", default-features = false } path-slash = "0.2.1" rio_api = "0.8.4" From 8315c33a3e1b65f86a7c9fc19160b8250f0a3d7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Kr=C3=B6tzsch?= Date: Fri, 6 Sep 2024 09:55:29 +0200 Subject: [PATCH 06/11] add size estimate to dictionaries --- nemo-physical/src/dictionary/bytes_buffer.rs | 32 +++++++++++++------ .../src/dictionary/bytes_dictionary.rs | 11 +++++++ .../src/dictionary/datavalue_dictionary.rs | 4 +-- nemo-physical/src/dictionary/meta_dv_dict.rs | 13 ++++++++ 
nemo-physical/src/dictionary/null_dv_dict.rs | 11 ++++++- .../src/dictionary/string_dictionary.rs | 8 +++++ .../src/dictionary/string_dv_dict.rs | 11 ++++++- nemo-physical/src/management/bytesized.rs | 26 ++++++++++++++- .../src/management/database/order.rs | 9 +++--- .../src/management/database/sources.rs | 3 +- .../src/management/database/storage.rs | 6 ++-- nemo-physical/src/tabular/trie.rs | 4 ++- 12 files changed, 112 insertions(+), 26 deletions(-) diff --git a/nemo-physical/src/dictionary/bytes_buffer.rs b/nemo-physical/src/dictionary/bytes_buffer.rs index f78545e64..c5b9810c7 100644 --- a/nemo-physical/src/dictionary/bytes_buffer.rs +++ b/nemo-physical/src/dictionary/bytes_buffer.rs @@ -143,13 +143,6 @@ impl BytesBuffer { result } - /// Computes and returns the overall number of bytes that this [BytesBuffer] occupies. - fn size_bytes(&self) -> u64 { - (self.pages.len() * (size_of::<(usize, Vec)>() + PAGE_SIZE) - + self.cur_pages.len() * size_of::() - + size_of::()) as u64 - } - /// Computes and returns the overall number of bytes that have been alocated for managing /// a specific buffer. This includes management data for that buffer but no shared management /// data for the [BytesBuffer] as such. @@ -162,6 +155,15 @@ impl BytesBuffer { } } +impl ByteSized for BytesBuffer { + /// Computes and returns the overall number of bytes that this [BytesBuffer] occupies. + fn size_bytes(&self) -> u64 { + (self.pages.len() * (size_of::<(usize, Vec)>() + PAGE_SIZE) + + self.cur_pages.len() * size_of::() + + size_of::()) as u64 + } +} + /// Trait to encapsulate (static) functions for accessing a single [BytesBuffer]. /// This can be implemented mutiple times to select the buffer in different contexts /// based on a generic parameter. 
It is still global, but this can further reduce the @@ -208,9 +210,7 @@ pub(crate) unsafe trait GlobalBytesBuffer: Debug + Sized { /// Computes and returns the overall number of bytes that have been alocated for managing /// a specific buffer. fn buffer_size_bytes(buffer: usize) -> u64 { - unsafe { - BytesBuffer::buffer_size_bytes(&*Self::get(), buffer) - } + unsafe { BytesBuffer::buffer_size_bytes(&*Self::get(), buffer) } } } @@ -296,6 +296,12 @@ impl BytesRef { } } +impl ByteSized for BytesRef { + fn size_bytes(&self) -> u64 { + size_of::() as u64 + } +} + impl Display for BytesRef { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { unsafe { @@ -355,8 +361,12 @@ macro_rules! declare_bytes_buffer { } pub(crate) use declare_bytes_buffer; +use crate::management::bytesized::ByteSized; + #[cfg(test)] mod test { + use crate::dictionary::bytes_buffer::PAGE_SIZE; + use super::{BytesBuffer, GlobalBytesBuffer}; crate::dictionary::bytes_buffer::declare_bytes_buffer!(TestGlobalBuffer, TEST_BUFFER); @@ -389,6 +399,8 @@ mod test { assert_ne!(bytes_ref1, bytes_ref2); assert_eq!(bytes_ref1, bytes_ref1); + assert!(TestGlobalBuffer::buffer_size_bytes(bufid1) > PAGE_SIZE as u64); + TestGlobalBuffer::drop_buffer(bufid1); TestGlobalBuffer::drop_buffer(bufid2); TestGlobalBuffer2::drop_buffer(bufid3); diff --git a/nemo-physical/src/dictionary/bytes_dictionary.rs b/nemo-physical/src/dictionary/bytes_dictionary.rs index 13b6e99c9..b87822da0 100644 --- a/nemo-physical/src/dictionary/bytes_dictionary.rs +++ b/nemo-physical/src/dictionary/bytes_dictionary.rs @@ -5,6 +5,7 @@ use hashbrown::HashMap; use super::bytes_buffer::{BytesRef, GlobalBytesBuffer}; use super::{AddResult, KNOWN_ID_MARK}; use crate::dictionary::datavalue_dictionary::{SMALL_KNOWN_ID_MARK, SMALL_KNOWN_ID_MARK_AS_USIZE}; +use crate::management::bytesized::{ByteSizeHelpers, ByteSized}; pub(crate) struct IdUtils {} @@ -183,6 +184,16 @@ impl Drop for BytesDictionary { } } +impl ByteSized for BytesDictionary { 
+ fn size_bytes(&self) -> u64 { + size_of::() as u64 + + ByteSizeHelpers::size_inner_vec_flat(&self.store) + + ByteSizeHelpers::size_inner_hashmap_flat(&self.map_short) + + ByteSizeHelpers::size_inner_hashmap_flat(&self.map_long) + + B::buffer_size_bytes(self.buffer_id) + } +} + #[cfg(test)] mod test { use crate::dictionary::{ diff --git a/nemo-physical/src/dictionary/datavalue_dictionary.rs b/nemo-physical/src/dictionary/datavalue_dictionary.rs index 4889f05a8..b61271bd0 100644 --- a/nemo-physical/src/dictionary/datavalue_dictionary.rs +++ b/nemo-physical/src/dictionary/datavalue_dictionary.rs @@ -1,6 +1,6 @@ //! General traits and global constants for dictionaries that work for datavalues. -use crate::datavalues::AnyDataValue; +use crate::{datavalues::AnyDataValue, management::bytesized::ByteSized}; use std::fmt::Debug; /// Fake id that dictionaries use to indicate that an entry has no id. @@ -51,7 +51,7 @@ impl AddResult { /// /// The id values are provided when the dictionary is used, whereas the ids are newly /// assigned by the dictionary itself. -pub trait DvDict: Debug { +pub trait DvDict: Debug + ByteSized { /// Adds a new [AnyDataValue] to the dictionary. If the value is not known yet, it will /// be assigned a new id. Unsupported datavalues can also be rejected, which specialized /// dictionary implementations might do. 
diff --git a/nemo-physical/src/dictionary/meta_dv_dict.rs b/nemo-physical/src/dictionary/meta_dv_dict.rs index 6bd3df842..5aecb0f41 100644 --- a/nemo-physical/src/dictionary/meta_dv_dict.rs +++ b/nemo-physical/src/dictionary/meta_dv_dict.rs @@ -3,6 +3,7 @@ use crate::datavalues::ValueDomain; use crate::datavalues::{AnyDataValue, DataValue}; use crate::dictionary::NONEXISTING_ID_MARK; +use crate::management::bytesized::{ByteSizeHelpers, ByteSized}; use super::DvDict; use super::IriDvDictionary; @@ -537,6 +538,18 @@ impl DvDict for MetaDvDictionary { } } +impl ByteSized for MetaDvDictionary { + fn size_bytes(&self) -> u64 { + size_of::() as u64 + + ByteSizeHelpers::size_inner_vec_flat(&self.dictblocks) + + ByteSizeHelpers::size_inner_vec_flat(&self.dicts) + + ByteSizeHelpers::size_inner_vec_flat(&self.generic_dicts) + + self.dicts.iter().fold(0, |acc, dr| { + acc + dr.dict.size_bytes() + ByteSizeHelpers::size_inner_vec_flat(&dr.gblocks) + }) + } +} + #[cfg(test)] mod test { use crate::{ diff --git a/nemo-physical/src/dictionary/null_dv_dict.rs b/nemo-physical/src/dictionary/null_dv_dict.rs index f86661b8f..73a54d1cf 100644 --- a/nemo-physical/src/dictionary/null_dv_dict.rs +++ b/nemo-physical/src/dictionary/null_dv_dict.rs @@ -1,6 +1,9 @@ //! A [DvDict] implementation for nulls (and nulls only). 
-use crate::datavalues::{AnyDataValue, DataValue, NullDataValue, ValueDomain}; +use crate::{ + datavalues::{AnyDataValue, DataValue, NullDataValue, ValueDomain}, + management::bytesized::ByteSized, +}; use super::{AddResult, DvDict}; @@ -101,6 +104,12 @@ impl DvDict for NullDvDictionary { } } +impl ByteSized for NullDvDictionary { + fn size_bytes(&self) -> u64 { + size_of::() as u64 + } +} + #[cfg(test)] mod test { use crate::{ diff --git a/nemo-physical/src/dictionary/string_dictionary.rs b/nemo-physical/src/dictionary/string_dictionary.rs index 7770adb2a..28adf88e5 100644 --- a/nemo-physical/src/dictionary/string_dictionary.rs +++ b/nemo-physical/src/dictionary/string_dictionary.rs @@ -1,5 +1,7 @@ //! This module defines a string based dictionary. +use crate::management::bytesized::ByteSized; + use super::{ bytes_buffer::{BytesBuffer, GlobalBytesBuffer}, bytes_dictionary::BytesDictionary, @@ -79,6 +81,12 @@ impl Default for GenericStringDictionary { } } +impl ByteSized for GenericStringDictionary { + fn size_bytes(&self) -> u64 { + self.bytes_dict.size_bytes() + } +} + crate::dictionary::bytes_buffer::declare_bytes_buffer!( StringDictBytesBuffer, STRING_DICT_BYTES_BUFFER diff --git a/nemo-physical/src/dictionary/string_dv_dict.rs b/nemo-physical/src/dictionary/string_dv_dict.rs index fcb19313b..dd1aca06b 100644 --- a/nemo-physical/src/dictionary/string_dv_dict.rs +++ b/nemo-physical/src/dictionary/string_dv_dict.rs @@ -4,7 +4,10 @@ //! string representations without any risk of confusion. 
use super::{AddResult, DvDict, StringDictionary}; -use crate::datavalues::{AnyDataValue, ValueDomain}; +use crate::{ + datavalues::{AnyDataValue, ValueDomain}, + management::bytesized::ByteSized, +}; use std::{fmt::Debug, marker::PhantomData}; use crate::dictionary::dv_converter::{DvConverter, IriDvConverter, StringDvConverter}; @@ -115,6 +118,12 @@ impl DvDict for StringBasedDvDictionary { } } +impl ByteSized for StringBasedDvDictionary { + fn size_bytes(&self) -> u64 { + self.string_dict.size_bytes() + } +} + #[cfg(test)] mod test { diff --git a/nemo-physical/src/management/bytesized.rs b/nemo-physical/src/management/bytesized.rs index 793ab1705..6157e68eb 100644 --- a/nemo-physical/src/management/bytesized.rs +++ b/nemo-physical/src/management/bytesized.rs @@ -3,11 +3,35 @@ //! calculate their own size. /// Objects that are able calculate their current approximate size in bytes. -/// +/// /// We use `u64` rather than `usize` here to avoid overflows in case of overestimations. pub trait ByteSized { /// Return the number of bytes this object consumes fn size_bytes(&self) -> u64; } +/// Collection of some simple helper methods for estimating sizes. +pub(crate) struct ByteSizeHelpers; +impl ByteSizeHelpers { + /// Estimates the memory required for managing the content of a Hashbrown hashmap using only + /// the direct size of keys and values, without taking into accont any data they might point to. + /// + /// The computation is approximate since the hashmap does not provide access to its current bucket + /// structure and control byte overhead, so we merely consider the reported capacity. + pub(crate) fn size_inner_hashmap_flat(object: &hashbrown::HashMap) -> u64 + where + Self: Sized, + { + object.capacity() as u64 * size_of::<(K, V)>() as u64 + } + + /// Computes the memory required for managing the content of a vector using only + /// the direct size of content objects, without taking into accont any data they might point to. 
+ pub(crate) fn size_inner_vec_flat(object: &Vec) -> u64 + where + Self: Sized, + { + object.capacity() as u64 * size_of::() as u64 + } +} diff --git a/nemo-physical/src/management/database/order.rs b/nemo-physical/src/management/database/order.rs index 6a0c5faad..224b07a58 100644 --- a/nemo-physical/src/management/database/order.rs +++ b/nemo-physical/src/management/database/order.rs @@ -7,10 +7,7 @@ use std::{ use crate::{ error::Error, - management::{ - bytesized::ByteSized, execution_plan::ColumnOrder, - util::closest_order, - }, + management::{bytesized::ByteSized, execution_plan::ColumnOrder, util::closest_order}, meta::timing::TimedCode, tabular::{operations::projectreorder::GeneratorProjectReorder, trie::Trie}, util::mapping::{permutation::Permutation, traits::NatMapping}, @@ -325,7 +322,9 @@ impl OrderedReferenceManager { impl ByteSized for OrderedReferenceManager { fn size_bytes(&self) -> u64 { - self.stored_tables.iter().fold(0, |acc, table| acc+table.size_bytes()) + self.stored_tables + .iter() + .fold(0, |acc, table| acc + table.size_bytes()) } } diff --git a/nemo-physical/src/management/database/sources.rs b/nemo-physical/src/management/database/sources.rs index 4f2db57fe..6211f4184 100644 --- a/nemo-physical/src/management/database/sources.rs +++ b/nemo-physical/src/management/database/sources.rs @@ -57,8 +57,7 @@ impl TableProvider for SimpleTable { impl ByteSized for SimpleTable { fn size_bytes(&self) -> u64 { // cast everything to u64 separately to avoid overflows - size_of::() as u64 - + self.data.capacity() as u64 * size_of::() as u64 + size_of::() as u64 + self.data.capacity() as u64 * size_of::() as u64 } } diff --git a/nemo-physical/src/management/database/storage.rs b/nemo-physical/src/management/database/storage.rs index 0dab80586..f8c320422 100644 --- a/nemo-physical/src/management/database/storage.rs +++ b/nemo-physical/src/management/database/storage.rs @@ -115,9 +115,9 @@ impl ByteSized for TableStorage { fn size_bytes(&self) -> u64 { 
match self { TableStorage::InMemory(trie) => trie.size_bytes(), - TableStorage::FromSources(sources) => { - sources.iter().fold(0, |acc, source| acc+source.size_bytes()) - } + TableStorage::FromSources(sources) => sources + .iter() + .fold(0, |acc, source| acc + source.size_bytes()), TableStorage::Empty => 0, } } diff --git a/nemo-physical/src/tabular/trie.rs b/nemo-physical/src/tabular/trie.rs index 5d75f9ae7..3c7a23e95 100644 --- a/nemo-physical/src/tabular/trie.rs +++ b/nemo-physical/src/tabular/trie.rs @@ -520,7 +520,9 @@ impl Trie { impl ByteSized for Trie { fn size_bytes(&self) -> u64 { - self.columns.iter().fold(0, |acc, column| acc + column.size_bytes()) + self.columns + .iter() + .fold(0, |acc, column| acc + column.size_bytes()) } } From 07cca33b9ed15fece5d001577ed3cc55442c8067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Kr=C3=B6tzsch?= Date: Fri, 6 Sep 2024 10:55:29 +0200 Subject: [PATCH 07/11] Include dict size report in dict bench --- nemo-physical/src/benches/dict-bench.rs | 11 +++++++++++ nemo-physical/src/dictionary/string_dictionary.rs | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/nemo-physical/src/benches/dict-bench.rs b/nemo-physical/src/benches/dict-bench.rs index fb6582ac9..b18c4b5b0 100644 --- a/nemo-physical/src/benches/dict-bench.rs +++ b/nemo-physical/src/benches/dict-bench.rs @@ -5,6 +5,7 @@ use nemo_physical::datavalues::ValueDomain; use nemo_physical::dictionary::meta_dv_dict::MetaDvDictionary; use nemo_physical::dictionary::string_dictionary::BenchmarkStringDictionary; use nemo_physical::dictionary::DvDict; +use nemo_physical::management::bytesized::ByteSized; use std::env; use std::fs::File; use std::io::prelude::*; @@ -69,6 +70,15 @@ impl DictEnum { DictEnum::DvMeta(dict) => dict.len(), } } + + fn size_bytes(&mut self) -> u64 { + match &self { + DictEnum::StringHash(_) => 0, + DictEnum::StringMeta(_) => 0, + DictEnum::StringBuffer(dict) => dict.size_bytes(), + DictEnum::DvMeta(dict) => dict.size_bytes(), + } + } } fn 
main() { @@ -203,6 +213,7 @@ fn main() { " Dictionary rejected {} (non-unique) strings with {} bytes overall.", count_rejected, bytes_rejected ); + println!(" Dictionary reports own size as {}.", dict.size_bytes()); TimedCode::instance().stop(); diff --git a/nemo-physical/src/dictionary/string_dictionary.rs b/nemo-physical/src/dictionary/string_dictionary.rs index 28adf88e5..89853e637 100644 --- a/nemo-physical/src/dictionary/string_dictionary.rs +++ b/nemo-physical/src/dictionary/string_dictionary.rs @@ -115,6 +115,11 @@ impl BenchmarkStringDictionary { self.0.is_empty() } } +impl ByteSized for BenchmarkStringDictionary { + fn size_bytes(&self) -> u64 { + self.0.size_bytes() + } +} #[cfg(test)] mod test { From e26d5304db570a3e0d68dfde992d7d6c6349608f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Kr=C3=B6tzsch?= Date: Fri, 6 Sep 2024 11:04:08 +0200 Subject: [PATCH 08/11] report dictionary as part of database size --- nemo-physical/src/management/database.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo-physical/src/management/database.rs b/nemo-physical/src/management/database.rs index b143ca95c..94a86a16d 100644 --- a/nemo-physical/src/management/database.rs +++ b/nemo-physical/src/management/database.rs @@ -557,8 +557,7 @@ impl DatabaseInstance { impl ByteSized for DatabaseInstance { fn size_bytes(&self) -> u64 { - // TODO: Add size of the dictionary - self.reference_manager.size_bytes() + self.reference_manager.size_bytes() + self.dictionary().size_bytes() } } From 73a935026d8d979aece8a3240511406a9486a757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Kr=C3=B6tzsch?= Date: Fri, 6 Sep 2024 11:15:49 +0200 Subject: [PATCH 09/11] include dictionary size in memory reporting --- nemo/src/table_manager.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/nemo/src/table_manager.rs b/nemo/src/table_manager.rs index d60ae3c34..b365fa9ab 100644 --- a/nemo/src/table_manager.rs +++ b/nemo/src/table_manager.rs 
@@ -7,11 +7,11 @@ use super::model::Identifier; use nemo_physical::{ datavalues::any_datavalue::AnyDataValue, management::{ - database::DatabaseInstance, + bytesized::ByteSized, database::{ id::{ExecutionId, PermanentTableId}, sources::TableSource, - Dict, + DatabaseInstance, Dict, }, execution_plan::{ColumnOrder, ExecutionNodeRef, ExecutionPlan}, }, @@ -619,7 +619,12 @@ impl TableManager { /// Return the current [MemoryUsage]. pub fn memory_usage(&self) -> MemoryUsage { - let mut result = MemoryUsage::new_block("Chase"); + let mut result = MemoryUsage::new_block("Total"); + + let memory = self.database.dictionary().size_bytes(); + result.add_sub_block(MemoryUsage::new("Dictionary", memory)); + + let mut steps = MemoryUsage::new_block("Chase steps"); for (identifier, subtable_handler) in &self.predicate_subtables { let mut predicate_usage = MemoryUsage::new_block(&identifier.to_string()); @@ -636,8 +641,9 @@ impl TableManager { )); } - result.add_sub_block(predicate_usage) + steps.add_sub_block(predicate_usage); } + result.add_sub_block(steps); result } From 450f47abad200c1f61b951b33b5e1045e1d78689 Mon Sep 17 00:00:00 2001 From: Maximilian Marx Date: Mon, 9 Sep 2024 13:04:45 +0200 Subject: [PATCH 10/11] Prefer count/sum over fold --- nemo-physical/src/dictionary/bytes_buffer.rs | 5 +---- nemo-physical/src/dictionary/meta_dv_dict.rs | 8 +++++--- nemo-physical/src/management/database/order.rs | 3 ++- nemo-physical/src/management/database/storage.rs | 6 +++--- nemo-physical/src/tabular/trie.rs | 4 +--- nemo/src/io/formats/dsv.rs | 10 +++------- nemo/src/io/formats/dsv_reader.rs | 8 +------- nemo/src/io/formats/import_export.rs | 15 +++++++-------- nemo/src/io/formats/rdf.rs | 10 +++------- nemo/src/io/formats/rdf_reader.rs | 16 ++-------------- 10 files changed, 28 insertions(+), 57 deletions(-) diff --git a/nemo-physical/src/dictionary/bytes_buffer.rs b/nemo-physical/src/dictionary/bytes_buffer.rs index c5b9810c7..22fcbd632 100644 --- 
a/nemo-physical/src/dictionary/bytes_buffer.rs +++ b/nemo-physical/src/dictionary/bytes_buffer.rs @@ -147,10 +147,7 @@ impl BytesBuffer { /// a specific buffer. This includes management data for that buffer but no shared management /// data for the [BytesBuffer] as such. fn buffer_size_bytes(&self, buffer: usize) -> u64 { - let page_count: usize = - self.pages - .iter() - .fold(0, |acc, x| if x.0 == buffer { acc + 1 } else { acc }); + let page_count: usize = self.pages.iter().filter(|x| x.0 == buffer).count(); (page_count * (size_of::<(usize, Vec)>() + PAGE_SIZE) + size_of::()) as u64 } } diff --git a/nemo-physical/src/dictionary/meta_dv_dict.rs b/nemo-physical/src/dictionary/meta_dv_dict.rs index 5aecb0f41..a6b90e0a7 100644 --- a/nemo-physical/src/dictionary/meta_dv_dict.rs +++ b/nemo-physical/src/dictionary/meta_dv_dict.rs @@ -544,9 +544,11 @@ impl ByteSized for MetaDvDictionary { + ByteSizeHelpers::size_inner_vec_flat(&self.dictblocks) + ByteSizeHelpers::size_inner_vec_flat(&self.dicts) + ByteSizeHelpers::size_inner_vec_flat(&self.generic_dicts) - + self.dicts.iter().fold(0, |acc, dr| { - acc + dr.dict.size_bytes() + ByteSizeHelpers::size_inner_vec_flat(&dr.gblocks) - }) + + self + .dicts + .iter() + .map(|dr| dr.dict.size_bytes() + ByteSizeHelpers::size_inner_vec_flat(&dr.gblocks)) + .sum::() } } diff --git a/nemo-physical/src/management/database/order.rs b/nemo-physical/src/management/database/order.rs index 224b07a58..1c400ee2d 100644 --- a/nemo-physical/src/management/database/order.rs +++ b/nemo-physical/src/management/database/order.rs @@ -324,7 +324,8 @@ impl ByteSized for OrderedReferenceManager { fn size_bytes(&self) -> u64 { self.stored_tables .iter() - .fold(0, |acc, table| acc + table.size_bytes()) + .map(|table| table.size_bytes()) + .sum() } } diff --git a/nemo-physical/src/management/database/storage.rs b/nemo-physical/src/management/database/storage.rs index f8c320422..10228f277 100644 --- a/nemo-physical/src/management/database/storage.rs +++ 
b/nemo-physical/src/management/database/storage.rs @@ -115,9 +115,9 @@ impl ByteSized for TableStorage { fn size_bytes(&self) -> u64 { match self { TableStorage::InMemory(trie) => trie.size_bytes(), - TableStorage::FromSources(sources) => sources - .iter() - .fold(0, |acc, source| acc + source.size_bytes()), + TableStorage::FromSources(sources) => { + sources.iter().map(|source| source.size_bytes()).sum() + } TableStorage::Empty => 0, } } diff --git a/nemo-physical/src/tabular/trie.rs b/nemo-physical/src/tabular/trie.rs index 3c7a23e95..84fc83138 100644 --- a/nemo-physical/src/tabular/trie.rs +++ b/nemo-physical/src/tabular/trie.rs @@ -520,9 +520,7 @@ impl Trie { impl ByteSized for Trie { fn size_bytes(&self) -> u64 { - self.columns - .iter() - .fold(0, |acc, column| acc + column.size_bytes()) + self.columns.iter().map(|column| column.size_bytes()).sum() } } diff --git a/nemo/src/io/formats/dsv.rs b/nemo/src/io/formats/dsv.rs index 4632bc0c9..adb4bc513 100644 --- a/nemo/src/io/formats/dsv.rs +++ b/nemo/src/io/formats/dsv.rs @@ -227,13 +227,9 @@ impl ImportExportHandler for DsvHandler { fn predicate_arity(&self) -> Option { match self.direction { Direction::Import => self.value_formats.as_ref().map(|vfs| { - vfs.iter().fold(0, |acc, fmt| { - if *fmt == DsvValueFormat::Skip { - acc - } else { - acc + 1 - } - }) + vfs.iter() + .filter(|&&fmt| fmt != DsvValueFormat::Skip) + .count() }), Direction::Export => self.value_formats.as_ref().map(|vfs| vfs.len()), } diff --git a/nemo/src/io/formats/dsv_reader.rs b/nemo/src/io/formats/dsv_reader.rs index 091c39905..b139e4f18 100644 --- a/nemo/src/io/formats/dsv_reader.rs +++ b/nemo/src/io/formats/dsv_reader.rs @@ -74,13 +74,7 @@ impl DsvReader { let expected_file_arity = parsers.len(); assert_eq!( tuple_writer.column_number(), - skip.iter().fold(0, |acc: usize, b| { - if *b { - acc - } else { - acc + 1 - } - }) + skip.iter().filter(|b| !*b).count() ); let stop_limit = self.limit.unwrap_or(0); diff --git 
a/nemo/src/io/formats/import_export.rs b/nemo/src/io/formats/import_export.rs index abc212400..6c15764e2 100644 --- a/nemo/src/io/formats/import_export.rs +++ b/nemo/src/io/formats/import_export.rs @@ -484,14 +484,13 @@ impl ImportExportHandlers { if let Some(ref vfs) = value_format_strings { let declared_file_arity = match direction { Direction::Import => vfs.len(), - Direction::Export => vfs.iter().fold(0, |acc: usize, fmt| { - // Only count formats other than VALUE_FORMAT_SKIP: - if *fmt == VALUE_FORMAT_SKIP { - acc - } else { - acc + 1 - } - }), + Direction::Export => vfs + .iter() + .filter(|fmt| { + // Only count formats other than VALUE_FORMAT_SKIP: + *fmt != VALUE_FORMAT_SKIP + }) + .count(), }; // Check if arity is consistent with given value formats. diff --git a/nemo/src/io/formats/rdf.rs b/nemo/src/io/formats/rdf.rs index de518d886..f42ee56b0 100644 --- a/nemo/src/io/formats/rdf.rs +++ b/nemo/src/io/formats/rdf.rs @@ -286,13 +286,9 @@ impl ImportExportHandler for RdfHandler { // list of value formats if we know the RDF variant. 
match self.direction { Direction::Import => self.value_formats.as_ref().map(|vfs| { - vfs.iter().fold(0, |acc, fmt| { - if *fmt == RdfValueFormat::Skip { - acc - } else { - acc + 1 - } - }) + vfs.iter() + .filter(|&&fmt| fmt != RdfValueFormat::Skip) + .count() }), Direction::Export => self.value_formats.as_ref().map(|vfs| vfs.len()), } diff --git a/nemo/src/io/formats/rdf_reader.rs b/nemo/src/io/formats/rdf_reader.rs index 3791c6a90..3cae3c580 100644 --- a/nemo/src/io/formats/rdf_reader.rs +++ b/nemo/src/io/formats/rdf_reader.rs @@ -155,13 +155,7 @@ impl RdfReader { assert_eq!(skip.len(), 3); assert_eq!( tuple_writer.column_number(), - skip.iter().fold(0, |acc: usize, b| { - if *b { - acc - } else { - acc + 1 - } - }) + skip.iter().filter(|b| !*b).count() ); let stop_limit = self.limit.unwrap_or(u64::MAX); @@ -236,13 +230,7 @@ impl RdfReader { assert_eq!(skip.len(), 4); assert_eq!( tuple_writer.column_number(), - skip.iter().fold(0, |acc: usize, b| { - if *b { - acc - } else { - acc + 1 - } - }) + skip.iter().filter(|b| !*b).count() ); let stop_limit = self.limit.unwrap_or(u64::MAX); From d03d32da17eb62c92a4dc708247c3464df48b2f4 Mon Sep 17 00:00:00 2001 From: Maximilian Marx Date: Mon, 9 Sep 2024 13:23:29 +0200 Subject: [PATCH 11/11] Drop superfluous struct This isn't Java, we don't need static functions to be part of a class. If anything, this should have been a module. 
--- .../src/dictionary/bytes_dictionary.rs | 8 ++--- nemo-physical/src/dictionary/meta_dv_dict.rs | 10 +++--- nemo-physical/src/management/bytesized.rs | 35 +++++++------------ 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/nemo-physical/src/dictionary/bytes_dictionary.rs b/nemo-physical/src/dictionary/bytes_dictionary.rs index b87822da0..59c7b5af1 100644 --- a/nemo-physical/src/dictionary/bytes_dictionary.rs +++ b/nemo-physical/src/dictionary/bytes_dictionary.rs @@ -5,7 +5,7 @@ use hashbrown::HashMap; use super::bytes_buffer::{BytesRef, GlobalBytesBuffer}; use super::{AddResult, KNOWN_ID_MARK}; use crate::dictionary::datavalue_dictionary::{SMALL_KNOWN_ID_MARK, SMALL_KNOWN_ID_MARK_AS_USIZE}; -use crate::management::bytesized::{ByteSizeHelpers, ByteSized}; +use crate::management::bytesized::{size_inner_hashmap_flat, size_inner_vec_flat, ByteSized}; pub(crate) struct IdUtils {} @@ -187,9 +187,9 @@ impl Drop for BytesDictionary { impl ByteSized for BytesDictionary { fn size_bytes(&self) -> u64 { size_of::() as u64 - + ByteSizeHelpers::size_inner_vec_flat(&self.store) - + ByteSizeHelpers::size_inner_hashmap_flat(&self.map_short) - + ByteSizeHelpers::size_inner_hashmap_flat(&self.map_long) + + size_inner_vec_flat(&self.store) + + size_inner_hashmap_flat(&self.map_short) + + size_inner_hashmap_flat(&self.map_long) + B::buffer_size_bytes(self.buffer_id) } } diff --git a/nemo-physical/src/dictionary/meta_dv_dict.rs b/nemo-physical/src/dictionary/meta_dv_dict.rs index a6b90e0a7..ab3c71bf7 100644 --- a/nemo-physical/src/dictionary/meta_dv_dict.rs +++ b/nemo-physical/src/dictionary/meta_dv_dict.rs @@ -3,7 +3,7 @@ use crate::datavalues::ValueDomain; use crate::datavalues::{AnyDataValue, DataValue}; use crate::dictionary::NONEXISTING_ID_MARK; -use crate::management::bytesized::{ByteSizeHelpers, ByteSized}; +use crate::management::bytesized::{size_inner_vec_flat, ByteSized}; use super::DvDict; use super::IriDvDictionary; @@ -541,13 +541,13 @@ impl DvDict for 
MetaDvDictionary { impl ByteSized for MetaDvDictionary { fn size_bytes(&self) -> u64 { size_of::() as u64 - + ByteSizeHelpers::size_inner_vec_flat(&self.dictblocks) - + ByteSizeHelpers::size_inner_vec_flat(&self.dicts) - + ByteSizeHelpers::size_inner_vec_flat(&self.generic_dicts) + + size_inner_vec_flat(&self.dictblocks) + + size_inner_vec_flat(&self.dicts) + + size_inner_vec_flat(&self.generic_dicts) + self .dicts .iter() - .map(|dr| dr.dict.size_bytes() + ByteSizeHelpers::size_inner_vec_flat(&dr.gblocks)) + .map(|dr| dr.dict.size_bytes() + size_inner_vec_flat(&dr.gblocks)) .sum::() } } diff --git a/nemo-physical/src/management/bytesized.rs b/nemo-physical/src/management/bytesized.rs index 6157e68eb..99dd7f1c9 100644 --- a/nemo-physical/src/management/bytesized.rs +++ b/nemo-physical/src/management/bytesized.rs @@ -10,28 +10,17 @@ pub trait ByteSized { fn size_bytes(&self) -> u64; } -/// Collection of some simple helper methods for estimating sizes. -pub(crate) struct ByteSizeHelpers; - -impl ByteSizeHelpers { - /// Estimates the memory required for managing the content of a Hashbrown hashmap using only - /// the direct size of keys and values, without taking into accont any data they might point to. - /// - /// The computation is approximate since the hashmap does not provide access to its current bucket - /// structure and control byte overhead, so we merely consider the reported capacity. - pub(crate) fn size_inner_hashmap_flat(object: &hashbrown::HashMap) -> u64 - where - Self: Sized, - { - object.capacity() as u64 * size_of::<(K, V)>() as u64 - } +/// Estimates the memory required for managing the content of a Hashbrown hashmap using only +/// the direct size of keys and values, without taking into account any data they might point to. +/// +/// The computation is approximate since the hashmap does not provide access to its current bucket +/// structure and control byte overhead, so we merely consider the reported capacity.
+pub(crate) fn size_inner_hashmap_flat<K, V>(object: &hashbrown::HashMap<K, V>) -> u64 { + object.capacity() as u64 * size_of::<(K, V)>() as u64 +} - /// Computes the memory required for managing the content of a vector using only - /// the direct size of content objects, without taking into accont any data they might point to. - pub(crate) fn size_inner_vec_flat<T>(object: &Vec<T>) -> u64 - where - Self: Sized, - { - object.capacity() as u64 * size_of::<T>() as u64 - } +/// Computes the memory required for managing the content of a vector using only +/// the direct size of content objects, without taking into account any data they might point to. +pub(crate) fn size_inner_vec_flat<T>(object: &Vec<T>) -> u64 { + object.capacity() as u64 * size_of::<T>() as u64 }