Skip to content

Commit

Permalink
Compute and report dictionary size (#527)
Browse files Browse the repository at this point in the history
This pull request fixes #339 and integrates the result in the reporting
on the command line.
  • Loading branch information
mmarx authored Sep 9, 2024
2 parents 1d322d1 + d03d32d commit d675b5d
Show file tree
Hide file tree
Showing 31 changed files with 210 additions and 165 deletions.
9 changes: 0 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion nemo-physical/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ path = "src/benches/dict-bench.rs"
[dependencies]
enum_dispatch = "0.3.12"
log = "0.4"
bytesize = "1.2"
thiserror = "1.0"
num = "0.4.0"
ascii_tree = "0.1.1"
Expand Down
11 changes: 11 additions & 0 deletions nemo-physical/src/benches/dict-bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use nemo_physical::datavalues::ValueDomain;
use nemo_physical::dictionary::meta_dv_dict::MetaDvDictionary;
use nemo_physical::dictionary::string_dictionary::BenchmarkStringDictionary;
use nemo_physical::dictionary::DvDict;
use nemo_physical::management::bytesized::ByteSized;
use std::env;
use std::fs::File;
use std::io::prelude::*;
Expand Down Expand Up @@ -69,6 +70,15 @@ impl DictEnum {
DictEnum::DvMeta(dict) => dict.len(),
}
}

/// Returns the number of bytes that the wrapped dictionary reports as its own size.
///
/// The `StringHash` and `StringMeta` variants always yield `0`.
/// NOTE(review): presumably these dictionary types offer no size computation — confirm.
///
/// Takes `&self` since computing the size does not modify the dictionary; callers that hold
/// a mutable binding can still call this method unchanged.
fn size_bytes(&self) -> u64 {
    match self {
        DictEnum::StringHash(_) | DictEnum::StringMeta(_) => 0,
        DictEnum::StringBuffer(dict) => dict.size_bytes(),
        DictEnum::DvMeta(dict) => dict.size_bytes(),
    }
}
}

fn main() {
Expand Down Expand Up @@ -203,6 +213,7 @@ fn main() {
" Dictionary rejected {} (non-unique) strings with {} bytes overall.",
count_rejected, bytes_rejected
);
println!(" Dictionary reports own size as {}.", dict.size_bytes());

TimedCode::instance().stop();

Expand Down
6 changes: 2 additions & 4 deletions nemo-physical/src/columnar/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ pub(crate) mod vector;

use std::{fmt::Debug, mem::size_of};

use bytesize::ByteSize;

use crate::{
datatypes::{ColumnDataType, RunLengthEncodable},
generate_forwarder,
Expand Down Expand Up @@ -82,8 +80,8 @@ where
}

impl<T: RunLengthEncodable> ByteSized for ColumnEnum<T> {
fn size_bytes(&self) -> ByteSize {
fn size_bytes(&self) -> u64 {
let size_column = forward_to_column!(self, size_bytes);
ByteSize::b(size_of::<Self>() as u64) + size_column
size_of::<Self>() as u64 + size_column
}
}
6 changes: 2 additions & 4 deletions nemo-physical/src/columnar/column/rle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

use std::{fmt::Debug, mem::size_of, num::NonZeroUsize, ops::Range};

use bytesize::ByteSize;

use crate::{
columnar::{columnbuilder::rle::RleElement, columnscan::ColumnScan},
datatypes::{ColumnDataType, RunLengthEncodable},
Expand Down Expand Up @@ -134,13 +132,13 @@ where
}

impl<T: RunLengthEncodable> ByteSized for ColumnRle<T> {
fn size_bytes(&self) -> ByteSize {
fn size_bytes(&self) -> u64 {
let size_values = size_of::<T>() as u64 * self.values.capacity() as u64;
let size_end_indices =
size_of::<NonZeroUsize>() as u64 * self.end_indices.capacity() as u64;
let size_increments = size_of::<T::Step>() as u64 * self.increments.capacity() as u64;

ByteSize::b(size_of::<Self>() as u64 + size_values + size_end_indices + size_increments)
size_of::<Self>() as u64 + size_values + size_end_indices + size_increments
}
}

Expand Down
8 changes: 3 additions & 5 deletions nemo-physical/src/columnar/column/vector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ use std::{
ops::{Index, Range},
};

use bytesize::ByteSize;

use crate::{columnar::columnscan::ColumnScan, management::bytesized::ByteSized};

use super::Column;
Expand Down Expand Up @@ -53,9 +51,9 @@ impl<T: Debug + Copy + Ord> Index<usize> for ColumnVector<T> {
}

impl<T> ByteSized for ColumnVector<T> {
fn size_bytes(&self) -> ByteSize {
// We cast everything to u64 separately to avoid overflows
ByteSize::b(size_of::<Self>() as u64 + self.data.capacity() as u64 * size_of::<T>() as u64)
fn size_bytes(&self) -> u64 {
// cast everything to u64 separately to avoid overflows
size_of::<Self>() as u64 + self.data.capacity() as u64 * size_of::<T>() as u64
}
}

Expand Down
5 changes: 2 additions & 3 deletions nemo-physical/src/columnar/intervalcolumn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ pub(crate) mod interval_lookup;

use std::ops::Range;

use bytesize::ByteSize;
use delegate::delegate;

use crate::{
Expand Down Expand Up @@ -93,7 +92,7 @@ where
T: ColumnDataType,
LookupMethod: IntervalLookup,
{
fn size_bytes(&self) -> ByteSize {
fn size_bytes(&self) -> u64 {
self.data.size_bytes() + self.intervals.size_bytes() + self.interval_lookup.size_bytes()
}
}
Expand Down Expand Up @@ -199,7 +198,7 @@ impl<LookupMethod> ByteSized for IntervalColumnT<LookupMethod>
where
LookupMethod: IntervalLookup,
{
fn size_bytes(&self) -> ByteSize {
fn size_bytes(&self) -> u64 {
self.column_id32.size_bytes()
+ self.column_id64.size_bytes()
+ self.column_int64.size_bytes()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ impl<LookupMethod> ByteSized for IntervalLookupT<LookupMethod>
where
LookupMethod: IntervalLookup,
{
fn size_bytes(&self) -> bytesize::ByteSize {
fn size_bytes(&self) -> u64 {
self.lookup_id32.size_bytes()
+ self.lookup_id64.size_bytes()
+ self.lookup_int64.size_bytes()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
//! This module implements [IntervalLookupColumn]
//! and the associated builder [IntervalLookupColumnBuilder].

use bytesize::ByteSize;

use crate::{
columnar::{
column::{Column, ColumnEnum},
Expand Down Expand Up @@ -57,7 +55,7 @@ impl IntervalLookup for IntervalLookupColumn {
}

impl ByteSized for IntervalLookupColumn {
fn size_bytes(&self) -> ByteSize {
fn size_bytes(&self) -> u64 {
self.lookup.size_bytes()
}
}
Expand Down
13 changes: 6 additions & 7 deletions nemo-physical/src/datatypes/run_length_encodable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ use std::{
ops::{Add, Sub},
};

use bytesize::ByteSize;
use num::Zero;

use crate::management::bytesized::ByteSized;
Expand Down Expand Up @@ -64,8 +63,8 @@ impl IntStep {
}

impl ByteSized for IntStep {
fn size_bytes(&self) -> ByteSize {
ByteSize::b(size_of::<IntStep>() as u64)
fn size_bytes(&self) -> u64 {
size_of::<IntStep>() as u64
}
}

Expand Down Expand Up @@ -166,8 +165,8 @@ impl RunLengthEncodable for i64 {
pub(crate) struct SmallIntStep(i8);

impl ByteSized for SmallIntStep {
fn size_bytes(&self) -> ByteSize {
ByteSize::b(size_of::<SmallIntStep>() as u64)
fn size_bytes(&self) -> u64 {
size_of::<SmallIntStep>() as u64
}
}

Expand Down Expand Up @@ -213,7 +212,7 @@ impl RunLengthEncodable for i8 {
pub(crate) struct FloatingStep;

impl ByteSized for FloatingStep {
fn size_bytes(&self) -> ByteSize {
ByteSize::b(0)
fn size_bytes(&self) -> u64 {
0
}
}
45 changes: 40 additions & 5 deletions nemo-physical/src/dictionary/bytes_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const PAGE_SIZE: usize = 1 << PAGE_ADDR_BITS;
/// Buffers might be dropped, upon which all of its pages will be freed. There is no other way
/// of removing contents from a buffer.
///
/// Individual pages have a size of at most [PAGE_SIZE`] bytes, so that [`PAGE_ADDR_BITS]
/// Individual pages have a size of at most [PAGE_SIZE] bytes, so that [PAGE_ADDR_BITS]
/// are needed to specify a position within a page. References to buffered strings are represented
/// by [BytesRef], which stores a starting address and length of the slice. The `usize` starting
/// address is global (uniform for all buffers), with the lower [PAGE_ADDR_BITS] bits encoding a position within a page,
Expand All @@ -30,9 +30,9 @@ const PAGE_SIZE: usize = 1 << PAGE_ADDR_BITS;
/// The number of bits reserved for length is [BYTESREF_BYTES_LENGTH_BITS], which should always be less
/// than [PAGE_ADDR_BITS] since longer tuples would not fit any buffer page anyway.
///
/// The implementaion can be used in multiple parallel threads.
/// The implementation can be used in multiple parallel threads.
///
/// Note: The multi-thrading support is based on aggressive locking of all major operations. It might be
/// Note: The multi-threading support is based on aggressive locking of all major operations. It might be
/// possible to reduce the amount of locking by designing more careful data structures. For example, locking
/// could be limited to the rare page-writing operations if vectors did not move existing entries on (some)
/// writes; such moves cause races that may lead to reading errors unless all reads are also locked.
Expand Down Expand Up @@ -121,7 +121,7 @@ impl BytesBuffer {
}
}

/// Acquire the lock that we use for operations that read or write any of the internal data
/// Acquires the lock that we use for operations that read or write any of the internal data
/// structures that multiple buffers might use.
fn acquire_page_lock(&self) {
while self
Expand All @@ -131,7 +131,7 @@ impl BytesBuffer {
{}
}

/// Release the lock.
/// Releases the lock.
fn release_page_lock(&self) {
self.lock.store(false, Ordering::Release);
}
Expand All @@ -142,6 +142,23 @@ impl BytesBuffer {
self.release_page_lock();
result
}

/// Computes and returns the overall number of bytes that have been allocated for managing
/// a specific buffer. This includes management data for that buffer but no shared management
/// data for the [BytesBuffer] as such.
///
/// Every page that belongs to `buffer` is counted with a full [PAGE_SIZE] plus the size of its
/// `(usize, Vec<u8>)` entry in the page list. One further `usize` is added on top
/// (presumably the buffer's slot in `cur_pages` — TODO confirm).
fn buffer_size_bytes(&self, buffer: usize) -> u64 {
    // NOTE(review): `self.pages` is read here without acquiring the page lock — confirm that
    // this cannot race with concurrent page allocations.
    let page_count = self.pages.iter().filter(|page| page.0 == buffer).count() as u64;

    // Cast every factor to u64 separately so that the arithmetic cannot overflow `usize`
    // on targets where `usize` is narrower than 64 bits.
    let bytes_per_page = size_of::<(usize, Vec<u8>)>() as u64 + PAGE_SIZE as u64;

    page_count * bytes_per_page + size_of::<usize>() as u64
}
}

impl ByteSized for BytesBuffer {
    /// Computes and returns the overall number of bytes that this [BytesBuffer] occupies.
    ///
    /// The result is the sum of the size of the struct itself, one `usize` per entry of
    /// `cur_pages`, and, for every page, a full [PAGE_SIZE] plus the size of the page's
    /// `(usize, Vec<u8>)` entry in the page list.
    fn size_bytes(&self) -> u64 {
        // Cast every factor to u64 separately so that the arithmetic cannot overflow `usize`
        // on targets where `usize` is narrower than 64 bits.
        let bytes_per_page = size_of::<(usize, Vec<u8>)>() as u64 + PAGE_SIZE as u64;
        let size_pages = self.pages.len() as u64 * bytes_per_page;
        let size_cur_pages = self.cur_pages.len() as u64 * size_of::<usize>() as u64;

        size_of::<Self>() as u64 + size_pages + size_cur_pages
    }
}

/// Trait to encapsulate (static) functions for accessing a single [BytesBuffer].
Expand Down Expand Up @@ -186,6 +203,12 @@ pub(crate) unsafe trait GlobalBytesBuffer: Debug + Sized {
BytesBuffer::drop_buffer(&mut *Self::get(), buffer);
}
}

/// Computes and returns the overall number of bytes that have been allocated for managing
/// a specific buffer.
///
/// Delegates to `BytesBuffer::buffer_size_bytes` on the buffer obtained from `Self::get()`.
fn buffer_size_bytes(buffer: usize) -> u64 {
// SAFETY(review): dereferences the pointer returned by `Self::get()`; presumably this is
// valid per the contract of this unsafe trait — confirm.
unsafe { BytesBuffer::buffer_size_bytes(&*Self::get(), buffer) }
}
}

/// Number of bits reserved for encoding the length of referenced byte arrays.
Expand Down Expand Up @@ -270,6 +293,12 @@ impl<B: GlobalBytesBuffer> BytesRef<B> {
}
}

impl<B: GlobalBytesBuffer> ByteSized for BytesRef<B> {
    /// Returns the size of the reference value itself in bytes.
    /// The bytes that the reference points to are not included.
    fn size_bytes(&self) -> u64 {
        let own_size: usize = size_of::<Self>();
        own_size as u64
    }
}

impl<B: GlobalBytesBuffer> Display for BytesRef<B> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
unsafe {
Expand Down Expand Up @@ -329,8 +358,12 @@ macro_rules! declare_bytes_buffer {
}
pub(crate) use declare_bytes_buffer;

use crate::management::bytesized::ByteSized;

#[cfg(test)]
mod test {
use crate::dictionary::bytes_buffer::PAGE_SIZE;

use super::{BytesBuffer, GlobalBytesBuffer};

crate::dictionary::bytes_buffer::declare_bytes_buffer!(TestGlobalBuffer, TEST_BUFFER);
Expand Down Expand Up @@ -363,6 +396,8 @@ mod test {
assert_ne!(bytes_ref1, bytes_ref2);
assert_eq!(bytes_ref1, bytes_ref1);

assert!(TestGlobalBuffer::buffer_size_bytes(bufid1) > PAGE_SIZE as u64);

TestGlobalBuffer::drop_buffer(bufid1);
TestGlobalBuffer::drop_buffer(bufid2);
TestGlobalBuffer2::drop_buffer(bufid3);
Expand Down
11 changes: 11 additions & 0 deletions nemo-physical/src/dictionary/bytes_dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use hashbrown::HashMap;
use super::bytes_buffer::{BytesRef, GlobalBytesBuffer};
use super::{AddResult, KNOWN_ID_MARK};
use crate::dictionary::datavalue_dictionary::{SMALL_KNOWN_ID_MARK, SMALL_KNOWN_ID_MARK_AS_USIZE};
use crate::management::bytesized::{size_inner_hashmap_flat, size_inner_vec_flat, ByteSized};

pub(crate) struct IdUtils {}

Expand Down Expand Up @@ -183,6 +184,16 @@ impl<B: GlobalBytesBuffer> Drop for BytesDictionary<B> {
}
}

impl<B: GlobalBytesBuffer> ByteSized for BytesDictionary<B> {
    /// Computes the number of bytes occupied by this dictionary: the struct itself, the flat
    /// sizes of `store`, `map_short` and `map_long`, and the bytes that the global buffer `B`
    /// reports for this dictionary's `buffer_id`.
    fn size_bytes(&self) -> u64 {
        let own_size = size_of::<Self>() as u64;
        let store_size = size_inner_vec_flat(&self.store);
        let short_map_size = size_inner_hashmap_flat(&self.map_short);
        let long_map_size = size_inner_hashmap_flat(&self.map_long);
        let buffer_size = B::buffer_size_bytes(self.buffer_id);

        own_size + store_size + short_map_size + long_map_size + buffer_size
    }
}

#[cfg(test)]
mod test {
use crate::dictionary::{
Expand Down
4 changes: 2 additions & 2 deletions nemo-physical/src/dictionary/datavalue_dictionary.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! General traits and global constants for dictionaries that work for datavalues.

use crate::datavalues::AnyDataValue;
use crate::{datavalues::AnyDataValue, management::bytesized::ByteSized};
use std::fmt::Debug;

/// Fake id that dictionaries use to indicate that an entry has no id.
Expand Down Expand Up @@ -51,7 +51,7 @@ impl AddResult {
///
/// The id values are provided when the dictionary is used, whereas the ids are newly
/// assigned by the dictionary itself.
pub trait DvDict: Debug {
pub trait DvDict: Debug + ByteSized {
/// Adds a new [AnyDataValue] to the dictionary. If the value is not known yet, it will
/// be assigned a new id. Unsupported datavalues can also be rejected, which specialized
/// dictionary implementations might do.
Expand Down
Loading

0 comments on commit d675b5d

Please sign in to comment.