Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1480,6 +1480,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
return current_encoder_->EstimatedDataEncodedSize();
}

// Reports the combined length of the definition-level sink and the
// repetition-level sink as the estimated size of the buffered levels.
int64_t EstimatedBufferedLevelsBytes() const override {
  const auto def_level_bytes = definition_levels_sink_.length();
  const auto rep_level_bytes = repetition_levels_sink_.length();
  return def_level_bytes + rep_level_bytes;
}

protected:
std::shared_ptr<Buffer> GetValuesBuffer() override {
return current_encoder_->FlushValues();
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/parquet/column_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ class PARQUET_EXPORT ColumnWriter {
/// \brief Estimated size of the values that are not written to a page yet.
virtual int64_t estimated_buffered_value_bytes() const = 0;

/// \brief Estimated size in bytes of the levels (definition and repetition) that
/// are buffered but not written to a page yet.
virtual int64_t EstimatedBufferedLevelsBytes() const = 0;

/// \brief The file-level writer properties
virtual const WriterProperties* properties() = 0;

Expand Down
19 changes: 19 additions & 0 deletions cpp/src/parquet/file_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const {
return contents_->total_compressed_bytes_written();
}

// Forwards to the row group contents' estimate of buffered bytes.
int64_t RowGroupWriter::total_buffered_bytes() const {
  const int64_t buffered_bytes = contents_->EstimatedBufferedBytes();
  return buffered_bytes;
}

// Forwards the buffered-mode query to the row group contents.
bool RowGroupWriter::buffered() const {
  const bool is_buffered = contents_->buffered();
  return is_buffered;
}

// Forwards the current-column query to the row group contents.
int RowGroupWriter::current_column() {
  const int column_index = contents_->current_column();
  return column_index;
}
Expand Down Expand Up @@ -195,6 +199,21 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
return total_compressed_bytes_written;
}

/// \brief Estimated bytes buffered by the column writers of this row group.
///
/// Returns 0 when closed_ is set. Otherwise sums, over every non-null entry of
/// column_writers_, that writer's estimated_buffered_value_bytes() and
/// EstimatedBufferedLevelsBytes().
int64_t EstimatedBufferedBytes() const override {
if (closed_) {
return 0;
}
// Despite its name, this accumulator holds value bytes plus level bytes.
int64_t estimated_buffered_value_bytes = 0;
for (size_t i = 0; i < column_writers_.size(); i++) {
// Null slots in column_writers_ contribute nothing and are skipped.
if (column_writers_[i]) {
estimated_buffered_value_bytes +=
column_writers_[i]->estimated_buffered_value_bytes() +
column_writers_[i]->EstimatedBufferedLevelsBytes();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still have the question: do we need to consider buffered dictionary values?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of return a sum, should we return a struct like below:

struct BufferedStats {
  int64_t def_level_bytes;
  int64_t rep_level_bytes;
  int64_t value_bytes;
  int64_t dict_bytes;
};

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed.

}
}
return estimated_buffered_value_bytes;
}

// Returns the stored buffered_row_group_ flag.
bool buffered() const override {
  return buffered_row_group_;
}

void Close() override {
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/parquet/file_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ class PARQUET_EXPORT RowGroupWriter {
virtual int64_t total_compressed_bytes() const = 0;
/// \brief total compressed bytes written by the page writer
virtual int64_t total_compressed_bytes_written() const = 0;
/// \brief Estimated bytes of values and levels that are buffered by the column
/// writers but not written to a page yet
virtual int64_t EstimatedBufferedBytes() const = 0;

virtual bool buffered() const = 0;
};
Expand Down Expand Up @@ -99,6 +102,8 @@ class PARQUET_EXPORT RowGroupWriter {
int64_t total_compressed_bytes() const;
/// \brief total compressed bytes written by the page writer
int64_t total_compressed_bytes_written() const;
/// \brief estimated total bytes of values and levels that are buffered but not
/// written to a page yet (as reported by Contents::EstimatedBufferedBytes)
int64_t total_buffered_bytes() const;

/// Returns whether the current RowGroupWriter is in the buffered mode and is created
/// by calling ParquetFileWriter::AppendBufferedRowGroup.
Expand Down
Loading