diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 94b67dfa807e..795ce679cb1f 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1480,6 +1480,21 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, return current_encoder_->EstimatedDataEncodedSize(); } + int64_t estimated_buffered_def_level_bytes() const override { + return definition_levels_sink_.length(); + } + + int64_t estimated_buffered_rep_level_bytes() const override { + return repetition_levels_sink_.length(); + } + + int64_t estimated_buffered_dict_bytes() const override { + if (current_dict_encoder_) { + return current_dict_encoder_->dict_encoded_size(); + } + return 0; + } + protected: std::shared_ptr GetValuesBuffer() override { return current_encoder_->FlushValues(); diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 5b56eb010a24..0516d7937cba 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -164,6 +164,15 @@ class PARQUET_EXPORT ColumnWriter { /// \brief Estimated size of the values that are not written to a page yet. virtual int64_t estimated_buffered_value_bytes() const = 0; + /// \brief Estimated size of the definition levels that are not written to a page yet. + virtual int64_t estimated_buffered_def_level_bytes() const = 0; + + /// \brief Estimated size of the repetition levels that are not written to a page yet. + virtual int64_t estimated_buffered_rep_level_bytes() const = 0; + + /// \brief Estimated size of the dictionary that is not written to a page yet. 
+ virtual int64_t estimated_buffered_dict_bytes() const = 0; + /// \brief The file-level writer properties virtual const WriterProperties* properties() = 0; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index ddec2c0a5602..b8b1d06ea3ad 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -68,6 +68,10 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const { return contents_->total_compressed_bytes_written(); } +RowGroupWriter::BufferedStats RowGroupWriter::estimated_buffered_stats() const { + return contents_->EstimatedBufferedStats(); +} + bool RowGroupWriter::buffered() const { return contents_->buffered(); } int RowGroupWriter::current_column() { return contents_->current_column(); } @@ -195,6 +199,22 @@ class RowGroupSerializer : public RowGroupWriter::Contents { return total_compressed_bytes_written; } + RowGroupWriter::BufferedStats EstimatedBufferedStats() const override { + RowGroupWriter::BufferedStats stats; + if (closed_) { + return stats; + } + for (size_t i = 0; i < column_writers_.size(); i++) { + if (column_writers_[i]) { + stats.def_level_bytes += column_writers_[i]->estimated_buffered_def_level_bytes(); + stats.rep_level_bytes += column_writers_[i]->estimated_buffered_rep_level_bytes(); + stats.value_bytes += column_writers_[i]->estimated_buffered_value_bytes(); + stats.dict_bytes += column_writers_[i]->estimated_buffered_dict_bytes(); + } + } + return stats; + } + bool buffered() const override { return buffered_row_group_; } void Close() override { diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index d5ea1d7c98a0..3ca71875d9db 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -36,6 +36,15 @@ static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; class PARQUET_EXPORT RowGroupWriter { public: + // Estimated uncompressed byte sizes of data buffered by column writers + // that have not yet been serialized 
into data pages. + struct BufferedStats { + int64_t def_level_bytes = 0; + int64_t rep_level_bytes = 0; + int64_t value_bytes = 0; + int64_t dict_bytes = 0; + }; + // Forward declare a virtual class 'Contents' to aid dependency injection and more // easily create test fixtures // An implementation of the Contents class is defined in the .cc file @@ -58,6 +67,9 @@ class PARQUET_EXPORT RowGroupWriter { virtual int64_t total_compressed_bytes() const = 0; /// \brief total compressed bytes written by the page writer virtual int64_t total_compressed_bytes_written() const = 0; + /// \brief Estimated sizes of buffered data (levels, values, dict) not yet + /// written to a page. + virtual BufferedStats EstimatedBufferedStats() const = 0; virtual bool buffered() const = 0; }; @@ -99,6 +111,9 @@ class PARQUET_EXPORT RowGroupWriter { int64_t total_compressed_bytes() const; /// \brief total compressed bytes written by the page writer int64_t total_compressed_bytes_written() const; + /// \brief Estimated sizes of buffered data (levels, values, dict) not yet + /// written to a page. + BufferedStats estimated_buffered_stats() const; /// Returns whether the current RowGroupWriter is in the buffered mode and is created /// by calling ParquetFileWriter::AppendBufferedRowGroup.