Skip to content

Commit 32c627b

Browse files
macvincent authored and facebook-github-bot committed
Write Column Logical Size Into Nimble Files (#179)
Summary: Pull Request resolved: #179 Differential Revision: D75973860
1 parent a9e2ea0 commit 32c627b

File tree

1 file changed

+60
-14
lines changed

1 file changed

+60
-14
lines changed

dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 60 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ namespace detail {
4747

4848
class WriterContext : public FieldWriterContext {
4949
public:
50+
struct ColumnStats {
51+
uint64_t logicalSize{0};
52+
uint64_t nullCount{0};
53+
};
54+
5055
const VeloxWriterOptions options;
5156
std::unique_ptr<FlushPolicy> flushPolicy;
5257
velox::CpuWallTiming totalFlushTiming;
@@ -63,6 +68,8 @@ class WriterContext : public FieldWriterContext {
6368
uint64_t stripeSize{0};
6469
uint64_t rawSize{0};
6570
std::vector<uint64_t> rowsPerStripe;
71+
std::map<offset_size, std::atomic<uint64_t>> streamPhysicalSize;
72+
std::vector<ColumnStats> columnStats;
6673

6774
WriterContext(
6875
velox::memory::MemoryPool& memoryPool,
@@ -516,10 +523,21 @@ bool VeloxWriter::write(const velox::VectorPtr& vector) {
516523
auto size = vector->size();
517524

518525
// Calculate raw size.
519-
auto rawSize = nimble::getRawSizeFromVector(
520-
vector, velox::common::Ranges::of(0, size));
526+
RawSizeContext context;
527+
auto rawSize = nimble::getRawSizeFromRowVector(
528+
vector, velox::common::Ranges::of(0, size), context, /*topLevel=*/true);
529+
LOG(INFO) << "Raw size: " << rawSize;
521530
DWIO_ENSURE_GE(rawSize, 0, "Invalid raw size");
522531
context_->rawSize += rawSize;
532+
auto columnCount = context.columnCount();
533+
if (context_->columnStats.empty()) {
534+
context_->columnStats =
535+
std::vector<detail::WriterContext::ColumnStats>(columnCount);
536+
}
537+
for (auto i = 0; i < columnCount; ++i) {
538+
context_->columnStats[i].logicalSize += context.sizeAt(i);
539+
context_->columnStats[i].nullCount += context.nullsAt(i);
540+
}
523541

524542
if (context_->options.writeExecutor) {
525543
velox::dwio::common::ExecutorBarrier barrier{
@@ -580,6 +598,26 @@ void VeloxWriter::close() {
580598
builder.GetSize()});
581599
}
582600

601+
{
602+
// Accumulate column physical size.
603+
std::vector<uint64_t> columnPhysicalSize(
604+
context_->columnStats.size(), 0);
605+
const auto& physicalSize = context_->streamPhysicalSize;
606+
for (const auto& node : context_->schemaBuilder.getSchemaNodes()) {
607+
auto column = node->column();
608+
offset_size offset = node->offset();
609+
if (physicalSize.contains(offset) && column.has_value()) {
610+
NIMBLE_CHECK(
611+
column.value() < columnPhysicalSize.size(),
612+
fmt::format(
613+
"Column {} is out of range. Schema has {} columns.",
614+
column.value(),
615+
columnPhysicalSize.size()));
616+
columnPhysicalSize[column.value()] += physicalSize.at(offset);
617+
}
618+
}
619+
}
620+
583621
{
584622
flatbuffers::FlatBufferBuilder builder;
585623
builder.Finish(serialization::CreateStats(builder, context_->rawSize));
@@ -691,14 +729,16 @@ void VeloxWriter::writeChunk(bool lastChunk) {
691729
StreamData& streamData_;
692730
};
693731

694-
auto encode = [&](StreamData& streamData) {
732+
auto encode = [&](StreamData& streamData,
733+
std::atomic<uint64_t>& streamSize) {
695734
const auto offset = streamData.descriptor().offset();
696735
auto encoded = encodeStream(*context_, *encodingBuffer_, streamData);
697736
if (!encoded.empty()) {
698737
ChunkedStreamWriter chunkWriter{*encodingBuffer_};
699738
NIMBLE_DASSERT(offset < streams_.size(), "Stream offset out of range.");
700739
auto& stream = streams_[offset];
701740
for (auto& buffer : chunkWriter.encode(encoded)) {
741+
streamSize += buffer.size();
702742
chunkSize += buffer.size();
703743
stream.content.push_back(std::move(buffer));
704744
}
@@ -739,29 +779,35 @@ void VeloxWriter::writeChunk(bool lastChunk) {
739779
velox::dwio::common::ExecutorBarrier barrier{
740780
context_->options.encodingExecutor};
741781
for (auto& streamData : context_->streams()) {
782+
auto& streamSize =
783+
context_->streamPhysicalSize[streamData->descriptor().offset()];
742784
processStream(
743785
*streamData, [&](StreamData& innerStreamData, bool isNullStream) {
744-
barrier.add([&innerStreamData, isNullStream, &encode]() {
745-
if (isNullStream) {
746-
NullsAsDataStreamData nullsStreamData{innerStreamData};
747-
encode(nullsStreamData);
748-
} else {
749-
encode(innerStreamData);
750-
}
751-
});
786+
barrier.add(
787+
[&innerStreamData, isNullStream, &encode, &streamSize]() {
788+
if (isNullStream) {
789+
NullsAsDataStreamData nullsStreamData{innerStreamData};
790+
encode(nullsStreamData, streamSize);
791+
} else {
792+
encode(innerStreamData, streamSize);
793+
}
794+
});
752795
});
753796
}
754797
barrier.waitAll();
755798
} else {
756799
for (auto& streamData : context_->streams()) {
800+
auto& streamSize =
801+
context_->streamPhysicalSize[streamData->descriptor().offset()];
757802
processStream(
758803
*streamData,
759-
[&encode](StreamData& innerStreamData, bool isNullStream) {
804+
[&encode, &streamSize](
805+
StreamData& innerStreamData, bool isNullStream) {
760806
if (isNullStream) {
761807
NullsAsDataStreamData nullsStreamData{innerStreamData};
762-
encode(nullsStreamData);
808+
encode(nullsStreamData, streamSize);
763809
} else {
764-
encode(innerStreamData);
810+
encode(innerStreamData, streamSize);
765811
}
766812
});
767813
}

0 commit comments

Comments (0)