Skip to content

Commit 6b42f10

Browse files
macvincent authored and facebook-github-bot committed
Write Column Logical Size Into Nimble Files (facebookincubator#179)
Summary: Pull Request resolved: facebookincubator#179 Differential Revision: D75973860
1 parent 83e48f0 commit 6b42f10

File tree

1 file changed

+68
-14
lines changed

1 file changed

+68
-14
lines changed

dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 68 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@
3636
#include "dwio/nimble/velox/SchemaSerialization.h"
3737
#include "dwio/nimble/velox/SchemaTypes.h"
3838
#include "dwio/nimble/velox/StatsGenerated.h"
39+
#include "dwio/nimble/velox/StreamLabels.h"
3940
#include "folly/ScopeGuard.h"
4041
#include "velox/common/time/CpuWallTimer.h"
4142
#include "velox/dwio/common/ExecutorBarrier.h"
@@ -47,6 +48,11 @@ namespace detail {
4748

4849
class WriterContext : public FieldWriterContext {
4950
public:
51+
// Per-column counters kept by the writer; VeloxWriter::write() adds to them
// for every column index of each written vector.
struct ColumnStats {
52+
// Running sum of the per-column size reported by RawSizeContext::sizeAt().
uint64_t logicalSize{0};
53+
// Running sum of the per-column null count reported by
// RawSizeContext::nullsAt().
uint64_t nullCount{0};
54+
};
55+
5056
const VeloxWriterOptions options;
5157
std::unique_ptr<FlushPolicy> flushPolicy;
5258
velox::CpuWallTiming totalFlushTiming;
@@ -63,6 +69,8 @@ class WriterContext : public FieldWriterContext {
6369
uint64_t stripeSize{0};
6470
uint64_t rawSize{0};
6571
std::vector<uint64_t> rowsPerStripe;
72+
std::unordered_map<offset_size, std::atomic<uint64_t>> streamPhysicalSize;
73+
std::vector<ColumnStats> columnStats;
6674

6775
WriterContext(
6876
velox::memory::MemoryPool& memoryPool,
@@ -516,10 +524,20 @@ bool VeloxWriter::write(const velox::VectorPtr& vector) {
516524
auto size = vector->size();
517525

518526
// Calculate raw size.
519-
auto rawSize = nimble::getRawSizeFromVector(
520-
vector, velox::common::Ranges::of(0, size));
527+
RawSizeContext context;
528+
auto rawSize = nimble::getRawSizeFromRowVector(
529+
vector, velox::common::Ranges::of(0, size), context, /*topLevel=*/true);
521530
DWIO_ENSURE_GE(rawSize, 0, "Invalid raw size");
522531
context_->rawSize += rawSize;
532+
auto columnCount = context.columnCount();
533+
if (context_->columnStats.empty()) {
534+
context_->columnStats =
535+
std::vector<detail::WriterContext::ColumnStats>(columnCount);
536+
}
537+
for (auto i = 0; i < columnCount; ++i) {
538+
context_->columnStats[i].logicalSize += context.sizeAt(i);
539+
context_->columnStats[i].nullCount += context.nullsAt(i);
540+
}
523541

524542
if (context_->options.writeExecutor) {
525543
velox::dwio::common::ExecutorBarrier barrier{
@@ -580,6 +598,34 @@ void VeloxWriter::close() {
580598
builder.GetSize()});
581599
}
582600

601+
{
602+
// Accumulate column physical size.
603+
std::vector<uint64_t> columnPhysicalSize(
604+
context_->columnStats.size(), 0);
605+
nimble::StreamLabels streamLabels{nimble::SchemaReader::getSchema(
606+
context_->schemaBuilder.getSchemaNodes())};
607+
for (const auto& [offset, streamSize] : context_->streamPhysicalSize) {
608+
if (offset == 0) {
609+
continue;
610+
}
611+
std::vector<std::string> streamLabel;
612+
folly::split(
613+
'/',
614+
streamLabels.streamLabel(offset),
615+
streamLabel,
616+
/*ignoreEmpty=*/true);
617+
NIMBLE_ASSERT(!streamLabel.empty(), "Invalid stream label");
618+
auto column = std::stoi(streamLabel[0]);
619+
NIMBLE_ASSERT(
620+
column < columnPhysicalSize.size(),
621+
fmt::format(
622+
"Index {} is out of range for physical size vector of size {}",
623+
column,
624+
columnPhysicalSize.size()));
625+
columnPhysicalSize[column] += streamSize;
626+
}
627+
}
628+
583629
{
584630
flatbuffers::FlatBufferBuilder builder;
585631
builder.Finish(serialization::CreateStats(builder, context_->rawSize));
@@ -691,14 +737,16 @@ void VeloxWriter::writeChunk(bool lastChunk) {
691737
StreamData& streamData_;
692738
};
693739

694-
auto encode = [&](StreamData& streamData) {
740+
auto encode = [&](StreamData& streamData,
741+
std::atomic<uint64_t>& streamSize) {
695742
const auto offset = streamData.descriptor().offset();
696743
auto encoded = encodeStream(*context_, *encodingBuffer_, streamData);
697744
if (!encoded.empty()) {
698745
ChunkedStreamWriter chunkWriter{*encodingBuffer_};
699746
NIMBLE_DASSERT(offset < streams_.size(), "Stream offset out of range.");
700747
auto& stream = streams_[offset];
701748
for (auto& buffer : chunkWriter.encode(encoded)) {
749+
streamSize += buffer.size();
702750
chunkSize += buffer.size();
703751
stream.content.push_back(std::move(buffer));
704752
}
@@ -739,29 +787,35 @@ void VeloxWriter::writeChunk(bool lastChunk) {
739787
velox::dwio::common::ExecutorBarrier barrier{
740788
context_->options.encodingExecutor};
741789
for (auto& streamData : context_->streams()) {
790+
auto& streamSize =
791+
context_->streamPhysicalSize[streamData->descriptor().offset()];
742792
processStream(
743793
*streamData, [&](StreamData& innerStreamData, bool isNullStream) {
744-
barrier.add([&innerStreamData, isNullStream, &encode]() {
745-
if (isNullStream) {
746-
NullsAsDataStreamData nullsStreamData{innerStreamData};
747-
encode(nullsStreamData);
748-
} else {
749-
encode(innerStreamData);
750-
}
751-
});
794+
barrier.add(
795+
[&innerStreamData, isNullStream, &encode, &streamSize]() {
796+
if (isNullStream) {
797+
NullsAsDataStreamData nullsStreamData{innerStreamData};
798+
encode(nullsStreamData, streamSize);
799+
} else {
800+
encode(innerStreamData, streamSize);
801+
}
802+
});
752803
});
753804
}
754805
barrier.waitAll();
755806
} else {
756807
for (auto& streamData : context_->streams()) {
808+
auto& streamSize =
809+
context_->streamPhysicalSize[streamData->descriptor().offset()];
757810
processStream(
758811
*streamData,
759-
[&encode](StreamData& innerStreamData, bool isNullStream) {
812+
[&encode, &streamSize](
813+
StreamData& innerStreamData, bool isNullStream) {
760814
if (isNullStream) {
761815
NullsAsDataStreamData nullsStreamData{innerStreamData};
762-
encode(nullsStreamData);
816+
encode(nullsStreamData, streamSize);
763817
} else {
764-
encode(innerStreamData);
818+
encode(innerStreamData, streamSize);
765819
}
766820
});
767821
}

0 commit comments

Comments
 (0)