Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.ozone.common.Checksum;
import org.apache.hadoop.ozone.common.ChecksumByteBuffer;
import org.apache.hadoop.ozone.common.ChecksumData;
import org.apache.hadoop.ozone.common.ChunkBuffer;
import org.apache.hadoop.ozone.common.OzoneChecksumException;
Expand Down Expand Up @@ -137,6 +138,11 @@ public class BlockOutputStream extends OutputStream {

private final List<DatanodeDetails> failedServers;
private final Checksum checksum;
// Running checksum updated alongside write() to avoid a second data read in writeChunkToContainer().
// Non-null only for CRC32/CRC32C; other types fall back to checksum.computeChecksum().
private final ChecksumByteBuffer runningCrc;
private int runningCrcBytesInSegment;
private final List<ByteString> runningCrcChecksums;

//number of buffers used before doing a flush/putBlock.
private int flushPeriod;
Expand Down Expand Up @@ -233,6 +239,9 @@ public BlockOutputStream(
failedServers = new CopyOnWriteArrayList<>();
ioException = new AtomicReference<>(null);
this.checksum = new Checksum(config.getChecksumType(), config.getBytesPerChecksum(), true);
this.runningCrc = this.checksum.newChecksumByteBuffer();
this.runningCrcBytesInSegment = 0;
this.runningCrcChecksums = new ArrayList<>();
this.clientMetrics = clientMetrics;
this.streamBufferArgs = streamBufferArgs;
this.allowPutBlockPiggybacking = canEnablePutblockPiggybacking();
Expand Down Expand Up @@ -351,6 +360,14 @@ public void write(int b) throws IOException {
allocateNewBufferIfNeeded();
currentBuffer.put((byte) b);
currentBufferRemaining--;
if (runningCrc != null) {
runningCrc.update(b);
if (++runningCrcBytesInSegment == config.getBytesPerChecksum()) {
runningCrcChecksums.add(Checksum.int2ByteString((int) runningCrc.getValue()));
runningCrc.reset();
runningCrcBytesInSegment = 0;
}
}
updateWrittenDataLength(1);
writeChunkIfNeeded();
doFlushOrWatchIfNeeded();
Expand Down Expand Up @@ -385,6 +402,9 @@ public void write(byte[] b, int off, int len) throws IOException {
allocateNewBufferIfNeeded();
final int writeLen = Math.min(currentBufferRemaining, len);
currentBuffer.put(b, off, writeLen);
if (runningCrc != null) {
accumulateRunningCrc(b, off, writeLen);
}
currentBufferRemaining -= writeLen;
updateWrittenDataLength(writeLen);
writeChunkIfNeeded();
Expand Down Expand Up @@ -414,6 +434,38 @@ private void doFlushOrWatchIfNeeded() throws IOException {
}
}

/**
 * Feeds {@code len} bytes of {@code b}, starting at {@code off}, into the running CRC.
 *
 * <p>The input is consumed in slices that never cross a checksum-segment boundary
 * ({@code config.getBytesPerChecksum()} bytes). Each time a segment fills up, its CRC
 * value is appended to {@code runningCrcChecksums} and the running CRC is reset so the
 * next segment starts from a clean state.
 *
 * @param b   source array holding the bytes just written
 * @param off index of the first byte in {@code b} to account for
 * @param len number of bytes to account for
 */
private void accumulateRunningCrc(byte[] b, int off, int len) {
  int position = off;
  int remaining = len;
  while (remaining > 0) {
    final int bytesPerChecksum = config.getBytesPerChecksum();
    // Never feed more than what is left in the current checksum segment.
    final int slice = Math.min(bytesPerChecksum - runningCrcBytesInSegment, remaining);
    runningCrc.update(b, position, slice);
    runningCrcBytesInSegment += slice;
    position += slice;
    remaining -= slice;

    if (runningCrcBytesInSegment == bytesPerChecksum) {
      // Segment complete: record its checksum and start a new segment.
      final ByteString segmentChecksum = Checksum.int2ByteString((int) runningCrc.getValue());
      runningCrcChecksums.add(segmentChecksum);
      runningCrc.reset();
      runningCrcBytesInSegment = 0;
    }
  }
}

/**
 * Drains the checksum state accumulated by the write path into a {@link ChecksumData}.
 *
 * <p>The result contains every completed segment checksum recorded so far, followed by
 * the checksum of the trailing partial segment when one exists. After this call the
 * running state (completed list, running CRC, partial-segment byte count) is empty.
 *
 * @return the accumulated checksum data, or {@code null} when there is nothing to consume:
 *         either no running CRC exists for this stream, or no bytes have been accumulated
 *         since the last call. Callers fall back to computing the checksum from the chunk.
 */
private ChecksumData consumeRunningCrc() {
  if (runningCrc == null) {
    return null;
  }
  if (runningCrcBytesInSegment == 0 && runningCrcChecksums.isEmpty()) {
    return null;
  }

  // Snapshot the completed segments, then clear the shared list for the next chunk.
  final List<ByteString> segments = new ArrayList<>(runningCrcChecksums);
  runningCrcChecksums.clear();

  if (runningCrcBytesInSegment > 0) {
    // Close out the trailing partial segment.
    final ByteString tail = Checksum.int2ByteString((int) runningCrc.getValue());
    segments.add(tail);
    runningCrc.reset();
    runningCrcBytesInSegment = 0;
  }

  return new ChecksumData(config.getChecksumType(), config.getBytesPerChecksum(), segments);
}

private void recordWatchForCommitAsync(CompletableFuture<PutBlockResult> putBlockResultFuture) {
final CompletableFuture<Void> flushFuture = putBlockResultFuture.thenCompose(x -> watchForCommit(x.commitIndex));

Expand Down Expand Up @@ -903,8 +955,12 @@ private CompletableFuture<PutBlockResult> writeChunkToContainer(
final long offset = chunkOffset.getAndAdd(effectiveChunkSize);
final ByteString data = chunk.toByteString(
bufferPool.byteStringConversion());
// chunk is incremental, don't cache its checksum
ChecksumData checksumData = checksum.computeChecksum(chunk, false);
// Use running CRC accumulated during write() when available (avoids a second full read over the chunk).
// Falls back to computeChecksum for the retry path (fresh stream, runningCrc empty) and non-CRC32 types.
ChecksumData checksumData = consumeRunningCrc();
if (checksumData == null) {
checksumData = checksum.computeChecksum(chunk, false);
}
// side note: checksum object is shared with PutBlock's (blockData) checksum calc,
// current impl does not support caching both
ChunkInfo chunkInfo = ChunkInfo.newBuilder()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hdds.scm.storage;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collections;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.LockSupport;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.GetCommittedBlockLengthResponseProto;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.PutBlockResponseProto;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Type;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
import org.apache.hadoop.hdds.scm.XceiverClientReply;
import org.apache.hadoop.hdds.scm.XceiverClientSpi;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.ratis.thirdparty.io.netty.buffer.ByteBuf;
import org.apache.ratis.thirdparty.io.netty.buffer.ByteBufOutputStream;
import org.apache.ratis.thirdparty.io.netty.buffer.PooledByteBufAllocator;

/**
 * Minimal xceiver client for {@link BlockOutputStreamWriteBenchmark}.
 *
 * <p>Replicates the production serialization path without real network I/O:
 * the proto is serialized via {@code request.writeTo(OutputStream)} — which
 * causes protobuf to use {@code OutputStreamEncoder}, the same encoder gRPC's
 * {@code MessageFramer} uses — into a pooled direct {@code ByteBuf}, then the
 * buffer is released immediately.
 *
 * <p>This exercises the full cross-domain copy chain:
 * <ul>
 * <li><b>direct chunk (pre-patch):</b> {@code NioByteString.writeTo} allocates
 * a temporary {@code byte[]}, copies from off-heap via {@code copyMemory}
 * (copy 1), then writes heap→direct ByteBuf (copy 2).</li>
 * <li><b>heap chunk (this patch):</b> {@code BoundedByteString.writeTo} writes
 * directly heap→direct ByteBuf (copy 1 only).</li>
 * </ul>
 *
 * <p>Every command is answered synchronously with an already-completed
 * {@code SUCCESS} reply; the only state kept is a monotonically increasing log index.
 */
final class BenchmarkMockXceiverClient extends XceiverClientSpi {

  private final Pipeline pipeline;
  // Incremented once per sendCommandAsync() call and reported as the reply's log index.
  private final AtomicLong logIndex = new AtomicLong();
  // Simulated Raft commit latency. The calling thread parks for this duration in
  // watchForCommit(), mimicking the real-cluster scenario where the writer blocks
  // waiting for consensus before allocating the next buffer.
  private final long commitLatencyNs;

  /**
   * @param pipeline        pipeline reported by {@link #getPipeline()} and {@link #getPipelineType()}
   * @param commitLatencyNs nanoseconds the caller of {@link #watchForCommit(long)} is parked;
   *                        zero or negative disables the delay
   */
  BenchmarkMockXceiverClient(Pipeline pipeline, long commitLatencyNs) {
    this.pipeline = pipeline;
    this.commitLatencyNs = commitLatencyNs;
  }

  @Override
  public void connect() {
    // Nothing to connect to: no network I/O is performed by this mock.
  }

  @Override
  public void close() {
    // No resources are held between calls.
  }

  @Override
  public Pipeline getPipeline() {
    return pipeline;
  }

  /**
   * Serializes {@code request} into a throw-away direct buffer and answers with a
   * completed {@code SUCCESS} reply carrying the next log index.
   *
   * <p>For {@code PutBlock} requests the reply echoes the block ID and size taken from
   * the request's block data as the committed block length.
   *
   * @throws UncheckedIOException if writing the request to the buffer fails
   */
  @Override
  public XceiverClientReply sendCommandAsync(ContainerCommandRequestProto request) {
    serializeAndDiscard(request);

    final ContainerCommandResponseProto.Builder builder =
        ContainerCommandResponseProto.newBuilder()
            .setResult(Result.SUCCESS)
            .setCmdType(request.getCmdType());
    if (request.getCmdType() == Type.PutBlock) {
      builder.setPutBlock(PutBlockResponseProto.newBuilder()
          .setCommittedBlockLength(
              GetCommittedBlockLengthResponseProto.newBuilder()
                  .setBlockID(request.getPutBlock().getBlockData().getBlockID())
                  .setBlockLength(request.getPutBlock().getBlockData().getSize())
                  .build())
          .build());
    }
    final XceiverClientReply reply =
        new XceiverClientReply(CompletableFuture.completedFuture(builder.build()));
    reply.setLogIndex(logIndex.incrementAndGet());
    return reply;
  }

  @Override
  public ReplicationType getPipelineType() {
    return pipeline.getType();
  }

  /**
   * Parks the calling thread for the configured commit latency, then returns a completed
   * reply whose log index is {@code index}.
   */
  @Override
  public CompletableFuture<XceiverClientReply> watchForCommit(long index) {
    // Block the calling thread for commitLatencyNs to mimic the Raft leader
    // requiring consensus before acknowledging a putBlock. This causes back-pressure:
    // buffers stay in-flight, pool fills up, and concurrent threads thrash L3 cache
    // while each writer waits — reproducing the cold-staging-buffer scenario captured
    // by the real-cluster async-profiler (12% CPU in computeChecksum).
    simulateCommitLatency();

    final ContainerCommandResponseProto response =
        ContainerCommandResponseProto.newBuilder()
            .setCmdType(Type.WriteChunk)
            .setResult(Result.SUCCESS)
            .build();
    final XceiverClientReply reply =
        new XceiverClientReply(CompletableFuture.completedFuture(response));
    reply.setLogIndex(index);
    return CompletableFuture.completedFuture(reply);
  }

  @Override
  public long getReplicatedMinCommitIndex() {
    return logIndex.get();
  }

  /**
   * Not exercised by the benchmark; returns an empty map rather than {@code null} so a
   * caller iterating the result does not hit a {@code NullPointerException}.
   */
  @Override
  public Map<DatanodeDetails, ContainerCommandResponseProto> sendCommandOnAllNodes(
      ContainerCommandRequestProto request) {
    return Collections.emptyMap();
  }

  /**
   * Writes {@code request} into a pooled direct buffer sized to the serialized message,
   * then releases the buffer.
   */
  private static void serializeAndDiscard(ContainerCommandRequestProto request) {
    // Replicate: gRPC MessageFramer allocates a pooled direct ByteBuf via
    // NettyWritableBufferAllocator, then serializes the proto into it through
    // OutputStreamEncoder → WritableBufferOutputStream → ByteBuf.writeBytes().
    // For NioByteString (direct chunk data): copyMemory(direct→heap tmp) + write(heap→direct).
    // For BoundedByteString (heap chunk data): write(heap→direct) only.
    // The buffer is released immediately rather than being enqueued to a socket.
    final int size = request.getSerializedSize();
    final ByteBuf frame = PooledByteBufAllocator.DEFAULT.directBuffer(size, size);
    try {
      request.writeTo(new ByteBufOutputStream(frame));
    } catch (IOException e) {
      throw new UncheckedIOException(
          "Failed to serialize " + request.getCmdType() + " request of " + size + " bytes", e);
    } finally {
      // Always return the buffer to the pool, even when serialization fails.
      frame.release();
    }
  }

  /**
   * Parks the current thread until {@code commitLatencyNs} has elapsed.
   *
   * <p>{@code LockSupport.parkNanos} may return early (spuriously or after an unpark), so
   * the remaining time is re-checked against a deadline. If the thread is interrupted the
   * wait ends immediately and the interrupt status is left set for the caller.
   */
  private void simulateCommitLatency() {
    if (commitLatencyNs <= 0) {
      return;
    }
    final long deadline = System.nanoTime() + commitLatencyNs;
    long remaining = commitLatencyNs;
    while (remaining > 0 && !Thread.currentThread().isInterrupted()) {
      LockSupport.parkNanos(remaining);
      remaining = deadline - System.nanoTime();
    }
  }
}
Loading