apache · abhishekrb19 · Jun 18, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/...ltalake-extensions/src/main/java/org/apache/druid/delta/input/DeltaInputSourceReader.java b/...ltalake-extensions/src/main/java/org/apache/druid/delta/input/DeltaInputSourceReader.java
@@ -95,6 +95,13 @@ private static class DeltaInputSourceIterator implements CloseableIterator<Input
   {
     private final Iterator<io.delta.kernel.utils.CloseableIterator<FilteredColumnarBatch>> filteredColumnarBatchIterators;
 
+    // Keep a reference to the current file's batch iterator so we drain ALL
+    // its batches before advancing to the next file.
+    // Bug fix for https://github.com/apache/druid/issues/18606:
+    // the original code held this iterator in a local variable inside hasNext(),
+    // so it was discarded on return and only the first non-empty batch of each
+    // file (1024 rows at the default batch size) was read.
+    private io.delta.kernel.utils.CloseableIterator<FilteredColumnarBatch> currentFileIterator = null;
     private io.delta.kernel.utils.CloseableIterator<Row> currentBatch = null;
     private final InputRowSchema inputRowSchema;
 
@@ -111,20 +118,31 @@ public DeltaInputSourceIterator(
     public boolean hasNext()
     {
       while (currentBatch == null || !currentBatch.hasNext()) {
-        if (!filteredColumnarBatchIterators.hasNext()) {
-          return false; // No more batches or records to read!
-        }
-
-        final io.delta.kernel.utils.CloseableIterator<FilteredColumnarBatch> filteredBatchIterator =
-            filteredColumnarBatchIterators.next();
-
-        while (filteredBatchIterator.hasNext()) {
-          final FilteredColumnarBatch nextBatch = filteredBatchIterator.next();
+        // Drain remaining batches from the current file before moving to the next.
+        while (currentFileIterator != null && currentFileIterator.hasNext()) {
+          final FilteredColumnarBatch nextBatch = currentFileIterator.next();
           currentBatch = nextBatch.getRows();
           if (currentBatch.hasNext()) {
             return true;
           }
         }
+
+        // Advance to the next file.
+        if (!filteredColumnarBatchIterators.hasNext()) {
+          return false;
+        }
+        // Close the exhausted iterator of the previous file before replacing the
+        // reference; otherwise it is never closed, since close() only sees the
+        // iterator that is current at that time.
+        if (currentFileIterator != null) {
+          try {
+            currentFileIterator.close();
+          }
+          catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }
+        currentFileIterator = filteredColumnarBatchIterators.next();
       }
       return true;
     }
@@ -146,8 +164,10 @@ public void close() throws IOException
       if (currentBatch != null) {
         currentBatch.close();
       }
-
-      if (filteredColumnarBatchIterators.hasNext()) {
+      if (currentFileIterator != null) {
+        currentFileIterator.close();
+      }
+      while (filteredColumnarBatchIterators.hasNext()) {
         filteredColumnarBatchIterators.next().close();
       }
     }

diff --git a/...deltalake-extensions/src/test/java/org/apache/druid/delta/input/DeltaInputSourceTest.java b/...deltalake-extensions/src/test/java/org/apache/druid/delta/input/DeltaInputSourceTest.java
@@ -439,6 +439,46 @@ private static List<InputRow> readAllRows(InputSourceReader reader) throws IOExc
     return rows;
   }
 
+  /**
+   * Regression test for https://github.com/apache/druid/issues/18606.
+   *
+   * {@link DeltaInputSourceReader.DeltaInputSourceIterator} used a local variable for the
+   * per-file {@code CloseableIterator<FilteredColumnarBatch>}. When {@code hasNext()} returned
+   * after the first non-empty batch of a file, that iterator went out of scope. The next
+   * {@code hasNext()} call advanced to the next file, skipping all remaining batches of the
+   * current file. With the Delta kernel default batch size of 1024 rows this produced exactly
+   * {@code 1024 * numFiles} rows regardless of actual file size.
+   *
+   * Test table: 2 Parquet files (2002 and 1998 rows per the Delta log) = 4000 rows total.
+   * Without the fix: 1024 x 2 = 2048 rows.
+   * With the fix:    4000 rows.
+   */
+  public static class BatchDrainRegressionTests
+  {
+    @Test
+    public void testAllRowsReturnedWhenFileExceedsOneBatch() throws IOException
+    {
+      final DeltaInputSource deltaInputSource = new DeltaInputSource(
+          LargeRowGroupDeltaTable.DELTA_TABLE_PATH,
+          null,
+          null,
+          null
+      );
+      final InputSourceReader inputSourceReader = deltaInputSource.reader(
+          LargeRowGroupDeltaTable.SCHEMA,
+          null,
+          null
+      );
+      final List<InputRow> rows = readAllRows(inputSourceReader);
+      Assert.assertEquals(
+          "Expected all rows to be read. "
+          + "If this fails with " + (1024 * 2) + " rows, the per-file batch drain bug (GH-18606) has regressed.",
+          LargeRowGroupDeltaTable.EXPECTED_ROW_COUNT,
+          rows.size()
+      );
+    }
+  }
+
   private static void validateRows(
       final List<Map<String, Object>> expectedRows,
       final List<InputRow> actualReadRows,

diff --git a/...talake-extensions/src/test/java/org/apache/druid/delta/input/LargeRowGroupDeltaTable.java b/...talake-extensions/src/test/java/org/apache/druid/delta/input/LargeRowGroupDeltaTable.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.delta.input;
+
+import com.google.common.collect.ImmutableList;
+import org.apache.druid.data.input.ColumnsFilter;
+import org.apache.druid.data.input.InputRowSchema;
+import org.apache.druid.data.input.impl.DimensionsSpec;
+import org.apache.druid.data.input.impl.LongDimensionSchema;
+import org.apache.druid.data.input.impl.StringDimensionSchema;
+import org.apache.druid.data.input.impl.TimestampSpec;
+
+/**
+ * Descriptor for a Delta table with 2 Parquet files (2002 and 1998 rows) = 4000 rows total.
+ *
+ * Each file has more than 1024 rows, ensuring the Delta kernel reads more than one
+ * batch per file. Used by the regression test for GH-18606, where
+ * {@link DeltaInputSourceReader} only returned the first 1024 rows per file.
+ *
+ * Generated by src/test/resources/create_delta_table.py (large-row-group-table).
+ */
+public class LargeRowGroupDeltaTable
+{
+  public static final String DELTA_TABLE_PATH =
+      "src/test/resources/large-row-group-table";
+
+  public static final int EXPECTED_ROW_COUNT = 4000;
+
+  public static final InputRowSchema SCHEMA = new InputRowSchema(
+      new TimestampSpec("id", "posix", null),
+      new DimensionsSpec(ImmutableList.of(
+          new LongDimensionSchema("id"),
+          new StringDimensionSchema("name")
+      )),
+      ColumnsFilter.all()
+  );
+}
diff --git a/...-row-group-table/.part-00000-42349806-104f-42a0-a6fe-5397f37d29d8-c000.snappy.parquet.crc b/...-row-group-table/.part-00000-42349806-104f-42a0-a6fe-5397f37d29d8-c000.snappy.parquet.crc
diff --git a/...-row-group-table/.part-00001-42b5d278-2c32-4094-af60-5eaca2f7ba03-c000.snappy.parquet.crc b/...-row-group-table/.part-00001-42b5d278-2c32-4094-af60-5eaca2f7ba03-c000.snappy.parquet.crc
diff --git a/...nsions/src/test/resources/large-row-group-table/_delta_log/.00000000000000000000.json.crc b/...nsions/src/test/resources/large-row-group-table/_delta_log/.00000000000000000000.json.crc
diff --git a/...-extensions/src/test/resources/large-row-group-table/_delta_log/00000000000000000000.json b/...-extensions/src/test/resources/large-row-group-table/_delta_log/00000000000000000000.json
@@ -0,0 +1,5 @@
+{"commitInfo":{"timestamp":1781690365208,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"2","numOutputRows":"4000","numOutputBytes":"36263"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0","txnId":"f2a1da56-9880-474d-80cb-520430c4d221"}}
+{"metaData":{"id":"c1c6ec87-61f6-4ca9-8b67-2edd4a2e6acb","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1781690363342}}
+{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
+{"add":{"path":"part-00000-42349806-104f-42a0-a6fe-5397f37d29d8-c000.snappy.parquet","partitionValues":{},"size":18078,"modificationTime":1781690364771,"dataChange":true,"stats":"{\"numRecords\":2002,\"minValues\":{\"id\":1,\"name\":\"name_1\"},\"maxValues\":{\"id\":3995,\"name\":\"name_995\"},\"nullCount\":{\"id\":0,\"name\":0}}"}}
+{"add":{"path":"part-00001-42b5d278-2c32-4094-af60-5eaca2f7ba03-c000.snappy.parquet","partitionValues":{},"size":18185,"modificationTime":1781690364771,"dataChange":true,"stats":"{\"numRecords\":1998,\"minValues\":{\"id\":0,\"name\":\"name_0\"},\"maxValues\":{\"id\":3999,\"name\":\"name_999\"},\"nullCount\":{\"id\":0,\"name\":0}}"}}
diff --git a/...large-row-group-table/part-00000-42349806-104f-42a0-a6fe-5397f37d29d8-c000.snappy.parquet b/...large-row-group-table/part-00000-42349806-104f-42a0-a6fe-5397f37d29d8-c000.snappy.parquet
diff --git a/...large-row-group-table/part-00001-42b5d278-2c32-4094-af60-5eaca2f7ba03-c000.snappy.parquet b/...large-row-group-table/part-00001-42b5d278-2c32-4094-af60-5eaca2f7ba03-c000.snappy.parquet