diff --git a/core/src/main/java/dev/hardwood/internal/reader/NestedBatchDataView.java b/core/src/main/java/dev/hardwood/internal/reader/NestedBatchDataView.java
index e094a00e..74625295 100644
--- a/core/src/main/java/dev/hardwood/internal/reader/NestedBatchDataView.java
+++ b/core/src/main/java/dev/hardwood/internal/reader/NestedBatchDataView.java
@@ -570,12 +570,25 @@ public PqVariant getVariant(int projectedIndex) {
private static final byte[] VARIANT_NULL_VALUE = new byte[] { 0x00 };
private PqVariant createVariant(TopLevelFieldMap.FieldDesc.Variant desc) {
+ // Variant bytes are decoded lazily here, so a malformed `metadata`/`value`
+ // surfaces as a decode error from deep inside the variant codec. Enrich it
+ // (and the local projection checks) with the originating file name, matching
+ // the other accessors in this view.
+ try {
+ return createVariantInternal(desc);
+ }
+ catch (RuntimeException e) {
+ throw ExceptionContext.addFileContext(currentFileName, e);
+ }
+ }
+
+ private PqVariant createVariantInternal(TopLevelFieldMap.FieldDesc.Variant desc) {
if (isVariantNull(desc)) {
return null;
}
if (desc.metadataCol() < 0) {
- throw new IllegalStateException(prefix()
- + "Variant column '" + desc.schema().name() + "' requires its 'metadata' child in the projection");
+ throw new IllegalStateException(
+ "Variant column '" + desc.schema().name() + "' requires its 'metadata' child in the projection");
}
int metaIdx = cachedValueIndex[desc.metadataCol()];
byte[] metadataBytes = batchIndex.getBinary(desc.metadataCol(), metaIdx);
@@ -597,8 +610,8 @@ private PqVariant createVariant(TopLevelFieldMap.FieldDesc.Variant desc) {
// Unshredded: the raw value bytes ARE the canonical value bytes.
int valueCol = desc.valueCol();
if (valueCol < 0) {
- throw new IllegalStateException(prefix()
- + "Variant column '" + desc.schema().name() + "' requires its 'value' child in the projection");
+ throw new IllegalStateException(
+ "Variant column '" + desc.schema().name() + "' requires its 'value' child in the projection");
}
int valIdx = cachedValueIndex[valueCol];
byte[] value = batchIndex.getBinary(valueCol, valIdx);
diff --git a/core/src/main/java/dev/hardwood/internal/variant/PqVariantObjectImpl.java b/core/src/main/java/dev/hardwood/internal/variant/PqVariantObjectImpl.java
index a7d1f0da..684f8765 100644
--- a/core/src/main/java/dev/hardwood/internal/variant/PqVariantObjectImpl.java
+++ b/core/src/main/java/dev/hardwood/internal/variant/PqVariantObjectImpl.java
@@ -22,9 +22,11 @@
import dev.hardwood.row.VariantType;
/// [PqVariantObject] implementation. Caches the parsed [ObjectLayout] so
-/// repeated field accesses don't re-walk the header; field-id-to-index lookup
-/// uses binary search over the object's id array (ids are guaranteed sorted
-/// ascending per the Variant spec).
+/// repeated field accesses don't re-walk the header. Field lookup resolves the
+/// name to its dictionary id once, then scans the object's id array linearly:
+/// the ids are ordered by field name (not numerically), so a numeric binary
+/// search would be incorrect unless the metadata dictionary is itself sorted.
+/// See [#indexOf(String)].
final class PqVariantObjectImpl implements PqVariantObject {
private final VariantMetadata metadata;
diff --git a/core/src/main/java/dev/hardwood/internal/variant/VariantBinary.java b/core/src/main/java/dev/hardwood/internal/variant/VariantBinary.java
index 6d96f98c..536bb5f0 100644
--- a/core/src/main/java/dev/hardwood/internal/variant/VariantBinary.java
+++ b/core/src/main/java/dev/hardwood/internal/variant/VariantBinary.java
@@ -13,7 +13,7 @@
/// encoding. All multi-byte integers in the encoding are unsigned little-endian
/// unless otherwise noted.
///
-/// @see Parquet Variant Encoding
+/// @see Parquet Variant Encoding
public final class VariantBinary {
private VariantBinary() {}
@@ -21,13 +21,15 @@ private VariantBinary() {}
// ==================== Metadata header byte ====================
// bit 0-3: version (must be 1)
// bit 4: sorted_strings flag
- // bit 5-6: offset_size_minus_one (0..3 → 1..4 bytes)
- // bit 7: unused
+ // bit 5: unused
+ // bit 6-7: offset_size_minus_one (0..3 → 1..4 bytes)
+ // Bit layout per the Variant spec's "Metadata encoding" section:
+ // https://parquet.apache.org/docs/file-format/types/variantencoding/#metadata-encoding
public static final int METADATA_VERSION = 1;
public static final int METADATA_VERSION_MASK = 0x0F;
public static final int METADATA_SORTED_MASK = 0x10;
- public static final int METADATA_OFFSET_SIZE_SHIFT = 5;
+ public static final int METADATA_OFFSET_SIZE_SHIFT = 6;
public static final int METADATA_OFFSET_SIZE_MASK = 0x03;
// ==================== Value header byte — basic_type ====================
diff --git a/core/src/main/java/dev/hardwood/internal/variant/VariantMetadata.java b/core/src/main/java/dev/hardwood/internal/variant/VariantMetadata.java
index 544b5748..6cddb063 100644
--- a/core/src/main/java/dev/hardwood/internal/variant/VariantMetadata.java
+++ b/core/src/main/java/dev/hardwood/internal/variant/VariantMetadata.java
@@ -14,7 +14,7 @@
/// field id or by name.
///
/// Layout:
-/// - 1-byte header: version (bit 0-3) | sorted_strings (bit 4) | offset_size_minus_one (bit 5-6)
+/// - 1-byte header: version (bit 0-3) | sorted_strings (bit 4) | offset_size_minus_one (bit 6-7)
/// - `offset_size` bytes: dictionary_size (unsigned LE)
/// - `(dictionary_size + 1) * offset_size` bytes: offsets into the strings section (unsigned LE)
/// - `offset_size` bytes of the last offset define the total byte length
@@ -48,6 +48,11 @@ public VariantMetadata(byte[] buf) {
throw new IllegalArgumentException("Variant metadata buffer truncated before dictionary_size");
}
this.dictionarySize = VariantBinary.readUnsignedLE(buf, headerEnd, offsetSize);
+ // Reject an out-of-range dictionary_size before it feeds later arithmetic.
+ if (dictionarySize < 0) {
+ throw new IllegalArgumentException(
+ "Variant metadata dictionary_size is not a valid unsigned int: " + dictionarySize);
+ }
this.offsetsStart = headerEnd + offsetSize;
int offsetsLen = (dictionarySize + 1) * offsetSize;
if (buf.length < offsetsStart + offsetsLen) {
diff --git a/core/src/test/java/dev/hardwood/VariantLogicalTypeTest.java b/core/src/test/java/dev/hardwood/VariantLogicalTypeTest.java
index f757f10c..c582ae9e 100644
--- a/core/src/test/java/dev/hardwood/VariantLogicalTypeTest.java
+++ b/core/src/test/java/dev/hardwood/VariantLogicalTypeTest.java
@@ -17,11 +17,13 @@
import dev.hardwood.reader.ParquetFileReader;
import dev.hardwood.reader.RowReader;
import dev.hardwood.row.PqVariant;
+import dev.hardwood.row.PqVariantObject;
import dev.hardwood.row.VariantType;
import dev.hardwood.schema.FileSchema;
import dev.hardwood.schema.SchemaNode;
import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
/// End-to-end recognition of the Parquet `VARIANT` logical-type annotation.
/// Reads the generated `variant_test.parquet` fixture and asserts that:
@@ -36,6 +38,10 @@
class VariantLogicalTypeTest {
private static final Path FILE = Paths.get("src/test/resources/variant_test.parquet");
+ private static final Path FILE_OFFSET_SIZE2 =
+ Paths.get("src/test/resources/variant_metadata_offset_size2.parquet");
+ private static final Path FILE_BAD_METADATA =
+ Paths.get("src/test/resources/variant_negative_dict_size.parquet");
@Test
void schemaReportsVariantLogicalType() throws IOException {
@@ -82,4 +88,41 @@ void rowReaderSurfacesVariantBytes() throws IOException {
assertThat(rowReader.hasNext()).isFalse();
}
}
+
+ /// Reads a real fixture whose Variant metadata dictionary is large enough to
+ /// require `offset_size = 2`, then resolves the object's two fields through
+ /// the VARIANT row API.
+ @Test
+ void readsVariantWhoseMetadataUsesOffsetSizeTwo() throws IOException {
+ String k0 = "a".repeat(160);
+ String k1 = "b".repeat(160);
+ try (ParquetFileReader fileReader = ParquetFileReader.open(InputFile.of(FILE_OFFSET_SIZE2));
+ RowReader rowReader = fileReader.rowReader()) {
+ rowReader.next();
+ PqVariant v = rowReader.getVariant("var");
+ assertThat(v).isNotNull();
+ assertThat(v.type()).isEqualTo(VariantType.OBJECT);
+
+ PqVariantObject obj = v.asObject();
+ assertThat(obj.getFieldCount()).isEqualTo(2);
+ assertThat(obj.getInt(k0)).isEqualTo(5);
+ assertThat(obj.getBoolean(k1)).isTrue();
+
+ assertThat(rowReader.hasNext()).isFalse();
+ }
+ }
+
+ /// A malformed-variant decode error raised while reading is enriched with the
+ /// originating file name, so it is attributable like other read-path errors.
+ @Test
+ void malformedVariantMetadataErrorCarriesFileName() throws IOException {
+ try (ParquetFileReader fileReader = ParquetFileReader.open(InputFile.of(FILE_BAD_METADATA));
+ RowReader rowReader = fileReader.rowReader()) {
+ rowReader.next();
+ assertThatThrownBy(() -> rowReader.getVariant("var"))
+ .isInstanceOf(IllegalArgumentException.class)
+ .hasMessage("[variant_negative_dict_size.parquet] "
+ + "Variant metadata dictionary_size is not a valid unsigned int: -1");
+ }
+ }
}
diff --git a/core/src/test/java/dev/hardwood/internal/variant/VariantMetadataTest.java b/core/src/test/java/dev/hardwood/internal/variant/VariantMetadataTest.java
index 946804c1..dac14b46 100644
--- a/core/src/test/java/dev/hardwood/internal/variant/VariantMetadataTest.java
+++ b/core/src/test/java/dev/hardwood/internal/variant/VariantMetadataTest.java
@@ -59,6 +59,16 @@ void objectPrimitiveDictionary() throws IOException {
assertThat(metadata.findField("missing_field")).isEqualTo(-1);
}
+ @Test
+ void negativeDictionarySizeRejected() {
+ // Header 0xC1: version=1 (bits 0-3), offset_size=4 (bits 6-7 = 11).
+ // The 4-byte dictionary_size 0xFFFFFFFF reads back as a negative int.
+ byte[] bytes = { (byte) 0xC1, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF };
+ assertThatThrownBy(() -> new VariantMetadata(bytes))
+ .isInstanceOf(IllegalArgumentException.class)
+ .hasMessage("Variant metadata dictionary_size is not a valid unsigned int: -1");
+ }
+
@Test
void truncatedBufferRejected() {
byte[] bytes = { 0x01 }; // header only, no dictionary size bytes
diff --git a/core/src/test/resources/variant_metadata_offset_size2.parquet b/core/src/test/resources/variant_metadata_offset_size2.parquet
new file mode 100644
index 00000000..b27433d8
Binary files /dev/null and b/core/src/test/resources/variant_metadata_offset_size2.parquet differ
diff --git a/core/src/test/resources/variant_negative_dict_size.parquet b/core/src/test/resources/variant_negative_dict_size.parquet
new file mode 100644
index 00000000..bda664b3
Binary files /dev/null and b/core/src/test/resources/variant_negative_dict_size.parquet differ
diff --git a/tools/simple-datagen.py b/tools/simple-datagen.py
index 5f136dd6..97772395 100644
--- a/tools/simple-datagen.py
+++ b/tools/simple-datagen.py
@@ -2431,6 +2431,104 @@ def _schema_elem(name, type_val=None, rep=None, num_children=None, ct=None, logi
print(" - 4 rows: BOOLEAN_TRUE, BOOLEAN_FALSE, INT32(42), short string 'hi'")
print(" - `var` is a VARIANT-annotated group of {metadata, value} binaries")
+# ============================================================================
+# Variant logical type — metadata with offset_size = 2
+# ============================================================================
+#
+# Regression fixture for the metadata offset-size parse: a Variant whose
+# dictionary string section exceeds 255 bytes, so the writer must use
+# offset_size = 2, encoded in bits 6-7 of the metadata header. Reading that
+# field from the wrong bits corrupts the whole dictionary. The `var` value is an
+# object with two long-named fields so the read path actually resolves the keys.
+
+def _v_le(value, width):
+ return bytes((value >> (8 * i)) & 0xFF for i in range(width))
+
+# Two 160-byte keys → 320-byte string section (> 255), forcing offset_size = 2.
+# Keys are lexicographically ordered to match the sorted-strings flag.
+_v_k0 = b'a' * 160
+_v_k1 = b'b' * 160
+
+# metadata header 0x51 = version 1 | sorted (bit 4) | offset_size_minus_one=1 (bits 6-7)
+_md_offsize2 = (
+ bytes([0x51])
+ + _v_le(2, 2) # dictionary_size = 2 (2-byte LE)
+ + _v_le(0, 2) + _v_le(len(_v_k0), 2) + _v_le(len(_v_k0) + len(_v_k1), 2) # offset table
+ + _v_k0 + _v_k1 # string data
+)
+
+# value: object { k0: int8(5), k1: boolean true }
+# header 0x02 (OBJECT, idSize=1, offsetSize=1, small); numElements=2;
+# field_ids=[0,1]; field_offsets=[0,2,3]; values int8(5)=[0x0C,0x05], true=[0x04]
+_val_object = bytes([
+ 0x02, # object header
+ 0x02, # numElements = 2
+ 0x00, 0x01, # field_ids: id0, id1
+ 0x00, 0x02, 0x03, # field_offsets: 0, 2, 3
+ 0x0C, 0x05, # field 0 value: INT8 = 5
+ 0x04, # field 1 value: BOOLEAN_TRUE
+])
+
+variant_offsize2_schema = pa.schema([
+ ('id', pa.int32(), False),
+ ('var', pa.struct([
+ pa.field('metadata', pa.binary(), False),
+ pa.field('value', pa.binary(), False),
+ ]), True),
+])
+
+variant_offsize2_table = pa.table({
+ 'id': [1],
+ 'var': [{'metadata': _md_offsize2, 'value': _val_object}],
+}, schema=variant_offsize2_schema)
+
+pq.write_table(
+ variant_offsize2_table,
+ 'core/src/test/resources/variant_metadata_offset_size2.parquet',
+ compression='NONE',
+ use_dictionary=False,
+ data_page_version='1.0',
+)
+annotate_group_as_variant('core/src/test/resources/variant_metadata_offset_size2.parquet', 'var')
+
+print("\nGenerated variant_metadata_offset_size2.parquet:")
+print(" - `var` is a VARIANT object whose metadata uses offset_size=2 (320-byte dict)")
+print(" - fields: ('a'*160)=INT8 5, ('b'*160)=BOOLEAN_TRUE")
+
+# ============================================================================
+# Variant logical type — malformed metadata (negative dictionary_size)
+# ============================================================================
+#
+# A VARIANT column whose `metadata` is deliberately corrupt: header 0xC1 selects
+# offset_size=4, and the 4-byte dictionary_size 0xFFFFFFFF reads back as a
+# negative int. Decoding is rejected when the row is read, exercising the
+# file-name enrichment on the variant read path.
+
+variant_bad_meta_schema = pa.schema([
+ ('id', pa.int32(), False),
+ ('var', pa.struct([
+ pa.field('metadata', pa.binary(), False),
+ pa.field('value', pa.binary(), False),
+ ]), True),
+])
+
+variant_bad_meta_table = pa.table({
+ 'id': [1],
+ 'var': [{'metadata': bytes([0xC1, 0xFF, 0xFF, 0xFF, 0xFF]), 'value': bytes([0x00])}],
+}, schema=variant_bad_meta_schema)
+
+pq.write_table(
+ variant_bad_meta_table,
+ 'core/src/test/resources/variant_negative_dict_size.parquet',
+ compression='NONE',
+ use_dictionary=False,
+ data_page_version='1.0',
+)
+annotate_group_as_variant('core/src/test/resources/variant_negative_dict_size.parquet', 'var')
+
+print("\nGenerated variant_negative_dict_size.parquet:")
+print(" - `var` metadata is corrupt (4-byte dictionary_size reads as negative)")
+
# ============================================================================
# Variant logical type — shredded (typed_value: int64)
# ============================================================================