From 649eebda2693e75d3be50bdf2b862e8febc5526a Mon Sep 17 00:00:00 2001
From: laskoviymishka
Date: Fri, 26 Jun 2026 11:21:45 +0200
Subject: [PATCH] Collation prototype: field annotation, schema round-trip, comparator

Proof of concept for collation support in Java, mirroring the iceberg-go
prototype and the spec change in apache/iceberg#16972.

- Types.NestedField carries an optional collation (String). StringType is
  a stateless singleton, so the collation lives on the field, not the
  type; it is only valid on string fields and is threaded through the
  builder, the asOptional/asRequired/withFieldId copies, and
  equals/hashCode/toString.
- SchemaParser (de)serializes the field's "collation" attribute.
- Comparators.charSequences(collation) returns a locale-aware comparator
  backed by java.text.Collator; a null or "utf8" collation yields the
  default UTF-8 byte-order comparator. A full implementation would use
  ICU for the complete modifier set, matching the spec's collation
  provider.

Out of scope for this POC (documented in the spec change): collation-aware
bounds in data_file (collation_bounds) and metrics-evaluator pruning.
---
 .../org/apache/iceberg/types/Comparators.java | 78 ++++++++++++++++
 .../java/org/apache/iceberg/types/Types.java  | 48 +++++++++-
 .../java/org/apache/iceberg/SchemaParser.java |  7 ++
 .../iceberg/TestCollationPrototype.java       | 92 +++++++++++++++++++
 4 files changed, 220 insertions(+), 5 deletions(-)
 create mode 100644 core/src/test/java/org/apache/iceberg/TestCollationPrototype.java

diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java
index ab59c895686d..63c31356cda7 100644
--- a/api/src/main/java/org/apache/iceberg/types/Comparators.java
+++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java
@@ -19,8 +19,10 @@
 package org.apache.iceberg.types;
 
 import java.nio.ByteBuffer;
+import java.text.Collator;
 import java.util.Comparator;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.function.IntFunction;
 import org.apache.iceberg.StructLike;
@@ -229,10 +231,86 @@ public static Comparator<CharSequence> charSequences() {
     return CharSeqComparator.INSTANCE;
   }
 
+  /**
+   * PROTOTYPE: returns a comparator for the given collation. A null or "utf8" collation yields the
+   * default UTF-8 byte-order comparator; otherwise comparison is locale-aware via {@link Collator}.
+   *
+   * <p>This POC uses the JDK {@link Collator}, which maps case/accent sensitivity onto collator
+   * strength approximately (e.g. accent-insensitive also ignores case). A full implementation would
+   * use ICU for the complete set of modifiers, matching the collation provider in the spec.
+   */
+  public static Comparator<CharSequence> charSequences(String collation) {
+    if (collation == null || collation.equalsIgnoreCase("utf8")) {
+      return CharSeqComparator.INSTANCE;
+    }
+
+    return new CollationComparator(collation);
+  }
+
   public static Comparator<CharSequence> filePath() {
     return FilePathComparator.INSTANCE;
   }
 
+  private static class CollationComparator implements Comparator<CharSequence> {
+    // Collator is not thread-safe; hand each thread its own configured instance.
+    private final ThreadLocal<Collator> collator;
+
+    private CollationComparator(String collation) {
+      Locale locale = localeFor(collation);
+      int strength = strengthFor(collation);
+      this.collator =
+          ThreadLocal.withInitial(
+              () -> {
+                Collator instance = Collator.getInstance(locale);
+                instance.setStrength(strength);
+                return instance;
+              });
+    }
+
+    @Override
+    public int compare(CharSequence left, CharSequence right) {
+      if (left == right) {
+        return 0;
+      }
+
+      return collator.get().compare(left.toString(), right.toString());
+    }
+
+    private static String name(String collation) {
+      int dot = collation.indexOf('.');
+      return dot < 0 ? collation : collation.substring(dot + 1);
+    }
+
+    private static Locale localeFor(String collation) {
+      String localePart = name(collation).split("-")[0];
+      if (localePart.isEmpty() || localePart.equalsIgnoreCase("utf8")) {
+        return Locale.ROOT;
+      }
+
+      return Locale.forLanguageTag(localePart.replace('_', '-'));
+    }
+
+    private static int strengthFor(String collation) {
+      boolean caseInsensitive = false;
+      boolean accentInsensitive = false;
+      for (String part : name(collation).split("-")) {
+        if (part.equalsIgnoreCase("ci")) {
+          caseInsensitive = true;
+        } else if (part.equalsIgnoreCase("ai")) {
+          accentInsensitive = true;
+        }
+      }
+
+      if (accentInsensitive) {
+        return Collator.PRIMARY; // ignores case and accents
+      } else if (caseInsensitive) {
+        return Collator.SECONDARY; // ignores case, keeps accents
+      }
+
+      return Collator.TERTIARY; // case- and accent-sensitive
+    }
+  }
+
   private static class NullsFirst<T> implements Comparator<T> {
     private static final NullsFirst<?> INSTANCE = new NullsFirst<>();
 
diff --git a/api/src/main/java/org/apache/iceberg/types/Types.java b/api/src/main/java/org/apache/iceberg/types/Types.java
index ec6076b04fa0..194ee1c336bd 100644
--- a/api/src/main/java/org/apache/iceberg/types/Types.java
+++ b/api/src/main/java/org/apache/iceberg/types/Types.java
@@ -761,6 +761,7 @@ public static class Builder {
       private String doc = null;
       private Literal<?> initialDefault = null;
       private Literal<?> writeDefault = null;
+      private String collation = null;
 
       private Builder() {}
 
@@ -777,6 +778,7 @@ private Builder(NestedField toCopy) {
         this.doc = toCopy.doc;
         this.initialDefault = toCopy.initialDefault;
         this.writeDefault = toCopy.writeDefault;
+        this.collation = toCopy.collation;
       }
 
       public Builder asRequired() {
@@ -844,10 +846,17 @@ public Builder withWriteDefault(Literal<?> fieldWriteDefault) {
         return this;
       }
 
+      /** PROTOTYPE: set the field's collation (only valid on string fields). */
+      public Builder withCollation(String fieldCollation) {
+        this.collation = fieldCollation;
+        return this;
+      }
+
       public NestedField build() {
         Preconditions.checkNotNull(id, "Id cannot be null");
         // the constructor validates the other fields
-        return new NestedField(isOptional, id, name, type, doc, initialDefault, writeDefault);
+        return new NestedField(
+            isOptional, id, name, type, doc, initialDefault, writeDefault, collation);
       }
     }
 
@@ -858,6 +867,10 @@ public NestedField build() {
     private final String doc;
     private final Literal<?> initialDefault;
     private final Literal<?> writeDefault;
+    // PROTOTYPE: optional collation for string fields. null means default UTF-8
+    // byte-order comparison. StringType is a stateless singleton, so the
+    // collation is carried on the field rather than the type.
+    private final String collation;
 
     private NestedField(
         boolean isOptional,
@@ -867,12 +880,28 @@ private NestedField(
         String doc,
         Literal<?> initialDefault,
         Literal<?> writeDefault) {
+      this(isOptional, id, name, type, doc, initialDefault, writeDefault, null);
+    }
+
+    private NestedField(
+        boolean isOptional,
+        int id,
+        String name,
+        Type type,
+        String doc,
+        Literal<?> initialDefault,
+        Literal<?> writeDefault,
+        String collation) {
       Preconditions.checkNotNull(name, "Name cannot be null");
       Preconditions.checkNotNull(type, "Type cannot be null");
       Preconditions.checkArgument(
           isOptional || !type.equals(UnknownType.get()),
           "Cannot create required field with unknown type: %s",
           name);
+      Preconditions.checkArgument(
+          collation == null || type.typeId() == Type.TypeID.STRING,
+          "Cannot set collation on non-string field: %s",
+          name);
       this.isOptional = isOptional;
       this.id = id;
       this.name = name;
@@ -880,6 +909,7 @@ private NestedField(
       this.doc = doc;
       this.initialDefault = castDefault(initialDefault, type);
       this.writeDefault = castDefault(writeDefault, type);
+      this.collation = collation;
     }
 
     private static Literal<?> castDefault(Literal<?> defaultValue, Type type) {
@@ -904,7 +934,7 @@ public NestedField asOptional() {
       if (isOptional) {
         return this;
       }
-      return new NestedField(true, id, name, type, doc, initialDefault, writeDefault);
+      return new NestedField(true, id, name, type, doc, initialDefault, writeDefault, collation);
     }
 
     public boolean isRequired() {
@@ -915,7 +945,7 @@ public NestedField asRequired() {
       if (!isOptional) {
         return this;
       }
-      return new NestedField(false, id, name, type, doc, initialDefault, writeDefault);
+      return new NestedField(false, id, name, type, doc, initialDefault, writeDefault, collation);
     }
 
     /**
@@ -923,7 +953,7 @@ public NestedField asRequired() {
      */
     @Deprecated
     public NestedField withFieldId(int newId) {
-      return new NestedField(isOptional, newId, name, type, doc, initialDefault, writeDefault);
+      return new NestedField(isOptional, newId, name, type, doc, initialDefault, writeDefault, collation);
     }
 
     public int fieldId() {
@@ -958,10 +988,16 @@ public Object writeDefault() {
       return writeDefault != null ? writeDefault.value() : null;
     }
 
+    /** PROTOTYPE: the field's collation, or null for default UTF-8 byte order. */
+    public String collation() {
+      return collation;
+    }
+
     @Override
     public String toString() {
       return String.format(
               Locale.ROOT, "%d: %s: %s %s", id, name, isOptional ? "optional" : "required", type)
+          + (collation != null ? " collate " + collation : "")
           + (doc != null ? " (" + doc + ")" : "");
     }
 
@@ -988,13 +1024,15 @@ public boolean equals(Object o) {
         return false;
       } else if (!Objects.equals(writeDefault, that.writeDefault)) {
         return false;
+      } else if (!Objects.equals(collation, that.collation)) {
+        return false;
       }
       return true;
     }
 
     @Override
     public int hashCode() {
-      return Objects.hash(NestedField.class, id, isOptional, name, type);
+      return Objects.hash(NestedField.class, id, isOptional, name, type, collation);
     }
   }
 
diff --git a/core/src/main/java/org/apache/iceberg/SchemaParser.java b/core/src/main/java/org/apache/iceberg/SchemaParser.java
index 7481af0284f6..225867969898 100644
--- a/core/src/main/java/org/apache/iceberg/SchemaParser.java
+++ b/core/src/main/java/org/apache/iceberg/SchemaParser.java
@@ -53,6 +53,7 @@ private SchemaParser() {}
   private static final String ID = "id";
   private static final String INITIAL_DEFAULT = "initial-default";
   private static final String WRITE_DEFAULT = "write-default";
+  private static final String COLLATION = "collation";
   private static final String ELEMENT_ID = "element-id";
   private static final String KEY_ID = "key-id";
   private static final String VALUE_ID = "value-id";
@@ -103,6 +104,10 @@ private static void toJson(
         SingleValueParser.toJson(field.type(), field.writeDefault(), generator);
       }
 
+      if (field.collation() != null) {
+        generator.writeStringField(COLLATION, field.collation());
+      }
+
       generator.writeEndObject();
     }
     generator.writeEndArray();
@@ -239,6 +244,7 @@ private static Types.StructType structFromJson(JsonNode json) {
 
       String doc = JsonUtil.getStringOrNull(DOC, field);
       boolean isRequired = JsonUtil.getBool(REQUIRED, field);
+      String collation = JsonUtil.getStringOrNull(COLLATION, field);
       fields.add(
           fieldBuilder(isRequired, name)
               .withId(id)
@@ -246,6 +252,7 @@ private static Types.StructType structFromJson(JsonNode json) {
               .withDoc(doc)
               .withInitialDefault(initialDefault)
               .withWriteDefault(writeDefault)
+              .withCollation(collation)
               .build());
     }
 
diff --git a/core/src/test/java/org/apache/iceberg/TestCollationPrototype.java b/core/src/test/java/org/apache/iceberg/TestCollationPrototype.java
new file mode 100644
index 000000000000..c36e40bed23c
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/TestCollationPrototype.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.Comparator;
+import org.apache.iceberg.types.Comparators;
+import org.apache.iceberg.types.Types;
+import org.junit.jupiter.api.Test;
+
+/** PROTOTYPE: collation support — field annotation, schema round-trip, and comparison. */
+public class TestCollationPrototype {
+
+  private static Types.NestedField collatedField(int id, String name, String collation) {
+    return Types.NestedField.optional(name)
+        .withId(id)
+        .ofType(Types.StringType.get())
+        .withCollation(collation)
+        .build();
+  }
+
+  @Test
+  public void collationRoundTripsThroughSchemaJson() {
+    Schema schema =
+        new Schema(
+            Types.NestedField.required(1, "id", Types.LongType.get()),
+            collatedField(2, "name", "icu.en_US-ci"));
+
+    String json = SchemaParser.toJson(schema);
+    assertThat(json).contains("\"collation\":\"icu.en_US-ci\"");
+
+    Schema roundTripped = SchemaParser.fromJson(json);
+    assertThat(roundTripped.findField("name").collation()).isEqualTo("icu.en_US-ci");
+  }
+
+  @Test
+  public void plainStringHasNoCollation() {
+    Schema schema = new Schema(Types.NestedField.optional(1, "name", Types.StringType.get()));
+
+    assertThat(SchemaParser.toJson(schema)).doesNotContain("collation");
+    assertThat(SchemaParser.fromJson(SchemaParser.toJson(schema)).findField("name").collation())
+        .isNull();
+  }
+
+  @Test
+  public void collationOnNonStringFieldIsRejected() {
+    assertThatThrownBy(
+            () ->
+                Types.NestedField.optional("n")
+                    .withId(1)
+                    .ofType(Types.LongType.get())
+                    .withCollation("icu.en_US-ci")
+                    .build())
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Cannot set collation on non-string field");
+  }
+
+  @Test
+  public void caseInsensitiveComparatorTreatsCaseAsEqual() {
+    Comparator<CharSequence> ci = Comparators.charSequences("icu.en_US-ci");
+    assertThat(ci.compare("APPLE", "apple")).isEqualTo(0);
+    assertThat(ci.compare("apple", "banana")).isNotEqualTo(0);
+
+    // The default (binary) comparator distinguishes case.
+    assertThat(Comparators.charSequences(null).compare("APPLE", "apple")).isNotEqualTo(0);
+    assertThat(Comparators.charSequences("utf8").compare("APPLE", "apple")).isNotEqualTo(0);
+  }
+
+  @Test
+  public void accentInsensitiveComparatorIgnoresDiacritics() {
+    Comparator<CharSequence> ai = Comparators.charSequences("icu.en_US-ai");
+    assertThat(ai.compare("résumé", "resume")).isEqualTo(0);
+  }
+}