Flink: Fix duplicate data in Flink's upsert writer for format V2 #10526

Open · wants to merge 8 commits into base: main
Changes from 1 commit
fix comments
zhongqishang committed Jun 21, 2024
commit 60d183d857f758a5547db8c1d2f2c1738ad8d7d4
IcebergFilesCommitter.java
@@ -212,15 +212,7 @@ public void snapshotState(StateSnapshotContext context) throws Exception {

// Update the checkpoint state.
long startNano = System.nanoTime();
-    if (writeResultsSinceLastSnapshot.isEmpty()) {
-      dataFilesPerCheckpoint.put(checkpointId, EMPTY_MANIFEST_DATA);
-    } else {
-      for (Map.Entry<Long, List<WriteResult>> writeResultsOfCkpt :
-          writeResultsSinceLastSnapshot.entrySet()) {
-        dataFilesPerCheckpoint.put(
-            writeResultsOfCkpt.getKey(), writeToManifest(writeResultsOfCkpt.getKey()));
-      }
-    }
+    writeToManifestSinceLastSnapshot(checkpointId);

// Reset the snapshot state to the latest state.
checkpointsState.clear();
@@ -229,8 +221,6 @@ public void snapshotState(StateSnapshotContext context) throws Exception {
jobIdState.clear();
jobIdState.add(flinkJobId);

-    // Clear the local buffer for current checkpoint.
-    writeResultsSinceLastSnapshot.clear();
committerMetrics.checkpointDuration(
TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano));
}
@@ -446,13 +436,26 @@ public void processElement(StreamRecord<FlinkWriteResult> element) {
@Override
public void endInput() throws IOException {
// Flush the buffered data files into 'dataFilesPerCheckpoint' firstly.
-    long currentCheckpointId = Long.MAX_VALUE;
-    dataFilesPerCheckpoint.put(currentCheckpointId, writeToManifest(currentCheckpointId));
-    writeResultsSinceLastSnapshot.clear();
+    long currentCheckpointId = IcebergStreamWriter.END_INPUT_CHECKPOINT_ID;
+    writeToManifestSinceLastSnapshot(currentCheckpointId);
    commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, currentCheckpointId);
}

+  private void writeToManifestSinceLastSnapshot(long checkpointId) throws IOException {
+    if (writeResultsSinceLastSnapshot.isEmpty()) {
+      dataFilesPerCheckpoint.put(checkpointId, EMPTY_MANIFEST_DATA);
+    } else {
+      for (Map.Entry<Long, List<WriteResult>> writeResultsOfCkpt :
+          writeResultsSinceLastSnapshot.entrySet()) {
+        dataFilesPerCheckpoint.put(
+            writeResultsOfCkpt.getKey(), writeToManifest(writeResultsOfCkpt.getKey()));
+      }
+    }
+
+    // Clear the local buffer for current checkpoint.
+    writeResultsSinceLastSnapshot.clear();
+  }

/**
* Write all the complete data files to a newly created manifest file and return the manifest's
* avro serialized bytes.
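For orientation: writeToManifestSinceLastSnapshot above iterates writeResultsSinceLastSnapshot, which maps each checkpoint id to the write results received for it. A minimal sketch of that bookkeeping, assuming a buffering helper along the lines of what processElement does (the helper and its exact shape are assumptions for illustration, not code from this PR):

import java.util.List;
import java.util.Map;
import org.apache.iceberg.io.WriteResult;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;

class BufferingSketch {
  // checkpoint id -> write results received for that checkpoint since the last snapshot
  private final Map<Long, List<WriteResult>> writeResultsSinceLastSnapshot = Maps.newHashMap();

  // Assumed buffering step: results are kept per checkpoint so that snapshotState() and
  // endInput() can later write one manifest per checkpoint instead of merging them.
  void buffer(long checkpointId, WriteResult result) {
    writeResultsSinceLastSnapshot
        .computeIfAbsent(checkpointId, unused -> Lists.newArrayList())
        .add(result);
  }
}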
IcebergStreamWriter.java
@@ -33,6 +33,7 @@ class IcebergStreamWriter<T> extends AbstractStreamOperator<FlinkWriteResult>
implements OneInputStreamOperator<T, FlinkWriteResult>, BoundedOneInput {

private static final long serialVersionUID = 1L;
+  static final long END_INPUT_CHECKPOINT_ID = Long.MAX_VALUE;

private final String fullTableName;
private final TaskWriterFactory<T> taskWriterFactory;
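A quick sketch of why Long.MAX_VALUE is a safe end-of-input sentinel, assuming the committer keeps dataFilesPerCheckpoint in a NavigableMap and that commitUpToCheckpoint selects pending entries with an inclusive head map (both are assumptions about surrounding code not shown in this diff):

import java.util.NavigableMap;
import java.util.SortedMap;
import java.util.TreeMap;

public class EndInputSentinelSketch {
  static final long END_INPUT_CHECKPOINT_ID = Long.MAX_VALUE;

  public static void main(String[] args) {
    // Pending per-checkpoint manifests buffered by the committer (contents are placeholders).
    NavigableMap<Long, byte[]> dataFilesPerCheckpoint = new TreeMap<>();
    dataFilesPerCheckpoint.put(1L, new byte[0]);
    dataFilesPerCheckpoint.put(2L, new byte[0]);

    // Because the sentinel is larger than any real checkpoint id, the inclusive head map
    // covers every pending entry, so endInput() flushes and commits all buffered results.
    SortedMap<Long, byte[]> pending =
        dataFilesPerCheckpoint.headMap(END_INPUT_CHECKPOINT_ID, true);
    System.out.println(pending.keySet()); // [1, 2]
  }
}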
SimpleDataUtil.java
@@ -66,7 +66,6 @@
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
-import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.StructLikeSet;
@@ -83,13 +82,6 @@ private SimpleDataUtil() {}
Types.NestedField.optional(1, "id", Types.IntegerType.get()),
Types.NestedField.optional(2, "data", Types.StringType.get()));

-  public static final Schema SCHEMA_WITH_PRIMARY_KEY =
-      new Schema(
-          Lists.newArrayList(
-              Types.NestedField.required(1, "id", Types.IntegerType.get()),
-              Types.NestedField.optional(2, "data", Types.StringType.get())),
-          Sets.newHashSet(1));

public static final TableSchema FLINK_SCHEMA =
TableSchema.builder().field("id", DataTypes.INT()).field("data", DataTypes.STRING()).build();

@@ -318,10 +310,6 @@ public static void assertTableRecords(Table table, List<Record> expected, String branch) {
StructLikeSet actualSet = StructLikeSet.create(type);

for (Record record : iterable) {
-      if (!table.schema().identifierFieldNames().isEmpty()) {
-        Assert.assertFalse("Should not have the identical record", actualSet.contains(record));
-      }

actualSet.add(record);
}

TestIcebergFilesCommitter.java
@@ -63,7 +63,6 @@
import org.apache.iceberg.Parameters;
import org.apache.iceberg.PartitionData;
import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.TestBase;
import org.apache.iceberg.flink.FlinkSchemaUtil;
@@ -93,21 +92,15 @@ public class TestIcebergFilesCommitter extends TestBase {
@Parameter(index = 2)
private String branch;

-  @Parameter(index = 3)
-  private boolean hasPrimaryKey;
-
-  @Parameters(name = "formatVersion = {0}, fileFormat = {1}, branch = {2}, hasPrimaryKey = {3}")
+  @Parameters(name = "formatVersion = {0}, fileFormat = {1}, branch = {2}")
  protected static List<Object> parameters() {
    return Arrays.asList(
-        new Object[] {1, FileFormat.AVRO, "main", false},
-        new Object[] {2, FileFormat.AVRO, "test-branch", false},
-        new Object[] {1, FileFormat.PARQUET, "main", false},
-        new Object[] {2, FileFormat.PARQUET, "test-branch", false},
-        new Object[] {1, FileFormat.ORC, "main", false},
-        new Object[] {2, FileFormat.ORC, "test-branch", false},
-        new Object[] {2, FileFormat.AVRO, "main", true},
-        new Object[] {2, FileFormat.PARQUET, "test-branch", true},
-        new Object[] {2, FileFormat.ORC, "main", true});
+        new Object[] {1, FileFormat.AVRO, "main"},
+        new Object[] {2, FileFormat.AVRO, "test-branch"},
+        new Object[] {1, FileFormat.PARQUET, "main"},
+        new Object[] {2, FileFormat.PARQUET, "test-branch"},
+        new Object[] {1, FileFormat.ORC, "main"},
+        new Object[] {2, FileFormat.ORC, "test-branch"});
  }

@Override
Expand All @@ -119,9 +112,8 @@ public void setupTable() throws IOException {
this.metadataDir = new File(tableDir, "metadata");
assertThat(tableDir.delete()).isTrue();

-    Schema schema = hasPrimaryKey ? SimpleDataUtil.SCHEMA_WITH_PRIMARY_KEY : SimpleDataUtil.SCHEMA;
    // Construct the iceberg table.
-    table = create(schema, PartitionSpec.unpartitioned());
+    table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned());

table
.updateProperties()
@@ -506,7 +498,6 @@ public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception {

@TestTemplate
public void testStartAnotherJobToWriteSameTable() throws Exception {
assumeThat(hasPrimaryKey).as("The test case only for non-primary table.").isEqualTo(false);

long checkpointId = 0;
long timestamp = 0;
@@ -922,19 +913,36 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception {
}

@TestTemplate
-  public void testCommitMultipleCheckpointsWithDuplicateData() throws Exception {
+  public void testCommitMultipleCheckpointsForV2Table() throws Exception {
    // The test case is designed to cover the following scenario: a V2 table with upsert
    // enabled, where the previous checkpoint did not complete normally, so the next snapshot
    // commits multiple checkpoints at once. That is, prepareSnapshotPreBarrier is triggered
    // twice, but snapshotState() is only triggered once. Rows with the same primary key are
    // written in both checkpoints, and each checkpoint produces a data file and an eq-delete
    // file. Because an equality delete only masks rows from earlier commits, folding both
    // checkpoints into a single commit would leave the first checkpoint's rows as duplicates.
Contributor:
Could we put this in the method javadoc, using something like this:

  /**
   * The testcase is to simulate upserting to an Iceberg V2 table, and facing the following
   * scenario:
   * <ul>
   *     <li>A specific row is updated
   *     <li>The prepareSnapshotPreBarrier triggered
   *     <li>Checkpoint failed for reasons outside of the Iceberg connector
   *     <li>The specific row is updated again in the second checkpoint as well
   *     <li>Second snapshot is triggered, and finished
   * </ul>
   * <p>Previously the files from the 2 snapshots were committed in a single Iceberg commit, as a
   * result duplicate rows were created in the table.
   * @throws Exception
   */

Contributor Author:
Thanks for your detailed javadoc.👍
Updated.
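The scenario discussed in this thread comes down to Iceberg's sequence-number rule: an equality delete only masks rows in data files with a strictly smaller sequence number. A hedged sketch of why per-checkpoint commits avoid the duplicates (not code from this PR; the Table handle and the data/delete files are assumed to exist):

import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.Table;

class PerCheckpointCommitSketch {
  // Sketch: commit each checkpoint's files separately so the eq-delete from
  // checkpoint 2 can mask the row written by checkpoint 1.
  static void commitTwoCheckpoints(
      Table table, DataFile dataFileCkpt1, DataFile dataFileCkpt2, DeleteFile eqDeleteCkpt2) {
    table.newRowDelta()
        .addRows(dataFileCkpt1)    // INSERT id=1 -> "aaa1"; gets sequence number N
        .commit();

    table.newRowDelta()
        .addRows(dataFileCkpt2)    // INSERT id=1 -> "aaa2"
        .addDeletes(eqDeleteCkpt2) // eq-delete on id=1; masks rows with sequence number < N+1
        .commit();                 // gets sequence number N+1

    // If both checkpoints were folded into one RowDelta, dataFileCkpt1 and eqDeleteCkpt2
    // would share a sequence number, the delete would not apply to that file, and id=1
    // would surface twice in scans.
  }
}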


+    assumeThat(formatVersion)
+        .as("Only support equality-delete in format v2 or later.")
+        .isGreaterThan(1);

assumeThat(hasPrimaryKey).as("The test case only for primary table.").isEqualTo(true);

long timestamp = 0;
long checkpoint = 10;

JobID jobId = new JobID();
OperatorID operatorId;
-      FileAppenderFactory<RowData> appenderFactory = createDeletableAppenderFactory();

+      FileAppenderFactory<RowData> appenderFactory =
+          new FlinkAppenderFactory(
+              table,
+              table.schema(),
+              FlinkSchemaUtil.convert(table.schema()),
+              table.properties(),
+              table.spec(),
+              new int[] {table.schema().findField("id").fieldId()},
+              table.schema(),
+              null);

try (OneInputStreamOperatorTestHarness<FlinkWriteResult, Void> harness =
createStreamSink(jobId)) {
@@ -944,9 +952,11 @@ public void testCommitMultipleCheckpointsWithDuplicateData() throws Exception {

assertMaxCommittedCheckpointId(jobId, operatorId, -1L);

RowData insert1 = SimpleDataUtil.createInsert(1, "aaa");
RowData insert2 = SimpleDataUtil.createInsert(2, "bbb");
RowData insert1 = null;
RowData insert2 = null;
for (int i = 1; i <= 3; i++) {
insert1 = SimpleDataUtil.createInsert(1, "aaa" + i);
insert2 = SimpleDataUtil.createInsert(2, "bbb" + i);
DataFile dataFile = writeDataFile("data-file-" + i, ImmutableList.of(insert1, insert2));
DeleteFile deleteFile =
writeEqDeleteFile(
@@ -958,10 +968,7 @@ public void testCommitMultipleCheckpointsWithDuplicateData() throws Exception {
++timestamp);
}

-      // The 1th snapshotState.
harness.snapshot(checkpoint, ++timestamp);

-      // Notify the 1th snapshot to complete.
harness.notifyOfCompletedCheckpoint(checkpoint);
SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert2), branch);
assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint);