devgrok/DeduplicationKeyValueTransformerIntegrationTest.java

## DeduplicationKeyValueTransformerIntegrationTest.java
/*
 * Copyright Confluent Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.confluent.examples.streams;

import org.apache.kafka.common.serialization.*;
import org.apache.kafka.common.serialization.Serdes.ByteArraySerde;
import org.apache.kafka.common.utils.Bytes;
import org.apache.kafka.streams.*;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.Materialized;
import org.apache.kafka.streams.kstream.Transformer;
import org.apache.kafka.streams.kstream.TransformerSupplier;
import org.apache.kafka.streams.kstream.internals.MaterializedInternal;
import org.apache.kafka.streams.processor.ProcessorContext;
import org.apache.kafka.streams.processor.PunctuationType;
import org.apache.kafka.streams.state.*;
import org.apache.kafka.test.TestUtils;
import org.junit.Test;

import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.MatcherAssert.assertThat;

/**
 * End-to-end integration test that demonstrates how to remove duplicate records from an input
 * stream.
 * <p>
 * Here, a stateful {@link Transformer} (from the Processor API)
 * detects and discards duplicate input records based on an "event id" that is embedded in each
 * input record.  This transformer is then included in a topology defined via the DSL.
 * <p>
 * In this simplified example, the values of input records represent the event ID by which
 * duplicates will be detected.  In practice, record values would typically be a more complex data
 * structure, with perhaps one of the fields being such an event ID.  De-duplication by an event ID
 * is but one example of how to perform de-duplication in general.  The code example below can be
 * adapted to other de-duplication approaches.
 * <p>
 * Note: This example uses lambda expressions and thus works with Java 8+ only.
 */
public class DeduplicationKeyValueTransformerIntegrationTest {

  private static final String storeName = "eventId-store";

  /**
   * Discards duplicate records from the input stream.
   * <p>
   * Whether a record is a duplicate is decided by a comparison function supplied at construction time; it is
   * called with the value currently stored for the record's key and the newly arrived value.
   * <p>
   * The transformer remembers the last forwarded value of each key in a timestamped state store. A wall-clock
   * punctuator runs every {@code CLEAR_INTERVAL_MILLIS} milliseconds and deletes entries whose stored timestamp is
   * more than {@code maintainDurationMs} behind the punctuation timestamp, which keeps the store from growing
   * indefinitely. Every duplicate refreshes the stored timestamp and thereby postpones expiry. Because entries are
   * eventually forgotten, the consumer must tolerate some duplicates; this processor's purpose is to reduce the
   * 'noise' caused by a lot of duplicate messages.
   * <p>
   * Records with a {@code null} value are forwarded unchanged and do not touch the store.
   * <p>
   * Note: This code is for demonstration purposes and was not tested for production usage.
   */
  private static class DedupeTransformer<K, V> implements Transformer<K, V, KeyValue<K, V>> {

    /** Interval, in milliseconds, between two runs of the expiry punctuator. */
    public static final int CLEAR_INTERVAL_MILLIS = 1000;

    /** Returns {@code true} when (stored value, new value) are to be treated as duplicates. */
    private final BiFunction<V, V, Boolean> valueComparator;

    /** Maximum age, in milliseconds, of a stored entry before the punctuator deletes it. */
    private final long maintainDurationMs;

    private ProcessorContext context;

    /**
     * Key: record key
     * Value: the last value forwarded for that key, together with the timestamp of the most recent record
     * (original or duplicate) that carried it
     */
    private TimestampedKeyValueStore<K, V> eventStore;

    /**
     * @param valueComparator    decides whether the stored value and a new value are duplicates
     * @param maintainDurationMs how long, in milliseconds, a stored value is kept before it expires
     */
    DedupeTransformer(final BiFunction<V, V, Boolean> valueComparator, final long maintainDurationMs) {
      this.valueComparator = valueComparator;
      this.maintainDurationMs = maintainDurationMs;
    }

    /**
     * Looks up the state store registered under {@code storeName} and schedules the periodic expiry run.
     */
    @Override
    @SuppressWarnings("unchecked")
    public void init(final ProcessorContext context) {
      this.context = context;
      eventStore = (TimestampedKeyValueStore<K, V>) context.getStateStore(storeName);
      this.context.schedule(
          Duration.ofMillis(CLEAR_INTERVAL_MILLIS),
          PunctuationType.WALL_CLOCK_TIME,
          this::clearExpired);
    }

    /**
     * Punctuator callback: scans the whole store and deletes every expired entry.
     * <p>
     * NOTE(review): the timestamp passed in comes from a wall-clock punctuation, whereas the stored timestamps
     * come from {@code context.timestamp()} of the processed records. The comparison is only meaningful when
     * both are on the same time scale -- confirm this holds for your input.
     *
     * @param punctuationTimeMs timestamp handed to the punctuator by the wall-clock schedule
     */
    private void clearExpired(final long punctuationTimeMs) {
      try (KeyValueIterator<K, ValueAndTimestamp<V>> iterator = eventStore.all()) {
        while (iterator.hasNext()) {
          final KeyValue<K, ValueAndTimestamp<V>> entry = iterator.next();
          if (hasExpired(entry.value.timestamp(), punctuationTimeMs)) {
            eventStore.delete(entry.key);
          }
        }
      }
    }

    /** An entry expires once it is strictly older than {@code maintainDurationMs}. */
    private boolean hasExpired(final long eventTimestamp, final long nowMs) {
      return (nowMs - eventTimestamp) > maintainDurationMs;
    }

    /**
     * Forwards the record unless its value duplicates the value stored for its key.
     *
     * @return the unchanged record, or {@code null} (nothing is forwarded) for a duplicate
     */
    @Override
    public KeyValue<K, V> transform(final K key, final V value) {
      if (value == null) {
        return KeyValue.pair(key, null);
      }
      if (isDuplicate(key, value)) {
        updateTimestampOfExistingEventToPreventExpiry(key, value, context.timestamp());
        return null;
      }
      rememberNewEvent(key, value, context.timestamp());
      return KeyValue.pair(key, value);
    }

    /**
     * A value is a duplicate when the store holds a value for the key and the comparator accepts the pair.
     */
    private boolean isDuplicate(final K key, final V value) {
      final ValueAndTimestamp<V> storedValue = eventStore.get(key);
      if (storedValue == null) {
        return false;
      }
      // Compare by value rather than by reference: a comparator may legally return a Boolean instance other
      // than the cached Boolean.TRUE, and a null result is treated as "not a duplicate".
      return Boolean.TRUE.equals(valueComparator.apply(storedValue.value(), value));
    }

    private void updateTimestampOfExistingEventToPreventExpiry(final K key, final V value, final long newTimestamp) {
      eventStore.put(key, ValueAndTimestamp.make(value, newTimestamp));
    }

    private void rememberNewEvent(final K key, final V value, final long timestamp) {
      eventStore.put(key, ValueAndTimestamp.make(value, timestamp));
    }

    @Override
    public void close() {
      // Note: The store should NOT be closed manually here via `eventStore.close()`!
      // The Kafka Streams API will automatically close stores when necessary.
    }

  }

  private class DedupeValueForKey<K, V> implements TransformerSupplier<K, V, KeyValue<K, V>> {
    private final MaterializedInternal<K, V, KeyValueStore<Bytes, byte[]>> materialized;
    private final BiFunction<V, V, Boolean> valueComparator;
    private long maintainDurationMs = TimeUnit.HOURS.toMillis(30);

    public DedupeValueForKey(BiFunction<V, V, Boolean> valueComparator, Materialized<K, V, KeyValueStore<Bytes, byte[]>> materialized) {
      this.materialized = new MaterializedInternal<>(materialized, null, null);
      // can't pass in name provider
      // new MaterializedInternal<>(materialized, builder, REDUCE_NAME);
      this.valueComparator = valueComparator;
    }

    @Override
    public Transformer<K, V, KeyValue<K, V>> get() {
      return new DedupeTransformer(
              valueComparator,
              maintainDurationMs
      );
    }


    // provide store(s) that will be added and connected to the associated transformer
    // the store name from the builder ("myTransformState") is used to access the store later via the ProcessorContext
    @Override
    public Set<StoreBuilder<?>> stores() {
      KeyValueBytesStoreSupplier supplier = Stores.persistentTimestampedKeyValueStore(storeName);
      StoreBuilder<TimestampedKeyValueStore<K, V>> builder = Stores.timestampedKeyValueStoreBuilder(
              supplier,
              materialized.keySerde(),
              materialized.valueSerde());

      return Collections.singleton(builder);
    }

    public void setMaintainDurationMs(long maintainDurationMs) {
      this.maintainDurationMs = maintainDurationMs;
    }
  }

  /**
   * Pipes runs of equal values for a single key through the topology and verifies that only the first record
   * of each run reaches the output topic.
   */
  @Test
  public void shouldRemoveDuplicatesFromTheInput() {
    final String firstId = UUID.randomUUID().toString(); // e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
    final String secondId = UUID.randomUUID().toString();
    final String thirdId = UUID.randomUUID().toString();
    final byte[] keyId = UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8);
    final List<String> inputValues = Arrays.asList(firstId, secondId, secondId, thirdId,
                                                   thirdId, thirdId);
    final List<KeyValue<byte[], String>> inputs = inputValues.stream()
            .map(v -> new KeyValue<>(keyId, v))
            .collect(Collectors.toList());
    final List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId);

    //
    // Step 1: Configure and start the processor topology.
    //
    final StreamsBuilder builder = new StreamsBuilder();

    final Properties streamsConfiguration = new Properties();
    streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
    streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy config");
    streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
    streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
    // Use a temporary directory for storing state, which will be automatically removed after the test.
    streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());

    // How long the last value of a key is "remembered".  During this time, any incoming record for that key
    // that carries an equal value will be dropped, thereby de-duplicating the input data.
    //
    // The actual value depends on your use case.  A shorter period frees state sooner, at the cost of letting
    // through duplicates that arrive further apart.
    final Duration retentionPeriod = Duration.ofMinutes(10);

    final String inputTopic = "inputTopic";
    final String outputTopic = "outputTopic";

    final KStream<byte[], String> stream = builder.stream(inputTopic);

    // In this example, we assume that the record value as-is represents a unique event ID by
    // which we can perform de-duplication.  If your records are different, adapt the comparison
    // function as needed.
    final DedupeValueForKey<byte[], String> transformerSupplier = new DedupeValueForKey<byte[], String>(
            (value1, value2) -> value1.equals(value2),
            Materialized.with(new ByteArraySerde(), new Serdes.StringSerde())
    );
    transformerSupplier.setMaintainDurationMs(retentionPeriod.toMillis());

    final KStream<byte[], String> deduplicated = stream.transform(transformerSupplier);
    deduplicated.to(outputTopic);

    try (final TopologyTestDriver topologyTestDriver = new TopologyTestDriver(builder.build(), streamsConfiguration)) {
      //
      // Step 2: Setup input and output topics.
      //
      final TestInputTopic<byte[], String> input = topologyTestDriver
              .createInputTopic(inputTopic,
                      new ByteArraySerializer(),
                      new StringSerializer());
      final TestOutputTopic<byte[], String> output = topologyTestDriver
              .createOutputTopic(outputTopic,
                      new ByteArrayDeserializer(),
                      new StringDeserializer());

      //
      // Step 3: Produce some input data to the input topic.
      //
      input.pipeKeyValueList(inputs);

      //
      // Step 4: Verify the application's output data.
      //
      assertThat(output.readValuesToList(), equalTo(expectedValues));
    }
  }

  /**
   * Verifies that a value is forwarded again once its stored entry has expired: after the wall clock is advanced
   * past the retention period (plus one punctuation interval), a record repeating the last value is no longer
   * treated as a duplicate.
   */
  @Test
  public void shouldForgetAfterExpiry() {
    final String firstId = UUID.randomUUID().toString(); // e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
    final String secondId = UUID.randomUUID().toString();
    final String thirdId = UUID.randomUUID().toString();
    final byte[] keyId = UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8);
    final List<String> inputValues = Arrays.asList(firstId, secondId, secondId, thirdId,
                                                   thirdId, thirdId);
    final List<KeyValue<byte[], String>> inputs = inputValues.stream()
            .map(v -> new KeyValue<>(keyId, v))
            .collect(Collectors.toList());
    // The trailing thirdId is the record sent after expiry in step 5.
    final List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId, thirdId);

    //
    // Step 1: Configure and start the processor topology.
    //
    final StreamsBuilder builder = new StreamsBuilder();

    final Properties streamsConfiguration = new Properties();
    streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
    streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy config");
    streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
    streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
    // Use a temporary directory for storing state, which will be automatically removed after the test.
    streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());

    // How long the last value of a key is "remembered".  During this time, any incoming record for that key
    // that carries an equal value will be dropped, thereby de-duplicating the input data.
    //
    // The actual value depends on your use case.  A shorter period frees state sooner, at the cost of letting
    // through duplicates that arrive further apart.
    final Duration retentionPeriod = Duration.ofMinutes(10);

    final String inputTopic = "inputTopic";
    final String outputTopic = "outputTopic";

    final KStream<byte[], String> stream = builder.stream(inputTopic);

    // In this example, we assume that the record value as-is represents a unique event ID by
    // which we can perform de-duplication.  If your records are different, adapt the comparison
    // function as needed.
    final DedupeValueForKey<byte[], String> transformerSupplier = new DedupeValueForKey<byte[], String>(
            (value1, value2) -> value1.equals(value2),
            Materialized.with(new ByteArraySerde(), new Serdes.StringSerde())
    );
    transformerSupplier.setMaintainDurationMs(retentionPeriod.toMillis());

    final KStream<byte[], String> deduplicated = stream.transform(transformerSupplier);
    deduplicated.to(outputTopic);

    try (final TopologyTestDriver topologyTestDriver = new TopologyTestDriver(builder.build(), streamsConfiguration)) {
      //
      // Step 2: Setup input and output topics.
      //
      final TestInputTopic<byte[], String> input = topologyTestDriver
              .createInputTopic(inputTopic,
                      new ByteArraySerializer(),
                      new StringSerializer());
      final TestOutputTopic<byte[], String> output = topologyTestDriver
              .createOutputTopic(outputTopic,
                      new ByteArrayDeserializer(),
                      new StringDeserializer());

      //
      // Step 3: Produce some input data to the input topic.
      //
      input.pipeKeyValueList(inputs);

      //
      // Step 4: Advance wall clock time beyond the retention period so that the punctuator purges the entry.
      //
      topologyTestDriver.advanceWallClockTime(
              retentionPeriod.plus(Duration.ofMillis(DedupeTransformer.CLEAR_INTERVAL_MILLIS)));

      //
      // Step 5: Send the last value once more; its stored entry has expired, so it must be forwarded.
      //
      input.pipeInput(keyId, thirdId);

      //
      // Step 6: Verify the application's output data.
      //
      assertThat(output.readValuesToList(), equalTo(expectedValues));
    }
  }
}

## DeduplicationValueTransformerIntegrationTest.java
/*
 * Copyright Confluent Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.confluent.examples.streams;

import org.apache.kafka.common.serialization.*;
import org.apache.kafka.common.serialization.Serdes.ByteArraySerde;
import org.apache.kafka.common.utils.Bytes;
import org.apache.kafka.streams.*;
import org.apache.kafka.streams.kstream.*;
import org.apache.kafka.streams.kstream.internals.MaterializedInternal;
import org.apache.kafka.streams.processor.ProcessorContext;
import org.apache.kafka.streams.processor.PunctuationType;
import org.apache.kafka.streams.state.*;
import org.apache.kafka.test.TestUtils;
import org.junit.Test;

import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.MatcherAssert.assertThat;

/**
 * End-to-end integration test that demonstrates how to remove duplicate records from an input
 * stream.
 * <p>
 * Here, a stateful {@link ValueTransformerWithKey} (from the Processor API)
 * detects and discards duplicate input records based on an "event id" that is embedded in each
 * input record.  This transformer is then included in a topology defined via the DSL.
 * <p>
 * In this simplified example, the values of input records represent the event ID by which
 * duplicates will be detected.  In practice, record values would typically be a more complex data
 * structure, with perhaps one of the fields being such an event ID.  De-duplication by an event ID
 * is but one example of how to perform de-duplication in general.  The code example below can be
 * adapted to other de-duplication approaches.
 * <p>
 * Note: This example uses lambda expressions and thus works with Java 8+ only.
 */
public class DeduplicationValueTransformerIntegrationTest {

  private static final String storeName = "eventId-store";

  /**
   * Discards duplicate records from the input stream. Uses a ValueTransformerWithKey instead of Transformer
   * to remove the need for re-partitioning in the KStreams DSL due to a possible key change.
   * <p>
   * Whether a record is a duplicate is decided by a comparison function supplied at construction time; it is
   * called with the value currently stored for the record's key and the newly arrived value.
   * <p>
   * The transformer remembers the last forwarded value of each key in a timestamped state store. A wall-clock
   * punctuator runs every {@code CLEAR_INTERVAL_MILLIS} milliseconds and deletes entries whose stored timestamp is
   * more than {@code maintainDurationMs} behind the punctuation timestamp, which keeps the store from growing
   * indefinitely. Every duplicate refreshes the stored timestamp and thereby postpones expiry. Because entries are
   * eventually forgotten, the consumer must tolerate some duplicates; this processor's purpose is to reduce the
   * 'noise' caused by a lot of duplicate messages.
   * <p>
   * Records with a {@code null} value, and records whose timestamp is older than the stored one, are dropped.
   * <p>
   * Note: This code is for demonstration purposes and was not tested for production usage.
   */
  private static class DedupeValueTransformer<K, V> implements ValueTransformerWithKey<K, V, Iterable<V>> {

    /** Interval, in milliseconds, between two runs of the expiry punctuator. */
    public static final int CLEAR_INTERVAL_MILLIS = 1000;

    /** Returns {@code true} when (stored value, new value) are to be treated as duplicates. */
    private final BiFunction<V, V, Boolean> valueComparator;

    /** Maximum age, in milliseconds, of a stored entry before the punctuator deletes it. */
    private final long maintainDurationMs;

    private ProcessorContext context;

    /**
     * Key: record key
     * Value: the last value forwarded for that key, together with the timestamp of the most recent record
     * (original or duplicate) that matched it
     */
    private TimestampedKeyValueStore<K, V> eventStore;

    /**
     * @param valueComparator    decides whether the stored value and a new value are duplicates
     * @param maintainDurationMs how long, in milliseconds, a stored value is kept before it expires
     */
    DedupeValueTransformer(final BiFunction<V, V, Boolean> valueComparator, final long maintainDurationMs) {
      this.valueComparator = valueComparator;
      this.maintainDurationMs = maintainDurationMs;
    }

    /**
     * Looks up the state store registered under {@code storeName} and schedules the periodic expiry run.
     */
    @Override
    @SuppressWarnings("unchecked")
    public void init(final ProcessorContext context) {
      this.context = context;
      eventStore = (TimestampedKeyValueStore<K, V>) context.getStateStore(storeName);
      this.context.schedule(
          Duration.ofMillis(CLEAR_INTERVAL_MILLIS),
          PunctuationType.WALL_CLOCK_TIME,
          this::clearExpired);
    }

    /**
     * Punctuator callback: scans the whole store and deletes every expired entry.
     * <p>
     * NOTE(review): the timestamp passed in comes from a wall-clock punctuation, whereas the stored timestamps
     * come from {@code context.timestamp()} of the processed records. The comparison is only meaningful when
     * both are on the same time scale -- confirm this holds for your input.
     *
     * @param punctuationTimeMs timestamp handed to the punctuator by the wall-clock schedule
     */
    private void clearExpired(final long punctuationTimeMs) {
      try (KeyValueIterator<K, ValueAndTimestamp<V>> iterator = eventStore.all()) {
        while (iterator.hasNext()) {
          final KeyValue<K, ValueAndTimestamp<V>> entry = iterator.next();
          if (hasExpired(entry.value.timestamp(), punctuationTimeMs)) {
            eventStore.delete(entry.key);
          }
        }
      }
    }

    /** An entry expires once it is strictly older than {@code maintainDurationMs}. */
    private boolean hasExpired(final long eventTimestamp, final long nowMs) {
      return (nowMs - eventTimestamp) > maintainDurationMs;
    }

    /**
     * Emits the value unless it must be suppressed.
     *
     * @return a single-element iterable holding {@code value} when it is forwarded; an empty iterable when the
     *         value is {@code null}, older than the stored entry, or a duplicate of the stored value
     */
    @Override
    public Iterable<V> transform(final K key, final V value) {
      if (value == null) {
        return Collections.emptyList();
      }

      final long recordTimestamp = context.timestamp();
      // Null for the first record of a key and again after the entry has expired; such a record can be neither
      // out of order nor a duplicate, so it is stored and forwarded.
      final ValueAndTimestamp<V> storedValue = eventStore.get(key);
      if (storedValue == null) {
        rememberNewEvent(key, value, recordTimestamp);
        return Collections.singleton(value);
      }

      if (recordTimestamp < storedValue.timestamp()) {
        // message is older than last stored, ignoring
        return Collections.emptyList();
      }

      if (isDuplicate(value, storedValue)) {
        // Keep the stored value and only refresh its timestamp.
        updateTimestampOfExistingEventToPreventExpiry(key, storedValue.value(), recordTimestamp);
        return Collections.emptyList();
      }

      rememberNewEvent(key, value, recordTimestamp);
      return Collections.singleton(value);
    }

    /**
     * Asks the comparator whether the new value duplicates the (non-null) stored entry.
     */
    private boolean isDuplicate(final V value, final ValueAndTimestamp<V> storedValue) {
      // Compare by value rather than by reference: a comparator may legally return a Boolean instance other
      // than the cached Boolean.TRUE, and a null result is treated as "not a duplicate".
      return Boolean.TRUE.equals(valueComparator.apply(storedValue.value(), value));
    }

    private void updateTimestampOfExistingEventToPreventExpiry(final K key, final V value, final long newTimestamp) {
      eventStore.put(key, ValueAndTimestamp.make(value, newTimestamp));
    }

    private void rememberNewEvent(final K key, final V value, final long timestamp) {
      eventStore.put(key, ValueAndTimestamp.make(value, timestamp));
    }

    @Override
    public void close() {
      // Note: The store should NOT be closed manually here via `eventStore.close()`!
      // The Kafka Streams API will automatically close stores when necessary.
    }

  }

  private class DedupeValueWithKey<K, V> implements ValueTransformerWithKeySupplier<K, V, Iterable<V>> {
    private final MaterializedInternal<K, V, KeyValueStore<Bytes, byte[]>> materialized;
    private final BiFunction<V, V, Boolean> valueComparator;
    private long maintainDurationMs = TimeUnit.HOURS.toMillis(30);

    public DedupeValueWithKey(BiFunction<V, V, Boolean> valueComparator, Materialized<K, V, KeyValueStore<Bytes, byte[]>> materialized) {
      this.materialized = new MaterializedInternal<>(materialized, null, null);
      // can't pass in name provider
      // new MaterializedInternal<>(materialized, builder, REDUCE_NAME);
      this.valueComparator = valueComparator;
    }

    @Override
    public ValueTransformerWithKey<K, V, Iterable<V>> get() {
      return new DedupeValueTransformer(
              valueComparator,
              maintainDurationMs
      );
    }


    // provide store(s) that will be added and connected to the associated transformer
    // the store name from the builder ("myTransformState") is used to access the store later via the ProcessorContext
    @Override
    public Set<StoreBuilder<?>> stores() {
      KeyValueBytesStoreSupplier supplier = Stores.persistentTimestampedKeyValueStore(storeName);
      StoreBuilder<TimestampedKeyValueStore<K, V>> builder = Stores.timestampedKeyValueStoreBuilder(
              supplier,
              materialized.keySerde(),
              materialized.valueSerde());

      return Collections.singleton(builder);
    }

    public void setMaintainDurationMs(long maintainDurationMs) {
      this.maintainDurationMs = maintainDurationMs;
    }
  }

  /**
   * Pipes runs of equal values for a single key through the topology and verifies that only the first record
   * of each run reaches the output topic.
   */
  @Test
  public void shouldRemoveDuplicatesFromTheInput() {
    final String firstId = UUID.randomUUID().toString(); // e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
    final String secondId = UUID.randomUUID().toString();
    final String thirdId = UUID.randomUUID().toString();
    final byte[] keyId = UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8);
    final List<String> inputValues = Arrays.asList(firstId, secondId, secondId, thirdId,
                                                   thirdId, thirdId);
    final List<KeyValue<byte[], String>> inputs = inputValues.stream()
            .map(v -> new KeyValue<>(keyId, v))
            .collect(Collectors.toList());
    final List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId);

    //
    // Step 1: Configure and start the processor topology.
    //
    final StreamsBuilder builder = new StreamsBuilder();

    final Properties streamsConfiguration = new Properties();
    streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
    streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy config");
    streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
    streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
    // Use a temporary directory for storing state, which will be automatically removed after the test.
    streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());

    // How long the last value of a key is "remembered".  During this time, any incoming record for that key
    // that carries an equal value will be dropped, thereby de-duplicating the input data.
    //
    // The actual value depends on your use case.  A shorter period frees state sooner, at the cost of letting
    // through duplicates that arrive further apart.
    final Duration retentionPeriod = Duration.ofMinutes(10);

    final String inputTopic = "inputTopic";
    final String outputTopic = "outputTopic";

    final KStream<byte[], String> stream = builder.stream(inputTopic);

    // In this example, we assume that the record value as-is represents a unique event ID by
    // which we can perform de-duplication.  If your records are different, adapt the comparison
    // function as needed.
    final DedupeValueWithKey<byte[], String> transformerSupplier = new DedupeValueWithKey<byte[], String>(
            (value1, value2) -> value1.equals(value2),
            Materialized.with(new ByteArraySerde(), new Serdes.StringSerde())
    );
    transformerSupplier.setMaintainDurationMs(retentionPeriod.toMillis());

    final KStream<byte[], String> deduplicated = stream.flatTransformValues(transformerSupplier);
    deduplicated.to(outputTopic);

    try (final TopologyTestDriver topologyTestDriver = new TopologyTestDriver(builder.build(), streamsConfiguration)) {
      //
      // Step 2: Setup input and output topics.
      //
      final TestInputTopic<byte[], String> input = topologyTestDriver
              .createInputTopic(inputTopic,
                      new ByteArraySerializer(),
                      new StringSerializer());
      final TestOutputTopic<byte[], String> output = topologyTestDriver
              .createOutputTopic(outputTopic,
                      new ByteArrayDeserializer(),
                      new StringDeserializer());

      //
      // Step 3: Produce some input data to the input topic.
      //
      input.pipeKeyValueList(inputs);

      //
      // Step 4: Verify the application's output data.
      //
      assertThat(output.readValuesToList(), equalTo(expectedValues));
    }
  }

  /**
   * Verifies that a value is forwarded again once its stored entry has expired: after the wall clock is advanced
   * past the retention period (plus one punctuation interval), a record repeating the last value is no longer
   * treated as a duplicate.
   */
  @Test
  public void shouldForgetAfterExpiry() {
    final String firstId = UUID.randomUUID().toString(); // e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
    final String secondId = UUID.randomUUID().toString();
    final String thirdId = UUID.randomUUID().toString();
    final byte[] keyId = UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8);
    final List<String> inputValues = Arrays.asList(firstId, secondId, secondId, thirdId,
                                                   thirdId, thirdId);
    final List<KeyValue<byte[], String>> inputs = inputValues.stream()
            .map(v -> new KeyValue<>(keyId, v))
            .collect(Collectors.toList());
    // The trailing thirdId is the record sent after expiry in step 5.
    final List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId, thirdId);

    //
    // Step 1: Configure and start the processor topology.
    //
    final StreamsBuilder builder = new StreamsBuilder();

    final Properties streamsConfiguration = new Properties();
    streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
    streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy config");
    streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
    streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
    // Use a temporary directory for storing state, which will be automatically removed after the test.
    streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());

    // How long the last value of a key is "remembered".  During this time, any incoming record for that key
    // that carries an equal value will be dropped, thereby de-duplicating the input data.
    //
    // The actual value depends on your use case.  A shorter period frees state sooner, at the cost of letting
    // through duplicates that arrive further apart.
    final Duration retentionPeriod = Duration.ofMinutes(10);

    final String inputTopic = "inputTopic";
    final String outputTopic = "outputTopic";

    final KStream<byte[], String> stream = builder.stream(inputTopic);

    // In this example, we assume that the record value as-is represents a unique event ID by
    // which we can perform de-duplication.  If your records are different, adapt the comparison
    // function as needed.
    final DedupeValueWithKey<byte[], String> transformerSupplier = new DedupeValueWithKey<byte[], String>(
            (value1, value2) -> value1.equals(value2),
            Materialized.with(new ByteArraySerde(), new Serdes.StringSerde())
    );
    transformerSupplier.setMaintainDurationMs(retentionPeriod.toMillis());

    final KStream<byte[], String> deduplicated = stream.flatTransformValues(transformerSupplier);
    deduplicated.to(outputTopic);

    try (final TopologyTestDriver topologyTestDriver = new TopologyTestDriver(builder.build(), streamsConfiguration)) {
      //
      // Step 2: Setup input and output topics.
      //
      final TestInputTopic<byte[], String> input = topologyTestDriver
              .createInputTopic(inputTopic,
                      new ByteArraySerializer(),
                      new StringSerializer());
      final TestOutputTopic<byte[], String> output = topologyTestDriver
              .createOutputTopic(outputTopic,
                      new ByteArrayDeserializer(),
                      new StringDeserializer());

      //
      // Step 3: Produce some input data to the input topic.
      //
      input.pipeKeyValueList(inputs);

      //
      // Step 4: Advance wall clock time beyond the retention period so that the punctuator purges the entry.
      //
      topologyTestDriver.advanceWallClockTime(
              retentionPeriod.plus(Duration.ofMillis(DedupeValueTransformer.CLEAR_INTERVAL_MILLIS)));

      //
      // Step 5: Send the last value once more; its stored entry has expired, so it must be forwarded.
      //
      input.pipeInput(keyId, thirdId);

      //
      // Step 6: Verify the application's output data.
      //
      assertThat(output.readValuesToList(), equalTo(expectedValues));
    }
  }
}

## EventDeduplicationLambdaIntegrationTest.java
/*
 * Copyright Confluent Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.confluent.examples.streams;

import org.apache.kafka.common.serialization.*;
import org.apache.kafka.common.serialization.Serdes.ByteArraySerde;
import org.apache.kafka.common.utils.Bytes;
import org.apache.kafka.streams.*;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.Materialized;
import org.apache.kafka.streams.kstream.Transformer;
import org.apache.kafka.streams.kstream.TransformerSupplier;
import org.apache.kafka.streams.kstream.internals.MaterializedInternal;
import org.apache.kafka.streams.processor.ProcessorContext;
import org.apache.kafka.streams.state.*;
import org.apache.kafka.test.TestUtils;
import org.junit.Test;

import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.*;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.MatcherAssert.assertThat;

/**
 * End-to-end integration test that demonstrates how to remove duplicate records from an input
 * stream.
 * <p>
 * Here, a stateful {@link org.apache.kafka.streams.kstream.Transformer} (from the Processor API)
 * detects and discards duplicate input records based on an "event id" that is embedded in each
 * input record.  This transformer is then included in a topology defined via the DSL.
 * <p>
 * In this simplified example, the values of input records represent the event ID by which
 * duplicates will be detected.  In practice, record values would typically be a more complex data
 * structure, with perhaps one of the fields being such an event ID.  De-duplication by an event ID
 * is but one example of how to perform de-duplication in general.  The code example below can be
 * adapted to other de-duplication approaches.
 * <p>
 * Note: This example uses lambda expressions and thus works with Java 8+ only.
 */
public class EventDeduplicationLambdaIntegrationTest {

  private static final String storeName = "eventId-store";

  /**
   * Discards duplicate records from the input stream.
   * <p>
   * A record counts as a duplicate when the supplied {@code valueComparator} reports that its
   * value matches the most recent value remembered for the same record key inside a time window
   * placed around the record's timestamp.  Remembered values are kept in the window state store
   * registered under {@code storeName}; that store's retention settings (configured where the
   * store is built) decide when old entries disappear.
   * <p>
   * Note: This code is for demonstration purposes and was not tested for production usage.
   *
   * @param <K> record key type
   * @param <V> record value type
   */
  private static class DedupeTransformer<K, V> implements Transformer<K, V, KeyValue<K, V>> {

    private ProcessorContext context;

    /**
     * Key: record key.
     * Value: the record value that was stored for that key, indexed by the timestamp passed to
     * {@code put}.
     */
    private WindowStore<K, V> eventStore;

    /** Part of the maintain duration that is searched before the current record's timestamp. */
    private final long leftDurationMs;
    /** Remainder of the maintain duration, searched after the current record's timestamp. */
    private final long rightDurationMs;

    private final BiFunction<V, V, Boolean> valueComparator;

    /**
     * @param valueComparator              compares the previously remembered value (first
     *                                     argument) with the current value (second argument);
     *                                     a {@code true} result marks the current record as a
     *                                     duplicate
     * @param maintainDurationPerEventInMs how long to "remember" a value, during the time of
     *                                     which any incoming duplicates will be dropped, thereby
     *                                     de-duplicating the input.
     * @throws IllegalArgumentException if {@code maintainDurationPerEventInMs} is less than 1
     * @throws NullPointerException     if {@code valueComparator} is {@code null}
     */
    DedupeTransformer(final BiFunction<V, V, Boolean> valueComparator, final long maintainDurationPerEventInMs) {
      if (maintainDurationPerEventInMs < 1) {
        throw new IllegalArgumentException("maintain duration per event must be >= 1");
      }
      // Split the duration into two halves; the right half receives the odd millisecond, if any.
      leftDurationMs = maintainDurationPerEventInMs / 2;
      rightDurationMs = maintainDurationPerEventInMs - leftDurationMs;
      this.valueComparator = Objects.requireNonNull(valueComparator, "valueComparator");
    }

    @Override
    @SuppressWarnings("unchecked")
    public void init(final ProcessorContext context) {
      this.context = context;
      // The store is looked up by name, so the cast cannot be checked at compile time.
      eventStore = (WindowStore<K, V>) context.getStateStore(storeName);
    }

    /**
     * Forwards the record unless it duplicates the remembered value for its key.
     *
     * @return the unchanged key/value pair for new values and for {@code null} values;
     *         {@code null} when the record is a duplicate
     */
    @Override
    public KeyValue<K, V> transform(final K key, final V value) {
      if (value == null) {
        // Nothing to compare against: null values are always passed through.
        return KeyValue.pair(key, null);
      }
      final long timestamp = context.timestamp();
      if (isDuplicate(key, value)) {
        updateTimestampOfExistingEventToPreventExpiry(key, value, timestamp);
        return null;
      }
      rememberNewEvent(key, value, timestamp);
      return KeyValue.pair(key, value);
    }

    /**
     * Checks the current value against the latest value stored for {@code key} in the window
     * {@code [timestamp - leftDurationMs, timestamp + rightDurationMs]}.
     *
     * @return {@code true} only if such a stored value exists and the comparator returns
     *         {@code true} for it
     */
    private boolean isDuplicate(final K key, final V value) {
      final long eventTime = context.timestamp();
      V previous = null;
      boolean found = false;
      // The iterator holds store resources, so it is closed via try-with-resources.
      try (WindowStoreIterator<V> timeIterator = eventStore.fetch(
              key,
              eventTime - leftDurationMs,
              eventTime + rightDurationMs)) {
        // Entries come back oldest first; walk to the end so the comparison uses the most
        // recent stored value rather than the oldest one still inside the window.
        while (timeIterator.hasNext()) {
          previous = timeIterator.next().value;
          found = true;
        }
      }
      // Boolean.TRUE.equals avoids an identity comparison and treats a null result as "false".
      return found && Boolean.TRUE.equals(valueComparator.apply(previous, value));
    }

    /** Stores the duplicate's value again under the newer timestamp. */
    private void updateTimestampOfExistingEventToPreventExpiry(final K key, final V value, final long newTimestamp) {
      eventStore.put(key, value, newTimestamp);
    }

    /** Stores a value that was just forwarded so later records can be compared against it. */
    private void rememberNewEvent(final K key, final V value, final long timestamp) {
      eventStore.put(key, value, timestamp);
    }

    @Override
    public void close() {
      // Note: The store should NOT be closed manually here via `eventStore.close()`!
      // The Kafka Streams API will automatically close stores when necessary.
    }

  }

  private class DedupeValueForKey<K, V> implements TransformerSupplier<K, V, KeyValue<K, V>> {
    private final MaterializedInternal<K, V, KeyValueStore<Bytes, byte[]>> materialized;
    private final BiFunction<V, V, Boolean> valueComparator;

    private final Duration retentionPeriod;

    public DedupeValueForKey(BiFunction<V, V, Boolean> valueComparator, Duration retentionPeriod, Materialized<K, V, KeyValueStore<Bytes, byte[]>> materialized) {

      this.materialized = new MaterializedInternal<>(materialized, null, null);
      // can't pass in name provider
      // new MaterializedInternal<>(materialized, builder, REDUCE_NAME);
      this.valueComparator = valueComparator;

      this.retentionPeriod = retentionPeriod;
    }

    @Override
    public Transformer<K, V, KeyValue<K, V>> get() {
      return new DedupeTransformer(
              valueComparator,
              retentionPeriod.toMillis()
      );
    }


    // provide store(s) that will be added and connected to the associated transformer
    @Override
    public Set<StoreBuilder<?>> stores() {
      WindowBytesStoreSupplier supplier = Stores.persistentWindowStore(storeName,
              retentionPeriod,
              retentionPeriod,
              false
      );
      StoreBuilder<WindowStore<K, V>> builder = Stores.windowStoreBuilder(
              supplier,
              materialized.keySerde(),
              materialized.valueSerde());

      return Collections.singleton(builder);
    }

  }

  /**
   * Pipes six records that share one key through the de-duplicating topology and expects each
   * distinct value to appear exactly once in the output, in input order.
   */
  @Test
  public void shouldRemoveDuplicatesFromTheInput() {
    final String firstId = UUID.randomUUID().toString(); // e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
    final String secondId = UUID.randomUUID().toString();
    final String thirdId = UUID.randomUUID().toString();
    final byte[] keyId = UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8);
    final List<String> inputValues = Arrays.asList(firstId, secondId, secondId, thirdId,
                                                   thirdId, thirdId);
    // Every record uses the same key so that consecutive values are compared with each other.
    final List<KeyValue<byte[], String>> inputs = inputValues.stream()
        .map(v -> new KeyValue<>(keyId, v))
        .collect(Collectors.toList());
    final List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId);

    //
    // Step 1: Configure and start the processor topology.
    //
    final StreamsBuilder builder = new StreamsBuilder();

    final Properties streamsConfiguration = new Properties();
    streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
    streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy config");
    streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
    streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
    // Use a temporary directory for storing state, which will be automatically removed after the test.
    streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());

    // How long we "remember" an event.  During this time, any incoming duplicates of the event
    // will be, well, dropped, thereby de-duplicating the input data.
    //
    // The actual value depends on your use case.  To reduce memory and disk usage, you could
    // decrease the size to purge old windows more frequently at the cost of potentially missing out
    // on de-duplicating late-arriving records.
    final Duration windowSize = Duration.ofMinutes(10);

    // retention period must be at least window size -- for this use case, we don't need a longer retention period
    // and thus just use the window size as retention time
    final Duration retentionPeriod = windowSize;

    final String inputTopic = "inputTopic";
    final String outputTopic = "outputTopic";

    final KStream<byte[], String> stream = builder.stream(inputTopic);
    final KStream<byte[], String> deduplicated = stream.transform(
        // In this example, we assume that the record value as-is represents a unique event ID by
        // which we can perform de-duplication.  If your records are different, adapt the comparison
        // function as needed.
        // Explicit type arguments keep the comparator typed as (String, String) instead of the
        // (Object, Object) a raw supplier would give it.
        new DedupeValueForKey<byte[], String>(
            (previousValue, currentValue) -> previousValue.equals(currentValue),
            retentionPeriod,
            Materialized.with(new ByteArraySerde(), new Serdes.StringSerde())
        )
    );
    deduplicated.to(outputTopic);

    try (final TopologyTestDriver topologyTestDriver = new TopologyTestDriver(builder.build(), streamsConfiguration)) {
      //
      // Step 2: Setup input and output topics.
      //
      final TestInputTopic<byte[], String> input = topologyTestDriver
              .createInputTopic(inputTopic,
                      new ByteArraySerializer(),
                      new StringSerializer());
      final TestOutputTopic<byte[], String> output = topologyTestDriver
              .createOutputTopic(outputTopic,
                      new ByteArrayDeserializer(),
                      new StringDeserializer());

      //
      // Step 3: Produce some input data to the input topic.
      //
      input.pipeKeyValueList(inputs);

      //
      // Step 4: Verify the application's output data.
      //
      assertThat(output.readValuesToList(), equalTo(expectedValues));
    }
  }
}
	/*
	* Copyright Confluent Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package io.confluent.examples.streams;

	import org.apache.kafka.common.serialization.*;
	import org.apache.kafka.common.serialization.Serdes.ByteArraySerde;
	import org.apache.kafka.common.utils.Bytes;
	import org.apache.kafka.streams.*;
	import org.apache.kafka.streams.kstream.KStream;
	import org.apache.kafka.streams.kstream.Materialized;
	import org.apache.kafka.streams.kstream.Transformer;
	import org.apache.kafka.streams.kstream.TransformerSupplier;
	import org.apache.kafka.streams.kstream.internals.MaterializedInternal;
	import org.apache.kafka.streams.processor.ProcessorContext;
	import org.apache.kafka.streams.processor.PunctuationType;
	import org.apache.kafka.streams.state.*;
	import org.apache.kafka.test.TestUtils;
	import org.junit.Test;

	import java.nio.charset.StandardCharsets;
	import java.time.Duration;
	import java.util.*;
	import java.util.concurrent.TimeUnit;
	import java.util.function.BiFunction;
	import java.util.stream.Collectors;

	import static org.hamcrest.CoreMatchers.equalTo;
	import static org.hamcrest.MatcherAssert.assertThat;

	/**
	* End-to-end integration test that demonstrates how to remove duplicate records from an input
	* stream.
	* <p>
	* Here, a stateful {@link Transformer} (from the Processor API)
	* detects and discards duplicate input records based on an "event id" that is embedded in each
	* input record. This transformer is then included in a topology defined via the DSL.
	* <p>
	* In this simplified example, the values of input records represent the event ID by which
	* duplicates will be detected. In practice, record values would typically be a more complex data
	* structure, with perhaps one of the fields being such an event ID. De-duplication by an event ID
	* is but one example of how to perform de-duplication in general. The code example below can be
	* adapted to other de-duplication approaches.
	* <p>
	* Note: This example uses lambda expressions and thus works with Java 8+ only.
	*/
	public class DeduplicationKeyValueTransformerIntegrationTest {

	private static final String storeName = "eventId-store";

	/**
	 * Discards duplicate records from the input stream.
	 * <p>
	 * Whether a record is a duplicate is decided by a comparison function passed in on creation,
	 * which compares the incoming value with the value last remembered for the same key.
	 * <p>
	 * The last value per key is kept in a timestamped key-value store.  A punctuator scheduled on
	 * wall-clock time periodically deletes entries whose stored timestamp is older than the
	 * maintain duration, which keeps the store from growing indefinitely.  After an entry has been
	 * deleted, the next record with the same value is forwarded again, so consumers must tolerate
	 * some duplicates.  This processor's purpose is to reduce the 'noise' caused by a lot of
	 * duplicate messages.
	 * <p>
	 * Note: This code is for demonstration purposes and was not tested for production usage.
	 *
	 * @param <K> record key type
	 * @param <V> record value type
	 */
	private static class DedupeTransformer<K, V> implements Transformer<K, V, KeyValue<K, V>> {

	  private ProcessorContext context;

	  /**
	   * Key: record key.
	   * Value: the last value stored for that key, paired with the timestamp of the record that
	   * stored or refreshed it.
	   */
	  private TimestampedKeyValueStore<K, V> eventStore;

	  /** Interval, in milliseconds, between two runs of the expiry punctuator. */
	  public static final int CLEAR_INTERVAL_MILLIS = 1000;
	  private final long maintainDurationMs;

	  private final BiFunction<V, V, Boolean> valueComparator;

	  /**
	   * @param valueComparator    compares the previously remembered value (first argument) with
	   *                           the current value (second argument); a {@code true} result marks
	   *                           the current record as a duplicate
	   * @param maintainDurationMs entries whose timestamp lags the punctuation time by more than
	   *                           this many milliseconds are deleted
	   */
	  DedupeTransformer(final BiFunction<V, V, Boolean> valueComparator, final long maintainDurationMs) {
	    this.valueComparator = valueComparator;
	    this.maintainDurationMs = maintainDurationMs;
	  }

	  @Override
	  @SuppressWarnings("unchecked")
	  public void init(final ProcessorContext context) {
	    this.context = context;
	    // The store is looked up by name, so the cast cannot be checked at compile time.
	    eventStore = (TimestampedKeyValueStore<K, V>) context.getStateStore(storeName);
	    this.context.schedule(Duration.ofMillis(CLEAR_INTERVAL_MILLIS), PunctuationType.WALL_CLOCK_TIME, this::clearExpired);
	  }

	  /**
	   * Punctuator callback: scans the whole store and deletes every expired entry.
	   *
	   * @param punctuationTimeMs timestamp handed to the punctuator; the schedule above uses
	   *                          wall-clock time
	   */
	  private void clearExpired(final long punctuationTimeMs) {
	    // NOTE(review): stored timestamps come from context.timestamp() while the punctuation
	    // time is wall-clock based; the expiry check assumes both are on the same scale - confirm.
	    try (KeyValueIterator<K, ValueAndTimestamp<V>> iterator = eventStore.all()) {
	      while (iterator.hasNext()) {
	        final KeyValue<K, ValueAndTimestamp<V>> entry = iterator.next();
	        if (hasExpired(entry.value.timestamp(), punctuationTimeMs)) {
	          eventStore.delete(entry.key);
	        }
	      }
	    }
	  }

	  /** Returns {@code true} when the entry is strictly older than the maintain duration. */
	  private boolean hasExpired(final long eventTimestamp, final long nowMs) {
	    return (nowMs - eventTimestamp) > maintainDurationMs;
	  }

	  /**
	   * Forwards the record unless it duplicates the remembered value for its key.
	   *
	   * @return the unchanged key/value pair for new values and for {@code null} values;
	   *         {@code null} when the record is a duplicate
	   */
	  @Override
	  public KeyValue<K, V> transform(final K key, final V value) {
	    if (value == null) {
	      // Nothing to compare against: null values are always passed through.
	      return KeyValue.pair(key, null);
	    }
	    final long timestamp = context.timestamp();
	    if (isDuplicate(key, value)) {
	      updateTimestampOfExistingEventToPreventExpiry(key, value, timestamp);
	      return null;
	    }
	    rememberNewEvent(key, value, timestamp);
	    return KeyValue.pair(key, value);
	  }

	  /**
	   * @return {@code true} only if a value is stored for {@code key} and the comparator returns
	   *         {@code true} for (stored value, current value)
	   */
	  private boolean isDuplicate(final K key, final V value) {
	    final ValueAndTimestamp<V> storedValue = eventStore.get(key);
	    if (storedValue == null) {
	      return false;
	    }
	    // Boolean.TRUE.equals avoids an identity comparison and treats a null result as "false".
	    return Boolean.TRUE.equals(valueComparator.apply(storedValue.value(), value));
	  }

	  /** Re-stores the duplicate's value with the newer timestamp so the entry expires later. */
	  private void updateTimestampOfExistingEventToPreventExpiry(final K key, final V value, final long newTimestamp) {
	    eventStore.put(key, ValueAndTimestamp.make(value, newTimestamp));
	  }

	  /** Stores a value that was just forwarded so later records can be compared against it. */
	  private void rememberNewEvent(final K key, final V value, final long timestamp) {
	    eventStore.put(key, ValueAndTimestamp.make(value, timestamp));
	  }

	  @Override
	  public void close() {
	    // Note: The store should NOT be closed manually here via `eventStore.close()`!
	    // The Kafka Streams API will automatically close stores when necessary.
	  }

	}

	private class DedupeValueForKey<K, V> implements TransformerSupplier<K, V, KeyValue<K, V>> {
	private final MaterializedInternal<K, V, KeyValueStore<Bytes, byte[]>> materialized;
	private final BiFunction<V, V, Boolean> valueComparator;
	private long maintainDurationMs = TimeUnit.HOURS.toMillis(30);

	public DedupeValueForKey(BiFunction<V, V, Boolean> valueComparator, Materialized<K, V, KeyValueStore<Bytes, byte[]>> materialized) {
	this.materialized = new MaterializedInternal<>(materialized, null, null);
	// can't pass in name provider
	// new MaterializedInternal<>(materialized, builder, REDUCE_NAME);
	this.valueComparator = valueComparator;
	}

	@Override
	public Transformer<K, V, KeyValue<K, V>> get() {
	return new DedupeTransformer(
	valueComparator,
	maintainDurationMs
	);
	}


	// provide store(s) that will be added and connected to the associated transformer
	// the store name from the builder ("myTransformState") is used to access the store later via the ProcessorContext
	@Override
	public Set<StoreBuilder<?>> stores() {
	KeyValueBytesStoreSupplier supplier = Stores.persistentTimestampedKeyValueStore(storeName);
	StoreBuilder<TimestampedKeyValueStore<K, V>> builder = Stores.timestampedKeyValueStoreBuilder(
	supplier,
	materialized.keySerde(),
	materialized.valueSerde());

	return Collections.singleton(builder);
	}

	public void setMaintainDurationMs(long maintainDurationMs) {
	this.maintainDurationMs = maintainDurationMs;
	}
	}

	/**
	 * Pipes six records that share one key through the de-duplicating topology and expects each
	 * distinct value to appear exactly once in the output, in input order.
	 */
	@Test
	public void shouldRemoveDuplicatesFromTheInput() {
	  final String firstId = UUID.randomUUID().toString(); // e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
	  final String secondId = UUID.randomUUID().toString();
	  final String thirdId = UUID.randomUUID().toString();
	  final byte[] keyId = UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8);
	  final List<String> inputValues = Arrays.asList(firstId, secondId, secondId, thirdId,
	      thirdId, thirdId);
	  // Every record uses the same key so that consecutive values are compared with each other.
	  final List<KeyValue<byte[], String>> inputs = inputValues.stream()
	      .map(v -> new KeyValue<>(keyId, v))
	      .collect(Collectors.toList());
	  final List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId);

	  //
	  // Step 1: Configure and start the processor topology.
	  //
	  final StreamsBuilder builder = new StreamsBuilder();

	  final Properties streamsConfiguration = new Properties();
	  streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
	  streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy config");
	  streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
	  streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
	  // Use a temporary directory for storing state, which will be automatically removed after the test.
	  streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());

	  // How long we "remember" an event. During this time, any incoming duplicates of the event
	  // will be, well, dropped, thereby de-duplicating the input data.
	  //
	  // The actual value depends on your use case. To reduce memory and disk usage, you could
	  // decrease the size to purge old windows more frequently at the cost of potentially missing out
	  // on de-duplicating late-arriving records.
	  final Duration windowSize = Duration.ofMinutes(10);

	  // retention period must be at least window size -- for this use case, we don't need a longer retention period
	  // and thus just use the window size as retention time
	  final Duration retentionPeriod = windowSize;

	  final String inputTopic = "inputTopic";
	  final String outputTopic = "outputTopic";

	  final KStream<byte[], String> stream = builder.stream(inputTopic);

	  // Explicit type arguments keep the comparator typed as (String, String) instead of the
	  // (Object, Object) a raw supplier would give it.
	  final DedupeValueForKey<byte[], String> transformerSupplier = new DedupeValueForKey<byte[], String>(
	      (previousValue, currentValue) -> previousValue.equals(currentValue),
	      Materialized.with(new ByteArraySerde(), new Serdes.StringSerde())
	  );
	  transformerSupplier.setMaintainDurationMs(retentionPeriod.toMillis());

	  final KStream<byte[], String> deduplicated = stream.transform(
	      // In this example, we assume that the record value as-is represents a unique event ID by
	      // which we can perform de-duplication. If your records are different, adapt the comparison
	      // function as needed
	      transformerSupplier
	  );
	  deduplicated.to(outputTopic);

	  try (final TopologyTestDriver topologyTestDriver = new TopologyTestDriver(builder.build(), streamsConfiguration)) {
	    //
	    // Step 2: Setup input and output topics.
	    //
	    final TestInputTopic<byte[], String> input = topologyTestDriver
	        .createInputTopic(inputTopic,
	            new ByteArraySerializer(),
	            new StringSerializer());
	    final TestOutputTopic<byte[], String> output = topologyTestDriver
	        .createOutputTopic(outputTopic,
	            new ByteArrayDeserializer(),
	            new StringDeserializer());

	    //
	    // Step 3: Produce some input data to the input topic.
	    //
	    input.pipeKeyValueList(inputs);

	    //
	    // Step 4: Verify the application's output data.
	    //
	    assertThat(output.readValuesToList(), equalTo(expectedValues));
	  }
	}

	/**
	 * Verifies that a remembered value is forgotten once the maintain duration has elapsed on the
	 * wall clock: after the clock is advanced, a record repeating the last value is forwarded
	 * again, so that value appears twice in the expected output.
	 */
	@Test
	public void shouldForgetAfterExpiry() {
	  final String firstId = UUID.randomUUID().toString(); // e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
	  final String secondId = UUID.randomUUID().toString();
	  final String thirdId = UUID.randomUUID().toString();
	  final byte[] keyId = UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8);
	  final List<String> inputValues = Arrays.asList(firstId, secondId, secondId, thirdId,
	      thirdId, thirdId);
	  // Every record uses the same key so that consecutive values are compared with each other.
	  final List<KeyValue<byte[], String>> inputs = inputValues.stream()
	      .map(v -> new KeyValue<>(keyId, v))
	      .collect(Collectors.toList());
	  // The trailing thirdId is the record sent in Step 5, after the stored value has expired.
	  final List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId, thirdId);

	  //
	  // Step 1: Configure and start the processor topology.
	  //
	  final StreamsBuilder builder = new StreamsBuilder();

	  final Properties streamsConfiguration = new Properties();
	  streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
	  streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy config");
	  streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
	  streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
	  // Use a temporary directory for storing state, which will be automatically removed after the test.
	  streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());

	  // How long we "remember" an event. During this time, any incoming duplicates of the event
	  // will be, well, dropped, thereby de-duplicating the input data.
	  //
	  // The actual value depends on your use case. To reduce memory and disk usage, you could
	  // decrease the size to purge old windows more frequently at the cost of potentially missing out
	  // on de-duplicating late-arriving records.
	  final Duration windowSize = Duration.ofMinutes(10);

	  // retention period must be at least window size -- for this use case, we don't need a longer retention period
	  // and thus just use the window size as retention time
	  final Duration retentionPeriod = windowSize;

	  final String inputTopic = "inputTopic";
	  final String outputTopic = "outputTopic";

	  final KStream<byte[], String> stream = builder.stream(inputTopic);

	  // Explicit type arguments keep the comparator typed as (String, String) instead of the
	  // (Object, Object) a raw supplier would give it.
	  final DedupeValueForKey<byte[], String> transformerSupplier = new DedupeValueForKey<byte[], String>(
	      (previousValue, currentValue) -> previousValue.equals(currentValue),
	      Materialized.with(new ByteArraySerde(), new Serdes.StringSerde())
	  );
	  transformerSupplier.setMaintainDurationMs(retentionPeriod.toMillis());

	  final KStream<byte[], String> deduplicated = stream.transform(
	      // In this example, we assume that the record value as-is represents a unique event ID by
	      // which we can perform de-duplication. If your records are different, adapt the comparison
	      // function as needed
	      transformerSupplier
	  );
	  deduplicated.to(outputTopic);

	  try (final TopologyTestDriver topologyTestDriver = new TopologyTestDriver(builder.build(), streamsConfiguration)) {
	    //
	    // Step 2: Setup input and output topics.
	    //
	    final TestInputTopic<byte[], String> input = topologyTestDriver
	        .createInputTopic(inputTopic,
	            new ByteArraySerializer(),
	            new StringSerializer());
	    final TestOutputTopic<byte[], String> output = topologyTestDriver
	        .createOutputTopic(outputTopic,
	            new ByteArrayDeserializer(),
	            new StringDeserializer());

	    //
	    // Step 3: Produce some input data to the input topic.
	    //
	    input.pipeKeyValueList(inputs);

	    //
	    // Step 4: Advance wall clock time
	    //
	    // Move past the maintain duration plus one punctuation interval so the expiry punctuator
	    // runs and sees the stored entry as too old.
	    // NOTE(review): this presumes the piped records' timestamps lie within one punctuation
	    // interval of the driver's initial wall-clock time - confirm against the test driver defaults.
	    topologyTestDriver.advanceWallClockTime(windowSize.plus(Duration.ofMillis(DedupeTransformer.CLEAR_INTERVAL_MILLIS)));

	    //
	    // Step 5: Send another
	    //
	    input.pipeInput(keyId, thirdId);

	    //
	    // Step 6: Verify the application's output data.
	    //
	    assertThat(output.readValuesToList(), equalTo(expectedValues));
	  }
	}
	}