gianm/Appenderator.java

## Appenderator.java
/*
 * Licensed to Metamarkets Group Inc. (Metamarkets) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Metamarkets licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package io.druid.segment.realtime.appenderator;

import com.google.common.util.concurrent.ListenableFuture;
import io.druid.data.input.Committer;
import io.druid.data.input.InputRow;
import io.druid.query.Query;
import io.druid.query.QueryRunner;
import io.druid.segment.incremental.IndexSizeExceededException;
import io.druid.segment.realtime.plumber.CommitterMaker;

import java.io.Closeable;
import java.util.List;

/**
 * Like a Plumber, but different.
 * <p>
 * An Appenderator manages indexed data. It has some in-memory data and some persisted-on-disk data. It can serve
 * queries on both of those. It can also push data to deep storage. But, it cannot publish segments to the metadata
 * store; you have to do that yourself!
 * <p>
 * Any time you call one of the methods that adds, persists, or pushes data, you must provide a Committer or a
 * CommitterMaker that represents all data you have given to the Appenderator so far. The Committer will be used when
 * that data has been persisted to disk.
 */
public interface Appenderator extends Closeable
{
  /**
   * Return the name of the dataSource associated with this Appenderator.
   *
   * @return the dataSource name
   */
  String getDataSource();

  /**
   * Perform any initial setup. Should be called before using any other methods.
   *
   * @return currently persisted commit metadata
   */
  Object startJob();

  /**
   * Add a row. Must not be called concurrently from multiple threads.
   * <p>
   * If no pending segment exists for the provided identifier, a new one will be created.
   * <p>
   * This method may trigger a {@link #persistAll(Committer)} using the provided CommitterMaker. If it does this, the
   * Committer is guaranteed to be <em>created</em> synchronously with the call to add, but will actually be used
   * asynchronously.
   * <p>
   * The add, clear, persistAll, pushAll, and push methods should all be called from the same thread to ensure that
   * commit metadata matches data that has actually been persisted to disk.
   *
   * @param identifier     the segment into which this row should be added
   * @param row            the row to add
   * @param committerMaker a committer maker associated with all data that has been added so far, including this row
   *
   * @return positive number indicating how many summarized rows exist in this segment so far
   *
   * @throws IndexSizeExceededException  if this row cannot be added because it is too large
   * @throws SegmentNotWritableException if the requested segment is known, but has been closed
   */
  int add(SegmentIdentifier identifier, InputRow row, CommitterMaker committerMaker)
      throws IndexSizeExceededException, SegmentNotWritableException;

  /**
   * Returns a list of all currently pending segments.
   *
   * @return identifiers of the pending segments
   */
  List<SegmentIdentifier> getSegments();

  /**
   * Returns the number of rows in a particular pending segment.
   *
   * @param identifier segment to examine
   *
   * @return row count
   *
   * @throws IllegalStateException if the segment is unknown
   */
  int getRowCount(SegmentIdentifier identifier);

  /**
   * Drop all in-memory and on-disk data, and forget any previously-remembered commit metadata. This could be useful if,
   * for some reason, rows have been added that we do not actually want to hand off. Blocks until all data has been
   * cleared. This may take some time, since all pending persists must finish first.
   *
   * @throws InterruptedException if the calling thread is interrupted while blocked in this call
   */
  void clear() throws InterruptedException;

  /**
   * Drop all data associated with a particular pending segment. Unlike {@link #clear()}, any on-disk commit
   * metadata will remain unchanged. If there is no pending segment with this identifier, then this method will
   * do nothing.
   * <p>
   * You should not write to the dropped segment after calling "drop". If you need to drop all your data and
   * re-write it, consider {@link #clear()} instead.
   *
   * @param identifier the pending segment to drop
   *
   * @return future that resolves when data is dropped
   */
  ListenableFuture<?> drop(SegmentIdentifier identifier);

  /**
   * Persist any in-memory indexed data to durable storage. This may be only somewhat durable, e.g. the
   * machine's local disk. The Committer will be made synchronously with the call to persistAll, but will actually
   * be used asynchronously. Any metadata returned by the committer will be associated with the data persisted to
   * disk.
   * <p>
   * The add, clear, persistAll, pushAll, and push methods should all be called from the same thread to ensure that
   * commit metadata matches data that has actually been persisted to disk.
   *
   * @param committer a committer associated with all data that has been added so far
   *
   * @return future that resolves when all pending data has been persisted, contains commit metadata for this persist
   */
  ListenableFuture<Object> persistAll(Committer committer);

  /**
   * Merge and push any pending data to deep storage. This will trigger an implicit {@link #persistAll(Committer)}
   * using the provided Committer.
   * <p>
   * After this method is called, you cannot add new data to any segments that were previously under construction.
   * <p>
   * The add, clear, persistAll, pushAll, and push methods should all be called from the same thread to ensure that
   * commit metadata matches data that has actually been persisted to disk.
   *
   * @param committer a committer associated with all data that has been added so far
   *
   * @return future that resolves when all segments have been pushed. The segment list will be the list of segments
   * that have been pushed and the commit metadata from the Committer.
   */
  ListenableFuture<SegmentsAndMetadata> pushAll(Committer committer);

  /**
   * Merge and push particular segments to deep storage. This will trigger an implicit {@link #persistAll(Committer)}
   * using the provided Committer.
   * <p>
   * After this method is called, you cannot add new data to any segments that were previously under construction.
   * <p>
   * The add, clear, persistAll, pushAll, and push methods should all be called from the same thread to ensure that
   * commit metadata matches data that has actually been persisted to disk.
   *
   * @param identifiers list of segments to push
   * @param committer   a committer associated with all data that has been added so far
   *
   * @return future that resolves when all segments have been pushed. The segment list will be the list of segments
   * that have been pushed and the commit metadata from the Committer.
   */
  ListenableFuture<SegmentsAndMetadata> push(List<SegmentIdentifier> identifiers, Committer committer);

  /**
   * Return a query runner for a particular query. The runner will read all pending data that has been added to
   * this Appenderator.
   *
   * @param query the query to be run
   * @param <T>   the query's result type
   *
   * @return a runner for the given query
   */
  <T> QueryRunner<T> getQueryRunner(Query<T> query);

  /**
   * Stop any currently-running processing and clean up after ourselves. This will not remove any on-disk persisted
   * data, but it will drop any data that has not yet been persisted.
   * <p>
   * Redeclares {@link Closeable#close()} without a {@code throws} clause, so callers holding an Appenderator
   * reference do not need to handle a checked exception when closing it.
   */
  @Override
  void close();
}
	/*
	* Licensed to Metamarkets Group Inc. (Metamarkets) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. Metamarkets licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package io.druid.segment.realtime.appenderator;

	import com.google.common.util.concurrent.ListenableFuture;
	import io.druid.data.input.Committer;
	import io.druid.data.input.InputRow;
	import io.druid.query.Query;
	import io.druid.query.QueryRunner;
	import io.druid.segment.incremental.IndexSizeExceededException;
	import io.druid.segment.realtime.plumber.CommitterMaker;

	import java.io.Closeable;
	import java.util.List;

	/**
	 * Like a Plumber, but different.
	 * <p>
	 * An Appenderator manages indexed data. It has some in-memory data and some persisted-on-disk data. It can serve
	 * queries on both of those. It can also push data to deep storage. But, it cannot publish segments to the metadata
	 * store; you have to do that yourself!
	 * <p>
	 * Any time you call one of the methods that adds, persists, or pushes data, you must provide a Committer or a
	 * CommitterMaker that represents all data you have given to the Appenderator so far. The Committer will be used when
	 * that data has been persisted to disk.
	 */
	public interface Appenderator extends Closeable
	{
	  /**
	   * Return the name of the dataSource associated with this Appenderator.
	   *
	   * @return the dataSource name
	   */
	  String getDataSource();

	  /**
	   * Perform any initial setup. Should be called before using any other methods.
	   *
	   * @return currently persisted commit metadata
	   */
	  Object startJob();

	  /**
	   * Add a row. Must not be called concurrently from multiple threads.
	   * <p>
	   * If no pending segment exists for the provided identifier, a new one will be created.
	   * <p>
	   * This method may trigger a {@link #persistAll(Committer)} using the provided CommitterMaker. If it does this, the
	   * Committer is guaranteed to be created synchronously with the call to add, but will actually be used
	   * asynchronously.
	   * <p>
	   * The add, clear, persistAll, pushAll, and push methods should all be called from the same thread to ensure that
	   * commit metadata matches data that has actually been persisted to disk.
	   *
	   * @param identifier     the segment into which this row should be added
	   * @param row            the row to add
	   * @param committerMaker a committer maker associated with all data that has been added so far, including this row
	   *
	   * @return positive number indicating how many summarized rows exist in this segment so far
	   *
	   * @throws IndexSizeExceededException  if this row cannot be added because it is too large
	   * @throws SegmentNotWritableException if the requested segment is known, but has been closed
	   */
	  int add(SegmentIdentifier identifier, InputRow row, CommitterMaker committerMaker)
	      throws IndexSizeExceededException, SegmentNotWritableException;

	  /**
	   * Returns a list of all currently pending segments.
	   *
	   * @return identifiers of the pending segments
	   */
	  List<SegmentIdentifier> getSegments();

	  /**
	   * Returns the number of rows in a particular pending segment.
	   *
	   * @param identifier segment to examine
	   *
	   * @return row count
	   *
	   * @throws IllegalStateException if the segment is unknown
	   */
	  int getRowCount(SegmentIdentifier identifier);

	  /**
	   * Drop all in-memory and on-disk data, and forget any previously-remembered commit metadata. This could be useful
	   * if, for some reason, rows have been added that we do not actually want to hand off. Blocks until all data has
	   * been cleared. This may take some time, since all pending persists must finish first.
	   *
	   * @throws InterruptedException if the calling thread is interrupted while blocked in this call
	   */
	  void clear() throws InterruptedException;

	  /**
	   * Drop all data associated with a particular pending segment. Unlike {@link #clear()}, any on-disk commit
	   * metadata will remain unchanged. If there is no pending segment with this identifier, then this method will
	   * do nothing.
	   * <p>
	   * You should not write to the dropped segment after calling "drop". If you need to drop all your data and
	   * re-write it, consider {@link #clear()} instead.
	   *
	   * @param identifier the pending segment to drop
	   *
	   * @return future that resolves when data is dropped
	   */
	  ListenableFuture<?> drop(SegmentIdentifier identifier);

	  /**
	   * Persist any in-memory indexed data to durable storage. This may be only somewhat durable, e.g. the
	   * machine's local disk. The Committer will be made synchronously with the call to persistAll, but will actually
	   * be used asynchronously. Any metadata returned by the committer will be associated with the data persisted to
	   * disk.
	   * <p>
	   * The add, clear, persistAll, pushAll, and push methods should all be called from the same thread to ensure that
	   * commit metadata matches data that has actually been persisted to disk.
	   *
	   * @param committer a committer associated with all data that has been added so far
	   *
	   * @return future that resolves when all pending data has been persisted, contains commit metadata for this persist
	   */
	  ListenableFuture<Object> persistAll(Committer committer);

	  /**
	   * Merge and push any pending data to deep storage. This will trigger an implicit {@link #persistAll(Committer)}
	   * using the provided Committer.
	   * <p>
	   * After this method is called, you cannot add new data to any segments that were previously under construction.
	   * <p>
	   * The add, clear, persistAll, pushAll, and push methods should all be called from the same thread to ensure that
	   * commit metadata matches data that has actually been persisted to disk.
	   *
	   * @param committer a committer associated with all data that has been added so far
	   *
	   * @return future that resolves when all segments have been pushed. The segment list will be the list of segments
	   * that have been pushed and the commit metadata from the Committer.
	   */
	  ListenableFuture<SegmentsAndMetadata> pushAll(Committer committer);

	  /**
	   * Merge and push particular segments to deep storage. This will trigger an implicit {@link #persistAll(Committer)}
	   * using the provided Committer.
	   * <p>
	   * After this method is called, you cannot add new data to any segments that were previously under construction.
	   * <p>
	   * The add, clear, persistAll, pushAll, and push methods should all be called from the same thread to ensure that
	   * commit metadata matches data that has actually been persisted to disk.
	   *
	   * @param identifiers list of segments to push
	   * @param committer   a committer associated with all data that has been added so far
	   *
	   * @return future that resolves when all segments have been pushed. The segment list will be the list of segments
	   * that have been pushed and the commit metadata from the Committer.
	   */
	  ListenableFuture<SegmentsAndMetadata> push(List<SegmentIdentifier> identifiers, Committer committer);

	  /**
	   * Return a query runner for a particular query. The runner will read all pending data that has been added to
	   * this Appenderator.
	   *
	   * @param query the query to be run
	   * @param <T>   the query's result type
	   *
	   * @return a runner for the given query
	   */
	  <T> QueryRunner<T> getQueryRunner(Query<T> query);

	  /**
	   * Stop any currently-running processing and clean up after ourselves. This will not remove any on-disk persisted
	   * data, but it will drop any data that has not yet been persisted.
	   * <p>
	   * Redeclares {@link Closeable#close()} without a {@code throws} clause, so callers holding an Appenderator
	   * reference do not need to handle a checked exception when closing it.
	   */
	  @Override
	  void close();
	}