Wowfunhappy/ipc_mqueue

## ipc_mqueue
/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_FREE_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	ipc/ipc_mqueue.c
 *	Author:	Rich Draves
 *	Date:	1989
 *
 *	Functions to manipulate IPC message queues.
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */


#include <mach/port.h>
#include <mach/message.h>
#include <mach/sync_policy.h>

#include <kern/assert.h>
#include <kern/counters.h>
#include <kern/sched_prim.h>
#include <kern/ipc_kobject.h>
#include <kern/ipc_mig.h>	/* XXX - for mach_msg_receive_continue */
#include <kern/misc_protos.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/wait_queue.h>

#include <ipc/ipc_mqueue.h>
#include <ipc/ipc_kmsg.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_pset.h>
#include <ipc/ipc_space.h>

#ifdef __LP64__
#include <vm/vm_map.h>
#endif

#if CONFIG_MACF_MACH
#include <security/mac_mach_internal.h>
#endif

/*
 * Event tokens.  Per the notes on each variable, only their addresses are
 * meaningful: the address names the event, the stored value is not used
 * anywhere in this file.
 */
int ipc_mqueue_full;		/* address is event for queue space */
int ipc_mqueue_rcv;		/* address is event for message arrival */

/* forward declarations */
/*
 * Defined further down in this file: translates the wait result of a
 * blocked receive into the current thread's ith_state.
 */
void ipc_mqueue_receive_results(wait_result_t result);

/*
 *	Routine:	ipc_mqueue_init
 *	Purpose:
 *		Set up a freshly allocated message queue.
 *
 *		A port-set queue (is_set true) only gets its wait-queue
 *		set initialized (FIFO, with preposting).  A port queue
 *		gets a FIFO wait queue, an empty kmsg queue, zeroed
 *		sequence number and message count, the default queue
 *		limit and no recorded full-queue waiters.
 */
void
ipc_mqueue_init(
	ipc_mqueue_t	mqueue,
	boolean_t	is_set)
{
	if (is_set) {
		/* port set: nothing but the wait-queue set to prepare */
		wait_queue_set_init(&mqueue->imq_set_queue,
				    SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST);
		return;
	}

	/* ordinary port: wait queue plus empty message bookkeeping */
	wait_queue_init(&mqueue->imq_wait_queue, SYNC_POLICY_FIFO);
	ipc_kmsg_queue_init(&mqueue->imq_messages);
	mqueue->imq_seqno = 0;
	mqueue->imq_msgcount = 0;
	mqueue->imq_qlimit = MACH_PORT_QLIMIT_DEFAULT;
	mqueue->imq_fullwaiters = FALSE;
}

/*
 *	Routine:	ipc_mqueue_member
 *	Purpose:
 *		Report whether a port's message queue currently belongs
 *		to a port set's message queue.  The answer is whatever
 *		wait_queue_member() returns for the port mqueue's wait
 *		queue and the set mqueue's wait-queue set.
 *	Conditions:
 *		NOTE(review): the previous header listed "the portset's
 *		mqueue is not already a member" and "this may block while
 *		allocating linkage structures" here.  Nothing in this
 *		routine allocates; confirm whether those still apply.
 */

boolean_t
ipc_mqueue_member(
	ipc_mqueue_t		port_mqueue,
	ipc_mqueue_t		set_mqueue)
{
	return wait_queue_member(&port_mqueue->imq_wait_queue,
				 &set_mqueue->imq_set_queue);
}

/*
 *	Routine:	ipc_mqueue_remove
 *	Purpose:
 *		Break the membership link between a message queue and
 *		one particular set message queue.
 *
 *		wqlp is handed straight to wait_queue_unlink_nofree();
 *		presumably it receives the detached link so the caller
 *		can free it -- confirm against the wait queue code.
 *	Returns:
 *		Whatever wait_queue_unlink_nofree() returns.
 */

kern_return_t
ipc_mqueue_remove(
	ipc_mqueue_t	  mqueue,
	ipc_mqueue_t	  set_mqueue,
	wait_queue_link_t *wqlp)
{
	return wait_queue_unlink_nofree(&mqueue->imq_wait_queue,
					&set_mqueue->imq_set_queue,
					wqlp);
}

/*
 *	Routine:	ipc_mqueue_remove_from_all
 *	Purpose:
 *		Detach the message queue from every set it belongs to.
 *		The links queue is passed through to
 *		wait_queue_unlink_all_nofree().
 *	Conditions:
 *		Nothing locked.
 */
void
ipc_mqueue_remove_from_all(
	ipc_mqueue_t	mqueue,
	queue_t 	links)
{
	wait_queue_unlink_all_nofree(&mqueue->imq_wait_queue, links);
}

/*
 *	Routine:	ipc_mqueue_remove_all
 *	Purpose:
 *		Detach every member queue from the given set queue.
 *		The links queue is passed through to
 *		wait_queue_set_unlink_all_nofree().
 *	Conditions:
 *		Nothing locked.
 */
void
ipc_mqueue_remove_all(
	ipc_mqueue_t	mqueue,
	queue_t		links)
{
	wait_queue_set_unlink_all_nofree(&mqueue->imq_set_queue, links);
}


/*
 *	Routine:	ipc_mqueue_add
 *	Purpose:
 *		Associate the portset's mqueue with the port's mqueue.
 *		This has to be done so that posting the port will wakeup
 *		a portset waiter.  If there are waiters on the portset
 *		mqueue and messages on the port mqueue, try to match them
 *		up now.
 *
 *		The link structure is supplied by the caller (wql) and
 *		handed to wait_queue_link_noalloc().
 *	Conditions:
 *		May block.
 *	Returns:
 *		KERN_SUCCESS once the link is in place (whether or not any
 *		queued message found a receiver); otherwise the error from
 *		wait_queue_link_noalloc().
 */
kern_return_t
ipc_mqueue_add(
	ipc_mqueue_t	 port_mqueue,
	ipc_mqueue_t	 set_mqueue,
	wait_queue_link_t wql)
{
	wait_queue_t	 port_waitq = &port_mqueue->imq_wait_queue;
	wait_queue_set_t set_waitq = &set_mqueue->imq_set_queue;
	ipc_kmsg_queue_t kmsgq;
	ipc_kmsg_t       kmsg, next;
	kern_return_t	 kr;
	spl_t		 s;

	kr = wait_queue_link_noalloc(port_waitq, set_waitq, wql);
	if (kr != KERN_SUCCESS)
		return kr;

	/*
	 * Now that the set has been added to the port, there may be
	 * messages queued on the port and threads waiting on the set
	 * waitq.  Lets get them together.
	 */
	s = splsched();
	imq_lock(port_mqueue);
	kmsgq = &port_mqueue->imq_messages;
	for (kmsg = ipc_kmsg_queue_first(kmsgq);
	     kmsg != IKM_NULL;
	     kmsg = next) {
		/* fetch the successor first: kmsg may be unlinked below */
		next = ipc_kmsg_queue_next(kmsgq, kmsg);

		/* keep waking receivers until one takes this message */
		for (;;) {
			thread_t th;
			mach_msg_size_t msize;

			th = wait_queue_wakeup64_identity_locked(
						port_waitq,
						IPC_MQUEUE_RECEIVE,
						THREAD_AWAKENED,
						FALSE);
			/* waitq/mqueue still locked, thread locked */

			/*
			 * Nobody left to wake: stop altogether; this message
			 * and all later ones stay queued on the port.
			 */
			if (th == THREAD_NULL)
				goto leave;

			/*
			 * If the receiver waited with a facility not directly
			 * related to Mach messaging, then it isn't prepared to get
			 * handed the message directly.  Just set it running, and
			 * go look for another thread that can.
			 */
			if (th->ith_state != MACH_RCV_IN_PROGRESS) {
				  thread_unlock(th);
				  continue;
			}

			/*
			 * Found a receiver. see if they can handle the message
			 * correctly (the message is not too large for them, or
			 * they didn't care to be informed that the message was
			 * too large).  If they can't handle it, take them off
			 * the list and let them go back and figure it out and
			 * just move onto the next.
			 */
			msize = ipc_kmsg_copyout_size(kmsg, th->map);
			if (th->ith_msize <
					(msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(th), th->ith_option))) {
				th->ith_state = MACH_RCV_TOO_LARGE;
				/* report the size needed (trailer not included) */
				th->ith_msize = msize;
				if (th->ith_option & MACH_RCV_LARGE) {
					/*
					 * let him go without message
					 */
					th->ith_receiver_name = port_mqueue->imq_receiver_name;
					th->ith_kmsg = IKM_NULL;
					th->ith_seqno = 0;
					thread_unlock(th);
					continue; /* find another thread */
				}
				/*
				 * No MACH_RCV_LARGE: fall out of this branch and
				 * hand the message over with the TOO_LARGE state.
				 */
			} else {
				th->ith_state = MACH_MSG_SUCCESS;
			}

			/*
			 * This thread is going to take this message,
			 * so give it to him.
			 */
			ipc_kmsg_rmqueue(kmsgq, kmsg);
			ipc_mqueue_release_msgcount(port_mqueue);

			th->ith_kmsg = kmsg;
			/* the message consumes the port's next sequence number */
			th->ith_seqno = port_mqueue->imq_seqno++;
			thread_unlock(th);
			break;  /* go to next message */
		}

	}
 leave:
	imq_unlock(port_mqueue);
	splx(s);
	return KERN_SUCCESS;
}

/*
 *	Routine:	ipc_mqueue_changed
 *	Purpose:
 *		Wake up receivers waiting in a message queue.
 *
 *		Every thread waiting on IPC_MQUEUE_RECEIVE is woken with
 *		THREAD_RESTART, which ipc_mqueue_receive_results() reports
 *		to the receiver as MACH_RCV_PORT_CHANGED.
 *	Conditions:
 *		The message queue is locked.
 *
 *	Note: no console output here.  The caller holds the mqueue
 *	lock for the duration of this call, which is no place for
 *	debug logging.
 */

void
ipc_mqueue_changed(
	ipc_mqueue_t		mqueue)
{
	wait_queue_wakeup64_all_locked(
				&mqueue->imq_wait_queue,
				IPC_MQUEUE_RECEIVE,
				THREAD_RESTART,
				FALSE);		/* unlock waitq? */
}


/*
 *	Routine:	ipc_mqueue_send
 *	Purpose:
 *		Send a message to a message queue.  The message holds a reference
 *		for the destination port for this message queue in the
 *		msgh_remote_port field.
 *
 *		If unsuccessful, the caller still has possession of
 *		the message and must do something with it.  If successful,
 *		the message is queued, given to a receiver, or destroyed.
 *	Conditions:
 *		mqueue is locked on entry, with s holding the interrupt
 *		level saved by the caller.  On every path this routine
 *		unlocks the mqueue and calls splx(s) before returning.
 *	Returns:
 *		MACH_MSG_SUCCESS	The message was accepted.
 *		MACH_SEND_TIMED_OUT	Caller still has message.
 *		MACH_SEND_INTERRUPTED	Caller still has message.
 *		MACH_SEND_NO_BUFFER	Queue is at its kernel limit
 *					(imq_full_kernel); caller still
 *					has message.
 *		MACH_SEND_INVALID_DEST	Woken with THREAD_RESTART while
 *					waiting for space (mqueue is being
 *					destroyed); caller still has message.
 */
mach_msg_return_t
ipc_mqueue_send(
	ipc_mqueue_t		mqueue,
	ipc_kmsg_t		kmsg,
	mach_msg_option_t	option,
	mach_msg_timeout_t	send_timeout,
	spl_t			s)
{
	int wresult;

	/*
	 *  Don't block if:
	 *	1) We're under the queue limit.
	 *	2) Caller used the MACH_SEND_ALWAYS internal option.
	 *	3) Message is sent to a send-once right.
	 *
	 *  Cases 2 and 3 only apply while the queue is also below the
	 *  kernel limit (!imq_full_kernel).
	 */
	if (!imq_full(mqueue) ||
	    (!imq_full_kernel(mqueue) &&
	     ((option & MACH_SEND_ALWAYS) ||
	      (MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits) ==
	       MACH_MSG_TYPE_PORT_SEND_ONCE)))) {
		/* reserve our slot in the queue before dropping the lock */
		mqueue->imq_msgcount++;
		assert(mqueue->imq_msgcount > 0);
		imq_unlock(mqueue);
		splx(s);
	} else {
		thread_t cur_thread = current_thread();
		uint64_t deadline;

		/*
		 * We have to wait for space to be granted to us.
		 */
		if ((option & MACH_SEND_TIMEOUT) && (send_timeout == 0)) {
			/* zero timeout: a poll, so fail rather than wait */
			imq_unlock(mqueue);
			splx(s);
			return MACH_SEND_TIMED_OUT;
		}
		if (imq_full_kernel(mqueue)) {
			imq_unlock(mqueue);
			splx(s);
			return MACH_SEND_NO_BUFFER;
		}
		/* tell ipc_mqueue_release_msgcount() there is someone to wake */
		mqueue->imq_fullwaiters = TRUE;
		thread_lock(cur_thread);
		/* scale factor 1000*NSEC_PER_USEC: send_timeout counts milliseconds */
		if (option & MACH_SEND_TIMEOUT)
			clock_interval_to_deadline(send_timeout, 1000*NSEC_PER_USEC, &deadline);
		else
			deadline = 0;
		wresult = wait_queue_assert_wait64_locked(
						&mqueue->imq_wait_queue,
						IPC_MQUEUE_FULL,
						THREAD_ABORTSAFE,
						TIMEOUT_URGENCY_USER_NORMAL,
						deadline, 0,
						cur_thread);
		thread_unlock(cur_thread);
		imq_unlock(mqueue);
		splx(s);

		if (wresult == THREAD_WAITING) {
			wresult = thread_block(THREAD_CONTINUE_NULL);
			counter(c_ipc_mqueue_send_block++);
		}

		switch (wresult) {
		case THREAD_TIMED_OUT:
			assert(option & MACH_SEND_TIMEOUT);
			return MACH_SEND_TIMED_OUT;

		case THREAD_AWAKENED:
			/* we can proceed - inherited msgcount from waker */
			assert(mqueue->imq_msgcount > 0);
			break;

		case THREAD_INTERRUPTED:
			return MACH_SEND_INTERRUPTED;

		case THREAD_RESTART:
			/* mqueue is being destroyed */
			return MACH_SEND_INVALID_DEST;
		default:
			panic("ipc_mqueue_send");
		}
	}

	/* a queue slot is reserved for us; deliver or enqueue the message */
	ipc_mqueue_post(mqueue, kmsg);
	return MACH_MSG_SUCCESS;
}


/*
 *	Routine:	ipc_mqueue_release_msgcount
 *	Purpose:
 *		Give back one reserved slot in the message queue, for the
 *		case where the message went to a waiting receiver instead
 *		of onto the queue.  If that leaves room and senders are
 *		recorded as waiting for space, the slot is passed on to
 *		one of them.
 *
 *	Conditions:
 *		The message queue is locked.
 *		The message corresponding to this reference is off the queue.
 */
void
ipc_mqueue_release_msgcount(
	ipc_mqueue_t mqueue)
{
	assert(imq_held(mqueue));
	assert(mqueue->imq_msgcount > 1 || ipc_kmsg_queue_empty(&mqueue->imq_messages));

	mqueue->imq_msgcount--;

	/* still full, or nobody waiting for space: nothing more to do */
	if (imq_full(mqueue) || !mqueue->imq_fullwaiters)
		return;

	if (wait_queue_wakeup64_one_locked(&mqueue->imq_wait_queue,
					   IPC_MQUEUE_FULL,
					   THREAD_AWAKENED,
					   FALSE) == KERN_SUCCESS) {
		/* the woken sender inherits the slot - count it again */
		mqueue->imq_msgcount++;
	} else {
		/* the wakeup found no sender; clear the stale flag */
		mqueue->imq_fullwaiters = FALSE;
	}
}

/*
 *	Routine:	ipc_mqueue_post
 *	Purpose:
 *		Post a message to a waiting receiver or enqueue it.  If a
 *		receiver is waiting, we can release our reserved space in
 *		the message queue.
 *
 *		Also bumps the current task's messages_sent counter.
 *
 *	Conditions:
 *		If we need to queue, our space in the message queue is reserved.
 *		The mqueue lock is taken and released inside this routine
 *		(at splsched).
 */
void
ipc_mqueue_post(
	register ipc_mqueue_t 	mqueue,
	register ipc_kmsg_t		kmsg)
{
	spl_t s;

	/*
	 *	While the msg queue is locked, we have control of the
	 *	kmsg, so the ref in it for the port is still good.
	 *
	 *	Check for a receiver for the message.
	 */
	s = splsched();
	imq_lock(mqueue);
	/* loop until the message is handed to a receiver or enqueued */
	for (;;) {
		wait_queue_t waitq = &mqueue->imq_wait_queue;
		thread_t receiver;
		mach_msg_size_t msize;

		receiver = wait_queue_wakeup64_identity_locked(
							waitq,
							IPC_MQUEUE_RECEIVE,
							THREAD_AWAKENED,
							FALSE);
		/* waitq still locked, thread locked */

		if (receiver == THREAD_NULL) {
			/*
			 * no receivers; queue kmsg
			 */
			assert(mqueue->imq_msgcount > 0);
			ipc_kmsg_enqueue_macro(&mqueue->imq_messages, kmsg);
			break;
		}

		/*
		 * If the receiver waited with a facility not directly
		 * related to Mach messaging, then it isn't prepared to get
		 * handed the message directly.  Just set it running, and
		 * go look for another thread that can.
		 */
		if (receiver->ith_state != MACH_RCV_IN_PROGRESS) {
				  thread_unlock(receiver);
				  continue;
		}


		/*
		 * We found a waiting thread.
		 * If the message is too large or the scatter list is too small
		 * the thread we wake up will get that as its status.
		 */
		msize =	ipc_kmsg_copyout_size(kmsg, receiver->map);
		if (receiver->ith_msize <
				(msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(receiver), receiver->ith_option))) {
			/* report the size needed (trailer not included) */
			receiver->ith_msize = msize;
			receiver->ith_state = MACH_RCV_TOO_LARGE;
		} else {
			receiver->ith_state = MACH_MSG_SUCCESS;
		}

		/*
		 * If there is no problem with the upcoming receive, or the
		 * receiver thread didn't specifically ask for special too
		 * large error condition, go ahead and select it anyway.
		 */
		if ((receiver->ith_state == MACH_MSG_SUCCESS) ||
		    !(receiver->ith_option & MACH_RCV_LARGE)) {

			receiver->ith_kmsg = kmsg;
			/* the message consumes the queue's next sequence number */
			receiver->ith_seqno = mqueue->imq_seqno++;
			thread_unlock(receiver);

			/* we didn't need our reserved spot in the queue */
			ipc_mqueue_release_msgcount(mqueue);
			break;
		}

		/*
		 * Otherwise, this thread needs to be released to run
		 * and handle its error without getting the message.  We
		 * need to go back and pick another one.
		 */
		receiver->ith_receiver_name = mqueue->imq_receiver_name;
		receiver->ith_kmsg = IKM_NULL;
		receiver->ith_seqno = 0;
		thread_unlock(receiver);
	}

	imq_unlock(mqueue);
	splx(s);

	/* accounting is done after the lock is dropped */
	current_task()->messages_sent++;
	return;
}


/*
 *	Routine:	ipc_mqueue_receive_results
 *	Purpose:
 *		Turn the wait result of a blocked receive into the
 *		current thread's ith_state.
 *
 *		Timeout, interrupt and restart each map to their own
 *		MACH_RCV_* code.  A normal wakeup means whoever woke us
 *		already filled in ith_state (success or a too-large
 *		indication), so it is left alone after a sanity check.
 */
/* static */ void
ipc_mqueue_receive_results(wait_result_t saved_wait_result)
{
	thread_t	self = current_thread();

	switch (saved_wait_result) {
	case THREAD_TIMED_OUT:
		self->ith_state = MACH_RCV_TIMED_OUT;
		return;

	case THREAD_INTERRUPTED:
		self->ith_state = MACH_RCV_INTERRUPTED;
		return;

	case THREAD_RESTART:
		/* something bad happened to the port/set */
		self->ith_state = MACH_RCV_PORT_CHANGED;
		return;

	case THREAD_AWAKENED:
		/*
		 * Somebody handed us a message or a too-large indication.
		 * With MACH_RCV_LARGE we got only the indication, without
		 * it we got the message as well; either way the state set
		 * by the waker stands and there is nothing to select.
		 */
		switch (self->ith_state) {
		case MACH_RCV_SCATTER_SMALL:
		case MACH_RCV_TOO_LARGE:
		case MACH_MSG_SUCCESS:
			return;

		default:
			panic("ipc_mqueue_receive_results: strange ith_state");
		}
		/* FALLTHROUGH (only reachable if panic() were to return) */

	default:
		panic("ipc_mqueue_receive_results: strange wait_result");
	}
}

/*
 *	Routine:	ipc_mqueue_receive_continue
 *	Purpose:
 *		Continuation passed to thread_block() by
 *		ipc_mqueue_receive().  Records the outcome of the wait in
 *		the thread state, then carries on in
 *		mach_msg_receive_continue().  The first argument is unused.
 */
void
ipc_mqueue_receive_continue(
	__unused void *param,
	wait_result_t wresult)
{
	/* first settle ith_state from the wait result ... */
	ipc_mqueue_receive_results(wresult);

	/* ... then resume the receive path (hard-coded for now) */
	mach_msg_receive_continue();
}

/*
 *	Routine:	ipc_mqueue_receive
 *	Purpose:
 *		Receive a message from a message queue on behalf of the
 *		current thread.
 *
 *		If the thread's ith_continuation is set, the thread
 *		blocks with ipc_mqueue_receive_continue as its
 *		continuation and does not come back here; otherwise it
 *		blocks normally and the wait result is processed on return.
 *	Conditions:
 *		Our caller must hold a reference for the port or port set
 *		to which this queue belongs, to keep the queue
 *		from being deallocated.
 *	Results:
 *		Nothing is returned directly.  The outcome is left in the
 *		current thread's ith_state (with ith_kmsg, ith_seqno and
 *		ith_msize filled in by whoever selected the message).
 *		States set in this file:
 *		MACH_MSG_SUCCESS	Message selected.
 *		MACH_RCV_TOO_LARGE	Size needed is in ith_msize.
 *		MACH_RCV_TIMED_OUT	No message obtained.
 *		MACH_RCV_INTERRUPTED	No message obtained.
 *		MACH_RCV_PORT_CHANGED	Woken with THREAD_RESTART; no msg.
 *		MACH_RCV_INVALID_DATA	MACF receive check refused
 *					(CONFIG_MACF_MACH only).
 */

void
ipc_mqueue_receive(
	ipc_mqueue_t            mqueue,
	mach_msg_option_t       option,
	mach_msg_size_t         max_size,
	mach_msg_timeout_t      rcv_timeout,
	int                     interruptible)
{
	thread_t		self = current_thread();
	wait_result_t		wresult;

	wresult = ipc_mqueue_receive_on_thread(mqueue, option, max_size,
					       rcv_timeout, interruptible,
					       self);

	/* finished without waiting: ith_state is already final */
	if (wresult == THREAD_NOT_WAITING)
		return;

	if (wresult == THREAD_WAITING) {
		if (interruptible == THREAD_ABORTSAFE)
			counter(c_ipc_mqueue_receive_block_user++);
		else
			counter(c_ipc_mqueue_receive_block_kernel++);

		if (self->ith_continuation) {
			thread_block(ipc_mqueue_receive_continue);
			/* NOTREACHED */
		}

		wresult = thread_block(THREAD_CONTINUE_NULL);
	}

	ipc_mqueue_receive_results(wresult);
}

/*
 *	Routine:	ipc_mqueue_receive_on_thread
 *	Purpose:
 *		First half of a receive, performed on behalf of "thread":
 *		either pick a message that is already available, or set
 *		the thread up to wait on the mqueue.  This routine never
 *		blocks the thread itself; the caller does that when
 *		THREAD_WAITING is returned.
 *	Conditions:
 *		Takes and releases the mqueue lock (at splsched) itself.
 *	Returns:
 *		THREAD_NOT_WAITING	Done without waiting; the outcome is
 *					in thread->ith_state (a selected
 *					message, a too-large indication,
 *					MACH_RCV_TIMED_OUT for a zero
 *					timeout, or MACH_RCV_INVALID_DATA
 *					if the MACF check refused).
 *		otherwise		The result of
 *					wait_queue_assert_wait64_locked().
 */
wait_result_t
ipc_mqueue_receive_on_thread(
        ipc_mqueue_t            mqueue,
	mach_msg_option_t       option,
	mach_msg_size_t         max_size,
	mach_msg_timeout_t      rcv_timeout,
	int                     interruptible,
	thread_t                thread)
{
	ipc_kmsg_queue_t        kmsgs;
	wait_result_t           wresult;
	uint64_t		deadline;
	spl_t                   s;
#if CONFIG_MACF_MACH
	ipc_labelh_t lh;
	task_t task;
	int rc;
#endif

	s = splsched();
	imq_lock(mqueue);

	if (imq_is_set(mqueue)) {
		queue_t q;

		q = &mqueue->imq_preposts;

		/*
		 * If we are waiting on a portset mqueue, we need to see if
		 * any of the member ports have work for us.  Ports that
		 * have (or recently had) messages will be linked in the
		 * prepost queue for the portset. By holding the portset's
		 * mqueue lock during the search, we tie up any attempts by
		 * mqueue_deliver or portset membership changes that may
		 * cross our path.
		 */
	search_set:
		while(!queue_empty(q)) {
			wait_queue_link_t wql;
			ipc_mqueue_t port_mq;

			queue_remove_first(q, wql, wait_queue_link_t, wql_preposts);
			assert(!wql_is_preposted(wql));

			/*
			 * This is a lock order violation, so we have to do it
			 * "softly," putting the link back on the prepost list
			 * if it fails (at the tail is fine since the order of
			 * handling messages from different sources in a set is
			 * not guaranteed and we'd like to skip to the next source
			 * if one is available).
			 */
			port_mq = (ipc_mqueue_t)wql->wql_queue;
			if (!imq_lock_try(port_mq)) {
				queue_enter(q, wql, wait_queue_link_t, wql_preposts);
				/* drop everything and pause before retrying */
				imq_unlock(mqueue);
				splx(s);
				mutex_pause(0);
				s = splsched();
				imq_lock(mqueue);
				goto search_set; /* start again at beginning - SMP */
			}

			/*
			 * If there are no messages on this queue, just skip it
			 * (we already removed the link from the set's prepost queue).
			 */
			kmsgs = &port_mq->imq_messages;
			if (ipc_kmsg_queue_first(kmsgs) == IKM_NULL) {
				imq_unlock(port_mq);
				continue;
			}

			/*
			 * There are messages, so reinsert the link back
			 * at the tail of the preposted queue (for fairness)
			 * while we still have the portset mqueue locked.
			 */
			queue_enter(q, wql, wait_queue_link_t, wql_preposts);
			imq_unlock(mqueue);

			/*
			 * Continue on to handling the message with just
			 * the port mqueue locked.
			 */
			ipc_mqueue_select_on_thread(port_mq, option, max_size, thread);
			imq_unlock(port_mq);
#if CONFIG_MACF_MACH
			/*
			 * MAC policy check, made only when a message with a
			 * sender was actually selected: a non-zero result
			 * replaces the state with MACH_RCV_INVALID_DATA.
			 */
			if (thread->task != TASK_NULL &&
			    thread->ith_kmsg != NULL &&
			    thread->ith_kmsg->ikm_sender != NULL) {
				lh = thread->ith_kmsg->ikm_sender->label;
				tasklabel_lock(thread->task);
				ip_lock(lh->lh_port);
				rc = mac_port_check_receive(&thread->task->maclabel,
                                                            &lh->lh_label);
				ip_unlock(lh->lh_port);
				tasklabel_unlock(thread->task);
				if (rc)
					thread->ith_state = MACH_RCV_INVALID_DATA;
			}
#endif
			splx(s);
			return THREAD_NOT_WAITING;

		}
		/*
		 * Prepost queue exhausted without finding a message: fall
		 * through to the blocking path with the set mqueue locked.
		 */

	} else {

		/*
		 * Receive on a single port. Just try to get the messages.
		 */
	  	kmsgs = &mqueue->imq_messages;
		if (ipc_kmsg_queue_first(kmsgs) != IKM_NULL) {
			ipc_mqueue_select_on_thread(mqueue, option, max_size, thread);
			imq_unlock(mqueue);
#if CONFIG_MACF_MACH
			/* same MAC policy check as in the port-set path above */
			if (thread->task != TASK_NULL &&
			    thread->ith_kmsg != NULL &&
			    thread->ith_kmsg->ikm_sender != NULL) {
				lh = thread->ith_kmsg->ikm_sender->label;
				tasklabel_lock(thread->task);
				ip_lock(lh->lh_port);
				rc = mac_port_check_receive(&thread->task->maclabel,
                                                            &lh->lh_label);
				ip_unlock(lh->lh_port);
				tasklabel_unlock(thread->task);
				if (rc)
					thread->ith_state = MACH_RCV_INVALID_DATA;
			}
#endif
			splx(s);
			return THREAD_NOT_WAITING;
		}
	}

	/*
	 * Looks like we'll have to block.  The mqueue we will
	 * block on (whether the set's or the local port's) is
	 * still locked.
	 */
	if (option & MACH_RCV_TIMEOUT) {
		if (rcv_timeout == 0) {
			/* zero timeout: a poll, so report a timeout at once */
			imq_unlock(mqueue);
			splx(s);
			thread->ith_state = MACH_RCV_TIMED_OUT;
			return THREAD_NOT_WAITING;
		}
	}

	/*
	 * Record what the receiver wants, so that whoever wakes it
	 * (ipc_mqueue_post / ipc_mqueue_add) can hand over a message.
	 */
	thread_lock(thread);
	thread->ith_state = MACH_RCV_IN_PROGRESS;
	thread->ith_option = option;
	thread->ith_msize = max_size;

	/* scale factor 1000*NSEC_PER_USEC: rcv_timeout counts milliseconds */
	if (option & MACH_RCV_TIMEOUT)
		clock_interval_to_deadline(rcv_timeout, 1000*NSEC_PER_USEC, &deadline);
	else
		deadline = 0;

	wresult = wait_queue_assert_wait64_locked(&mqueue->imq_wait_queue,
						  IPC_MQUEUE_RECEIVE,
						  interruptible,
						  TIMEOUT_URGENCY_USER_NORMAL,
						  deadline, 0,
						  thread);
	/* preposts should be detected above, not here */
	if (wresult == THREAD_AWAKENED)
		panic("ipc_mqueue_receive_on_thread: sleep walking");

	thread_unlock(thread);
	imq_unlock(mqueue);
	splx(s);
	return wresult;
}


/*
 *	Routine:	ipc_mqueue_select_on_thread
 *	Purpose:
 *		A receiver found a message already sitting on the queue,
 *		so it does not have to block.  Take the first message off
 *		the queue and "post" it to thread.
 *	Conditions:
 *		mqueue locked.
 *		thread not locked.
 *		There is a message.
 *	Results (left in thread->ith_state; nothing is returned):
 *		MACH_MSG_SUCCESS	Message dequeued into ith_kmsg.
 *		MACH_RCV_TOO_LARGE	Message does not fit in max_size.
 *					With MACH_RCV_LARGE it stays on the
 *					queue and only its size is reported
 *					in ith_msize; without it the message
 *					is dequeued anyway.
 */
void
ipc_mqueue_select_on_thread(
	ipc_mqueue_t		mqueue,
	mach_msg_option_t	option,
	mach_msg_size_t		max_size,
	thread_t                thread)
{
	ipc_kmsg_t		head;
	mach_msg_size_t		needed;
	mach_msg_return_t	status;

	/* look at the head of the queue before committing to take it */
	head = ipc_kmsg_queue_first(&mqueue->imq_messages);
	assert(head != IKM_NULL);

	/* does the message plus the requested trailer fit the buffer? */
	needed = ipc_kmsg_copyout_size(head, thread->map);
	if (max_size < needed + REQUESTED_TRAILER_SIZE(thread_is_64bit(thread), option))
		status = MACH_RCV_TOO_LARGE;
	else
		status = MACH_MSG_SUCCESS;

	/*
	 * Too large and the receiver asked to be told (MACH_RCV_LARGE):
	 * leave the message queued and report only the size needed.
	 */
	if (status == MACH_RCV_TOO_LARGE && (option & MACH_RCV_LARGE)) {
		thread->ith_receiver_name = mqueue->imq_receiver_name;
		thread->ith_kmsg = IKM_NULL;
		thread->ith_msize = needed;
		thread->ith_seqno = 0;
		thread->ith_state = status;
		return;
	}

	/* hand the message over, too large or not */
	ipc_kmsg_rmqueue_first_macro(&mqueue->imq_messages, head);
	ipc_mqueue_release_msgcount(mqueue);
	thread->ith_seqno = mqueue->imq_seqno++;
	thread->ith_kmsg = head;
	thread->ith_state = status;

	current_task()->messages_received++;
}

/*
 *	Routine:	ipc_mqueue_peek
 *	Purpose:
 *		Peek at a (non-set) message queue to see if it has a message
 *		matching the sequence number provided (if zero, then the
 *		first message in the queue) and return vital info about the
 *		message.
 *
 *		Each output pointer may be NULL, in which case that piece
 *		of information is not returned.  seqnop is both input (the
 *		sequence number to look for; NULL means zero) and output
 *		(the sequence number of the message found).
 *
 *	Conditions:
 *		Locks may be held by callers, so this routine cannot block.
 *		Caller holds reference on the message queue.
 *	Returns:
 *		1	A matching message was found; outputs are filled in.
 *		0	No such message; outputs are left untouched.
 */
unsigned
ipc_mqueue_peek(ipc_mqueue_t 		mq,
		mach_port_seqno_t	*seqnop,
		mach_msg_size_t		*msg_sizep,
		mach_msg_id_t		*msg_idp,
		mach_msg_max_trailer_t 	*msg_trailerp)
{
	ipc_kmsg_queue_t kmsgq;
	ipc_kmsg_t kmsg;
	mach_port_seqno_t seqno, msgoff;
	int res = 0;
	spl_t s;

	assert(!imq_is_set(mq));

	s = splsched();
	imq_lock(mq);

	/*
	 * Single assignment only: the former "seqno = cond ? seqno = x : 0"
	 * modified seqno twice without sequencing.
	 */
	seqno = (seqnop != NULL) ? *seqnop : 0;

	if (seqno == 0) {
		/* zero means "the first message", which carries imq_seqno */
		seqno = mq->imq_seqno;
		msgoff = 0;
	} else if (seqno >= mq->imq_seqno &&
		   seqno < mq->imq_seqno + mq->imq_msgcount) {
		/* in range: distance from the head of the queue */
		msgoff = seqno - mq->imq_seqno;
	} else
		goto out;

	/* look for the message that would match that seqno */
	kmsgq = &mq->imq_messages;
	kmsg = ipc_kmsg_queue_first(kmsgq);
	while (msgoff-- && kmsg != IKM_NULL) {
		kmsg = ipc_kmsg_queue_next(kmsgq, kmsg);
	}
	if (kmsg == IKM_NULL)
		goto out;

	/* found one - return the requested info */
	if (seqnop != NULL)
		*seqnop = seqno;
	if (msg_sizep != NULL)
		*msg_sizep = kmsg->ikm_header->msgh_size;
	if (msg_idp != NULL)
		*msg_idp = kmsg->ikm_header->msgh_id;
	if (msg_trailerp != NULL)
		/* the trailer is read from just past the rounded message size */
		memcpy(msg_trailerp,
		       (mach_msg_max_trailer_t *)((vm_offset_t)kmsg->ikm_header +
						  round_msg(kmsg->ikm_header->msgh_size)),
		       sizeof(mach_msg_max_trailer_t));
	res = 1;

 out:
	imq_unlock(mq);
	splx(s);
	return res;
}

/*
 *	Routine:	ipc_mqueue_set_peek
 *	Purpose:
 *		Peek at a message queue set to see whether any port on
 *		its prepost list has a message queued.
 *
 *	Conditions:
 *		Locks may be held by callers, so this routine cannot block.
 *		Caller holds reference on the message queue.
 *	Returns:
 *		1 if a preposted member has a message at its head, else 0.
 */
unsigned
ipc_mqueue_set_peek(ipc_mqueue_t mq)
{
	wait_queue_link_t	link;
	queue_t			preposts;
	spl_t			spl;
	int			found = 0;

	assert(imq_is_set(mq));

	spl = splsched();
	imq_lock(mq);

	/*
	 * Walk the prepost list and stop at the first member whose
	 * message queue is non-empty.  The member queues are not
	 * locked: only the head of each is examined, and a message
	 * that arrives while we look need not be noticed.
	 */
	preposts = &mq->imq_preposts;
	queue_iterate(preposts, link, wait_queue_link_t, wql_preposts) {
		ipc_mqueue_t member = (ipc_mqueue_t)link->wql_queue;

		if (ipc_kmsg_queue_first(&member->imq_messages) == IKM_NULL)
			continue;

		found = 1;
		break;
	}

	imq_unlock(mq);
	splx(spl);
	return found;
}

/*
 *	Routine:	ipc_mqueue_set_gather_member_names
 *	Purpose:
 *		Walk a message queue set and collect the receiver names of
 *		its member ports.  At most maxnames names are stored in
 *		names[], but every member is counted, so the caller can
 *		tell from *actualp whether a retry with a larger buffer
 *		is needed.
 *
 *	Conditions:
 *		Locks may be held by callers, so this routine cannot block.
 *		Caller holds reference on the message queue.
 */
void
ipc_mqueue_set_gather_member_names(
	ipc_mqueue_t mq,
	ipc_entry_num_t maxnames,
	mach_port_name_t *names,
	ipc_entry_num_t *actualp)
{
	wait_queue_link_t	link;
	queue_t			members;
	ipc_entry_num_t		count;
	spl_t			spl;

	assert(imq_is_set(mq));

	spl = splsched();
	imq_lock(mq);

	/* follow the set links, recording names while there is room */
	count = 0;
	members = &mq->imq_setlinks;
	queue_iterate(members, link, wait_queue_link_t, wql_setlinks) {
		if (count < maxnames) {
			ipc_mqueue_t member = (ipc_mqueue_t)link->wql_queue;

			names[count] = member->imq_receiver_name;
		}
		count++;
	}

	imq_unlock(mq);
	splx(spl);

	/* total membership, which may exceed maxnames */
	*actualp = count;
}


/*
 *	Routine:	ipc_mqueue_destroy
 *	Purpose:
 *		Destroy a (non-set) message queue.
 *		Set any blocked senders running.
 *		Destroy the kmsgs in the queue.
 *
 *		Blocked senders are woken with THREAD_RESTART, which
 *		ipc_mqueue_send() turns into MACH_SEND_INVALID_DEST.
 *	Conditions:
 *		Nothing locked.
 *		Receivers were removed when the receive right was "changed"
 *
 *	Note: no console output here.  The wakeup runs at splsched
 *	with the mqueue lock held, which is no place for debug logging.
 */
void
ipc_mqueue_destroy(
	ipc_mqueue_t	mqueue)
{
	ipc_kmsg_queue_t kmqueue;
	ipc_kmsg_t kmsg;
	boolean_t reap = FALSE;
	spl_t s;

	s = splsched();
	imq_lock(mqueue);
	/*
	 *	rouse all blocked senders
	 */
	mqueue->imq_fullwaiters = FALSE;
	wait_queue_wakeup64_all_locked(
				&mqueue->imq_wait_queue,
				IPC_MQUEUE_FULL,
				THREAD_RESTART,
				FALSE);

	/*
	 * Move messages from the specified queue to the per-thread
	 * clean/drain queue while we have the mqueue lock.
	 */
	kmqueue = &mqueue->imq_messages;
	while ((kmsg = ipc_kmsg_dequeue(kmqueue)) != IKM_NULL) {
		boolean_t first;
		first = ipc_kmsg_delayed_destroy(kmsg);
		/* remember if any call said the reaping falls to us */
		if (first)
			reap = first;
	}

	imq_unlock(mqueue);
	splx(s);

	/*
	 * Destroy the messages we enqueued if we aren't nested
	 * inside some other attempt to drain the same queue.
	 */
	if (reap)
		ipc_kmsg_reap_delayed();
}

/*
 *	Routine:	ipc_mqueue_set_qlimit
 *	Purpose:
 *		Change a message queue's limit, i.e. the maximum number
 *		of messages which may be queued.  When the limit grows,
 *		up to that many extra blocked senders are woken, each
 *		being handed a queue slot.
 *	Conditions:
 *		Nothing locked.
 */

void
ipc_mqueue_set_qlimit(
	 ipc_mqueue_t			mqueue,
	 mach_port_msgcount_t	qlimit)
{
	spl_t spl;

	assert(qlimit <= MACH_PORT_QLIMIT_MAX);

	spl = splsched();
	imq_lock(mqueue);

	if (qlimit > mqueue->imq_qlimit) {
		/* unsigned arithmetic is safe: the branch guarantees growth */
		mach_port_msgcount_t extra = qlimit - mqueue->imq_qlimit;

		/* wake one sender per newly available slot */
		while (extra > 0) {
			if (wait_queue_wakeup64_one_locked(&mqueue->imq_wait_queue,
							   IPC_MQUEUE_FULL,
							   THREAD_AWAKENED,
							   FALSE) == KERN_NOT_WAITING) {
				/* ran out of waiting senders */
				mqueue->imq_fullwaiters = FALSE;
				break;
			}
			/* the slot now belongs to the awakened thread */
			mqueue->imq_msgcount++;
			extra--;
		}
	}

	mqueue->imq_qlimit = qlimit;
	imq_unlock(mqueue);
	splx(spl);
}

/*
 *	Routine:	ipc_mqueue_set_seqno
 *	Purpose:
 *		Overwrite the sequence number of an mqueue; the store is
 *		made with the mqueue locked at splsched.
 *	Conditions:
 *		Caller holds a reference to the queue's containing object.
 */
void
ipc_mqueue_set_seqno(
	ipc_mqueue_t		mqueue,
	mach_port_seqno_t	seqno)
{
	spl_t spl = splsched();

	imq_lock(mqueue);
	mqueue->imq_seqno = seqno;
	imq_unlock(mqueue);

	splx(spl);
}


/*
 *	Routine:	ipc_mqueue_copyin
 *	Purpose:
 *		Convert a name in a space to a message queue.
 *	Conditions:
 *		Nothing locked.  If successful, the caller gets a ref for
 *		for the object.	This ref ensures the continued existence of
 *		the queue.
 *	Returns:
 *		MACH_MSG_SUCCESS	Found a message queue.
 *		MACH_RCV_INVALID_NAME	The space is dead.
 *		MACH_RCV_INVALID_NAME	The name doesn't denote a right.
 *		MACH_RCV_INVALID_NAME
 *			The denoted right is not receive or port set.
 *		MACH_RCV_IN_SET		Receive right is a member of a set.
 */

mach_msg_return_t
ipc_mqueue_copyin(
	ipc_space_t		space,
	mach_port_name_t	name,
	ipc_mqueue_t		*mqueuep,
	ipc_object_t		*objectp)
{
	ipc_entry_t ent;
	ipc_object_t obj;
	ipc_mqueue_t mq;

	is_read_lock(space);

	if (!is_active(space))
		goto invalid_name;

	ent = ipc_entry_lookup(space, name);
	if (ent == IE_NULL)
		goto invalid_name;

	obj = ent->ie_object;

	if (ent->ie_bits & MACH_PORT_TYPE_RECEIVE) {
		/* receive right: use the port's own message queue */
		ipc_port_t port = (ipc_port_t) obj;

		assert(port != IP_NULL);

		/* take the object lock before letting go of the space */
		ip_lock(port);
		assert(ip_active(port));
		assert(port->ip_receiver_name == name);
		assert(port->ip_receiver == space);
		is_read_unlock(space);

		mq = &port->ip_messages;
	} else if (ent->ie_bits & MACH_PORT_TYPE_PORT_SET) {
		/* port set: use the set's message queue */
		ipc_pset_t pset = (ipc_pset_t) obj;

		assert(pset != IPS_NULL);

		ips_lock(pset);
		assert(ips_active(pset));
		assert(pset->ips_local_name == name);
		is_read_unlock(space);

		mq = &pset->ips_messages;
	} else {
		goto invalid_name;
	}

	/*
	 *	Here the object is locked and active, the space is
	 *	unlocked, and mq is set.  Take the caller's reference
	 *	before dropping the object lock.
	 */
	io_reference(obj);
	io_unlock(obj);

	*objectp = obj;
	*mqueuep = mq;
	return MACH_MSG_SUCCESS;

invalid_name:
	/* every failure path still holds the space read lock */
	is_read_unlock(space);
	return MACH_RCV_INVALID_NAME;
}

## kern_event.c
/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 */
/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 *	@(#)kern_event.c       1.0 (3/31/2000)
 */
#include <stdint.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/sysproto.h>
#include <sys/user.h>
#include <sys/vnode_internal.h>
#include <string.h>
#include <sys/proc_info.h>
#include <sys/codesign.h>

#include <kern/lock.h>
#include <kern/clock.h>
#include <kern/thread_call.h>
#include <kern/sched_prim.h>
#include <kern/zalloc.h>
#include <kern/assert.h>

#include <libkern/libkern.h>
#include "net/net_str_id.h"

#include <mach/task.h>

#if VM_PRESSURE_EVENTS
#include <kern/vm_pressure.h>
#endif

#if CONFIG_MEMORYSTATUS
#include <sys/kern_memorystatus.h>
#endif

MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

#define	KQ_EVENT	NULL


/*Plumbing needed for wait queue logging.*/
#define EVENT_MASK_BITS

/*
 * Local view of the first word of a wait queue, used only so that
 * wait_queue_is_valid() can inspect wq_type through a cast of kq->kq_wqs.
 * NOTE(review): presumably this must match the leading bitfield layout of
 * the kernel's own wait queue structure -- confirm against its definition.
 */
typedef struct my_wait_queue { // happy little wait queue
	unsigned long int
	/* boolean_t */	wq_type:2,		/* only public field */
					wq_fifo:1,		/* fifo wakeup policy? */
					wq_prepost:1,	/* waitq supports prepost? set only */
					wq_eventmask:((sizeof(long) * 8) - 4);
} myWaitQueue;

/* wq_type value (ignoring its low bit) that wait_queue_is_valid() accepts */
#define _WAIT_QUEUE_inited		0x2
/* true when wq_type, with its low bit masked off, equals _WAIT_QUEUE_inited */
#define wait_queue_is_valid(wq)	\
	(((wq)->wq_type & ~1) == _WAIT_QUEUE_inited)


static inline void kqlock(struct kqueue *kq);
static inline void kqunlock(struct kqueue *kq);

static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn);
static int kqlock2knoteusewait(struct kqueue *kq, struct knote *kn);
static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn);

static void kqueue_wakeup(struct kqueue *kq, int closed);
static int kqueue_read(struct fileproc *fp, struct uio *uio,
    int flags, vfs_context_t ctx);
static int kqueue_write(struct fileproc *fp, struct uio *uio,
    int flags, vfs_context_t ctx);
static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
    vfs_context_t ctx);
static int kqueue_select(struct fileproc *fp, int which, void *wql,
    vfs_context_t ctx);
static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
	vfs_context_t ctx);
static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
extern int kqueue_stat(struct fileproc *fp, void  *ub, int isstat64,
	vfs_context_t ctx);

/*
 * File operations vector for kqueue descriptors (DTYPE_KQUEUE): maps each
 * generic fileops entry point onto the matching kqueue_* handler.
 */
static const struct fileops kqueueops = {
	.fo_type = DTYPE_KQUEUE,
	.fo_read = kqueue_read,
	.fo_write = kqueue_write,
	.fo_ioctl = kqueue_ioctl,
	.fo_select = kqueue_select,
	.fo_close = kqueue_close,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_drain = kqueue_drain,
};

static int kevent_internal(struct proc *p, int iskev64, user_addr_t changelist,
    int nchanges, user_addr_t eventlist, int nevents, int fd,
    user_addr_t utimeout, unsigned int flags, int32_t *retval);
static int kevent_copyin(user_addr_t *addrp, struct kevent64_s *kevp,
    struct proc *p, int iskev64);
static int kevent_copyout(struct kevent64_s *kevp, user_addr_t *addrp,
    struct proc *p, int iskev64);
char * kevent_description(struct kevent64_s *kevp, char *s, size_t n);

static int kevent_callback(struct kqueue *kq, struct kevent64_s *kevp,
    void *data);
static void kevent_continue(struct kqueue *kq, void *data, int error);
static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
static int kqueue_process(struct kqueue *kq, kevent_callback_t callback,
    void *data, int *countp, struct proc *p);
static int kqueue_begin_processing(struct kqueue *kq);
static void kqueue_end_processing(struct kqueue *kq);
static int knote_process(struct knote *kn, kevent_callback_t callback,
    void *data, struct kqtailq *inprocessp, struct proc *p);
static void knote_put(struct knote *kn);
static int knote_fdpattach(struct knote *kn, struct filedesc *fdp,
    struct proc *p);
static void knote_drop(struct knote *kn, struct proc *p);
static void knote_activate(struct knote *kn, int);
static void knote_deactivate(struct knote *kn);
static void knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);
static struct knote *knote_alloc(void);
static void knote_free(struct knote *kn);

/*
 * Filter ops shared by the file-backed filters (see the sysfilt_ops slots
 * that point here).  Only attach is provided; it forwards to the file's
 * own kqfilter operation.
 */
static int filt_fileattach(struct knote *kn);
static struct filterops file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
};

/*
 * Filter ops for a knote that watches another kqueue: the event routine
 * reports that kqueue's pending count, detach unhooks from its kq_sel list.
 */
static void filt_kqdetach(struct knote *kn);
static int filt_kqueue(struct knote *kn, long hint);
static struct filterops kqread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
};

/* placeholder for not-yet-implemented filters */
/* attach always fails with ENOTSUP; fills unused/unsupported table slots */
static int filt_badattach(struct knote *kn);
static struct filterops bad_filtops = {
	.f_attach = filt_badattach,
};

/* Process filter (EVFILT_PROC): knotes hang off the target proc's p_klist */
static int filt_procattach(struct knote *kn);
static void filt_procdetach(struct knote *kn);
static int filt_proc(struct knote *kn, long hint);
static struct filterops proc_filtops = {
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
};

#if VM_PRESSURE_EVENTS
/* VM pressure filter (EVFILT_VM); only built when VM_PRESSURE_EVENTS is set */
static int filt_vmattach(struct knote *kn);
static void filt_vmdetach(struct knote *kn);
static int filt_vm(struct knote *kn, long hint);
static struct filterops vm_filtops = {
	.f_attach = filt_vmattach,
	.f_detach = filt_vmdetach,
	.f_event = filt_vm,
};
#endif /* VM_PRESSURE_EVENTS */

#if CONFIG_MEMORYSTATUS
extern struct filterops memorystatus_filtops;
#endif /* CONFIG_MEMORYSTATUS */

extern struct filterops fs_filtops;

extern struct filterops sig_filtops;

/*
 * Timer filter (EVFILT_TIMER).  Besides attach/detach/event it supplies a
 * touch routine so an existing timer knote can be updated with new user
 * data and have its pop count reset when the event is picked up.
 */
static int filt_timerattach(struct knote *kn);
static void filt_timerdetach(struct knote *kn);
static int filt_timer(struct knote *kn, long hint);
static void filt_timertouch(struct knote *kn, struct kevent64_s *kev,
    long type);
static struct filterops timer_filtops = {
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_timer,
	.f_touch = filt_timertouch,
};

/* Helpers */
static void filt_timerexpire(void *knx, void *param1);
static int filt_timervalidate(struct knote *kn);
static void filt_timerupdate(struct knote *kn);
static void filt_timercancel(struct knote *kn);

#define	TIMER_RUNNING		0x1
#define	TIMER_CANCELWAIT	0x2

static lck_mtx_t _filt_timerlock;
static void filt_timerlock(void);
static void filt_timerunlock(void);

static zone_t knote_zone;

/*
 * Fold bits 8 and up of val onto its low bits, then clamp with mask.
 * Every use of val is parenthesized so an expression argument such as
 * (a | b) is shifted as a whole rather than only its last operand.
 */
#define	KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))

#if 0
extern struct filterops aio_filtops;
#endif

/* Mach portset filter */
extern struct filterops machport_filtops;

/*
 * User filter (EVFILT_USER): events triggered from user space via
 * NOTE_TRIGGER; nothing in the kernel is attached to these knotes.
 */
static int filt_userattach(struct knote *kn);
static void filt_userdetach(struct knote *kn);
static int filt_user(struct knote *kn, long hint);
static void filt_usertouch(struct knote *kn, struct kevent64_s *kev,
    long type);
static struct filterops user_filtops = {
	.f_attach = filt_userattach,
	.f_detach = filt_userdetach,
	.f_event = filt_user,
	.f_touch = filt_usertouch,
};

/*
 * Table for all system-defined filters.
 */
/*
 * Entries are positional: each slot is labelled with the EVFILT_* filter
 * it serves, so the order must not change.  Filters that are compiled out
 * or not implemented point at bad_filtops, whose attach returns ENOTSUP.
 * NOTE(review): presumably indexed by a value derived from the filter
 * number -- confirm at the lookup site.
 */
static struct filterops *sysfilt_ops[] = {
	&file_filtops,			/* EVFILT_READ */
	&file_filtops,			/* EVFILT_WRITE */
#if 0
	&aio_filtops,			/* EVFILT_AIO */
#else
	&bad_filtops,			/* EVFILT_AIO */
#endif
	&file_filtops,			/* EVFILT_VNODE */
	&proc_filtops,			/* EVFILT_PROC */
	&sig_filtops,			/* EVFILT_SIGNAL */
	&timer_filtops,			/* EVFILT_TIMER */
	&machport_filtops,		/* EVFILT_MACHPORT */
	&fs_filtops,			/* EVFILT_FS */
	&user_filtops,			/* EVFILT_USER */
	&bad_filtops,			/* unused */
#if VM_PRESSURE_EVENTS
	&vm_filtops,			/* EVFILT_VM */
#else
	&bad_filtops,			/* EVFILT_VM */
#endif
	&file_filtops,			/* EVFILT_SOCK */
#if CONFIG_MEMORYSTATUS
	&memorystatus_filtops,  /* EVFILT_MEMORYSTATUS */
#else
	&bad_filtops,			/* EVFILT_MEMORYSTATUS */
#endif
};

/*
 * kqueue/note lock attributes and implementations
 *
 *	kqueues have locks, while knotes have use counts
 *	Most of the knote state is guarded by the object lock.
 *	the knote "inuse" count and status use the kqueue lock.
 */
lck_grp_attr_t * kq_lck_grp_attr;
lck_grp_t * kq_lck_grp;
lck_attr_t * kq_lck_attr;

/* Acquire the kqueue's spin lock (kq_lock). */
static inline void
kqlock(struct kqueue *kq)
{
	lck_spin_lock(&kq->kq_lock);
}

/* Release the kqueue's spin lock (kq_lock). */
static inline void
kqunlock(struct kqueue *kq)
{
	lck_spin_unlock(&kq->kq_lock);
}

/*
 * Convert a kq lock to a knote use reference.
 *
 *	If the knote is being dropped, we can't get
 *	a use reference, so just return with it
 *	still locked.
 *	- kq locked at entry
 *	- unlock on exit if we get the use reference
 */
static int
kqlock2knoteuse(struct kqueue *kq, struct knote *kn)
{
	int got_ref = ((kn->kn_status & KN_DROPPING) == 0);

	/*
	 * Only a knote that is not being dropped may be pinned; in that
	 * case the use count replaces the kq lock.  Otherwise the lock
	 * stays held for the caller.
	 */
	if (got_ref) {
		kn->kn_inuse++;
		kqunlock(kq);
	}
	return (got_ref);
}

/*
 * Convert a kq lock to a knote use reference,
 * but wait for attach and drop events to complete.
 *
 *	If the knote is being dropped or attached, we
 *	can't get a use reference; block until woken
 *	and return 0.
 *	- kq locked at entry
 *	- kq always unlocked on exit
 */
static int
kqlock2knoteusewait(struct kqueue *kq, struct knote *kn)
{
	/* common case: knote is stable, trade the lock for a use count */
	if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0) {
		kn->kn_inuse++;
		kqunlock(kq);
		return (1);
	}

	/*
	 * A drop or attach is in flight: register as a waiter on the
	 * knote's status word, release the lock and sleep until woken.
	 */
	kn->kn_status |= KN_USEWAIT;
	if (!wait_queue_is_valid((myWaitQueue*)kq->kq_wqs)) {
		printf("\nAbout to call wait_queue_assert_wait with invalid wait queue from kqlock2knoteusewait.\n");
	}
	wait_queue_assert_wait((wait_queue_t)kq->kq_wqs,
	    &kn->kn_status, THREAD_UNINT, 0);
	kqunlock(kq);
	thread_block(THREAD_CONTINUE_NULL);
	return (0);
}

/*
 * Convert from a knote use reference back to kq lock.
 *
 *	Drop a use reference and wake any waiters if
 *	this is the last one.
 *
 *	The exit return indicates if the knote is
 *	still alive - but the kqueue lock is taken
 *	unconditionally.
 */
static int
knoteuse2kqlock(struct kqueue *kq, struct knote *kn)
{
	kqlock(kq);

	kn->kn_inuse--;
	if (kn->kn_inuse == 0) {
		/* last user out: attach (if any) is finished */
		kn->kn_status &= ~KN_ATTACHING;

		/* release anybody sleeping on the status word */
		if (kn->kn_status & KN_USEWAIT) {
			kn->kn_status &= ~KN_USEWAIT;
			wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs,
			    &kn->kn_status, THREAD_AWAKENED);
		}
	}

	/* non-zero while nobody has started dropping the knote */
	return ((kn->kn_status & KN_DROPPING) == 0);
}

/*
 * Convert a kq lock to a knote drop reference.
 *
 *	If the knote is in use, wait for the use count
 *	to subside.  We first mark our intention to drop
 *	it - keeping other users from "piling on."
 *	If we are too late, we have to wait for the
 *	other drop to complete.
 *
 *	- kq locked at entry
 *	- always unlocked on exit.
 *	- caller can't hold any locks that would prevent
 *	  the other dropper from completing.
 */
static int
kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
{
	/* we own the drop only if no drop or attach was already under way */
	int oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);

	/* mark the intent first so no new users can pile on */
	kn->kn_status |= KN_DROPPING;

	if (oktodrop && kn->kn_inuse == 0) {
		/* nobody is using it: the caller may drop right away */
		kqunlock(kq);
		return (oktodrop);
	}

	/*
	 * Either current users must drain or another drop/attach must
	 * finish: sleep on the status word until woken.
	 */
	kn->kn_status |= KN_USEWAIT;
	if (!wait_queue_is_valid((myWaitQueue*)kq->kq_wqs)) {
		printf("\nAbout to call wait_queue_assert_wait with invalid wait queue from kqlock2knotedrop.\n");
	}
	wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kn->kn_status,
	    THREAD_UNINT, 0);
	kqunlock(kq);
	thread_block(THREAD_CONTINUE_NULL);
	return (oktodrop);
}

/*
 * Release a knote use count reference.
 */
static void
knote_put(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	kqlock(kq);
	if (--kn->kn_inuse == 0) {
		if ((kn->kn_status & KN_USEWAIT) != 0) {
			kn->kn_status &= ~KN_USEWAIT;
			wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs,
			    &kn->kn_status, THREAD_AWAKENED);
		}
	}
	kqunlock(kq);
}

/*
 * Attach routine for file-backed filters: hand the knote to the
 * kqfilter operation of the file it refers to, in the current context.
 */
static int
filt_fileattach(struct knote *kn)
{
	int rc;

	rc = fo_kqfilter(kn->kn_fp, kn, vfs_context_current());
	return (rc);
}

#define	f_flag f_fglob->fg_flag
#define	f_msgcount f_fglob->fg_msgcount
#define	f_cred f_fglob->fg_cred
#define	f_ops f_fglob->fg_ops
#define	f_offset f_fglob->fg_offset
#define	f_data f_fglob->fg_data

/*
 * Detach a knote that was watching another kqueue: remove it from that
 * kqueue's kq_sel note list under the watched kqueue's lock.
 */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *watched;

	watched = (struct kqueue *)kn->kn_fp->f_data;

	kqlock(watched);
	KNOTE_DETACH(&watched->kq_sel.si_note, kn);
	kqunlock(watched);
}

/*ARGSUSED*/
/*
 * Event routine for a knote watching another kqueue: publish that
 * kqueue's kq_count in kn_data and fire when it is positive.
 */
static int
filt_kqueue(struct knote *kn, __unused long hint)
{
	struct kqueue *watched = (struct kqueue *)kn->kn_fp->f_data;

	kn->kn_data = watched->kq_count;
	if (kn->kn_data > 0)
		return (1);
	return (0);
}

/*
 * Attach a knote to the process named by kn_id.
 *
 * Returns ENOTSUP for the NOTE_TRACK/NOTE_TRACKERR/NOTE_CHILD flags,
 * ESRCH when the process cannot be found, EACCES when exit status is
 * requested by a caller that is neither the parent nor the original
 * parent of a traced process, and 0 on success.
 */
static int
filt_procattach(struct knote *kn)
{
	const int exit_status_bits = NOTE_EXIT | NOTE_EXITSTATUS;
	struct proc *target;

	assert(PID_MAX < NOTE_PDATAMASK);

	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0)
		return (ENOTSUP);

	target = proc_find(kn->kn_id);
	if (target == NULL)
		return (ESRCH);

	/* exit status is only handed to a (possibly displaced) parent */
	if ((kn->kn_sfflags & exit_status_bits) == exit_status_bits) {
		pid_t self = proc_selfpid();
		int is_parent = (target->p_ppid == self);
		int is_waiting_parent =
		    !is_parent &&
		    (target->p_lflag & P_LTRACED) != 0 &&
		    (target->p_oppid == self);

		if (!is_parent && !is_waiting_parent) {
			proc_rele(target);
			return (EACCES);
		}
	}

	proc_klist_lock();

	kn->kn_flags |= EV_CLEAR;		/* automatically set */
	kn->kn_ptr.p_proc = target;		/* store the proc handle */
	KNOTE_ATTACH(&target->p_klist, kn);

	proc_klist_unlock();

	/* drop the reference taken by proc_find() */
	proc_rele(target);

	return (0);
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  In that case,
 * the pointer to the process will have already been nulled out.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *target;

	proc_klist_lock();

	/* a NULL handle means there is nothing left to unlink from */
	target = kn->kn_ptr.p_proc;
	if (target != PROC_NULL) {
		kn->kn_ptr.p_proc = PROC_NULL;
		KNOTE_DETACH(&target->p_klist, kn);
	}

	proc_klist_unlock();
}

/*
 * Event routine for the process filter.
 *
 * With hint == 0 this is only a query: it reports whether any event
 * flags are already recorded in kn_fflags.  With a non-zero hint the
 * control bits (NOTE_PCTRLMASK) name the event and, for NOTE_EXIT, the
 * NOTE_PDATAMASK bits carry data that is copied into kn_data when the
 * knote asked for NOTE_EXITSTATUS.
 *
 * Returns non-zero when kn_fflags is non-zero on exit, except that a
 * NOTE_EXIT suppressed for a reparented process returns 0 immediately.
 */
static int
filt_proc(struct knote *kn, long hint)
{
	/*
	 * Note: a lot of bits in hint may be obtained from the knote
	 * To free some of those bits, see <rdar://problem/12592988> Freeing up
	 * bits in hint for filt_proc
	 */
	/* hint is 0 when called from above */
	if (hint != 0) {
		u_int event;

		/* ALWAYS CALLED WITH proc_klist_lock when (hint != 0) */

		/*
		 * mask off extra data
		 */
		event = (u_int)hint & NOTE_PCTRLMASK;

		/*
		 * termination lifecycle events can happen while a debugger
		 * has reparented a process, in which case notifications
		 * should be quashed except to the tracing parent. When
		 * the debugger reaps the child (either via wait4(2) or
		 * process exit), the child will be reparented to the original
		 * parent and these knotes re-fired.
		 */
		if (event & NOTE_EXIT) {
			if ((kn->kn_ptr.p_proc->p_oppid != 0)
				&& (kn->kn_kq->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
				/*
				 * This knote is not for the current ptrace(2) parent, ignore.
				 */
				return 0;
			}
		}

		/*
		 * if the user is interested in this event, record it.
		 */
		if (kn->kn_sfflags & event)
			kn->kn_fflags |= event;

		/*
		 * Mark the knote EOF and one-shot on NOTE_REAP, or on
		 * NOTE_EXIT when the knote did not also ask for NOTE_REAP.
		 */
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
		if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		}
#pragma clang diagnostic pop

		/* build kn_data for an exit event from the requested details */
		if (event == NOTE_EXIT) {
			kn->kn_data = 0;
			if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
				kn->kn_fflags |= NOTE_EXITSTATUS;
				kn->kn_data |= (hint & NOTE_PDATAMASK);
			}
			if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
				kn->kn_fflags |= NOTE_EXIT_DETAIL;
				if ((kn->kn_ptr.p_proc->p_lflag &
				    P_LTERM_DECRYPTFAIL) != 0) {
					kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
				}
				if ((kn->kn_ptr.p_proc->p_lflag &
				    P_LTERM_JETSAM) != 0) {
					kn->kn_data |= NOTE_EXIT_MEMORY;
					/* translate the jetsam reason into its NOTE_EXIT_MEMORY_* bit */
					switch (kn->kn_ptr.p_proc->p_lflag &
					    P_JETSAM_MASK) {
						case P_JETSAM_VMPAGESHORTAGE:
							kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
							break;
						case P_JETSAM_VMTHRASHING:
							kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
							break;
						case P_JETSAM_VNODE:
							kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
							break;
						case P_JETSAM_HIWAT:
							kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
							break;
						case P_JETSAM_PID:
							kn->kn_data |= NOTE_EXIT_MEMORY_PID;
							break;
						case P_JETSAM_IDLEEXIT:
							kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
							break;
					}
				}
				if ((kn->kn_ptr.p_proc->p_csflags &
				    CS_KILLED) != 0) {
					kn->kn_data |= NOTE_EXIT_CSERROR;
				}
			}
		}

	}

	/* atomic check, no locking need when called from above */
	return (kn->kn_fflags != 0);
}

#if VM_PRESSURE_EVENTS
/*
 * Virtual memory kevents
 *
 * author: Matt Jacobson [matthew_jacobson@apple.com]
 */

/*
 * Attach a VM pressure knote by registering it with the VM pressure
 * layer; returns whatever vm_knote_register() returns.
 */
static int
filt_vmattach(struct knote *kn)
{
	int rc;

	/*
	 * EV_CLEAR: the note is reset once it has been delivered to the
	 * client; continued pressure raises a fresh alert.
	 */
	kn->kn_flags |= EV_CLEAR;

	rc = vm_knote_register(kn);
	return (rc);
}

/* Detach a VM pressure knote by unregistering it via vm_knote_unregister(). */
static void
filt_vmdetach(struct knote *kn)
{
	vm_knote_unregister(kn);
}

static int
filt_vm(struct knote *kn, long hint)
{
	/* hint == 0 means this is just an alive? check (always true) */
	if (hint != 0) {
		const pid_t pid = (pid_t)hint;
		if ((kn->kn_sfflags & NOTE_VM_PRESSURE) &&
		    (kn->kn_kq->kq_p->p_pid == pid)) {
			kn->kn_fflags |= NOTE_VM_PRESSURE;
		}
	}

	return (kn->kn_fflags != 0);
}
#endif /* VM_PRESSURE_EVENTS */

/*
 * filt_timervalidate - process data from user
 *
 *	Converts to either interval or deadline format.
 *
 *	The saved-data field in the knote contains the
 *	time value.  The saved filter-flags indicates
 *	the unit of measurement.
 *
 *	After validation, either the saved-data field
 *	contains the interval in absolute time, or ext[0]
 *	contains the expected deadline. If that deadline
 *	is in the past, ext[0] is 0.
 *
 *	Returns EINVAL for unrecognized units of time.
 *
 *	Timer filter lock is held.
 *
 */
static int
filt_timervalidate(struct knote *kn)
{
	uint64_t nsec_per_unit;
	uint64_t abstime = 0;
	clock_sec_t cal_secs;
	clock_nsec_t cal_nsecs;
	uint64_t now;

	/* pick the nanosecond scale for the unit the caller supplied */
	switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS)) {
	case 0: /* milliseconds (default) */
		nsec_per_unit = NSEC_PER_SEC / 1000;
		break;
	case NOTE_NSECONDS:
		nsec_per_unit = 1;
		break;
	case NOTE_USECONDS:
		nsec_per_unit = NSEC_PER_USEC;
		break;
	case NOTE_SECONDS:
		nsec_per_unit = NSEC_PER_SEC;
		break;
	default:
		/* more than one unit flag was given */
		return (EINVAL);
	}

	/* the optional leeway in kn_ext[1] uses the same unit; convert it */
	if (kn->kn_sfflags & NOTE_LEEWAY) {
		nanoseconds_to_absolutetime(
		    (uint64_t)kn->kn_ext[1] * nsec_per_unit, &abstime);
		kn->kn_ext[1] = abstime;
	}

	/* convert the user's time value before the fields are reset */
	nanoseconds_to_absolutetime((uint64_t)kn->kn_sdata * nsec_per_unit,
	    &abstime);

	kn->kn_ext[0] = 0;
	kn->kn_sdata = 0;

	if ((kn->kn_sfflags & NOTE_ABSOLUTE) == 0) {
		/* relative timer: keep the interval in kn_sdata */
		kn->kn_sdata = abstime;
		return (0);
	}

	/* absolute timer: compare against the current calendar time */
	clock_get_calendar_nanotime(&cal_secs, &cal_nsecs);
	nanoseconds_to_absolutetime((uint64_t)cal_secs * NSEC_PER_SEC +
	    cal_nsecs, &now);

	if (abstime < now) {
		/* time has already passed */
		kn->kn_ext[0] = 0;
	} else {
		clock_absolutetime_interval_to_deadline(abstime - now,
		    &kn->kn_ext[0]);
	}

	return (0);
}

/*
 * filt_timerupdate - compute the next deadline
 *
 * 	Repeating timers store their interval in kn_sdata. Absolute
 * 	timers have already calculated the deadline, stored in ext[0].
 *
 * 	On return, the next deadline (or zero if no deadline is needed)
 * 	is stored in kn_ext[0].
 *
 * 	Timer filter lock is held.
 */
static void
filt_timerupdate(struct knote *kn)
{
	/* no interval: whatever deadline is in kn_ext[0] stands as is */
	if (kn->kn_sdata == 0)
		return;

	if (kn->kn_ext[0] != 0) {
		/*
		 * Already fired once: step from the last intended
		 * deadline.  An already-expired result is left for the
		 * thread call layer to deal with.
		 */
		kn->kn_ext[0] += kn->kn_sdata;
		return;
	}

	/* first arming: one interval from now */
	clock_absolutetime_interval_to_deadline(kn->kn_sdata,
	    &kn->kn_ext[0]);
}

/*
 * filt_timerexpire - the timer callout routine
 *
 * Just propagate the timer event into the knote
 * filter routine (by going through the knote
 * synchronization point).  Pass a hint to
 * indicate this is a real event, not just a
 * query from above.
 */
static void
filt_timerexpire(void *knx, __unused void *spare)
{
	struct klist timer_list;
	struct knote *kn = knx;

	filt_timerlock();

	/* the callout has fired, so it is no longer pending */
	kn->kn_hookid &= ~TIMER_RUNNING;

	/* no "object" for timers, so fake a list */
	SLIST_INIT(&timer_list);
	SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
	KNOTE(&timer_list, 1);

	/* if someone is waiting for timer to pop */
	if (kn->kn_hookid & TIMER_CANCELWAIT) {
		struct kqueue *kq = kn->kn_kq;
		wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kn->kn_hook,
		    THREAD_AWAKENED);
		/*
		 * The wait is over.  Clear the flag so that a timer
		 * re-armed by the canceller is not mistaken for one that
		 * is still being cancelled: filt_timer() refuses to re-arm
		 * a repeating timer while TIMER_CANCELWAIT is set.
		 */
		kn->kn_hookid &= ~TIMER_CANCELWAIT;
	}

	filt_timerunlock();
}

/*
 * Cancel a running timer (or wait for the pop).
 * Timer filter lock is held.
 */
static void
filt_timercancel(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	thread_call_t callout = kn->kn_hook;
	boolean_t cancelled;

	/* nothing pending, nothing to cancel */
	if ((kn->kn_hookid & TIMER_RUNNING) == 0)
		return;

	/* cancel the callout if we can */
	cancelled = thread_call_cancel(callout);
	if (cancelled) {
		kn->kn_hookid &= ~TIMER_RUNNING;
		return;
	}

	/*
	 * Too late to cancel: flag that we are waiting, then sleep with
	 * the timer lock released until the expire routine wakes us.
	 */
	kn->kn_hookid |= TIMER_CANCELWAIT;
	if (!wait_queue_is_valid((myWaitQueue*)kq->kq_wqs)) {
		printf("\nAbout to call wait_queue_assert_wait with invalid wait queue from filt_timercancel.\n");
	}
	wait_queue_assert_wait((wait_queue_t)kq->kq_wqs,
	    &kn->kn_hook, THREAD_UNINT, 0);
	filt_timerunlock();
	thread_block(THREAD_CONTINUE_NULL);
	filt_timerlock();
	assert((kn->kn_hookid & TIMER_RUNNING) == 0);
}

/*
 * Allocate a thread call for the knote's lifetime, and kick off the timer.
 */
/*
 * Returns ENOMEM when no thread call can be allocated, the error from
 * filt_timervalidate() when the user's time data is rejected (the thread
 * call is freed again in that case), and 0 otherwise.
 */
static int
filt_timerattach(struct knote *kn)
{
	thread_call_t callout;
	int error;

	callout = thread_call_allocate(filt_timerexpire, kn);
	if (NULL == callout)
		return (ENOMEM);

	filt_timerlock();
	error = filt_timervalidate(kn);
	if (error != 0) {
		filt_timerunlock();
		/*
		 * The callout was never stored in the knote, so detach
		 * will not see it: release it here or it is leaked.
		 */
		thread_call_free(callout);
		return (error);
	}

	kn->kn_hook = (void*)callout;
	kn->kn_hookid = 0;

	/* absolute=EV_ONESHOT */
	if (kn->kn_sfflags & NOTE_ABSOLUTE)
		kn->kn_flags |= EV_ONESHOT;

	filt_timerupdate(kn);
	if (kn->kn_ext[0]) {
		/* a real deadline exists: arm the callout */
		kn->kn_flags |= EV_CLEAR;
		unsigned int timer_flags = 0;
		if (kn->kn_sfflags & NOTE_CRITICAL)
			timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
		else if (kn->kn_sfflags & NOTE_BACKGROUND)
			timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
		else
			timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;

		if (kn->kn_sfflags & NOTE_LEEWAY)
			timer_flags |= THREAD_CALL_DELAY_LEEWAY;

		thread_call_enter_delayed_with_leeway(callout, NULL,
				kn->kn_ext[0], kn->kn_ext[1], timer_flags);

		kn->kn_hookid |= TIMER_RUNNING;
	} else {
		/* fake immediate */
		kn->kn_data = 1;
	}

	filt_timerunlock();
	return (0);
}

/*
 * Shut down the timer if it's running, and free the callout.
 */
static void
filt_timerdetach(struct knote *kn)
{
	thread_call_t pending;

	/* stop (or wait out) the timer while holding the timer lock */
	filt_timerlock();
	pending = (thread_call_t)kn->kn_hook;
	filt_timercancel(kn);
	filt_timerunlock();

	/* the callout is freed only after the lock has been dropped */
	thread_call_free(pending);
}


/*
 * Event routine for timer knotes.
 *
 * hint == 0: user query; reports whether any pops are pending in
 * kn_data, taking the timer lock itself.
 * hint != 0: a real timer pop; counts it, re-arms a repeating timer
 * when allowed, and always returns 1.
 */
static int
filt_timer(struct knote *kn, long hint)
{
	int result;
	unsigned int timer_flags;

	if (!hint) {
		/* user-query */
		filt_timerlock();
		result = (kn->kn_data != 0);
		filt_timerunlock();
		return (result);
	}

	/* real timer pop -- timer lock held by filt_timerexpire */
	kn->kn_data++;

	/* never re-arm while a cancel is waiting, or for one-shot knotes */
	if ((kn->kn_hookid & TIMER_CANCELWAIT) != 0 ||
	    (kn->kn_flags & EV_ONESHOT) != 0)
		return (1);

	/* evaluate next time to fire */
	filt_timerupdate(kn);
	if (kn->kn_ext[0] == 0)
		return (1);

	/* keep the callout and re-arm */
	if (kn->kn_sfflags & NOTE_CRITICAL)
		timer_flags = THREAD_CALL_DELAY_USER_CRITICAL;
	else if (kn->kn_sfflags & NOTE_BACKGROUND)
		timer_flags = THREAD_CALL_DELAY_USER_BACKGROUND;
	else
		timer_flags = THREAD_CALL_DELAY_USER_NORMAL;

	if (kn->kn_sfflags & NOTE_LEEWAY)
		timer_flags |= THREAD_CALL_DELAY_LEEWAY;

	thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
			kn->kn_ext[0], kn->kn_ext[1], timer_flags);

	kn->kn_hookid |= TIMER_RUNNING;

	return (1);
}


/*
 * filt_timertouch - update knote with new user input
 *
 * Cancel and restart the timer based on new user data. When
 * the user picks up a knote, clear the count of how many timer
 * pops have gone off (in kn_data).
 */
static void
filt_timertouch(struct knote *kn, struct kevent64_s *kev, long type)
{
	int error;
	filt_timerlock();

	switch (type) {
	case EVENT_REGISTER:
		/* cancel current call */
		filt_timercancel(kn);

		/* recalculate deadline */
		kn->kn_sdata = kev->data;
		kn->kn_sfflags = kev->fflags;
		kn->kn_ext[0] = kev->ext[0];
		kn->kn_ext[1] = kev->ext[1];

		error = filt_timervalidate(kn);
		if (error) {
			/* no way to report error, so mark it in the knote */
			kn->kn_flags |= EV_ERROR;
			kn->kn_data = error;
			break;
		}

		/* start timer if necessary */
		filt_timerupdate(kn);

		if (kn->kn_ext[0]) {
			unsigned int timer_flags = 0;
			if (kn->kn_sfflags & NOTE_CRITICAL)
				timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
			else if (kn->kn_sfflags & NOTE_BACKGROUND)
				timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
			else
				timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;

			if (kn->kn_sfflags & NOTE_LEEWAY)
				timer_flags |= THREAD_CALL_DELAY_LEEWAY;

			thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
					kn->kn_ext[0], kn->kn_ext[1], timer_flags);

			kn->kn_hookid |= TIMER_RUNNING;
		} else {
			/* pretend the timer has fired */
			kn->kn_data = 1;
		}

		break;

	case EVENT_PROCESS:
		/* reset the timer pop count in kn_data */
		*kev = kn->kn_kevent;
		kev->ext[0] = 0;
		kn->kn_data = 0;
		if (kn->kn_flags & EV_CLEAR)
			kn->kn_fflags = 0;
		break;
	default:
		panic("%s: - invalid type (%ld)", __func__, type);
		break;
	}

	filt_timerunlock();
}

/* Take the single mutex (_filt_timerlock) shared by all timer knotes. */
static void
filt_timerlock(void)
{
	lck_mtx_lock(&_filt_timerlock);
}

/* Release the mutex (_filt_timerlock) shared by all timer knotes. */
static void
filt_timerunlock(void)
{
	lck_mtx_unlock(&_filt_timerlock);
}

/*
 * Attach a user-filter knote.  kn_hookid serves as the "triggered"
 * flag: it starts at 1 when NOTE_TRIGGER is already set, else 0.
 * Always returns 0.
 */
static int
filt_userattach(struct knote *kn)
{
	/* EVFILT_USER knotes are not attached to anything in the kernel */
	kn->kn_hook = NULL;
	kn->kn_hookid = (kn->kn_fflags & NOTE_TRIGGER) ? 1 : 0;
	return (0);
}

/* Detach a user-filter knote: intentionally a no-op. */
static void
filt_userdetach(__unused struct knote *kn)
{
	/* EVFILT_USER knotes are not attached to anything in the kernel */
}

/*
 * Event routine for user-filter knotes: returns kn_hookid, which the
 * attach and touch routines set to 1 on NOTE_TRIGGER.
 */
static int
filt_user(struct knote *kn, __unused long hint)
{
	return (kn->kn_hookid);
}

/*
 * Touch routine for user-filter knotes.
 *
 * EVENT_REGISTER: latch a trigger request, combine the caller's flags
 * with the saved ones as directed by the NOTE_FF* control bits, and
 * save the caller's data.
 * EVENT_PROCESS: copy the event out with the saved flags and data, and
 * reset the trigger state for EV_CLEAR knotes.
 * Any other type panics.
 */
static void
filt_usertouch(struct knote *kn, struct kevent64_s *kev, long type)
{
	uint32_t ffctrl;

	if (type == EVENT_REGISTER) {
		if (kev->fflags & NOTE_TRIGGER) {
			kn->kn_hookid = 1;
		}

		/* split the control bits from the flag payload */
		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
		kev->fflags &= NOTE_FFLAGSMASK;

		switch (ffctrl) {
		case NOTE_FFCOPY:
			kn->kn_sfflags = kev->fflags;
			break;
		case NOTE_FFOR:
			OSBitOrAtomic(kev->fflags, &kn->kn_sfflags);
			break;
		case NOTE_FFAND:
			OSBitAndAtomic(kev->fflags, &kn->kn_sfflags);
			break;
		case NOTE_FFNOP:
			break;
		}

		kn->kn_sdata = kev->data;
	} else if (type == EVENT_PROCESS) {
		*kev = kn->kn_kevent;
		kev->fflags = (volatile UInt32)kn->kn_sfflags;
		kev->data = kn->kn_sdata;

		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_hookid = 0;
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
	} else {
		panic("%s: - invalid type (%ld)", __func__, type);
	}
}

/*
 * JMM - placeholder for not-yet-implemented filters
 */
/* Attach routine for unsupported filters: always fails with ENOTSUP. */
static int
filt_badattach(__unused struct knote *kn)
{
	return (ENOTSUP);
}

/*
 * kqueue_alloc - allocate and initialize a kqueue for process p
 *
 *	Allocates the kqueue structure and its wait queue set, zeroes the
 *	structure, initializes its spin lock and knote queue, and records
 *	the owning process.
 *
 *	Returns the new kqueue, or NULL if either allocation fails.
 *
 *	Regardless of success, a negative fd_knlistsize in the process'
 *	file descriptor table is reset to 0 to record that the process has
 *	had a kqueue.
 */
struct kqueue *
kqueue_alloc(struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct kqueue *kq;

	MALLOC_ZONE(kq, struct kqueue *, sizeof (struct kqueue), M_KQUEUE,
	    M_WAITOK);
	if (kq != NULL) {
		wait_queue_set_t wqs;

		wqs = wait_queue_set_alloc(SYNC_POLICY_FIFO |
		    SYNC_POLICY_PREPOST);
		if (wqs != NULL) {
			bzero(kq, sizeof (struct kqueue));
			lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
			TAILQ_INIT(&kq->kq_head);
			kq->kq_wqs = wqs;
			kq->kq_p = p;
		} else {
			FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
			/*
			 * Do not hand the freed memory back to the caller:
			 * callers test the result against NULL.
			 */
			kq = NULL;
		}
	}

	/* unlocked peek first, then re-check under the fd lock */
	if (fdp->fd_knlistsize < 0) {
		proc_fdlock(p);
		if (fdp->fd_knlistsize < 0)
			fdp->fd_knlistsize = 0;	/* this process has had a kq */
		proc_fdunlock(p);
	}

	return (kq);
}

/*
 * kqueue_dealloc - detach all knotes from a kqueue and free it
 *
 * 	We walk each list looking for knotes referencing
 *	this kqueue.  If we find one, we try to drop it.  But
 *	if we fail to get a drop reference, that will wait
 *	until it is dropped.  So, we can just restart again
 *	safe in the assumption that the list will eventually
 *	not contain any more references to this kqueue (either
 *	we dropped them all, or someone else did).
 *
 *	Assumes no new events are being added to the kqueue.
 *	Nothing locked on entry or exit.
 */
void
kqueue_dealloc(struct kqueue *kq)
{
	struct proc *p = kq->kq_p;
	struct filedesc *fdp = p->p_fd;
	struct knote *kn;
	int i;

	/*
	 * Pass 1: the per-fd knote lists.  The lists are walked with the
	 * proc fd lock held; it is released around each drop attempt.
	 */
	proc_fdlock(p);
	for (i = 0; i < fdp->fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
		while (kn != NULL) {
			if (kq == kn->kn_kq) {
				/* take the kq lock before letting go of the fd lock */
				kqlock(kq);
				proc_fdunlock(p);
				/* drop it ourselves or wait */
				if (kqlock2knotedrop(kq, kn)) {
					kn->kn_fop->f_detach(kn);
					knote_drop(kn, p);
				}
				proc_fdlock(p);
				/*
				 * start over at beginning of list: the fd
				 * lock was released, so the list is re-read
				 * from fdp rather than continuing from kn
				 */
				kn = SLIST_FIRST(&fdp->fd_knlist[i]);
				continue;
			}
			kn = SLIST_NEXT(kn, kn_link);
		}
	}
	/*
	 * Pass 2: the hash buckets holding non-fd knotes, same protocol.
	 * A zero hash mask is treated as "no hash table".
	 */
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			while (kn != NULL) {
				if (kq == kn->kn_kq) {
					kqlock(kq);
					proc_fdunlock(p);
					/* drop it ourselves or wait */
					if (kqlock2knotedrop(kq, kn)) {
						kn->kn_fop->f_detach(kn);
						knote_drop(kn, p);
					}
					proc_fdlock(p);
					/* start over at beginning of list */
					kn = SLIST_FIRST(&fdp->fd_knhash[i]);
					continue;
				}
				kn = SLIST_NEXT(kn, kn_link);
			}
		}
	}
	proc_fdunlock(p);

	/*
	 * before freeing the wait queue set for this kqueue,
	 * make sure it is unlinked from all its containing (select) sets.
	 */
	wait_queue_unlink_all((wait_queue_t)kq->kq_wqs);
	wait_queue_set_free(kq->kq_wqs);
	lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
	FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
}

/*
 * kqueue_body - common implementation behind the kqueue() syscall
 *
 *	Allocates a file descriptor/fileproc pair (using the supplied
 *	fileproc allocator and its argument), creates a kqueue, and binds
 *	the two together.  The descriptor is marked close-on-exec.
 *
 *	On success the new descriptor is stored in *retval and 0 is
 *	returned.  Returns the falloc_withalloc() error if no descriptor
 *	could be allocated, or ENOMEM if the kqueue could not be created.
 */
int
kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
{
	struct fileproc *fp;
	struct kqueue *kq;
	int error;
	int fd;

	error = falloc_withalloc(p,
	    &fp, &fd, vfs_context_current(), fp_zalloc, cra);
	if (error != 0)
		return (error);

	if ((kq = kqueue_alloc(p)) == NULL) {
		/* give the just-reserved descriptor back */
		fp_free(p, fd, fp);
		return (ENOMEM);
	}

	/* wire the kqueue into the new file */
	fp->f_data = kq;
	fp->f_ops = &kqueueops;
	fp->f_flag = FREAD | FWRITE;

	/* publish the descriptor, close-on-exec */
	proc_fdlock(p);
	*fdflags(p, fd) |= UF_EXCLOSE;
	procfdtbl_releasefd(p, fd, NULL);
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);

	*retval = fd;
	return (0);
}

/*
 * kqueue - [syscall] create a new kqueue descriptor
 *
 *	Thin wrapper around kqueue_body() using the default fileproc
 *	allocator (fileproc_alloc_init) with no allocator argument.
 *	The syscall arguments structure is unused.
 */
int
kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
{
	return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
}

/*
 * kevent_copyin - fetch one kevent from user space
 *
 *	Reads a single event at *addrp into *kevp, converting from the
 *	caller's layout: a native kevent64_s when iskev64 is set,
 *	otherwise the 64-bit or 32-bit user kevent layout depending on
 *	the process.  For the converted layouts *kevp is zeroed first, so
 *	fields with no counterpart in the user structure read as zero.
 *
 *	On success *addrp is advanced past the structure just read and 0
 *	is returned; on failure the copyin() error is returned and *addrp
 *	is left alone.
 */
static int
kevent_copyin(user_addr_t *addrp, struct kevent64_s *kevp, struct proc *p,
    int iskev64)
{
	int nbytes;
	int rc;

	if (iskev64) {
		/* same layout in and out of the kernel: copy straight in */
		nbytes = sizeof (struct kevent64_s);
		rc = copyin(*addrp, (caddr_t)kevp, nbytes);
		if (rc != 0)
			return (rc);
	} else {
		bzero(kevp, sizeof (struct kevent64_s));

		if (IS_64BIT_PROCESS(p)) {
			struct user64_kevent src64;

			nbytes = sizeof (src64);
			rc = copyin(*addrp, (caddr_t)&src64, nbytes);
			if (rc != 0)
				return (rc);
			kevp->ident = src64.ident;
			kevp->filter = src64.filter;
			kevp->flags = src64.flags;
			kevp->fflags = src64.fflags;
			kevp->data = src64.data;
			kevp->udata = src64.udata;
		} else {
			struct user32_kevent src32;

			nbytes = sizeof (src32);
			rc = copyin(*addrp, (caddr_t)&src32, nbytes);
			if (rc != 0)
				return (rc);
			kevp->ident = (uintptr_t)src32.ident;
			kevp->filter = src32.filter;
			kevp->flags = src32.flags;
			kevp->fflags = src32.fflags;
			kevp->data = (intptr_t)src32.data;
			kevp->udata = CAST_USER_ADDR_T(src32.udata);
		}
	}

	*addrp += nbytes;
	return (0);
}

/*
 * kevent_copyout - deliver one kevent to user space
 *
 *	Writes *kevp to *addrp in the caller's layout: a native
 *	kevent64_s when iskev64 is set, otherwise the 64-bit or 32-bit
 *	user kevent layout depending on the process (narrowing fields
 *	with explicit casts where the layout is smaller).
 *
 *	On success *addrp is advanced past the structure just written and
 *	0 is returned; on failure the copyout() error is returned and
 *	*addrp is left alone.
 */
static int
kevent_copyout(struct kevent64_s *kevp, user_addr_t *addrp, struct proc *p,
    int iskev64)
{
	int nbytes;
	int rc;

	if (iskev64) {
		nbytes = sizeof (struct kevent64_s);
		rc = copyout((caddr_t)kevp, *addrp, nbytes);
	} else if (IS_64BIT_PROCESS(p)) {
		struct user64_kevent out64;

		nbytes = sizeof (out64);

		/*
		 * deal with the special case of a user-supplied
		 * value of (uintptr_t)-1.
		 */
		if (kevp->ident == (uintptr_t)-1)
			out64.ident = (uint64_t)-1LL;
		else
			out64.ident = (uint64_t)kevp->ident;

		out64.filter = kevp->filter;
		out64.flags = kevp->flags;
		out64.fflags = kevp->fflags;
		out64.data = (int64_t) kevp->data;
		out64.udata = kevp->udata;
		rc = copyout((caddr_t)&out64, *addrp, nbytes);
	} else {
		struct user32_kevent out32;

		nbytes = sizeof (out32);
		out32.ident = (uint32_t)kevp->ident;
		out32.filter = kevp->filter;
		out32.flags = kevp->flags;
		out32.fflags = kevp->fflags;
		out32.data = (int32_t)kevp->data;
		out32.udata = kevp->udata;
		rc = copyout((caddr_t)&out32, *addrp, nbytes);
	}

	if (rc != 0)
		return (rc);

	*addrp += nbytes;
	return (0);
}

/*
 * kevent_continue - continue a kevent syscall after blocking
 *
 *	assume we inherit a use count on the kq fileglob.
 */

/*
 * Finish a kevent syscall: release the use count held on the kqueue's
 * file, translate the internal result into the value user space sees,
 * and return to user mode through unix_syscall_return().
 *
 *	data is the struct _kevent saved by kevent_internal().
 *	ERESTART is reported as EINTR (the call is not restarted after a
 *	signal); EWOULDBLOCK is reported as success.  On success the
 *	number of events written out becomes the syscall return value.
 */
static void
kevent_continue(__unused struct kqueue *kq, void *data, int error)
{
	struct _kevent *args = (struct _kevent *)data;
	struct proc *p = current_proc();
	int32_t *retval = args->retval;
	int noutputs = args->eventout;

	fp_drop(p, args->fd, args->fp, 0);

	switch (error) {
	case ERESTART:
		/* don't restart after signals... */
		error = EINTR;
		break;
	case EWOULDBLOCK:
		error = 0;
		break;
	default:
		break;
	}

	if (error == 0)
		*retval = noutputs;
	unix_syscall_return(error);
}

/*
 * kevent - [syscall] register and wait for kernel events
 *
 */
int
kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
{
	/* forward the syscall arguments unchanged to the common path */
	return (kevent_internal(p,
	    0,		/* iskev64: caller uses the classic kevent layout */
	    uap->changelist,
	    uap->nchanges,
	    uap->eventlist,
	    uap->nevents,
	    uap->fd,
	    uap->timeout,
	    0, /* no flags from old kevent() call */
	    retval));
}

/*
 * kevent64 - [syscall] register and wait for kernel events, using the
 * kevent64_s event layout
 *
 *	Same as kevent() except that iskev64 is set and the caller's
 *	flags argument is passed through to kevent_internal().
 */
int
kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
{
	return (kevent_internal(p,
	    1,		/* iskev64: caller uses struct kevent64_s */
	    uap->changelist,
	    uap->nchanges,
	    uap->eventlist,
	    uap->nevents,
	    uap->fd,
	    uap->timeout,
	    uap->flags,
	    retval));
}

/*
 * kevent_internal - common implementation of kevent() and kevent64()
 *
 *	p		calling process
 *	iskev64		non-zero if the user buffers hold struct kevent64_s
 *	changelist	user address of nchanges events to register
 *	ueventlist	user address with room for nevents returned events
 *	fd		descriptor of the kqueue
 *	utimeout	user address of a relative timespec, or USER_ADDR_NULL
 *	flags		unused
 *	retval		receives the number of events written out
 *
 *	Steps: convert the optional timeout to an absolute uptime, take a
 *	use count on the kqueue's file, enforce that a kqueue is only used
 *	with one event layout, register each change (reporting failures
 *	and EV_RECEIPT requests through the event list as EV_ERROR
 *	events), then scan for triggered events if nothing has been
 *	written out yet.
 *
 *	Returns an errno for failures detected before the kqueue
 *	reference is handed to kevent_continue().  The normal completion
 *	path goes through kevent_continue(), which ends in
 *	unix_syscall_return(); presumably that does not return here, so
 *	the errorout label is only reached by the goto - confirm.
 */
static int
kevent_internal(struct proc *p, int iskev64, user_addr_t changelist,
    int nchanges, user_addr_t ueventlist, int nevents, int fd,
    user_addr_t utimeout, __unused unsigned int flags,
    int32_t *retval)
{
	struct _kevent *cont_args;
	uthread_t ut;
	struct kqueue *kq;
	struct fileproc *fp;
	struct kevent64_s kev;
	int error, noutputs;
	struct timeval atv;

	/* convert timeout to absolute - if we have one */
	if (utimeout != USER_ADDR_NULL) {
		struct timeval rtv;
		if (IS_64BIT_PROCESS(p)) {
			struct user64_timespec ts;
			error = copyin(utimeout, &ts, sizeof(ts));
			/* only look at ts once we know it was filled in */
			if (error)
				return (error);
			/* seconds must fit in 32 bits */
			if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
				return (EINVAL);
			TIMESPEC_TO_TIMEVAL(&rtv, &ts);
		} else {
			struct user32_timespec ts;
			error = copyin(utimeout, &ts, sizeof(ts));
			/* only look at ts once we know it was filled in */
			if (error)
				return (error);
			TIMESPEC_TO_TIMEVAL(&rtv, &ts);
		}
		if (itimerfix(&rtv))
			return (EINVAL);
		getmicrouptime(&atv);
		timevaladd(&atv, &rtv);
	} else {
		/* a zeroed atv is what kqueue_scan treats as "no deadline" */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}

	/* get a usecount for the kq itself */
	if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
		return (error);

	/* each kq should only be used for events of one type */
	kqlock(kq);
	if (kq->kq_state & (KQ_KEV32 | KQ_KEV64)) {
		if (((iskev64 && (kq->kq_state & KQ_KEV32)) ||
			(!iskev64 && (kq->kq_state & KQ_KEV64)))) {
			error = EINVAL;
			kqunlock(kq);
			goto errorout;
		}
	} else {
		/* first use of this kq: remember the layout */
		kq->kq_state |= (iskev64 ? KQ_KEV64 : KQ_KEV32);
	}
	kqunlock(kq);

	/* register all the change requests the user provided... */
	noutputs = 0;
	while (nchanges > 0 && error == 0) {
		error = kevent_copyin(&changelist, &kev, p, iskev64);
		if (error)
			break;

		/* user space may not set the system-reserved flag bits */
		kev.flags &= ~EV_SYSFLAGS;
		error = kevent_register(kq, &kev, p);
		if ((error || (kev.flags & EV_RECEIPT)) && nevents > 0) {
			/*
			 * report the per-change result through the event
			 * list; a successful copyout clears error so the
			 * remaining changes are still processed
			 */
			kev.flags = EV_ERROR;
			kev.data = error;
			error = kevent_copyout(&kev, &ueventlist, p, iskev64);
			if (error == 0) {
				nevents--;
				noutputs++;
			}
		}
		nchanges--;
	}

	/* store the continuation/completion data in the uthread */
	ut = (uthread_t)get_bsdthread_info(current_thread());
	cont_args = &ut->uu_kevent.ss_kevent;
	cont_args->fp = fp;
	cont_args->fd = fd;
	cont_args->retval = retval;
	cont_args->eventlist = ueventlist;
	cont_args->eventcount = nevents;
	cont_args->eventout = noutputs;
	cont_args->eventsize = iskev64;

	/* only wait for events if none were already written out */
	if (nevents > 0 && noutputs == 0 && error == 0)
		error = kqueue_scan(kq, kevent_callback,
		    kevent_continue, cont_args,
		    &atv, p);
	kevent_continue(kq, cont_args, error);

errorout:
	fp_drop(p, fd, fp, 0);
	return (error);
}


/*
 * kevent_callback - callback for each individual event
 *
 * called with nothing locked
 * caller holds a reference on the kqueue
 */
static int
kevent_callback(__unused struct kqueue *kq, struct kevent64_s *kevp,
    void *data)
{
	struct _kevent *args = (struct _kevent *)data;
	int rc;

	assert(args->eventout < args->eventcount);

	/*
	 * Write the event to the caller's list in the layout recorded
	 * for this call (eventsize carries the iskev64 flag).
	 */
	rc = kevent_copyout(kevp, &args->eventlist, current_proc(),
	    args->eventsize);
	if (rc != 0)
		return (rc);

	/*
	 * One more event delivered.  Once the caller's list is full,
	 * return a harmless error so that processing stops here.
	 */
	args->eventout++;
	if (args->eventout == args->eventcount)
		return (EWOULDBLOCK);

	return (0);
}

/*
 * kevent_description - format a description of a kevent for diagnostic output
 *
 * called with a 128-byte string buffer
 */

char *
kevent_description(struct kevent64_s *kevp, char *s, size_t n)
{
	/*
	 * Format every field of *kevp into s, writing at most n bytes
	 * (snprintf bounds the output).  The ident, data, udata and ext[]
	 * fields are printed with %#llx; filter in decimal; flags and
	 * fflags in hex.
	 */
	snprintf(s, n,
	    "kevent="
	    "{.ident=%#llx, .filter=%d, .flags=%#x, .fflags=%#x, .data=%#llx, .udata=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
	    kevp->ident,
	    kevp->filter,
	    kevp->flags,
	    kevp->fflags,
	    kevp->data,
	    kevp->udata,
	    kevp->ext[0],
	    kevp->ext[1]);

	/* hand the caller's buffer back so the call can be used inline */
	return (s);
}

/*
 * kevent_register - add a new event to a kqueue
 *
 *	Creates a mapping between the event source and
 *	the kqueue via a knote data structure.
 *
 *	Because many/most the event sources are file
 *	descriptor related, the knote is linked off
 *	the filedescriptor table for quick access.
 *
 *	called with nothing locked
 *	caller holds a reference on the kqueue
 */

/*
 * Returns 0 on success, or:
 *	EINVAL	the filter is not a valid (negative) system filter
 *	ENOMEM	a new knote could not be allocated
 *	ENOENT	no matching knote exists and the request is not a
 *		plain EV_ADD
 *	or the error from fp_lookup(), knote_fdpattach() or the
 *	filter's f_attach routine.
 */
int
kevent_register(struct kqueue *kq, struct kevent64_s *kev,
    __unused struct proc *ctxp)
{
	struct proc *p = kq->kq_p;
	struct filedesc *fdp = p->p_fd;
	struct filterops *fops;
	struct fileproc *fp = NULL;
	struct knote *kn = NULL;
	int error = 0;

	/* system filters are negative; map them to a 0-based table index */
	if (kev->filter < 0) {
		if (kev->filter + EVFILT_SYSCOUNT < 0)
			return (EINVAL);
		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
	} else {
		/*
		 * XXX
		 * filter attach routine is responsible for ensuring that
		 * the identifier can be attached to it.
		 */
		printf("unknown filter: %d\n", kev->filter);
		return (EINVAL);
	}

restart:
	/* this iocount needs to be dropped if it is not registered */
	proc_fdlock(p);
	if (fops->f_isfd && (error = fp_lookup(p, kev->ident, &fp, 1)) != 0) {
		proc_fdunlock(p);
		return (error);
	}

	if (fops->f_isfd) {
		/* fd-based knotes are linked off the fd table */
		if (kev->ident < (u_int)fdp->fd_knlistsize) {
			SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
		}
	} else {
		/* hash non-fd knotes here too */
		if (fdp->fd_knhashmask != 0) {
			struct klist *list;

			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link)
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
		}
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kn == NULL) {
		/* only a plain EV_ADD (without EV_DELETE) creates a knote */
		if ((kev->flags & (EV_ADD|EV_DELETE)) == EV_ADD) {
			kn = knote_alloc();
			if (kn == NULL) {
				proc_fdunlock(p);
				error = ENOMEM;
				goto done;
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_tq = &kq->kq_head;
			kn->kn_fop = fops;
			/*
			 * the caller's fflags/data are kept as the saved
			 * filter parameters; the copy stored in the knote's
			 * kevent starts out with both cleared
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;
			kn->kn_inuse = 1;  /* for f_attach() */
			kn->kn_status = KN_ATTACHING;

			/* before anyone can find it */
			if (kev->flags & EV_DISABLE)
				kn->kn_status |= KN_DISABLED;

			error = knote_fdpattach(kn, fdp, p);
			proc_fdunlock(p);

			if (error) {
				knote_free(kn);
				goto done;
			}

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			error = fops->f_attach(kn);

			kqlock(kq);

			if (error != 0) {
				/*
				 * Failed to attach correctly, so drop.
				 * All other possible users/droppers
				 * have deferred to us.
				 */
				kn->kn_status |= KN_DROPPING;
				kqunlock(kq);
				knote_drop(kn, p);
				goto done;
			} else if (kn->kn_status & KN_DROPPING) {
				/*
				 * Attach succeeded, but someone else
				 * deferred their drop - now we have
				 * to do it for them (after detaching).
				 */
				kqunlock(kq);
				kn->kn_fop->f_detach(kn);
				knote_drop(kn, p);
				goto done;
			}
			kn->kn_status &= ~KN_ATTACHING;
			kqunlock(kq);
		} else {
			proc_fdunlock(p);
			error = ENOENT;
			goto done;
		}
	} else {
		/* existing knote - get kqueue lock */
		kqlock(kq);
		proc_fdunlock(p);

		if (kev->flags & EV_DELETE) {
			knote_dequeue(kn);
			kn->kn_status |= KN_DISABLED;
			if (kqlock2knotedrop(kq, kn)) {
				kn->kn_fop->f_detach(kn);
				knote_drop(kn, p);
			}
			goto done;
		}

		/* update status flags for existing knote */
		if (kev->flags & EV_DISABLE) {
			knote_dequeue(kn);
			kn->kn_status |= KN_DISABLED;
		} else if (kev->flags & EV_ENABLE) {
			kn->kn_status &= ~KN_DISABLED;
			if (kn->kn_status & KN_ACTIVE)
				knote_enqueue(kn);
		}

		/*
		 * The user may change some filter values after the
		 * initial EV_ADD, but doing so will not reset any
		 * filter which have already been triggered.
		 */
		kn->kn_kevent.udata = kev->udata;
		if (fops->f_isfd || fops->f_touch == NULL) {
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
		}

		/*
		 * If somebody is in the middle of dropping this
		 * knote - go find/insert a new one.  But we have
		 * to wait for this one to go away first. Attaches
		 * running in parallel may also drop/modify the
		 * knote.  Wait for those to complete as well and
		 * then start over if we encounter one.
		 */
		if (!kqlock2knoteusewait(kq, kn)) {
			/* kqueue, proc_fdlock both unlocked */

			/*
			 * The restart path looks the file up again, so
			 * give back the iocount taken on this pass;
			 * otherwise every restart would leak one.
			 */
			if (fp != NULL) {
				fp_drop(p, kev->ident, fp, 0);
				fp = NULL;
			}
			goto restart;
		}

		/*
		 * Call touch routine to notify filter of changes
		 * in filter values.
		 */
		if (!fops->f_isfd && fops->f_touch != NULL)
			fops->f_touch(kn, kev, EVENT_REGISTER);
	}
	/* still have use ref on knote */

	/*
	 * If the knote is not marked to always stay enqueued,
	 * invoke the filter routine to see if it should be
	 * enqueued now.
	 */
	if ((kn->kn_status & KN_STAYQUEUED) == 0 && kn->kn_fop->f_event(kn, 0)) {
		if (knoteuse2kqlock(kq, kn))
			knote_activate(kn, 1);
		kqunlock(kq);
	} else {
		knote_put(kn);
	}

done:
	/* fp is still set only when its iocount was not given to a knote */
	if (fp != NULL)
		fp_drop(p, kev->ident, fp, 0);
	return (error);
}


/*
 * knote_process - process a triggered event
 *
 *	Validate that it is really still a triggered event
 *	by calling the filter routines (if necessary).  Hold
 *	a use reference on the knote to avoid it being detached.
 *	If it is still considered triggered, invoke the callback
 *	routine provided and move it to the provided inprocess
 *	queue.
 *
 *	caller holds a reference on the kqueue.
 *	kqueue locked on entry and exit - but may be dropped
 */
/*
 * Returns EJUSTRETURN when the knote produced no event for the caller
 * (disabled, no longer triggered, or gone); otherwise returns whatever
 * the callback returned.
 */
static int
knote_process(struct knote *kn,
    kevent_callback_t callback,
    void *data,
    struct kqtailq *inprocessp,
    struct proc *p)
{
	struct kqueue *kq = kn->kn_kq;
	struct kevent64_s kev;
	int touch;
	int result;
	int error;

	/*
	 * Determine the kevent state we want to return.
	 *
	 * Some event states need to be revalidated before returning
	 * them, others we take the snapshot at the time the event
	 * was enqueued.
	 *
	 * Events with non-NULL f_touch operations must be touched.
	 * Triggered events must fill in kev for the callback.
	 *
	 * Convert our lock to a use-count and call the event's
	 * filter routine(s) to update.
	 */
	if ((kn->kn_status & KN_DISABLED) != 0) {
		/* disabled knotes never report; kev is left unset */
		result = 0;
		touch = 0;
	} else {
		int revalidate;

		result = 1;
		/* everything except a plain one-shot knote is re-checked */
		revalidate = ((kn->kn_status & KN_STAYQUEUED) != 0 ||
		    (kn->kn_flags & EV_ONESHOT) == 0);
		touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL);

		if (revalidate || touch) {
			if (revalidate)
				knote_deactivate(kn);

			/* call the filter/touch routines with just a ref */
			if (kqlock2knoteuse(kq, kn)) {
				/* if we have to revalidate, call the filter */
				if (revalidate) {
					result = kn->kn_fop->f_event(kn, 0);
				}

				/*
				 * capture the kevent data - using touch if
				 * specified
				 */
				if (result && touch) {
					kn->kn_fop->f_touch(kn, &kev,
					    EVENT_PROCESS);
				}

				/*
				 * convert back to a kqlock - bail if the knote
				 * went away
				 */
				if (!knoteuse2kqlock(kq, kn)) {
					return (EJUSTRETURN);
				} else if (result) {
					/*
					 * if revalidated as alive, make sure
					 * it's active
					 */
					if (!(kn->kn_status & KN_ACTIVE)) {
						knote_activate(kn, 0);
					}

					/*
					 * capture all events that occurred
					 * during filter
					 */
					if (!touch) {
						kev = kn->kn_kevent;
					}

				} else if ((kn->kn_status & KN_STAYQUEUED) == 0) {
					/*
					 * was already dequeued, so just bail on
					 * this one
					 */
					return (EJUSTRETURN);
				}
			} else {
				/* could not take a use reference on the knote */
				return (EJUSTRETURN);
			}
		} else {
			/* one-shot without touch: report the enqueue-time snapshot */
			kev = kn->kn_kevent;
		}
	}

	/* move knote onto inprocess queue */
	assert(kn->kn_tq == &kq->kq_head);
	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_tq = inprocessp;
	TAILQ_INSERT_TAIL(inprocessp, kn, kn_tqe);

	/*
	 * Determine how to dispatch the knote for future event handling.
	 * not-fired: just return (do not callout).
	 * One-shot: deactivate it.
	 * Clear: deactivate and clear the state.
	 * Dispatch: don't clear state, just deactivate it and mark it disabled.
	 * All others: just leave where they are.
	 */

	if (result == 0) {
		/*
		 * kev may never have been filled in on this path; it is
		 * only read by the callback below, which is skipped.
		 */
		return (EJUSTRETURN);
	} else if ((kn->kn_flags & EV_ONESHOT) != 0) {
		knote_deactivate(kn);
		/*
		 * NOTE(review): unlike the branches below there is no
		 * explicit kqunlock() here, so kqlock2knotedrop()
		 * presumably releases the kq lock whether or not it
		 * returns true - confirm.
		 */
		if (kqlock2knotedrop(kq, kn)) {
			kn->kn_fop->f_detach(kn);
			knote_drop(kn, p);
		}
	} else if ((kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) != 0) {
		if ((kn->kn_flags & EV_DISPATCH) != 0) {
			/* deactivate and disable all dispatch knotes */
			knote_deactivate(kn);
			kn->kn_status |= KN_DISABLED;
		} else if (!touch || kn->kn_fflags == 0) {
			/* only deactivate if nothing since the touch */
			knote_deactivate(kn);
		}
		if (!touch && (kn->kn_flags & EV_CLEAR) != 0) {
			/* manually clear non-touch knotes */
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
		kqunlock(kq);
	} else {
		/*
		 * leave on inprocess queue.  We'll
		 * move all the remaining ones back
		 * the kq queue and wakeup any
		 * waiters when we are done.
		 */
		kqunlock(kq);
	}

	/* callback to handle each event as we find it */
	error = (callback)(kq, &kev, data);

	/* the caller expects the kq lock to be held again on return */
	kqlock(kq);
	return (error);
}

/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 */
static int
kqueue_begin_processing(struct kqueue *kq)
{
	/* an empty kqueue has nothing to process */
	while (kq->kq_count != 0) {
		if (kq->kq_nprocess == 0) {
			/* nobody else is processing: claim the queue */
			kq->kq_nprocess = 1;
			return (0);
		}

		/*
		 * Someone else is processing the queue.  Sleep until
		 * kqueue_end_processing() wakes us, then look again.
		 */
		if (!wait_queue_is_valid((myWaitQueue*)kq->kq_wqs)) {
			printf("\nAbout to call wait_queue_assert_wait with invalid wait queue from kqueue_begin_processing.\n");
		}
		wait_queue_assert_wait((wait_queue_t)kq->kq_wqs,
		    &kq->kq_nprocess, THREAD_UNINT, 0);
		kq->kq_state |= KQ_PROCWAIT;
		kqunlock(kq);
		thread_block(THREAD_CONTINUE_NULL);
		kqlock(kq);
	}

	return (-1);
}

/*
 * Called with kqueue lock held.
 */
static void
kqueue_end_processing(struct kqueue *kq)
{
	/* give up the processing claim taken in kqueue_begin_processing() */
	kq->kq_nprocess = 0;

	/* nobody is waiting for a turn: nothing more to do */
	if ((kq->kq_state & KQ_PROCWAIT) == 0)
		return;

	kq->kq_state &= ~KQ_PROCWAIT;
	wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs,
	    &kq->kq_nprocess, THREAD_AWAKENED);
}

/*
 * kqueue_process - process the triggered events in a kqueue
 *
 *	Walk the queued knotes and validate that they are
 *	really still triggered events by calling the filter
 *	routines (if necessary).  Hold a use reference on
 *	the knote to avoid it being detached. For each event
 *	that is still considered triggered, invoke the
 *	callback routine provided.
 *
 *	caller holds a reference on the kqueue.
 *	kqueue locked on entry and exit - but may be dropped
 *	kqueue list locked (held for duration of call)
 */

static int
kqueue_process(struct kqueue *kq,
    kevent_callback_t callback,
    void *data,
    int *countp,
    struct proc *p)
{
	struct kqtailq inprocess;
	struct knote *kn;
	int nevents = 0;
	int error = 0;

	TAILQ_INIT(&inprocess);

	if (kqueue_begin_processing(kq) == -1) {
		/* Nothing to process */
		*countp = 0;
		return (0);
	}

	/*
	 * Clear any pre-posted status from previous runs, so we
	 * only detect events that occur during this run.
	 */
	wait_queue_sub_clearrefs(kq->kq_wqs);

	/*
	 * Drain the kq's queue one knote at a time.  knote_process()
	 * moves each knote it handles onto the inprocess queue, which
	 * is what lets this loop terminate.  EJUSTRETURN means "no
	 * event from this knote" and is not counted; any other error
	 * stops the walk after being counted.
	 */
	for (;;) {
		kn = TAILQ_FIRST(&kq->kq_head);
		if (kn == NULL)
			break;

		error = knote_process(kn, callback, data, &inprocess, p);
		if (error == EJUSTRETURN) {
			error = 0;
			continue;
		}
		nevents++;
		if (error != 0)
			break;
	}

	/*
	 * With the kqueue still locked, move any knotes
	 * remaining on the inprocess queue back to the
	 * kq's queue and wake up any waiters.
	 */
	for (kn = TAILQ_FIRST(&inprocess); kn != NULL;
	    kn = TAILQ_FIRST(&inprocess)) {
		assert(kn->kn_tq == &inprocess);
		TAILQ_REMOVE(&inprocess, kn, kn_tqe);
		kn->kn_tq = &kq->kq_head;
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	}

	kqueue_end_processing(kq);

	*countp = nevents;
	return (error);
}


/*
 * kqueue_scan_continue - continuation for a thread that blocked in
 * kqueue_scan() (or in a previous run of this routine)
 *
 *	data is the kqueue; the callback, completion routine, deadline
 *	and callback argument were saved in the uthread by kqueue_scan().
 *	The wait result is turned into an error code and handed to the
 *	saved completion routine.  If the thread was awakened but no
 *	events were found, it goes back to sleep with this routine as
 *	its continuation again.
 */
static void
kqueue_scan_continue(void *data, wait_result_t wait_result)
{
	thread_t self = current_thread();
	uthread_t ut = (uthread_t)get_bsdthread_info(self);
	struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
	struct kqueue *kq = (struct kqueue *)data;
	int error;
	int count;

	/* convert the (previous) wait_result to a proper error */
	switch (wait_result) {
	case THREAD_AWAKENED:
		kqlock(kq);
		error = kqueue_process(kq, cont_args->call, cont_args, &count,
		    current_proc());
		if (error == 0 && count == 0) {
			/* woken but nothing to report: wait again */
			if (!wait_queue_is_valid((myWaitQueue*)kq->kq_wqs)) {
				printf("\nAbout to call wait_queue_assert_wait with invalid wait queue from kqueue_scan_continue.\n");
			}
			wait_queue_assert_wait((wait_queue_t)kq->kq_wqs,
			    KQ_EVENT, THREAD_ABORTSAFE, cont_args->deadline);
			kq->kq_state |= KQ_SLEEP;
			kqunlock(kq);
			thread_block_parameter(kqueue_scan_continue, kq);
			/* NOTREACHED */
		}
		kqunlock(kq);
		break;
	case THREAD_TIMED_OUT:
		/* deadline passed: the completion routine sees EWOULDBLOCK */
		error = EWOULDBLOCK;
		break;
	case THREAD_INTERRUPTED:
		error = EINTR;
		break;
	case THREAD_RESTART:
		/* reported as EBADF instead of falling into the panic below */
		printf("\nkqueue_scan_continue was called with a wait_result of THREAD_RESTART. A vanilla 2422 XNU kernel would have panicked!\n");
		error = EBADF;
		break;
	default:
		panic("%s: - invalid wait_result (%d)", __func__,
		    wait_result);
		error = 0;
	}

	/* call the continuation with the results */
	assert(cont_args->cont != NULL);
	(cont_args->cont)(kq, cont_args->data, error);
}


/*
 * kqueue_scan - scan and wait for events in a kqueue
 *
 *	Process the triggered events in a kqueue.
 *
 *	If there are no events triggered, arrange to
 *	wait for them. If the caller provided a
 *	continuation routine, the wait resumes in
 *	that continuation instead of returning here.
 *
 *	The callback routine must be valid.
 *	The caller must hold a use-count reference on the kq.
 */

/*
 * Returns 0 when events were delivered to the callback, the callback's
 * or kqueue_process()'s error, EWOULDBLOCK when the deadline passes (or
 * has already passed) with no events, EINTR when the wait is
 * interrupted, or EBADF when the wait ends with THREAD_RESTART.
 *
 * atvp holds an absolute uptime; both fields zero means block forever.
 */
int
kqueue_scan(struct kqueue *kq,
	    kevent_callback_t callback,
	    kqueue_continue_t continuation,
	    void *data,
	    struct timeval *atvp,
	    struct proc *p)
{
	thread_continue_t cont = THREAD_CONTINUE_NULL;
	uint64_t deadline;
	int error;
	int first;

	assert(callback != NULL);

	first = 1;
	for (;;) {
		wait_result_t wait_result;
		int count;

		/*
		 * Make a pass through the kq to find events already
		 * triggered.
		 */
		kqlock(kq);
		error = kqueue_process(kq, callback, data, &count, p);
		if (error || count)
			break; /* lock still held */

		/* looks like we have to consider blocking */
		if (first) {
			first = 0;
			/* convert the timeout to a deadline once */
			if (atvp->tv_sec || atvp->tv_usec) {
				uint64_t now;

				clock_get_uptime(&now);
				nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
							    atvp->tv_usec * (long)NSEC_PER_USEC,
							    &deadline);
				if (now >= deadline) {
					/* non-blocking call */
					error = EWOULDBLOCK;
					break; /* lock still held */
				}
				deadline -= now;
				clock_absolutetime_interval_to_deadline(deadline, &deadline);
			} else {
				deadline = 0; 	/* block forever */
			}

			/*
			 * stash what kqueue_scan_continue() needs in the
			 * uthread, since this stack is not resumed when
			 * blocking with a continuation
			 */
			if (continuation) {
				uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
				struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;

				cont_args->call = callback;
				cont_args->cont = continuation;
				cont_args->deadline = deadline;
				cont_args->data = data;
				cont = kqueue_scan_continue;
			}
		}

		/* go ahead and wait */
		if (!wait_queue_is_valid((myWaitQueue*)kq->kq_wqs)) {
			printf("\nAbout to call wait_queue_assert_wait_with_leeway with invalid wait queue from kqueue_scan.\n");
		}
		wait_queue_assert_wait_with_leeway((wait_queue_t)kq->kq_wqs,
		    KQ_EVENT, THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL,
		    deadline, 0);
		kq->kq_state |= KQ_SLEEP;
		kqunlock(kq);
		wait_result = thread_block_parameter(cont, kq);
		/* NOTREACHED if (continuation != NULL) */

		switch (wait_result) {
		case THREAD_AWAKENED:
			continue;
		case THREAD_TIMED_OUT:
			return (EWOULDBLOCK);
		case THREAD_INTERRUPTED:
			return (EINTR);
		case THREAD_RESTART:
			/*
			 * Handle this the same way kqueue_scan_continue()
			 * does rather than panicking: report EBADF, and do
			 * not touch the kqueue again on the way out.
			 */
			printf("\nkqueue_scan woke up with a wait_result of THREAD_RESTART.\n");
			return (EBADF);
		default:
			panic("%s: - bad wait_result (%d)", __func__,
			    wait_result);
			error = 0;
		}
	}
	kqunlock(kq);
	return (error);
}


/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(__unused struct fileproc *fp,
    __unused struct uio *uio,
    __unused int flags,
    __unused vfs_context_t ctx)
{
	/* reading from a kqueue descriptor always fails; no argument is used */
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_write(__unused struct fileproc *fp,
    __unused struct uio *uio,
    __unused int flags,
    __unused vfs_context_t ctx)
{
	/* writing to a kqueue descriptor always fails; no argument is used */
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_ioctl(__unused struct fileproc *fp,
    __unused u_long com,
    __unused caddr_t data,
    __unused vfs_context_t ctx)
{
	/* every ioctl command is rejected for a kqueue descriptor */
	return (ENOTTY);
}

/*ARGSUSED*/
/*
 * kqueue_select - select/poll support for kqueue descriptors
 *
 *	Only FREAD is meaningful; any other request reports not-ready.
 *	Returns 1 if the kqueue has an event pending, 0 otherwise.
 *
 *	When wql is non-NULL, the kqueue's wait queue set is first linked
 *	into the calling thread's select wait queue set.
 */
static int
kqueue_select(struct fileproc *fp, int which, void *wql,
    __unused vfs_context_t ctx)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;
	struct knote *kn;
	struct kqtailq inprocessq;
	int retnum = 0;

	if (which != FREAD)
		return (0);

	TAILQ_INIT(&inprocessq);

	kqlock(kq);
	/*
	 * If this is the first pass, link the wait queue associated with
	 * the kqueue onto the wait queue set for the select().  Normally we
	 * use selrecord() for this, but it uses the wait queue within the
	 * selinfo structure and we need to use the main one for the kqueue to
	 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
	 * (The select() call will unlink them when it ends).
	 */
	if (wql != NULL) {
		thread_t cur_act = current_thread();
		struct uthread * ut = get_bsdthread_info(cur_act);

		kq->kq_state |= KQ_SEL;
		wait_queue_link_noalloc((wait_queue_t)kq->kq_wqs, ut->uu_wqset,
		    (wait_queue_link_t)wql);
	}

	if (kqueue_begin_processing(kq) == -1) {
		/* nothing queued at all */
		kqunlock(kq);
		return (0);
	}

	if (kq->kq_count != 0) {
		/*
		 * there is something queued - but it might be a
		 * KN_STAYQUEUED knote, which may or may not have
		 * any events pending.  So, we have to walk the
		 * list of knotes to see, and peek at the stay-
		 * queued ones to be really sure.
		 */
		while ((kn = (struct knote *)TAILQ_FIRST(&kq->kq_head)) != NULL) {
			if ((kn->kn_status & KN_STAYQUEUED) == 0) {
				retnum = 1;
				goto out;
			}

			/*
			 * Park the knote on the local queue while we peek at
			 * it, and record that in kn_tq: the kq lock is given
			 * up below, and kn_tq must name the list the knote is
			 * actually on for as long as it sits there.
			 */
			TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
			kn->kn_tq = &inprocessq;
			TAILQ_INSERT_TAIL(&inprocessq, kn, kn_tqe);

			if (kqlock2knoteuse(kq, kn)) {
				unsigned peek;

				/* called with only a use reference held */
				peek = kn->kn_fop->f_peek(kn);
				if (knoteuse2kqlock(kq, kn)) {
					if (peek > 0) {
						retnum = 1;
						goto out;
					}
				} else {
					/* the knote went away while we peeked */
					retnum = 0;
				}
			}
		}
	}

out:
	/* Return knotes to active queue */
	while ((kn = TAILQ_FIRST(&inprocessq)) != NULL) {
		TAILQ_REMOVE(&inprocessq, kn, kn_tqe);
		kn->kn_tq = &kq->kq_head;
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	}

	kqueue_end_processing(kq);
	kqunlock(kq);
	return (retnum);
}

/*
 * kqueue_close -
 */
/*ARGSUSED*/
static int
kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
{
	struct kqueue *kq = (struct kqueue *)fg->fg_data;

	/* detach every knote and free the kqueue itself */
	kqueue_dealloc(kq);
	/* the fileglob must not keep pointing at the freed kqueue */
	fg->fg_data = NULL;
	return (0);
}

/*ARGSUSED*/
/*
 * The caller has taken a use-count reference on this kqueue and will donate it
 * to the kqueue we are being added to.  This keeps the kqueue from closing until
 * that relationship is torn down.
 */
static int
kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
{
	struct kqueue *parentkq = kn->kn_kq;
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	/* only EVFILT_READ, and never a kqueue watching itself */
	if (kn->kn_filter != EVFILT_READ || parentkq == kq)
		return (1);

	/*
	 * We have to avoid creating a cycle when nesting kqueues
	 * inside another.  Rather than trying to walk the whole
	 * potential DAG of nested kqueues, we just use a simple
	 * ceiling protocol.  When a kqueue is inserted into another,
	 * we check that the (future) parent is not already nested
	 * into another kqueue at a lower level than the potential
	 * child (because it could indicate a cycle).  If that test
	 * passes, we just mark the nesting levels accordingly.
	 */
	kqlock(parentkq);
	if (parentkq->kq_level > 0 &&
	    parentkq->kq_level < kq->kq_level) {
		/* possible cycle: refuse */
		kqunlock(parentkq);
		return (1);
	}

	/* set parent level appropriately */
	if (parentkq->kq_level == 0)
		parentkq->kq_level = 2;
	if (parentkq->kq_level < kq->kq_level + 1)
		parentkq->kq_level = kq->kq_level + 1;
	kqunlock(parentkq);

	/* hook the knote onto the child kqueue's note list */
	kn->kn_fop = &kqread_filtops;
	kqlock(kq);
	KNOTE_ATTACH(&kq->kq_sel.si_note, kn);
	/* indicate nesting in child, if needed */
	if (kq->kq_level == 0)
		kq->kq_level = 1;
	kqunlock(kq);

	return (0);
}

/*
 * kqueue_drain - called when kq is closed
 *
 * Issues a "closed" wakeup on the kqueue while holding its lock, so
 * threads woken by kqueue_wakeup() see THREAD_INTERRUPTED.  Always
 * returns 0.
 */
/*ARGSUSED*/
static int
kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
{
	struct kqueue *closing = (struct kqueue *)fp->f_fglob->fg_data;

	kqlock(closing);
	kqueue_wakeup(closing, 1);
	kqunlock(closing);

	return (0);
}

/*
 * kqueue_stat - fill in a stat/stat64 buffer describing a kqueue.
 *
 * ub is treated as a struct stat64 when isstat64 is non-zero and as a
 * struct stat otherwise.  The buffer is zeroed, then:
 *	st_size    = number of queued events (kq_count)
 *	st_blksize = size of the event structure this kqueue uses
 *	st_mode    = S_IFIFO
 * Always returns 0.
 */
/*ARGSUSED*/
int
kqueue_stat(struct fileproc *fp, void *ub, int isstat64,  __unused vfs_context_t ctx)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;

	if (isstat64 == 0) {
		struct stat *st = (struct stat *)ub;

		bzero((void *)st, sizeof(*st));
		st->st_mode = S_IFIFO;
		st->st_size = kq->kq_count;
		if (kq->kq_state & KQ_KEV64)
			st->st_blksize = sizeof(struct kevent64_s);
		else
			st->st_blksize = sizeof(struct kevent);
	} else {
		struct stat64 *st64 = (struct stat64 *)ub;

		bzero((void *)st64, sizeof(*st64));
		st64->st_mode = S_IFIFO;
		st64->st_size = kq->kq_count;
		if (kq->kq_state & KQ_KEV64)
			st64->st_blksize = sizeof(struct kevent64_s);
		else
			st64->st_blksize = sizeof(struct kevent);
	}

	return (0);
}

/*
 * Called with the kqueue locked
 */
static void
kqueue_wakeup(struct kqueue *kq, int closed)
{
	if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0 || kq->kq_nprocess > 0) {
		kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
		wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, KQ_EVENT,
		    (closed) ? THREAD_INTERRUPTED : THREAD_AWAKENED);
	}
}

/*
 * klist_init - initialize a knote list head to the empty list.
 */
void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}


/*
 * Query/Post each knote in the object's list
 *
 *	The object lock protects the list. It is assumed
 *	that the filter/event routine for the object can
 *	determine that the object is already locked (via
 *	the hint) and not deadlock itself.
 *
 *	The object lock should also hold off pending
 *	detach/drop operations.  But we'll prevent it here
 *	too - just in case.
 *
 *	list - the object's list of attached knotes
 *	hint - passed unchanged to each knote's f_event routine
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext) {
		struct kqueue *kq = kn->kn_kq;

		kqlock(kq);
		/* knotes for which the use-count conversion fails are skipped */
		if (kqlock2knoteuse(kq, kn)) {
			int result;

			/* call the event with only a use count */
			result = kn->kn_fop->f_event(kn, hint);

			/*
			 * if its not going away and triggered
			 * (knoteuse2kqlock() is always called, even when
			 * result is zero, because it is the left operand)
			 */
			if (knoteuse2kqlock(kq, kn) && result)
				knote_activate(kn, 1);
			/* lock held again */
		}
		kqunlock(kq);
	}
}

/*
 * knote_attach - put a knote at the head of an object's knote list.
 *
 * Returns non-zero when the list was empty beforehand, i.e. when this knote
 * is the first entry.  The list is protected by whatever lock the object it
 * is associated with uses.
 */
int
knote_attach(struct klist *list, struct knote *kn)
{
	int was_empty;

	was_empty = SLIST_EMPTY(list);
	SLIST_INSERT_HEAD(list, kn, kn_selnext);

	return (was_empty);
}

/*
 * knote_detach - take a knote off an object's knote list.
 *
 * Returns non-zero when the list is empty afterwards, i.e. when the knote
 * removed was the last entry.  The list is protected by whatever lock the
 * object it is associated with uses.
 */
int
knote_detach(struct klist *list, struct knote *kn)
{
	int now_empty;

	SLIST_REMOVE(list, kn, knote, kn_selnext);
	now_empty = SLIST_EMPTY(list);

	return (now_empty);
}

/*
 * knote_link_wait_queue - link a wait queue straight into a knote's kqueue.
 *
 * Wakeups will happen via recursive wait queue support.  But nothing will
 * move the knote to the active list at wakeup (nothing calls knote()), so
 * once the link succeeds the knote is marked to stay queued.
 *
 * kqueue and knote references are held by caller, and the caller provides
 * the wait queue link structure.
 *
 * Returns 0 on success, EINVAL when the link could not be established.
 */
int
knote_link_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t wql)
{
	struct kqueue *owner = kn->kn_kq;

	if (wait_queue_link_noalloc(wq, owner->kq_wqs, wql) != KERN_SUCCESS)
		return (EINVAL);

	knote_markstayqueued(kn);
	return (0);
}

/*
 * knote_unlink_wait_queue - undo knote_link_wait_queue().
 *
 * Unlinks the provided wait queue from the kqueue associated with the knote
 * and takes the knote off the list of permanently queued knotes.
 *
 * The unlink may already have happened from the other side, so the knote is
 * dequeued whether or not the unlink succeeds.  The outcome of the unlink is
 * still reported: 0 on success, EINVAL on failure.
 *
 * On success, caller is responsible for the link structure.
 */
int
knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t *wqlp)
{
	struct kqueue *owner = kn->kn_kq;
	kern_return_t unlink_kr;

	unlink_kr = wait_queue_unlink_nofree(wq, owner->kq_wqs, wqlp);

	kqlock(owner);
	kn->kn_status &= ~KN_STAYQUEUED;
	knote_dequeue(kn);
	kqunlock(owner);

	if (unlink_kr != KERN_SUCCESS)
		return (EINVAL);
	return (0);
}

/*
 * remove all knotes referencing a specified fd
 *
 * Essentially an inlined knote_remove & knote_drop
 * when we know for sure that the thing is a file
 *
 * Entered with the proc_fd lock already held.
 * It returns the same way, but may drop it temporarily.
 *
 * Panics if a knote on the fd's list belongs to a kqueue owned by a
 * different process than p.
 */
void
knote_fdclose(struct proc *p, int fd)
{
	struct filedesc *fdp = p->p_fd;
	struct klist *list;
	struct knote *kn;

	list = &fdp->fd_knlist[fd];
	while ((kn = SLIST_FIRST(list)) != NULL) {
		struct kqueue *kq = kn->kn_kq;

		if (kq->kq_p != p)
			panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
			    __func__, kq->kq_p, p);

		/* take the kqueue lock before letting go of the fd lock */
		kqlock(kq);
		proc_fdunlock(p);

		/*
		 * Convert the lock to a drop ref.
		 * If we get it, go ahead and drop it.
		 * Otherwise, we waited for it to
		 * be dropped by the other guy, so
		 * it is safe to move on in the list.
		 */
		if (kqlock2knotedrop(kq, kn)) {
			kn->kn_fop->f_detach(kn);
			knote_drop(kn, p);
		}

		proc_fdlock(p);

		/* the fd tables may have changed - start over */
		list = &fdp->fd_knlist[fd];
	}
}

/* proc_fdlock held on entry (and exit) */
static int
knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p)
{
	struct klist *list = NULL;

	if (! kn->kn_fop->f_isfd) {
		if (fdp->fd_knhashmask == 0)
			fdp->fd_knhash = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
			    &fdp->fd_knhashmask);
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
	} else {
		if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
			u_int size = 0;

			if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
			    || kn->kn_id >= (uint64_t)maxfiles)
				return (EINVAL);

			/* have to grow the fd_knlist */
			size = fdp->fd_knlistsize;
			while (size <= kn->kn_id)
				size += KQEXTENT;

			if (size >= (UINT_MAX/sizeof(struct klist *)))
				return (EINVAL);

			MALLOC(list, struct klist *,
			    size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
			if (list == NULL)
				return (ENOMEM);

			bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
			    fdp->fd_knlistsize * sizeof(struct klist *));
			bzero((caddr_t)list +
			    fdp->fd_knlistsize * sizeof(struct klist *),
			    (size - fdp->fd_knlistsize) * sizeof(struct klist *));
			FREE(fdp->fd_knlist, M_KQUEUE);
			fdp->fd_knlist = list;
			fdp->fd_knlistsize = size;
		}
		list = &fdp->fd_knlist[kn->kn_id];
	}
	SLIST_INSERT_HEAD(list, kn, kn_link);
	return (0);
}


/*
 * knote_drop - unlink a knote from its process and kqueue, then free it.
 *
 * should be called at spl == 0, since we don't want to hold spl
 * while calling fdrop and free.
 *
 * Takes proc_fdlock and the kqueue lock itself, so the caller must hold
 * neither.  ctxp is unused; the owning process is taken from the knote's
 * kqueue.
 */
static void
knote_drop(struct knote *kn, __unused struct proc *ctxp)
{
	struct kqueue *kq = kn->kn_kq;
	struct proc *p = kq->kq_p;
	struct filedesc *fdp = p->p_fd;
	struct klist *list;
	int needswakeup;

	proc_fdlock(p);
	/* find the per-process list the knote was attached to */
	if (kn->kn_fop->f_isfd)
		list = &fdp->fd_knlist[kn->kn_id];
	else
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);
	kqlock(kq);
	knote_dequeue(kn);
	/* sample KN_USEWAIT under the kqueue lock; act on it after unlocking */
	needswakeup = (kn->kn_status & KN_USEWAIT);
	kqunlock(kq);
	proc_fdunlock(p);

	/* wake any thread waiting on this knote's status word */
	if (needswakeup)
		wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kn->kn_status,
		    THREAD_AWAKENED);

	/* fd-based knotes give back their file reference */
	if (kn->kn_fop->f_isfd)
		fp_drop(p, kn->kn_id, kn->kn_fp, 0);

	knote_free(kn);
}

/*
 * knote_activate - mark a knote active, queue it and wake its kqueue.
 *
 * called with kqueue lock held
 *
 * When 'propagate' is non-zero the knotes attached to the kqueue itself
 * (kq_sel.si_note) are posted as well.
 */
static void
knote_activate(struct knote *kn, int propagate)
{
	struct kqueue *owner = kn->kn_kq;

	kn->kn_status |= KN_ACTIVE;
	knote_enqueue(kn);
	kqueue_wakeup(owner, 0);

	if (!propagate)
		return;

	/* this is a real event: wake up the parent kq, too */
	KNOTE(&owner->kq_sel.si_note, 0);
}

/*
 * knote_deactivate - clear KN_ACTIVE on a knote and take it off its queue
 * (knote_dequeue() leaves KN_STAYQUEUED knotes where they are).
 *
 * called with kqueue lock held
 */
static void
knote_deactivate(struct knote *kn)
{
	kn->kn_status &= ~KN_ACTIVE;
	knote_dequeue(kn);
}

/*
 * knote_enqueue - append a knote to its target queue if it is eligible.
 *
 * called with kqueue lock held
 *
 * A knote is eligible when it is not already queued and either
 *	- it is marked KN_STAYQUEUED, or
 *	- it is neither KN_STAYQUEUED nor KN_DISABLED.
 * Queuing sets KN_QUEUED and bumps the kqueue's event count.
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqtailq *target;

	if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) != KN_STAYQUEUED &&
	    (kn->kn_status & (KN_QUEUED | KN_STAYQUEUED | KN_DISABLED)) != 0)
		return;

	target = kn->kn_tq;
	TAILQ_INSERT_TAIL(target, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kn->kn_kq->kq_count++;
}

/*
 * knote_dequeue - take a knote off the queue it currently sits on.
 *
 * called with kqueue lock held
 *
 * Only knotes that are KN_QUEUED and not KN_STAYQUEUED are touched.  The
 * knote's target queue is reset to the kqueue's main queue, KN_QUEUED is
 * cleared and the kqueue's event count is decremented.
 */
static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *owner = kn->kn_kq;
	struct kqtailq *current;

	if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) != KN_QUEUED)
		return;

	current = kn->kn_tq;
	TAILQ_REMOVE(current, kn, kn_tqe);
	kn->kn_tq = &owner->kq_head;
	kn->kn_status &= ~KN_QUEUED;
	owner->kq_count--;
}

/*
 * knote_init - one-time setup for the knote subsystem.
 *
 * Creates the zone knotes are allocated from, allocates the kqueue lock
 * group and attributes, and initializes the locks of the filters that
 * share them.  Registered to run through the SYSINIT entry below.
 */
void
knote_init(void)
{
	knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote),
	    8192, "knote zone");

	/* allocate kq lock group attribute and group */
	kq_lck_grp_attr = lck_grp_attr_alloc_init();

	kq_lck_grp = lck_grp_alloc_init("kqueue",  kq_lck_grp_attr);

	/* Allocate kq lock attribute */
	kq_lck_attr = lck_attr_alloc_init();

	/* Initialize the timer filter lock */
	lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);

#if VM_PRESSURE_EVENTS
	/* Initialize the vm pressure list lock */
	vm_pressure_init(kq_lck_grp, kq_lck_attr);
#endif

#if CONFIG_MEMORYSTATUS
	/* Initialize the memorystatus list lock */
	memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
#endif
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)

/*
 * knote_alloc - allocate a knote from knote_zone.
 * The result of zalloc() is returned as is.
 */
static struct knote *
knote_alloc(void)
{
	return ((struct knote *)zalloc(knote_zone));
}

/*
 * knote_free - return a knote to knote_zone.
 */
static void
knote_free(struct knote *kn)
{
	zfree(knote_zone, kn);
}

#if SOCKETS
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/kern_event.h>
#include <sys/malloc.h>
#include <sys/sys_domain.h>
#include <sys/syslog.h>

/*
 * Locking state of the kernel event protocol.  The group and attributes
 * are allocated in kern_event_init() and also used for each PCB's mutex.
 * kev_rwlock points at statically allocated storage and is taken around
 * accesses to the kern_event_head list.
 */
static lck_grp_attr_t *kev_lck_grp_attr;
static lck_attr_t *kev_lck_attr;
static lck_grp_t *kev_lck_grp;
static decl_lck_rw_data(,kev_lck_data);
static lck_rw_t *kev_rwlock = &kev_lck_data;

static int kev_attach(struct socket *so, int proto, struct proc *p);
static int kev_detach(struct socket *so);
static int kev_control(struct socket *so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct proc *p);
static lck_mtx_t * event_getlock(struct socket *, int);
static int event_lock(struct socket *, int, void *);
static int event_unlock(struct socket *, int, void *);

static int event_sofreelastref(struct socket *);
static void kev_delete(struct kern_event_pcb *);

/*
 * User-request handlers for kernel event sockets: attach, control and
 * detach are implemented in this file, receive uses soreceive.
 */
static struct pr_usrreqs event_usrreqs = {
	.pru_attach =		kev_attach,
	.pru_control =		kev_control,
	.pru_detach =		kev_detach,
	.pru_soreceive =	soreceive,
};

/*
 * Protocol switch table for the kernel event protocol: a single
 * SOCK_RAW / SYSPROTO_EVENT entry with the PR_ATOMIC flag, wired to the
 * per-socket locking routines defined in this file.  kern_event_init()
 * registers each entry with the domain it is given.
 */
static struct protosw eventsw[] = {
{
	.pr_type =		SOCK_RAW,
	.pr_protocol =		SYSPROTO_EVENT,
	.pr_flags =		PR_ATOMIC,
	.pr_usrreqs =		&event_usrreqs,
	.pr_lock =		event_lock,
	.pr_unlock =		event_unlock,
	.pr_getlock =		event_getlock,
}
};

/*
 * event_getlock - return the mutex protecting a kernel event socket.
 *
 * The mutex lives in the socket's kern_event_pcb.  Panics when the socket
 * has no PCB or when its use count is negative.  locktype is unused.
 */
static lck_mtx_t *
event_getlock(struct socket *so, int locktype)
{
#pragma unused(locktype)
	if (so->so_pcb == NULL) {
		panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
		    so, solockhistory_nr(so));
		/* NOTREACHED */
	}

	if (so->so_usecount < 0) {
		panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
		    so, so->so_usecount, solockhistory_nr(so));
		/* NOTREACHED */
	}

	return (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
}

/*
 * event_lock - take the mutex of a kernel event socket.
 *
 * so       - socket to lock; must have a PCB (panics otherwise)
 * refcount - when non-zero, so_usecount is incremented under the lock
 * lr       - caller address to record in the lock history; when NULL this
 *            function's own return address is recorded instead
 *
 * Panics when the use count is negative once the lock is held.
 * Always returns 0.
 */
static int
event_lock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved = (lr != NULL) ? lr : __builtin_return_address(0);

	if (so->so_pcb == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
		    so, lr_saved, solockhistory_nr(so));
		/* NOTREACHED */
	}
	lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);

	if (so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
		    so, so->so_pcb, lr_saved, so->so_usecount,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}

	if (refcount)
		so->so_usecount++;

	/* remember who locked, in the circular lock-history buffer */
	so->lock_lr[so->next_lock_lr] = lr_saved;
	so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;

	return (0);
}

/*
 * event_unlock - release the mutex of a kernel event socket.
 *
 * so       - socket to unlock; its mutex must be owned by the caller
 * refcount - when non-zero, so_usecount is decremented first
 * lr       - caller address to record in the unlock history; when NULL
 *            this function's own return address is recorded instead
 *
 * Panics when the use count becomes negative or the socket has no PCB.
 * When the use count reaches zero the socket must be marked
 * SOF_PCBCLEARING and is handed to event_sofreelastref(), which also
 * releases the mutex; otherwise the mutex is simply unlocked.
 * Always returns 0.
 */
static int
event_unlock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved = (lr != NULL) ? lr : __builtin_return_address(0);
	lck_mtx_t *pcb_mtx;

	if (refcount)
		so->so_usecount--;

	if (so->so_usecount < 0) {
		panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
		    so, so->so_usecount, solockhistory_nr(so));
		/* NOTREACHED */
	}
	if (so->so_pcb == NULL) {
		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
		    so, so->so_usecount, (void *)lr_saved,
		    solockhistory_nr(so));
		/* NOTREACHED */
	}

	pcb_mtx = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
	lck_mtx_assert(pcb_mtx, LCK_MTX_ASSERT_OWNED);

	/* remember who unlocked, in the circular unlock-history buffer */
	so->unlock_lr[so->next_unlock_lr] = lr_saved;
	so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;

	if (so->so_usecount != 0) {
		lck_mtx_unlock(pcb_mtx);
		return (0);
	}

	VERIFY(so->so_flags & SOF_PCBCLEARING);
	event_sofreelastref(so);
	return (0);
}

/*
 * event_sofreelastref - final teardown of a kernel event socket.
 *
 * Entered with the PCB mutex owned; it is released here.  The socket is
 * detached from its PCB, the PCB is taken off the global list and
 * destroyed, and the socket itself is released through sofreelastref().
 * Always returns 0.
 */
static int
event_sofreelastref(struct socket *so)
{
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);

	so->so_pcb = NULL;

	/*
	 * Disable upcall in the event another thread is in kev_post_msg()
	 * appending record to the receive socket buffer, since sbwakeup()
	 * may release the socket lock otherwise.
	 */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_event = NULL;
	lck_mtx_unlock(&(ev_pcb->evp_mtx));

	/* the PCB mutex is dropped before the list lock is taken */
	lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
	lck_rw_lock_exclusive(kev_rwlock);
	LIST_REMOVE(ev_pcb, evp_link);
	lck_rw_done(kev_rwlock);
	/* off the list: the PCB's mutex and memory can now be released */
	kev_delete(ev_pcb);

	sofreelastref(so, 1);
	return (0);
}

/* number of entries in eventsw[] */
static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));

/* list of all kernel event PCBs; accessed under kev_rwlock */
static
struct kern_event_head kern_event_head;

/* id of the most recently posted event; incremented by kev_post_msg() */
static u_int32_t static_event_id = 0;

/* sizing and name of the zone kernel event PCBs are allocated from */
#define	EVPCB_ZONE_MAX		65536
#define	EVPCB_ZONE_NAME		"kerneventpcb"
static struct zone *ev_pcb_zone;

/*
 * Install the protosw's for the NKE manager.  Invoked at extension load time
 *
 * Sets up the lock group, the lock attributes and the rw lock guarding the
 * PCB list, registers every entry of eventsw[] with the domain, and creates
 * the zone PCBs are allocated from.  A failed allocation panics.
 *
 * dp must be the system domain and must not have been initialized yet.
 */
void
kern_event_init(struct domain *dp)
{
	struct protosw *pr;
	int i;

	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
	VERIFY(dp == systemdomain);

	kev_lck_grp_attr = lck_grp_attr_alloc_init();
	if (kev_lck_grp_attr == NULL) {
		panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
		/* NOTREACHED */
	}

	kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
	    kev_lck_grp_attr);
	if (kev_lck_grp == NULL) {
		panic("%s: lck_grp_alloc_init failed\n", __func__);
		/* NOTREACHED */
	}

	kev_lck_attr = lck_attr_alloc_init();
	if (kev_lck_attr == NULL) {
		panic("%s: lck_attr_alloc_init failed\n", __func__);
		/* NOTREACHED */
	}

	/*
	 * kev_rwlock points at statically allocated storage, so it can
	 * never be NULL and needs no failure check once initialized.
	 */
	lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);

	for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++)
		net_add_proto(pr, dp, 1);

	ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
	    EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), 0, EVPCB_ZONE_NAME);
	if (ev_pcb_zone == NULL) {
		panic("%s: failed allocating ev_pcb_zone", __func__);
		/* NOTREACHED */
	}
	zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
	zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
}

/*
 * kev_attach - attach handler for kernel event sockets.
 *
 * Reserves socket buffer space, allocates and zeroes a PCB, initializes
 * its mutex, sets its vendor-code filter to 0xffffffff, ties it to the
 * socket and inserts it at the head of the global PCB list.
 *
 * Returns 0 on success, the soreserve() error if buffer reservation
 * fails, or ENOBUFS if no PCB can be allocated.
 */
static int
kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
{
	struct kern_event_pcb *pcb;
	int rc;

	rc = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
	if (rc != 0)
		return (rc);

	pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone);
	if (pcb == NULL)
		return (ENOBUFS);

	bzero(pcb, sizeof(*pcb));
	lck_mtx_init(&pcb->evp_mtx, kev_lck_grp, kev_lck_attr);
	pcb->evp_vendor_code_filter = 0xffffffff;
	pcb->evp_socket = so;
	so->so_pcb = (caddr_t) pcb;

	lck_rw_lock_exclusive(kev_rwlock);
	LIST_INSERT_HEAD(&kern_event_head, pcb, evp_link);
	lck_rw_done(kev_rwlock);

	return (0);
}

/*
 * kev_delete - destroy a kernel event PCB: its mutex is destroyed and the
 * structure goes back to ev_pcb_zone.  ev_pcb must not be NULL.
 */
static void
kev_delete(struct kern_event_pcb *ev_pcb)
{
	VERIFY(ev_pcb != NULL);
	lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
	zfree(ev_pcb_zone, ev_pcb);
}

/*
 * kev_detach - detach handler for kernel event sockets.
 *
 * A socket that still has a PCB is marked disconnected and flagged
 * SOF_PCBCLEARING; the PCB itself is not freed here.  A socket without a
 * PCB is left untouched.  Always returns 0.
 */
static int
kev_detach(struct socket *so)
{
	if ((struct kern_event_pcb *) so->so_pcb == NULL)
		return (0);

	soisdisconnected(so);
	so->so_flags |= SOF_PCBCLEARING;

	return (0);
}

/*
 * kev_vendor_code_find - look up the vendor code for a vendor string.
 *
 * For now, kev_vendor_code and mbuf_tags use the same mechanism.
 *
 * Strings of KEV_VENDOR_CODE_MAX_STR_LEN characters or more are rejected
 * with EINVAL; otherwise the result of net_str_id_find_internal() is
 * returned and the code is stored through out_vendor_code.
 */
errno_t
kev_vendor_code_find(const char *string, u_int32_t *out_vendor_code)
{
	errno_t result;

	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN)
		return (EINVAL);

	result = net_str_id_find_internal(string, out_vendor_code,
	    NSI_VENDOR_CODE, 1);
	return (result);
}

/*
 * kev_msg_post - post a kernel event after validating its vendor code.
 *
 * Returns EINVAL for a NULL message or a vendor code outside the range
 * reported by net_str_id_first_last(); otherwise returns whatever
 * kev_post_msg() returns.
 */
errno_t
kev_msg_post(struct kev_msg *event_msg)
{
	mbuf_tag_id_t lowest, highest;

	net_str_id_first_last(&lowest, &highest, NSI_VENDOR_CODE);

	if (event_msg == NULL) {
		return (EINVAL);
	}

	/*
	 * Limit third parties to posting events for registered vendor codes
	 * only
	 */
	if (!(event_msg->vendor_code >= lowest &&
	    event_msg->vendor_code <= highest)) {
		return (EINVAL);
	}

	return (kev_post_msg(event_msg));
}

/*
 * kev_post_msg - deliver a kernel event to every interested socket.
 *
 * The header and up to five data vectors of event_msg (a zero-length
 * vector ends the list) are gathered into a single mbuf, which is then
 * copied onto the receive buffer of each kernel event socket whose
 * vendor/class/subclass filters match.
 *
 * Returns 0 on success, EMSGSIZE if the message does not fit in one mbuf
 * without a cluster, and ENOBUFS if an mbuf cannot be allocated or copied
 * (sockets visited before a failed copy have already received the event).
 */
int
kev_post_msg(struct kev_msg *event_msg)
{
	struct mbuf *m, *m2;
	struct kern_event_pcb *ev_pcb;
	struct kern_event_msg *ev;
	char *tmp;
	u_int32_t total_size;
	int i;

	/* Verify the message is small enough to fit in one mbuf w/o cluster */
	total_size = KEV_MSG_HEADER_SIZE;

	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0)
			break;
		/*
		 * Bound every step so the 32-bit running total can never
		 * wrap around and slip an oversized message past the MLEN
		 * check (which would overflow the mbuf in the copy below).
		 */
		if (event_msg->dv[i].data_length > MLEN ||
		    total_size + event_msg->dv[i].data_length > MLEN)
			return (EMSGSIZE);
		total_size += event_msg->dv[i].data_length;
	}

	if (total_size > MLEN) {
		return (EMSGSIZE);
	}

	m = m_get(M_DONTWAIT, MT_DATA);
	if (m == 0)
	    return (ENOBUFS);

	ev = mtod(m, struct kern_event_msg *);
	total_size = KEV_MSG_HEADER_SIZE;

	/* gather the data vectors back to back behind the header */
	tmp = (char *) &ev->event_data[0];
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0)
			break;

		total_size += event_msg->dv[i].data_length;
		bcopy(event_msg->dv[i].data_ptr, tmp,
		    event_msg->dv[i].data_length);
		tmp += event_msg->dv[i].data_length;
	}

	ev->id = ++static_event_id;
	ev->total_size   = total_size;
	ev->vendor_code  = event_msg->vendor_code;
	ev->kev_class    = event_msg->kev_class;
	ev->kev_subclass = event_msg->kev_subclass;
	ev->event_code   = event_msg->event_code;

	m->m_len = total_size;
	lck_rw_lock_shared(kev_rwlock);
	for (ev_pcb = LIST_FIRST(&kern_event_head);
	    ev_pcb;
	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		lck_mtx_lock(&ev_pcb->evp_mtx);
		/* skip sockets that have already been detached from their PCB */
		if (ev_pcb->evp_socket->so_pcb == NULL) {
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			continue;
		}
		/*
		 * Filters nest: the class filter is only consulted when a
		 * specific vendor is set, the subclass filter only when a
		 * specific class is set.
		 */
		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
				lck_mtx_unlock(&ev_pcb->evp_mtx);
				continue;
			}

			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
				if (ev_pcb->evp_class_filter != ev->kev_class) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}

				if ((ev_pcb->evp_subclass_filter != KEV_ANY_SUBCLASS) &&
				    (ev_pcb->evp_subclass_filter != ev->kev_subclass)) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}
			}
		}

		m2 = m_copym(m, 0, m->m_len, M_NOWAIT);
		if (m2 == 0) {
			m_free(m);
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			lck_rw_done(kev_rwlock);
			return (ENOBUFS);
		}
		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2))
			sorwakeup(ev_pcb->evp_socket);
		lck_mtx_unlock(&ev_pcb->evp_mtx);
	}
	m_free(m);
	lck_rw_done(kev_rwlock);

	return (0);
}

/*
 * kev_control - control (ioctl) handler for kernel event sockets.
 *
 *	SIOCGKEVID     - store the current event id into *data (u_int32_t)
 *	SIOCSKEVFILT   - set the socket's filters from a struct kev_request
 *	SIOCGKEVFILT   - copy the socket's filters into a struct kev_request
 *	SIOCGKEVVENDOR - look up the vendor code for the string in a
 *			 struct kev_vendor_code (the string is forcibly
 *			 terminated first)
 *
 * Returns 0 on success, the lookup result for SIOCGKEVVENDOR, and ENOTSUP
 * for any other command.
 */
static int
kev_control(struct socket *so,
    u_long cmd,
    caddr_t data,
    __unused struct ifnet *ifp,
    __unused struct proc *p)
{
	struct kern_event_pcb *pcb;

	switch (cmd) {
	case SIOCGKEVID:
		*(u_int32_t *) data = static_event_id;
		break;

	case SIOCSKEVFILT: {
		struct kev_request *req = (struct kev_request *) data;

		pcb = (struct kern_event_pcb *) so->so_pcb;
		pcb->evp_vendor_code_filter = req->vendor_code;
		pcb->evp_class_filter = req->kev_class;
		pcb->evp_subclass_filter = req->kev_subclass;
		break;
	}

	case SIOCGKEVFILT: {
		struct kev_request *req = (struct kev_request *) data;

		pcb = (struct kern_event_pcb *) so->so_pcb;
		req->vendor_code = pcb->evp_vendor_code_filter;
		req->kev_class = pcb->evp_class_filter;
		req->kev_subclass = pcb->evp_subclass_filter;
		break;
	}

	case SIOCGKEVVENDOR: {
		struct kev_vendor_code *vendor = (struct kev_vendor_code *)data;

		/* Make sure string is NULL terminated */
		vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
		return (net_str_id_find_internal(vendor->vendor_string,
		    &vendor->vendor_code, NSI_VENDOR_CODE, 0));
	}

	default:
		return (ENOTSUP);
	}

	return (0);
}

#endif /* SOCKETS */


/*
 * fill_kqueueinfo - describe a kqueue in a struct kqueue_info.
 *
 * Fills the size (queued event count), block size (size of the event
 * structure in use) and mode (S_IFIFO) fields of kinfo->kq_stat, and ORs
 * PROC_KQUEUE_SELECT / PROC_KQUEUE_SLEEP into kinfo->kq_state when the
 * kqueue has KQ_SEL / KQ_SLEEP set.  Other fields of kinfo are not
 * written.  Always returns 0.
 */
int
fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
{
	struct vinfo_stat *vst = &kinfo->kq_stat;

	vst->vst_mode = S_IFIFO;
	vst->vst_size = kq->kq_count;
	if (kq->kq_state & KQ_KEV64)
		vst->vst_blksize = sizeof(struct kevent64_s);
	else
		vst->vst_blksize = sizeof(struct kevent);

	if (kq->kq_state & KQ_SEL)
		kinfo->kq_state |=  PROC_KQUEUE_SELECT;
	if (kq->kq_state & KQ_SLEEP)
		kinfo->kq_state |= PROC_KQUEUE_SLEEP;

	return (0);
}


/*
 * knote_markstayqueued - flag a knote KN_STAYQUEUED and queue it.
 *
 * Takes and releases the kqueue lock itself.
 */
void
knote_markstayqueued(struct knote *kn)
{
	struct kqueue *owner = kn->kn_kq;

	kqlock(owner);
	kn->kn_status |= KN_STAYQUEUED;
	knote_enqueue(kn);
	kqunlock(owner);
}

## wait_queue.c
/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_FREE_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	wait_queue.c (adapted from sched_prim.c)
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1986
 *
 *	Primitives for manipulating wait queues: either global
 *	ones from sched_prim.c, or private ones associated with
 *	particular structures (ports, semaphores, etc.).
 */

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/zalloc.h>
#include <kern/queue.h>
#include <kern/spl.h>
#include <mach/sync_policy.h>
#include <kern/mach_param.h>
#include <kern/sched_prim.h>

#include <kern/wait_queue.h>
#include <vm/vm_kern.h>

/* forward declarations */
static boolean_t wait_queue_member_locked(
			wait_queue_t		wq,
			wait_queue_set_t	wq_set);

static void wait_queues_init(void);

/*
 * Upper bounds on the number of wait queues, wait queue sets and wait
 * queue links.  Each replacement list is fully parenthesized so the macros
 * keep their value when used inside a larger expression such as
 * "WAIT_QUEUE_LINK_MAX * sizeof(...)".
 */
#define WAIT_QUEUE_MAX (thread_max)
#define WAIT_QUEUE_SET_MAX (task_max * 3)
#define WAIT_QUEUE_LINK_MAX (PORT_MAX / 2 + (WAIT_QUEUE_MAX * WAIT_QUEUE_SET_MAX) / 64)

/* zones for links, sets and queues; created in wait_queue_bootstrap() */
static zone_t _wait_queue_link_zone;
static zone_t _wait_queue_set_zone;
static zone_t _wait_queue_zone;

/* see rdar://6737748&5561610; we need an unshadowed
 * definition of a WaitQueueLink for debugging,
 * but it needs to be used somewhere to wind up in
 * the dSYM file. */
volatile WaitQueueLink *unused_except_for_debugging;


/*
 *	Waiting protocols and implementation:
 *
 *	Each thread may be waiting for exactly one event; this event
 *	is set using assert_wait().  That thread may be awakened either
 *	by performing a thread_wakeup_prim() on its event,
 *	or by directly waking that thread up with clear_wait().
 *
 *	The implementation of wait events uses a hash table.  Each
 *	bucket is a queue of threads having the same hash function
 *	value; the chain for the queue (linked list) is the run queue
 *	field.  [It is not possible to be waiting and runnable at the
 *	same time.]
 *
 *	Locks on both the thread and on the hash buckets govern the
 *	wait event field and the queue chain field.  Because wakeup
 *	operations only have the event as an argument, the event hash
 *	bucket must be locked before any thread.
 *
 *	Scheduling operations may also occur at interrupt level; therefore,
 *	interrupts below splsched() must be prevented when holding
 *	thread or hash bucket locks.
 *
 *	The wait event hash table declarations are as follows:
 */

/*
 * The table starts out as a single statically allocated queue;
 * wait_queues_init() replaces it with an allocated table and updates
 * num_wait_queues to match.
 */
struct wait_queue boot_wait_queue[1];
__private_extern__ struct wait_queue *wait_queues = &boot_wait_queue[0];
__private_extern__ uint32_t num_wait_queues = 1;

/*
 * P2ROUNDUP: round x (truncated to 32 bits) up to a multiple of align;
 *	the negate-and-mask trick is only valid when align is a power of two.
 * ROUNDDOWN: round x down to a multiple of y using integer division.
 */
#define	P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))
#define ROUNDDOWN(x,y)	(((x)/(y))*(y))

/*
 * compute_wait_hash_size - pick the byte size of the wait queue hash table.
 *
 * A "wqsize" boot argument, when present, is returned unchanged.
 * Otherwise the size is that of thread_max / 11 wait queues, rounded up
 * to a multiple of PAGE_SIZE.
 */
static uint32_t
compute_wait_hash_size(void)
{
	uint32_t table_bytes, nqueues;

	if (PE_parse_boot_argn("wqsize", &table_bytes, sizeof(table_bytes))) {
		return (table_bytes);
	}

	nqueues = thread_max / 11;
	table_bytes = P2ROUNDUP(nqueues * sizeof(struct wait_queue), PAGE_SIZE);

	return table_bytes;
}

/*
 * wait_queues_init - allocate and initialize the global wait queue table.
 *
 * Sizes the table from compute_wait_hash_size(), trims the queue count to
 * a power of two, allocates the memory and initializes every queue with
 * FIFO policy.  Panics if the allocation fails.
 */
static void
wait_queues_init(void)
{
	uint32_t	i, whsize, qsz;
	kern_return_t	kret;

	/*
	 * Determine the amount of memory we're willing to reserve for
	 * the waitqueue hash table
	 */
	whsize = compute_wait_hash_size();

	/* Determine the number of waitqueues we can fit. */
	qsz = sizeof (struct wait_queue);
	whsize = ROUNDDOWN(whsize, qsz);
	num_wait_queues = whsize / qsz;

	/*
	 * The hash algorithm requires that this be a power of 2, so we
	 * just mask off all the low-order bits.
	 *
	 * Set bits are cleared from bit 0 upwards until at most one
	 * remains, which rounds the count down to a power of two.
	 */
	for (i = 0; i < 31; i++) {
		uint32_t bit = (1 << i);
		if ((num_wait_queues & bit) == num_wait_queues)
			break;
		num_wait_queues &= ~bit;
	}
	assert(num_wait_queues > 0);

	/* Now determine how much memory we really need. */
	whsize = P2ROUNDUP(num_wait_queues * qsz, PAGE_SIZE);

	kret = kernel_memory_allocate(kernel_map, (vm_offset_t *) &wait_queues,
	    whsize, 0, KMA_KOBJECT|KMA_NOPAGEWAIT);

	if (kret != KERN_SUCCESS || wait_queues == NULL)
		panic("kernel_memory_allocate() failed to allocate wait queues, error: %d, whsize: 0x%x", kret, whsize);

	for (i = 0; i < num_wait_queues; i++) {
		wait_queue_init(&wait_queues[i], SYNC_POLICY_FIFO);
	}
}

/*
 * wait_queue_bootstrap - set up the wait queue subsystem.
 *
 * Builds the global wait queue table, then creates the three zones used
 * for dynamically allocated wait queues, wait queue sets and wait queue
 * links.  Z_NOENCRYPT is set on each zone.
 */
void
wait_queue_bootstrap(void)
{
	wait_queues_init();
	_wait_queue_zone = zinit(sizeof(struct wait_queue),
				      WAIT_QUEUE_MAX * sizeof(struct wait_queue),
				      sizeof(struct wait_queue),
				      "wait queues");
	zone_change(_wait_queue_zone, Z_NOENCRYPT, TRUE);

	_wait_queue_set_zone = zinit(sizeof(struct wait_queue_set),
				      WAIT_QUEUE_SET_MAX * sizeof(struct wait_queue_set),
				      sizeof(struct wait_queue_set),
				      "wait queue sets");
	zone_change(_wait_queue_set_zone, Z_NOENCRYPT, TRUE);

	_wait_queue_link_zone = zinit(sizeof(struct _wait_queue_link),
				      WAIT_QUEUE_LINK_MAX * sizeof(struct _wait_queue_link),
				      sizeof(struct _wait_queue_link),
				      "wait queue links");
	zone_change(_wait_queue_link_zone, Z_NOENCRYPT, TRUE);
}

/*
 *	Routine:        wait_queue_init
 *	Purpose:
 *		Initialize a previously allocated wait queue: set its
 *		ordering from the policy, mark it as an initialized queue,
 *		clear its event mask, and set up its thread queue and
 *		interlock.
 *	Returns:
 *		KERN_SUCCESS - The wait_queue_t was initialized
 *		KERN_INVALID_ARGUMENT - The policy parameter was invalid
 *			(SYNC_POLICY_FIXED_PRIORITY is not supported)
 */
kern_return_t
wait_queue_init(
	wait_queue_t wq,
	int policy)
{
	/* only FIFO and LIFO for now */
	if (policy & SYNC_POLICY_FIXED_PRIORITY) {
		return KERN_INVALID_ARGUMENT;
	}

	wq->wq_type = _WAIT_QUEUE_inited;
	wq->wq_fifo = (policy & SYNC_POLICY_REVERSED) ? FALSE : TRUE;
	wq->wq_eventmask = 0;

	queue_init(&wq->wq_queue);
	hw_lock_init(&wq->wq_interlock);

	return KERN_SUCCESS;
}

/*
 *	Routine:        wait_queue_alloc
 *	Purpose:
 *		Allocate and initialize a wait queue for use outside of
 *		the mach part of the kernel.
 *	Conditions:
 *		Nothing locked - can block.
 *	Returns:
 *		The allocated and initialized wait queue
 *		WAIT_QUEUE_NULL if there is a resource shortage or the
 *		policy is rejected by wait_queue_init()
 */
wait_queue_t
wait_queue_alloc(
	int policy)
{
	wait_queue_t fresh;

	fresh = (wait_queue_t) zalloc(_wait_queue_zone);
	if (fresh == WAIT_QUEUE_NULL)
		return WAIT_QUEUE_NULL;

	if (wait_queue_init(fresh, policy) != KERN_SUCCESS) {
		/* bad policy: give the memory back */
		zfree(_wait_queue_zone, fresh);
		return WAIT_QUEUE_NULL;
	}

	return fresh;
}

/*
 *	Routine:        wait_queue_free
 *	Purpose:
 *		Free an allocated wait queue.
 *	Conditions:
 *		May block.
 *	Returns:
 *		KERN_SUCCESS - The wait queue was freed
 *		KERN_INVALID_ARGUMENT - wq is not an initialized wait queue
 *		KERN_FAILURE - The wait queue still has entries on it
 */
kern_return_t
wait_queue_free(
	wait_queue_t wq)
{
	if (!wait_queue_is_queue(wq)) {
		return KERN_INVALID_ARGUMENT;
	}

	if (queue_empty(&wq->wq_queue)) {
		zfree(_wait_queue_zone, wq);
		return KERN_SUCCESS;
	}

	return KERN_FAILURE;
}

/*
 *	Routine:        wait_queue_set_init
 *	Purpose:
 *		Initialize a previously allocated wait queue set: the
 *		embedded wait queue is initialized and retyped as a set,
 *		preposting is enabled when the policy asks for it, and the
 *		set-link and prepost queues start out empty.
 *	Returns:
 *		KERN_SUCCESS - The wait_queue_set_t was initialized
 *		KERN_INVALID_ARGUMENT - The policy parameter was invalid
 */
kern_return_t
wait_queue_set_init(
	wait_queue_set_t wqset,
	int policy)
{
	kern_return_t kr = wait_queue_init(&wqset->wqs_wait_queue, policy);

	if (kr != KERN_SUCCESS) {
		return kr;
	}

	wqset->wqs_wait_queue.wq_type = _WAIT_QUEUE_SET_inited;
	wqset->wqs_wait_queue.wq_prepost =
	    (policy & SYNC_POLICY_PREPOST) ? TRUE : FALSE;

	queue_init(&wqset->wqs_setlinks);
	queue_init(&wqset->wqs_preposts);

	return KERN_SUCCESS;
}


/*
 *	Routine:        wait_queue_sub_init
 *	Purpose:
 *		Alternate name for wait_queue_set_init(); forwards both
 *		arguments and returns its result unchanged.
 */
kern_return_t
wait_queue_sub_init(
	wait_queue_set_t wqset,
	int policy)
{
	kern_return_t kr;

	kr = wait_queue_set_init(wqset, policy);
	return kr;
}

/*
 *	Routine:        wait_queue_sub_clearrefs
 *	Purpose:
 *		Empty the prepost queue of a wait queue set by removing
 *		its links one at a time.
 *	Conditions:
 *		Raises to splsched and takes the set lock internally.
 *	Returns:
 *		KERN_SUCCESS - The prepost queue was emptied
 *		KERN_INVALID_ARGUMENT - wq_set is not a wait queue set
 */
kern_return_t
wait_queue_sub_clearrefs(
        wait_queue_set_t wq_set)
{
	wait_queue_link_t link;
	queue_t preposts;
	spl_t saved_spl;

	if (!wait_queue_is_set(wq_set)) {
		return KERN_INVALID_ARGUMENT;
	}

	saved_spl = splsched();
	wqs_lock(wq_set);
	for (preposts = &wq_set->wqs_preposts; !queue_empty(preposts); ) {
		queue_remove_first(preposts, link, wait_queue_link_t, wql_preposts);
		assert(!wql_is_preposted(link));
	}
	wqs_unlock(wq_set);
	splx(saved_spl);

	return KERN_SUCCESS;
}

/*
 *	Routine:        wait_queue_set_alloc
 *	Purpose:
 *		Allocate and initialize a wait queue set for
 *		use outside of the mach part of the kernel.
 *	Conditions:
 *		May block.
 *	Returns:
 *		The allocated and initialized wait queue set
 *		WAIT_QUEUE_SET_NULL if there is a resource shortage or the
 *		policy is rejected by wait_queue_set_init()
 */
wait_queue_set_t
wait_queue_set_alloc(
    int policy)
{
	wait_queue_set_t fresh;

	fresh = (wait_queue_set_t) zalloc(_wait_queue_set_zone);
	if (fresh == WAIT_QUEUE_SET_NULL)
		return WAIT_QUEUE_SET_NULL;

	if (wait_queue_set_init(fresh, policy) != KERN_SUCCESS) {
		/* bad policy: give the memory back */
		zfree(_wait_queue_set_zone, fresh);
		return WAIT_QUEUE_SET_NULL;
	}

	return fresh;
}

/*
 *     Routine:        wait_queue_set_free
 *     Purpose:
 *             Free an allocated wait queue set
 *     Conditions:
 *             May block.
 *     Returns:
 *             KERN_SUCCESS - The set was freed
 *             KERN_INVALID_ARGUMENT - wq_set is not a wait queue set
 *             KERN_FAILURE - The set's own wait queue is not empty
 */
kern_return_t
wait_queue_set_free(
	wait_queue_set_t wq_set)
{
	if (!wait_queue_is_set(wq_set)) {
		return KERN_INVALID_ARGUMENT;
	}

	if (queue_empty(&wq_set->wqs_wait_queue.wq_queue)) {
		zfree(_wait_queue_set_zone, wq_set);
		return KERN_SUCCESS;
	}

	return KERN_FAILURE;
}


/*
 *     Routine:        wait_queue_set_size
 *     Routine:        wait_queue_link_size
 *     Purpose:
 *             Return the size, in bytes, of the opaque wait queue set
 *             and wait queue link structures.
 */
unsigned int
wait_queue_set_size(void)
{
	return sizeof(WaitQueueSet);
}

unsigned int
wait_queue_link_size(void)
{
	return sizeof(WaitQueueLink);
}

/*
 * Declare a unique type for wait queue link structures.
 *
 * Only the addresses of these file-local statics are used: each address
 * is a distinct tag value that this file stores into, and compares
 * against, an element's type field (wqe_type / wql_type).
 */
static unsigned int _wait_queue_link;
static unsigned int _wait_queue_link_noalloc;
static unsigned int _wait_queue_unlinked;

/* tag written by wait_queue_link() on a link it took from the link zone */
#define WAIT_QUEUE_LINK ((void *)&_wait_queue_link)
/* tag written by wait_queue_link_noalloc() on a caller-supplied link */
#define WAIT_QUEUE_LINK_NOALLOC ((void *)&_wait_queue_link_noalloc)
/* tag written by wait_queue_link_allocate() and wait_queue_unlink_locked() */
#define WAIT_QUEUE_UNLINKED ((void *)&_wait_queue_unlinked)

/*
 * Sanity check for one element on a wait queue's wq_queue list: the
 * element must point back at the owning queue, and following its
 * predecessor's forward pointer must lead back to the element.
 * The condition is routed through WQASSERT; the message and the two
 * pointer arguments are only consumed by the WQASSERT variant that
 * panics.  Pointers are formatted with %p (not %#x) so the conversion
 * matches the argument type and the values are not truncated.
 */
#define WAIT_QUEUE_ELEMENT_CHECK(wq, wqe) \
	WQASSERT(((wqe)->wqe_queue == (wq) && \
	  queue_next(queue_prev((queue_t) (wqe))) == (queue_t)(wqe)), \
	  "wait queue element list corruption: wq=%p, wqe=%p", \
	  (wq), (wqe))

/*
 * Step backwards / forwards along a set's wqs_setlinks list.  When
 * "wql" is really the list head (&wqs->wqs_setlinks) the head itself
 * is used as the queue entry; otherwise the link's embedded
 * wql_setlinks entry is used.  The neighbour is returned cast to
 * wait_queue_link_t.
 */
#define WQSPREV(wqs, wql) ((wait_queue_link_t)queue_prev( \
			((&(wqs)->wqs_setlinks == (queue_t)(wql)) ? \
			(queue_t)(wql) : &(wql)->wql_setlinks)))

#define WQSNEXT(wqs, wql) ((wait_queue_link_t)queue_next( \
			((&(wqs)->wqs_setlinks == (queue_t)(wql)) ? \
			(queue_t)(wql) : &(wql)->wql_setlinks)))

/*
 * Sanity check for one link on a set's wqs_setlinks list: the link
 * must carry one of the two "linked" type tags, must name this set as
 * its wql_setqueue, must reference a wait queue whose wq_type is one
 * of the two inited values, and prev-then-next must return to the
 * link.  As above, pointers are reported with %p.
 */
#define WAIT_QUEUE_SET_LINK_CHECK(wqs, wql) \
		WQASSERT(((((wql)->wql_type == WAIT_QUEUE_LINK) || \
			   ((wql)->wql_type == WAIT_QUEUE_LINK_NOALLOC)) && \
			((wql)->wql_setqueue == (wqs)) && \
			(((wql)->wql_queue->wq_type == _WAIT_QUEUE_inited) || \
			 ((wql)->wql_queue->wq_type == _WAIT_QUEUE_SET_inited)) && \
			(WQSNEXT((wqs), WQSPREV((wqs),(wql))) == (wql))), \
			"wait queue set links corruption: wqs=%p, wql=%p", \
			 (wqs), (wql))

/*
 * Two flavours of the checking machinery, selected by _WAIT_QUEUE_DEBUG_:
 *  - defined:   WQASSERT panics with the supplied message and arguments
 *               when the condition is false, and the whole-list checkers
 *               below walk every element.
 *  - undefined: WQASSERT reduces to assert(e) (message and arguments are
 *               dropped) and the whole-list checkers expand to nothing.
 */
#if defined(_WAIT_QUEUE_DEBUG_)

#define WQASSERT(e, s, p0, p1) ((e) ? 0 : panic(s, p0, p1))

/* Run WAIT_QUEUE_ELEMENT_CHECK on every element of wq's wq_queue list. */
#define WAIT_QUEUE_CHECK(wq) \
MACRO_BEGIN \
	queue_t q2 = &(wq)->wq_queue; \
	wait_queue_element_t wqe2 = (wait_queue_element_t) queue_first(q2); \
	while (!queue_end(q2, (queue_entry_t)wqe2)) { \
		WAIT_QUEUE_ELEMENT_CHECK((wq), wqe2); \
		wqe2 = (wait_queue_element_t) queue_next((queue_t) wqe2); \
	} \
MACRO_END

/* Run WAIT_QUEUE_SET_LINK_CHECK on every link of wqs's wqs_setlinks list. */
#define WAIT_QUEUE_SET_CHECK(wqs) \
MACRO_BEGIN \
	queue_t q2 = &(wqs)->wqs_setlinks; \
	wait_queue_link_t wql2 = (wait_queue_link_t) queue_first(q2); \
	while (!queue_end(q2, (queue_entry_t)wql2)) { \
		WAIT_QUEUE_SET_LINK_CHECK((wqs), wql2); \
		wql2 = (wait_queue_link_t) wql2->wql_setlinks.next; \
	} \
MACRO_END

#else /* !_WAIT_QUEUE_DEBUG_ */

/* Only the condition survives; s, p0 and p1 are discarded. */
#define WQASSERT(e, s, p0, p1) assert(e)

/* Whole-list walks are compiled out. */
#define WAIT_QUEUE_CHECK(wq)
#define WAIT_QUEUE_SET_CHECK(wqs)

#endif /* !_WAIT_QUEUE_DEBUG_ */

/*
 *	Routine:	wait_queue_global
 *	Purpose:
 *		Indicate if this wait queue is a global wait queue or not,
 *		i.e. whether its address lies inside the wait_queues table.
 *	Returns:
 *		TRUE  - wq is in [wait_queues, wait_queues + num_wait_queues)
 *		FALSE - otherwise
 *	Note:
 *		The upper bound is exclusive.  wait_queues + num_wait_queues
 *		is the one-past-the-end address and does not name a table
 *		entry, so it must not be classified as global.
 *		(Assumes wait_queues holds exactly num_wait_queues entries,
 *		as the names and this range test imply.)
 */
static boolean_t
wait_queue_global(
	wait_queue_t wq)
{
	if ((wq >= wait_queues) && (wq < (wait_queues + num_wait_queues))) {
		return TRUE;
	}
	return FALSE;
}


/*
 *	Routine:	wait_queue_member_locked
 *	Purpose:
 *		Report whether the wait queue carries a link to the
 *		given set, i.e. whether it is a member of that set.
 *	Conditions:
 *		The wait queue is locked
 *		The set queue is just that, a set queue
 *	Returns:
 *		TRUE if a link element naming wq_set is found on the
 *		wait queue, FALSE otherwise.
 */
static boolean_t
wait_queue_member_locked(
	wait_queue_t wq,
	wait_queue_set_t wq_set)
{
	wait_queue_element_t elem;
	queue_t head;

	assert(wait_queue_held(wq));
	assert(wait_queue_is_set(wq_set));

	head = &wq->wq_queue;

	for (elem = (wait_queue_element_t) queue_first(head);
	     !queue_end(head, (queue_entry_t)elem);
	     elem = (wait_queue_element_t) queue_next((queue_t) elem)) {

		WAIT_QUEUE_ELEMENT_CHECK(wq, elem);

		/* only link elements can tie the queue to a set */
		if (elem->wqe_type != WAIT_QUEUE_LINK &&
		    elem->wqe_type != WAIT_QUEUE_LINK_NOALLOC)
			continue;

		if (((wait_queue_link_t)elem)->wql_setqueue == wq_set)
			return TRUE;
	}

	return FALSE;
}


/*
 *	Routine:	wait_queue_member
 *	Purpose:
 *		Report whether the wait queue is linked into the given
 *		set.  Takes the wait queue lock at splsched around the
 *		locked variant of the check.
 *	Conditions:
 *		The set queue is just that, a set queue
 *	Returns:
 *		FALSE if wq_set is not a set or no link to it is found,
 *		TRUE otherwise.
 */
boolean_t
wait_queue_member(
	wait_queue_t wq,
	wait_queue_set_t wq_set)
{
	boolean_t is_member = FALSE;
	spl_t spl;

	if (wait_queue_is_set(wq_set)) {
		spl = splsched();
		wait_queue_lock(wq);
		is_member = wait_queue_member_locked(wq, wq_set);
		wait_queue_unlock(wq);
		splx(spl);
	}

	return is_member;
}


/*
 *	Routine:	wait_queue_link_internal
 *	Purpose:
 *		Make a wait queue a member of a set by threading the
 *		supplied wait_queue_link structure onto both the wait
 *		queue's element list and the set's link list.
 *	Conditions:
 *		The wait queue being inserted must be inited as a set queue
 *		The wait_queue_link structure must already be properly typed
 *	Returns:
 *		KERN_INVALID_ARGUMENT - wq is not valid or wq_set is not a set
 *		KERN_ALREADY_IN_SET - wq already has a link to wq_set
 *		KERN_SUCCESS - the link was installed
 */
static
kern_return_t
wait_queue_link_internal(
	wait_queue_t wq,
	wait_queue_set_t wq_set,
	wait_queue_link_t wql)
{
	wait_queue_element_t elem;
	queue_t members;
	spl_t spl;

	if (!wait_queue_is_valid(wq) || !wait_queue_is_set(wq_set))
		return KERN_INVALID_ARGUMENT;

	/*
	 * There are probably fewer threads and sets associated with
	 * the wait queue than there are wait queues associated with
	 * the set.  So the duplicate check scans the wait queue side.
	 */
	spl = splsched();
	wait_queue_lock(wq);

	members = &wq->wq_queue;
	for (elem = (wait_queue_element_t) queue_first(members);
	     !queue_end(members, (queue_entry_t)elem);
	     elem = (wait_queue_element_t) queue_next((queue_t) elem)) {
		boolean_t is_link;

		WAIT_QUEUE_ELEMENT_CHECK(wq, elem);

		is_link = (elem->wqe_type == WAIT_QUEUE_LINK ||
			   elem->wqe_type == WAIT_QUEUE_LINK_NOALLOC);
		if (is_link &&
		    ((wait_queue_link_t)elem)->wql_setqueue == wq_set) {
			wait_queue_unlock(wq);
			splx(spl);
			return KERN_ALREADY_IN_SET;
		}
	}

	/*
	 * No existing link to this set: install the new one.  The set
	 * lock is taken while the wait queue lock is still held.
	 */
	wqs_lock(wq_set);

	WAIT_QUEUE_SET_CHECK(wq_set);

	assert(wql->wql_type == WAIT_QUEUE_LINK ||
	       wql->wql_type == WAIT_QUEUE_LINK_NOALLOC);

	/* wait queue side */
	wql->wql_queue = wq;
	wql_clear_prepost(wql);
	queue_enter(&wq->wq_queue, wql, wait_queue_link_t, wql_links);

	/* set side */
	wql->wql_setqueue = wq_set;
	queue_enter(&wq_set->wqs_setlinks, wql, wait_queue_link_t, wql_setlinks);

	wqs_unlock(wq_set);
	wait_queue_unlock(wq);
	splx(spl);

	return KERN_SUCCESS;
}

/*
 *	Routine:	wait_queue_link_noalloc
 *	Purpose:
 *		Insert a set wait queue into a wait queue.  This
 *		requires us to link the two together using the
 *		wait_queue_link structure passed in by the caller;
 *		nothing is allocated here.
 *	Conditions:
 *		The wait queue being inserted must be inited as a set queue
 *	Returns:
 *		Whatever wait_queue_link_internal returns.
 *	Note:
 *		The link is tagged WAIT_QUEUE_LINK_NOALLOC before the
 *		link attempt and the tag is left in place whatever the
 *		outcome.
 */
kern_return_t
wait_queue_link_noalloc(
	wait_queue_t wq,
	wait_queue_set_t wq_set,
	wait_queue_link_t wql)
{
	kern_return_t kr;

	wql->wql_type = WAIT_QUEUE_LINK_NOALLOC;
	kr = wait_queue_link_internal(wq, wq_set, wql);

	return kr;
}

/*
 *	Routine:	wait_queue_link
 *	Purpose:
 *		Insert a set wait queue into a wait queue, using a
 *		wait_queue_link structure taken from the link zone.
 *	Conditions:
 *		The wait queue being inserted must be inited as a set queue
 *	Returns:
 *		KERN_RESOURCE_SHORTAGE - no link structure could be allocated
 *		otherwise the result of wait_queue_link_internal; on any
 *		result other than KERN_SUCCESS the link is returned to
 *		the zone.
 */
kern_return_t
wait_queue_link(
	wait_queue_t wq,
	wait_queue_set_t wq_set)
{
	wait_queue_link_t new_link;
	kern_return_t kr;

	new_link = (wait_queue_link_t) zalloc(_wait_queue_link_zone);
	if (new_link == WAIT_QUEUE_LINK_NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	new_link->wql_type = WAIT_QUEUE_LINK;
	kr = wait_queue_link_internal(wq, wq_set, new_link);
	if (kr == KERN_SUCCESS) {
		return KERN_SUCCESS;
	}

	/* the link never made it onto any list, so it is ours to free */
	zfree(_wait_queue_link_zone, new_link);
	return kr;
}

/*
 *	Routine:	wait_queue_link_allocate
 *	Purpose:
 *		Take a link structure from the link zone, zero it and
 *		tag it WAIT_QUEUE_UNLINKED.
 *	Returns:
 *		The freshly prepared link.  The zalloc result is used
 *		without a NULL check (the original author notes the
 *		allocation "can't fail").
 */
wait_queue_link_t
wait_queue_link_allocate(void)
{
	wait_queue_link_t fresh = zalloc(_wait_queue_link_zone); /* Can't fail */

	bzero(fresh, sizeof(*fresh));
	fresh->wql_type = WAIT_QUEUE_UNLINKED;

	return fresh;
}

/*
 *	Routine:	wait_queue_link_free
 *	Purpose:
 *		Return a link structure to the link zone.
 *	Returns:
 *		KERN_SUCCESS, always.
 */
kern_return_t
wait_queue_link_free(
	wait_queue_link_t wql)
{
	zfree(_wait_queue_link_zone, wql);

	return KERN_SUCCESS;
}


/*
 *	Routine:	wait_queue_unlink_locked
 *	Purpose:
 *		Undo the linkage between a wait queue and a set: take
 *		the link off the wait queue's element list, off the
 *		set's link list and, if it is preposted, off the set's
 *		prepost list, then tag it WAIT_QUEUE_UNLINKED.
 *	Conditions:
 *		Both the wait queue and the set's wait queue are held
 *		(asserted).  The link itself is not freed.
 */
static void
wait_queue_unlink_locked(
	wait_queue_t wq,
	wait_queue_set_t wq_set,
	wait_queue_link_t wql)
{
	assert(wait_queue_held(wq));
	assert(wait_queue_held(&wq_set->wqs_wait_queue));

	/* wait queue side */
	queue_remove(&wq->wq_queue, wql, wait_queue_link_t, wql_links);
	wql->wql_queue = WAIT_QUEUE_NULL;

	/* set side */
	queue_remove(&wq_set->wqs_setlinks, wql, wait_queue_link_t, wql_setlinks);
	wql->wql_setqueue = WAIT_QUEUE_SET_NULL;

	/* pending prepost, if any */
	if (wql_is_preposted(wql)) {
		queue_t preposts = &wq_set->wqs_preposts;

		queue_remove(preposts, wql, wait_queue_link_t, wql_preposts);
	}

	wql->wql_type = WAIT_QUEUE_UNLINKED;

	WAIT_QUEUE_CHECK(wq);
	WAIT_QUEUE_SET_CHECK(wq_set);
}

/*
 *	Routine:	wait_queue_unlink_nofree
 *	Purpose:
 *		Remove the linkage between a wait queue and a set,
 *		handing the linkage structure back to the caller
 *		through *wqlp so it can be freed later.
 *	Conditions:
 *		The wait queue must be a member of the set queue
 *	Returns:
 *		KERN_INVALID_ARGUMENT - wq is not valid or wq_set is not a set
 *		KERN_NOT_IN_SET - no link to wq_set was found (*wqlp untouched)
 *		KERN_SUCCESS - unlinked; *wqlp holds the link
 */
kern_return_t
wait_queue_unlink_nofree(
	wait_queue_t wq,
	wait_queue_set_t wq_set,
	wait_queue_link_t *wqlp)
{
	wait_queue_element_t elem;
	wait_queue_link_t found;
	queue_t head;
	spl_t spl;

	if (!wait_queue_is_valid(wq) || !wait_queue_is_set(wq_set))
		return KERN_INVALID_ARGUMENT;

	spl = splsched();
	wait_queue_lock(wq);

	head = &wq->wq_queue;
	for (elem = (wait_queue_element_t) queue_first(head);
	     !queue_end(head, (queue_entry_t)elem);
	     elem = (wait_queue_element_t) queue_next((queue_t) elem)) {

		WAIT_QUEUE_ELEMENT_CHECK(wq, elem);

		if (elem->wqe_type != WAIT_QUEUE_LINK &&
		    elem->wqe_type != WAIT_QUEUE_LINK_NOALLOC)
			continue;

		found = (wait_queue_link_t)elem;
		if (found->wql_setqueue != wq_set)
			continue;

		/* this is the link to the requested set: tear it down */
		wqs_lock(wq_set);
		wait_queue_unlink_locked(wq, wq_set, found);
		wqs_unlock(wq_set);
		wait_queue_unlock(wq);
		splx(spl);

		*wqlp = found;
		return KERN_SUCCESS;
	}

	wait_queue_unlock(wq);
	splx(spl);

	return KERN_NOT_IN_SET;
}

/*
 *	Routine:	wait_queue_unlink
 *	Purpose:
 *		Remove the linkage between a wait queue and a set.
 *		The linkage structure is returned to the link zone
 *		only when it was tagged WAIT_QUEUE_LINK; a
 *		WAIT_QUEUE_LINK_NOALLOC link is merely unlinked.
 *	Conditions:
 *		The wait queue must be a member of the set queue
 *	Returns:
 *		KERN_INVALID_ARGUMENT - wq is not valid or wq_set is not a set
 *		KERN_NOT_IN_SET - no link to wq_set was found
 *		KERN_SUCCESS - the link was removed
 */
kern_return_t
wait_queue_unlink(
	wait_queue_t wq,
	wait_queue_set_t wq_set)
{
	wait_queue_element_t elem;
	wait_queue_link_t target;
	boolean_t from_zone;
	queue_t head;
	spl_t spl;

	if (!wait_queue_is_valid(wq) || !wait_queue_is_set(wq_set))
		return KERN_INVALID_ARGUMENT;

	spl = splsched();
	wait_queue_lock(wq);

	head = &wq->wq_queue;
	for (elem = (wait_queue_element_t) queue_first(head);
	     !queue_end(head, (queue_entry_t)elem);
	     elem = (wait_queue_element_t) queue_next((queue_t) elem)) {

		WAIT_QUEUE_ELEMENT_CHECK(wq, elem);

		if (elem->wqe_type != WAIT_QUEUE_LINK &&
		    elem->wqe_type != WAIT_QUEUE_LINK_NOALLOC)
			continue;

		target = (wait_queue_link_t)elem;
		if (target->wql_setqueue != wq_set)
			continue;

		/*
		 * Sample the tag now: unlinking overwrites it with
		 * WAIT_QUEUE_UNLINKED.
		 */
		from_zone = (target->wql_type == WAIT_QUEUE_LINK);

		wqs_lock(wq_set);
		wait_queue_unlink_locked(wq, wq_set, target);
		wqs_unlock(wq_set);
		wait_queue_unlock(wq);
		splx(spl);

		if (from_zone)
			zfree(_wait_queue_link_zone, target);
		return KERN_SUCCESS;
	}

	wait_queue_unlock(wq);
	splx(spl);

	return KERN_NOT_IN_SET;
}

/*
 *	Routine:	wait_queue_unlink_all_nofree_locked
 *	Purpose:
 *		Remove the linkage between a wait queue and all its sets.
 *		Every link removed (both link tags) is appended to the
 *		caller's "links" queue for later freeing.
 *	Conditions:
 *		Wait queue locked.
 */

static void
wait_queue_unlink_all_nofree_locked(
	wait_queue_t wq,
	queue_t links)
{
	wait_queue_element_t elem;
	wait_queue_element_t upcoming;
	wait_queue_set_t owner_set;
	wait_queue_link_t wql;
	queue_t head;

	head = &wq->wq_queue;

	for (elem = (wait_queue_element_t) queue_first(head);
	     !queue_end(head, (queue_entry_t)elem);
	     elem = upcoming) {

		WAIT_QUEUE_ELEMENT_CHECK(wq, elem);

		/* fetch the successor first; elem may leave the list below */
		upcoming = (wait_queue_element_t)
			     queue_next((queue_t) elem);

		if (elem->wqe_type != WAIT_QUEUE_LINK &&
		    elem->wqe_type != WAIT_QUEUE_LINK_NOALLOC)
			continue;

		wql = (wait_queue_link_t)elem;
		owner_set = wql->wql_setqueue;

		wqs_lock(owner_set);
		wait_queue_unlink_locked(wq, owner_set, wql);
		wqs_unlock(owner_set);

		enqueue(links, &wql->wql_links);
	}
}

/*
 *	Routine:	wait_queue_unlink_all_nofree
 *	Purpose:
 *		Remove the linkage between a wait queue and all its sets.
 *		The linkage structures are collected on "links" for the
 *		caller to free later.
 *	Conditions:
 *		Nothing of interest locked.
 *	Returns:
 *		KERN_INVALID_ARGUMENT - wq is not a valid wait queue
 *					(a message is printed)
 *		KERN_SUCCESS - otherwise
 */

kern_return_t
wait_queue_unlink_all_nofree(
	wait_queue_t wq,
	queue_t links)
{
	spl_t spl;

	if (wait_queue_is_valid(wq)) {
		spl = splsched();
		wait_queue_lock(wq);
		wait_queue_unlink_all_nofree_locked(wq, links);
		wait_queue_unlock(wq);
		splx(spl);

		return(KERN_SUCCESS);
	}

	printf("\nReturning KERN_INVALID_ARGUMENT from wait_queue_unlink_all_nofree\n");
	return KERN_INVALID_ARGUMENT;
}

/*
 *	Routine:	wait_queue_unlink_all_locked
 *	Purpose:
 *		Remove the linkage between a locked wait queue and all
 *		its sets.  Links tagged WAIT_QUEUE_LINK are appended to
 *		the "links" queue provided; WAIT_QUEUE_LINK_NOALLOC
 *		links are unlinked but not queued.
 *	Conditions:
 *		Wait queue locked.
 */
static void
wait_queue_unlink_all_locked(
	wait_queue_t wq,
	queue_t links)
{
	wait_queue_element_t elem;
	wait_queue_element_t upcoming;
	wait_queue_set_t owner_set;
	wait_queue_link_t wql;
	queue_t head;

	head = &wq->wq_queue;

	for (elem = (wait_queue_element_t) queue_first(head);
	     !queue_end(head, (queue_entry_t)elem);
	     elem = upcoming) {
		boolean_t from_zone;

		WAIT_QUEUE_ELEMENT_CHECK(wq, elem);

		/* fetch the successor first; elem may leave the list below */
		upcoming = (wait_queue_element_t)
			     queue_next((queue_t) elem);

		/* read the tag before unlinking rewrites it */
		from_zone = (elem->wqe_type == WAIT_QUEUE_LINK);
		if (!from_zone && elem->wqe_type != WAIT_QUEUE_LINK_NOALLOC)
			continue;

		wql = (wait_queue_link_t)elem;
		owner_set = wql->wql_setqueue;

		wqs_lock(owner_set);
		wait_queue_unlink_locked(wq, owner_set, wql);
		wqs_unlock(owner_set);

		if (from_zone)
			enqueue(links, &wql->wql_links);
	}

}


/*
 *	Routine:	wait_queue_unlink_all
 *	Purpose:
 *		Remove the linkage between a wait queue and all its sets.
 *		Linkage structures that were allocated internally are
 *		gathered while locked and returned to the link zone after
 *		the lock is dropped.  The others are the caller's
 *		responsibility.
 *	Conditions:
 *		Nothing of interest locked.
 *	Returns:
 *		KERN_INVALID_ARGUMENT - wq is not a valid wait queue
 *					(a message is printed)
 *		KERN_SUCCESS - otherwise
 */

kern_return_t
wait_queue_unlink_all(
	wait_queue_t wq)
{
	queue_head_t doomed_head;
	queue_t doomed = &doomed_head;
	wait_queue_link_t dead;
	spl_t spl;

	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning KERN_INVALID_ARGUMENT from wait_queue_unlink_all\n");
		return KERN_INVALID_ARGUMENT;
	}

	queue_init(doomed);

	spl = splsched();
	wait_queue_lock(wq);
	wait_queue_unlink_all_locked(wq, doomed);
	wait_queue_unlock(wq);
	splx(spl);

	/* free the collected links now that no lock is held */
	for (; !queue_empty(doomed); ) {
		dead = (wait_queue_link_t) dequeue(doomed);
		zfree(_wait_queue_link_zone, dead);
	}

	return(KERN_SUCCESS);
}

/*
 * Legacy interface naming: forwards to wait_queue_set_unlink_all and
 * returns its result.
 */
kern_return_t
wait_subqueue_unlink_all(
	wait_queue_set_t	wq_set)
{
	kern_return_t kr;

	kr = wait_queue_set_unlink_all(wq_set);
	return kr;
}


/*
 *	Routine:	wait_queue_set_unlink_all_nofree
 *	Purpose:
 *		Remove the linkage between a set wait queue and all its
 *		member wait queues and all the sets it may be a member of.
 *		Every link structure removed is appended to "links" for
 *		later freeing by the caller.
 *	Conditions:
 *		The wait queue must be a set
 *	Returns:
 *		KERN_INVALID_ARGUMENT - wq_set is not a wait queue set
 *		KERN_SUCCESS - otherwise
 */
kern_return_t
wait_queue_set_unlink_all_nofree(
	wait_queue_set_t wq_set,
	queue_t		links)
{
	wait_queue_link_t wql;
	wait_queue_t member;
	queue_t setlinks;
	spl_t spl;

	if (!wait_queue_is_set(wq_set))
		return KERN_INVALID_ARGUMENT;

retry:
	spl = splsched();
	wqs_lock(wq_set);

	/* remove the wait queues that are members of our set */
	setlinks = &wq_set->wqs_setlinks;

	while (!queue_empty(setlinks)) {
		wql = (wait_queue_link_t)queue_first(setlinks);
		WAIT_QUEUE_SET_LINK_CHECK(wq_set, wql);
		member = wql->wql_queue;

		/*
		 * The set lock is already held, so the member queue is
		 * only try-locked.  On failure drop everything, pause,
		 * and start over; links already collected stay on
		 * "links".
		 */
		if (!wait_queue_lock_try(member)) {
			wqs_unlock(wq_set);
			splx(spl);
			delay(1);
			goto retry;
		}

		wait_queue_unlink_locked(member, wq_set, wql);
		wait_queue_unlock(member);
		enqueue(links, &wql->wql_links);
	}

	/* remove this set from sets it belongs to */
	wait_queue_unlink_all_nofree_locked(&wq_set->wqs_wait_queue, links);

	wqs_unlock(wq_set);
	splx(spl);

	return(KERN_SUCCESS);
}

/*
 *	Routine:	wait_queue_set_unlink_all
 *	Purpose:
 *		Remove the linkage between a set wait queue and all its
 *		member wait queues and all the sets it may be members of.
 *		Links tagged WAIT_QUEUE_LINK (dynamically allocated) are
 *		collected and returned to the link zone once the locks
 *		have been dropped.
 *	Conditions:
 *		The wait queue must be a set
 *	Returns:
 *		KERN_INVALID_ARGUMENT - wq_set is not a wait queue set
 *		KERN_SUCCESS - otherwise
 */
kern_return_t
wait_queue_set_unlink_all(
	wait_queue_set_t wq_set)
{
	queue_head_t doomed_head;
	queue_t doomed = &doomed_head;
	wait_queue_link_t wql;
	wait_queue_t member;
	queue_t setlinks;
	spl_t spl;

	if (!wait_queue_is_set(wq_set))
		return KERN_INVALID_ARGUMENT;

	queue_init(doomed);

retry:
	spl = splsched();
	wqs_lock(wq_set);

	/* remove the wait queues that are members of our set */
	setlinks = &wq_set->wqs_setlinks;

	while (!queue_empty(setlinks)) {
		boolean_t from_zone;

		wql = (wait_queue_link_t)queue_first(setlinks);
		WAIT_QUEUE_SET_LINK_CHECK(wq_set, wql);
		member = wql->wql_queue;

		/*
		 * The set lock is already held, so the member queue is
		 * only try-locked.  On failure drop everything, pause,
		 * and start over; links already collected stay queued.
		 */
		if (!wait_queue_lock_try(member)) {
			wqs_unlock(wq_set);
			splx(spl);
			delay(1);
			goto retry;
		}

		/* read the tag before unlinking rewrites it */
		from_zone = (wql->wql_type == WAIT_QUEUE_LINK);
		wait_queue_unlink_locked(member, wq_set, wql);
		wait_queue_unlock(member);
		if (from_zone)
			enqueue(doomed, &wql->wql_links);
	}

	/* remove this set from sets it belongs to */
	wait_queue_unlink_all_locked(&wq_set->wqs_wait_queue, doomed);

	wqs_unlock(wq_set);
	splx(spl);

	/* free the collected links now that no lock is held */
	for (; !queue_empty (doomed); ) {
		wql = (wait_queue_link_t) dequeue(doomed);
		zfree(_wait_queue_link_zone, wql);
	}
	return(KERN_SUCCESS);
}

/*
 *	Routine:	wait_queue_set_unlink_one
 *	Purpose:
 *		Remove one specific link from a set (and from the wait
 *		queue it is attached to).  A link already tagged
 *		WAIT_QUEUE_UNLINKED is left alone.  The link is not freed.
 *	Conditions:
 *		wq_set must be a set (asserted).
 *	Returns:
 *		KERN_SUCCESS, always.
 */
kern_return_t
wait_queue_set_unlink_one(
	wait_queue_set_t wq_set,
	wait_queue_link_t wql)
{
	wait_queue_t member;
	spl_t spl;

	assert(wait_queue_is_set(wq_set));

retry:
	spl = splsched();
	wqs_lock(wq_set);

	WAIT_QUEUE_SET_CHECK(wq_set);

	/* Nothing to do if already unlinked, e.g. by selclearthread() */
	if (wql->wql_type != WAIT_QUEUE_UNLINKED) {

		WAIT_QUEUE_SET_LINK_CHECK(wq_set, wql);

		/*
		 * On a wait queue, and we hold set queue lock, so the
		 * wait queue may only be try-locked.  On failure drop
		 * everything, pause, and start over.
		 */
		member = wql->wql_queue;
		if (!wait_queue_lock_try(member)) {
			wqs_unlock(wq_set);
			splx(spl);
			delay(1);
			goto retry;
		}

		wait_queue_unlink_locked(member, wq_set, wql);
		wait_queue_unlock(member);
	}

	wqs_unlock(wq_set);
	splx(spl);

	return KERN_SUCCESS;
}

/*
 *	Routine:	wait_queue_assert_wait64_locked
 *	Purpose:
 *		Insert the supplied thread into the supplied wait queue
 *		waiting for a particular event to be posted to that queue.
 *
 *	Conditions:
 *		The wait queue is assumed locked.
 *		The waiting thread is assumed locked.
 *
 *	Returns:
 *		THREAD_AWAKENED without queueing when wq is a set that is
 *		preposted and event is NO_EVENT64; otherwise the result of
 *		thread_mark_wait_locked (the thread is queued only when
 *		that result is THREAD_WAITING).
 *		Panics if wait_queue_assert_possible(thread) is false.
 */
__private_extern__ wait_result_t
wait_queue_assert_wait64_locked(
	wait_queue_t wq,
	event64_t event,
	wait_interrupt_t interruptible,
	wait_timeout_urgency_t urgency,
	uint64_t deadline,
	uint64_t leeway,
	thread_t thread)
{
	wait_result_t wres;
	boolean_t is_realtime;
	boolean_t at_head;

	if (!wait_queue_assert_possible(thread))
		panic("wait_queue_assert_wait64_locked");

	/* a preposted set satisfies a generic-event wait immediately */
	if (wq->wq_type == _WAIT_QUEUE_SET_inited) {
		wait_queue_set_t wqs = (wait_queue_set_t)wq;

		if (event == NO_EVENT64 && wqs_is_preposted(wqs))
			return(THREAD_AWAKENED);
	}

	/*
	 * Realtime threads get priority for wait queue placements.
	 * This allows wait_queue_wakeup_one to prefer a waiting
	 * realtime thread, similar in principle to performing
	 * a wait_queue_wakeup_all and allowing scheduler prioritization
	 * to run the realtime thread, but without causing the
	 * lock contention of that scenario.
	 */
	is_realtime = (thread->sched_pri >= BASEPRI_REALTIME);

	wres = thread_mark_wait_locked(thread, interruptible);
	if (wres != THREAD_WAITING)
		return(wres);

	/*
	 * This is the extent to which we currently take scheduling
	 * attributes into account: non-FIFO queues, VM-privileged
	 * threads and realtime threads go to the front of the queue,
	 * everything else to the back.
	 */
	at_head = (!wq->wq_fifo
		   || (thread->options & TH_OPT_VMPRIV)
		   || is_realtime);
	if (at_head)
		enqueue_head(&wq->wq_queue, (queue_entry_t) thread);
	else
		enqueue_tail(&wq->wq_queue, (queue_entry_t) thread);

	thread->wait_event = event;
	thread->wait_queue = wq;

	/* a zero deadline means no wait timer is armed */
	if (deadline != 0) {
		if (!timer_call_enter_with_leeway(&thread->wait_timer, NULL,
			deadline, leeway, urgency, FALSE))
			thread->wait_timer_active++;
		thread->wait_timer_is_set = TRUE;
	}

	/* global queues accumulate the mask bits of the events waited on */
	if (wait_queue_global(wq))
		wq->wq_eventmask = wq->wq_eventmask | CAST_TO_EVENT_MASK(event);

	return(wres);
}

/*
 *	Routine:	wait_queue_assert_wait
 *	Purpose:
 *		Insert the current thread into the supplied wait queue
 *		waiting for a particular event to be posted to that queue.
 *		Uses TIMEOUT_URGENCY_SYS_NORMAL and a leeway of 0.
 *
 *	Conditions:
 *		nothing of interest locked.
 *	Returns:
 *		THREAD_RESTART (also stored in the thread's wait_result,
 *		with a message printed) when wq is not a valid wait
 *		queue; otherwise the result of
 *		wait_queue_assert_wait64_locked.
 */
wait_result_t
wait_queue_assert_wait(
	wait_queue_t wq,
	event_t event,
	wait_interrupt_t interruptible,
	uint64_t deadline)
{
	thread_t self = current_thread();
	wait_result_t wres;
	spl_t spl;

	/* If it is an invalid wait queue, you can't wait on it */
	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning thread->wait_result = THREAD_RESTART from wait_queue_assert_wait\n");
		self->wait_result = THREAD_RESTART;
		return self->wait_result;
	}

	spl = splsched();
	wait_queue_lock(wq);
	thread_lock(self);

	wres = wait_queue_assert_wait64_locked(wq,
					       CAST_DOWN(event64_t,event),
					       interruptible,
					       TIMEOUT_URGENCY_SYS_NORMAL,
					       deadline,
					       0,
					       self);

	thread_unlock(self);
	wait_queue_unlock(wq);
	splx(spl);

	return(wres);
}

/*
 *	Routine:	wait_queue_assert_wait_with_leeway
 *	Purpose:
 *		Insert the current thread into the supplied wait queue
 *		waiting for a particular event to be posted to that queue.
 *		Deadline values are specified with urgency and leeway.
 *
 *	Conditions:
 *		nothing of interest locked.
 *	Returns:
 *		THREAD_RESTART (also stored in the thread's wait_result,
 *		with a message printed) when wq is not a valid wait
 *		queue; otherwise the result of
 *		wait_queue_assert_wait64_locked.
 */
wait_result_t
wait_queue_assert_wait_with_leeway(
	wait_queue_t wq,
	event_t event,
	wait_interrupt_t interruptible,
	wait_timeout_urgency_t urgency,
	uint64_t deadline,
	uint64_t leeway)
{
	thread_t self = current_thread();
	wait_result_t wres;
	spl_t spl;

	/* If it is an invalid wait queue, you can't wait on it */
	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning thread->wait_result = THREAD_RESTART from wait_queue_assert_wait_with_leeway\n");
		self->wait_result = THREAD_RESTART;
		return self->wait_result;
	}

	spl = splsched();
	wait_queue_lock(wq);
	thread_lock(self);

	wres = wait_queue_assert_wait64_locked(wq,
					       CAST_DOWN(event64_t,event),
					       interruptible,
					       urgency,
					       deadline,
					       leeway,
					       self);

	thread_unlock(self);
	wait_queue_unlock(wq);
	splx(spl);

	return(wres);
}

/*
 *	Routine:	wait_queue_assert_wait64
 *	Purpose:
 *		Insert the current thread into the supplied wait queue
 *		waiting for a particular (64-bit) event to be posted to
 *		that queue.  Uses TIMEOUT_URGENCY_SYS_NORMAL and a
 *		leeway of 0.
 *	Conditions:
 *		nothing of interest locked.
 *	Returns:
 *		THREAD_RESTART (also stored in the thread's wait_result,
 *		with a message printed) when wq is not a valid wait
 *		queue; otherwise the result of
 *		wait_queue_assert_wait64_locked.
 */
wait_result_t
wait_queue_assert_wait64(
	wait_queue_t wq,
	event64_t event,
	wait_interrupt_t interruptible,
	uint64_t deadline)
{
	thread_t self = current_thread();
	wait_result_t wres;
	spl_t spl;

	/* If it is an invalid wait queue, you can't wait on it */
	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning thread->wait_result = THREAD_RESTART from wait_queue_assert_wait64\n");
		self->wait_result = THREAD_RESTART;
		return self->wait_result;
	}

	spl = splsched();
	wait_queue_lock(wq);
	thread_lock(self);

	wres = wait_queue_assert_wait64_locked(wq,
					       event,
					       interruptible,
					       TIMEOUT_URGENCY_SYS_NORMAL,
					       deadline,
					       0,
					       self);

	thread_unlock(self);
	wait_queue_unlock(wq);
	splx(spl);

	return(wres);
}

/*
 *	Routine:	wait_queue_assert_wait64_with_leeway
 *	Purpose:
 *		Insert the current thread into the supplied wait queue
 *		waiting for a particular (64-bit) event to be posted to
 *		that queue.
 *		Deadline values are specified with urgency and leeway.
 *	Conditions:
 *		nothing of interest locked.
 *	Returns:
 *		THREAD_RESTART (also stored in the thread's wait_result,
 *		with a message printed) when wq is not a valid wait
 *		queue; otherwise the result of
 *		wait_queue_assert_wait64_locked.
 */
wait_result_t
wait_queue_assert_wait64_with_leeway(
	wait_queue_t wq,
	event64_t event,
	wait_interrupt_t interruptible,
	wait_timeout_urgency_t urgency,
	uint64_t deadline,
	uint64_t leeway)
{
	thread_t self = current_thread();
	wait_result_t wres;
	spl_t spl;

	/* If it is an invalid wait queue, you can't wait on it */
	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning thread->wait_result = THREAD_RESTART from wait_queue_assert_wait64_with_leeway\n");
		self->wait_result = THREAD_RESTART;
		return self->wait_result;
	}

	spl = splsched();
	wait_queue_lock(wq);
	thread_lock(self);

	wres = wait_queue_assert_wait64_locked(wq,
					       event,
					       interruptible,
					       urgency,
					       deadline,
					       leeway,
					       self);

	thread_unlock(self);
	wait_queue_unlock(wq);
	splx(spl);

	return(wres);
}

/*
 *	Routine:	_wait_queue_select64_all
 *	Purpose:
 *		Pull every thread waiting for "event" off the wait queue
 *		(and, recursively, off the wait queues of the sets it is
 *		linked to) and append each one to wake_queue.
 *	Conditions:
 *		at splsched
 *		wait queue locked
 *		wake_queue initialized and ready for insertion
 *		possibly recursive
 *	Returns:
 *		a queue of locked threads
 */
static void
_wait_queue_select64_all(
	wait_queue_t wq,
	event64_t event,
	queue_t wake_queue)
{
	wait_queue_element_t cur;
	wait_queue_element_t following;
	unsigned long leftover_mask = 0;
	boolean_t global;
	queue_t head;

	global = wait_queue_global(wq);
	if (global) {
		unsigned long wanted = CAST_TO_EVENT_MASK(event);

		/*
		 * If the queue's accumulated mask lacks any bit of this
		 * event's mask, return without scanning.
		 */
		if ((wq->wq_eventmask & wanted) != wanted)
			return;
	}

	head = &wq->wq_queue;

	for (cur = (wait_queue_element_t) queue_first(head);
	     !queue_end(head, (queue_entry_t)cur);
	     cur = following) {

		WAIT_QUEUE_ELEMENT_CHECK(wq, cur);

		/* fetch the successor first; cur may be dequeued below */
		following = (wait_queue_element_t)
			    queue_next((queue_t) cur);

		if (cur->wqe_type == WAIT_QUEUE_LINK ||
		    cur->wqe_type == WAIT_QUEUE_LINK_NOALLOC) {
			/*
			 * A link: we may have to recurse, since this is
			 * a compound wait queue.
			 */
			wait_queue_link_t wql = (wait_queue_link_t)cur;
			wait_queue_set_t set_queue = wql->wql_setqueue;

			wqs_lock(set_queue);

			/*
			 * If the set is marked as pre-post and this is
			 * the "generic event", record the pre-post now
			 * (if not already recorded).
			 */
			if (event == NO_EVENT64 && set_queue->wqs_prepost &&
			    !wql_is_preposted(wql)) {
				queue_t ppq = &set_queue->wqs_preposts;

				queue_enter(ppq, wql, wait_queue_link_t, wql_preposts);
			}

			if (! wait_queue_empty(&set_queue->wqs_wait_queue)) {
				_wait_queue_select64_all(&set_queue->wqs_wait_queue,
							 event, wake_queue);
			}

			wqs_unlock(set_queue);
		} else {
			/*
			 * Otherwise, it's a thread.  If it is waiting on
			 * the event we are posting to this queue, pull
			 * it off the queue and stick it in our wake_queue.
			 */
			thread_t t = (thread_t)(void *)cur;

			if (t->wait_event != event) {
				/* stays queued: keep its bits in the mask */
				if (global) {
					leftover_mask = leftover_mask |
						CAST_TO_EVENT_MASK(t->wait_event);
				}
			} else {
				thread_lock(t);
				remqueue((queue_entry_t) t);
				enqueue (wake_queue, (queue_entry_t) t);
				t->wait_queue = WAIT_QUEUE_NULL;
				t->wait_event = NO_EVENT64;
				t->at_safe_point = FALSE;
				/* returned locked */
			}
		}
	}

	/* Update event mask if global wait queue */
	if (global)
		wq->wq_eventmask = leftover_mask;

}

/*
 *	Routine:        wait_queue_wakeup64_all_locked
 *	Purpose:
 *		Wake up every thread that is in the specified wait queue
 *		and waiting on the specified event.
 *	Conditions:
 *		wait queue already locked (released here when "unlock"
 *		is set).
 *	Returns:
 *		KERN_SUCCESS - Threads were woken up
 *		KERN_NOT_WAITING - No threads were waiting <wq,event> pair
 */
__private_extern__ kern_return_t
wait_queue_wakeup64_all_locked(
	wait_queue_t wq,
	event64_t event,
	wait_result_t result,
	boolean_t unlock)
{
	queue_head_t selected_head;
	queue_t selected = &selected_head;
	kern_return_t kr = KERN_NOT_WAITING;
	thread_t wakee;

	queue_init(selected);

	/*
	 * Select the threads that we will wake up.  The threads
	 * are returned to us locked and cleanly removed from the
	 * wait queue.
	 */
	_wait_queue_select64_all(wq, event, selected);
	if (unlock)
		wait_queue_unlock(wq);

	/*
	 * For each thread, set it running.
	 */
	for (; !queue_empty (selected); ) {
		wakee = (thread_t)(void *) dequeue(selected);
		kr = thread_go(wakee, result);
		assert(kr == KERN_SUCCESS);
		thread_unlock(wakee);
	}

	return kr;
}


/*
 *	Routine:		wait_queue_wakeup_all
 *	Purpose:
 *		Wake up every thread that is in the specified wait queue
 *		and waiting on the specified event.
 *	Conditions:
 *		Nothing locked
 *	Returns:
 *		KERN_INVALID_ARGUMENT - wq is not a valid wait queue
 *					(a message is printed)
 *		KERN_SUCCESS - Threads were woken up
 *		KERN_NOT_WAITING - No threads were waiting <wq,event> pair
 */
kern_return_t
wait_queue_wakeup_all(
	wait_queue_t wq,
	event_t event,
	wait_result_t result)
{
	kern_return_t kr;
	spl_t spl;

	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning KERN_INVALID_ARGUMENT from wait_queue_wakeup_all\n");
		return KERN_INVALID_ARGUMENT;
	}

	spl = splsched();
	wait_queue_lock(wq);

	/* TRUE: the callee drops the wait queue lock for us */
	kr = wait_queue_wakeup64_all_locked(wq,
					    CAST_DOWN(event64_t,event),
					    result,
					    TRUE);

	splx(spl);
	return kr;
}

/*
 *	Routine:		wait_queue_wakeup64_all
 *	Purpose:
 *		Wake up every thread that is in the specified wait queue
 *		and waiting on the specified (64-bit) event.
 *	Conditions:
 *		Nothing locked
 *	Returns:
 *		KERN_INVALID_ARGUMENT - wq is not a valid wait queue
 *					(a message is printed)
 *		KERN_SUCCESS - Threads were woken up
 *		KERN_NOT_WAITING - No threads were waiting <wq,event> pair
 */
kern_return_t
wait_queue_wakeup64_all(
	wait_queue_t wq,
	event64_t event,
	wait_result_t result)
{
	kern_return_t kr;
	spl_t spl;

	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning KERN_INVALID_ARGUMENT from wait_queue_wakeup64_all\n");
		return KERN_INVALID_ARGUMENT;
	}

	spl = splsched();
	wait_queue_lock(wq);

	/* TRUE: the callee drops the wait queue lock for us */
	kr = wait_queue_wakeup64_all_locked(wq, event, result, TRUE);

	splx(spl);
	return kr;
}

/*
 *	Routine:	_wait_queue_select64_one
 *	Purpose:
 *		Select the best thread off a wait queue that meet the
 *		supplied criteria.
 * 	Conditions:
 *		at splsched
 *		wait queue locked
 *		possibly recursive
 * 	Returns:
 *		a locked thread - if one found
 *	Note:
 *		This is where the sync policy of the wait queue comes
 *		into effect.  For now, we just assume FIFO/LIFO.
 */
static thread_t
_wait_queue_select64_one(
	wait_queue_t wq,
	event64_t event)
{
	wait_queue_element_t wq_element;
	wait_queue_element_t wqe_next;
	thread_t t = THREAD_NULL;
	thread_t fifo_thread = THREAD_NULL;
	boolean_t is_queue_fifo = TRUE;
	boolean_t is_queue_global = FALSE;
	boolean_t thread_imp_donor = FALSE;
	boolean_t realtime = FALSE;
	unsigned long eventmask = 0;
	queue_t q;

	if (wait_queue_global(wq)) {
		eventmask = CAST_TO_EVENT_MASK(event);
		if ((wq->wq_eventmask & eventmask) != eventmask) {
			return THREAD_NULL;
		}
		eventmask = 0;
		is_queue_global = TRUE;
#if IMPORTANCE_INHERITANCE
		is_queue_fifo = FALSE;
#endif /* IMPORTANCE_INHERITANCE */
	}

	q = &wq->wq_queue;

	wq_element = (wait_queue_element_t) queue_first(q);
	while (!queue_end(q, (queue_entry_t)wq_element)) {
		WAIT_QUEUE_ELEMENT_CHECK(wq, wq_element);
		wqe_next = (wait_queue_element_t)
			       queue_next((queue_t) wq_element);

		/*
		 * We may have to recurse if this is a compound wait queue.
		 */
		if (wq_element->wqe_type == WAIT_QUEUE_LINK ||
		    wq_element->wqe_type == WAIT_QUEUE_LINK_NOALLOC) {
			wait_queue_link_t wql = (wait_queue_link_t)wq_element;
			wait_queue_set_t set_queue = wql->wql_setqueue;

			/*
			 * We have to check the set wait queue. If the set
			 * supports pre-posting, it isn't already preposted,
			 * and we didn't find a thread in the set, then mark it.
			 *
			 * If we later find a thread, there may be a spurious
			 * pre-post here on this set.  The wait side has to check
			 * for that either pre- or post-wait.
			 */
			wqs_lock(set_queue);
			if (! wait_queue_empty(&set_queue->wqs_wait_queue)) {
				t = _wait_queue_select64_one(&set_queue->wqs_wait_queue, event);
			}
			if (t != THREAD_NULL) {
				wqs_unlock(set_queue);
				return t;
			}
			if (event == NO_EVENT64 && set_queue->wqs_prepost && !wql_is_preposted(wql)) {
				queue_t ppq = &set_queue->wqs_preposts;
				queue_enter(ppq, wql, wait_queue_link_t, wql_preposts);
			}
			wqs_unlock(set_queue);

		} else {

			/*
			 * Otherwise, its a thread.  If it is waiting on
			 * the event we are posting to this queue, pull
			 * it off the queue and stick it in out wake_queue.
			 */
			t = (thread_t)(void *)wq_element;
			if (t->wait_event == event) {
				if (fifo_thread == THREAD_NULL) {
					fifo_thread = t;
				}
#if IMPORTANCE_INHERITANCE
				/*
				 * Checking the imp donor bit does not need the thread lock
				 * or the task lock, since we hold the wait queue lock and
				 * the thread cannot be removed from the queue without
				 * acquiring that lock.  The imp donor bit may change
				 * once we have read its value, but it is ok to wake
				 * a thread while someone drops an importance assertion
				 * on that thread.
				 */
				thread_imp_donor = task_is_importance_donor(t->task);
#endif /* IMPORTANCE_INHERITANCE */
				realtime = (t->sched_pri >= BASEPRI_REALTIME);
				if (is_queue_fifo || thread_imp_donor || realtime ||
						(t->options & TH_OPT_VMPRIV)) {
					thread_lock(t);
					remqueue((queue_entry_t) t);
					t->wait_queue = WAIT_QUEUE_NULL;
					t->wait_event = NO_EVENT64;
					t->at_safe_point = FALSE;
					return t;	/* still locked */
				}
			}
			if (is_queue_global) {
				eventmask = eventmask | CAST_TO_EVENT_MASK(t->wait_event);
			}
			t = THREAD_NULL;
		}
		wq_element = wqe_next;
	}

	if (is_queue_global) {
		wq->wq_eventmask = eventmask;
	}
#if IMPORTANCE_INHERITANCE
	if (fifo_thread != THREAD_NULL) {
		thread_lock(fifo_thread);
		remqueue((queue_entry_t) fifo_thread);
		fifo_thread->wait_queue = WAIT_QUEUE_NULL;
		fifo_thread->wait_event = NO_EVENT64;
		fifo_thread->at_safe_point = FALSE;
		return fifo_thread;	/* still locked */
	}
#endif /* IMPORTANCE_INHERITANCE */
	return THREAD_NULL;
}


/*
 *	Routine:	wait_queue_pull_thread_locked
 *	Purpose:
 *		Unlink a thread from the wait queue it is blocked on,
 *		reset its wait bookkeeping, and optionally drop the
 *		wait queue lock.
 * 	Conditions:
 *		at splsched
 *		wait queue locked
 *		thread locked
 * 	Returns:
 *		Nothing.  The thread lock is not touched here, so the
 *		thread is still locked on return.  The wait queue is
 *		unlocked only if "unlock" is non-zero.
 */
void
wait_queue_pull_thread_locked(
	wait_queue_t waitq,
	thread_t thread,
	boolean_t unlock)
{
	/* The caller must name the queue the thread is actually waiting on. */
	assert(thread->wait_queue == waitq);

	/* Take the thread off the queue, then mark it as no longer waiting. */
	remqueue((queue_entry_t)thread);
	thread->at_safe_point = FALSE;
	thread->wait_event = NO_EVENT64;
	thread->wait_queue = WAIT_QUEUE_NULL;

	if (unlock) {
		wait_queue_unlock(waitq);
	}
}


/*
 *	Routine:	_wait_queue_select64_thread
 *	Purpose:
 *		Look for a specific thread and remove it from the queues,
 *		if (and only if) the thread is waiting on the supplied
 *		<wait_queue, event> pair.  The wait queue sets linked to
 *		this wait queue are searched as well.
 * 	Conditions:
 *		at splsched
 *		wait queue locked
 *		recursive (descends into linked set queues)
 * 	Returns:
 *		KERN_NOT_WAITING: Thread is not waiting here.
 *		KERN_SUCCESS: It was, and is now removed (returned locked)
 */
static kern_return_t
_wait_queue_select64_thread(
	wait_queue_t wq,
	event64_t event,
	thread_t thread)
{
	queue_t members = &wq->wq_queue;
	wait_queue_element_t elem;
	wait_queue_element_t elem_next;
	kern_return_t kr = KERN_NOT_WAITING;

	/*
	 * Direct hit: the thread is blocked on this very queue for this
	 * event.  Dequeue it and hand it back with its lock still held.
	 */
	thread_lock(thread);
	if ((thread->wait_queue == wq) && (thread->wait_event == event)) {
		remqueue((queue_entry_t) thread);
		thread->at_safe_point = FALSE;
		thread->wait_event = NO_EVENT64;
		thread->wait_queue = WAIT_QUEUE_NULL;
		/* thread still locked */
		return KERN_SUCCESS;
	}
	thread_unlock(thread);

	/*
	 * The wait queue the thread is on may be one of this wait
	 * queue's sets.  Walk the link elements and look inside each
	 * set; removing the thread from there is like removing it
	 * from here.
	 */
	for (elem = (wait_queue_element_t) queue_first(members);
	     !queue_end(members, (queue_entry_t)elem);
	     elem = elem_next) {
		wait_queue_set_t set_queue;

		WAIT_QUEUE_ELEMENT_CHECK(wq, elem);

		/* Fetch the successor before this element is examined. */
		elem_next = (wait_queue_element_t)
			       queue_next((queue_t) elem);

		/* Only link elements lead to a set; skip everything else. */
		if (elem->wqe_type != WAIT_QUEUE_LINK &&
		    elem->wqe_type != WAIT_QUEUE_LINK_NOALLOC) {
			continue;
		}

		set_queue = ((wait_queue_link_t)elem)->wql_setqueue;

		wqs_lock(set_queue);
		if (! wait_queue_empty(&set_queue->wqs_wait_queue)) {
			kr = _wait_queue_select64_thread(&set_queue->wqs_wait_queue,
							event,
							thread);
		}
		wqs_unlock(set_queue);

		if (kr == KERN_SUCCESS) {
			return KERN_SUCCESS;
		}
	}
	return kr;
}


/*
 *	Routine:	wait_queue_wakeup64_identity_locked
 *	Purpose:
 *		Select a single thread that is most-eligible to run and
 *		set it running.  But return the thread locked.
 *
 * 	Conditions:
 *		at splsched
 *		wait queue locked (released before the wakeup if
 *		"unlock" is non-zero)
 *		possibly recursive
 * 	Returns:
 *		a pointer to the locked thread that was awakened, or
 *		a null thread pointer if no thread was selected
 */
__private_extern__ thread_t
wait_queue_wakeup64_identity_locked(
	wait_queue_t wq,
	event64_t event,
	wait_result_t result,
	boolean_t unlock)
{
	thread_t wakee;
	kern_return_t kr;

	assert(wait_queue_held(wq));

	/* A selected thread comes back dequeued and locked. */
	wakee = _wait_queue_select64_one(wq, event);
	if (unlock) {
		wait_queue_unlock(wq);
	}

	if (wakee == THREAD_NULL) {
		return THREAD_NULL;
	}

	kr = thread_go(wakee, result);
	assert(kr == KERN_SUCCESS);

	return wakee;  /* still locked */
}


/*
 *	Routine:	wait_queue_wakeup64_one_locked
 *	Purpose:
 *		Select a single thread that is most-eligible to run and
 *		set it running.
 *
 * 	Conditions:
 *		at splsched
 *		wait queue locked (released before the wakeup if
 *		"unlock" is non-zero)
 *		possibly recursive
 * 	Returns:
 *		KERN_SUCCESS - A thread was selected, removed and awakened
 *		KERN_NOT_WAITING - No thread was waiting <wq,event> pair
 */
__private_extern__ kern_return_t
wait_queue_wakeup64_one_locked(
	wait_queue_t wq,
	event64_t event,
	wait_result_t result,
	boolean_t unlock)
{
	thread_t wakee;
	kern_return_t kr;

	assert(wait_queue_held(wq));

	/* A selected thread comes back dequeued and locked. */
	wakee = _wait_queue_select64_one(wq, event);
	if (unlock) {
		wait_queue_unlock(wq);
	}

	if (wakee == THREAD_NULL) {
		return KERN_NOT_WAITING;
	}

	kr = thread_go(wakee, result);
	assert(kr == KERN_SUCCESS);
	thread_unlock(wakee);
	return kr;
}

/*
 *	Routine:	wait_queue_wakeup_one
 *	Purpose:
 *		Wakeup the most appropriate thread that is in the specified
 *		wait queue for the specified event.  If the selected
 *		thread's scheduled priority is below "priority" (and
 *		"priority" does not exceed MAXPRI), the thread is raised
 *		to that priority and flagged as promoted before it is
 *		set running.
 *	Conditions:
 *		Nothing locked
 *	Returns:
 *		KERN_SUCCESS - Thread was woken up
 *		KERN_NOT_WAITING - No thread was waiting <wq,event> pair
 *		KERN_INVALID_ARGUMENT - wq is not a valid wait queue
 */
kern_return_t
wait_queue_wakeup_one(
	wait_queue_t wq,
	event_t event,
	wait_result_t result,
	int priority)
{
	thread_t wakee;
	kern_return_t kr;
	spl_t spl;

	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning KERN_INVALID_ARGUMENT from wait_queue_wakeup_one\n");
		return KERN_INVALID_ARGUMENT;
	}

	spl = splsched();
	wait_queue_lock(wq);
	/* A selected thread comes back dequeued and locked. */
	wakee = _wait_queue_select64_one(wq, CAST_DOWN(event64_t,event));
	wait_queue_unlock(wq);

	if (wakee == THREAD_NULL) {
		splx(spl);
		return KERN_NOT_WAITING;
	}

	/* Promote the thread on wakeup when the caller asked for more. */
	if (wakee->sched_pri < priority && priority <= MAXPRI) {
		set_sched_pri(wakee, priority);

		wakee->was_promoted_on_wakeup = 1;
		wakee->sched_flags |= TH_SFLAG_PROMOTED;
	}

	kr = thread_go(wakee, result);
	assert(kr == KERN_SUCCESS);
	thread_unlock(wakee);
	splx(spl);
	return kr;
}

/*
 *	Routine:	wait_queue_wakeup64_one
 *	Purpose:
 *		Wakeup the most appropriate thread that is in the specified
 *		wait queue for the specified (64-bit) event.
 *	Conditions:
 *		Nothing locked
 *	Returns:
 *		KERN_SUCCESS - Thread was woken up
 *		KERN_NOT_WAITING - No thread was waiting <wq,event> pair
 *		KERN_INVALID_ARGUMENT - wq is not a valid wait queue
 */
kern_return_t
wait_queue_wakeup64_one(
	wait_queue_t wq,
	event64_t event,
	wait_result_t result)
{
	thread_t wakee;
	kern_return_t kr;
	spl_t spl;

	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning KERN_INVALID_ARGUMENT from wait_queue_wakeup64_one\n");
		return KERN_INVALID_ARGUMENT;
	}

	spl = splsched();
	wait_queue_lock(wq);
	/* A selected thread comes back dequeued and locked. */
	wakee = _wait_queue_select64_one(wq, event);
	wait_queue_unlock(wq);

	if (wakee == THREAD_NULL) {
		splx(spl);
		return KERN_NOT_WAITING;
	}

	kr = thread_go(wakee, result);
	assert(kr == KERN_SUCCESS);
	thread_unlock(wakee);
	splx(spl);
	return kr;
}


/*
 *	Routine:	wait_queue_wakeup64_thread_locked
 *	Purpose:
 *		Wakeup the particular thread that was specified if and only
 *		if it was in this wait queue (or one of its set queues)
 *		and waiting on the specified event.
 *
 *		This is much safer than just removing the thread from
 *		whatever wait queue it happens to be on.  For instance, it
 *		may have already been awoken from the wait you intended to
 *		interrupt and waited on something else (like another
 *		semaphore).
 *	Conditions:
 *		at splsched
 *		wait queue already locked (released here if "unlock"
 *		is non-zero).
 *	Returns:
 *		KERN_SUCCESS - the thread was found waiting and awakened
 *		KERN_NOT_WAITING - the thread was not waiting here
 */
__private_extern__ kern_return_t
wait_queue_wakeup64_thread_locked(
	wait_queue_t wq,
	event64_t event,
	thread_t thread,
	wait_result_t result,
	boolean_t unlock)
{
	kern_return_t found;
	kern_return_t kr;

	assert(wait_queue_held(wq));

	/*
	 * Check whether the thread is still waiting there.  On success
	 * it has been dequeued and is returned to us locked.
	 */
	found = _wait_queue_select64_thread(wq, event, thread);
	if (unlock) {
		wait_queue_unlock(wq);
	}

	if (found == KERN_SUCCESS) {
		kr = thread_go(thread, result);
		assert(kr == KERN_SUCCESS);
		thread_unlock(thread);
		return kr;
	}

	return KERN_NOT_WAITING;
}

/*
 *	Routine:	wait_queue_wakeup_thread
 *	Purpose:
 *		Wakeup the particular thread that was specified if and only
 *		if it was in this wait queue (or one of its set queues)
 *		and waiting on the specified event.
 *
 *		This is much safer than just removing the thread from
 *		whatever wait queue it happens to be on.  For instance, it
 *		may have already been awoken from the wait you intended to
 *		interrupt and waited on something else (like another
 *		semaphore).
 *	Conditions:
 *		nothing of interest locked
 *		we need to assume spl needs to be raised
 *	Returns:
 *		KERN_SUCCESS - the thread was found waiting and awakened
 *		KERN_NOT_WAITING - the thread was not waiting here
 *		KERN_INVALID_ARGUMENT - wq is not a valid wait queue
 */
kern_return_t
wait_queue_wakeup_thread(
	wait_queue_t wq,
	event_t event,
	thread_t thread,
	wait_result_t result)
{
	kern_return_t kr;
	spl_t spl;

	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning KERN_INVALID_ARGUMENT from wait_queue_wakeup_thread\n");
		return KERN_INVALID_ARGUMENT;
	}

	spl = splsched();
	wait_queue_lock(wq);
	/* On success the thread has been dequeued and is locked. */
	kr = _wait_queue_select64_thread(wq, CAST_DOWN(event64_t,event), thread);
	wait_queue_unlock(wq);

	if (kr != KERN_SUCCESS) {
		splx(spl);
		return KERN_NOT_WAITING;
	}

	kr = thread_go(thread, result);
	assert(kr == KERN_SUCCESS);
	thread_unlock(thread);
	splx(spl);
	return kr;
}

/*
 *	Routine:	wait_queue_wakeup64_thread
 *	Purpose:
 *		Wakeup the particular thread that was specified if and only
 *		if it was in this wait queue (or one of its sets' queues)
 *		and waiting on the specified (64-bit) event.
 *
 *		This is much safer than just removing the thread from
 *		whatever wait queue it happens to be on.  For instance, it
 *		may have already been awoken from the wait you intended to
 *		interrupt and waited on something else (like another
 *		semaphore).
 *	Conditions:
 *		nothing of interest locked
 *		we need to assume spl needs to be raised
 *	Returns:
 *		KERN_SUCCESS - the thread was found waiting and awakened
 *		KERN_NOT_WAITING - the thread was not waiting here
 *		KERN_INVALID_ARGUMENT - wq is not a valid wait queue
 */
kern_return_t
wait_queue_wakeup64_thread(
	wait_queue_t wq,
	event64_t event,
	thread_t thread,
	wait_result_t result)
{
	kern_return_t kr;
	spl_t spl;

	if (!wait_queue_is_valid(wq)) {
		printf("\nReturning KERN_INVALID_ARGUMENT from wait_queue_wakeup64_thread\n");
		return KERN_INVALID_ARGUMENT;
	}

	spl = splsched();
	wait_queue_lock(wq);
	/* On success the thread has been dequeued and is locked. */
	kr = _wait_queue_select64_thread(wq, event, thread);
	wait_queue_unlock(wq);

	if (kr != KERN_SUCCESS) {
		splx(spl);
		return KERN_NOT_WAITING;
	}

	kr = thread_go(thread, result);
	assert(kr == KERN_SUCCESS);
	thread_unlock(thread);
	splx(spl);
	return kr;
}