/*
 * NFS client-side locking for NetBSD (kernel part).
 * Edgar Fuß, Mathematisches Institut der Universität Bonn, July 2006.
 * Do with this whatever you like, as long as you are either me or both
 * - acknowledge that I wrote it for NetBSD in the first place, and
 * - don't blame me if it doesn't do what you like or expect.
 * This program does exactly what it does. If that's not what you expect it
 * or would like it to do, don't complain with me, the NetBSD Foundation,
 * my employer's brother-in-law or anybody else, but rewrite it to your taste.
 */

/*
 * Advisory-lock vnode operation for NFS, defined at the end of this file.
 * The argument is an opaque pointer that the implementation interprets as
 * a struct vop_advlock_args.
 */
int nfs_advlock(void *v);

#include <sys/param.h>
#include <sys/device.h>
#include <sys/errno.h>
#include <sys/null.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/select.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/statvfs.h>
#include <sys/fstypes.h>
#include <sys/vnode.h>
#include <sys/mbuf.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_lock.h>

#define NFSLOCK_DEBUG 1

/*
 * We use a pseudo-device to talk with the userland lockd.
 * Everything is the wrong way around (userland serves the kernel,
 * sending requests means having them read, receiving replies means
 * having them written), but this is much easier than setting up
 * a socket.
 */

/* Attach routine of the pseudo-device; defined below. */
void nfslockattach(struct device *parent, struct device *self, void *aux);
/* Per-device state: nothing beyond the generic device structure. */
struct nfslock_softc {
        struct device sc_dev;
};

/* Character device entry points implemented in this file. */
int nfslockopen(dev_t, int, int, struct proc *);
int nfslockclose(dev_t, int, int, struct proc *);
int nfslockread(dev_t, struct uio *, int);
int nfslockwrite(dev_t, struct uio *, int);
int nfslockpoll(dev_t, int, struct proc *);
/*
 * Character device switch for the lockd pseudo-device.
 * Only open, close, read, write and poll are implemented here; every
 * other slot is filled with one of the kernel's "no*" placeholders.
 * The initializer is positional, so the order of the entries matters.
 */
const struct cdevsw nfslock_cdevsw = {
        nfslockopen,
	nfslockclose,
	nfslockread,
	nfslockwrite,
        noioctl,
	nostop,
	notty,
	nfslockpoll,
	nommap,
	nokqfilter,
	0
};

/* nonzero while the device is open; set in nfslockopen(), cleared in nfslockclose() */
int nfslock_devisopen = 0;
/* set by nfslockopen() from the open flags; makes nfslockread() fail with EWOULDBLOCK instead of sleeping */
int nfslock_devnoblock;

/*
 * We manage two queues for talking with lockd.
 * Requests to be sent to lockd are queued on _sending.
 * The read() call removes them so the requesting process can tell
 * they have been accepted by lockd.
 * For replies, an empty reply just containing the right serial number
 * is queued on _receiving (so the write() call can discard replies from
 * lockd that no one is waiting for).
 * The write() call fills in valid replies and removes them from the
 * queue.
 * There is one special case: When a process is killed after the request
 * has already been read by lockd (no matter whether answered or not), we
 * need to cancel the request (in case it's a lock), but we can't wait for
 * lockd reading the cancel request, let alone answering it. So there is a
 * "oneway" flag in the request list entries telling read() to free() the
 * request after removing it from the _sending list.
 * The fact that we are using a pseudo-device and these queues to talk
 * to lockd is hidden within the _reqrep() routine.
 */
/* Queue of requests waiting to be read by lockd. */
TAILQ_HEAD(nfslock_req_head, nfslock_req_ent);
struct nfslock_req_ent{
	TAILQ_ENTRY(nfslock_req_ent) link; /* linkage on nfslock_sending */
	int oneway; /* free() after request has been read */
	struct nfslock_req ent; /* the part that is copied out to lockd by read() */
};
struct nfslock_req_head nfslock_sending = TAILQ_HEAD_INITIALIZER(nfslock_sending);

/* Queue of reply slots waiting to be filled in by lockd. */
TAILQ_HEAD(nfslock_rep_head, nfslock_rep_ent);
struct nfslock_rep_ent{
	TAILQ_ENTRY(nfslock_rep_ent) link; /* linkage on nfslock_receiving */
	struct nfslock_rep ent; /* the part that is copied in from lockd by write() */
};
struct nfslock_rep_head nfslock_receiving = TAILQ_HEAD_INITIALIZER(nfslock_receiving);

/* select/poll bookkeeping: recorded in nfslockpoll(), notified when a request is queued */
struct selinfo nfslock_sel;

int nfslock_serial = 0; /* for matching replies to requests */

/* lock for accessing private global data */
struct lock nfslock_lock;
/* these functions internally lock the private global data, not files */
/* NOTE: the lockmgr() return value is deliberately discarded by all three */
#define nfslock_rdlock() (void)lockmgr(&nfslock_lock, LK_SHARED, NULL)
#define nfslock_wrlock() (void)lockmgr(&nfslock_lock, LK_EXCLUSIVE, NULL)
#define nfslock_unlock() (void)lockmgr(&nfslock_lock, LK_RELEASE, NULL)

/*
 * Function entry/exit helpers.
 *
 * ENTERPROC/ENTERCUR must be the last "declaration" in a function: they
 * declare the local error variable (and, when debugging, the function
 * name and pid used by the trace output) and log function entry.
 * SETERR stores an error code, ONERR is an "if an error is set" prefix
 * for the following statement, and the RET* macros return from the
 * function, logging the returned value when NFSLOCK_DEBUG is enabled.
 *
 * The returning macros are wrapped in do { } while (0) so they behave as
 * a single statement after an unbraced if, and RETERR evaluates its
 * argument exactly once.
 */
#if NFSLOCK_DEBUG
#define ENTERPROC(name, proc) \
	int _err = 0; \
	const char *_nam = (name); \
	int _pid = (proc)->p_pid; \
	printf("%s(%d)\n", _nam, _pid); \
	(void)_err
#define ENTERCUR(name) \
	int _err = 0; \
	const char *_nam = (name); \
	int _pid = curproc->p_pid; \
	printf("%s(%d)\n", _nam, _pid); \
	(void)_err
#define SETERR(x) (_err = (x))
#define ONERR if (_err)
#define RETONERR \
	do { \
		if (_err) { \
			printf("%s(%d): %d\n", _nam, _pid, _err); \
			return _err; \
		} \
	} while (0)
#define RETERR(x) \
	do { \
		int _ret = (x); \
		printf("%s(%d): %d\n", _nam, _pid, _ret); \
		return _ret; \
	} while (0)
#define RET \
	do { \
		printf("%s(%d): %d\n", _nam, _pid, _err); \
		return _err; \
	} while (0)
#define RET0 \
	do { \
		printf("%s(%d): 0\n", _nam, _pid); \
		return 0; \
	} while (0)
#else
#define ENTERPROC(name, proc) int _err = 0; (void)_err
#define ENTERCUR(name) int _err = 0; (void)_err
#define SETERR(x) (_err = (x))
#define ONERR if (_err)
#define RETONERR do { if (_err) return _err; } while (0)
#define RETERR(x) return (x)
#define RET return _err
#define RET0 return 0
#endif

/* malloc type used for every request and reply allocated in this file */
MALLOC_DEFINE(M_NFSLOCK, "nfslock", "NFS advisory locks");

/*
 * nfs_sigintr() is declared by hand; the disabled branch shows the
 * prototype with its real second parameter type.
 */
#if 0
extern int nfs_sigintr(struct nfsmount *, struct nfsreq *, struct proc *);
#else
/* don't pull in nfs/nfs_var.h just for struct nfsreq we don't use anyway */
extern int nfs_sigintr(struct nfsmount *, void *, struct proc *);
#endif

/*
 * Sleep and possibly timeout/get interrupted like NFS code.
 */
static int nfslock_sleep(struct nfsmount *nmp, const void *ident, const char *wmesg, int wait)
{
	int timeout = 0, priority = PSOCK;
	int ret;
	
	if (wait < 0) {
		priority |= PCATCH;
		timeout = -wait;
	} else {
		if (nmp->nm_flag & NFSMNT_INT)
			priority |= PCATCH;
		if (nmp->nm_flag & NFSMNT_SOFT && !wait)
			timeout = (nmp->nm_retry + 2) * nmp->nm_timeo;
	}
#if NFSLOCK_DEBUG
	printf("nfslock_sleep(%s, %d%s)\n", wmesg, timeout,
		priority & PCATCH ? ", catch": "");
#endif
	ret = tsleep(ident, priority, wmesg, timeout);
#if NFSLOCK_DEBUG
	printf("nfslock_sleep: ");
#endif
	/* The NFS code returns EINTR for timeouts */
	if (ret == EWOULDBLOCK) {
#if NFSLOCK_DEBUG
		printf("EWOULDBLOCK -> EINTR\n");
#endif
		return EINTR;
	} else if (ret == ERESTART || ret == EINTR) {
#if NFSLOCK_DEBUG
		printf("%s -> ", ret == ERESTART ? "ERESTART" : "EINTR");
#endif
		ret = nfs_sigintr(nmp, NULL, curproc);
	}
#if NFSLOCK_DEBUG
	if (ret == EINTR)
		printf("EINTR\n");
	else
		printf("%d\n", ret);
#endif
	return ret;
}

/*
 * Attach routine: initializes the lock protecting this file's private
 * global data. None of the arguments are used.
 */
void
nfslockattach(struct device *parent, struct device *self, void *aux)
{
	(void)parent;
	(void)self;
	(void)aux;

#if NFSLOCK_DEBUG
	printf("nfslockattach\n");
#endif
	lockinit(&nfslock_lock, PZERO, "nfslock", 0, 0);
}

/*
 * Open the lockd device. Only one opener is allowed at a time.
 *
 * Returns 0 on success or EBUSY if the device is already open. A failed
 * open leaves the state of the current opener (open flag and blocking
 * mode) untouched.
 */
int nfslockopen(dev_t dev, int fmt, int flags, struct proc *p)
{
	ENTERPROC("nfslockopen", p);
	nfslock_wrlock();
	if (nfslock_devisopen) {
		SETERR(EBUSY);
	} else {
		nfslock_devisopen = 1;
		/* bitwise test: only the O_NONBLOCK bit selects non-blocking reads */
		nfslock_devnoblock = (flags & O_NONBLOCK) != 0;
	}
	nfslock_unlock();
	RET;
}

/*
 * Close the lockd device: marks it as no longer open. Always returns 0.
 * Requests still sitting on the queues are left alone.
 */
int
nfslockclose(dev_t dev, int fmt, int flags, struct proc *p)
{
	ENTERPROC("nfslockclose", p);

	(void)dev;
	(void)fmt;
	(void)flags;

	/* XXX warn if any outstanding requests */
	nfslock_wrlock();
	nfslock_devisopen = 0;
	nfslock_unlock();

	RET0;
}

/*
 * read() entry point: hand the oldest queued request to lockd.
 *
 * The caller must read exactly sizeof(struct nfslock_req) bytes, else EIO
 * is returned. If no request is queued, returns EWOULDBLOCK in
 * non-blocking mode, otherwise sleeps (interruptibly) until one arrives.
 *
 * The request is copied out while it is still on the queue and the lock
 * is held; it is unlinked only after the copy succeeded. A failed copy
 * therefore neither loses the request nor leaks a one-way entry, and the
 * requesting process cannot free the entry while it is being copied.
 * One-way entries are freed here; all others stay owned by the requester,
 * which is woken up to notice that its request left the queue.
 */
int nfslockread(dev_t dev, struct uio *uio, int flags)
{
	struct nfslock_req_ent *p;
	int oneway;

	ENTERCUR("nfslockread");
	/* XXX can this happen? */
	if (!nfslock_devisopen) RETERR(EIO);
	/* XXX tapes return EIO on wrong blocksize */
	if (uio->uio_resid != sizeof(struct nfslock_req)) RETERR(EIO);
	nfslock_wrlock();
	while ((p = TAILQ_FIRST(&nfslock_sending)) == NULL) {
		nfslock_unlock();
		if (nfslock_devnoblock) RETERR(EWOULDBLOCK);
#if NFSLOCK_DEBUG
		printf("nfslockread: sleeping\n");
#endif
		SETERR(tsleep(&nfslock_sending, PUSER|PCATCH, "nfslockread", 0));
#if NFSLOCK_DEBUG
		printf("nfslockread: woken up\n");
#endif
		RETONERR;
		nfslock_wrlock();
	}
#if NFSLOCK_DEBUG
	printf("nfslockread: uiomove\n");
#endif
	/* copy out first; the entry stays queued (and locked) until that worked */
	SETERR(uiomove(&p->ent, sizeof(struct nfslock_req), uio));
	ONERR {
		nfslock_unlock();
		RET;
	}
	TAILQ_REMOVE(&nfslock_sending, p, link);
	/* sample the flag under the lock; a normal entry is not ours afterwards */
	oneway = p->oneway;
	nfslock_unlock();
	if (oneway) free(p, M_NFSLOCK);
#if NFSLOCK_DEBUG
	printf("nfslockread: wakeup\n");
#endif
	wakeup(&nfslock_sending);
	RET0;
}

/*
 * write() entry point: accept one reply from lockd.
 *
 * The caller must write exactly sizeof(struct nfslock_rep) bytes, else
 * EIO is returned. The reply is matched by serial number against the
 * slots on the receiving queue. A matching slot is filled in and unlinked
 * while the lock is held, so the waiting process cannot free it under us,
 * and the waiter is then woken up. Replies without a matching slot are
 * silently dropped. The temporary copy of the reply is freed on every
 * path.
 *
 * Returns 0 on success (matched or not) or the uiomove() error.
 */
int nfslockwrite(dev_t dev, struct uio *uio, int flags)
{
	struct nfslock_rep_ent *p;
	struct nfslock_rep *q;

	ENTERCUR("nfslockwrite");
	/* XXX does the kernel let this happen? */
	if (!nfslock_devisopen) RETERR(EIO);
	/* XXX tapes return EIO on wrong blocksize, don't they? */
	if (uio->uio_resid != sizeof(struct nfslock_rep)) RETERR(EIO);
	q = malloc(sizeof(struct nfslock_rep), M_NFSLOCK, M_WAITOK);
	SETERR(uiomove(q, sizeof(struct nfslock_rep), uio));
	ONERR goto out;
	nfslock_wrlock();
	for (p = TAILQ_FIRST(&nfslock_receiving); p != NULL; p = TAILQ_NEXT(p, link)) {
		if (p->ent.serial == q->serial) break;
	}
	if (p != NULL) {
		/* fill in the slot before anyone can see it off the queue */
		memcpy(&p->ent, q, sizeof(struct nfslock_rep));
		TAILQ_REMOVE(&nfslock_receiving, p, link);
	}
	nfslock_unlock();
	if (p != NULL) {
#if NFSLOCK_DEBUG
		printf("nfslockwrite: reply #%d\n", q->serial);
#endif
		wakeup(&nfslock_receiving);
	} else {
#if NFSLOCK_DEBUG
		printf("nfslockwrite: spurious reply #%d\n", q->serial);
#endif
	}
out:
	free(q, M_NFSLOCK);
	RET;
}

/*
 * poll() entry point.
 *
 * Unlike most entry points this returns the set of ready events, not an
 * error code. Only readability is supported: POLLIN/POLLRDNORM are
 * reported when at least one request is queued for lockd. If nothing is
 * ready and the caller asked for readability, the caller is recorded so
 * that it gets notified when a request is queued. Write readiness and
 * hang-up are neither reported nor recorded.
 */
int nfslockpoll(dev_t dev, int events, struct proc *p)
{
	const int rdmask = POLLIN | POLLRDNORM;
	int ready = 0;

#if NFSLOCK_DEBUG
	printf("nfslockpoll(%d, %d)\n", p->p_pid, events);
#endif
	if ((events & rdmask) != 0) {
		nfslock_wrlock();
		if (TAILQ_FIRST(&nfslock_sending) != NULL)
			ready = events & rdmask;
		nfslock_unlock();
	}

	if (ready == 0 && (events & rdmask) != 0) {
#if NFSLOCK_DEBUG
		printf("nfslockpoll: selrecord\n");
#endif
		selrecord(p, &nfslock_sel);
	}

#if NFSLOCK_DEBUG
	printf("nfslockpoll: %d\n", ready);
#endif
	return (ready);
}

/*
 * send a request to lockd and read the reply.
 * on error, both req and rep will bee freed (here or later in _read).
 */
static int nfslock_reqrep(struct nfsmount *nmp, struct nfslock_req_ent *req, struct nfslock_rep_ent *rep)
{
	struct nfslock_req_ent *q;
	struct nfslock_rep_ent *p;
	
	ENTERCUR("nfslock_reqrep");
	req->oneway = 0;
	req->ent.magic = NFSLOCK_MAGIC;
	rep->ent.magic = 0;
	nfslock_wrlock();
	req->ent.serial = rep->ent.serial = ++nfslock_serial;
	/* put reply on receiving queue */
	TAILQ_INSERT_TAIL(&nfslock_receiving, rep, link);
	/* put request on sending queue */
	TAILQ_INSERT_TAIL(&nfslock_sending, req, link);
	nfslock_unlock();
#if 0
#if NFSLOCK_DEBUG
	printf("nfslock_reqrep: selnotify(receive)\n");
#endif
	selnotify( );
#endif
#if NFSLOCK_DEBUG
	printf("nfslock_reqrep: wakeup(receive)\n");
#endif
	wakeup(&nfslock_receiving);
#if NFSLOCK_DEBUG
	printf("nfslock_reqrep: selnotify(send)\n");
#endif
	selnotify(&nfslock_sel, 0);
#if NFSLOCK_DEBUG
	printf("nfslock_reqrep: wakeup(send)\n");
#endif
	wakeup(&nfslock_sending);
	/* wait for lockd to pick up the request */
	for (;;) {
		/* sleep on lockd reading the request; don't wait forever in case lockd is dead */
		SETERR(nfslock_sleep(nmp, &nfslock_sending, "nfslockq", -5 * hz));
		ONERR goto error;
		/* check if this request has been sent */
		nfslock_wrlock();
		for (q = TAILQ_FIRST(&nfslock_sending); q != NULL; q = TAILQ_NEXT(q, link)) {
			if (q == req) break /* still on queue */;
		}
		nfslock_unlock();
		if (q == NULL) break /* not on queue */;
#if NFSLOCK_DEBUG
		printf("nfslock_reqrep: request still on sending queue\n");
#endif
	}
	/* wait for lockd to answer */
	for (;;) {
		/* sleep on lockd sending a reply; wait forever for blocking locks on hard mounts */
		SETERR(nfslock_sleep(nmp, &nfslock_receiving, "nfslockp", req->ent.wait));
		ONERR goto error;
		/* check if this reply has been received */
		nfslock_wrlock();
		for (p = TAILQ_FIRST(&nfslock_receiving); p != NULL; p = TAILQ_NEXT(p, link)) {
			if (p == rep) break /* still on queue */;
		}
		nfslock_unlock();
		if (p == NULL) break /* not on queue */;
#if NFSLOCK_DEBUG
		printf("nfslock_reqrep: reply still on receiving queue\n");
#endif
	}
#if NFSLOCK_DEBUG
	printf("nfslock_reqrep: reply has been received\n");
#endif
	if (rep->ent.magic != NFSLOCK_MAGIC + 1) {
		SETERR(EBADRPC);
#if NFSLOCK_DEBUG
		printf("nfslock_reqrep: wrong magic: %d should be %d\n", rep->ent.magic, NFSLOCK_MAGIC + 1);
#endif
	}
	RET;
error:	/* remove reply from receiving queue first */
	nfslock_wrlock();
	for (p = TAILQ_FIRST(&nfslock_receiving); p != NULL; p = TAILQ_NEXT(p, link)) {
		if (p == rep) {
			TAILQ_REMOVE(&nfslock_receiving, rep, link);
#if NFSLOCK_DEBUG
			printf("nfslock_reqrep: rep removed from receiving queue\n");
#endif
		}
	}
	/* still locked */
	/* XXX unlock/lock around the free()? */
	free(rep, M_NFSLOCK); /* to be consistent with req */
	/* look if request still on sending queue, done if still there */
	for (q = TAILQ_FIRST(&nfslock_sending); q != NULL; q = TAILQ_NEXT(q, link)) {
		if (q == req) {
			TAILQ_REMOVE(&nfslock_sending, req, link);
#if NFSLOCK_DEBUG
			printf("nfslock_reqrep: req removed from sending queue\n");
#endif
			/* no need to cancel, just free */
			nfslock_unlock();
			free(req, M_NFSLOCK);
			RET;
		}
	}
	/* still locked */
	if (req->ent.op == 2 /* LOCK */) {
		/* must send a cancel */
#if NFSLOCK_DEBUG
		printf("nfslock_reqrep: putting cancel on sending queue\n");
#endif
		req->ent.op = 3;
		req->oneway = 1; /* _read() will free */
		TAILQ_INSERT_TAIL(&nfslock_sending, req, link);
	} else {
#if NFSLOCK_DEBUG
		printf("nfslock_reqrep: op was %d, not canceling\n", req->ent.op);
#endif
		free(req, M_NFSLOCK);
	}
	/* XXX unlock before the free()? */
	nfslock_unlock();
	RET;
}

int nfs_advlock(void *v)
{
	struct vop_advlock_args /* {
		struct vnode *a_vp;
		caddr_t  a_id;
		int  a_op;
		struct flock *a_fl;
		int  a_flags;
	} */ *ap = v;
	struct nfsnode *np = VTONFS(ap->a_vp);
	struct nfsmount *nmp = VFSTONFS(ap->a_vp->v_mount);
	struct nfslock_req_ent *req;
	struct nfslock_rep_ent *rep;
	struct sockaddr_storage *nam;
	int namlen;
	
	ENTERCUR("nfs_advlock");
	/* allocate reqest/reply */
	/* M_ZERO so we don't expose garbage inside padding to userland */
	req = malloc(sizeof(struct nfslock_req_ent), M_NFSLOCK, M_WAITOK|M_ZERO);
	rep = malloc(sizeof(struct nfslock_rep_ent), M_NFSLOCK, M_WAITOK);
	/* operation */
	switch (ap->a_op) {
		case F_SETLK: req->ent.op = 2; break;
		case F_GETLK: req->ent.op = 1; break;
		case F_UNLCK: req->ent.op = 4; break;
		default: SETERR(EINVAL);
	}
	ONERR goto out;
	/* flags */
	switch (ap->a_flags & (F_POSIX | F_FLOCK)) {
		case F_POSIX: req->ent.posix = 1; break;
		case F_FLOCK: req->ent.posix = 0; break;
		default: SETERR(EINVAL);
	}
	ONERR goto out;
	req->ent.wait = ap->a_flags & F_WAIT ? 1 : 0;
	/* type */
	switch (ap->a_fl->l_type) {
		case F_RDLCK: req->ent.exclusive = 0; break;
		case F_WRLCK: req->ent.exclusive = 1; break;
		case F_UNLCK: 
			if (ap->a_op == F_UNLCK) {
				/* Hm. */
				req->ent.exclusive = 0; break;
			}
			/* else fall trough */
		default: SETERR(EINVAL);
	}
	ONERR goto out;
	/* lock range */
	switch (ap->a_fl->l_whence) {
		case SEEK_CUR:
			/* XXX: sys_fcntl() silently adjusted l_start */
		case SEEK_SET:
			req->ent.offset = ap->a_fl->l_start;
#if 0
			if (ap->a_fl->l_len == 0) /* lock whole file */
				req->ent.len = ap->a_vp->v_size - ap->a_fl->l_start;
			else
#endif
				req->ent.len = ap->a_fl->l_len;
			break;
		case SEEK_END:
			if (-ap->a_fl->l_start > ap->a_vp->v_size)
				SETERR(EINVAL);
			else
				req->ent.offset = ap->a_fl->l_start + ap->a_vp->v_size;
#if 0
			if (ap->a_fl->l_len == 0) /* lock whole file */
				req->ent.len = -ap->a_fl->l_start;
			else
#endif
				req->ent.len = ap->a_fl->l_len;
			break;
		default: SETERR(EINVAL);
	}
	req->ent.len = ap->a_fl->l_len;
	if (req->ent.offset < 0 || req->ent.len < 0) SETERR(EINVAL);
	ONERR goto out;
	req->ent.owner = curproc->p_pid;
	req->ent.handle_len = np->n_fhsize;
	memcpy(&req->ent.handle, np->n_fhp, np->n_fhsize);
	req->ent.mount_version = nmp->nm_flag & NFSMNT_NFSV3 ? 3 : 2;
	req->ent.mount_soft = nmp->nm_flag & NFSMNT_SOFT ? 1 : 0;
	req->ent.mount_retry = nmp->nm_retry;
	req->ent.mount_timeo = nmp->nm_timeo;
	nam = mtod(nmp->nm_nam, struct sockaddr_storage *);
	namlen = nam->ss_len;
	if (namlen > sizeof(struct sockaddr)) SETERR(EINVAL);
	ONERR goto out;
	memcpy(&req->ent.mount_nam, nam, namlen);
	/* talk to lockd */
	SETERR(nfslock_reqrep(nmp, req, rep));
	/* UNBELIEVABLE! We have a reply from lockd! */
	ONERR goto out;
	if (rep->ent.errnum) {
		SETERR(rep->ent.errnum);
	} else {
		if (req->ent.op == 1 /* TEST */ && req->ent.posix /* XXX does the kernel issue non-posix tests? */) {
			if (rep->ent.conflict) {
				ap->a_fl->l_start = rep->ent.offset;
				ap->a_fl->l_len = rep->ent.len;
				ap->a_fl->l_pid = rep->ent.owner;
				ap->a_fl->l_type = rep->ent.conflict > 1 ?
					F_WRLCK : F_RDLCK;
				ap->a_fl->l_whence = SEEK_SET;
			} else {
				ap->a_fl->l_type = F_UNLCK;
			}
		}
	}
out:	free(req, M_NFSLOCK);
	free(rep, M_NFSLOCK);
	/* Is this ever going to come true? */
	RET;
}
