/* $NetBSD: linux_futex.c,v 1.37 2017/04/10 15:04:32 dholland Exp $ */ /*- * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.37 2017/04/10 15:04:32 dholland Exp $"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct futex; struct waiting_proc { struct futex *wp_futex; kcondvar_t wp_futex_cv; TAILQ_ENTRY(waiting_proc) wp_list; bool wp_onlist; }; struct futex { void *f_uaddr; int f_refcount; uint32_t f_bitset; LIST_ENTRY(futex) f_list; TAILQ_HEAD(, waiting_proc) f_waiting_proc; }; static LIST_HEAD(futex_list, futex) futex_list; static kmutex_t futex_lock; #define FUTEX_LOCK mutex_enter(&futex_lock) #define FUTEX_UNLOCK mutex_exit(&futex_lock) #define FUTEX_LOCKASSERT KASSERT(mutex_owned(&futex_lock)) #define FUTEX_SYSTEM_LOCK KERNEL_LOCK(1, NULL) #define FUTEX_SYSTEM_UNLOCK KERNEL_UNLOCK_ONE(0) #ifdef DEBUG_LINUX_FUTEX int debug_futex = 1; #define FUTEXPRINTF(a) do { if (debug_futex) printf a; } while (0) #else #define FUTEXPRINTF(a) #endif void linux_futex_init(void) { FUTEXPRINTF(("%s: initializing futex\n", __func__)); mutex_init(&futex_lock, MUTEX_DEFAULT, IPL_NONE); } void linux_futex_fini(void) { FUTEXPRINTF(("%s: destroying futex\n", __func__)); mutex_destroy(&futex_lock); } static struct waiting_proc *futex_wp_alloc(void); static void futex_wp_free(struct waiting_proc *); static struct futex *futex_get(void *, uint32_t); static void futex_ref(struct futex *); static void futex_put(struct futex *); static int futex_sleep(struct futex **, lwp_t *, int, struct waiting_proc *); static int futex_wake(struct futex *, int, struct futex *, int); static int futex_atomic_op(lwp_t *, int, void *); int linux_sys_futex(struct lwp *l, const struct linux_sys_futex_args *uap, register_t *retval) { /* { syscallarg(int *) uaddr; syscallarg(int) op; syscallarg(int) val; syscallarg(const struct linux_timespec *) timeout; syscallarg(int *) uaddr2; syscallarg(int) val3; } */ struct linux_timespec lts; struct timespec ts = { 0, 0 }; int error; if ((SCARG(uap, op) & LINUX_FUTEX_CMD_MASK) == LINUX_FUTEX_WAIT && SCARG(uap, timeout) != NULL) { if ((error = copyin(SCARG(uap, timeout), <s, sizeof(lts))) != 0) { return error; } linux_to_native_timespec(&ts, <s); } return linux_do_futex(l, uap, &ts, retval); } /* * Note: TS can't be const because ts2timo destroys it. */ int linux_do_futex(struct lwp *l, const struct linux_sys_futex_args *uap, struct timespec *ts, register_t *retval) { /* { syscallarg(int *) uaddr; syscallarg(int) op; syscallarg(int) val; syscallarg(const struct linux_timespec *) timeout; syscallarg(int *) uaddr2; syscallarg(int) val3; } */ int val, val3; int ret; int error = 0; struct futex *f; struct futex *newf; int tout; struct futex *f2; struct waiting_proc *wp; int op_ret, cmd; clockid_t clk; cmd = SCARG(uap, op) & LINUX_FUTEX_CMD_MASK; val3 = SCARG(uap, val3); if (SCARG(uap, op) & LINUX_FUTEX_CLOCK_REALTIME) { switch (cmd) { case LINUX_FUTEX_WAIT_BITSET: case LINUX_FUTEX_WAIT: clk = CLOCK_REALTIME; break; default: return ENOSYS; } } else clk = CLOCK_MONOTONIC; /* * Our implementation provides only private futexes. Most of the apps * should use private futexes but don't claim so. Therefore we treat * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works * in most cases (ie. when futexes are not shared on file descriptor * or between different processes). * * Note that we don't handle bitsets at all at the moment. We need * to move from refcounting uaddr's to handling multiple futex entries * pointing to the same uaddr, but having possibly different bitmask. * Perhaps move to an implementation where each uaddr has a list of * futexes. */ switch (cmd) { case LINUX_FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; /*FALLTHROUGH*/ case LINUX_FUTEX_WAIT_BITSET: if ((error = ts2timo(clk, 0, ts, &tout, NULL)) != 0) { if (error != ETIMEDOUT) return error; /* * If the user process requests a non null * timeout, make sure we do not turn it into * an infinite timeout because tout is 0. * * We use a minimal timeout of 1/hz. Maybe it * would make sense to just return ETIMEDOUT * without sleeping. */ if (SCARG(uap, timeout) != NULL) tout = 1; else tout = 0; } FUTEX_SYSTEM_LOCK; if ((error = copyin(SCARG(uap, uaddr), &val, sizeof(val))) != 0) { FUTEX_SYSTEM_UNLOCK; return error; } if (val != SCARG(uap, val)) { FUTEX_SYSTEM_UNLOCK; return EWOULDBLOCK; } FUTEXPRINTF(("FUTEX_WAIT %d.%d: val = %d, uaddr = %p, " "*uaddr = %d, timeout = %lld.%09ld\n", l->l_proc->p_pid, l->l_lid, SCARG(uap, val), SCARG(uap, uaddr), val, (long long)ts->tv_sec, ts->tv_nsec)); wp = futex_wp_alloc(); FUTEX_LOCK; f = futex_get(SCARG(uap, uaddr), val3); ret = futex_sleep(&f, l, tout, wp); futex_put(f); FUTEX_UNLOCK; futex_wp_free(wp); FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, " "ret = %d\n", l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr), ret)); FUTEX_SYSTEM_UNLOCK; switch (ret) { case EWOULDBLOCK: /* timeout */ return ETIMEDOUT; break; case EINTR: /* signal */ return EINTR; break; case 0: /* FUTEX_WAKE received */ FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, got it\n", l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr))); return 0; break; default: FUTEXPRINTF(("FUTEX_WAIT: unexpected ret = %d\n", ret)); break; } /* NOTREACHED */ break; case LINUX_FUTEX_WAKE: val = FUTEX_BITSET_MATCH_ANY; /*FALLTHROUGH*/ case LINUX_FUTEX_WAKE_BITSET: /* * XXX: Linux is able cope with different addresses * corresponding to the same mapped memory in the sleeping * and the waker process(es). */ FUTEXPRINTF(("FUTEX_WAKE %d.%d: uaddr = %p, val = %d\n", l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr), SCARG(uap, val))); FUTEX_SYSTEM_LOCK; FUTEX_LOCK; f = futex_get(SCARG(uap, uaddr), val3); *retval = futex_wake(f, SCARG(uap, val), NULL, 0); futex_put(f); FUTEX_UNLOCK; FUTEX_SYSTEM_UNLOCK; break; case LINUX_FUTEX_CMP_REQUEUE: FUTEX_SYSTEM_LOCK; if ((error = copyin(SCARG(uap, uaddr), &val, sizeof(val))) != 0) { FUTEX_SYSTEM_UNLOCK; return error; } if (val != val3) { FUTEX_SYSTEM_UNLOCK; return EAGAIN; } FUTEXPRINTF(("FUTEX_CMP_REQUEUE %d.%d: uaddr = %p, val = %d, " "uaddr2 = %p, val2 = %d\n", l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr), SCARG(uap, val), SCARG(uap, uaddr2), (int)(unsigned long)SCARG(uap, timeout))); FUTEX_LOCK; f = futex_get(SCARG(uap, uaddr), val3); newf = futex_get(SCARG(uap, uaddr2), val3); *retval = futex_wake(f, SCARG(uap, val), newf, (int)(unsigned long)SCARG(uap, timeout)); futex_put(f); futex_put(newf); FUTEX_UNLOCK; FUTEX_SYSTEM_UNLOCK; break; case LINUX_FUTEX_REQUEUE: FUTEX_SYSTEM_LOCK; FUTEXPRINTF(("FUTEX_REQUEUE %d.%d: uaddr = %p, val = %d, " "uaddr2 = %p, val2 = %d\n", l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr), SCARG(uap, val), SCARG(uap, uaddr2), (int)(unsigned long)SCARG(uap, timeout))); FUTEX_LOCK; f = futex_get(SCARG(uap, uaddr), val3); newf = futex_get(SCARG(uap, uaddr2), val3); *retval = futex_wake(f, SCARG(uap, val), newf, (int)(unsigned long)SCARG(uap, timeout)); futex_put(f); futex_put(newf); FUTEX_UNLOCK; FUTEX_SYSTEM_UNLOCK; break; case LINUX_FUTEX_FD: FUTEXPRINTF(("%s: unimplemented op %d\n", __func__, cmd)); return ENOSYS; case LINUX_FUTEX_WAKE_OP: FUTEX_SYSTEM_LOCK; FUTEXPRINTF(("FUTEX_WAKE_OP %d.%d: uaddr = %p, op = %d, " "val = %d, uaddr2 = %p, val2 = %d\n", l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr), cmd, SCARG(uap, val), SCARG(uap, uaddr2), (int)(unsigned long)SCARG(uap, timeout))); FUTEX_LOCK; f = futex_get(SCARG(uap, uaddr), val3); f2 = futex_get(SCARG(uap, uaddr2), val3); FUTEX_UNLOCK; /* * This function returns positive number as results and * negative as errors */ op_ret = futex_atomic_op(l, val3, SCARG(uap, uaddr2)); FUTEX_LOCK; if (op_ret < 0) { futex_put(f); futex_put(f2); FUTEX_UNLOCK; FUTEX_SYSTEM_UNLOCK; return -op_ret; } ret = futex_wake(f, SCARG(uap, val), NULL, 0); futex_put(f); if (op_ret > 0) { op_ret = 0; /* * Linux abuses the address of the timespec parameter * as the number of retries */ op_ret += futex_wake(f2, (int)(unsigned long)SCARG(uap, timeout), NULL, 0); ret += op_ret; } futex_put(f2); FUTEX_UNLOCK; FUTEX_SYSTEM_UNLOCK; *retval = ret; break; default: FUTEXPRINTF(("%s: unknown op %d\n", __func__, cmd)); return ENOSYS; } return 0; } static struct waiting_proc * futex_wp_alloc(void) { struct waiting_proc *wp; wp = kmem_zalloc(sizeof(*wp), KM_SLEEP); cv_init(&wp->wp_futex_cv, "futex"); return wp; } static void futex_wp_free(struct waiting_proc *wp) { cv_destroy(&wp->wp_futex_cv); kmem_free(wp, sizeof(*wp)); } static struct futex * futex_get(void *uaddr, uint32_t bitset) { struct futex *f; FUTEX_LOCKASSERT; LIST_FOREACH(f, &futex_list, f_list) { if (f->f_uaddr == uaddr) { f->f_refcount++; return f; } } /* Not found, create it */ f = kmem_zalloc(sizeof(*f), KM_SLEEP); f->f_uaddr = uaddr; f->f_bitset = bitset; f->f_refcount = 1; TAILQ_INIT(&f->f_waiting_proc); LIST_INSERT_HEAD(&futex_list, f, f_list); return f; } static void futex_ref(struct futex *f) { FUTEX_LOCKASSERT; f->f_refcount++; } static void futex_put(struct futex *f) { FUTEX_LOCKASSERT; f->f_refcount--; if (f->f_refcount == 0) { KASSERT(TAILQ_EMPTY(&f->f_waiting_proc)); LIST_REMOVE(f, f_list); kmem_free(f, sizeof(*f)); } } static int futex_sleep(struct futex **fp, lwp_t *l, int timeout, struct waiting_proc *wp) { struct futex *f; int ret; FUTEX_LOCKASSERT; f = *fp; wp->wp_futex = f; TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list); wp->wp_onlist = true; ret = cv_timedwait_sig(&wp->wp_futex_cv, &futex_lock, timeout); /* * we may have been requeued to a different futex before we were * woken up, so let the caller know which futex to put. if we were * woken by futex_wake() then it took us off the waiting list, * but if our sleep was interrupted or timed out then we might * need to take ourselves off the waiting list. */ f = wp->wp_futex; if (wp->wp_onlist) { TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); } *fp = f; return ret; } static int futex_wake(struct futex *f, int n, struct futex *newf, int n2) { struct waiting_proc *wp; int count = 0; FUTEX_LOCKASSERT; /* * wake up up to n threads waiting on this futex. */ while (n--) { wp = TAILQ_FIRST(&f->f_waiting_proc); if (wp == NULL) return count; KASSERT(f == wp->wp_futex); TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); wp->wp_onlist = false; cv_signal(&wp->wp_futex_cv); count++; } if (newf == NULL) return count; /* * then requeue up to n2 additional threads to newf * (without waking them up). */ while (n2--) { wp = TAILQ_FIRST(&f->f_waiting_proc); if (wp == NULL) return count; KASSERT(f == wp->wp_futex); TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); futex_put(f); wp->wp_futex = newf; futex_ref(newf); TAILQ_INSERT_TAIL(&newf->f_waiting_proc, wp, wp_list); count++; } return count; } static int futex_atomic_op(lwp_t *l, int encoded_op, void *uaddr) { const int op = (encoded_op >> 28) & 7; const int cmp = (encoded_op >> 24) & 15; const int cmparg = (encoded_op << 20) >> 20; int oparg = (encoded_op << 8) >> 20; int error, oldval, cval; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; /* XXX: linux verifies access here and returns EFAULT */ if (copyin(uaddr, &cval, sizeof(int)) != 0) return -EFAULT; for (;;) { int nval; switch (op) { case FUTEX_OP_SET: nval = oparg; break; case FUTEX_OP_ADD: nval = cval + oparg; break; case FUTEX_OP_OR: nval = cval | oparg; break; case FUTEX_OP_ANDN: nval = cval & ~oparg; break; case FUTEX_OP_XOR: nval = cval ^ oparg; break; default: return -ENOSYS; } error = ucas_int(uaddr, cval, nval, &oldval); if (error || oldval == cval) { break; } cval = oldval; } if (error) return -EFAULT; switch (cmp) { case FUTEX_OP_CMP_EQ: return (oldval == cmparg); case FUTEX_OP_CMP_NE: return (oldval != cmparg); case FUTEX_OP_CMP_LT: return (oldval < cmparg); case FUTEX_OP_CMP_GE: return (oldval >= cmparg); case FUTEX_OP_CMP_LE: return (oldval <= cmparg); case FUTEX_OP_CMP_GT: return (oldval > cmparg); default: return -ENOSYS; } } int linux_sys_set_robust_list(struct lwp *l, const struct linux_sys_set_robust_list_args *uap, register_t *retval) { /* { syscallarg(struct linux_robust_list_head *) head; syscallarg(size_t) len; } */ struct linux_emuldata *led; if (SCARG(uap, len) != sizeof(struct linux_robust_list_head)) return EINVAL; led = l->l_emuldata; led->led_robust_head = SCARG(uap, head); *retval = 0; return 0; } int linux_sys_get_robust_list(struct lwp *l, const struct linux_sys_get_robust_list_args *uap, register_t *retval) { /* { syscallarg(int) pid; syscallarg(struct linux_robust_list_head **) head; syscallarg(size_t *) len; } */ struct proc *p; struct linux_emuldata *led; struct linux_robust_list_head *head; size_t len; int error = 0; p = l->l_proc; if (!SCARG(uap, pid)) { led = l->l_emuldata; head = led->led_robust_head; } else { mutex_enter(p->p_lock); l = lwp_find(p, SCARG(uap, pid)); if (l != NULL) { led = l->l_emuldata; head = led->led_robust_head; } mutex_exit(p->p_lock); if (l == NULL) { return ESRCH; } } #ifdef __arch64__ if (p->p_flag & PK_32) { uint32_t u32; u32 = 12; error = copyout(&u32, SCARG(uap, len), sizeof(u32)); if (error) return error; u32 = (uint32_t)(uintptr_t)head; return copyout(&u32, SCARG(uap, head), sizeof(u32)); } #endif len = sizeof(*head); error = copyout(&len, SCARG(uap, len), sizeof(len)); if (error) return error; return copyout(&head, SCARG(uap, head), sizeof(head)); } static int handle_futex_death(void *uaddr, pid_t pid, int pi) { int uval, nval, mval; struct futex *f; retry: if (copyin(uaddr, &uval, sizeof(uval))) return EFAULT; if ((uval & FUTEX_TID_MASK) == pid) { mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; nval = atomic_cas_32(uaddr, uval, mval); if (nval == -1) return EFAULT; if (nval != uval) goto retry; if (!pi && (uval & FUTEX_WAITERS)) { FUTEX_LOCK; f = futex_get(uaddr, FUTEX_BITSET_MATCH_ANY); futex_wake(f, 1, NULL, 0); FUTEX_UNLOCK; } } return 0; } static int fetch_robust_entry(struct lwp *l, struct linux_robust_list **entry, struct linux_robust_list **head, int *pi) { unsigned long uentry; #ifdef __arch64__ if (l->l_proc->p_flag & PK_32) { uint32_t u32; if (copyin(head, &u32, sizeof(u32))) return EFAULT; uentry = (unsigned long)u32; } else #endif if (copyin(head, &uentry, sizeof(uentry))) return EFAULT; *entry = (void *)(uentry & ~1UL); *pi = uentry & 1; return 0; } /* This walks the list of robust futexes, releasing them. */ void release_futexes(struct lwp *l) { struct linux_robust_list_head head; struct linux_robust_list *entry, *next_entry = NULL, *pending; unsigned int limit = 2048, pi, next_pi, pip; struct linux_emuldata *led; unsigned long futex_offset; int rc; led = l->l_emuldata; if (led->led_robust_head == NULL) return; #ifdef __arch64__ if (l->l_proc->p_flag & PK_32) { uint32_t u32s[3]; if (copyin(led->led_robust_head, u32s, sizeof(u32s))) return; head.list.next = (void *)(uintptr_t)u32s[0]; head.futex_offset = (unsigned long)u32s[1]; head.pending_list = (void *)(uintptr_t)u32s[2]; } else #endif if (copyin(led->led_robust_head, &head, sizeof(head))) return; if (fetch_robust_entry(l, &entry, &head.list.next, &pi)) return; #ifdef __arch64__ if (l->l_proc->p_flag & PK_32) { uint32_t u32; if (copyin(led->led_robust_head, &u32, sizeof(u32))) return; head.futex_offset = (unsigned long)u32; futex_offset = head.futex_offset; } else #endif if (copyin(&head.futex_offset, &futex_offset, sizeof(unsigned long))) return; if (fetch_robust_entry(l, &pending, &head.pending_list, &pip)) return; while (entry != &head.list) { rc = fetch_robust_entry(l, &next_entry, &entry->next, &next_pi); if (entry != pending) if (handle_futex_death((char *)entry + futex_offset, l->l_lid, pi)) return; if (rc) return; entry = next_entry; pi = next_pi; if (!--limit) break; yield(); /* XXX why? */ } if (pending) handle_futex_death((char *)pending + futex_offset, l->l_lid, pip); }