==== //depot/vendor/freebsd/src/sys/kern/init_main.c#113 (text+ko) - //depot/projects/ethng/src/sys/kern/init_main.c#5 (text+ko) ==== content @@ -153,6 +153,21 @@ newsysinit_end = newset + count; } +int cursubsystem; + +int +check_subsystem(int subsystem) +{ + + if (subsystem < cursubsystem) + return (1); + else if (subsystem == cursubsystem) + return (0); + else + return (-1); +} + + /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the @@ -221,7 +236,9 @@ if ((*sipp)->subsystem == SI_SUB_DONE) continue; - + + cursubsystem = (*sipp)->subsystem; + #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; ==== //depot/vendor/freebsd/src/sys/kern/kern_intr.c#92 (text+ko) - //depot/projects/ethng/src/sys/kern/kern_intr.c#5 (text+ko) ==== content @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -240,7 +241,7 @@ #ifndef INTR_FILTER int intr_event_create(struct intr_event **event, void *source, int flags, - void (*enable)(void *), const char *fmt, ...) + void (*enable)(void *), int (*assign_cpu)(void *, u_char), const char *fmt, ...) { struct intr_event *ie; va_list ap; @@ -251,6 +252,8 @@ ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO); ie->ie_source = source; ie->ie_enable = enable; + ie->ie_assign_cpu = assign_cpu; + ie->ie_cpu = NOCPU; ie->ie_flags = flags; TAILQ_INIT(&ie->ie_handlers); mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF); @@ -271,7 +274,7 @@ int intr_event_create(struct intr_event **event, void *source, int flags, void (*enable)(void *), void (*eoi)(void *), void (*disab)(void *), - const char *fmt, ...) + int (*assign_cpu)(void *, u_char), const char *fmt, ...) { struct intr_event *ie; va_list ap; @@ -302,6 +305,32 @@ } #endif +/* + * Bind an interrupt event to the specified CPU. 
+ */ +int +intr_event_bind(struct intr_event *ie, u_char cpu) +{ + int error; + struct thread *td = curthread; + + /* Need a CPU to bind to. */ + if (cpu != NOCPU && CPU_ABSENT(cpu)) + return (EINVAL); + + if (ie->ie_assign_cpu == NULL) + return (EOPNOTSUPP); + + error = ie->ie_assign_cpu(ie->ie_source, cpu); + if (error) + return (error); + + thread_lock(td); + ie->ie_cpu = cpu; + thread_unlock(td); + return (0); +} + int intr_event_destroy(struct intr_event *ie) { @@ -893,10 +922,10 @@ } else { #ifdef INTR_FILTER error = intr_event_create(&ie, NULL, IE_SOFT, - NULL, NULL, NULL, "swi%d:", pri); + NULL, NULL, NULL, NULL, "swi%d:", pri); #else error = intr_event_create(&ie, NULL, IE_SOFT, - NULL, "swi%d:", pri); + NULL, NULL, "swi%d:", pri); #endif if (error) return (error); @@ -1079,6 +1108,7 @@ struct intr_event *ie; struct thread *td; struct proc *p; + u_char cpu; td = curthread; p = td->td_proc; @@ -1087,7 +1117,8 @@ ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; - + cpu = NOCPU; + /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. @@ -1132,6 +1163,21 @@ ie->ie_count = 0; mi_switch(SW_VOL, NULL); } + +#ifdef SMP + /* + * Ensure we are bound to the correct CPU. We can't + * move ithreads until SMP is running however, so just + * leave interrupts on the boot CPU during boot. 
+ */ + if (ie->ie_cpu != cpu && smp_started) { + cpu = ie->ie_cpu; + if (cpu == NOCPU) + sched_unbind(td); + else + sched_bind(td, cpu); + } +#endif thread_unlock(td); } } @@ -1461,6 +1507,8 @@ db_printf("ADDING_THREAD"); comma = 1; } + if (ie->ie_cpu != NOCPU) + db_printf(" (CPU %d)", ie->ie_cpu); if (it != NULL && it->it_need) { if (comma) db_printf(", "); ==== //depot/vendor/freebsd/src/sys/kern/kern_mbuf.c#34 (text+ko) - //depot/projects/ethng/src/sys/kern/kern_mbuf.c#9 (text+ko) ==== content @@ -99,6 +99,8 @@ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ +int jumbo_phys_contig = 1; /* jumbo frames are physically contiguous */ + struct mbstat mbstat; static void @@ -140,7 +142,8 @@ "Maximum number of mbuf 16k jumbo clusters allowed"); SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, "Mbuf general information and statistics"); - +SYSCTL_INT(_kern_ipc, OID_AUTO, jumbo_phys_contig, CTLFLAG_RD, &jumbo_phys_contig, 1, + "jumbo frames are physically contiguous"); /* * Zones from which we allocate. 
*/ @@ -227,6 +230,9 @@ if (nmbjumbo9 > 0) uma_zone_set_max(zone_jumbo9, nmbjumbo9); + if (jumbo_phys_contig) + uma_zone_set_contig(zone_jumbo9); + zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, mb_dtor_clust, #ifdef INVARIANTS @@ -238,6 +244,9 @@ if (nmbjumbo16 > 0) uma_zone_set_max(zone_jumbo16, nmbjumbo16); + if (jumbo_phys_contig) + uma_zone_set_contig(zone_jumbo16); + zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int), NULL, NULL, NULL, NULL, @@ -321,6 +330,7 @@ m->m_pkthdr.tso_segsz = 0; m->m_pkthdr.ether_vtag = 0; SLIST_INIT(&m->m_pkthdr.tags); + m->m_pkthdr.rss_hash = 0; #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); @@ -339,8 +349,8 @@ mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; - unsigned long flags; - + unsigned long flags = (unsigned long)arg; + m = (struct mbuf *)mem; flags = (unsigned long)arg; @@ -372,7 +382,6 @@ KASSERT(m->m_ext.ext_args == NULL, ("%s: ext_args != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); - KASSERT(*m->m_ext.ref_cnt == 1, ("%s: ref_cnt != 1", __func__)); #ifdef INVARIANTS trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif @@ -400,7 +409,6 @@ mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; - u_int *refcnt; int type; uma_zone_t zone; @@ -431,10 +439,8 @@ break; } - m = (struct mbuf *)arg; - refcnt = uma_find_refcnt(zone, mem); - *refcnt = 1; - if (m != NULL) { + if (arg != NULL) { + m = (struct mbuf *)arg; m->m_ext.ext_buf = (caddr_t)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; @@ -442,7 +448,7 @@ m->m_ext.ext_args = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; - m->m_ext.ref_cnt = refcnt; + m->m_ext.ref_cnt = NULL; /* lazy assignment */ } return (0); @@ -533,7 +539,8 @@ m->m_len = 0; m->m_flags = (flags | M_EXT); m->m_type = type; - + 
m->m_ext.ref_cnt = NULL; /* lazy refcnt */ + if (flags & M_PKTHDR) { m->m_pkthdr.rcvif = NULL; m->m_pkthdr.len = 0; ==== //depot/vendor/freebsd/src/sys/kern/kern_rwlock.c#30 (text+ko) - //depot/projects/ethng/src/sys/kern/kern_rwlock.c#3 (text+ko) ==== content @@ -324,7 +324,7 @@ * the owner stops running or the state of the lock * changes. */ - owner = (struct thread *)RW_OWNER(x); + owner = (volatile struct thread *)RW_OWNER(x); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); if (LOCK_LOG_TEST(&rw->lock_object, 0)) @@ -334,13 +334,16 @@ lock_profile_obtain_lock_failed(&rw->lock_object, &contested, &waittime); #endif - while ((struct thread*)RW_OWNER(rw->rw_lock)== owner && - TD_IS_RUNNING(owner)) + if (owner == curthread) + panic("logic error in rwlock"); + while ((volatile struct thread*)RW_OWNER(rw->rw_lock)== owner && + TD_IS_RUNNING(owner)) cpu_spinwait(); + continue; + } #endif - /* * We were unable to acquire the lock and the read waiters * flag is set, so we must block on the turnstile. ==== //depot/vendor/freebsd/src/sys/kern/kern_switch.c#123 (text+ko) - //depot/projects/ethng/src/sys/kern/kern_switch.c#6 (text+ko) ==== content @@ -151,45 +151,15 @@ return (td); } -/* - * Kernel thread preemption implementation. Critical sections mark - * regions of code in which preemptions are not allowed. 
- */ void -critical_enter(void) +critical_exit_owepreempt(struct thread *td) { - struct thread *td; - - td = curthread; - td->td_critnest++; - CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td, - (long)td->td_proc->p_pid, td->td_name, td->td_critnest); -} - -void -critical_exit(void) -{ - struct thread *td; - - td = curthread; - KASSERT(td->td_critnest != 0, - ("critical_exit: td_critnest == 0")); - - if (td->td_critnest == 1) { - td->td_critnest = 0; - if (td->td_owepreempt) { - td->td_critnest = 1; - thread_lock(td); - td->td_critnest--; - SCHED_STAT_INC(switch_owepreempt); - mi_switch(SW_INVOL|SW_PREEMPT, NULL); - thread_unlock(td); - } - } else - td->td_critnest--; - - CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td, - (long)td->td_proc->p_pid, td->td_name, td->td_critnest); + td->td_critnest = 1; + thread_lock(td); + td->td_critnest--; + SCHED_STAT_INC(switch_owepreempt); + mi_switch(SW_INVOL|SW_PREEMPT, NULL); + thread_unlock(td); } /* ==== //depot/vendor/freebsd/src/sys/kern/kern_timeout.c#40 (text+ko) - //depot/projects/ethng/src/sys/kern/kern_timeout.c#4 (text+ko) ==== content @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,9 @@ static int avg_mtxcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mtxcalls, CTLFLAG_RD, &avg_mtxcalls, 0, "Average number of mtx callouts made per softclock call. Units = 1/1000"); +static int avg_rwcalls; +SYSCTL_INT(_debug, OID_AUTO, to_avg_rwcalls, CTLFLAG_RD, &avg_rwcalls, 0, + "Average number of rw callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. 
Units = 1/1000"); @@ -171,6 +175,7 @@ int depth; int mpcalls; int mtxcalls; + int rwcalls; int gcalls; #ifdef DIAGNOSTIC struct bintime bt1, bt2; @@ -185,6 +190,7 @@ mpcalls = 0; mtxcalls = 0; + rwcalls = 0; gcalls = 0; depth = 0; steps = 0; @@ -216,6 +222,7 @@ void (*c_func)(void *); void *c_arg; struct mtx *c_mtx; + struct rwlock *c_rwlock; int c_flags; nextsoftcheck = TAILQ_NEXT(c, c_links.tqe); @@ -223,6 +230,7 @@ c_func = c->c_func; c_arg = c->c_arg; c_mtx = c->c_mtx; + c_rwlock = c->c_rwlock; c_flags = c->c_flags; if (c->c_flags & CALLOUT_LOCAL_ALLOC) { c->c_func = NULL; @@ -237,7 +245,19 @@ } curr_cancelled = 0; mtx_unlock_spin(&callout_lock); - if (c_mtx != NULL) { + if (c_rwlock != NULL) { + rw_wlock(c_rwlock); + if (curr_cancelled) { + rw_wunlock(c_rwlock); + goto skip; + } + curr_cancelled = 1; + rwcalls++; + + CTR3(KTR_CALLOUT, "callout rw" + " %p func %p arg %p", + c, c_func, c_arg); + } else if (c_mtx != NULL) { mtx_lock(c_mtx); /* * The callout may have been cancelled @@ -290,8 +310,10 @@ lastfunc = c_func; } #endif - if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0) + if (c_mtx != NULL && (c_flags & CALLOUT_RETURNUNLOCKED) == 0) mtx_unlock(c_mtx); + if (c_rwlock != NULL && (c_flags & CALLOUT_RETURNUNLOCKED_RW) == 0) + rw_wunlock(c_rwlock); skip: mtx_lock_spin(&callout_lock); curr_callout = NULL; @@ -313,6 +335,7 @@ avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_mtxcalls += (mtxcalls * 1000 - avg_mtxcalls) >> 8; + avg_rwcalls += (rwcalls * 1000 - avg_rwcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; nextsoftcheck = NULL; mtx_unlock_spin(&callout_lock); @@ -614,7 +637,6 @@ { bzero(c, sizeof *c); if (mpsafe) { - c->c_mtx = NULL; c->c_flags = CALLOUT_RETURNUNLOCKED; } else { c->c_mtx = &Giant; @@ -630,7 +652,8 @@ { bzero(c, sizeof *c); c->c_mtx = mtx; - KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED)) == 0, + c->c_rwlock = NULL; + KASSERT((flags & ~CALLOUT_RETURNUNLOCKED) == 0, 
("callout_init_mtx: bad flags %d", flags)); /* CALLOUT_RETURNUNLOCKED makes no sense without a mutex. */ KASSERT(mtx != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, @@ -638,6 +661,17 @@ c->c_flags = flags & (CALLOUT_RETURNUNLOCKED); } +void +callout_init_rwlock(struct callout *c, struct rwlock *rw, int flags) +{ + bzero(c, sizeof *c); + c->c_mtx = NULL; + c->c_rwlock = rw; + KASSERT((flags & ~CALLOUT_RETURNUNLOCKED_RW) == 0, + ("callout_init_rwlock: bad flags %d", flags)); + c->c_flags = flags & CALLOUT_RETURNUNLOCKED_RW; +} + #ifdef APM_FIXUP_CALLTODO /* * Adjust the kernel calltodo timeout list. This routine is used after ==== //depot/vendor/freebsd/src/sys/kern/subr_witness.c#157 (text+ko) - //depot/projects/ethng/src/sys/kern/subr_witness.c#4 (text+ko) ==== content @@ -306,8 +306,8 @@ * Routing */ { "so_rcv", &lock_class_mtx_sleep }, - { "radix node head", &lock_class_mtx_sleep }, - { "rtentry", &lock_class_mtx_sleep }, + { "radix node head", &lock_class_rw }, + { "rtentry", &lock_class_rw }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* ==== //depot/vendor/freebsd/src/sys/kern/uipc_mbuf.c#87 (text+ko) - //depot/projects/ethng/src/sys/kern/uipc_mbuf.c#5 (text+ko) ==== content @@ -109,9 +109,23 @@ /* Loop and append maximum sized mbufs to the chain tail. 
*/ while (len > 0) { + int clsize = MJUMPAGESIZE; +#ifdef notyet + /* + * XXX seeing what appears to be a memory leak on blast + * overload conditions - turning this off won't fix + * but it will delay it + */ + if (jumbo_phys_contig) { + if (len >= MJUM9BYTES) + clsize = MJUM16BYTES; + else if (len >= MJUMPAGESIZE) + clsize = MJUM9BYTES; + } +#endif if (len > MCLBYTES) mb = m_getjcl(how, type, (flags & M_PKTHDR), - MJUMPAGESIZE); + clsize); else if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) @@ -212,25 +226,39 @@ mb_free_ext(struct mbuf *m) { int skipmbuf; + int dofree; + u_int cnt; KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); - KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__)); - + /* Account for lazy ref count assign. */ + if (m->m_ext.ref_cnt == NULL) + dofree = 1; + else + dofree = 0; + /* * check if the header is embedded in the cluster - */ + */ skipmbuf = (m->m_flags & M_NOFREE); - + + /* + * This is tricky. We need to make sure to decrement the + * refcount in a safe way but to also clean up if we're the + * last reference. This method seems to do it without race. + */ + while (dofree == 0) { + cnt = *(m->m_ext.ref_cnt); + if (atomic_cmpset_int(m->m_ext.ref_cnt, cnt, cnt - 1)) { + if (cnt == 1) + dofree = 1; + break; + } + } + /* Free attached storage if this mbuf is the only reference to it. */ - if (*(m->m_ext.ref_cnt) == 1 || - atomic_fetchadd_int(m->m_ext.ref_cnt, -1) == 1) { + if (dofree) { switch (m->m_ext.ext_type) { - case EXT_PACKET: /* The packet zone is special. */ - if (*(m->m_ext.ref_cnt) == 0) - *(m->m_ext.ref_cnt) = 1; - uma_zfree(zone_pack, m); - return; /* Job done. 
*/ case EXT_CLUSTER: uma_zfree(zone_clust, m->m_ext.ext_buf); break; @@ -262,6 +290,7 @@ ("%s: unknown ext_type", __func__)); } } + if (skipmbuf) return; @@ -287,13 +316,9 @@ mb_dupcl(struct mbuf *n, struct mbuf *m) { KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); - KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__)); KASSERT((n->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__)); - if (*(m->m_ext.ref_cnt) == 1) - *(m->m_ext.ref_cnt) += 1; - else - atomic_add_int(m->m_ext.ref_cnt, 1); + MEXT_ADD_REF(m); n->m_ext.ext_buf = m->m_ext.ext_buf; n->m_ext.ext_free = m->m_ext.ext_free; n->m_ext.ext_args = m->m_ext.ext_args; ==== //depot/vendor/freebsd/src/sys/kern/vfs_export.c#30 (text+ko) - //depot/projects/ethng/src/sys/kern/vfs_export.c#3 (text+ko) ==== content @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include #include @@ -222,6 +222,7 @@ if ((rnh = nep->ne_rtable[i])) { RADIX_NODE_HEAD_LOCK(rnh); (*rnh->rnh_walktree) (rnh, vfs_free_netcred, rnh); + RADIX_NODE_HEAD_UNLOCK(rnh); RADIX_NODE_HEAD_DESTROY(rnh); free(rnh, M_RTABLE); nep->ne_rtable[i] = NULL; /* not SMP safe XXX */