diff --git a/sys/dev/dkwedge/dk.c b/sys/dev/dkwedge/dk.c index a452cff4cd17..6ac56fd5f12d 100644 --- a/sys/dev/dkwedge/dk.c +++ b/sys/dev/dkwedge/dk.c @@ -60,38 +60,68 @@ __KERNEL_RCSID(0, "$NetBSD: dk.c,v 1.125 2023/04/13 08:30:40 riastradh Exp $"); MALLOC_DEFINE(M_DKWEDGE, "dkwedge", "Disk wedge structures"); -typedef enum { - DKW_STATE_LARVAL = 0, - DKW_STATE_RUNNING = 1, - DKW_STATE_DYING = 2, - DKW_STATE_DEAD = 666 -} dkwedge_state_t; +/* + * Lock order: + * + * sc->sc_dk.dk_openlock + * => sc->sc_parent->dk_rawlock + * => sc->sc_parent->dk_openlock + * => dkwedges_lock + * + * Locking notes: + * + * W dkwedges_lock + * D device reference + * O sc->sc_dk.dk_openlock + * P sc->sc_parent->dk_openlock + * R sc->sc_parent->dk_rawlock + * I sc->sc_iolock + * $ stable after initialization + * 1 used only by a single thread + * + * x&y means both x and y must be held to write (with a write lock if + * one is rwlock), and either x or y must be held to read. + */ struct dkwedge_softc { - device_t sc_dev; /* pointer to our pseudo-device */ - struct cfdata sc_cfdata; /* our cfdata structure */ - uint8_t sc_wname[128]; /* wedge name (Unicode, UTF-8) */ - - dkwedge_state_t sc_state; /* state this wedge is in */ - - struct disk *sc_parent; /* parent disk */ - daddr_t sc_offset; /* LBA offset of wedge in parent */ - uint64_t sc_size; /* size of wedge in blocks */ - char sc_ptype[32]; /* partition type */ - dev_t sc_pdev; /* cached parent's dev_t */ - /* link on parent's wedge list */ + device_t sc_dev; /* P&W: pointer to our pseudo-device */ + /* sc_dev is also stable while device is referenced */ + struct cfdata sc_cfdata; /* 1: our cfdata structure */ + uint8_t sc_wname[128]; /* $: wedge name (Unicode, UTF-8) */ + + struct disk *sc_parent; /* $: parent disk */ + /* P: sc_parent->dk_openmask */ + /* P: sc_parent->dk_nwedges */ + /* P: sc_parent->dk_wedges */ + /* R: sc_parent->dk_rawopens */ + /* R: sc_parent->dk_rawvp */ + daddr_t sc_offset; /* $: LBA offset of wedge in 
parent */ + uint64_t sc_size; /* P&W: size of wedge in blocks */ + /* + * XXX sc_size should be exposed via seqlock on + * architectures without 64-bit atomic load/store. + */ + char sc_ptype[32]; /* $: partition type */ + dev_t sc_pdev; /* $: cached parent's dev_t */ + /* P: link on parent's wedge list */ LIST_ENTRY(dkwedge_softc) sc_plink; struct disk sc_dk; /* our own disk structure */ - struct bufq_state *sc_bufq; /* buffer queue */ - struct callout sc_restart_ch; /* callout to restart I/O */ + /* O&R: sc_dk.dk_openmask */ + struct bufq_state *sc_bufq; /* $: buffer queue */ + struct callout sc_restart_ch; /* I: callout to restart I/O */ kmutex_t sc_iolock; - kcondvar_t sc_dkdrn; - u_int sc_iopend; /* I/Os pending */ - int sc_mode; /* parent open mode */ + kcondvar_t sc_dkdrn; /* I: notifies when io_pend -> 0 */ + u_int sc_iopend; /* I: I/Os pending */ + bool sc_iostop; /* I: don't schedule restart */ + int sc_mode; /* O&R: parent open mode */ }; +static int dkwedge_match(device_t, cfdata_t, void *); +static void dkwedge_attach(device_t, device_t, void *); +static int dkwedge_detach(device_t, int); + static void dkstart(struct dkwedge_softc *); static void dkiodone(struct buf *); static void dkrestart(void *); @@ -99,7 +129,6 @@ static void dkminphys(struct buf *); static int dkfirstopen(struct dkwedge_softc *, int); static void dklastclose(struct dkwedge_softc *); -static int dkwedge_cleanup_parent(struct dkwedge_softc *, int); static int dkwedge_detach(device_t, int); static void dkwedge_delall1(struct disk *, bool); static int dkwedge_del1(struct dkwedge_info *, int); @@ -108,6 +137,7 @@ static int dk_close_parent(struct vnode *, int); static dev_type_open(dkopen); static dev_type_close(dkclose); +static dev_type_cancel(dkcancel); static dev_type_read(dkread); static dev_type_write(dkwrite); static dev_type_ioctl(dkioctl); @@ -116,9 +146,15 @@ static dev_type_dump(dkdump); static dev_type_size(dksize); static dev_type_discard(dkdiscard); +CFDRIVER_DECL(dk, 
DV_DISK, NULL); +CFATTACH_DECL3_NEW(dk, 0, + dkwedge_match, dkwedge_attach, dkwedge_detach, NULL, NULL, NULL, + DVF_DETACH_SHUTDOWN); + const struct bdevsw dk_bdevsw = { .d_open = dkopen, .d_close = dkclose, + .d_cancel = dkcancel, .d_strategy = dkstrategy, .d_ioctl = dkioctl, .d_dump = dkdump, @@ -130,6 +166,7 @@ const struct bdevsw dk_bdevsw = { const struct cdevsw dk_cdevsw = { .d_open = dkopen, .d_close = dkclose, + .d_cancel = dkcancel, .d_read = dkread, .d_write = dkwrite, .d_ioctl = dkioctl, @@ -139,6 +176,8 @@ const struct cdevsw dk_cdevsw = { .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = dkdiscard, + .d_cfdriver = &dk_cd, + .d_devtounit = dev_minor_unit, .d_flag = D_DISK | D_MPSAFE }; @@ -160,7 +199,7 @@ dkwedge_match(device_t parent, cfdata_t match, { /* Pseudo-device; always present. */ - return (1); + return 1; } /* @@ -169,19 +208,27 @@ dkwedge_match(device_t parent, cfdata_t match, * Autoconfiguration attach function for pseudo-device glue. */ static void -dkwedge_attach(device_t parent, device_t self, - void *aux) +dkwedge_attach(device_t parent, device_t self, void *aux) { + struct dkwedge_softc *sc = aux; + int unit = device_unit(self); + + KASSERTMSG(unit >= 0, "unit=%d", unit); + + mutex_enter(&sc->sc_parent->dk_openlock); + rw_enter(&dkwedges_lock, RW_WRITER); + KASSERTMSG(unit < ndkwedges, "unit=%d ndkwedges=%u", unit, ndkwedges); + KASSERTMSG(sc == dkwedges[unit], "sc=%p dkwedges[%d]=%p", + sc, unit, dkwedges[unit]); + KASSERTMSG(sc->sc_dev == NULL, "sc=%p sc->sc_dev=%p", sc, sc->sc_dev); + sc->sc_dev = self; + rw_exit(&dkwedges_lock); + mutex_exit(&sc->sc_parent->dk_openlock); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); } -CFDRIVER_DECL(dk, DV_DISK, NULL); -CFATTACH_DECL3_NEW(dk, 0, - dkwedge_match, dkwedge_attach, dkwedge_detach, NULL, NULL, NULL, - DVF_DETACH_SHUTDOWN); - /* * dkwedge_wait_drain: * @@ -249,6 +296,8 @@ dkwedge_array_expand(void) int newcnt = ndkwedges + 
16; struct dkwedge_softc **newarray, **oldarray; + KASSERT(rw_write_held(&dkwedges_lock)); + newarray = malloc(newcnt * sizeof(*newarray), M_DKWEDGE, M_WAITOK|M_ZERO); if ((oldarray = dkwedges) != NULL) @@ -265,6 +314,12 @@ dk_set_geometry(struct dkwedge_softc *sc, struct disk *pdk) struct disk *dk = &sc->sc_dk; struct disk_geom *dg = &dk->dk_geom; + /* + * Caller must hold pdk->dk_openlock to keep sc->sc_dev and + * sc->sc_size stable and to serialize disk_set_info. + */ + KASSERT(mutex_owned(&pdk->dk_openlock)); + memset(dg, 0, sizeof(*dg)); dg->dg_secperunit = sc->sc_size; @@ -294,18 +349,19 @@ dkwedge_add(struct dkwedge_info *dkw) u_int unit; int error; dev_t pdev; + device_t dev; dkw->dkw_parent[sizeof(dkw->dkw_parent) - 1] = '\0'; pdk = disk_find(dkw->dkw_parent); if (pdk == NULL) - return (ENODEV); + return ENODEV; error = dkwedge_compute_pdev(pdk->dk_name, &pdev, VBLK); if (error) - return (error); + return error; if (dkw->dkw_offset < 0) - return (EINVAL); + return EINVAL; /* * Check for an existing wedge at the same disk offset. Allow @@ -323,9 +379,15 @@ dkwedge_add(struct dkwedge_info *dkw) break; if (lsc->sc_size > dkw->dkw_size) break; + if (lsc->sc_dev == NULL) + break; sc = lsc; + device_acquire(sc->sc_dev); + dev = sc->sc_dev; + rw_enter(&dkwedges_lock, RW_WRITER); sc->sc_size = dkw->dkw_size; + rw_exit(&dkwedges_lock); dk_set_geometry(sc, pdk); break; @@ -336,7 +398,6 @@ dkwedge_add(struct dkwedge_info *dkw) goto announce; sc = malloc(sizeof(*sc), M_DKWEDGE, M_WAITOK|M_ZERO); - sc->sc_state = DKW_STATE_LARVAL; sc->sc_parent = pdk; sc->sc_pdev = pdev; sc->sc_offset = dkw->dkw_offset; @@ -398,14 +459,14 @@ dkwedge_add(struct dkwedge_info *dkw) mutex_destroy(&sc->sc_iolock); bufq_free(sc->sc_bufq); free(sc, M_DKWEDGE); - return (error); + return error; } /* Fill in our cfdata for the pseudo-device glue. 
*/ sc->sc_cfdata.cf_name = dk_cd.cd_name; sc->sc_cfdata.cf_atname = dk_ca.ca_name; /* sc->sc_cfdata.cf_unit set below */ - sc->sc_cfdata.cf_fstate = FSTATE_STAR; + sc->sc_cfdata.cf_fstate = FSTATE_NOTFOUND; /* Insert the larval wedge into the array. */ rw_enter(&dkwedges_lock, RW_WRITER); @@ -453,7 +514,7 @@ dkwedge_add(struct dkwedge_info *dkw) mutex_destroy(&sc->sc_iolock); bufq_free(sc->sc_bufq); free(sc, M_DKWEDGE); - return (error); + return error; } /* @@ -464,11 +525,12 @@ dkwedge_add(struct dkwedge_info *dkw) * This should never fail, unless we're almost totally out of * memory. */ - if ((sc->sc_dev = config_attach_pseudo(&sc->sc_cfdata)) == NULL) { + if ((dev = config_attach_pseudo_acquire(&sc->sc_cfdata, sc)) == NULL) { aprint_error("%s%u: unable to attach pseudo-device\n", sc->sc_cfdata.cf_name, sc->sc_cfdata.cf_unit); rw_enter(&dkwedges_lock, RW_WRITER); + KASSERT(dkwedges[sc->sc_cfdata.cf_unit] == sc); dkwedges[sc->sc_cfdata.cf_unit] = NULL; rw_exit(&dkwedges_lock); @@ -481,39 +543,38 @@ dkwedge_add(struct dkwedge_info *dkw) mutex_destroy(&sc->sc_iolock); bufq_free(sc->sc_bufq); free(sc, M_DKWEDGE); - return (ENOMEM); + return ENOMEM; } - /* - * XXX Really ought to make the disk_attach() and the changing - * of state to RUNNING atomic. - */ + KASSERT(dev == sc->sc_dev); disk_init(&sc->sc_dk, device_xname(sc->sc_dev), NULL); + mutex_enter(&pdk->dk_openlock); dk_set_geometry(sc, pdk); + mutex_exit(&pdk->dk_openlock); disk_attach(&sc->sc_dk); /* Disk wedge is ready for use! */ - sc->sc_state = DKW_STATE_RUNNING; + device_set_private(dev, sc); announce: /* Announce our arrival. */ aprint_normal( "%s at %s: \"%s\", %"PRIu64" blocks at %"PRId64", type: %s\n", - device_xname(sc->sc_dev), pdk->dk_name, + device_xname(dev), pdk->dk_name, sc->sc_wname, /* XXX Unicode */ - sc->sc_size, sc->sc_offset, + dkw->dkw_size, sc->sc_offset, sc->sc_ptype[0] == '\0' ? "" : sc->sc_ptype); /* Return the devname to the caller. 
*/ - strlcpy(dkw->dkw_devname, device_xname(sc->sc_dev), - sizeof(dkw->dkw_devname)); + strlcpy(dkw->dkw_devname, device_xname(dev), sizeof(dkw->dkw_devname)); - return (0); + device_release(dev); + return 0; } /* - * dkwedge_find: + * dkwedge_find_acquire: * * Lookup a disk wedge based on the provided information. * NOTE: We look up the wedge based on the wedge devname, @@ -521,10 +582,11 @@ announce: * * Return NULL if the wedge is not found, otherwise return * the wedge's softc. Assign the wedge's unit number to unitp - * if unitp is not NULL. + * if unitp is not NULL. The wedge's sc_dev is referenced and + * must be released by device_release or equivalent. */ static struct dkwedge_softc * -dkwedge_find(struct dkwedge_info *dkw, u_int *unitp) +dkwedge_find_acquire(struct dkwedge_info *dkw, u_int *unitp) { struct dkwedge_softc *sc = NULL; u_int unit; @@ -534,13 +596,15 @@ dkwedge_find(struct dkwedge_info *dkw, u_int *unitp) rw_enter(&dkwedges_lock, RW_READER); for (unit = 0; unit < ndkwedges; unit++) { if ((sc = dkwedges[unit]) != NULL && + sc->sc_dev != NULL && strcmp(device_xname(sc->sc_dev), dkw->dkw_devname) == 0 && strcmp(sc->sc_parent->dk_name, dkw->dkw_parent) == 0) { + device_acquire(sc->sc_dev); break; } } rw_exit(&dkwedges_lock); - if (unit == ndkwedges) + if (sc == NULL) return NULL; if (unitp != NULL) @@ -568,32 +632,10 @@ dkwedge_del1(struct dkwedge_info *dkw, int flags) struct dkwedge_softc *sc = NULL; /* Find our softc. 
*/ - if ((sc = dkwedge_find(dkw, NULL)) == NULL) - return (ESRCH); + if ((sc = dkwedge_find_acquire(dkw, NULL)) == NULL) + return ESRCH; - return config_detach(sc->sc_dev, flags); -} - -static int -dkwedge_cleanup_parent(struct dkwedge_softc *sc, int flags) -{ - struct disk *dk = &sc->sc_dk; - int rc; - - rc = 0; - mutex_enter(&dk->dk_openlock); - if (dk->dk_openmask == 0) { - /* nothing to do */ - } else if ((flags & DETACH_FORCE) == 0) { - rc = EBUSY; - } else { - mutex_enter(&sc->sc_parent->dk_rawlock); - dklastclose(sc); - mutex_exit(&sc->sc_parent->dk_rawlock); - } - mutex_exit(&sc->sc_dk.dk_openlock); - - return rc; + return config_detach_release(sc->sc_dev, flags); } /* @@ -604,26 +646,38 @@ dkwedge_cleanup_parent(struct dkwedge_softc *sc, int flags) static int dkwedge_detach(device_t self, int flags) { - struct dkwedge_softc *sc = NULL; - u_int unit; + struct dkwedge_softc *const sc = device_private(self); + const u_int unit = device_unit(self); int bmaj, cmaj, rc; - rw_enter(&dkwedges_lock, RW_WRITER); - for (unit = 0; unit < ndkwedges; unit++) { - if ((sc = dkwedges[unit]) != NULL && sc->sc_dev == self) - break; - } - if (unit == ndkwedges) - rc = ENXIO; - else if ((rc = dkwedge_cleanup_parent(sc, flags)) == 0) { - /* Mark the wedge as dying. */ - sc->sc_state = DKW_STATE_DYING; + /* + * Fail if we're still open and this isn't forced detach; + * otherwise, mark the wedge as dying to prevent further opens + * from starting, so we can begin draining I/O. 
+	 */
+	mutex_enter(&sc->sc_dk.dk_openlock);
+	if (sc->sc_dk.dk_openmask != 0 && (flags & DETACH_FORCE) == 0) {
+		rc = EBUSY;
+	} else {
+		mutex_enter(&sc->sc_parent->dk_openlock);
+		rw_enter(&dkwedges_lock, RW_WRITER);
+		sc->sc_dev = NULL;
+		rw_exit(&dkwedges_lock);
+		mutex_exit(&sc->sc_parent->dk_openlock);
+		rc = 0;
 	}
-	rw_exit(&dkwedges_lock);
+	mutex_exit(&sc->sc_dk.dk_openlock);
 	if (rc != 0)
 		return rc;
 
+	/*
+	 * Wake pending opens with failure -- those that had already
+	 * begun before we marked the wedge dying.  Detach must not
+	 * fail after this point.
+	 */
+	config_detach_commit(self);
+
 	pmf_device_deregister(self);
 
 	/* Locate the wedge major numbers. */
@@ -631,7 +685,14 @@ dkwedge_detach(device_t self, int flags)
 	cmaj = cdevsw_lookup_major(&dk_cdevsw);
 
 	/* Kill any pending restart. */
-	callout_stop(&sc->sc_restart_ch);
+	mutex_enter(&sc->sc_iolock);
+	sc->sc_iostop = true;
+	mutex_exit(&sc->sc_iolock);
+	callout_halt(&sc->sc_restart_ch, NULL);
+
+	/* Nuke the vnodes for any open instances. */
+	vdevgone(bmaj, unit, unit, VBLK);
+	vdevgone(cmaj, unit, unit, VCHR);
 
 	/*
 	 * dkstart() will kill any queued buffers now that the
@@ -641,15 +702,11 @@ dkwedge_detach(device_t self, int flags)
 	dkstart(sc);
 	dkwedge_wait_drain(sc);
 
-	/* Nuke the vnodes for any open instances. */
-	vdevgone(bmaj, unit, unit, VBLK);
-	vdevgone(cmaj, unit, unit, VCHR);
-
-	/* Clean up the parent. */
-	dkwedge_cleanup_parent(sc, flags | DETACH_FORCE);
+	/* At this point, all instances must be closed. */
+	KASSERT(sc->sc_dk.dk_openmask == 0);
 
 	/* Announce our departure. */
-	aprint_normal("%s at %s (%s) deleted\n", device_xname(sc->sc_dev),
+	aprint_normal("%s at %s (%s) deleted\n", device_xname(self),
 	    sc->sc_parent->dk_name,
 	    sc->sc_wname);	/* XXX Unicode */
 
@@ -667,8 +724,7 @@ dkwedge_detach(device_t self, int flags)
 	/* Poof. */
 	rw_enter(&dkwedges_lock, RW_WRITER);
 	dkwedges[unit] = NULL;
-	sc->sc_state = DKW_STATE_DEAD;
 	rw_exit(&dkwedges_lock);
 
 	mutex_destroy(&sc->sc_iolock);
@@ -694,29 +751,39 @@ dkwedge_delall(struct disk *pdk)
 static void
 dkwedge_delall1(struct disk *pdk, bool idleonly)
 {
-	struct dkwedge_info dkw;
 	struct dkwedge_softc *sc;
 	int flags;
 
 	flags = DETACH_QUIET;
-	if (!idleonly) flags |= DETACH_FORCE;
+	if (!idleonly)
+		flags |= DETACH_FORCE;
 
 	for (;;) {
+		mutex_enter(&pdk->dk_rawlock); /* for sc->sc_dk.dk_openmask */
 		mutex_enter(&pdk->dk_openlock);
 		LIST_FOREACH(sc, &pdk->dk_wedges, sc_plink) {
-			if (!idleonly || sc->sc_dk.dk_openmask == 0)
+			/*
+			 * Wedge is not yet created.  This is a race --
+			 * it may as well have been added just after we
+			 * deleted all the wedges, so pretend it's not
+			 * here yet.
+			 */
+			if (sc->sc_dev == NULL)
+				continue;
+			if (!idleonly || sc->sc_dk.dk_openmask == 0) {
+				device_acquire(sc->sc_dev);
 				break;
+			}
 		}
 		if (sc == NULL) {
 			KASSERT(idleonly || pdk->dk_nwedges == 0);
 			mutex_exit(&pdk->dk_openlock);
+			mutex_exit(&pdk->dk_rawlock);
 			return;
 		}
-		strlcpy(dkw.dkw_parent, pdk->dk_name, sizeof(dkw.dkw_parent));
-		strlcpy(dkw.dkw_devname, device_xname(sc->sc_dev),
-		    sizeof(dkw.dkw_devname));
 		mutex_exit(&pdk->dk_openlock);
-		(void) dkwedge_del1(&dkw, flags);
+		mutex_exit(&pdk->dk_rawlock);
+		(void)config_detach_release(sc->sc_dev, flags);
	}
 }
 
@@ -752,11 +819,11 @@ dkwedge_list(struct disk *pdk, struct dkwedge_list *dkwl, struct lwp *l)
 		if (uio.uio_resid < sizeof(dkw))
 			break;
 
-		if (sc->sc_state != DKW_STATE_RUNNING)
+		if (sc->sc_dev == NULL)
 			continue;
 
 		strlcpy(dkw.dkw_devname, device_xname(sc->sc_dev),
-			sizeof(dkw.dkw_devname));
+		    sizeof(dkw.dkw_devname));
 		memcpy(dkw.dkw_wname, sc->sc_wname, sizeof(dkw.dkw_wname));
 		dkw.dkw_wname[sizeof(dkw.dkw_wname) - 1] = '\0';
 		strlcpy(dkw.dkw_parent, sc->sc_parent->dk_name,
@@ -765,27 +832,37 @@ dkwedge_list(struct disk *pdk, struct dkwedge_list 
*dkwl, struct lwp *l) dkw.dkw_size = sc->sc_size; strlcpy(dkw.dkw_ptype, sc->sc_ptype, sizeof(dkw.dkw_ptype)); + /* + * Acquire a device reference so this wedge doesn't go + * away before our next iteration in LIST_FOREACH, and + * then release the lock for uiomove. + */ + device_acquire(sc->sc_dev); + mutex_exit(&pdk->dk_openlock); error = uiomove(&dkw, sizeof(dkw), &uio); + mutex_enter(&pdk->dk_openlock); + device_release(sc->sc_dev); if (error) break; + dkwl->dkwl_ncopied++; } dkwl->dkwl_nwedges = pdk->dk_nwedges; mutex_exit(&pdk->dk_openlock); - return (error); + return error; } -device_t -dkwedge_find_by_wname(const char *wname) +static device_t +dkwedge_find_by_wname_acquire(const char *wname) { device_t dv = NULL; struct dkwedge_softc *sc; int i; - rw_enter(&dkwedges_lock, RW_WRITER); + rw_enter(&dkwedges_lock, RW_READER); for (i = 0; i < ndkwedges; i++) { - if ((sc = dkwedges[i]) == NULL) + if ((sc = dkwedges[i]) == NULL || sc->sc_dev == NULL) continue; if (strcmp(sc->sc_wname, wname) == 0) { if (dv != NULL) { @@ -795,6 +872,7 @@ dkwedge_find_by_wname(const char *wname) device_xname(sc->sc_dev)); continue; } + device_acquire(sc->sc_dev); dv = sc->sc_dev; } } @@ -802,16 +880,18 @@ dkwedge_find_by_wname(const char *wname) return dv; } -device_t -dkwedge_find_by_parent(const char *name, size_t *i) +static device_t +dkwedge_find_by_parent_acquire(const char *name, size_t *i) { - rw_enter(&dkwedges_lock, RW_WRITER); + + rw_enter(&dkwedges_lock, RW_READER); for (; *i < (size_t)ndkwedges; (*i)++) { struct dkwedge_softc *sc; - if ((sc = dkwedges[*i]) == NULL) + if ((sc = dkwedges[*i]) == NULL || sc->sc_dev == NULL) continue; if (strcmp(sc->sc_parent->dk_name, name) != 0) continue; + device_acquire(sc->sc_dev); rw_exit(&dkwedges_lock); return sc->sc_dev; } @@ -819,15 +899,39 @@ dkwedge_find_by_parent(const char *name, size_t *i) return NULL; } +/* XXX unsafe */ +device_t +dkwedge_find_by_wname(const char *wname) +{ + device_t dv; + + if ((dv = 
dkwedge_find_by_wname_acquire(wname)) == NULL) + return NULL; + device_release(dv); + return dv; +} + +/* XXX unsafe */ +device_t +dkwedge_find_by_parent(const char *name, size_t *i) +{ + device_t dv; + + if ((dv = dkwedge_find_by_parent_acquire(name, i)) == NULL) + return NULL; + device_release(dv); + return dv; +} + void dkwedge_print_wnames(void) { struct dkwedge_softc *sc; int i; - rw_enter(&dkwedges_lock, RW_WRITER); + rw_enter(&dkwedges_lock, RW_READER); for (i = 0; i < ndkwedges; i++) { - if ((sc = dkwedges[i]) == NULL) + if ((sc = dkwedges[i]) == NULL || sc->sc_dev == NULL) continue; printf(" wedge:%s", sc->sc_wname); } @@ -1066,18 +1170,14 @@ dkwedge_read(struct disk *pdk, struct vnode *vp, daddr_t blkno, * dkwedge_lookup: * * Look up a dkwedge_softc based on the provided dev_t. + * + * Caller must guarantee the wedge is referenced. */ static struct dkwedge_softc * dkwedge_lookup(dev_t dev) { - int unit = minor(dev); - - if (unit >= ndkwedges) - return (NULL); - - KASSERT(dkwedges != NULL); - return (dkwedges[unit]); + return device_lookup_private(&dk_cd, minor(dev)); } static int @@ -1136,9 +1236,25 @@ dkopen(dev_t dev, int flags, int fmt, struct lwp *l) int error = 0; if (sc == NULL) - return (ENODEV); - if (sc->sc_state != DKW_STATE_RUNNING) - return (ENXIO); + return ENODEV; + + /* + * Take dkwedges_lock as a reader so we can refuse to open + * devices that have not yet been fully created yet -- they're + * still in the middle of dkwedge_add, between the creation of + * the softc and the creation of the device_t. + * + * Once we determine sc->sc_dev is not null, however, it will + * never transition to being null, because that happens only in + * dkwedge_detach, which waits in vdevgone until concurrent + * dkopen calls have completed. 
+ */ + rw_enter(&dkwedges_lock, RW_READER); + if (sc->sc_dev == NULL) + error = ENXIO; + rw_exit(&dkwedges_lock); + if (error) + return error; /* * We go through a complicated little dance to only open the parent @@ -1196,6 +1312,7 @@ dkfirstopen(struct dkwedge_softc *sc, int flags) } if (error) return error; + KASSERT(vp != NULL); sc->sc_parent->dk_rawvp = vp; } else { /* @@ -1207,6 +1324,7 @@ dkfirstopen(struct dkwedge_softc *sc, int flags) * dk_rawopens is unsigned, this can't overflow. */ KASSERT(sc->sc_parent->dk_rawopens < UINT_MAX); + KASSERT(sc->sc_parent->dk_rawvp != NULL); mode = 0; LIST_FOREACH(nsc, &sc->sc_parent->dk_wedges, sc_plink) { if (nsc == sc || nsc->sc_dk.dk_openmask == 0) @@ -1227,6 +1345,8 @@ dklastclose(struct dkwedge_softc *sc) KASSERT(mutex_owned(&sc->sc_dk.dk_openlock)); KASSERT(mutex_owned(&sc->sc_parent->dk_rawlock)); + KASSERT(sc->sc_parent->dk_rawopens > 0); + KASSERT(sc->sc_parent->dk_rawvp != NULL); if (--sc->sc_parent->dk_rawopens == 0) { struct vnode *const vp = sc->sc_parent->dk_rawvp; @@ -1248,12 +1368,24 @@ static int dkclose(dev_t dev, int flags, int fmt, struct lwp *l) { struct dkwedge_softc *sc = dkwedge_lookup(dev); + int error = 0; if (sc == NULL) - return ENODEV; - if (sc->sc_state != DKW_STATE_RUNNING) return ENXIO; + /* + * dkclose can be called even if dkopen didn't succeed, so we + * have to handle the same possibility that the wedge was not + * yet created. It still won't be detached at this point until + * after dkclose returns. + */ + rw_enter(&dkwedges_lock, RW_READER); + if (sc->sc_dev == NULL) + error = ENXIO; + rw_exit(&dkwedges_lock); + if (error) + return error; + mutex_enter(&sc->sc_dk.dk_openlock); mutex_enter(&sc->sc_parent->dk_rawlock); @@ -1277,7 +1409,28 @@ dkclose(dev_t dev, int flags, int fmt, struct lwp *l) } /* - * dkstragegy: [devsw entry point] + * dkcancel: [devsw entry point] + * + * Cancel any pending I/O operations waiting on a wedge. 
+ */ +static int +dkcancel(dev_t dev, int flags, int fmt, struct lwp *l) +{ + struct dkwedge_softc *sc = dkwedge_lookup(dev); + + KASSERT(sc != NULL); + KASSERT(sc->sc_dev != NULL); + + /* + * No dk(4) I/O operations sleep, so nothing to do -- I/O is + * asynchronous. + */ + + return 0; +} + +/* + * dkstrategy: [devsw entry point] * * Perform I/O based on the wedge I/O strategy. */ @@ -1292,18 +1445,14 @@ dkstrategy(struct buf *bp) goto done; } - if (sc->sc_state != DKW_STATE_RUNNING || - sc->sc_parent->dk_rawvp == NULL) { - bp->b_error = ENXIO; - goto done; - } - /* If it's an empty transfer, wake up the top half now. */ if (bp->b_bcount == 0) goto done; p_offset = sc->sc_offset << sc->sc_parent->dk_blkshift; + mutex_enter(&sc->sc_parent->dk_openlock); /* XXX atomic/seqlock */ p_size = sc->sc_size << sc->sc_parent->dk_blkshift; + mutex_exit(&sc->sc_parent->dk_openlock); /* Make sure it's in-range. */ if (bounds_check_with_mediasize(bp, DEV_BSIZE, p_size) <= 0) @@ -1342,7 +1491,7 @@ dkstart(struct dkwedge_softc *sc) /* Do as much work as has been enqueued. */ while ((bp = bufq_peek(sc->sc_bufq)) != NULL) { - if (sc->sc_state != DKW_STATE_RUNNING) { + if (sc->sc_iostop) { (void) bufq_get(sc->sc_bufq); if (--sc->sc_iopend == 0) cv_broadcast(&sc->sc_dkdrn); @@ -1364,7 +1513,8 @@ dkstart(struct dkwedge_softc *sc) * buffer queued up, and schedule a timer to * restart the queue in 1/2 a second. 
*/ - callout_schedule(&sc->sc_restart_ch, hz / 2); + if (!sc->sc_iostop) + callout_schedule(&sc->sc_restart_ch, hz / 2); break; } @@ -1485,14 +1635,8 @@ dkminphys(struct buf *bp) static int dkread(dev_t dev, struct uio *uio, int flags) { - struct dkwedge_softc *sc = dkwedge_lookup(dev); - - if (sc == NULL) - return (ENODEV); - if (sc->sc_state != DKW_STATE_RUNNING) - return (ENXIO); - return (physio(dkstrategy, NULL, dev, B_READ, dkminphys, uio)); + return physio(dkstrategy, NULL, dev, B_READ, dkminphys, uio); } /* @@ -1503,14 +1647,8 @@ dkread(dev_t dev, struct uio *uio, int flags) static int dkwrite(dev_t dev, struct uio *uio, int flags) { - struct dkwedge_softc *sc = dkwedge_lookup(dev); - if (sc == NULL) - return (ENODEV); - if (sc->sc_state != DKW_STATE_RUNNING) - return (ENXIO); - - return (physio(dkstrategy, NULL, dev, B_WRITE, dkminphys, uio)); + return physio(dkstrategy, NULL, dev, B_WRITE, dkminphys, uio); } /* @@ -1524,20 +1662,13 @@ dkioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) struct dkwedge_softc *sc = dkwedge_lookup(dev); int error = 0; - if (sc == NULL) - return (ENODEV); - if (sc->sc_state != DKW_STATE_RUNNING) - return (ENXIO); - if (sc->sc_parent->dk_rawvp == NULL) - return (ENXIO); - /* * We pass NODEV instead of our device to indicate we don't * want to handle disklabel ioctls */ error = disk_ioctl(&sc->sc_dk, NODEV, cmd, data, flag, l); if (error != EPASSTHROUGH) - return (error); + return error; error = 0; @@ -1559,7 +1690,9 @@ dkioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) strlcpy(dkw->dkw_parent, sc->sc_parent->dk_name, sizeof(dkw->dkw_parent)); dkw->dkw_offset = sc->sc_offset; + mutex_enter(&sc->sc_parent->dk_openlock); dkw->dkw_size = sc->sc_size; + mutex_exit(&sc->sc_parent->dk_openlock); strlcpy(dkw->dkw_ptype, sc->sc_ptype, sizeof(dkw->dkw_ptype)); break; @@ -1586,7 +1719,7 @@ dkioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) error = ENOTTY; } - return (error); + return 
error; } /* @@ -1598,30 +1731,31 @@ static int dkdiscard(dev_t dev, off_t pos, off_t len) { struct dkwedge_softc *sc = dkwedge_lookup(dev); + uint64_t size; unsigned shift; off_t offset, maxlen; int error; - if (sc == NULL) - return (ENODEV); - if (sc->sc_state != DKW_STATE_RUNNING) - return (ENXIO); - if (sc->sc_parent->dk_rawvp == NULL) - return (ENXIO); + mutex_enter(&sc->sc_parent->dk_openlock); + size = sc->sc_size; + mutex_exit(&sc->sc_parent->dk_openlock); + /* + * XXX Assert all this earlier, on construction. + */ shift = (sc->sc_parent->dk_blkshift + DEV_BSHIFT); - KASSERT(__type_fit(off_t, sc->sc_size)); + KASSERT(__type_fit(off_t, size)); KASSERT(__type_fit(off_t, sc->sc_offset)); KASSERT(0 <= sc->sc_offset); - KASSERT(sc->sc_size <= (__type_max(off_t) >> shift)); - KASSERT(sc->sc_offset <= ((__type_max(off_t) >> shift) - sc->sc_size)); + KASSERT(size <= (__type_max(off_t) >> shift)); + KASSERT(sc->sc_offset <= ((__type_max(off_t) >> shift) - size)); offset = ((off_t)sc->sc_offset << shift); - maxlen = ((off_t)sc->sc_size << shift); + maxlen = ((off_t)size << shift); if (len > maxlen) - return (EINVAL); + return EINVAL; if (pos > (maxlen - len)) - return (EINVAL); + return EINVAL; pos += offset; @@ -1641,17 +1775,21 @@ dkdiscard(dev_t dev, off_t pos, off_t len) static int dksize(dev_t dev) { + /* + * Don't bother taking a reference because this is only used + * either (a) while the device is open (for swap), or (b) while + * any multiprocessing is quiescent (for crash dumps). + */ struct dkwedge_softc *sc = dkwedge_lookup(dev); uint64_t p_size; int rv = -1; if (sc == NULL) - return (-1); - if (sc->sc_state != DKW_STATE_RUNNING) - return (-1); + return -1; + if (sc->sc_dev == NULL) + return -1; - mutex_enter(&sc->sc_dk.dk_openlock); - mutex_enter(&sc->sc_parent->dk_rawlock); + mutex_enter(&sc->sc_parent->dk_openlock); /* for sc_size */ /* Our content type is static, no need to open the device. 
*/ @@ -1664,10 +1802,9 @@ dksize(dev_t dev) rv = (int) p_size; } - mutex_exit(&sc->sc_parent->dk_rawlock); - mutex_exit(&sc->sc_dk.dk_openlock); + mutex_exit(&sc->sc_parent->dk_openlock); - return (rv); + return rv; } /* @@ -1678,18 +1815,24 @@ dksize(dev_t dev) static int dkdump(dev_t dev, daddr_t blkno, void *va, size_t size) { + /* + * Don't bother taking a reference because this is only used + * while any multiprocessing is quiescent. + */ struct dkwedge_softc *sc = dkwedge_lookup(dev); const struct bdevsw *bdev; uint64_t p_size, p_offset; int rv = 0; if (sc == NULL) - return (ENODEV); - if (sc->sc_state != DKW_STATE_RUNNING) - return (ENXIO); + return ENODEV; - mutex_enter(&sc->sc_dk.dk_openlock); - mutex_enter(&sc->sc_parent->dk_rawlock); + mutex_enter(&sc->sc_parent->dk_openlock); /* for sc_size */ + + if (sc->sc_dev == NULL) { + rv = ENXIO; + goto out; + } /* Our content type is static, no need to open the device. */ @@ -1719,8 +1862,7 @@ dkdump(dev_t dev, daddr_t blkno, void *va, size_t size) rv = (*bdev->d_dump)(sc->sc_pdev, blkno + p_offset, va, size); out: - mutex_exit(&sc->sc_parent->dk_rawlock); - mutex_exit(&sc->sc_dk.dk_openlock); + mutex_exit(&sc->sc_parent->dk_openlock); return rv; } @@ -1735,8 +1877,9 @@ out: * Find wedge corresponding to the specified parent name * and offset/length. 
*/ -device_t -dkwedge_find_partition(device_t parent, daddr_t startblk, uint64_t nblks) +static device_t +dkwedge_find_partition_acquire(device_t parent, daddr_t startblk, + uint64_t nblks) { struct dkwedge_softc *sc; int i; @@ -1744,11 +1887,11 @@ dkwedge_find_partition(device_t parent, daddr_t startblk, uint64_t nblks) rw_enter(&dkwedges_lock, RW_READER); for (i = 0; i < ndkwedges; i++) { - if ((sc = dkwedges[i]) == NULL) + if ((sc = dkwedges[i]) == NULL || sc->sc_dev == NULL) continue; if (strcmp(sc->sc_parent->dk_name, device_xname(parent)) == 0 && sc->sc_offset == startblk && - sc->sc_size == nblks) { + sc->sc_size >= nblks) { if (wedge) { printf("WARNING: double match for boot wedge " "(%s, %s)\n", @@ -1757,6 +1900,7 @@ dkwedge_find_partition(device_t parent, daddr_t startblk, uint64_t nblks) continue; } wedge = sc->sc_dev; + device_acquire(wedge); } } rw_exit(&dkwedges_lock); @@ -1764,6 +1908,21 @@ dkwedge_find_partition(device_t parent, daddr_t startblk, uint64_t nblks) return wedge; } +/* XXX unsafe */ +device_t +dkwedge_find_partition(device_t parent, daddr_t startblk, + uint64_t nblks) +{ + device_t dv; + + if ((dv = dkwedge_find_partition_acquire(parent, startblk, nblks)) + == NULL) + return NULL; + device_release(dv); + return dv; +} + +/* XXX unsafe */ const char * dkwedge_get_parent_name(dev_t dev) { diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index df7f0e1b0e62..f8b62d0c83b7 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -1259,7 +1259,7 @@ static const char * const msgs[] = { * not configured, call the given `print' function and return NULL. 
*/ device_t -config_found(device_t parent, void *aux, cfprint_t print, +config_found_acquire(device_t parent, void *aux, cfprint_t print, const struct cfargs * const cfargs) { cfdata_t cf; @@ -1286,6 +1286,29 @@ config_found(device_t parent, void *aux, cfprint_t print, return NULL; } +/* + * config_found(parent, aux, print, cfargs) + * + * Legacy entry point for callers whose use of the returned + * device_t is not delimited by device_release. This is + * inherently racy -- either they must ignore the return value, or + * they must be converted to config_found_acquire with a matching + * device_release. + */ +device_t +config_found(device_t parent, void *aux, cfprint_t print, + const struct cfargs * const cfargs) +{ + device_t dev; + + dev = config_found_acquire(parent, aux, print, cfargs); + if (dev == NULL) + return NULL; + device_release(dev); + + return dev; +} + /* * As above, but for root devices. */ @@ -1707,6 +1730,8 @@ config_add_attrib_dict(device_t dev) /* * Attach a found device. + * + * Returns the device referenced, to be released with device_release. */ static device_t config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print, @@ -1777,6 +1802,12 @@ config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print, */ config_pending_incr(dev); + /* + * Prevent concurrent detach from destroying the device_t until + * the caller has released the device. + */ + device_acquire(dev); + /* Call the driver's attach function. 
*/ (*dev->dv_cfattach->ca_attach)(parent, dev, aux); @@ -1812,7 +1843,7 @@ config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print, } device_t -config_attach(device_t parent, cfdata_t cf, void *aux, cfprint_t print, +config_attach_acquire(device_t parent, cfdata_t cf, void *aux, cfprint_t print, const struct cfargs *cfargs) { struct cfargs_internal store; @@ -1823,6 +1854,29 @@ config_attach(device_t parent, cfdata_t cf, void *aux, cfprint_t print, cfargs_canonicalize(cfargs, &store)); } +/* + * config_attach(parent, cf, aux, print, cfargs) + * + * Legacy entry point for callers whose use of the returned + * device_t is not delimited by device_release. This is + * inherently racy -- either they must ignore the return value, or + * they must be converted to config_attach_acquire with a matching + * device_release. + */ +device_t +config_attach(device_t parent, cfdata_t cf, void *aux, cfprint_t print, + const struct cfargs *cfargs) +{ + device_t dev; + + dev = config_attach_acquire(parent, cf, aux, print, cfargs); + if (dev == NULL) + return NULL; + device_release(dev); + + return dev; +} + /* * As above, but for pseudo-devices. Pseudo-devices attached in this * way are silently inserted into the device tree, and their children @@ -1833,7 +1887,7 @@ config_attach(device_t parent, cfdata_t cf, void *aux, cfprint_t print, * name by the attach routine. */ device_t -config_attach_pseudo(cfdata_t cf) +config_attach_pseudo_acquire(cfdata_t cf, void *aux) { device_t dev; @@ -1866,8 +1920,14 @@ config_attach_pseudo(cfdata_t cf) */ config_pending_incr(dev); + /* + * Prevent concurrent detach from destroying the device_t until + * the caller has released the device. + */ + device_acquire(dev); + /* Call the driver's attach function. 
*/ - (*dev->dv_cfattach->ca_attach)(ROOT, dev, NULL); + (*dev->dv_cfattach->ca_attach)(ROOT, dev, aux); /* * Allow other threads to acquire references to the device now @@ -1891,6 +1951,28 @@ out: KERNEL_UNLOCK_ONE(NULL); return dev; } +/* + * config_attach_pseudo(cf) + * + * Legacy entry point for callers whose use of the returned + * device_t is not delimited by device_release. This is + * inherently racy -- either they must ignore the return value, or + * they must be converted to config_attach_pseudo_acquire with a + * matching device_release. + */ +device_t +config_attach_pseudo(cfdata_t cf) +{ + device_t dev; + + dev = config_attach_pseudo_acquire(cf, NULL); + if (dev == NULL) + return dev; + device_release(dev); + + return dev; +} + /* * Caller must hold alldevs_lock. */ @@ -1999,9 +2081,12 @@ config_detach_exit(device_t dev) * Note that this code wants to be run from a process context, so * that the detach can sleep to allow processes which have a device * open to run and unwind their stacks. + * + * Caller must hold a reference with device_acquire or + * device_lookup_acquire. */ int -config_detach(device_t dev, int flags) +config_detach_release(device_t dev, int flags) { struct alldevs_foray af; struct cftable *ct; @@ -2030,6 +2115,7 @@ config_detach(device_t dev, int flags) * attached. */ rv = config_detach_enter(dev); + device_release(dev); if (rv) { KERNEL_UNLOCK_ONE(NULL); return rv; @@ -2184,6 +2270,22 @@ out: return rv; } +/* + * config_detach(dev, flags) + * + * Legacy entry point for callers that have not acquired a + * reference to dev. This is inherently racy -- you must convert + * such callers to acquire a reference and then use + * config_detach_release instead. 
+ */ +int +config_detach(device_t dev, int flags) +{ + + device_acquire(dev); + return config_detach_release(dev, flags); +} + /* * config_detach_commit(dev) * @@ -2739,7 +2841,7 @@ retry: if (unit < 0 || unit >= cd->cd_ndevs || mutex_enter(&alldevs_lock); goto retry; } - localcount_acquire(dv->dv_localcount); + device_acquire(dv); } mutex_exit(&alldevs_lock); mutex_exit(&config_misc_lock); @@ -2747,10 +2849,31 @@ retry: if (unit < 0 || unit >= cd->cd_ndevs || return dv; } +/* + * device_acquire: + * + * Acquire a reference to a device. It is the caller's + * responsibility to ensure that the device's .ca_detach routine + * cannot return before calling this. Caller must release the + * reference with device_release or config_detach_release. + */ +void +device_acquire(device_t dv) +{ + + /* + * No lock because the caller has promised that this can't + * change concurrently with device_acquire. + */ + KASSERTMSG(!dv->dv_detach_done, "%s", + dv == NULL ? "(null)" : device_xname(dv)); + localcount_acquire(dv->dv_localcount); +} + /* * device_release: * - * Release a reference to a device acquired with + * Release a reference to a device acquired with device_acquire or * device_lookup_acquire. */ void diff --git a/sys/kern/subr_device.c b/sys/kern/subr_device.c index b00538790be2..073620ce1289 100644 --- a/sys/kern/subr_device.c +++ b/sys/kern/subr_device.c @@ -30,6 +30,8 @@ __KERNEL_RCSID(0, "$NetBSD: subr_device.c,v 1.13 2022/03/28 12:38:59 riastradh Exp $"); #include + +#include #include #include #include @@ -276,7 +278,7 @@ device_private(device_t dev) * It avoids having them test for it to be NULL and only then calling * device_private. */ - return dev == NULL ? NULL : dev->dv_private; + return dev == NULL ? 
NULL : atomic_load_consume(&dev->dv_private); } void @@ -287,7 +289,7 @@ device_set_private(device_t dev, void *private) " device %s already has private set to %p", dev, private, device_xname(dev), device_private(dev)); KASSERT(private != NULL); - dev->dv_private = private; + atomic_store_release(&dev->dv_private, private); } prop_dictionary_t diff --git a/sys/sys/device.h b/sys/sys/device.h index 4f47e063d0a6..91d994a9ff88 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -554,14 +554,20 @@ device_t config_found(device_t, void *, cfprint_t, const struct cfargs *); device_t config_rootfound(const char *, void *); device_t config_attach(device_t, cfdata_t, void *, cfprint_t, const struct cfargs *); +device_t config_found_acquire(device_t, void *, cfprint_t, + const struct cfargs *); +device_t config_attach_acquire(device_t, cfdata_t, void *, cfprint_t, + const struct cfargs *); int config_match(device_t, cfdata_t, void *); int config_probe(device_t, cfdata_t, void *); bool ifattr_match(const char *, const char *); device_t config_attach_pseudo(cfdata_t); +device_t config_attach_pseudo_acquire(cfdata_t, void *); int config_detach(device_t, int); +int config_detach_release(device_t, int); int config_detach_children(device_t, int flags); void config_detach_commit(device_t); bool config_detach_all(int); @@ -588,6 +594,7 @@ device_t device_lookup(cfdriver_t, int); void *device_lookup_private(cfdriver_t, int); device_t device_lookup_acquire(cfdriver_t, int); +void device_acquire(device_t); void device_release(device_t); void device_register(device_t, void *);