author    Bryan Cantrill <bryan@joyent.com>  2018-06-25 17:39:30 +0000
committer Bryan Cantrill <bryan@joyent.com>  2018-10-04 18:08:59 +0000
commit    1f946b3740393b0dc458e4d386c40f40dd9b8465 (patch)
tree      057418ad4c04fcd713db59c54863827d90388c85
parent    ee4c7a44cc40ef09d84ad16a742b124f3122fea2 (diff)
OS-3356 UFS freebehind can induce hat_pageunload()/htable_purge_hat() race
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Patrick Mooney <patrick.mooney@joyent.com>
-rw-r--r--  usr/src/uts/common/fs/hsfs/hsfs_vnops.c   48
-rw-r--r--  usr/src/uts/common/fs/ufs/ufs_vnops.c    107
2 files changed, 1 insertion(+), 154 deletions(-)
diff --git a/usr/src/uts/common/fs/hsfs/hsfs_vnops.c b/usr/src/uts/common/fs/hsfs/hsfs_vnops.c
index 09792a3ac6..4d93b897ca 100644
--- a/usr/src/uts/common/fs/hsfs/hsfs_vnops.c
+++ b/usr/src/uts/common/fs/hsfs/hsfs_vnops.c
@@ -133,24 +133,6 @@ struct kmem_cache *hio_info_cache;
*/
extern int use_rrip_inodes;
-/*
- * Free behind logic from UFS to tame our thirst for
- * the page cache.
- * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
- * explanation.
- */
-static int freebehind = 1;
-static int smallfile = 0;
-static int cache_read_ahead = 0;
-static u_offset_t smallfile64 = 32 * 1024;
-#define SMALLFILE1_D 1000
-#define SMALLFILE2_D 10
-static u_offset_t smallfile1 = 32 * 1024;
-static u_offset_t smallfile2 = 32 * 1024;
-static clock_t smallfile_update = 0; /* when to recompute */
-static uint_t smallfile1_d = SMALLFILE1_D;
-static uint_t smallfile2_d = SMALLFILE2_D;
-
static int hsched_deadline_compare(const void *x1, const void *x2);
static int hsched_offset_compare(const void *x1, const void *x2);
static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
@@ -174,7 +156,6 @@ hsfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
int error;
struct hsnode *hp;
uint_t filesize;
- int dofree;
hp = VTOH(vp);
/*
@@ -241,28 +222,6 @@ hsfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
return (0);
}
- /*
- * Freebehind computation taken from:
- * usr/src/uts/common/fs/ufs/ufs_vnops.c
- */
- if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
- uint64_t percpufreeb;
- if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
- if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
- percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
- smallfile1 = percpufreeb / smallfile1_d;
- smallfile2 = percpufreeb / smallfile2_d;
- smallfile1 = MAX(smallfile1, smallfile);
- smallfile1 = MAX(smallfile1, smallfile64);
- smallfile2 = MAX(smallfile1, smallfile2);
- smallfile_update = drv_hztousec(ddi_get_lbolt())
- + 1000000;
- }
-
- dofree = freebehind &&
- hp->hs_prev_offset == uiop->uio_loffset &&
- hp->hs_ra_bytes > 0;
-
base = segmap_getmapflt(segkmap, vp,
(u_offset_t)uiop->uio_loffset, n, 1, S_READ);
@@ -279,13 +238,6 @@ hsfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
else
flags = 0;
- if (dofree) {
- flags = SM_FREE | SM_ASYNC;
- if ((cache_read_ahead == 0) &&
- uiop->uio_loffset > smallfile2)
- flags |= SM_DONTNEED;
- }
-
error = segmap_release(segkmap, base, flags);
} else
(void) segmap_release(segkmap, base, 0);
diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c
index 10039536eb..370c982f08 100644
--- a/usr/src/uts/common/fs/ufs/ufs_vnops.c
+++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c
@@ -658,76 +658,6 @@ out:
int stickyhack = 1;
/*
- * Free behind hacks. The pager is busted.
- * XXX - need to pass the information down to writedone() in a flag like B_SEQ
- * or B_FREE_IF_TIGHT_ON_MEMORY.
- */
-int freebehind = 1;
-int smallfile = 0;
-u_offset_t smallfile64 = 32 * 1024;
-
-/*
- * While we should, in most cases, cache the pages for write, we
- * may also want to cache the pages for read as long as they are
- * frequently re-usable.
- *
- * If cache_read_ahead = 1, the pages for read will go to the tail
- * of the cache list when they are released, otherwise go to the head.
- */
-int cache_read_ahead = 0;
-
-/*
- * Freebehind exists so that as we read large files sequentially we
- * don't consume most of memory with pages from a few files. It takes
- * longer to re-read from disk multiple small files as it does reading
- * one large one sequentially. As system memory grows customers need
- * to retain bigger chunks of files in memory. The advent of the
- * cachelist opens up of the possibility freeing pages to the head or
- * tail of the list.
- *
- * Not freeing a page is a bet that the page will be read again before
- * it's segmap slot is needed for something else. If we loose the bet,
- * it means some other thread is burdened with the page free we did
- * not do. If we win we save a free and reclaim.
- *
- * Freeing it at the tail vs the head of cachelist is a bet that the
- * page will survive until the next read. It's also saying that this
- * page is more likely to be re-used than a page freed some time ago
- * and never reclaimed.
- *
- * Freebehind maintains a range of file offset [smallfile1; smallfile2]
- *
- * 0 < offset < smallfile1 : pages are not freed.
- * smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
- * smallfile2 < offset : pages freed to head of cachelist.
- *
- * The range is computed at most once per second and depends on
- * freemem and ncpus_online. Both parameters are bounded to be
- * >= smallfile && >= smallfile64.
- *
- * smallfile1 = (free memory / ncpu) / 1000
- * smallfile2 = (free memory / ncpu) / 10
- *
- * A few examples values:
- *
- * Free Mem (in Bytes) [smallfile1; smallfile2] [smallfile1; smallfile2]
- * ncpus_online = 4 ncpus_online = 64
- * ------------------ ----------------------- -----------------------
- * 1G [256K; 25M] [32K; 1.5M]
- * 10G [2.5M; 250M] [156K; 15M]
- * 100G [25M; 2.5G] [1.5M; 150M]
- *
- */
-
-#define SMALLFILE1_D 1000
-#define SMALLFILE2_D 10
-static u_offset_t smallfile1 = 32 * 1024;
-static u_offset_t smallfile2 = 32 * 1024;
-static clock_t smallfile_update = 0; /* lbolt value of when to recompute */
-uint_t smallfile1_d = SMALLFILE1_D;
-uint_t smallfile2_d = SMALLFILE2_D;
-
-/*
* wrip does the real work of write requests for ufs.
*/
int
@@ -1350,10 +1280,9 @@ rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
int error = 0;
int doupdate = 1;
uint_t flags;
- int dofree, directio_status;
+ int directio_status;
krw_t rwtype;
o_mode_t type;
- clock_t now;
vp = ITOV(ip);
@@ -1420,26 +1349,6 @@ rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
n = (int)diff;
/*
- * We update smallfile2 and smallfile1 at most every second.
- */
- now = ddi_get_lbolt();
- if (now >= smallfile_update) {
- uint64_t percpufreeb;
- if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
- if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
- percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
- smallfile1 = percpufreeb / smallfile1_d;
- smallfile2 = percpufreeb / smallfile2_d;
- smallfile1 = MAX(smallfile1, smallfile);
- smallfile1 = MAX(smallfile1, smallfile64);
- smallfile2 = MAX(smallfile1, smallfile2);
- smallfile_update = now + hz;
- }
-
- dofree = freebehind &&
- ip->i_nextr == (off & PAGEMASK) && off > smallfile1;
-
- /*
* At this point we can enter ufs_getpage() in one of two
* ways:
* 1) segmap_getmapflt() calls ufs_getpage() when the
@@ -1470,19 +1379,6 @@ rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
flags = 0;
if (!error) {
/*
- * If reading sequential we won't need this
- * buffer again soon. For offsets in range
- * [smallfile1, smallfile2] release the pages
- * at the tail of the cache list, larger
- * offsets are released at the head.
- */
- if (dofree) {
- flags = SM_FREE | SM_ASYNC;
- if ((cache_read_ahead == 0) &&
- (off > smallfile2))
- flags |= SM_DONTNEED;
- }
- /*
* In POSIX SYNC (FSYNC and FDSYNC) read mode,
* we want to make sure that the page which has
* been read, is written on disk if it is dirty.
@@ -1490,7 +1386,6 @@ rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
* be flushed out.
*/
if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
- flags &= ~SM_ASYNC;
flags |= SM_WRITE;
}
if (vpm_enable) {
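
Aside (not part of the commit): the comment deleted from ufs_vnops.c above documents how the free-behind thresholds were derived (smallfile1 = (freemem / ncpus_online) / 1000, smallfile2 = (freemem / ncpus_online) / 10, both bounded below by 32K) and how a read offset mapped to cachelist behavior. The standalone userland sketch below reproduces that arithmetic for the sample values in the deleted table; the program, its file name, its sample inputs, and the helper names are illustrative assumptions, not code from this change.

/*
 * freebehind_demo.c: standalone illustration (an assumption, not part of
 * the commit) of the threshold arithmetic in the deleted ufs_vnops.c code.
 *
 *   smallfile1 = (freemem bytes / ncpus_online) / 1000, floored at 32K
 *   smallfile2 = (freemem bytes / ncpus_online) / 10, floored at smallfile1
 *
 *   offset <= smallfile1              : pages are not freed
 *   smallfile1 < offset <= smallfile2 : freed to the tail of the cachelist
 *   offset > smallfile2               : freed to the head of the cachelist
 */
#include <stdio.h>
#include <stdint.h>

#define	SMALLFILE64	(32ULL * 1024)	/* lower bound, as in the old code */
#define	SMALLFILE1_D	1000
#define	SMALLFILE2_D	10

static uint64_t
max64(uint64_t a, uint64_t b)
{
	return (a > b ? a : b);
}

/* How the deleted rdip()/hsfs_read() code treated pages at a given offset. */
static const char *
freebehind_action(uint64_t off, uint64_t sf1, uint64_t sf2)
{
	if (off <= sf1)
		return ("keep pages");
	if (off <= sf2)
		return ("SM_FREE|SM_ASYNC (tail of cachelist)");
	return ("SM_FREE|SM_ASYNC|SM_DONTNEED (head of cachelist)");
}

int
main(void)
{
	/* Sample values from the table in the deleted comment. */
	uint64_t freemem_bytes[] = { 1ULL << 30, 10ULL << 30, 100ULL << 30 };
	int ncpus[] = { 4, 64 };

	for (int i = 0; i < 3; i++) {
		for (int j = 0; j < 2; j++) {
			uint64_t percpufreeb = freemem_bytes[i] / ncpus[j];
			uint64_t sf1 = max64(percpufreeb / SMALLFILE1_D,
			    SMALLFILE64);
			uint64_t sf2 = max64(percpufreeb / SMALLFILE2_D, sf1);

			printf("freemem %3lluG ncpus %2d: "
			    "smallfile1=%lluK smallfile2=%lluK; "
			    "offset 64M -> %s\n",
			    (unsigned long long)(freemem_bytes[i] >> 30),
			    ncpus[j],
			    (unsigned long long)(sf1 >> 10),
			    (unsigned long long)(sf2 >> 10),
			    freebehind_action(64ULL << 20, sf1, sf2));
		}
	}
	return (0);
}

Compiled with "cc freebehind_demo.c", the printed threshold pairs approximate the [smallfile1; smallfile2] columns of the deleted table (for example, roughly [256K; 25M] with 1G free and 4 CPUs online).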