summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/nfs/rnode.h
blob: a9433b9496366fe686d00c9cb9dcee0ac9feb69f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

#ifndef	_NFS_RNODE_H
#define	_NFS_RNODE_H

#include <sys/avl.h>
#include <sys/list.h>
#include <nfs/nfs.h>

#ifdef	__cplusplus
extern "C" {
#endif

typedef enum nfs_access_type {
	NFS_ACCESS_UNKNOWN,
	NFS_ACCESS_ALLOWED,
	NFS_ACCESS_DENIED
} nfs_access_type_t;

typedef struct acache_hash {
	struct acache *next;	/* next and prev must be first */
	struct acache *prev;
	krwlock_t lock;
} acache_hash_t;

typedef struct acache {
	struct acache *next;	/* next and prev must be first */
	struct acache *prev;
	uint32_t known;
	uint32_t allowed;
	struct rnode *rnode;
	cred_t *cred;
	struct acache *list;
	struct acache_hash *hashq;
} acache_t;

#define	NFS_FHANDLE_LEN	72

typedef struct nfs_fhandle {
	int fh_len;
	char fh_buf[NFS_FHANDLE_LEN];
} nfs_fhandle;

typedef struct rddir_cache {
	lloff_t _cookie;	/* cookie used to find this cache entry */
	lloff_t _ncookie;	/* cookie used to find the next cache entry */
	char *entries;		/* buffer containing dirent entries */
	int eof;		/* EOF reached after this request */
	int entlen;		/* size of dirent entries in buf */
	int buflen;		/* size of the buffer used to store entries */
	int flags;		/* control flags, see below */
	kcondvar_t cv;		/* cv for blocking */
	int error;		/* error from RPC operation */
	kmutex_t lock;
	uint_t count;		/* reference count */
	avl_node_t tree;	/* AVL tree links */
} rddir_cache;

#define	nfs_cookie	_cookie._p._l
#define	nfs_ncookie	_ncookie._p._l
#define	nfs3_cookie	_cookie._f
#define	nfs3_ncookie	_ncookie._f

#define	RDDIR		0x1	/* readdir operation in progress */
#define	RDDIRWAIT	0x2	/* waiting on readdir in progress */
#define	RDDIRREQ	0x4	/* a new readdir is required */
#define	RDDIRCACHED	0x8	/* entry is in the cache */

#define	HAVE_RDDIR_CACHE(rp)	(avl_numnodes(&(rp)->r_dir) > 0)

typedef struct symlink_cache {
	char *contents;		/* contents of the symbolic link */
	int len;		/* length of the contents */
	int size;		/* size of the allocated buffer */
} symlink_cache;

typedef struct commit {
	page_t *c_pages;	/* list of pages to commit */
	offset3 c_commbase;	/* base offset to do commit from */
	count3 c_commlen;	/* len to commit */
	kcondvar_t c_cv;	/* condvar for waiting for commit */
} commit_t;

/*
 * The various values for the commit states.  These are stored in
 * the p_fsdata byte in the page struct.
 * NFSv3,4 can use asynchronous writes - the NFS server can send a response
 * before storing the data to the stable store (disk). The response contains
 * information if the data are on a disk or not. NFS client marks pages
 * which are already on the stable store as C_NOCOMMIT. The pages which were
 * sent but are not yet on the stable store are only partially 'safe' and are
 * marked as C_DELAYCOMMIT, which can be later changed to C_COMMIT if the
 * commit operation is in progress. If the NFS server is e.g. rebooted, the
 * client needs to resend all the uncommitted data. The client walks all the
 * vp->v_pages and if C_DELAYCOMMIT or C_COMMIT is set, the page is marked as
 * dirty and thus will be written to the server again.
 */
#define	C_NOCOMMIT	0	/* no commit is required */
#define	C_COMMIT	1	/* a commit is required so do it now */
#define	C_DELAYCOMMIT	2	/* a commit is required, but can be delayed */

/*
 * The lock manager holds state making it possible for the client
 * and server to be out of sync.  For example, if the response from
 * the server granting a lock request is lost, the server will think
 * the lock is granted and the client will think the lock is lost.
 * To deal with this, a list of processes for which the client is
 * not sure if the server holds a lock is attached to the rnode.
 * When such a process closes the rnode, an unlock request is sent
 * to the server to unlock the entire file.
 *
 * The list is kept as a singularly linked NULL terminated list.
 * Because it is  only added to under extreme error conditions, the
 * list shouldn't get very big.  DEBUG kernels print a console warning
 * when the number of entries on a list go beyond nfs_lmpl_high_water
 * an  arbitrary number defined in nfs_add_locking_id()
 */
#define	RLMPL_PID	1
#define	RLMPL_OWNER	2
typedef struct lock_manager_pid_list {
	int lmpl_type;
	pid_t lmpl_pid;
	union {
		pid_t _pid;
		struct {
			int len;
			char *owner;
		} _own;
	} un;
	struct lock_manager_pid_list *lmpl_next;
} lmpl_t;

#define	lmpl_opid un._pid
#define	lmpl_own_len un._own.len
#define	lmpl_owner un._own.owner

/*
 * A homegrown reader/writer lock implementation.  It addresses
 * two requirements not addressed by the system primitives.  They
 * are that the `enter" operation is optionally interruptible and
 * that they can be re`enter'ed by writers without deadlock.
 */
typedef struct nfs_rwlock {
	int count;
	int waiters;
	kthread_t *owner;
	kmutex_t lock;
	kcondvar_t cv;
	kcondvar_t cv_rd;
} nfs_rwlock_t;

/*
 * The format of the hash bucket used to lookup rnodes from a file handle.
 */
typedef struct rhashq {
	struct rnode *r_hashf;
	struct rnode *r_hashb;
	krwlock_t r_lock;
} rhashq_t;

/*
 * Remote file information structure.
 *
 * The rnode is the "inode" for remote files.  It contains all the
 * information necessary to handle remote file on the client side.
 *
 * Note on file sizes:  we keep two file sizes in the rnode: the size
 * according to the client (r_size) and the size according to the server
 * (r_attr.va_size).  They can differ because we modify r_size during a
 * write system call (nfs_rdwr), before the write request goes over the
 * wire (before the file is actually modified on the server).  If an OTW
 * request occurs before the cached data is written to the server the file
 * size returned from the server (r_attr.va_size) may not match r_size.
 * r_size is the one we use, in general.  r_attr.va_size is only used to
 * determine whether or not our cached data is valid.
 *
 * Each rnode has 3 locks associated with it (not including the rnode
 * hash table and free list locks):
 *
 *	r_rwlock:	Serializes nfs_write and nfs_setattr requests
 *			and allows nfs_read requests to proceed in parallel.
 *			Serializes reads/updates to directories.
 *
 *	r_lkserlock:	Serializes lock requests with map, write, and
 *			readahead operations.
 *
 *	r_statelock:	Protects all fields in the rnode except for
 *			those listed below.  This lock is intented
 *			to be held for relatively short periods of
 *			time (not accross entire putpage operations,
 *			for example).
 *
 * The following members are protected by the mutex rpfreelist_lock:
 *	r_freef
 *	r_freeb
 *
 * The following members are protected by the hash bucket rwlock:
 *	r_hashf
 *	r_hashb
 *
 * Note: r_modaddr is only accessed when the r_statelock mutex is held.
 *	Its value is also controlled via r_rwlock.  It is assumed that
 *	there will be only 1 writer active at a time, so it safe to
 *	set r_modaddr and release r_statelock as long as the r_rwlock
 *	writer lock is held.
 *
 * r_inmap informs nfsX_read()/write() that there is a call to nfsX_map()
 * in progress. nfsX_read()/write() check r_inmap to decide whether
 * to perform directio on the file or not. r_inmap is atomically
 * incremented in nfsX_map() before the address space routines are
 * called and atomically decremented just before nfsX_map() exits.
 * r_inmap is not protected by any lock.
 *
 * r_mapcnt tells that the rnode has mapped pages. r_inmap can be 0
 * while the rnode has mapped pages.
 *
 * 64-bit offsets: the code formerly assumed that atomic reads of
 * r_size were safe and reliable; on 32-bit architectures, this is
 * not true since an intervening bus cycle from another processor
 * could update half of the size field.  The r_statelock must now
 * be held whenever any kind of access of r_size is made.
 *
 * Lock ordering:
 * 	r_rwlock > r_lkserlock > r_statelock
 */
struct exportinfo;	/* defined in nfs/export.h */
struct servinfo;	/* defined in nfs/nfs_clnt.h */
struct failinfo;	/* defined in nfs/nfs_clnt.h */
struct mntinfo;		/* defined in nfs/nfs_clnt.h */

#ifdef _KERNEL

typedef struct rnode {
	/* the hash fields must be first to match the rhashq_t */
	struct rnode	*r_hashf;	/* hash queue forward pointer */
	struct rnode	*r_hashb;	/* hash queue back pointer */
	struct rnode	*r_freef;	/* free list forward pointer */
	struct rnode	*r_freeb;	/* free list back pointer */
	rhashq_t	*r_hashq;	/* pointer to the hash bucket */
	vnode_t		*r_vnode;	/* vnode for remote file */
	nfs_rwlock_t	r_rwlock;	/* serializes write/setattr requests */
	nfs_rwlock_t	r_lkserlock;	/* serialize lock with other ops */
	kmutex_t	r_statelock;	/* protects (most of) rnode contents */
	nfs_fhandle	r_fh;		/* file handle */
	struct servinfo	*r_server;	/* current server */
	char		*r_path;	/* path to this rnode */
	u_offset_t	r_nextr;	/* next byte read offset (read-ahead) */
	cred_t		*r_cred;	/* current credentials */
	cred_t		*r_unlcred;	/* unlinked credentials */
	char		*r_unlname;	/* unlinked file name */
	vnode_t		*r_unldvp;	/* parent dir of unlinked file */
	len_t		r_size;		/* client's view of file size */
	struct vattr	r_attr;		/* cached vnode attributes */
	hrtime_t	r_attrtime;	/* time attributes become invalid */
	hrtime_t	r_mtime;	/* client time file last modified */
	long		r_mapcnt;	/* count of mmapped pages */
	uint_t		r_count;	/* # of refs not reflect in v_count */
	uint_t		r_awcount;	/* # of outstanding async write */
	uint_t		r_gcount;	/* getattrs waiting to flush pages */
	ushort_t	r_flags;	/* flags, see below */
	short		r_error;	/* async write error */
	kcondvar_t	r_cv;		/* condvar for blocked threads */
	int		(*r_putapage)	/* address of putapage routine */
		(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *);
	avl_tree_t	r_dir;		/* cache of readdir responses */
	rddir_cache	*r_direof;	/* pointer to the EOF entry */
	symlink_cache	r_symlink;	/* cached readlink response */
	writeverf3	r_verf;		/* version 3 write verifier */
	u_offset_t	r_modaddr;	/* address for page in writerp */
	commit_t	r_commit;	/* commit information */
	u_offset_t	r_truncaddr;	/* base for truncate operation */
	vsecattr_t	*r_secattr;	/* cached security attributes (acls) */
	cookieverf3	r_cookieverf;	/* version 3 readdir cookie verifier */
	lmpl_t		*r_lmpl;	/* pids that may be holding locks */
	nfs3_pathconf_info *r_pathconf;	/* cached pathconf information */
	acache_t	*r_acache;	/* list of access cache entries */
	kthread_t	*r_serial;	/* id of purging thread */
	list_t		r_indelmap;	/* list of delmap callers */
	uint_t		r_inmap;	/* to serialize read/write and mmap */
	list_node_t	r_mi_link;	/* linkage into list of rnodes for */
					/* this mntinfo */
} rnode_t;
#endif /* _KERNEL */

/*
 * Flags
 */
#define	RREADDIRPLUS	0x1	/* issue a READDIRPLUS instead of READDIR */
#define	RDIRTY		0x2	/* dirty pages from write operation */
#define	RSTALE		0x4	/* file handle is stale */
#define	RMODINPROGRESS	0x8	/* page modification happening */
#define	RTRUNCATE	0x10	/* truncating, don't commit */
#define	RHAVEVERF	0x20	/* have a write verifier to compare against */
#define	RCOMMIT		0x40	/* commit in progress */
#define	RCOMMITWAIT	0x80	/* someone is waiting to do a commit */
#define	RHASHED		0x100	/* rnode is in hash queues */
#define	ROUTOFSPACE	0x200	/* an out of space error has happened */
#define	RDIRECTIO	0x400	/* bypass the buffer cache */
#define	RLOOKUP		0x800	/* a lookup has been performed */
#define	RWRITEATTR	0x1000	/* attributes came from WRITE */
#define	RINDNLCPURGE	0x2000	/* in the process of purging DNLC references */
#define	RDELMAPLIST	0x4000	/* delmap callers tracking for as callback */
#define	RINCACHEPURGE	0x8000	/* purging caches due to file size change */

/*
 * Convert between vnode and rnode
 */
#define	RTOV(rp)	((rp)->r_vnode)
#define	VTOR(vp)	((rnode_t *)((vp)->v_data))

#define	VTOFH(vp)	(RTOFH(VTOR(vp)))
#define	RTOFH(rp)	((fhandle_t *)(&(rp)->r_fh.fh_buf))
#define	VTOFH3(vp)	(RTOFH3(VTOR(vp)))
#define	RTOFH3(rp)	((nfs_fh3 *)(&(rp)->r_fh))

#ifdef _KERNEL
extern int	nfs_async_readahead(vnode_t *, u_offset_t, caddr_t,
				struct seg *, cred_t *,
				void (*)(vnode_t *, u_offset_t,
				caddr_t, struct seg *, cred_t *));
extern int	nfs_async_putapage(vnode_t *, page_t *, u_offset_t, size_t,
				int, cred_t *, int (*)(vnode_t *, page_t *,
				u_offset_t, size_t, int, cred_t *));
extern int	nfs_async_pageio(vnode_t *, page_t *, u_offset_t, size_t,
				int, cred_t *, int (*)(vnode_t *, page_t *,
				u_offset_t, size_t, int, cred_t *));
extern void	nfs_async_readdir(vnode_t *, rddir_cache *,
				cred_t *, int (*)(vnode_t *,
				rddir_cache *, cred_t *));
extern void	nfs_async_commit(vnode_t *, page_t *, offset3, count3,
				cred_t *, void (*)(vnode_t *, page_t *,
				offset3, count3, cred_t *));
extern void	nfs_async_inactive(vnode_t *, cred_t *, void (*)(vnode_t *,
				cred_t *, caller_context_t *));
extern int	writerp(rnode_t *, caddr_t, int, struct uio *, int);
extern int	nfs_putpages(vnode_t *, u_offset_t, size_t, int, cred_t *);
extern void	nfs_invalidate_pages(vnode_t *, u_offset_t, cred_t *);
extern int	rfs2call(struct mntinfo *, rpcproc_t, xdrproc_t, caddr_t,
			xdrproc_t, caddr_t, cred_t *, int *, enum nfsstat *,
			int, struct failinfo *);
extern int	rfs3call(struct mntinfo *, rpcproc_t, xdrproc_t, caddr_t,
			xdrproc_t, caddr_t, cred_t *, int *, nfsstat3 *,
			int, struct failinfo *);
extern void	nfs_setswaplike(vnode_t *, vattr_t *);
extern vnode_t	*makenfsnode(fhandle_t *, struct nfsfattr *, struct vfs *,
			hrtime_t, cred_t *, char *, char *);
extern vnode_t	*makenfs3node_va(nfs_fh3 *, vattr_t *, struct vfs *, hrtime_t,
			cred_t *, char *, char *);
extern vnode_t	*makenfs3node(nfs_fh3 *, fattr3 *, struct vfs *, hrtime_t,
			cred_t *, char *, char *);
extern void	rp_addfree(rnode_t *, cred_t *);
extern void	rp_rmhash(rnode_t *);
extern int	check_rtable(struct vfs *);
extern void	destroy_rtable(struct vfs *, cred_t *);
extern void	rflush(struct vfs *, cred_t *);
extern nfs_access_type_t nfs_access_check(rnode_t *, uint32_t, cred_t *);
extern void	nfs_access_cache(rnode_t *rp, uint32_t, uint32_t, cred_t *);
extern int	nfs_access_purge_rp(rnode_t *);
extern int	nfs_putapage(vnode_t *, page_t *, u_offset_t *, size_t *,
			int, cred_t *);
extern int	nfs3_putapage(vnode_t *, page_t *, u_offset_t *, size_t *,
			int, cred_t *);
extern void	nfs_printfhandle(nfs_fhandle *);
extern void	nfs_write_error(vnode_t *, int, cred_t *);
extern rddir_cache	*rddir_cache_alloc(int);
extern void		rddir_cache_hold(rddir_cache *);
extern void		rddir_cache_rele(rddir_cache *);
#ifdef DEBUG
extern char		*rddir_cache_buf_alloc(size_t, int);
extern void		rddir_cache_buf_free(void *, size_t);
#endif
extern int	nfs_rw_enter_sig(nfs_rwlock_t *, krw_t, int);
extern int	nfs_rw_tryenter(nfs_rwlock_t *, krw_t);
extern void	nfs_rw_exit(nfs_rwlock_t *);
extern int	nfs_rw_lock_held(nfs_rwlock_t *, krw_t);
extern void	nfs_rw_init(nfs_rwlock_t *, char *, krw_type_t, void *);
extern void	nfs_rw_destroy(nfs_rwlock_t *);
extern int	nfs_directio(vnode_t *, int, cred_t *);
extern int	nfs3_rddir_compar(const void *, const void *);
extern int	nfs_rddir_compar(const void *, const void *);
extern struct zone *nfs_zone(void);
extern zoneid_t nfs_zoneid(void);

#endif

#ifdef	__cplusplus
}
#endif

#endif	/* _NFS_RNODE_H */