/*
 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <stdint.h>
#include <sys/fcntl.h>
#include <sys/vnode_internal.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/buf_internal.h>
#include <kern/debug.h>
#include <kern/kalloc.h>
#include <sys/cprotect.h>
#include <sys/disk.h>
#include <vm/vm_protos.h>
#include <vm/vm_pageout.h>
#include <sys/content_protection.h>

void vm_swapfile_open(const char *path, vnode_t *vp);
void vm_swapfile_close(uint64_t path, vnode_t vp);
int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin);
uint64_t vm_swapfile_get_blksize(vnode_t vp);
uint64_t vm_swapfile_get_transfer_size(vnode_t vp);
int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *);
int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size);
int vm_swap_vol_get_capacity(const char *volume_name, uint64_t *capacity);

#if CONFIG_FREEZE
int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget);
#endif /* CONFIG_FREEZE */

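/*
 * Open (creating and truncating as needed) the swap file at 'path' with owner
 * read/write permissions, using the kernel vfs context.  On success the vnode
 * is returned in *vp with its iocount already dropped.  On failure, or if the
 * mount disallows swapping (MNTK_NOSWAP), the newly created file is closed and
 * unlinked again and *vp is set to NULL.
 */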
void
vm_swapfile_open(const char *path, vnode_t *vp)
{
    int error = 0;
    vfs_context_t ctx = vfs_context_kernel();

    if ((error = vnode_open(path, (O_CREAT | O_TRUNC | FREAD | FWRITE), S_IRUSR | S_IWUSR, 0, vp, ctx))) {
        printf("Failed to open swap file %d\n", error);
        *vp = NULL;
        return;
    }

    /*
     * If MNT_IOFLAGS_NOSWAP is set, opening the swap file should fail.
     * To avoid a race on the mount we only make this check after creating the
     * vnode.
     */
    if ((*vp)->v_mount->mnt_kern_flag & MNTK_NOSWAP) {
        vnode_put(*vp);
        vm_swapfile_close((uint64_t)path, *vp);
        *vp = NULL;
        return;
    }

    vnode_put(*vp);
}

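/*
 * Return the device block size of the filesystem backing the swap file.
 */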
uint64_t
vm_swapfile_get_blksize(vnode_t vp)
{
    return (uint64_t)vfs_devblocksize(vnode_mount(vp));
}

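/*
 * Return the preferred I/O transfer size (f_iosize) of the mount backing the
 * swap file.
 */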
uint64_t
vm_swapfile_get_transfer_size(vnode_t vp)
{
    return (uint64_t)vp->v_mount->mnt_vfsstat.f_iosize;
}

int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);

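/*
 * Close and unlink a swap file.  'path_addr' is a kernel-space path string
 * passed as a uint64_t.  An unlink failure is only reported on DEVELOPMENT
 * and DEBUG kernels.
 */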
void
vm_swapfile_close(uint64_t path_addr, vnode_t vp)
{
    vfs_context_t context = vfs_context_kernel();
    int error;

    vnode_getwithref(vp);
    vnode_close(vp, 0, context);

    error = unlink1(context, NULLVP, CAST_USER_ADDR_T(path_addr),
        UIO_SYSSPACE, 0);

#if DEVELOPMENT || DEBUG
    if (error) {
        printf("%s : unlink of %s failed with error %d", __FUNCTION__,
            (char *)path_addr, error);
    }
#endif
}

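/*
 * Grow the swap file to *size bytes without zero-filling (IO_NOZEROFILL) and
 * verify the resulting size.  Optionally pin the file in place via FIOPINSWAP;
 * a pin failure is not fatal, *pin is simply cleared.  The vnode is marked
 * VSWAP and, when CONFIG_FREEZE is enabled, assigned data-protection class C.
 */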
int
vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin)
{
    int error = 0;
    uint64_t file_size = 0;
    vfs_context_t ctx = NULL;
#if CONFIG_FREEZE
    struct vnode_attr va;
#endif /* CONFIG_FREEZE */

    ctx = vfs_context_kernel();

    error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx);

    if (error) {
        printf("vnode_setsize for swap files failed: %d\n", error);
        goto done;
    }

    error = vnode_size(vp, (off_t *)&file_size, ctx);

    if (error) {
        printf("vnode_size (new file) for swap file failed: %d\n", error);
        goto done;
    }
    assert(file_size == *size);

    if (pin != NULL && *pin != FALSE) {
        error = VNOP_IOCTL(vp, FIOPINSWAP, NULL, 0, ctx);

        if (error) {
            printf("pin for swap files failed: %d, file_size = %lld\n", error, file_size);
            /* this is not fatal, carry on with files wherever they landed */
            *pin = FALSE;
            error = 0;
        }
    }

    vnode_lock_spin(vp);
    SET(vp->v_flag, VSWAP);
    vnode_unlock(vp);

#if CONFIG_FREEZE
    VATTR_INIT(&va);
    VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C);
    error = VNOP_SETATTR(vp, &va, ctx);

    if (error) {
        printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n", error);
        goto done;
    }
#endif /* CONFIG_FREEZE */

done:
    return error;
}

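/*
 * Synchronously write 'size' bytes from 'buf' to the given offset in the file,
 * using the kernel context's credentials.
 */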
int
vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size)
{
    int error = 0;
    vfs_context_t ctx;

    ctx = vfs_context_kernel();

    error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, size, offset,
        UIO_SYSSPACE, IO_NODELOCKED, vfs_context_ucred(ctx), (int *)0, vfs_context_proc(ctx));

    return error;
}

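/*
 * Perform swap I/O for 'npages' pages backed by the kernel-map range starting
 * at 'start'.  A UPL is created over that range and then either paged in from
 * or paged out to the swap file at 'offset', depending on SWAP_READ in
 * 'flags'.  When 'upl_iodone' is NULL the I/O is issued synchronously
 * (UPL_IOSYNC); otherwise the completion context is attached to the UPL via
 * upl_set_iodone() on the pageout path.
 */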
int
vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone)
{
    int error = 0;
    upl_size_t io_size = (upl_size_t)(npages * PAGE_SIZE_64);
#if 1
    kern_return_t kr = KERN_SUCCESS;
    upl_t upl = NULL;
    unsigned int count = 0;
    upl_control_flags_t upl_create_flags = 0;
    int upl_control_flags = 0;
    upl_size_t upl_size = 0;

    upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE;

    if (upl_iodone == NULL) {
        upl_control_flags = UPL_IOSYNC;
    }

#if ENCRYPTED_SWAP
    upl_control_flags |= UPL_PAGING_ENCRYPTED;
#endif

    if ((flags & SWAP_READ) == FALSE) {
        upl_create_flags |= UPL_COPYOUT_FROM;
    }

    upl_size = io_size;
    kr = vm_map_create_upl(kernel_map,
        start,
        &upl_size,
        &upl,
        NULL,
        &count,
        &upl_create_flags,
        VM_KERN_MEMORY_OSFMK);

    if (kr != KERN_SUCCESS || (upl_size != io_size)) {
        panic("vm_map_create_upl failed with %d", kr);
    }

    if (flags & SWAP_READ) {
        vnode_pagein(vp,
            upl,
            0,
            offset,
            io_size,
            upl_control_flags | UPL_IGNORE_VALID_PAGE_CHECK,
            &error);
        if (error) {
#if DEBUG
            printf("vm_swapfile_io: vnode_pagein failed with %d (vp: %p, offset: 0x%llx, size:%u)\n", error, vp, offset, io_size);
#else /* DEBUG */
            printf("vm_swapfile_io: vnode_pagein failed with %d.\n", error);
#endif /* DEBUG */
        }
    } else {
        upl_set_iodone(upl, upl_iodone);

        vnode_pageout(vp,
            upl,
            0,
            offset,
            io_size,
            upl_control_flags,
            &error);
        if (error) {
#if DEBUG
            printf("vm_swapfile_io: vnode_pageout failed with %d (vp: %p, offset: 0x%llx, size:%u)\n", error, vp, offset, io_size);
#else /* DEBUG */
            printf("vm_swapfile_io: vnode_pageout failed with %d.\n", error);
#endif /* DEBUG */
        }
    }

    return error;

#else /* 1 */
    vfs_context_t ctx;
    ctx = vfs_context_kernel();

    error = vn_rdwr((flags & SWAP_READ) ? UIO_READ : UIO_WRITE, vp, (caddr_t)start, io_size, offset,
        UIO_SYSSPACE, IO_SYNC | IO_NODELOCKED | IO_UNIT | IO_NOCACHE | IO_SWAP_DISPATCH, vfs_context_ucred(ctx), (int *)0, vfs_context_proc(ctx));

    if (error) {
        printf("vn_rdwr: Swap I/O failed with %d\n", error);
    }
    return error;
#endif /* 1 */
}

#define MAX_BATCH_TO_TRIM	256

/*
 * If CoreStorage is present, tell it to just pass the DKIOCUNMAP command
 * through without acting on it; this is used by the compressed swap system
 * to reclaim empty space.
 */
#define ROUTE_ONLY		0x10

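/*
 * Issue unmap/TRIM requests to the device underlying 'vp' for every range in
 * the trim list.  Each logical range is translated to device extents with
 * VNOP_BLOCKMAP, and the extents are batched, up to MAX_BATCH_TO_TRIM at a
 * time, into DKIOCUNMAP (or _DKIOCCSUNMAP when CoreStorage unmap is supported)
 * ioctls.  Returns ENOTSUP if the mount does not advertise unmap support.
 */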
u_int32_t
vnode_trim_list(vnode_t vp, struct trim_list *tl, boolean_t route_only)
{
    int error = 0;
    int trim_index = 0;
    u_int32_t blocksize = 0;
    struct vnode *devvp;
    dk_extent_t *extents;
    dk_unmap_t unmap;
    _dk_cs_unmap_t cs_unmap;

    if (!(vp->v_mount->mnt_ioflags & MNT_IOFLAGS_UNMAP_SUPPORTED)) {
        return ENOTSUP;
    }

    if (tl == NULL) {
        return 0;
    }

    /*
     * Get the underlying device vnode and physical block size
     */
    devvp = vp->v_mount->mnt_devvp;
    blocksize = vp->v_mount->mnt_devblocksize;

    extents = kalloc_data(sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM, Z_WAITOK);

    if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
        memset(&cs_unmap, 0, sizeof(_dk_cs_unmap_t));
        cs_unmap.extents = extents;

        if (route_only == TRUE) {
            cs_unmap.options = ROUTE_ONLY;
        }
    } else {
        memset(&unmap, 0, sizeof(dk_unmap_t));
        unmap.extents = extents;
    }

    while (tl) {
        daddr64_t io_blockno;      /* Block number corresponding to the start of the extent */
        size_t io_bytecount;       /* Number of bytes in current extent for the specified range */
        size_t trimmed;
        size_t remaining_length;
        off_t current_offset;

        current_offset = tl->tl_offset;
        remaining_length = tl->tl_length;
        trimmed = 0;

        /*
         * We may not get the entire range from tl_offset -> tl_offset+tl_length in a single
         * extent from the blockmap call.  Keep looping until we are sure we've covered the
         * whole range, or until we encounter an error.
         */
        while (trimmed < tl->tl_length) {
            /*
             * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
             * specified offset.  It returns blocks in contiguous chunks, so if the logical range is
             * broken into multiple extents, it must be called multiple times, increasing the offset
             * in each call to ensure that the entire range is covered.
             */
            error = VNOP_BLOCKMAP(vp, current_offset, remaining_length,
                &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL);

            if (error) {
                goto trim_exit;
            }
            if (io_blockno != -1) {
                extents[trim_index].offset = (uint64_t)io_blockno * (u_int64_t)blocksize;
                extents[trim_index].length = io_bytecount;

                trim_index++;
            }
            if (trim_index == MAX_BATCH_TO_TRIM) {
                if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
                    cs_unmap.extentsCount = trim_index;
                    error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
                } else {
                    unmap.extentsCount = trim_index;
                    error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
                }
                if (error) {
                    goto trim_exit;
                }
                trim_index = 0;
            }
            trimmed += io_bytecount;
            current_offset += io_bytecount;
            remaining_length -= io_bytecount;
        }
        tl = tl->tl_next;
    }
    if (trim_index) {
        if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
            cs_unmap.extentsCount = trim_index;
            error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
        } else {
            unmap.extentsCount = trim_index;
            error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
        }
    }
trim_exit:
    kfree_data(extents, sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);

    return error;
}

#if CONFIG_FREEZE
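/*
 * Query the device backing the swap vnode for the daily freeze write budget
 * (DKIOCGETMAXSWAPWRITE) and return it in *freeze_daily_budget.
 */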
int
vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget)
{
    vnode_t devvp = NULL;
    vfs_context_t ctx = vfs_context_kernel();
    errno_t err = 0;

    err = vnode_getwithref(vp);
    if (err == 0) {
        if (vp->v_mount && vp->v_mount->mnt_devvp) {
            devvp = vp->v_mount->mnt_devvp;
            err = VNOP_IOCTL(devvp, DKIOCGETMAXSWAPWRITE, (caddr_t)freeze_daily_budget, 0, ctx);
        } else {
            err = ENODEV;
        }
        vnode_put(vp);
    }

    return err;
}
#endif /* CONFIG_FREEZE */

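/*
 * Report the capacity, in bytes, of the device backing 'volume_name', computed
 * as the device's block size multiplied by its block count.
 */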
int
vm_swap_vol_get_capacity(const char *volume_name, uint64_t *capacity)
{
    vfs_context_t ctx = vfs_context_kernel();
    vnode_t vp = NULL, devvp = NULL;
    uint64_t block_size = 0;
    uint64_t block_count = 0;
    int error = 0;
    *capacity = 0;

    if ((error = vnode_open(volume_name, FREAD, 0, 0, &vp, ctx))) {
        printf("Unable to open swap volume\n");
        return error;
    }

    devvp = vp->v_mount->mnt_devvp;
    if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&block_size, 0, ctx))) {
        printf("Unable to get swap volume block size\n");
        goto out;
    }
    if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&block_count, 0, ctx))) {
        printf("Unable to get swap volume block count\n");
        goto out;
    }

    *capacity = block_count * block_size;
out:
    error = vnode_close(vp, 0, ctx);
    return error;
}