1/*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <stdint.h>
30#include <sys/fcntl.h>
31#include <sys/vnode_internal.h>
32#include <sys/vnode.h>
33#include <sys/kauth.h>
34#include <sys/mount_internal.h>
35#include <sys/buf_internal.h>
36#include <kern/debug.h>
37#include <kern/kalloc.h>
38#include <sys/cprotect.h>
39#include <sys/disk.h>
40#include <vm/vm_protos.h>
41#include <vm/vm_pageout.h>
42#include <sys/content_protection.h>
43
44void vm_swapfile_open(const char *path, vnode_t *vp);
45void vm_swapfile_close(uint64_t path, vnode_t vp);
46int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin);
47uint64_t vm_swapfile_get_blksize(vnode_t vp);
48uint64_t vm_swapfile_get_transfer_size(vnode_t vp);
49int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *);
50int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size);
51
52#if CONFIG_FREEZE
53int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget);
54#endif /* CONFIG_FREEZE */
55
56
/*
 * Create/open the swap file at `path` and return its vnode through *vp.
 * On any failure (including the mount disallowing swap) *vp is set to NULL.
 *
 * The iocount taken by vnode_open() is dropped before returning; callers
 * must re-take a reference (e.g. vnode_getwithref) before doing I/O.
 */
void
vm_swapfile_open(const char *path, vnode_t *vp)
{
	int error = 0;
	vfs_context_t ctx = vfs_context_kernel();

	/* Create or truncate, read/write, owner-only permissions. */
	if ((error = vnode_open(path, (O_CREAT | O_TRUNC | FREAD | FWRITE), S_IRUSR | S_IWUSR, 0, vp, ctx))) {
		printf("Failed to open swap file %d\n", error);
		*vp = NULL;
		return;
	}

	/*
	 * If MNT_IOFLAGS_NOSWAP is set, opening the swap file should fail.
	 * To avoid a race on the mount we only make this check after creating the
	 * vnode.
	 */
	if ((*vp)->v_mount->mnt_kern_flag & MNTK_NOSWAP) {
		/*
		 * Drop the iocount from vnode_open, then close and unlink the
		 * file we just created (vm_swapfile_close re-takes a reference).
		 */
		vnode_put(*vp);
		vm_swapfile_close((uint64_t)path, *vp);
		*vp = NULL;
		return;
	}

	/* Success: release the iocount taken by vnode_open. */
	vnode_put(*vp);
}
83
84uint64_t
85vm_swapfile_get_blksize(vnode_t vp)
86{
87 return ((uint64_t)vfs_devblocksize(vnode_mount(vp)));
88}
89
90uint64_t
91vm_swapfile_get_transfer_size(vnode_t vp)
92{
93 return((uint64_t)vp->v_mount->mnt_vfsstat.f_iosize);
94}
95
96int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
97
98void
99vm_swapfile_close(uint64_t path_addr, vnode_t vp)
100{
101 vfs_context_t context = vfs_context_kernel();
102 int error;
103
104 vnode_getwithref(vp);
105 vnode_close(vp, 0, context);
106
107 error = unlink1(context, NULLVP, CAST_USER_ADDR_T(path_addr),
108 UIO_SYSSPACE, 0);
109
110#if DEVELOPMENT || DEBUG
111 if (error)
112 printf("%s : unlink of %s failed with error %d", __FUNCTION__,
113 (char *)path_addr, error);
114#endif
115}
116
/*
 * Size the swap file at `vp` to *size bytes (without zero-filling),
 * optionally pin it via FIOPINSWAP, and mark the vnode as a swap vnode.
 * On CONFIG_FREEZE builds the file is also given data-protection class C.
 *
 * Parameters:
 *	vp	swap file vnode
 *	size	in: requested size in bytes; the file is set to exactly this
 *	pin	in/out: if non-NULL and TRUE, attempt to pin the file;
 *		cleared to FALSE if pinning fails (pin failure is non-fatal)
 *
 * Returns 0 on success or an errno value.
 */
int
vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin)
{
	int error = 0;
	uint64_t file_size = 0;
	vfs_context_t ctx = NULL;
#if CONFIG_FREEZE
	struct vnode_attr va;
#endif /* CONFIG_FREEZE */

	ctx = vfs_context_kernel();

	/* Extend without zero-filling; swap blocks are written before read. */
	error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx);

	if (error) {
		printf("vnode_setsize for swap files failed: %d\n", error);
		goto done;
	}

	/* Read the size back to confirm the resize took effect. */
	error = vnode_size(vp, (off_t*) &file_size, ctx);

	if (error) {
		printf("vnode_size (new file) for swap file failed: %d\n", error);
		goto done;
	}
	assert(file_size == *size);

	if (pin != NULL && *pin != FALSE) {
		/* Request that the filesystem pin the swap file in place. */
		error = VNOP_IOCTL(vp, FIOPINSWAP, NULL, 0, ctx);

		if (error) {
			printf("pin for swap files failed: %d, file_size = %lld\n", error, file_size);
			/* this is not fatal, carry on with files wherever they landed */
			*pin = FALSE;
			error = 0;
		}
	}

	/* Tag the vnode as a swap file; spin lock suffices for a flag update. */
	vnode_lock_spin(vp);
	SET(vp->v_flag, VSWAP);
	vnode_unlock(vp);

#if CONFIG_FREEZE
	/* Assign data-protection class C to the swap file. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C);
	error = VNOP_SETATTR(vp, &va, ctx);

	if (error) {
		printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n", error);
		goto done;
	}
#endif /* CONFIG_FREEZE */

done:
	return error;
}
173
174
175int
176vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size)
177{
178 int error = 0;
179 vfs_context_t ctx;
180
181 ctx = vfs_context_kernel();
182
183 error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, size, offset,
184 UIO_SYSSPACE, IO_NODELOCKED, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));
185
186 return (error);
187}
188
189
190
/*
 * Perform swap I/O between the kernel virtual range starting at `start`
 * and the swap file `vp` at byte `offset`, for `npages` pages.
 *
 * Parameters:
 *	vp		swap file vnode
 *	offset		byte offset within the swap file
 *	start		kernel virtual address of the buffer
 *	npages		number of pages to transfer
 *	flags		SWAP_READ selects pagein; otherwise pageout
 *	upl_iodone	optional completion routine attached to the UPL
 *			(writes only); NULL forces synchronous I/O
 *
 * Returns 0 on success or the error reported by vnode_pagein/pageout.
 * The #else arm below is a retained vn_rdwr()-based fallback implementation.
 */
int
vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone)
{
	int error = 0;
	uint64_t io_size = npages * PAGE_SIZE_64;
#if 1
	kern_return_t kr = KERN_SUCCESS;
	upl_t upl = NULL;
	unsigned int count = 0;
	upl_control_flags_t upl_create_flags = 0;
	int upl_control_flags = 0;
	upl_size_t upl_size = 0;

	upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE;

	/* No completion routine means the caller wants synchronous I/O. */
	if (upl_iodone == NULL)
		upl_control_flags = UPL_IOSYNC;

#if ENCRYPTED_SWAP
	upl_control_flags |= UPL_PAGING_ENCRYPTED;
#endif

	/* Writes (pageout) copy from the kernel buffer into the UPL. */
	if ((flags & SWAP_READ) == FALSE) {
		upl_create_flags |= UPL_COPYOUT_FROM;
	}

	/* Build a UPL covering the kernel buffer for the whole transfer. */
	upl_size = io_size;
	kr = vm_map_create_upl( kernel_map,
				start,
				&upl_size,
				&upl,
				NULL,
				&count,
				&upl_create_flags,
				VM_KERN_MEMORY_OSFMK);

	/* A short or failed UPL means swap I/O cannot proceed at all. */
	if (kr != KERN_SUCCESS || (upl_size != io_size)) {
		panic("vm_map_create_upl failed with %d\n", kr);
	}

	if (flags & SWAP_READ) {
		vnode_pagein(vp,
			     upl,
			     0,
			     offset,
			     io_size,
			     upl_control_flags | UPL_IGNORE_VALID_PAGE_CHECK,
			     &error);
		if (error) {
#if DEBUG
			printf("vm_swapfile_io: vnode_pagein failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n", error, vp, offset, io_size);
#else /* DEBUG */
			printf("vm_swapfile_io: vnode_pagein failed with %d.\n", error);
#endif /* DEBUG */
		}

	} else {
		/* Attach the caller's completion routine (may be NULL). */
		upl_set_iodone(upl, upl_iodone);

		vnode_pageout(vp,
			      upl,
			      0,
			      offset,
			      io_size,
			      upl_control_flags,
			      &error);
		if (error) {
#if DEBUG
			printf("vm_swapfile_io: vnode_pageout failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n", error, vp, offset, io_size);
#else /* DEBUG */
			printf("vm_swapfile_io: vnode_pageout failed with %d.\n", error);
#endif /* DEBUG */
		}
	}
	return error;

#else /* 1 */
	/* Fallback path: plain synchronous vn_rdwr, always IO_SYNC. */
	vfs_context_t ctx;
	ctx = vfs_context_kernel();

	error = vn_rdwr((flags & SWAP_READ) ? UIO_READ : UIO_WRITE, vp, (caddr_t)start, io_size, offset,
		UIO_SYSSPACE, IO_SYNC | IO_NODELOCKED | IO_UNIT | IO_NOCACHE | IO_SWAP_DISPATCH, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));

	if (error) {
		printf("vn_rdwr: Swap I/O failed with %d\n", error);
	}
	return error;
#endif /* 1 */
}
280
281
282#define MAX_BATCH_TO_TRIM 256
283
#define ROUTE_ONLY 0x10 /* if corestorage is present, tell it to just pass */
			/* the DKIOCUNMAP command through w/o acting on it */
			/* this is used by the compressed swap system to reclaim empty space */
287
288
289u_int32_t vnode_trim_list (vnode_t vp, struct trim_list *tl, boolean_t route_only)
290{
291 int error = 0;
292 int trim_index = 0;
293 u_int32_t blocksize = 0;
294 struct vnode *devvp;
295 dk_extent_t *extents;
296 dk_unmap_t unmap;
297 _dk_cs_unmap_t cs_unmap;
298
299 if ( !(vp->v_mount->mnt_ioflags & MNT_IOFLAGS_UNMAP_SUPPORTED))
300 return (ENOTSUP);
301
302 if (tl == NULL)
303 return (0);
304
305 /*
306 * Get the underlying device vnode and physical block size
307 */
308 devvp = vp->v_mount->mnt_devvp;
309 blocksize = vp->v_mount->mnt_devblocksize;
310
311 extents = kalloc(sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);
312
313 if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
314 memset (&cs_unmap, 0, sizeof(_dk_cs_unmap_t));
315 cs_unmap.extents = extents;
316
317 if (route_only == TRUE)
318 cs_unmap.options = ROUTE_ONLY;
319 } else {
320 memset (&unmap, 0, sizeof(dk_unmap_t));
321 unmap.extents = extents;
322 }
323
324 while (tl) {
325 daddr64_t io_blockno; /* Block number corresponding to the start of the extent */
326 size_t io_bytecount; /* Number of bytes in current extent for the specified range */
327 size_t trimmed;
328 size_t remaining_length;
329 off_t current_offset;
330
331 current_offset = tl->tl_offset;
332 remaining_length = tl->tl_length;
333 trimmed = 0;
334
335 /*
336 * We may not get the entire range from tl_offset -> tl_offset+tl_length in a single
337 * extent from the blockmap call. Keep looping/going until we are sure we've hit
338 * the whole range or if we encounter an error.
339 */
340 while (trimmed < tl->tl_length) {
341 /*
342 * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
343 * specified offset. It returns blocks in contiguous chunks, so if the logical range is
344 * broken into multiple extents, it must be called multiple times, increasing the offset
345 * in each call to ensure that the entire range is covered.
346 */
347 error = VNOP_BLOCKMAP (vp, current_offset, remaining_length,
348 &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL);
349
350 if (error) {
351 goto trim_exit;
352 }
353 if (io_blockno != -1) {
354 extents[trim_index].offset = (uint64_t) io_blockno * (u_int64_t) blocksize;
355 extents[trim_index].length = io_bytecount;
356
357 trim_index++;
358 }
359 if (trim_index == MAX_BATCH_TO_TRIM) {
360
361 if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
362 cs_unmap.extentsCount = trim_index;
363 error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
364 } else {
365 unmap.extentsCount = trim_index;
366 error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
367 }
368 if (error) {
369 goto trim_exit;
370 }
371 trim_index = 0;
372 }
373 trimmed += io_bytecount;
374 current_offset += io_bytecount;
375 remaining_length -= io_bytecount;
376 }
377 tl = tl->tl_next;
378 }
379 if (trim_index) {
380 if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
381 cs_unmap.extentsCount = trim_index;
382 error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
383 } else {
384 unmap.extentsCount = trim_index;
385 error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
386 }
387 }
388trim_exit:
389 kfree(extents, sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);
390
391 return error;
392}
393
#if CONFIG_FREEZE
/*
 * Query the device backing the swap volume for its daily freeze
 * (swap write) budget via DKIOCGETMAXSWAPWRITE.
 *
 * Parameters:
 *	vp			a vnode on the swap volume
 *	freeze_daily_budget	out: budget value written by the ioctl
 *
 * Returns 0 on success or an errno from VNOP_IOCTL.
 */
int
vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget)
{
	vfs_context_t ctx = vfs_context_kernel();
	vnode_t devvp = vp->v_mount->mnt_devvp;

	return VNOP_IOCTL(devvp, DKIOCGETMAXSWAPWRITE,
	    (caddr_t)freeze_daily_budget, 0, ctx);
}
#endif /* CONFIG_FREEZE */
409