1 | /* |
2 | * Copyright (c) 2000-2016 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <stdint.h> |
30 | #include <sys/fcntl.h> |
31 | #include <sys/vnode_internal.h> |
32 | #include <sys/vnode.h> |
33 | #include <sys/kauth.h> |
34 | #include <sys/mount_internal.h> |
35 | #include <sys/buf_internal.h> |
36 | #include <kern/debug.h> |
37 | #include <kern/kalloc.h> |
38 | #include <sys/cprotect.h> |
39 | #include <sys/disk.h> |
40 | #include <vm/vm_protos.h> |
41 | #include <vm/vm_pageout.h> |
42 | #include <sys/content_protection.h> |
43 | |
44 | void vm_swapfile_open(const char *path, vnode_t *vp); |
45 | void vm_swapfile_close(uint64_t path, vnode_t vp); |
46 | int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin); |
47 | uint64_t vm_swapfile_get_blksize(vnode_t vp); |
48 | uint64_t vm_swapfile_get_transfer_size(vnode_t vp); |
49 | int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *); |
50 | int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size); |
51 | int vm_swap_vol_get_capacity(const char *volume_name, uint64_t *capacity); |
52 | |
53 | #if CONFIG_FREEZE |
54 | int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget); |
55 | #endif /* CONFIG_FREEZE */ |
56 | |
57 | |
58 | void |
59 | vm_swapfile_open(const char *path, vnode_t *vp) |
60 | { |
61 | int error = 0; |
62 | vfs_context_t ctx = vfs_context_kernel(); |
63 | |
64 | if ((error = vnode_open(path, fmode: (O_CREAT | O_TRUNC | FREAD | FWRITE), S_IRUSR | S_IWUSR, flags: 0, vpp: vp, ctx))) { |
65 | printf("Failed to open swap file %d\n" , error); |
66 | *vp = NULL; |
67 | return; |
68 | } |
69 | |
70 | /* |
71 | * If MNT_IOFLAGS_NOSWAP is set, opening the swap file should fail. |
72 | * To avoid a race on the mount we only make this check after creating the |
73 | * vnode. |
74 | */ |
75 | if ((*vp)->v_mount->mnt_kern_flag & MNTK_NOSWAP) { |
76 | vnode_put(vp: *vp); |
77 | vm_swapfile_close(path: (uint64_t)path, vp: *vp); |
78 | *vp = NULL; |
79 | return; |
80 | } |
81 | |
82 | vnode_put(vp: *vp); |
83 | } |
84 | |
85 | uint64_t |
86 | vm_swapfile_get_blksize(vnode_t vp) |
87 | { |
88 | return (uint64_t)vfs_devblocksize(mp: vnode_mount(vp)); |
89 | } |
90 | |
91 | uint64_t |
92 | vm_swapfile_get_transfer_size(vnode_t vp) |
93 | { |
94 | return (uint64_t)vp->v_mount->mnt_vfsstat.f_iosize; |
95 | } |
96 | |
97 | int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int); |
98 | |
99 | void |
100 | vm_swapfile_close(uint64_t path_addr, vnode_t vp) |
101 | { |
102 | vfs_context_t context = vfs_context_kernel(); |
103 | int error; |
104 | |
105 | vnode_getwithref(vp); |
106 | vnode_close(vp, flags: 0, ctx: context); |
107 | |
108 | error = unlink1(context, NULLVP, CAST_USER_ADDR_T(path_addr), |
109 | UIO_SYSSPACE, 0); |
110 | |
111 | #if DEVELOPMENT || DEBUG |
112 | if (error) { |
113 | printf("%s : unlink of %s failed with error %d" , __FUNCTION__, |
114 | (char *)path_addr, error); |
115 | } |
116 | #endif |
117 | } |
118 | |
119 | int |
120 | vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin) |
121 | { |
122 | int error = 0; |
123 | uint64_t file_size = 0; |
124 | vfs_context_t ctx = NULL; |
125 | #if CONFIG_FREEZE |
126 | struct vnode_attr va; |
127 | #endif /* CONFIG_FREEZE */ |
128 | |
129 | ctx = vfs_context_kernel(); |
130 | |
131 | error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx); |
132 | |
133 | if (error) { |
134 | printf("vnode_setsize for swap files failed: %d\n" , error); |
135 | goto done; |
136 | } |
137 | |
138 | error = vnode_size(vp, (off_t*) &file_size, ctx); |
139 | |
140 | if (error) { |
141 | printf("vnode_size (new file) for swap file failed: %d\n" , error); |
142 | goto done; |
143 | } |
144 | assert(file_size == *size); |
145 | |
146 | if (pin != NULL && *pin != FALSE) { |
147 | error = VNOP_IOCTL(vp, FIOPINSWAP, NULL, fflag: 0, ctx); |
148 | |
149 | if (error) { |
150 | printf("pin for swap files failed: %d, file_size = %lld\n" , error, file_size); |
151 | /* this is not fatal, carry on with files wherever they landed */ |
152 | *pin = FALSE; |
153 | error = 0; |
154 | } |
155 | } |
156 | |
157 | vnode_lock_spin(vp); |
158 | SET(vp->v_flag, VSWAP); |
159 | vnode_unlock(vp); |
160 | |
161 | #if CONFIG_FREEZE |
162 | VATTR_INIT(&va); |
163 | VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C); |
164 | error = VNOP_SETATTR(vp, &va, ctx); |
165 | |
166 | if (error) { |
167 | printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n" , error); |
168 | goto done; |
169 | } |
170 | #endif /* CONFIG_FREEZE */ |
171 | |
172 | done: |
173 | return error; |
174 | } |
175 | |
176 | |
177 | int |
178 | vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size) |
179 | { |
180 | int error = 0; |
181 | vfs_context_t ctx; |
182 | |
183 | ctx = vfs_context_kernel(); |
184 | |
185 | error = vn_rdwr(rw: UIO_WRITE, vp, base: (caddr_t)buf, len: size, offset, |
186 | segflg: UIO_SYSSPACE, IO_NODELOCKED, cred: vfs_context_ucred(ctx), aresid: (int *) 0, p: vfs_context_proc(ctx)); |
187 | |
188 | return error; |
189 | } |
190 | |
191 | |
192 | |
193 | int |
194 | vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone) |
195 | { |
196 | int error = 0; |
197 | upl_size_t io_size = (upl_size_t) (npages * PAGE_SIZE_64); |
198 | #if 1 |
199 | kern_return_t kr = KERN_SUCCESS; |
200 | upl_t upl = NULL; |
201 | unsigned int count = 0; |
202 | upl_control_flags_t upl_create_flags = 0; |
203 | int upl_control_flags = 0; |
204 | upl_size_t upl_size = 0; |
205 | |
206 | upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE; |
207 | |
208 | if (upl_iodone == NULL) { |
209 | upl_control_flags = UPL_IOSYNC; |
210 | } |
211 | |
212 | #if ENCRYPTED_SWAP |
213 | upl_control_flags |= UPL_PAGING_ENCRYPTED; |
214 | #endif |
215 | |
216 | if ((flags & SWAP_READ) == FALSE) { |
217 | upl_create_flags |= UPL_COPYOUT_FROM; |
218 | } |
219 | |
220 | upl_size = io_size; |
221 | kr = vm_map_create_upl( map: kernel_map, |
222 | offset: start, |
223 | upl_size: &upl_size, |
224 | upl: &upl, |
225 | NULL, |
226 | count: &count, |
227 | flags: &upl_create_flags, |
228 | VM_KERN_MEMORY_OSFMK); |
229 | |
230 | if (kr != KERN_SUCCESS || (upl_size != io_size)) { |
231 | panic("vm_map_create_upl failed with %d" , kr); |
232 | } |
233 | |
234 | if (flags & SWAP_READ) { |
235 | vnode_pagein(vp, |
236 | upl, |
237 | 0, |
238 | offset, |
239 | io_size, |
240 | upl_control_flags | UPL_IGNORE_VALID_PAGE_CHECK, |
241 | &error); |
242 | if (error) { |
243 | #if DEBUG |
244 | printf("vm_swapfile_io: vnode_pagein failed with %d (vp: %p, offset: 0x%llx, size:%u)\n" , error, vp, offset, io_size); |
245 | #else /* DEBUG */ |
246 | printf("vm_swapfile_io: vnode_pagein failed with %d.\n" , error); |
247 | #endif /* DEBUG */ |
248 | } |
249 | } else { |
250 | upl_set_iodone(upl, upl_iodone); |
251 | |
252 | vnode_pageout(vp, |
253 | upl, |
254 | 0, |
255 | offset, |
256 | io_size, |
257 | upl_control_flags, |
258 | &error); |
259 | if (error) { |
260 | #if DEBUG |
261 | printf("vm_swapfile_io: vnode_pageout failed with %d (vp: %p, offset: 0x%llx, size:%u)\n" , error, vp, offset, io_size); |
262 | #else /* DEBUG */ |
263 | printf("vm_swapfile_io: vnode_pageout failed with %d.\n" , error); |
264 | #endif /* DEBUG */ |
265 | } |
266 | } |
267 | |
268 | return error; |
269 | |
270 | #else /* 1 */ |
271 | vfs_context_t ctx; |
272 | ctx = vfs_context_kernel(); |
273 | |
274 | error = vn_rdwr((flags & SWAP_READ) ? UIO_READ : UIO_WRITE, vp, (caddr_t)start, io_size, offset, |
275 | UIO_SYSSPACE, IO_SYNC | IO_NODELOCKED | IO_UNIT | IO_NOCACHE | IO_SWAP_DISPATCH, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); |
276 | |
277 | if (error) { |
278 | printf("vn_rdwr: Swap I/O failed with %d\n" , error); |
279 | } |
280 | return error; |
281 | #endif /* 1 */ |
282 | } |
283 | |
284 | |
285 | #define MAX_BATCH_TO_TRIM 256 |
286 | |
287 | #define ROUTE_ONLY 0x10 /* if corestorage is present, tell it to just pass */ |
288 | /* the DKIOUNMAP command through w/o acting on it */ |
289 | /* this is used by the compressed swap system to reclaim empty space */ |
290 | |
291 | |
292 | u_int32_t |
293 | vnode_trim_list(vnode_t vp, struct trim_list *tl, boolean_t route_only) |
294 | { |
295 | int error = 0; |
296 | int trim_index = 0; |
297 | u_int32_t blocksize = 0; |
298 | struct vnode *devvp; |
299 | dk_extent_t *extents; |
300 | dk_unmap_t unmap; |
301 | _dk_cs_unmap_t cs_unmap; |
302 | |
303 | if (!(vp->v_mount->mnt_ioflags & MNT_IOFLAGS_UNMAP_SUPPORTED)) { |
304 | return ENOTSUP; |
305 | } |
306 | |
307 | if (tl == NULL) { |
308 | return 0; |
309 | } |
310 | |
311 | /* |
312 | * Get the underlying device vnode and physical block size |
313 | */ |
314 | devvp = vp->v_mount->mnt_devvp; |
315 | blocksize = vp->v_mount->mnt_devblocksize; |
316 | |
317 | extents = kalloc_data(sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM, Z_WAITOK); |
318 | |
319 | if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) { |
320 | memset(s: &cs_unmap, c: 0, n: sizeof(_dk_cs_unmap_t)); |
321 | cs_unmap.extents = extents; |
322 | |
323 | if (route_only == TRUE) { |
324 | cs_unmap.options = ROUTE_ONLY; |
325 | } |
326 | } else { |
327 | memset(s: &unmap, c: 0, n: sizeof(dk_unmap_t)); |
328 | unmap.extents = extents; |
329 | } |
330 | |
331 | while (tl) { |
332 | daddr64_t io_blockno; /* Block number corresponding to the start of the extent */ |
333 | size_t io_bytecount; /* Number of bytes in current extent for the specified range */ |
334 | size_t trimmed; |
335 | size_t remaining_length; |
336 | off_t current_offset; |
337 | |
338 | current_offset = tl->tl_offset; |
339 | remaining_length = tl->tl_length; |
340 | trimmed = 0; |
341 | |
342 | /* |
343 | * We may not get the entire range from tl_offset -> tl_offset+tl_length in a single |
344 | * extent from the blockmap call. Keep looping/going until we are sure we've hit |
345 | * the whole range or if we encounter an error. |
346 | */ |
347 | while (trimmed < tl->tl_length) { |
348 | /* |
349 | * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the |
350 | * specified offset. It returns blocks in contiguous chunks, so if the logical range is |
351 | * broken into multiple extents, it must be called multiple times, increasing the offset |
352 | * in each call to ensure that the entire range is covered. |
353 | */ |
354 | error = VNOP_BLOCKMAP(vp, current_offset, remaining_length, |
355 | &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL); |
356 | |
357 | if (error) { |
358 | goto trim_exit; |
359 | } |
360 | if (io_blockno != -1) { |
361 | extents[trim_index].offset = (uint64_t) io_blockno * (u_int64_t) blocksize; |
362 | extents[trim_index].length = io_bytecount; |
363 | |
364 | trim_index++; |
365 | } |
366 | if (trim_index == MAX_BATCH_TO_TRIM) { |
367 | if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) { |
368 | cs_unmap.extentsCount = trim_index; |
369 | error = VNOP_IOCTL(vp: devvp, _DKIOCCSUNMAP, data: (caddr_t)&cs_unmap, fflag: 0, ctx: vfs_context_kernel()); |
370 | } else { |
371 | unmap.extentsCount = trim_index; |
372 | error = VNOP_IOCTL(vp: devvp, DKIOCUNMAP, data: (caddr_t)&unmap, fflag: 0, ctx: vfs_context_kernel()); |
373 | } |
374 | if (error) { |
375 | goto trim_exit; |
376 | } |
377 | trim_index = 0; |
378 | } |
379 | trimmed += io_bytecount; |
380 | current_offset += io_bytecount; |
381 | remaining_length -= io_bytecount; |
382 | } |
383 | tl = tl->tl_next; |
384 | } |
385 | if (trim_index) { |
386 | if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) { |
387 | cs_unmap.extentsCount = trim_index; |
388 | error = VNOP_IOCTL(vp: devvp, _DKIOCCSUNMAP, data: (caddr_t)&cs_unmap, fflag: 0, ctx: vfs_context_kernel()); |
389 | } else { |
390 | unmap.extentsCount = trim_index; |
391 | error = VNOP_IOCTL(vp: devvp, DKIOCUNMAP, data: (caddr_t)&unmap, fflag: 0, ctx: vfs_context_kernel()); |
392 | } |
393 | } |
394 | trim_exit: |
395 | kfree_data(extents, sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM); |
396 | |
397 | return error; |
398 | } |
399 | |
#if CONFIG_FREEZE
/*
 * Ask the device backing `vp`'s mount for its DKIOCGETMAXSWAPWRITE value and
 * store the answer in `*freeze_daily_budget`.
 *
 * Returns 0 on success, the vnode_getwithref() error if the vnode could not
 * be referenced, ENODEV if the vnode has no mount or the mount has no device
 * vnode, or the VNOP_IOCTL() error.
 */
int
vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget)
{
	vfs_context_t kctx = vfs_context_kernel();
	errno_t rc;

	rc = vnode_getwithref(vp);
	if (rc != 0) {
		return rc;
	}

	if (vp->v_mount == NULL || vp->v_mount->mnt_devvp == NULL) {
		rc = ENODEV;
	} else {
		vnode_t device_vp = vp->v_mount->mnt_devvp;

		rc = VNOP_IOCTL(device_vp, DKIOCGETMAXSWAPWRITE, (caddr_t)freeze_daily_budget, 0, kctx);
	}

	/* Pairs with the successful vnode_getwithref() above. */
	vnode_put(vp);

	return rc;
}
#endif /* CONFIG_FREEZE */
422 | |
423 | int |
424 | vm_swap_vol_get_capacity(const char *volume_name, uint64_t *capacity) |
425 | { |
426 | vfs_context_t ctx = vfs_context_kernel(); |
427 | vnode_t vp = NULL, devvp = NULL; |
428 | uint64_t block_size = 0; |
429 | uint64_t block_count = 0; |
430 | int error = 0; |
431 | *capacity = 0; |
432 | |
433 | if ((error = vnode_open(path: volume_name, FREAD, cmode: 0, flags: 0, vpp: &vp, ctx))) { |
434 | printf("Unable to open swap volume\n" ); |
435 | return error; |
436 | } |
437 | |
438 | devvp = vp->v_mount->mnt_devvp; |
439 | if ((error = VNOP_IOCTL(vp: devvp, DKIOCGETBLOCKSIZE, data: (caddr_t)&block_size, fflag: 0, ctx))) { |
440 | printf("Unable to get swap volume block size\n" ); |
441 | goto out; |
442 | } |
443 | if ((error = VNOP_IOCTL(vp: devvp, DKIOCGETBLOCKCOUNT, data: (caddr_t)&block_count, fflag: 0, ctx))) { |
444 | printf("Unable to get swap volume block count\n" ); |
445 | goto out; |
446 | } |
447 | |
448 | *capacity = block_count * block_size; |
449 | out: |
450 | error = vnode_close(vp, flags: 0, ctx); |
451 | return error; |
452 | } |
453 | |