1 | /* |
2 | * Copyright (c) 2000-2016 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <stdint.h> |
30 | #include <sys/fcntl.h> |
31 | #include <sys/vnode_internal.h> |
32 | #include <sys/vnode.h> |
33 | #include <sys/kauth.h> |
34 | #include <sys/mount_internal.h> |
35 | #include <sys/buf_internal.h> |
36 | #include <kern/debug.h> |
37 | #include <kern/kalloc.h> |
38 | #include <sys/cprotect.h> |
39 | #include <sys/disk.h> |
40 | #include <vm/vm_protos.h> |
41 | #include <vm/vm_pageout.h> |
42 | #include <sys/content_protection.h> |
43 | |
/*
 * Forward declarations for the swap-file backing interface used by the
 * VM compressor.  Note that vm_swapfile_close() takes the pathname as a
 * uint64_t address so it can be handed to unlink1() via CAST_USER_ADDR_T().
 */
void vm_swapfile_open(const char *path, vnode_t *vp);
void vm_swapfile_close(uint64_t path, vnode_t vp);
int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin);
uint64_t vm_swapfile_get_blksize(vnode_t vp);
uint64_t vm_swapfile_get_transfer_size(vnode_t vp);
int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *);
int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size);

#if CONFIG_FREEZE
int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget);
#endif /* CONFIG_FREEZE */
55 | |
56 | |
/*
 * Open (creating/truncating as needed) the swap file at 'path' and return
 * its vnode through 'vp'.  On any failure *vp is set to NULL.  On success
 * the iocount taken by vnode_open() is dropped before returning.
 */
void
vm_swapfile_open(const char *path, vnode_t *vp)
{
	int error = 0;
	vfs_context_t ctx = vfs_context_kernel();

	if ((error = vnode_open(path, (O_CREAT | O_TRUNC | FREAD | FWRITE), S_IRUSR | S_IWUSR, 0, vp, ctx))) {
		printf("Failed to open swap file %d\n" , error);
		*vp = NULL;
		return;
	}

	/*
	 * If MNT_IOFLAGS_NOSWAP is set, opening the swap file should fail.
	 * To avoid a race on the mount we only make this check after creating the
	 * vnode.
	 */
	if ((*vp)->v_mount->mnt_kern_flag & MNTK_NOSWAP) {
		/*
		 * Drop the iocount from vnode_open() first;
		 * vm_swapfile_close() re-acquires its own reference
		 * (vnode_getwithref) before closing and unlinking.
		 */
		vnode_put(*vp);
		vm_swapfile_close((uint64_t)path, *vp);
		*vp = NULL;
		return;
	}

	/* Success: release the iocount held from vnode_open(). */
	vnode_put(*vp);
}
83 | |
84 | uint64_t |
85 | vm_swapfile_get_blksize(vnode_t vp) |
86 | { |
87 | return ((uint64_t)vfs_devblocksize(vnode_mount(vp))); |
88 | } |
89 | |
90 | uint64_t |
91 | vm_swapfile_get_transfer_size(vnode_t vp) |
92 | { |
93 | return((uint64_t)vp->v_mount->mnt_vfsstat.f_iosize); |
94 | } |
95 | |
/* Internal VFS unlink entry point (see bsd/vfs); used to delete swap files. */
int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
97 | |
98 | void |
99 | vm_swapfile_close(uint64_t path_addr, vnode_t vp) |
100 | { |
101 | vfs_context_t context = vfs_context_kernel(); |
102 | int error; |
103 | |
104 | vnode_getwithref(vp); |
105 | vnode_close(vp, 0, context); |
106 | |
107 | error = unlink1(context, NULLVP, CAST_USER_ADDR_T(path_addr), |
108 | UIO_SYSSPACE, 0); |
109 | |
110 | #if DEVELOPMENT || DEBUG |
111 | if (error) |
112 | printf("%s : unlink of %s failed with error %d" , __FUNCTION__, |
113 | (char *)path_addr, error); |
114 | #endif |
115 | } |
116 | |
/*
 * Grow the swap file to *size bytes (without zero-filling) and mark the
 * vnode as a swap file (VSWAP).  If *pin is TRUE, ask the filesystem to
 * pin the file's blocks in place (FIOPINSWAP); a pin failure is not fatal
 * and is reported back by clearing *pin.  With CONFIG_FREEZE the file is
 * also assigned data-protection class C.
 *
 * Returns 0 on success or an errno-style error from the failing VFS call.
 */
int
vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin)
{
	int error = 0;
	uint64_t file_size = 0;
	vfs_context_t ctx = NULL;
#if CONFIG_FREEZE
	struct vnode_attr va;
#endif /* CONFIG_FREEZE */

	ctx = vfs_context_kernel();

	/* IO_NOZEROFILL: swap contents are never read before being written. */
	error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx);

	if (error) {
		printf("vnode_setsize for swap files failed: %d\n" , error);
		goto done;
	}

	/* Read the size back to verify the filesystem honored the request. */
	error = vnode_size(vp, (off_t*) &file_size, ctx);

	if (error) {
		printf("vnode_size (new file) for swap file failed: %d\n" , error);
		goto done;
	}
	assert(file_size == *size);

	if (pin != NULL && *pin != FALSE) {
		error = VNOP_IOCTL(vp, FIOPINSWAP, NULL, 0, ctx);

		if (error) {
			printf("pin for swap files failed: %d, file_size = %lld\n" , error, file_size);
			/* this is not fatal, carry on with files wherever they landed */
			*pin = FALSE;
			error = 0;
		}
	}

	/* Tag the vnode as a swap file; spin variant suffices for a flag set. */
	vnode_lock_spin(vp);
	SET(vp->v_flag, VSWAP);
	vnode_unlock(vp);

#if CONFIG_FREEZE
	/* Class C: file accessible only after first user unlock. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C);
	error = VNOP_SETATTR(vp, &va, ctx);

	if (error) {
		printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n" , error);
		goto done;
	}
#endif /* CONFIG_FREEZE */

done:
	return error;
}
173 | |
174 | |
175 | int |
176 | vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size) |
177 | { |
178 | int error = 0; |
179 | vfs_context_t ctx; |
180 | |
181 | ctx = vfs_context_kernel(); |
182 | |
183 | error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, size, offset, |
184 | UIO_SYSSPACE, IO_NODELOCKED, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); |
185 | |
186 | return (error); |
187 | } |
188 | |
189 | |
190 | |
/*
 * Perform swap I/O of 'npages' pages between the kernel virtual range
 * starting at 'start' and the swap file 'vp' at byte 'offset'.
 * 'flags' selects direction (SWAP_READ set => read/pagein, else
 * write/pageout).  'upl_iodone' is an optional completion callback for
 * writes; when NULL the I/O is issued synchronously (UPL_IOSYNC).
 *
 * Returns 0 on success or the pagein/pageout error.
 */
int
vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone)
{
	int error = 0;
	uint64_t io_size = npages * PAGE_SIZE_64;
#if 1
	kern_return_t kr = KERN_SUCCESS;
	upl_t upl = NULL;
	unsigned int count = 0;
	upl_control_flags_t upl_create_flags = 0;
	int upl_control_flags = 0;
	upl_size_t upl_size = 0;

	upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE;

	/* No completion callback: make the I/O synchronous. */
	if (upl_iodone == NULL)
		upl_control_flags = UPL_IOSYNC;

#if ENCRYPTED_SWAP
	upl_control_flags |= UPL_PAGING_ENCRYPTED;
#endif

	if ((flags & SWAP_READ) == FALSE) {
		/* Writing to the swap file: pages are copied out of memory. */
		upl_create_flags |= UPL_COPYOUT_FROM;
	}

	/* Wrap the kernel range in a UPL for the pager operations below. */
	upl_size = io_size;
	kr = vm_map_create_upl( kernel_map,
				start,
				&upl_size,
				&upl,
				NULL,
				&count,
				&upl_create_flags,
				VM_KERN_MEMORY_OSFMK);

	/* A short or failed UPL would corrupt swap; treat it as fatal. */
	if (kr != KERN_SUCCESS || (upl_size != io_size)) {
		panic("vm_map_create_upl failed with %d\n" , kr);
	}

	if (flags & SWAP_READ) {
		vnode_pagein(vp,
			upl,
			0,
			offset,
			io_size,
			upl_control_flags | UPL_IGNORE_VALID_PAGE_CHECK,
			&error);
		if (error) {
#if DEBUG
			printf("vm_swapfile_io: vnode_pagein failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n" , error, vp, offset, io_size);
#else /* DEBUG */
			printf("vm_swapfile_io: vnode_pagein failed with %d.\n" , error);
#endif /* DEBUG */
		}

	} else {
		/* Attach the caller's completion routine (may be NULL). */
		upl_set_iodone(upl, upl_iodone);

		vnode_pageout(vp,
			upl,
			0,
			offset,
			io_size,
			upl_control_flags,
			&error);
		if (error) {
#if DEBUG
			printf("vm_swapfile_io: vnode_pageout failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n" , error, vp, offset, io_size);
#else /* DEBUG */
			printf("vm_swapfile_io: vnode_pageout failed with %d.\n" , error);
#endif /* DEBUG */
		}
	}
	return error;

#else /* 1 */
	/* Alternate (disabled) path: plain synchronous vn_rdwr() transfer. */
	vfs_context_t ctx;
	ctx = vfs_context_kernel();

	error = vn_rdwr((flags & SWAP_READ) ? UIO_READ : UIO_WRITE, vp, (caddr_t)start, io_size, offset,
		UIO_SYSSPACE, IO_SYNC | IO_NODELOCKED | IO_UNIT | IO_NOCACHE | IO_SWAP_DISPATCH, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));

	if (error) {
		printf("vn_rdwr: Swap I/O failed with %d\n" , error);
	}
	return error;
#endif /* 1 */
}
280 | |
281 | |
/* Maximum number of extents batched into a single unmap ioctl. */
#define MAX_BATCH_TO_TRIM 256

#define ROUTE_ONLY 0x10 /* if corestorage is present, tell it to just pass */
			/* the DKIOUNMAP command through w/o acting on it */
			/* this is used by the compressed swap system to reclaim empty space */
288 | |
/*
 * Issue unmap/TRIM requests to the underlying device for every logical
 * range in the trim_list 'tl'.  Each range is translated to physical
 * extents via VNOP_BLOCKMAP and the extents are batched (up to
 * MAX_BATCH_TO_TRIM per ioctl) into DKIOCUNMAP — or _DKIOCCSUNMAP when
 * CoreStorage unmap is supported, optionally with ROUTE_ONLY when
 * 'route_only' is TRUE.
 *
 * Returns 0 on success, ENOTSUP if the mount does not support unmap,
 * or the first VNOP_BLOCKMAP/VNOP_IOCTL error encountered.
 */
u_int32_t vnode_trim_list (vnode_t vp, struct trim_list *tl, boolean_t route_only)
{
	int error = 0;
	int trim_index = 0;
	u_int32_t blocksize = 0;
	struct vnode *devvp;
	dk_extent_t *extents;
	dk_unmap_t unmap;
	_dk_cs_unmap_t cs_unmap;

	if ( !(vp->v_mount->mnt_ioflags & MNT_IOFLAGS_UNMAP_SUPPORTED))
		return (ENOTSUP);

	if (tl == NULL)
		return (0);

	/*
	 * Get the underlying device vnode and physical block size
	 */
	devvp = vp->v_mount->mnt_devvp;
	blocksize = vp->v_mount->mnt_devblocksize;

	/* Extent batch buffer, shared by both ioctl variants below. */
	extents = kalloc(sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);

	if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
		memset (&cs_unmap, 0, sizeof(_dk_cs_unmap_t));
		cs_unmap.extents = extents;

		if (route_only == TRUE)
			cs_unmap.options = ROUTE_ONLY;
	} else {
		memset (&unmap, 0, sizeof(dk_unmap_t));
		unmap.extents = extents;
	}

	while (tl) {
		daddr64_t	io_blockno;	/* Block number corresponding to the start of the extent */
		size_t		io_bytecount;	/* Number of bytes in current extent for the specified range */
		size_t		trimmed;
		size_t		remaining_length;
		off_t		current_offset;

		current_offset = tl->tl_offset;
		remaining_length = tl->tl_length;
		trimmed = 0;

		/*
		 * We may not get the entire range from tl_offset -> tl_offset+tl_length in a single
		 * extent from the blockmap call.  Keep looping/going until we are sure we've hit
		 * the whole range or if we encounter an error.
		 */
		while (trimmed < tl->tl_length) {
			/*
			 * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
			 * specified offset.  It returns blocks in contiguous chunks, so if the logical range is
			 * broken into multiple extents, it must be called multiple times, increasing the offset
			 * in each call to ensure that the entire range is covered.
			 */
			error = VNOP_BLOCKMAP (vp, current_offset, remaining_length,
					       &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL);

			if (error) {
				goto trim_exit;
			}
			/* io_blockno == -1 marks a hole: skip it but still advance. */
			if (io_blockno != -1) {
				extents[trim_index].offset = (uint64_t) io_blockno * (u_int64_t) blocksize;
				extents[trim_index].length = io_bytecount;

				trim_index++;
			}
			/* Batch full: flush the accumulated extents to the device. */
			if (trim_index == MAX_BATCH_TO_TRIM) {

				if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
					cs_unmap.extentsCount = trim_index;
					error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
				} else {
					unmap.extentsCount = trim_index;
					error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
				}
				if (error) {
					goto trim_exit;
				}
				trim_index = 0;
			}
			trimmed += io_bytecount;
			current_offset += io_bytecount;
			remaining_length -= io_bytecount;
		}
		tl = tl->tl_next;
	}
	/* Flush any partially-filled final batch. */
	if (trim_index) {
		if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
			cs_unmap.extentsCount = trim_index;
			error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
		} else {
			unmap.extentsCount = trim_index;
			error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
		}
	}
trim_exit:
	kfree(extents, sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);

	return error;
}
393 | |
394 | #if CONFIG_FREEZE |
395 | int |
396 | vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget) |
397 | { |
398 | vnode_t devvp = NULL; |
399 | vfs_context_t ctx = vfs_context_kernel(); |
400 | errno_t err = 0; |
401 | |
402 | devvp = vp->v_mount->mnt_devvp; |
403 | |
404 | err = VNOP_IOCTL(devvp, DKIOCGETMAXSWAPWRITE, (caddr_t)freeze_daily_budget, 0, ctx); |
405 | |
406 | return err; |
407 | } |
408 | #endif /* CONFIG_FREEZE */ |
409 | |