1 | /* |
2 | * Copyright (c) 2016-2020 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <sys/fsctl.h> |
30 | #include <stdbool.h> |
31 | #include <sys/time.h> |
32 | #include <sys/buf.h> |
33 | #include <sys/mount_internal.h> |
34 | #include <sys/vnode_internal.h> |
35 | #include <sys/buf_internal.h> |
36 | |
37 | #include <kern/kalloc.h> |
38 | |
39 | #include <sys/kauth.h> |
40 | #include <IOKit/IOBSD.h> |
41 | |
42 | #include <vfs/vfs_disk_conditioner.h> |
43 | |
// entitlement the calling task must hold to change conditioner settings
#define DISK_CONDITIONER_SET_ENTITLEMENT "com.apple.private.dmc.set"

// number of total blocks for a mount: (f_blocks * f_bsize) / mnt_devblocksize.
// `mp` is parenthesized so any pointer expression may be passed; note that it
// is still evaluated more than once, so avoid arguments with side effects.
#define BLK_MAX(mp) \
	((((mp)->mnt_vfsstat.f_blocks) * ((mp)->mnt_vfsstat.f_bsize)) / ((mp)->mnt_devblocksize))

// approx. time to spin up an idle HDD
#define DISK_SPINUP_SEC (8)

// idle period until assumed disk spin down
#define DISK_IDLE_SEC (10 * 60)
54 | |
/*
 * Snapshot of the mount_t I/O limits taken the first time the conditioner is
 * configured for a mount, so they can be written back when it is disabled.
 */
struct saved_mount_fields {
	uint32_t        mnt_maxreadcnt;         // largest byte count for a read
	uint32_t        mnt_maxwritecnt;        // largest byte count for a write
	uint32_t        mnt_segreadcnt;         // largest segment count for a read
	uint32_t        mnt_segwritecnt;        // largest segment count for a write
	uint32_t        mnt_ioqueue_depth;      // maximum number of commands the device can accept
	uint32_t        mnt_ioscale;            // scaling applied to the throttles/limits on in-flight I/O
};
63 | |
64 | struct _disk_conditioner_info_t { |
65 | disk_conditioner_info dcinfo; // all the original data from fsctl |
66 | struct saved_mount_fields mnt_fields; // fields to restore in mount_t when conditioner is disabled |
67 | |
68 | daddr64_t last_blkno; // approx. last transfered block for simulating seek times |
69 | struct timeval last_io_timestamp; // the last time an I/O completed |
70 | }; |
71 | |
72 | void disk_conditioner_delay(buf_t, int, int, uint64_t); |
73 | void disk_conditioner_unmount(mount_t mp); |
74 | |
75 | extern void throttle_info_mount_reset_period(mount_t, int isssd); |
76 | |
/*
 * Map a 0..1 seek-distance ratio onto a weight using y = (x - 1)^3 + 1.
 *
 * The curve climbs steeply right after 0, so even short seeks get a
 * noticeable share of the access time (a kind of minimum latency). A
 * logarithmic curve was the original preference, but no math library is
 * available here, hence the cubic approximation.
 */
static double
weighted_scale_factor(double scale)
{
	const double shifted = scale - 1;
	const double cubed = shifted * shifted * shifted;

	return cubed + 1;
}
86 | |
87 | void |
88 | disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_elapsed_usec) |
89 | { |
90 | mount_t mp; |
91 | uint64_t delay_usec; |
92 | daddr64_t blkdiff; |
93 | daddr64_t last_blkno; |
94 | double access_time_scale; |
95 | struct _disk_conditioner_info_t *internal_info = NULL; |
96 | disk_conditioner_info *info = NULL; |
97 | struct timeval elapsed; |
98 | struct timeval start; |
99 | vnode_t vp; |
100 | |
101 | vp = buf_vnode(bp); |
102 | if (!vp) { |
103 | return; |
104 | } |
105 | |
106 | mp = vp->v_mount; |
107 | if (!mp) { |
108 | return; |
109 | } |
110 | |
111 | internal_info = mp->mnt_disk_conditioner_info; |
112 | if (!internal_info || !internal_info->dcinfo.enabled) { |
113 | return; |
114 | } |
115 | info = &(internal_info->dcinfo); |
116 | |
117 | if (!info->is_ssd) { |
118 | // calculate approximate seek time based on difference in block number |
119 | last_blkno = internal_info->last_blkno; |
120 | blkdiff = bp->b_blkno > last_blkno ? bp->b_blkno - last_blkno : last_blkno - bp->b_blkno; |
121 | internal_info->last_blkno = bp->b_blkno + bp->b_bcount; |
122 | } else { |
123 | blkdiff = BLK_MAX(mp); |
124 | } |
125 | |
126 | // scale access time by (distance in blocks from previous I/O / maximum blocks) |
127 | access_time_scale = weighted_scale_factor(scale: (double)blkdiff / (double)BLK_MAX(mp)); |
128 | if (__builtin_isnan(access_time_scale)) { |
129 | return; |
130 | } |
131 | // most cases should pass in extents==1 for optimal delay calculation, otherwise just multiply delay by extents |
132 | double temp = (((double)extents * (double)info->access_time_usec) * access_time_scale); |
133 | if (temp <= 0) { |
134 | delay_usec = 0; |
135 | } else if (temp >= (double)(18446744073709549568ULL)) { /* highest 64-bit unsigned integer representable as a double */ |
136 | delay_usec = UINT64_MAX; |
137 | } else { |
138 | delay_usec = (uint64_t)temp; |
139 | } |
140 | |
141 | if (info->read_throughput_mbps && (bp->b_flags & B_READ)) { |
142 | delay_usec += (uint64_t)(total_size / ((double)(info->read_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC)); |
143 | } else if (info->write_throughput_mbps && !(bp->b_flags & B_READ)) { |
144 | delay_usec += (uint64_t)(total_size / ((double)(info->write_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC)); |
145 | } |
146 | |
147 | // try simulating disk spinup based on time since last I/O |
148 | if (!info->is_ssd) { |
149 | microuptime(tv: &elapsed); |
150 | timevalsub(t1: &elapsed, t2: &internal_info->last_io_timestamp); |
151 | // avoid this delay right after boot (assuming last_io_timestamp is 0 and disk is already spinning) |
152 | if (elapsed.tv_sec > DISK_IDLE_SEC && internal_info->last_io_timestamp.tv_sec != 0) { |
153 | delay_usec += DISK_SPINUP_SEC * USEC_PER_SEC; |
154 | } |
155 | } |
156 | |
157 | if (delay_usec <= already_elapsed_usec) { |
158 | microuptime(tv: &internal_info->last_io_timestamp); |
159 | return; |
160 | } |
161 | |
162 | delay_usec -= already_elapsed_usec; |
163 | |
164 | while (delay_usec) { |
165 | microuptime(tv: &start); |
166 | assert(delay_usec <= INT_MAX); |
167 | delay(usec: (int)delay_usec); |
168 | microuptime(tv: &elapsed); |
169 | timevalsub(t1: &elapsed, t2: &start); |
170 | if (elapsed.tv_sec * USEC_PER_SEC < delay_usec) { |
171 | delay_usec -= elapsed.tv_sec * USEC_PER_SEC; |
172 | } else { |
173 | break; |
174 | } |
175 | if ((uint64_t)elapsed.tv_usec < delay_usec) { |
176 | delay_usec -= elapsed.tv_usec; |
177 | } else { |
178 | break; |
179 | } |
180 | } |
181 | |
182 | microuptime(tv: &internal_info->last_io_timestamp); |
183 | } |
184 | |
185 | int |
186 | disk_conditioner_get_info(mount_t mp, disk_conditioner_info *uinfo) |
187 | { |
188 | struct _disk_conditioner_info_t *info; |
189 | |
190 | if (!mp) { |
191 | return EINVAL; |
192 | } |
193 | |
194 | info = mp->mnt_disk_conditioner_info; |
195 | |
196 | if (info) { |
197 | memcpy(dst: uinfo, src: &(info->dcinfo), n: sizeof(disk_conditioner_info)); |
198 | } |
199 | |
200 | return 0; |
201 | } |
202 | |
203 | static inline void |
204 | disk_conditioner_restore_mount_fields(mount_t mp, struct saved_mount_fields *mnt_fields) |
205 | { |
206 | mp->mnt_maxreadcnt = mnt_fields->mnt_maxreadcnt; |
207 | mp->mnt_maxwritecnt = mnt_fields->mnt_maxwritecnt; |
208 | mp->mnt_segreadcnt = mnt_fields->mnt_segreadcnt; |
209 | mp->mnt_segwritecnt = mnt_fields->mnt_segwritecnt; |
210 | mp->mnt_ioqueue_depth = mnt_fields->mnt_ioqueue_depth; |
211 | mp->mnt_ioscale = mnt_fields->mnt_ioscale; |
212 | } |
213 | |
214 | int |
215 | disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo) |
216 | { |
217 | struct _disk_conditioner_info_t *internal_info; |
218 | disk_conditioner_info *info; |
219 | struct saved_mount_fields *mnt_fields; |
220 | |
221 | if (!kauth_cred_issuser(cred: kauth_cred_get()) || !IOCurrentTaskHasEntitlement(DISK_CONDITIONER_SET_ENTITLEMENT)) { |
222 | return EPERM; |
223 | } |
224 | |
225 | if (!mp) { |
226 | return EINVAL; |
227 | } |
228 | |
229 | mount_lock(mp); |
230 | |
231 | internal_info = mp->mnt_disk_conditioner_info; |
232 | if (!internal_info) { |
233 | internal_info = kalloc_type(struct _disk_conditioner_info_t, |
234 | Z_WAITOK | Z_ZERO); |
235 | mp->mnt_disk_conditioner_info = internal_info; |
236 | mnt_fields = &(internal_info->mnt_fields); |
237 | |
238 | /* save mount_t fields for restoration later */ |
239 | mnt_fields->mnt_maxreadcnt = mp->mnt_maxreadcnt; |
240 | mnt_fields->mnt_maxwritecnt = mp->mnt_maxwritecnt; |
241 | mnt_fields->mnt_segreadcnt = mp->mnt_segreadcnt; |
242 | mnt_fields->mnt_segwritecnt = mp->mnt_segwritecnt; |
243 | mnt_fields->mnt_ioqueue_depth = mp->mnt_ioqueue_depth; |
244 | mnt_fields->mnt_ioscale = mp->mnt_ioscale; |
245 | } |
246 | |
247 | info = &(internal_info->dcinfo); |
248 | mnt_fields = &(internal_info->mnt_fields); |
249 | |
250 | if (!uinfo->enabled && info->enabled) { |
251 | /* disk conditioner is being disabled when already enabled */ |
252 | disk_conditioner_restore_mount_fields(mp, mnt_fields); |
253 | } |
254 | |
255 | memcpy(dst: info, src: uinfo, n: sizeof(disk_conditioner_info)); |
256 | |
257 | /* scale back based on hardware advertised limits */ |
258 | if (uinfo->ioqueue_depth == 0 || uinfo->ioqueue_depth > mnt_fields->mnt_ioqueue_depth) { |
259 | info->ioqueue_depth = mnt_fields->mnt_ioqueue_depth; |
260 | } |
261 | if (uinfo->maxreadcnt == 0 || uinfo->maxreadcnt > mnt_fields->mnt_maxreadcnt) { |
262 | info->maxreadcnt = mnt_fields->mnt_maxreadcnt; |
263 | } |
264 | if (uinfo->maxwritecnt == 0 || uinfo->maxwritecnt > mnt_fields->mnt_maxwritecnt) { |
265 | info->maxwritecnt = mnt_fields->mnt_maxwritecnt; |
266 | } |
267 | if (uinfo->segreadcnt == 0 || uinfo->segreadcnt > mnt_fields->mnt_segreadcnt) { |
268 | info->segreadcnt = mnt_fields->mnt_segreadcnt; |
269 | } |
270 | if (uinfo->segwritecnt == 0 || uinfo->segwritecnt > mnt_fields->mnt_segwritecnt) { |
271 | info->segwritecnt = mnt_fields->mnt_segwritecnt; |
272 | } |
273 | |
274 | if (uinfo->enabled) { |
275 | mp->mnt_maxreadcnt = info->maxreadcnt; |
276 | mp->mnt_maxwritecnt = info->maxwritecnt; |
277 | mp->mnt_segreadcnt = info->segreadcnt; |
278 | mp->mnt_segwritecnt = info->segwritecnt; |
279 | mp->mnt_ioqueue_depth = info->ioqueue_depth; |
280 | mp->mnt_ioscale = MNT_IOSCALE(info->ioqueue_depth); |
281 | } |
282 | |
283 | mount_unlock(mp); |
284 | |
285 | microuptime(tv: &internal_info->last_io_timestamp); |
286 | |
287 | // make sure throttling picks up the new periods |
288 | throttle_info_mount_reset_period(mp, isssd: info->is_ssd); |
289 | |
290 | return 0; |
291 | } |
292 | |
293 | void |
294 | disk_conditioner_unmount(mount_t mp) |
295 | { |
296 | struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info; |
297 | |
298 | if (!internal_info) { |
299 | return; |
300 | } |
301 | |
302 | if (internal_info->dcinfo.enabled) { |
303 | disk_conditioner_restore_mount_fields(mp, mnt_fields: &(internal_info->mnt_fields)); |
304 | } |
305 | mp->mnt_disk_conditioner_info = NULL; |
306 | kfree_type(struct _disk_conditioner_info_t, internal_info); |
307 | } |
308 | |
309 | boolean_t |
310 | disk_conditioner_mount_is_ssd(mount_t mp) |
311 | { |
312 | struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info; |
313 | |
314 | if (!internal_info || !internal_info->dcinfo.enabled) { |
315 | if (mp->mnt_kern_flag & MNTK_SSD) { |
316 | return TRUE; |
317 | } |
318 | return FALSE; |
319 | } |
320 | |
321 | return internal_info->dcinfo.is_ssd; |
322 | } |
323 | |