1/*
2 * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/fsctl.h>
30#include <stdbool.h>
31#include <sys/time.h>
32#include <sys/buf.h>
33#include <sys/mount_internal.h>
34#include <sys/vnode_internal.h>
35#include <sys/buf_internal.h>
36
37#include <kern/kalloc.h>
38
39#include <sys/kauth.h>
40#include <IOKit/IOBSD.h>
41
42#include <vfs/vfs_disk_conditioner.h>
43
// entitlement a task must hold before it may change disk conditioner settings
#define DISK_CONDITIONER_SET_ENTITLEMENT "com.apple.private.dmc.set"

// number of total blocks for a mount: (f_blocks * f_bsize) / mnt_devblocksize.
// The argument is parenthesized at every use so that any pointer expression
// (e.g. `base + 1` or `&array[i]`) expands correctly; note it is evaluated
// more than once, so it must not have side effects.
#define BLK_MAX(mp) ((((mp)->mnt_vfsstat.f_blocks) * ((mp)->mnt_vfsstat.f_bsize)) / ((mp)->mnt_devblocksize))

// approx. time to spin up an idle HDD, in seconds
#define DISK_SPINUP_SEC (8)

// idle period, in seconds, after which the disk is assumed to have spun down
#define DISK_IDLE_SEC (10 * 60)
54
/*
 * Snapshot of the mount fields that the conditioner overrides, kept so that
 * the original values can be written back later.
 */
struct saved_mount_fields {
	// maximum byte count for a read
	uint32_t mnt_maxreadcnt;
	// maximum byte count for a write
	uint32_t mnt_maxwritecnt;
	// maximum segment count for a read
	uint32_t mnt_segreadcnt;
	// maximum segment count for a write
	uint32_t mnt_segwritecnt;
	// maximum number of commands the device can accept
	uint32_t mnt_ioqueue_depth;
	// scale applied to the throttles/limits on the amount of I/O in flight
	uint32_t mnt_ioscale;
};
63
64struct _disk_conditioner_info_t {
65 disk_conditioner_info dcinfo; // all the original data from fsctl
66 struct saved_mount_fields mnt_fields; // fields to restore in mount_t when conditioner is disabled
67
68 daddr64_t last_blkno; // approx. last transfered block for simulating seek times
69 struct timeval last_io_timestamp; // the last time an I/O completed
70};
71
72void disk_conditioner_delay(buf_t, int, int, uint64_t);
73void disk_conditioner_unmount(mount_t mp);
74
75extern void throttle_info_mount_reset_period(mount_t, int isssd);
76
/*
 * Map a linear scale onto the cubic curve y = (x - 1)^3 + 1.
 *
 * For inputs between 0 and 1 the result is also between 0 and 1, but it climbs
 * steeply near 0, so small block differences are weighted more heavily (which
 * acts as a kind of minimum latency). A cubic is used because no math library
 * is available for a logarithm-based curve.
 */
static double
weighted_scale_factor(double scale)
{
	const double shifted = scale - 1.0;

	return (shifted * shifted) * shifted + 1.0;
}
86
87void
88disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_elapsed_usec)
89{
90 mount_t mp;
91 uint64_t delay_usec;
92 daddr64_t blkdiff;
93 daddr64_t last_blkno;
94 double access_time_scale;
95 struct _disk_conditioner_info_t *internal_info = NULL;
96 disk_conditioner_info *info = NULL;
97 struct timeval elapsed;
98 struct timeval start;
99 vnode_t vp;
100
101 vp = buf_vnode(bp);
102 if (!vp) {
103 return;
104 }
105
106 mp = vp->v_mount;
107 if (!mp) {
108 return;
109 }
110
111 internal_info = mp->mnt_disk_conditioner_info;
112 if (!internal_info || !internal_info->dcinfo.enabled) {
113 return;
114 }
115 info = &(internal_info->dcinfo);
116
117 if (!info->is_ssd) {
118 // calculate approximate seek time based on difference in block number
119 last_blkno = internal_info->last_blkno;
120 blkdiff = bp->b_blkno > last_blkno ? bp->b_blkno - last_blkno : last_blkno - bp->b_blkno;
121 internal_info->last_blkno = bp->b_blkno + bp->b_bcount;
122 } else {
123 blkdiff = BLK_MAX(mp);
124 }
125
126 // scale access time by (distance in blocks from previous I/O / maximum blocks)
127 access_time_scale = weighted_scale_factor(scale: (double)blkdiff / (double)BLK_MAX(mp));
128 if (__builtin_isnan(access_time_scale)) {
129 return;
130 }
131 // most cases should pass in extents==1 for optimal delay calculation, otherwise just multiply delay by extents
132 double temp = (((double)extents * (double)info->access_time_usec) * access_time_scale);
133 if (temp <= 0) {
134 delay_usec = 0;
135 } else if (temp >= (double)(18446744073709549568ULL)) { /* highest 64-bit unsigned integer representable as a double */
136 delay_usec = UINT64_MAX;
137 } else {
138 delay_usec = (uint64_t)temp;
139 }
140
141 if (info->read_throughput_mbps && (bp->b_flags & B_READ)) {
142 delay_usec += (uint64_t)(total_size / ((double)(info->read_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC));
143 } else if (info->write_throughput_mbps && !(bp->b_flags & B_READ)) {
144 delay_usec += (uint64_t)(total_size / ((double)(info->write_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC));
145 }
146
147 // try simulating disk spinup based on time since last I/O
148 if (!info->is_ssd) {
149 microuptime(tv: &elapsed);
150 timevalsub(t1: &elapsed, t2: &internal_info->last_io_timestamp);
151 // avoid this delay right after boot (assuming last_io_timestamp is 0 and disk is already spinning)
152 if (elapsed.tv_sec > DISK_IDLE_SEC && internal_info->last_io_timestamp.tv_sec != 0) {
153 delay_usec += DISK_SPINUP_SEC * USEC_PER_SEC;
154 }
155 }
156
157 if (delay_usec <= already_elapsed_usec) {
158 microuptime(tv: &internal_info->last_io_timestamp);
159 return;
160 }
161
162 delay_usec -= already_elapsed_usec;
163
164 while (delay_usec) {
165 microuptime(tv: &start);
166 assert(delay_usec <= INT_MAX);
167 delay(usec: (int)delay_usec);
168 microuptime(tv: &elapsed);
169 timevalsub(t1: &elapsed, t2: &start);
170 if (elapsed.tv_sec * USEC_PER_SEC < delay_usec) {
171 delay_usec -= elapsed.tv_sec * USEC_PER_SEC;
172 } else {
173 break;
174 }
175 if ((uint64_t)elapsed.tv_usec < delay_usec) {
176 delay_usec -= elapsed.tv_usec;
177 } else {
178 break;
179 }
180 }
181
182 microuptime(tv: &internal_info->last_io_timestamp);
183}
184
185int
186disk_conditioner_get_info(mount_t mp, disk_conditioner_info *uinfo)
187{
188 struct _disk_conditioner_info_t *info;
189
190 if (!mp) {
191 return EINVAL;
192 }
193
194 info = mp->mnt_disk_conditioner_info;
195
196 if (info) {
197 memcpy(dst: uinfo, src: &(info->dcinfo), n: sizeof(disk_conditioner_info));
198 }
199
200 return 0;
201}
202
203static inline void
204disk_conditioner_restore_mount_fields(mount_t mp, struct saved_mount_fields *mnt_fields)
205{
206 mp->mnt_maxreadcnt = mnt_fields->mnt_maxreadcnt;
207 mp->mnt_maxwritecnt = mnt_fields->mnt_maxwritecnt;
208 mp->mnt_segreadcnt = mnt_fields->mnt_segreadcnt;
209 mp->mnt_segwritecnt = mnt_fields->mnt_segwritecnt;
210 mp->mnt_ioqueue_depth = mnt_fields->mnt_ioqueue_depth;
211 mp->mnt_ioscale = mnt_fields->mnt_ioscale;
212}
213
214int
215disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo)
216{
217 struct _disk_conditioner_info_t *internal_info;
218 disk_conditioner_info *info;
219 struct saved_mount_fields *mnt_fields;
220
221 if (!kauth_cred_issuser(cred: kauth_cred_get()) || !IOCurrentTaskHasEntitlement(DISK_CONDITIONER_SET_ENTITLEMENT)) {
222 return EPERM;
223 }
224
225 if (!mp) {
226 return EINVAL;
227 }
228
229 mount_lock(mp);
230
231 internal_info = mp->mnt_disk_conditioner_info;
232 if (!internal_info) {
233 internal_info = kalloc_type(struct _disk_conditioner_info_t,
234 Z_WAITOK | Z_ZERO);
235 mp->mnt_disk_conditioner_info = internal_info;
236 mnt_fields = &(internal_info->mnt_fields);
237
238 /* save mount_t fields for restoration later */
239 mnt_fields->mnt_maxreadcnt = mp->mnt_maxreadcnt;
240 mnt_fields->mnt_maxwritecnt = mp->mnt_maxwritecnt;
241 mnt_fields->mnt_segreadcnt = mp->mnt_segreadcnt;
242 mnt_fields->mnt_segwritecnt = mp->mnt_segwritecnt;
243 mnt_fields->mnt_ioqueue_depth = mp->mnt_ioqueue_depth;
244 mnt_fields->mnt_ioscale = mp->mnt_ioscale;
245 }
246
247 info = &(internal_info->dcinfo);
248 mnt_fields = &(internal_info->mnt_fields);
249
250 if (!uinfo->enabled && info->enabled) {
251 /* disk conditioner is being disabled when already enabled */
252 disk_conditioner_restore_mount_fields(mp, mnt_fields);
253 }
254
255 memcpy(dst: info, src: uinfo, n: sizeof(disk_conditioner_info));
256
257 /* scale back based on hardware advertised limits */
258 if (uinfo->ioqueue_depth == 0 || uinfo->ioqueue_depth > mnt_fields->mnt_ioqueue_depth) {
259 info->ioqueue_depth = mnt_fields->mnt_ioqueue_depth;
260 }
261 if (uinfo->maxreadcnt == 0 || uinfo->maxreadcnt > mnt_fields->mnt_maxreadcnt) {
262 info->maxreadcnt = mnt_fields->mnt_maxreadcnt;
263 }
264 if (uinfo->maxwritecnt == 0 || uinfo->maxwritecnt > mnt_fields->mnt_maxwritecnt) {
265 info->maxwritecnt = mnt_fields->mnt_maxwritecnt;
266 }
267 if (uinfo->segreadcnt == 0 || uinfo->segreadcnt > mnt_fields->mnt_segreadcnt) {
268 info->segreadcnt = mnt_fields->mnt_segreadcnt;
269 }
270 if (uinfo->segwritecnt == 0 || uinfo->segwritecnt > mnt_fields->mnt_segwritecnt) {
271 info->segwritecnt = mnt_fields->mnt_segwritecnt;
272 }
273
274 if (uinfo->enabled) {
275 mp->mnt_maxreadcnt = info->maxreadcnt;
276 mp->mnt_maxwritecnt = info->maxwritecnt;
277 mp->mnt_segreadcnt = info->segreadcnt;
278 mp->mnt_segwritecnt = info->segwritecnt;
279 mp->mnt_ioqueue_depth = info->ioqueue_depth;
280 mp->mnt_ioscale = MNT_IOSCALE(info->ioqueue_depth);
281 }
282
283 mount_unlock(mp);
284
285 microuptime(tv: &internal_info->last_io_timestamp);
286
287 // make sure throttling picks up the new periods
288 throttle_info_mount_reset_period(mp, isssd: info->is_ssd);
289
290 return 0;
291}
292
293void
294disk_conditioner_unmount(mount_t mp)
295{
296 struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info;
297
298 if (!internal_info) {
299 return;
300 }
301
302 if (internal_info->dcinfo.enabled) {
303 disk_conditioner_restore_mount_fields(mp, mnt_fields: &(internal_info->mnt_fields));
304 }
305 mp->mnt_disk_conditioner_info = NULL;
306 kfree_type(struct _disk_conditioner_info_t, internal_info);
307}
308
309boolean_t
310disk_conditioner_mount_is_ssd(mount_t mp)
311{
312 struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info;
313
314 if (!internal_info || !internal_info->dcinfo.enabled) {
315 if (mp->mnt_kern_flag & MNTK_SSD) {
316 return TRUE;
317 }
318 return FALSE;
319 }
320
321 return internal_info->dcinfo.is_ssd;
322}
323