/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phantom_cache.h>
#include <vm/vm_compressor.h>


uint32_t phantom_cache_eval_period_in_msecs = 250;
uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
#if CONFIG_EMBEDDED
uint32_t phantom_cache_thrashing_threshold = 500;
#else
uint32_t phantom_cache_thrashing_threshold = 50;
#endif
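/*
 * The thresholds above are counts of ghost pages per evaluation period
 * (phantom_cache_eval_period_in_msecs); is_thrashing() compares each
 * period's added/found counts against them.  A separate, higher threshold
 * is used for SSD-backed objects since the number of re-reads that
 * constitutes thrashing depends on the read latency of the backing media.
 */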

/*
 * Number of consecutive thrashing periods required before
 * vm_phantom_cache_check_pressure() returns true.
 */
#if CONFIG_EMBEDDED
unsigned phantom_cache_contiguous_periods = 4;
#else
unsigned phantom_cache_contiguous_periods = 2;
#endif

clock_sec_t pc_start_of_eval_period_sec = 0;
clock_nsec_t pc_start_of_eval_period_nsec = 0;
boolean_t pc_need_eval_reset = FALSE;

/* One bit per recent sampling period. Bit 0 = current period. */
uint32_t pc_history = 0;

uint32_t sample_period_ghost_added_count = 0;
uint32_t sample_period_ghost_added_count_ssd = 0;
uint32_t sample_period_ghost_found_count = 0;
uint32_t sample_period_ghost_found_count_ssd = 0;

uint32_t vm_phantom_object_id = 1;
#define VM_PHANTOM_OBJECT_ID_AFTER_WRAP 1000000
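/*
 * A phantom_object_id of 0 means "no ghost entries for this object", so when
 * the 32-bit id counter wraps we restart it at a non-zero value rather than 0.
 */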

vm_ghost_t vm_phantom_cache;
uint32_t vm_phantom_cache_nindx = 1;
uint32_t vm_phantom_cache_num_entries = 0;
uint32_t vm_phantom_cache_size;

typedef uint32_t vm_phantom_hash_entry_t;
vm_phantom_hash_entry_t *vm_phantom_cache_hash;
uint32_t vm_phantom_cache_hash_size;
uint32_t vm_ghost_hash_mask;	/* Mask for hash function */
uint32_t vm_ghost_bucket_hash;	/* Basic bucket hash */


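/*
 * Each vm_ghost_t covers VM_GHOST_PAGES_PER_ENTRY consecutive pages of an
 * object (four, given the masks below); g_pages_held keeps one bit per page.
 * pg_masks maps a page's position within its entry, i.e.
 * (vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK, to the corresponding bit.
 */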
int pg_masks[4] = {
	0x1, 0x2, 0x4, 0x8
};


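/*
 * Hash an (object id, ghost offset) pair into the phantom hash table:
 * multiply the object id by vm_ghost_bucket_hash (an odd multiplier derived
 * from the table size in vm_phantom_cache_init()), add the offset XORed with
 * the same multiplier, and mask the result down to the table size.
 */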
#define vm_phantom_hash(obj_id, offset) (\
	( (natural_t)((uintptr_t)obj_id * vm_ghost_bucket_hash) + (offset ^ vm_ghost_bucket_hash)) & vm_ghost_hash_mask)


struct phantom_cache_stats {
	uint32_t pcs_wrapped;
	uint32_t pcs_added_page_to_entry;
	uint32_t pcs_added_new_entry;
	uint32_t pcs_replaced_entry;

	uint32_t pcs_lookup_found_page_in_cache;
	uint32_t pcs_lookup_entry_not_in_cache;
	uint32_t pcs_lookup_page_not_in_entry;

	uint32_t pcs_updated_phantom_state;
} phantom_cache_stats;
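/*
 * These counters are only ever incremented in this file; they are kept so the
 * cache's behavior can be inspected (e.g. from a debugger) rather than being
 * consulted by the pressure logic.
 */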



void
vm_phantom_cache_init()
{
	unsigned int num_entries;
	unsigned int log1;
	unsigned int size;

	if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE)
		return;
#if CONFIG_EMBEDDED
	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 10) / VM_GHOST_PAGES_PER_ENTRY);
#else
	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY);
#endif
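	/*
	 * Round the table size up to a power of two so that
	 * (vm_phantom_cache_num_entries - 1) can serve as vm_ghost_hash_mask.
	 */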
	vm_phantom_cache_num_entries = 1;

	while (vm_phantom_cache_num_entries < num_entries)
		vm_phantom_cache_num_entries <<= 1;

	vm_phantom_cache_size = sizeof(struct vm_ghost) * vm_phantom_cache_num_entries;
	vm_phantom_cache_hash_size = sizeof(vm_phantom_hash_entry_t) * vm_phantom_cache_num_entries;

	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache), vm_phantom_cache_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS)
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	bzero(vm_phantom_cache, vm_phantom_cache_size);

	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache_hash), vm_phantom_cache_hash_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS)
		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
	bzero(vm_phantom_cache_hash, vm_phantom_cache_hash_size);


	vm_ghost_hash_mask = vm_phantom_cache_num_entries - 1;

	/*
	 * Derive the hash multiplier (vm_ghost_bucket_hash) from log2 of the
	 * table size: combine a bit near the square root and a bit near the
	 * fourth root of the table size, and always set the low bit so the
	 * multiplier is odd.
	 */
	size = vm_phantom_cache_num_entries;
	for (log1 = 0; size > 1; log1++)
		size /= 2;

	vm_ghost_bucket_hash = 1 << ((log1 + 1) >> 1);	/* Get (ceiling of sqrt of table size) */
	vm_ghost_bucket_hash |= 1 << ((log1 + 1) >> 2);	/* Get (ceiling of quadroot of table size) */
	vm_ghost_bucket_hash |= 1;			/* Set low bit - must always be 1 to ensure a unique series */

	if (vm_ghost_hash_mask & vm_phantom_cache_num_entries)
		printf("vm_phantom_cache_init: WARNING -- strange page hash\n");
}


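/*
 * Record a ghost entry for page 'm' as it is evicted from the file cache.
 * Called with the page queues lock and the page's object lock held.  The
 * object is lazily assigned a phantom_object_id (and its SSD-ness noted)
 * the first time one of its pages is added here.
 */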
void
vm_phantom_cache_add_ghost(vm_page_t m)
{
	vm_ghost_t	vpce;
	vm_object_t	object;
	int		ghost_index;
	int		pg_mask;
	boolean_t	isSSD = FALSE;
	vm_phantom_hash_entry_t ghost_hash_index;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0)
		return;

	pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if (object->phantom_object_id == 0) {

		vnode_pager_get_isSSD(object->pager, &isSSD);

		if (isSSD == TRUE)
			object->phantom_isssd = TRUE;

		object->phantom_object_id = vm_phantom_object_id++;

		if (vm_phantom_object_id == 0)
			vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP;
	} else {
		if ( (vpce = vm_phantom_cache_lookup_ghost(m, 0)) ) {
			vpce->g_pages_held |= pg_mask;

			phantom_cache_stats.pcs_added_page_to_entry++;
			goto done;
		}
	}
	/*
	 * If we're here, the vm_ghost_t for this vm_page_t is not present in
	 * the phantom cache.  Claim the next available entry in the ring,
	 * first evicting the existing entry if we've wrapped around.
	 */
	ghost_index = vm_phantom_cache_nindx++;

	if (vm_phantom_cache_nindx == vm_phantom_cache_num_entries) {
		vm_phantom_cache_nindx = 1;

		phantom_cache_stats.pcs_wrapped++;
	}
	vpce = &vm_phantom_cache[ghost_index];

	if (vpce->g_obj_id) {
		/*
		 * we're going to replace an existing entry
		 * so first remove it from the hash
		 */
		vm_ghost_t nvpce;

		ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);

		nvpce = &vm_phantom_cache[vm_phantom_cache_hash[ghost_hash_index]];

		if (nvpce == vpce) {
			vm_phantom_cache_hash[ghost_hash_index] = vpce->g_next_index;
		} else {
			for (;;) {
				if (nvpce->g_next_index == 0)
					panic("didn't find ghost in hash\n");

				if (&vm_phantom_cache[nvpce->g_next_index] == vpce) {
					nvpce->g_next_index = vpce->g_next_index;
					break;
				}
				nvpce = &vm_phantom_cache[nvpce->g_next_index];
			}
		}
		phantom_cache_stats.pcs_replaced_entry++;
	} else
		phantom_cache_stats.pcs_added_new_entry++;

	vpce->g_pages_held = pg_mask;
	vpce->g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
	vpce->g_obj_id = object->phantom_object_id;

	ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);
	vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index];
	vm_phantom_cache_hash[ghost_hash_index] = ghost_index;

done:
	vm_pageout_vminfo.vm_phantom_cache_added_ghost++;

	if (object->phantom_isssd)
		OSAddAtomic(1, &sample_period_ghost_added_count_ssd);
	else
		OSAddAtomic(1, &sample_period_ghost_added_count);
}


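/*
 * Look up the ghost entry covering page 'm'.  With pg_mask == 0 this returns
 * any entry matching the page's object and ghost offset; with a non-zero
 * pg_mask it also requires that the specific page's bit still be held in the
 * entry.  Returns NULL if no matching entry (or bit) is found.
 */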
vm_ghost_t
vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask)
{
	uint64_t	g_obj_offset;
	uint32_t	g_obj_id;
	uint32_t	ghost_index;
	vm_object_t	object;

	object = VM_PAGE_OBJECT(m);

	if ((g_obj_id = object->phantom_object_id) == 0) {
		/*
		 * no entries in phantom cache for this object
		 */
		return (NULL);
	}
	g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;

	ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)];

	while (ghost_index) {
		vm_ghost_t	vpce;

		vpce = &vm_phantom_cache[ghost_index];

		if (vpce->g_obj_id == g_obj_id && vpce->g_obj_offset == g_obj_offset) {

			if (pg_mask == 0 || (vpce->g_pages_held & pg_mask)) {
				phantom_cache_stats.pcs_lookup_found_page_in_cache++;

				return (vpce);
			}
			phantom_cache_stats.pcs_lookup_page_not_in_entry++;

			return (NULL);
		}
		ghost_index = vpce->g_next_index;
	}
	phantom_cache_stats.pcs_lookup_entry_not_in_cache++;

	return (NULL);
}



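/*
 * Called when page 'm' is brought (back) into the file cache.  If the page is
 * still represented in the phantom cache, clear its bit and count the refault
 * in the current sample period; these counts feed
 * vm_phantom_cache_check_pressure().
 */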
void
vm_phantom_cache_update(vm_page_t m)
{
	int		pg_mask;
	vm_ghost_t	vpce;
	vm_object_t	object;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	if (vm_phantom_cache_num_entries == 0)
		return;

	pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if ( (vpce = vm_phantom_cache_lookup_ghost(m, pg_mask)) ) {

		vpce->g_pages_held &= ~pg_mask;

		phantom_cache_stats.pcs_updated_phantom_state++;
		vm_pageout_vminfo.vm_phantom_cache_found_ghost++;

		if (object->phantom_isssd)
			OSAddAtomic(1, &sample_period_ghost_found_count_ssd);
		else
			OSAddAtomic(1, &sample_period_ghost_found_count);
	}
}


#define PHANTOM_CACHE_DEBUG 1

#if PHANTOM_CACHE_DEBUG

int sample_period_ghost_counts_indx = 0;

struct {
	uint32_t added;
	uint32_t found;
	uint32_t added_ssd;
	uint32_t found_ssd;
	uint32_t elapsed_ms;
	boolean_t pressure_detected;
} sample_period_ghost_counts[256];

#endif

/*
 * Determine if the file cache is thrashing from sampling interval statistics.
 *
 * Pages added to the phantom cache = pages evicted from the file cache.
 * Pages found in the phantom cache = reads of pages that were recently evicted.
 * Threshold is the latency-dependent number of reads we consider thrashing.
 */
static boolean_t
is_thrashing(uint32_t added, uint32_t found, uint32_t threshold)
{
	/* Ignore normal activity below the threshold. */
	if (added < threshold || found < threshold)
		return FALSE;

	/*
	 * When thrashing in a way that we can mitigate, most of the pages read
	 * into the file cache were recently evicted, and 'found' will be close
	 * to 'added'.
	 *
	 * When replacing the current working set because a new app is
	 * launched, we see very high read traffic with sporadic phantom cache
	 * hits.
	 *
	 * This is not thrashing, or freeing up memory wouldn't help much
	 * anyway.
	 */
	if (found < added / 2)
		return FALSE;

	return TRUE;
}

/*
 * This function is never called from multiple threads simultaneously: a
 * condition variable at the compressor level serializes the callers, so no
 * locking is needed for the sample processing.
 */
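/*
 * Returns TRUE when the last phantom_cache_contiguous_periods sample periods
 * all showed thrashing (see pc_history) and the file cache currently holds no
 * more than half of available memory.
 */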
boolean_t
vm_phantom_cache_check_pressure()
{
	clock_sec_t	cur_ts_sec;
	clock_nsec_t	cur_ts_nsec;
	uint64_t	elapsed_msecs_in_eval;
	boolean_t	pressure_detected = FALSE;

	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);

	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, pc_start_of_eval_period_sec, pc_start_of_eval_period_nsec);

	/*
	 * Reset evaluation period after phantom_cache_eval_period_in_msecs or
	 * whenever vm_phantom_cache_restart_sample has been called.
	 */
	if (elapsed_msecs_in_eval >= phantom_cache_eval_period_in_msecs) {
		pc_need_eval_reset = TRUE;
	}

	if (pc_need_eval_reset == TRUE) {

#if PHANTOM_CACHE_DEBUG
		/*
		 * maintain some info about the last 256 sample periods
		 */
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added = sample_period_ghost_added_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found = sample_period_ghost_found_count;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].added_ssd = sample_period_ghost_added_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].found_ssd = sample_period_ghost_found_count_ssd;
		sample_period_ghost_counts[sample_period_ghost_counts_indx].elapsed_ms = (uint32_t)elapsed_msecs_in_eval;

		sample_period_ghost_counts_indx++;

		if (sample_period_ghost_counts_indx >= 256)
			sample_period_ghost_counts_indx = 0;
#endif
		sample_period_ghost_added_count = 0;
		sample_period_ghost_found_count = 0;
		sample_period_ghost_added_count_ssd = 0;
		sample_period_ghost_found_count_ssd = 0;

		pc_start_of_eval_period_sec = cur_ts_sec;
		pc_start_of_eval_period_nsec = cur_ts_nsec;
		pc_history <<= 1;
		pc_need_eval_reset = FALSE;
	} else {
		/*
		 * Since the thrashing rate is really a function of the read
		 * latency of the disk, we have to consider both the SSD and
		 * spinning-disk cases: the file cache could be backed by
		 * either, or even both, flavors.  When an object is first
		 * assigned a phantom_object_id, we query the pager to
		 * determine whether the backing media is an SSD and remember
		 * that answer in the vm_object.  We use that info to maintain
		 * separate counts for the SSD and spinning-disk cases.
		 */
		if (is_thrashing(sample_period_ghost_added_count,
				 sample_period_ghost_found_count,
				 phantom_cache_thrashing_threshold) ||
		    is_thrashing(sample_period_ghost_added_count_ssd,
				 sample_period_ghost_found_count_ssd,
				 phantom_cache_thrashing_threshold_ssd)) {
			/* Thrashing in the current period: Set bit 0. */
			pc_history |= 1;
		}
	}

	/*
	 * Declare pressure_detected after phantom_cache_contiguous_periods.
	 *
	 * Create a bitmask with the N low bits set. These bits must all be set
	 * in pc_history. The high bits of pc_history are ignored.
	 */
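	/*
	 * For example, with phantom_cache_contiguous_periods == 2 the bitmask
	 * is 0x3, so two consecutive thrashing periods are required.
	 */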
	uint32_t bitmask = (1u << phantom_cache_contiguous_periods) - 1;
	if ((pc_history & bitmask) == bitmask)
		pressure_detected = TRUE;

	if (vm_page_external_count > ((AVAILABLE_MEMORY) * 50) / 100)
		pressure_detected = FALSE;

#if PHANTOM_CACHE_DEBUG
	sample_period_ghost_counts[sample_period_ghost_counts_indx].pressure_detected = pressure_detected;
#endif
	return (pressure_detected);
}

/*
 * Restart the current sampling because conditions have changed significantly,
 * and we don't want to react to old data.
 *
 * This function can be called from any thread.
 */
void
vm_phantom_cache_restart_sample(void)
{
	pc_need_eval_reset = TRUE;
}