1 // SPDX-License-Identifier: MIT
3 * Copyright © 2020 Intel Corporation
6 #include "xe_migrate.h"
8 #include <linux/bitfield.h>
9 #include <linux/sizes.h>
11 #include <drm/drm_managed.h>
12 #include <drm/ttm/ttm_tt.h>
13 #include <drm/xe_drm.h>
15 #include "generated/xe_wa_oob.h"
16 #include "instructions/xe_mi_commands.h"
17 #include "regs/xe_gpu_commands.h"
18 #include "tests/xe_test.h"
19 #include "xe_assert.h"
22 #include "xe_exec_queue.h"
25 #include "xe_hw_engine.h"
30 #include "xe_res_cursor.h"
31 #include "xe_sched_job.h"
38 * struct xe_migrate - migrate context.
41 /** @q: Default exec queue used for migration */
42 struct xe_exec_queue *q;
43 /** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
45 /** @job_mutex: Timeline mutex for @q. */
46 struct mutex job_mutex;
47 /** @pt_bo: Page-table buffer object. */
49 /** @batch_base_ofs: VM offset of the migration batch buffer */
51 /** @usm_batch_base_ofs: VM offset of the usm batch buffer */
52 u64 usm_batch_base_ofs;
53 /** @cleared_mem_ofs: VM offset of a NULL-mapped region used as the source when clearing CCS metadata. */
56 * @fence: dma-fence representing the last migration job batch.
57 * Protected by @job_mutex.
59 struct dma_fence *fence;
61 * @vm_update_sa: For integrated, used to suballocate page-table updates out of @pt_bo
64 struct drm_suballoc_manager vm_update_sa;
65 /** @min_chunk_size: For dgfx, the minimum chunk size */
69 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
70 #define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
71 #define NUM_KERNEL_PDE 17
72 #define NUM_PT_SLOTS 32
73 #define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M
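/* A level-0 PT holds 512 PTEs; with 4 KiB pages that encodes 2 MiB of VM space */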
76 * xe_tile_migrate_engine() - Get this tile's migrate engine.
79 * Returns the default migrate engine of this tile.
80 * TODO: Perhaps this function is slightly misplaced, and even unneeded?
82 * Return: The default migrate engine
84 struct xe_exec_queue *xe_tile_migrate_engine(struct xe_tile *tile)
86 return tile->migrate->q;
89 static void xe_migrate_fini(struct drm_device *dev, void *arg)
91 struct xe_migrate *m = arg;
93 xe_vm_lock(m->q->vm, false);
94 xe_bo_unpin(m->pt_bo);
95 xe_vm_unlock(m->q->vm);
97 dma_fence_put(m->fence);
99 drm_suballoc_manager_fini(&m->vm_update_sa);
100 mutex_destroy(&m->job_mutex);
101 xe_vm_close_and_put(m->q->vm);
102 xe_exec_queue_put(m->q);
105 static u64 xe_migrate_vm_addr(u64 slot, u32 level)
107 XE_WARN_ON(slot >= NUM_PT_SLOTS);
109 /* First slot is reserved for mapping of PT bo and bb, start from 1 */
110 return (slot + 1ULL) << xe_pt_shift(level + 1);
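/*
 * Illustrative example (assuming 4 KiB GPU pages, where xe_pt_shift(1) == 21):
 * a level-0 PT in slot 0 is addressed at (0 + 1) << 21 == 2 MiB, slot 1 at
 * 4 MiB, and so on, keeping the first 2 MiB of the VM reserved for the PT BO
 * and batch buffer mapping.
 */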
113 static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
116 * Remove the DPA to get a correct offset into the identity table for the device memory.
119 addr -= xe->mem.vram.dpa_base;
120 return addr + (256ULL << xe_pt_shift(2));
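/*
 * Illustrative example (assuming xe_pt_shift(2) == 30): the identity map
 * starts at 256ULL << 30 == 256 GiB, so a device physical address of
 * dpa_base + X translates to VM address 256 GiB + X, matching the identity
 * mapping of vram set up in xe_migrate_prepare_vm() below.
 */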
123 static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
126 struct xe_device *xe = tile_to_xe(tile);
127 u16 pat_index = xe->pat.idx[XE_CACHE_WB];
129 u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
130 u32 map_ofs, level, i;
131 struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
134 /* Can't bump NUM_PT_SLOTS too high */
135 BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
136 /* Must be a multiple of 64K to support all platforms */
137 BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
138 /* And one slot reserved for the 4KiB page table updates */
139 BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));
141 /* Need to be sure everything fits in the first PT, or create more */
142 xe_tile_assert(tile, m->batch_base_ofs + batch->size < SZ_2M);
144 bo = xe_bo_create_pin_map(vm->xe, tile, vm,
145 num_entries * XE_PAGE_SIZE,
147 XE_BO_CREATE_VRAM_IF_DGFX(tile) |
148 XE_BO_CREATE_PINNED_BIT);
152 entry = vm->pt_ops->pde_encode_bo(bo, bo->size - XE_PAGE_SIZE, pat_index);
153 xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);
155 map_ofs = (num_entries - num_level) * XE_PAGE_SIZE;
157 /* Map the entire BO in our level 0 pt */
158 for (i = 0, level = 0; i < num_entries; level++) {
159 entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
162 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);
164 if (vm->flags & XE_VM_FLAG_64K)
171 /* Write out batch too */
172 m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
173 if (xe->info.has_usm) {
174 batch = tile->primary_gt->usm.bb_pool->bo;
175 m->usm_batch_base_ofs = m->batch_base_ofs;
178 for (i = 0; i < batch->size;
179 i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
181 entry = vm->pt_ops->pte_encode_bo(batch, i,
184 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
189 u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
191 m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
193 if (xe->info.has_usm) {
194 batch = tile->primary_gt->usm.bb_pool->bo;
195 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
196 m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
200 for (level = 1; level < num_level; level++) {
203 if (vm->flags & XE_VM_FLAG_64K && level == 1)
206 entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) *
207 XE_PAGE_SIZE, pat_index);
208 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
212 /* Write PDE's that point to our BO. */
213 for (i = 0; i < num_entries - num_level; i++) {
214 entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE,
217 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
218 (i + 1) * 8, u64, entry);
221 /* Set up a 1GiB NULL mapping at 255GiB offset. */
223 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
224 vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
226 m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));
228 /* Identity map the entire vram at 256GiB offset */
233 ofs = map_ofs + XE_PAGE_SIZE * level + 256 * 8;
234 flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
238 * Use 1GB pages. It doesn't matter that the physical amount of
239 * vram may be less, as long as we never access the excess range.
241 for (pos = xe->mem.vram.dpa_base;
242 pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base;
243 pos += SZ_1G, ofs += 8)
244 xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
248 * Example layout created above, with root level = 3:
249 * [PT0...PT7]: kernel PT's for copy/clear; 64 KiB or 4 KiB PTE's
250 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
251 * [PT9...PT28]: Userspace PT's for VM_BIND, 4 KiB PTE's
252 * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2]
254 * This makes the lowest part of the VM point to the pagetables.
255 * Hence the lowest 2M in the vm should point to itself. With a few writes
256 * and flushes, other parts of the VM can be used either for copying or clearing.
259 * For performance, the kernel reserves PDE's, so about 20 are left
260 * for async VM updates.
262 * To make things easier to work with, each scratch PT is put in slot (1 + PT #)
263 * everywhere; this allows lockless updates to scratch pages by using
264 * different addresses in the VM.
266 #define NUM_VMUSA_UNIT_PER_PAGE 32
267 #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
268 #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
269 drm_suballoc_manager_init(&m->vm_update_sa,
270 (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
271 NUM_VMUSA_UNIT_PER_PAGE, 0);
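/*
 * Suballocator sizing (illustrative, assuming 4 KiB XE_PAGE_SIZE): each PT
 * page remaining after the NUM_KERNEL_PDE reserved ones is split into 32
 * units of XE_PAGE_SIZE / 32 == 128 bytes, i.e. 16 qword PTE writes per
 * suballocated unit.
 */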
278 * Due to workaround 16017236439, odd instance hardware copy engines are
279 * faster than even instance ones.
280 * This function returns the mask involving all fast copy engines and the
281 * reserved copy engine, to be used as the logical mask for the migrate engine.
282 * Including the reserved copy engine is required to avoid deadlocks due to
283 * migrate jobs servicing the faults getting stuck behind the job that faulted.
285 static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
287 u32 logical_mask = 0;
288 struct xe_hw_engine *hwe;
289 enum xe_hw_engine_id id;
291 for_each_hw_engine(hwe, gt, id) {
292 if (hwe->class != XE_ENGINE_CLASS_COPY)
295 if (!XE_WA(gt, 16017236439) ||
296 xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1)
297 logical_mask |= BIT(hwe->logical_instance);
304 * xe_migrate_init() - Initialize a migrate context
305 * @tile: Back-pointer to the tile we're initializing for.
307 * Return: Pointer to a migrate context on success. Error pointer on error.
309 struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
311 struct xe_device *xe = tile_to_xe(tile);
312 struct xe_gt *primary_gt = tile->primary_gt;
313 struct xe_migrate *m;
317 m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL);
319 return ERR_PTR(-ENOMEM);
323 /* Special layout, prepared below.. */
324 vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
325 XE_VM_FLAG_SET_TILE_ID(tile));
329 xe_vm_lock(vm, false);
330 err = xe_migrate_prepare_vm(tile, m, vm);
333 xe_vm_close_and_put(vm);
337 if (xe->info.has_usm) {
338 struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
339 XE_ENGINE_CLASS_COPY,
340 primary_gt->usm.reserved_bcs_instance,
342 u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);
344 if (!hwe || !logical_mask)
345 return ERR_PTR(-EINVAL);
347 m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
348 EXEC_QUEUE_FLAG_KERNEL |
349 EXEC_QUEUE_FLAG_PERMANENT |
350 EXEC_QUEUE_FLAG_HIGH_PRIORITY);
352 m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
353 XE_ENGINE_CLASS_COPY,
354 EXEC_QUEUE_FLAG_KERNEL |
355 EXEC_QUEUE_FLAG_PERMANENT);
358 xe_vm_close_and_put(vm);
359 return ERR_CAST(m->q);
362 mutex_init(&m->job_mutex);
364 err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
369 if (xe_device_has_flat_ccs(xe))
370 /* min chunk size corresponds to 4K of CCS Metadata */
371 m->min_chunk_size = SZ_4K * SZ_64K /
372 xe_device_ccs_bytes(xe, SZ_64K);
374 /* Somewhat arbitrary, to avoid a huge number of blits */
375 m->min_chunk_size = SZ_64K;
376 m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
377 drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
378 (unsigned long long)m->min_chunk_size);
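/*
 * Illustrative example (assuming a 1:256 main-memory-to-CCS ratio, i.e.
 * xe_device_ccs_bytes(xe, SZ_64K) == 256): min_chunk_size becomes
 * 4096 * 65536 / 256 == 1 MiB, so each 1 MiB of main memory corresponds to a
 * page-aligned 4 KiB of CCS metadata.
 */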
384 static u64 max_mem_transfer_per_pass(struct xe_device *xe)
386 if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe))
387 return MAX_CCS_LIMITED_TRANSFER;
389 return MAX_PREEMPTDISABLE_TRANSFER;
392 static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
394 struct xe_device *xe = tile_to_xe(m->tile);
395 u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);
397 if (mem_type_is_vram(cur->mem_type)) {
399 * VRAM we want to blit in chunks with sizes aligned to
400 * min_chunk_size in order for the offset to CCS metadata to be
401 * page-aligned. If it's the last chunk it may be smaller.
403 * Another constraint is that we need to limit the blit to
404 * the VRAM block size, unless size is smaller than
407 u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
409 size = min_t(u64, size, chunk);
410 if (size > m->min_chunk_size)
411 size = round_down(size, m->min_chunk_size);
417 static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
419 /* If the chunk is not fragmented, allow identity map. */
420 return cur->size >= size;
423 static u32 pte_update_size(struct xe_migrate *m,
425 struct ttm_resource *res,
426 struct xe_res_cursor *cur,
427 u64 *L0, u64 *L0_ofs, u32 *L0_pt,
428 u32 cmd_size, u32 pt_ofs, u32 avail_pts)
433 if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
434 /* Offset into identity map. */
435 *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
436 cur->start + vram_region_gpu_offset(res));
439 /* Clip L0 to available size */
440 u64 size = min(*L0, (u64)avail_pts * SZ_2M);
441 u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
444 *L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);
446 /* MI_STORE_DATA_IMM */
447 cmds += 3 * DIV_ROUND_UP(num_4k_pages, 0x1ff);
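/* One qword (two dwords) of PTE payload per 4 KiB page */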
450 cmds += num_4k_pages * 2;
452 /* Each chunk has a single blit command */
459 static void emit_pte(struct xe_migrate *m,
460 struct xe_bb *bb, u32 at_pt,
461 bool is_vram, bool is_comp_pte,
462 struct xe_res_cursor *cur,
463 u32 size, struct ttm_resource *res)
465 struct xe_device *xe = tile_to_xe(m->tile);
466 struct xe_vm *vm = m->q->vm;
469 u64 ofs = at_pt * XE_PAGE_SIZE;
472 /* Indirect access needs a compression-enabled, uncached PAT index */
473 if (GRAPHICS_VERx100(xe) >= 2000)
474 pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] :
475 xe->pat.idx[XE_CACHE_NONE];
477 pat_index = xe->pat.idx[XE_CACHE_WB];
479 ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
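/* Emit the PTEs in chunks of at most 0x1ff qwords per MI_STORE_DATA_IMM */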
482 u32 chunk = min(0x1ffU, ptes);
484 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
485 bb->cs[bb->len++] = ofs;
486 bb->cs[bb->len++] = 0;
496 addr = xe_res_dma(cur) & PAGE_MASK;
498 if (vm->flags & XE_VM_FLAG_64K) {
499 u64 va = cur_ofs * XE_PAGE_SIZE / 8;
501 xe_assert(xe, (va & (SZ_64K - 1)) ==
502 (addr & (SZ_64K - 1)));
504 flags |= XE_PTE_PS64;
507 addr += vram_region_gpu_offset(res);
511 addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
514 bb->cs[bb->len++] = lower_32_bits(addr);
515 bb->cs[bb->len++] = upper_32_bits(addr);
517 xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
523 #define EMIT_COPY_CCS_DW 5
524 static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
525 u64 dst_ofs, bool dst_is_indirect,
526 u64 src_ofs, bool src_is_indirect,
529 struct xe_device *xe = gt_to_xe(gt);
530 u32 *cs = bb->cs + bb->len;
536 if (GRAPHICS_VERx100(xe) >= 2000) {
537 num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
538 xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1));
540 ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1);
541 mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);
544 num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
545 NUM_CCS_BYTES_PER_BLOCK);
546 xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1));
548 ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1);
549 mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
552 *cs++ = XY_CTRL_SURF_COPY_BLT |
553 (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
554 (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
556 *cs++ = lower_32_bits(src_ofs);
557 *cs++ = upper_32_bits(src_ofs) | mocs;
558 *cs++ = lower_32_bits(dst_ofs);
559 *cs++ = upper_32_bits(dst_ofs) | mocs;
561 bb->len = cs - bb->cs;
564 #define EMIT_COPY_DW 10
565 static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
566 u64 src_ofs, u64 dst_ofs, unsigned int size,
569 struct xe_device *xe = gt_to_xe(gt);
573 xe_gt_assert(gt, size / pitch <= S16_MAX);
574 xe_gt_assert(gt, pitch / 4 <= S16_MAX);
575 xe_gt_assert(gt, pitch <= U16_MAX);
577 if (GRAPHICS_VER(xe) >= 20)
578 mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);
580 if (GRAPHICS_VERx100(xe) >= 1250)
581 tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;
583 bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
584 bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
585 bb->cs[bb->len++] = 0;
586 bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
587 bb->cs[bb->len++] = lower_32_bits(dst_ofs);
588 bb->cs[bb->len++] = upper_32_bits(dst_ofs);
589 bb->cs[bb->len++] = 0;
590 bb->cs[bb->len++] = pitch | mocs;
591 bb->cs[bb->len++] = lower_32_bits(src_ofs);
592 bb->cs[bb->len++] = upper_32_bits(src_ofs);
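/*
 * Illustrative example: a 2 MiB copy with pitch == XE_PAGE_SIZE (4096) is
 * emitted as a 32bpp blit of 1024 pixels (pitch / 4) by 512 rows
 * (size / pitch), comfortably within the S16_MAX limits asserted above.
 */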
595 static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
596 enum dma_resv_usage usage)
598 return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage);
601 static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
603 return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
606 static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
608 u64 src_ofs, bool src_is_indirect,
609 u64 dst_ofs, bool dst_is_indirect, u32 dst_size,
610 u64 ccs_ofs, bool copy_ccs)
612 struct xe_gt *gt = m->tile->primary_gt;
615 if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_indirect) {
617 * If the src is already in vram, then it should already
618 * have been cleared by us, or has been populated by the
619 * user. Make sure we copy the CCS aux state as-is.
621 * Otherwise if the bo doesn't have any CCS metadata attached,
622 * we still need to clear it for security reasons.
624 u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs;
626 emit_copy_ccs(gt, bb,
628 ccs_src_ofs, src_is_indirect, dst_size);
630 flush_flags = MI_FLUSH_DW_CCS;
631 } else if (copy_ccs) {
632 if (!src_is_indirect)
634 else if (!dst_is_indirect)
637 xe_gt_assert(gt, src_is_indirect || dst_is_indirect);
639 emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs,
640 src_is_indirect, dst_size);
642 flush_flags = MI_FLUSH_DW_CCS;
649 * xe_migrate_copy() - Copy content of TTM resources.
650 * @m: The migration context.
651 * @src_bo: The buffer object @src is currently bound to.
652 * @dst_bo: If copying between resources created for the same bo, set this to
653 * the same value as @src_bo. If copying between buffer objects, set it to
654 * the buffer object @dst is currently bound to.
655 * @src: The source TTM resource.
656 * @dst: The destination TTM resource.
657 * @copy_only_ccs: If true, copy only CCS metadata
659 * Copies the contents of @src to @dst: On flat CCS devices,
660 * the CCS metadata is copied as well if needed, or if not present,
661 * the CCS metadata of @dst is cleared for security reasons.
663 * Return: Pointer to a dma_fence representing the last copy batch, or
664 * an error pointer on failure. If there is a failure, any copy operation
665 * started by the function call has been synced.
667 struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
668 struct xe_bo *src_bo,
669 struct xe_bo *dst_bo,
670 struct ttm_resource *src,
671 struct ttm_resource *dst,
674 struct xe_gt *gt = m->tile->primary_gt;
675 struct xe_device *xe = gt_to_xe(gt);
676 struct dma_fence *fence = NULL;
677 u64 size = src_bo->size;
678 struct xe_res_cursor src_it, dst_it, ccs_it;
679 u64 src_L0_ofs, dst_L0_ofs;
680 u32 src_L0_pt, dst_L0_pt;
684 bool src_is_pltt = src->mem_type == XE_PL_TT;
685 bool dst_is_pltt = dst->mem_type == XE_PL_TT;
686 bool src_is_vram = mem_type_is_vram(src->mem_type);
687 bool dst_is_vram = mem_type_is_vram(dst->mem_type);
688 bool copy_ccs = xe_device_has_flat_ccs(xe) &&
689 xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
690 bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);
692 /* Copying CCS between two different BOs is not supported yet. */
693 if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
694 return ERR_PTR(-EINVAL);
696 if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size))
697 return ERR_PTR(-EINVAL);
700 xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
702 xe_res_first(src, 0, size, &src_it);
704 xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
706 xe_res_first(dst, 0, size, &dst_it);
709 xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
710 PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
714 u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
715 struct xe_sched_job *job;
719 u64 ccs_ofs, ccs_size;
722 bool usm = xe->info.has_usm;
723 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
725 src_L0 = xe_migrate_res_sizes(m, &src_it);
726 dst_L0 = xe_migrate_res_sizes(m, &dst_it);
728 drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
729 pass++, src_L0, dst_L0);
731 src_L0 = min(src_L0, dst_L0);
733 batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
734 &src_L0_ofs, &src_L0_pt, 0, 0,
737 batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0,
738 &dst_L0_ofs, &dst_L0_pt, 0,
739 avail_pts, avail_pts);
741 if (copy_system_ccs) {
742 ccs_size = xe_device_ccs_bytes(xe, src_L0);
743 batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size,
744 &ccs_ofs, &ccs_pt, 0,
747 xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
750 /* Add copy commands size here */
751 batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) +
752 ((xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0));
754 bb = xe_bb_new(gt, batch_size, usm);
760 if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
761 xe_res_next(&src_it, src_L0);
763 emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
766 if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
767 xe_res_next(&dst_it, src_L0);
769 emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
773 emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
775 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
776 update_idx = bb->len;
779 emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);
781 flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
782 IS_DGFX(xe) ? src_is_vram : src_is_pltt,
784 IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
785 src_L0, ccs_ofs, copy_ccs);
787 mutex_lock(&m->job_mutex);
788 job = xe_bb_create_migration_job(m->q, bb,
789 xe_migrate_batch_base(m, usm),
796 xe_sched_job_add_migrate_flush(job, flush_flags);
798 err = job_add_deps(job, src_bo->ttm.base.resv,
799 DMA_RESV_USAGE_BOOKKEEP);
800 if (!err && src_bo != dst_bo)
801 err = job_add_deps(job, dst_bo->ttm.base.resv,
802 DMA_RESV_USAGE_BOOKKEEP);
807 xe_sched_job_arm(job);
808 dma_fence_put(fence);
809 fence = dma_fence_get(&job->drm.s_fence->finished);
810 xe_sched_job_push(job);
812 dma_fence_put(m->fence);
813 m->fence = dma_fence_get(fence);
815 mutex_unlock(&m->job_mutex);
817 xe_bb_free(bb, fence);
822 xe_sched_job_put(job);
824 mutex_unlock(&m->job_mutex);
825 xe_bb_free(bb, NULL);
828 /* Sync partial copy if any. FIXME: under job_mutex? */
830 dma_fence_wait(fence, false);
831 dma_fence_put(fence);
840 static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
843 struct xe_device *xe = gt_to_xe(gt);
844 u32 *cs = bb->cs + bb->len;
845 u32 len = PVC_MEM_SET_CMD_LEN_DW;
847 *cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
849 *cs++ = (size / pitch) - 1;
851 *cs++ = lower_32_bits(src_ofs);
852 *cs++ = upper_32_bits(src_ofs);
853 if (GRAPHICS_VERx100(xe) >= 2000)
854 *cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
856 *cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
858 xe_gt_assert(gt, cs - bb->cs == len + bb->len);
863 static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
864 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
866 struct xe_device *xe = gt_to_xe(gt);
867 u32 *cs = bb->cs + bb->len;
868 u32 len = XY_FAST_COLOR_BLT_DW;
870 if (GRAPHICS_VERx100(xe) < 1250)
873 *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
875 if (GRAPHICS_VERx100(xe) >= 2000)
876 *cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
879 *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
882 *cs++ = (size / pitch) << 16 | pitch / 4;
883 *cs++ = lower_32_bits(src_ofs);
884 *cs++ = upper_32_bits(src_ofs);
885 *cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
899 xe_gt_assert(gt, cs - bb->cs == len + bb->len);
904 static bool has_service_copy_support(struct xe_gt *gt)
907 * What we care about is whether the architecture was designed with
908 * service copy functionality (specifically the new MEM_SET / MEM_COPY
909 * instructions) so check the architectural engine list rather than the
910 * actual list since these instructions are usable on BCS0 even if
911 * all of the actual service copy engines (BCS1-BCS8) have been fused off.
914 return gt->info.__engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
918 static u32 emit_clear_cmd_len(struct xe_gt *gt)
920 if (has_service_copy_support(gt))
921 return PVC_MEM_SET_CMD_LEN_DW;
923 return XY_FAST_COLOR_BLT_DW;
926 static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
927 u32 size, u32 pitch, bool is_vram)
929 if (has_service_copy_support(gt))
930 emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
932 emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
937 * xe_migrate_clear() - Clear content of a TTM resource.
938 * @m: The migration context.
939 * @bo: The buffer object @dst is currently bound to.
940 * @dst: The dst TTM resource to be cleared.
942 * Clear the contents of @dst to zero. On flat CCS devices,
943 * the CCS metadata is cleared to zero as well on VRAM destinations.
944 * TODO: Eliminate the @bo argument.
946 * Return: Pointer to a dma_fence representing the last clear batch, or
947 * an error pointer on failure. If there is a failure, any clear operation
948 * started by the function call has been synced.
950 struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
952 struct ttm_resource *dst)
954 bool clear_vram = mem_type_is_vram(dst->mem_type);
955 struct xe_gt *gt = m->tile->primary_gt;
956 struct xe_device *xe = gt_to_xe(gt);
957 bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) && !IS_DGFX(xe)) ? true : false;
958 struct dma_fence *fence = NULL;
960 struct xe_res_cursor src_it;
961 struct ttm_resource *src = dst;
966 xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it);
968 xe_res_first(src, 0, bo->size, &src_it);
975 struct xe_sched_job *job;
977 u32 batch_size, update_idx;
979 bool usm = xe->info.has_usm;
980 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
982 clear_L0 = xe_migrate_res_sizes(m, &src_it);
984 drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);
986 /* Calculate final sizes and batch size.. */
988 pte_update_size(m, clear_vram, src, &src_it,
989 &clear_L0, &clear_L0_ofs, &clear_L0_pt,
990 clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
993 if (xe_device_has_flat_ccs(xe))
994 batch_size += EMIT_COPY_CCS_DW;
998 if (WARN_ON_ONCE(!clear_L0))
1001 bb = xe_bb_new(gt, batch_size, usm);
1008 /* Preemption is enabled again by the ring ops. */
1009 if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
1010 xe_res_next(&src_it, clear_L0);
1012 emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
1015 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1016 update_idx = bb->len;
1018 if (!clear_system_ccs)
1019 emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);
1021 if (xe_device_has_flat_ccs(xe)) {
1022 emit_copy_ccs(gt, bb, clear_L0_ofs, true,
1023 m->cleared_mem_ofs, false, clear_L0);
1024 flush_flags = MI_FLUSH_DW_CCS;
1027 mutex_lock(&m->job_mutex);
1028 job = xe_bb_create_migration_job(m->q, bb,
1029 xe_migrate_batch_base(m, usm),
1036 xe_sched_job_add_migrate_flush(job, flush_flags);
1039 * There can't be anything userspace related at this
1040 * point, so we just need to respect any potential move
1041 * fences, which are always tracked as
1042 * DMA_RESV_USAGE_KERNEL.
1044 err = job_add_deps(job, bo->ttm.base.resv,
1045 DMA_RESV_USAGE_KERNEL);
1050 xe_sched_job_arm(job);
1051 dma_fence_put(fence);
1052 fence = dma_fence_get(&job->drm.s_fence->finished);
1053 xe_sched_job_push(job);
1055 dma_fence_put(m->fence);
1056 m->fence = dma_fence_get(fence);
1058 mutex_unlock(&m->job_mutex);
1060 xe_bb_free(bb, fence);
1064 xe_sched_job_put(job);
1066 mutex_unlock(&m->job_mutex);
1067 xe_bb_free(bb, NULL);
1069 /* Sync partial copies if any. FIXME: job_mutex? */
1071 dma_fence_wait(m->fence, false);
1072 dma_fence_put(fence);
1075 return ERR_PTR(err);
1078 if (clear_system_ccs)
1079 bo->ccs_cleared = true;
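/*
 * Minimal usage sketch for xe_migrate_clear() (illustrative only, not part of
 * the driver): a caller that wants the clear to complete synchronously could do
 *
 *	struct dma_fence *fence;
 *
 *	fence = xe_migrate_clear(m, bo, bo->ttm.resource);
 *	if (!IS_ERR(fence)) {
 *		dma_fence_wait(fence, false);
 *		dma_fence_put(fence);
 *	}
 */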
1084 static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
1085 const struct xe_vm_pgtable_update *update,
1086 struct xe_migrate_pt_update *pt_update)
1088 const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1090 u32 ofs = update->ofs, size = update->qwords;
1093 * If we have 512 entries (max), we would populate it ourselves,
1094 * and update the PDE above it to the new pointer.
1095 * The only time this can happen is if we have to update the top
1096 * PDE. This requires a BO that is almost vm->size big.
1098 * This shouldn't be possible in practice; it might change when 16K
1099 * pages are used. Hence the assert.
1101 xe_tile_assert(tile, update->qwords <= 0x1ff);
1103 ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
1104 xe_bo_addr(update->pt_bo, 0,
1108 u64 addr = ppgtt_ofs + ofs * 8;
1110 chunk = min(update->qwords, 0x1ffU);
1112 /* Ensure populatefn can do memset64 by aligning bb->cs */
1114 bb->cs[bb->len++] = MI_NOOP;
1116 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
1117 bb->cs[bb->len++] = lower_32_bits(addr);
1118 bb->cs[bb->len++] = upper_32_bits(addr);
1119 ops->populate(pt_update, tile, NULL, bb->cs + bb->len, ofs, chunk,
1122 bb->len += chunk * 2;
1128 struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
1130 return xe_vm_get(m->q->vm);
1133 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
1134 struct migrate_test_params {
1135 struct xe_test_priv base;
1139 #define to_migrate_test_params(_priv) \
1140 container_of(_priv, struct migrate_test_params, base)
1143 static struct dma_fence *
1144 xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
1145 struct xe_vm *vm, struct xe_bo *bo,
1146 const struct xe_vm_pgtable_update *updates,
1147 u32 num_updates, bool wait_vm,
1148 struct xe_migrate_pt_update *pt_update)
1150 XE_TEST_DECLARE(struct migrate_test_params *test =
1151 to_migrate_test_params
1152 (xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
1153 const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1154 struct dma_fence *fence;
1158 if (XE_TEST_ONLY(test && test->force_gpu))
1159 return ERR_PTR(-ETIME);
1161 if (bo && !dma_resv_test_signaled(bo->ttm.base.resv,
1162 DMA_RESV_USAGE_KERNEL))
1163 return ERR_PTR(-ETIME);
1165 if (wait_vm && !dma_resv_test_signaled(xe_vm_resv(vm),
1166 DMA_RESV_USAGE_BOOKKEEP))
1167 return ERR_PTR(-ETIME);
1169 if (ops->pre_commit) {
1170 pt_update->job = NULL;
1171 err = ops->pre_commit(pt_update);
1173 return ERR_PTR(err);
1175 for (i = 0; i < num_updates; i++) {
1176 const struct xe_vm_pgtable_update *update = &updates[i];
1178 ops->populate(pt_update, m->tile, &update->pt_bo->vmap, NULL,
1179 update->ofs, update->qwords, update);
1183 trace_xe_vm_cpu_bind(vm);
1184 xe_device_wmb(vm->xe);
1187 fence = dma_fence_get_stub();
1192 static bool no_in_syncs(struct xe_vm *vm, struct xe_exec_queue *q,
1193 struct xe_sync_entry *syncs, u32 num_syncs)
1195 struct dma_fence *fence;
1198 for (i = 0; i < num_syncs; i++) {
1199 fence = syncs[i].fence;
1201 if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
1206 fence = xe_exec_queue_last_fence_get(q, vm);
1207 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
1215 * xe_migrate_update_pgtables() - Pipelined page-table update
1216 * @m: The migrate context.
1217 * @vm: The vm we'll be updating.
1218 * @bo: The bo whose dma-resv we will await before updating, or NULL if userptr.
1219 * @q: The exec queue to be used for the update or NULL if the default
1220 * migration engine is to be used.
1221 * @updates: An array of update descriptors.
1222 * @num_updates: Number of descriptors in @updates.
1223 * @syncs: Array of xe_sync_entry to await before updating. Note that waits
1224 * will block the engine timeline.
1225 * @num_syncs: Number of entries in @syncs.
1226 * @pt_update: Pointer to a struct xe_migrate_pt_update, which contains
1227 * pointers to callback functions and, if subclassed, private arguments to
1230 * Perform a pipelined page-table update. The update descriptors are typically
1231 * built under the same lock critical section as a call to this function. If
1232 * using the default engine for the updates, they will be performed in the
1233 * order they grab the job_mutex. If different engines are used, external
1234 * synchronization is needed for overlapping updates to maintain page-table
1235 * consistency. Note that the meaning of "overlapping" is that the updates
1236 * touch the same page-table, which might be a higher-level page-directory.
1237 * If no pipelining is needed, then updates may be performed by the cpu.
1239 * Return: A dma_fence that, when signaled, indicates the update completion.
1242 xe_migrate_update_pgtables(struct xe_migrate *m,
1245 struct xe_exec_queue *q,
1246 const struct xe_vm_pgtable_update *updates,
1248 struct xe_sync_entry *syncs, u32 num_syncs,
1249 struct xe_migrate_pt_update *pt_update)
1251 const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1252 struct xe_tile *tile = m->tile;
1253 struct xe_gt *gt = tile->primary_gt;
1254 struct xe_device *xe = tile_to_xe(tile);
1255 struct xe_sched_job *job;
1256 struct dma_fence *fence;
1257 struct drm_suballoc *sa_bo = NULL;
1258 struct xe_vma *vma = pt_update->vma;
1260 u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0;
1263 bool usm = !q && xe->info.has_usm;
1264 bool first_munmap_rebind = vma &&
1265 vma->gpuva.flags & XE_VMA_FIRST_REBIND;
1266 struct xe_exec_queue *q_override = !q ? m->q : q;
1267 u16 pat_index = xe->pat.idx[XE_CACHE_WB];
1269 /* Use the CPU if no in syncs and engine is idle */
1270 if (no_in_syncs(vm, q, syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) {
1271 fence = xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
1273 first_munmap_rebind,
1275 if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN))
1279 /* fixed + PTE entries */
1283 batch_size = 6 + num_updates * 2;
1285 for (i = 0; i < num_updates; i++) {
1286 u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, 0x1ff);
1288 /* align noop + MI_STORE_DATA_IMM cmd prefix */
1289 batch_size += 4 * num_cmds + updates[i].qwords * 2;
1293 * XXX: Create temp bo to copy from, if batch_size becomes too big?
1295 * Worst case: Sum(2 * (each lower level page size) + (top level page size))
1296 * Should be reasonably bounded.
1298 xe_tile_assert(tile, batch_size < SZ_128K);
1300 bb = xe_bb_new(gt, batch_size, !q && xe->info.has_usm);
1302 return ERR_CAST(bb);
1304 /* For sysmem PTEs, we need to map them in our hole. */
1306 ppgtt_ofs = NUM_KERNEL_PDE - 1;
1308 xe_tile_assert(tile, num_updates <= NUM_VMUSA_WRITES_PER_UNIT);
1310 sa_bo = drm_suballoc_new(&m->vm_update_sa, 1,
1311 GFP_KERNEL, true, 0);
1312 if (IS_ERR(sa_bo)) {
1313 err = PTR_ERR(sa_bo);
1317 ppgtt_ofs = NUM_KERNEL_PDE +
1318 (drm_suballoc_soffset(sa_bo) /
1319 NUM_VMUSA_UNIT_PER_PAGE);
1320 page_ofs = (drm_suballoc_soffset(sa_bo) %
1321 NUM_VMUSA_UNIT_PER_PAGE) *
1322 VM_SA_UPDATE_UNIT_SIZE;
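/*
 * Illustrative example: a suballocation at offset 40 yields
 * ppgtt_ofs == 17 + 40 / 32 == 18 and page_ofs == (40 % 32) * 128 == 1024,
 * so the PTE qwords land at ppgtt_ofs * XE_PAGE_SIZE + page_ofs ==
 * 18 * 4 KiB + 1 KiB in the migrate VM.
 */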
1325 /* Map our PTs to the GTT */
1326 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(num_updates);
1327 bb->cs[bb->len++] = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
1328 bb->cs[bb->len++] = 0; /* upper_32_bits */
1330 for (i = 0; i < num_updates; i++) {
1331 struct xe_bo *pt_bo = updates[i].pt_bo;
1333 xe_tile_assert(tile, pt_bo->size == SZ_4K);
1335 addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, pat_index, 0);
1336 bb->cs[bb->len++] = lower_32_bits(addr);
1337 bb->cs[bb->len++] = upper_32_bits(addr);
1340 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1341 update_idx = bb->len;
1343 addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
1344 (page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
1345 for (i = 0; i < num_updates; i++)
1346 write_pgtable(tile, bb, addr + i * XE_PAGE_SIZE,
1347 &updates[i], pt_update);
1349 /* phys pages, no preamble required */
1350 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1351 update_idx = bb->len;
1353 for (i = 0; i < num_updates; i++)
1354 write_pgtable(tile, bb, 0, &updates[i], pt_update);
1358 mutex_lock(&m->job_mutex);
1360 job = xe_bb_create_migration_job(q ?: m->q, bb,
1361 xe_migrate_batch_base(m, usm),
1368 /* Wait on BO move */
1370 err = job_add_deps(job, bo->ttm.base.resv,
1371 DMA_RESV_USAGE_KERNEL);
1377 * Munmap style VM unbind, need to wait for all jobs to be complete /
1378 * trigger preempts before moving forward
1380 if (first_munmap_rebind) {
1381 err = job_add_deps(job, xe_vm_resv(vm),
1382 DMA_RESV_USAGE_BOOKKEEP);
1387 err = xe_sched_job_last_fence_add_dep(job, vm);
1388 for (i = 0; !err && i < num_syncs; i++)
1389 err = xe_sync_entry_add_deps(&syncs[i], job);
1394 if (ops->pre_commit) {
1395 pt_update->job = job;
1396 err = ops->pre_commit(pt_update);
1400 xe_sched_job_arm(job);
1401 fence = dma_fence_get(&job->drm.s_fence->finished);
1402 xe_sched_job_push(job);
1405 mutex_unlock(&m->job_mutex);
1407 xe_bb_free(bb, fence);
1408 drm_suballoc_free(sa_bo, fence);
1413 xe_sched_job_put(job);
1416 mutex_unlock(&m->job_mutex);
1417 xe_bb_free(bb, NULL);
1419 drm_suballoc_free(sa_bo, NULL);
1420 return ERR_PTR(err);
1424 * xe_migrate_wait() - Complete all operations using the xe_migrate context
1425 * @m: Migrate context to wait for.
1427 * Waits until the GPU no longer uses the migrate context's default engine
1428 * or its page-table objects. FIXME: What about separate page-table update engines?
1431 void xe_migrate_wait(struct xe_migrate *m)
1434 dma_fence_wait(m->fence, false);
1437 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
1438 #include "tests/xe_migrate.c"