1 // SPDX-License-Identifier: MIT
3 * Copyright © 2020 Intel Corporation
6 #include "xe_migrate.h"
8 #include <linux/bitfield.h>
9 #include <linux/sizes.h>
11 #include <drm/drm_managed.h>
12 #include <drm/ttm/ttm_tt.h>
13 #include <drm/xe_drm.h>
15 #include "generated/xe_wa_oob.h"
16 #include "instructions/xe_mi_commands.h"
17 #include "regs/xe_gpu_commands.h"
18 #include "tests/xe_test.h"
19 #include "xe_assert.h"
22 #include "xe_exec_queue.h"
25 #include "xe_hw_engine.h"
30 #include "xe_res_cursor.h"
31 #include "xe_sched_job.h"
38 * struct xe_migrate - migrate context.
41 /** @q: Default exec queue used for migration */
42 struct xe_exec_queue *q;
43 /** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
45 /** @job_mutex: Timeline mutex for @q. */
46 struct mutex job_mutex;
47 /** @pt_bo: Page-table buffer object. */
49 /** @batch_base_ofs: VM offset of the migration batch buffer */
51 /** @usm_batch_base_ofs: VM offset of the usm batch buffer */
52 u64 usm_batch_base_ofs;
53 /** @cleared_mem_ofs: VM offset of a NULL-mapped region used as the source when clearing CCS metadata. */
56 * @fence: dma-fence representing the last migration job batch.
57 * Protected by @job_mutex.
59 struct dma_fence *fence;
61 * @vm_update_sa: For integrated, used to suballocate page-table updates out of @pt_bo
64 struct drm_suballoc_manager vm_update_sa;
65 /** @min_chunk_size: For dgfx, the minimum chunk size */
69 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
70 #define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
71 #define NUM_KERNEL_PDE 17
72 #define NUM_PT_SLOTS 32
73 #define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M
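/* A level-0 PT holds 512 PTEs; with 4 KiB pages that encodes 2 MiB of VM space */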
76 * xe_tile_migrate_engine() - Get this tile's migrate engine.
79 * Returns the default migrate engine of this tile.
80 * TODO: Perhaps this function is slightly misplaced, and even unneeded?
82 * Return: The default migrate engine
84 struct xe_exec_queue *xe_tile_migrate_engine(struct xe_tile *tile)
86 return tile->migrate->q;
89 static void xe_migrate_fini(struct drm_device *dev, void *arg)
91 struct xe_migrate *m = arg;
93 xe_vm_lock(m->q->vm, false);
94 xe_bo_unpin(m->pt_bo);
95 xe_vm_unlock(m->q->vm);
97 dma_fence_put(m->fence);
99 drm_suballoc_manager_fini(&m->vm_update_sa);
100 mutex_destroy(&m->job_mutex);
101 xe_vm_close_and_put(m->q->vm);
102 xe_exec_queue_put(m->q);
105 static u64 xe_migrate_vm_addr(u64 slot, u32 level)
107 XE_WARN_ON(slot >= NUM_PT_SLOTS);
109 /* First slot is reserved for mapping of PT bo and bb, start from 1 */
110 return (slot + 1ULL) << xe_pt_shift(level + 1);
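/*
 * Illustrative example (assuming 4 KiB GPU pages, where xe_pt_shift(1) == 21):
 * a level-0 PT in slot 0 is addressed at (0 + 1) << 21 == 2 MiB, slot 1 at
 * 4 MiB, and so on, keeping the first 2 MiB of the VM reserved for the PT BO
 * and batch buffer mapping.
 */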
113 static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
116 * Remove the DPA to get a correct offset into the identity table for the device memory.
119 addr -= xe->mem.vram.dpa_base;
120 return addr + (256ULL << xe_pt_shift(2));
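/*
 * Illustrative example (assuming xe_pt_shift(2) == 30): the identity map
 * starts at 256ULL << 30 == 256 GiB, so a device physical address of
 * dpa_base + X translates to VM address 256 GiB + X, matching the identity
 * mapping of vram set up in xe_migrate_prepare_vm() below.
 */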
123 static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
126 struct xe_device *xe = tile_to_xe(tile);
127 u16 pat_index = xe->pat.idx[XE_CACHE_WB];
129 u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
130 u32 map_ofs, level, i;
131 struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
134 /* Can't bump NUM_PT_SLOTS too high */
135 BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
136 /* Must be a multiple of 64K to support all platforms */
137 BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
138 /* And one slot reserved for the 4KiB page table updates */
139 BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));
141 /* Need to be sure everything fits in the first PT, or create more */
142 xe_tile_assert(tile, m->batch_base_ofs + batch->size < SZ_2M);
144 bo = xe_bo_create_pin_map(vm->xe, tile, vm,
145 num_entries * XE_PAGE_SIZE,
147 XE_BO_CREATE_VRAM_IF_DGFX(tile) |
148 XE_BO_CREATE_PINNED_BIT);
152 entry = vm->pt_ops->pde_encode_bo(bo, bo->size - XE_PAGE_SIZE, pat_index);
153 xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);
155 map_ofs = (num_entries - num_level) * XE_PAGE_SIZE;
157 /* Map the entire BO in our level 0 pt */
158 for (i = 0, level = 0; i < num_entries; level++) {
159 entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
162 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);
164 if (vm->flags & XE_VM_FLAG_64K)
171 /* Write out batch too */
172 m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
173 if (xe->info.has_usm) {
174 batch = tile->primary_gt->usm.bb_pool->bo;
175 m->usm_batch_base_ofs = m->batch_base_ofs;
178 for (i = 0; i < batch->size;
179 i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
181 entry = vm->pt_ops->pte_encode_bo(batch, i,
184 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
189 u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
191 m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
193 if (xe->info.has_usm) {
194 batch = tile->primary_gt->usm.bb_pool->bo;
195 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
196 m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
200 for (level = 1; level < num_level; level++) {
203 if (vm->flags & XE_VM_FLAG_64K && level == 1)
206 entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) *
207 XE_PAGE_SIZE, pat_index);
208 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
212 /* Write PDE's that point to our BO. */
213 for (i = 0; i < num_entries - num_level; i++) {
214 entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE,
217 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
218 (i + 1) * 8, u64, entry);
221 /* Set up a 1GiB NULL mapping at 255GiB offset. */
223 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
224 vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
226 m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));
228 /* Identity map the entire vram at 256GiB offset */
233 ofs = map_ofs + XE_PAGE_SIZE * level + 256 * 8;
234 flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
238 * Use 1GB pages. It doesn't matter that the physical amount of
239 * vram may be less, as long as we never access the excess range.
241 for (pos = xe->mem.vram.dpa_base;
242 pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base;
243 pos += SZ_1G, ofs += 8)
244 xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
248 * Example layout created above, with root level = 3:
249 * [PT0...PT7]: kernel PT's for copy/clear; 64 KiB or 4 KiB PTE's
250 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
251 * [PT9...PT28]: Userspace PT's for VM_BIND, 4 KiB PTE's
252 * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2]
254 * This makes the lowest part of the VM point to the pagetables.
255 * Hence the lowest 2M in the vm should point to itself. With a few writes
256 * and flushes, other parts of the VM can be used either for copying or clearing.
259 * For performance, the kernel reserves PDE's, so about 20 are left
260 * for async VM updates.
262 * To make things easier to work with, each scratch PT is put in slot (1 + PT #)
263 * everywhere; this allows lockless updates to scratch pages by using
264 * different addresses in the VM.
266 #define NUM_VMUSA_UNIT_PER_PAGE 32
267 #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
268 #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
269 drm_suballoc_manager_init(&m->vm_update_sa,
270 (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
271 NUM_VMUSA_UNIT_PER_PAGE, 0);
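/*
 * Suballocator sizing (illustrative, assuming 4 KiB XE_PAGE_SIZE): each PT
 * page remaining after the NUM_KERNEL_PDE reserved ones is split into 32
 * units of XE_PAGE_SIZE / 32 == 128 bytes, i.e. 16 qword PTE writes per
 * suballocated unit.
 */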
278 * Due to workaround 16017236439, odd instance hardware copy engines are
279 * faster than even instance ones.
280 * This function returns the mask involving all fast copy engines and the
281 * reserved copy engine, to be used as the logical mask for the migrate engine.
282 * Including the reserved copy engine is required to avoid deadlocks due to
283 * migrate jobs servicing the faults getting stuck behind the job that faulted.
285 static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
287 u32 logical_mask = 0;
288 struct xe_hw_engine *hwe;
289 enum xe_hw_engine_id id;
291 for_each_hw_engine(hwe, gt, id) {
292 if (hwe->class != XE_ENGINE_CLASS_COPY)
295 if (!XE_WA(gt, 16017236439) ||
296 xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1)
297 logical_mask |= BIT(hwe->logical_instance);
304 * xe_migrate_init() - Initialize a migrate context
305 * @tile: Back-pointer to the tile we're initializing for.
307 * Return: Pointer to a migrate context on success. Error pointer on error.
309 struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
311 struct xe_device *xe = tile_to_xe(tile);
312 struct xe_gt *primary_gt = tile->primary_gt;
313 struct xe_migrate *m;
317 m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL);
319 return ERR_PTR(-ENOMEM);
323 /* Special layout, prepared below.. */
324 vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
325 XE_VM_FLAG_SET_TILE_ID(tile));
329 xe_vm_lock(vm, false);
330 err = xe_migrate_prepare_vm(tile, m, vm);
333 xe_vm_close_and_put(vm);
337 if (xe->info.has_usm) {
338 struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
339 XE_ENGINE_CLASS_COPY,
340 primary_gt->usm.reserved_bcs_instance,
342 u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);
344 if (!hwe || !logical_mask)
345 return ERR_PTR(-EINVAL);
347 m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
348 EXEC_QUEUE_FLAG_KERNEL |
349 EXEC_QUEUE_FLAG_PERMANENT |
350 EXEC_QUEUE_FLAG_HIGH_PRIORITY);
352 m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
353 XE_ENGINE_CLASS_COPY,
354 EXEC_QUEUE_FLAG_KERNEL |
355 EXEC_QUEUE_FLAG_PERMANENT);
358 xe_vm_close_and_put(vm);
359 return ERR_CAST(m->q);
362 mutex_init(&m->job_mutex);
364 err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
369 if (xe_device_has_flat_ccs(xe))
370 /* min chunk size corresponds to 4K of CCS Metadata */
371 m->min_chunk_size = SZ_4K * SZ_64K /
372 xe_device_ccs_bytes(xe, SZ_64K);
374 /* Somewhat arbitrary, to avoid a huge number of blits */
375 m->min_chunk_size = SZ_64K;
376 m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
377 drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
378 (unsigned long long)m->min_chunk_size);
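/*
 * Illustrative example (assuming a 1:256 main-memory-to-CCS ratio, i.e.
 * xe_device_ccs_bytes(xe, SZ_64K) == 256): min_chunk_size becomes
 * 4096 * 65536 / 256 == 1 MiB, so each 1 MiB of main memory corresponds to a
 * page-aligned 4 KiB of CCS metadata.
 */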
384 static u64 max_mem_transfer_per_pass(struct xe_device *xe)
386 if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe))
387 return MAX_CCS_LIMITED_TRANSFER;
389 return MAX_PREEMPTDISABLE_TRANSFER;
392 static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
394 struct xe_device *xe = tile_to_xe(m->tile);
395 u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);
397 if (mem_type_is_vram(cur->mem_type)) {
399 * VRAM we want to blit in chunks with sizes aligned to
400 * min_chunk_size in order for the offset to CCS metadata to be
401 * page-aligned. If it's the last chunk it may be smaller.
403 * Another constraint is that we need to limit the blit to
404 * the VRAM block size, unless size is smaller than
407 u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
409 size = min_t(u64, size, chunk);
410 if (size > m->min_chunk_size)
411 size = round_down(size, m->min_chunk_size);
417 static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
419 /* If the chunk is not fragmented, allow identity map. */
420 return cur->size >= size;
423 static u32 pte_update_size(struct xe_migrate *m,
425 struct ttm_resource *res,
426 struct xe_res_cursor *cur,
427 u64 *L0, u64 *L0_ofs, u32 *L0_pt,
428 u32 cmd_size, u32 pt_ofs, u32 avail_pts)
433 if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
434 /* Offset into identity map. */
435 *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
436 cur->start + vram_region_gpu_offset(res));
439 /* Clip L0 to available size */
440 u64 size = min(*L0, (u64)avail_pts * SZ_2M);
441 u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
444 *L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);
446 /* MI_STORE_DATA_IMM */
447 cmds += 3 * DIV_ROUND_UP(num_4k_pages, 0x1ff);
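/* One qword (two dwords) of PTE payload per 4 KiB page */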
450 cmds += num_4k_pages * 2;
452 /* Each chunk has a single blit command */
459 static void emit_pte(struct xe_migrate *m,
460 struct xe_bb *bb, u32 at_pt,
461 bool is_vram, bool is_comp_pte,
462 struct xe_res_cursor *cur,
463 u32 size, struct ttm_resource *res)
465 struct xe_device *xe = tile_to_xe(m->tile);
466 struct xe_vm *vm = m->q->vm;
469 u64 ofs = at_pt * XE_PAGE_SIZE;
472 /* Indirect access needs a compression-enabled, uncached PAT index */
473 if (GRAPHICS_VERx100(xe) >= 2000)
474 pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] :
475 xe->pat.idx[XE_CACHE_NONE];
477 pat_index = xe->pat.idx[XE_CACHE_WB];
479 ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
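/* Emit the PTEs in chunks of at most 0x1ff qwords per MI_STORE_DATA_IMM */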
482 u32 chunk = min(0x1ffU, ptes);
484 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
485 bb->cs[bb->len++] = ofs;
486 bb->cs[bb->len++] = 0;
496 addr = xe_res_dma(cur) & PAGE_MASK;
498 if (vm->flags & XE_VM_FLAG_64K) {
499 u64 va = cur_ofs * XE_PAGE_SIZE / 8;
501 xe_assert(xe, (va & (SZ_64K - 1)) ==
502 (addr & (SZ_64K - 1)));
504 flags |= XE_PTE_PS64;
507 addr += vram_region_gpu_offset(res);
511 addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
514 bb->cs[bb->len++] = lower_32_bits(addr);
515 bb->cs[bb->len++] = upper_32_bits(addr);
517 xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
523 #define EMIT_COPY_CCS_DW 5
524 static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
525 u64 dst_ofs, bool dst_is_indirect,
526 u64 src_ofs, bool src_is_indirect,
529 struct xe_device *xe = gt_to_xe(gt);
530 u32 *cs = bb->cs + bb->len;
536 if (GRAPHICS_VERx100(xe) >= 2000) {
537 num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
538 xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1));
540 ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1);
541 mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);
544 num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
545 NUM_CCS_BYTES_PER_BLOCK);
546 xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1));
548 ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1);
549 mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
552 *cs++ = XY_CTRL_SURF_COPY_BLT |
553 (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
554 (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
556 *cs++ = lower_32_bits(src_ofs);
557 *cs++ = upper_32_bits(src_ofs) | mocs;
558 *cs++ = lower_32_bits(dst_ofs);
559 *cs++ = upper_32_bits(dst_ofs) | mocs;
561 bb->len = cs - bb->cs;
564 #define EMIT_COPY_DW 10
565 static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
566 u64 src_ofs, u64 dst_ofs, unsigned int size,
569 struct xe_device *xe = gt_to_xe(gt);
573 xe_gt_assert(gt, size / pitch <= S16_MAX);
574 xe_gt_assert(gt, pitch / 4 <= S16_MAX);
575 xe_gt_assert(gt, pitch <= U16_MAX);
577 if (GRAPHICS_VER(xe) >= 20)
578 mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);
580 if (GRAPHICS_VERx100(xe) >= 1250)
581 tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;
583 bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
584 bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
585 bb->cs[bb->len++] = 0;
586 bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
587 bb->cs[bb->len++] = lower_32_bits(dst_ofs);
588 bb->cs[bb->len++] = upper_32_bits(dst_ofs);
589 bb->cs[bb->len++] = 0;
590 bb->cs[bb->len++] = pitch | mocs;
591 bb->cs[bb->len++] = lower_32_bits(src_ofs);
592 bb->cs[bb->len++] = upper_32_bits(src_ofs);
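/*
 * Illustrative example: a 2 MiB copy with pitch == XE_PAGE_SIZE (4096) is
 * emitted as a 32bpp blit of 1024 pixels (pitch / 4) by 512 rows
 * (size / pitch), comfortably within the S16_MAX limits asserted above.
 */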
595 static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
596 enum dma_resv_usage usage)
598 return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage);
601 static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
603 return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
606 static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
608 u64 src_ofs, bool src_is_indirect,
609 u64 dst_ofs, bool dst_is_indirect, u32 dst_size,
610 u64 ccs_ofs, bool copy_ccs)
612 struct xe_gt *gt = m->tile->primary_gt;
615 if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_indirect) {
617 * If the src is already in vram, then it should already
618 * have been cleared by us, or has been populated by the
619 * user. Make sure we copy the CCS aux state as-is.
621 * Otherwise if the bo doesn't have any CCS metadata attached,
622 * we still need to clear it for security reasons.
624 u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs;
626 emit_copy_ccs(gt, bb,
628 ccs_src_ofs, src_is_indirect, dst_size);
630 flush_flags = MI_FLUSH_DW_CCS;
631 } else if (copy_ccs) {
632 if (!src_is_indirect)
634 else if (!dst_is_indirect)
637 xe_gt_assert(gt, src_is_indirect || dst_is_indirect);
639 emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs,
640 src_is_indirect, dst_size);
642 flush_flags = MI_FLUSH_DW_CCS;
649 * xe_migrate_copy() - Copy content of TTM resources.
650 * @m: The migration context.
651 * @src_bo: The buffer object @src is currently bound to.
652 * @dst_bo: If copying between resources created for the same bo, set this to
653 * the same value as @src_bo. If copying between buffer objects, set it to
654 * the buffer object @dst is currently bound to.
655 * @src: The source TTM resource.
656 * @dst: The destination TTM resource.
657 * @copy_only_ccs: If true, copy only CCS metadata
659 * Copies the contents of @src to @dst: On flat CCS devices,
660 * the CCS metadata is copied as well if needed, or if not present,
661 * the CCS metadata of @dst is cleared for security reasons.
663 * Return: Pointer to a dma_fence representing the last copy batch, or
664 * an error pointer on failure. If there is a failure, any copy operation
665 * started by the function call has been synced.
667 struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
668 struct xe_bo *src_bo,
669 struct xe_bo *dst_bo,
670 struct ttm_resource *src,
671 struct ttm_resource *dst,
674 struct xe_gt *gt = m->tile->primary_gt;
675 struct xe_device *xe = gt_to_xe(gt);
676 struct dma_fence *fence = NULL;
677 u64 size = src_bo->size;
678 struct xe_res_cursor src_it, dst_it, ccs_it;
679 u64 src_L0_ofs, dst_L0_ofs;
680 u32 src_L0_pt, dst_L0_pt;
684 bool src_is_pltt = src->mem_type == XE_PL_TT;
685 bool dst_is_pltt = dst->mem_type == XE_PL_TT;
686 bool src_is_vram = mem_type_is_vram(src->mem_type);
687 bool dst_is_vram = mem_type_is_vram(dst->mem_type);
688 bool copy_ccs = xe_device_has_flat_ccs(xe) &&
689 xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
690 bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);
692 /* Copying CCS between two different BOs is not supported yet. */
693 if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
694 return ERR_PTR(-EINVAL);
696 if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size))
697 return ERR_PTR(-EINVAL);
700 xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
702 xe_res_first(src, 0, size, &src_it);
704 xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
706 xe_res_first(dst, 0, size, &dst_it);
709 xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
710 PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
714 u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
715 struct xe_sched_job *job;
719 u64 ccs_ofs, ccs_size;
722 bool usm = xe->info.has_usm;
723 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
725 src_L0 = xe_migrate_res_sizes(m, &src_it);
726 dst_L0 = xe_migrate_res_sizes(m, &dst_it);
728 drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
729 pass++, src_L0, dst_L0);
731 src_L0 = min(src_L0, dst_L0);
733 batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
734 &src_L0_ofs, &src_L0_pt, 0, 0,
737 batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0,
738 &dst_L0_ofs, &dst_L0_pt, 0,
739 avail_pts, avail_pts);
741 if (copy_system_ccs) {
742 ccs_size = xe_device_ccs_bytes(xe, src_L0);
743 batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size,
744 &ccs_ofs, &ccs_pt, 0,
747 xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
750 /* Add copy commands size here */
751 batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) +
752 ((xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0));
754 bb = xe_bb_new(gt, batch_size, usm);
760 if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
761 xe_res_next(&src_it, src_L0);
763 emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
766 if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
767 xe_res_next(&dst_it, src_L0);
769 emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
773 emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
775 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
776 update_idx = bb->len;
779 emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);
781 flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
782 IS_DGFX(xe) ? src_is_vram : src_is_pltt,
784 IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
785 src_L0, ccs_ofs, copy_ccs);
787 mutex_lock(&m->job_mutex);
788 job = xe_bb_create_migration_job(m->q, bb,
789 xe_migrate_batch_base(m, usm),
796 xe_sched_job_add_migrate_flush(job, flush_flags);
798 err = job_add_deps(job, src_bo->ttm.base.resv,
799 DMA_RESV_USAGE_BOOKKEEP);
800 if (!err && src_bo != dst_bo)
801 err = job_add_deps(job, dst_bo->ttm.base.resv,
802 DMA_RESV_USAGE_BOOKKEEP);
807 xe_sched_job_arm(job);
808 dma_fence_put(fence);
809 fence = dma_fence_get(&job->drm.s_fence->finished);
810 xe_sched_job_push(job);
812 dma_fence_put(m->fence);
813 m->fence = dma_fence_get(fence);
815 mutex_unlock(&m->job_mutex);
817 xe_bb_free(bb, fence);
822 xe_sched_job_put(job);
824 mutex_unlock(&m->job_mutex);
825 xe_bb_free(bb, NULL);
828 /* Sync partial copy if any. FIXME: under job_mutex? */
830 dma_fence_wait(fence, false);
831 dma_fence_put(fence);
840 static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
843 struct xe_device *xe = gt_to_xe(gt);
844 u32 *cs = bb->cs + bb->len;
845 u32 len = PVC_MEM_SET_CMD_LEN_DW;
847 *cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
849 *cs++ = (size / pitch) - 1;
851 *cs++ = lower_32_bits(src_ofs);
852 *cs++ = upper_32_bits(src_ofs);
853 if (GRAPHICS_VERx100(xe) >= 2000)
854 *cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
856 *cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
858 xe_gt_assert(gt, cs - bb->cs == len + bb->len);
863 static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
864 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
866 struct xe_device *xe = gt_to_xe(gt);
867 u32 *cs = bb->cs + bb->len;
868 u32 len = XY_FAST_COLOR_BLT_DW;
870 if (GRAPHICS_VERx100(xe) < 1250)
873 *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
875 if (GRAPHICS_VERx100(xe) >= 2000)
876 *cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
879 *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
882 *cs++ = (size / pitch) << 16 | pitch / 4;
883 *cs++ = lower_32_bits(src_ofs);
884 *cs++ = upper_32_bits(src_ofs);
885 *cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
899 xe_gt_assert(gt, cs - bb->cs == len + bb->len);
904 static bool has_service_copy_support(struct xe_gt *gt)
907 * What we care about is whether the architecture was designed with
908 * service copy functionality (specifically the new MEM_SET / MEM_COPY
909 * instructions) so check the architectural engine list rather than the
910 * actual list since these instructions are usable on BCS0 even if
911 * all of the actual service copy engines (BCS1-BCS8) have been fused off.
914 return gt->info.__engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
918 static u32 emit_clear_cmd_len(struct xe_gt *gt)
920 if (has_service_copy_support(gt))
921 return PVC_MEM_SET_CMD_LEN_DW;
923 return XY_FAST_COLOR_BLT_DW;
926 static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
927 u32 size, u32 pitch, bool is_vram)
929 if (has_service_copy_support(gt))
930 emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
932 emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
937 * xe_migrate_clear() - Clear content of a TTM resource.
938 * @m: The migration context.
939 * @bo: The buffer object @dst is currently bound to.
940 * @dst: The dst TTM resource to be cleared.
942 * Clear the contents of @dst to zero. On flat CCS devices,
943 * the CCS metadata is cleared to zero as well on VRAM destinations.
944 * TODO: Eliminate the @bo argument.
946 * Return: Pointer to a dma_fence representing the last clear batch, or
947 * an error pointer on failure. If there is a failure, any clear operation
948 * started by the function call has been synced.
950 struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
952 struct ttm_resource *dst)
954 bool clear_vram = mem_type_is_vram(dst->mem_type);
955 struct xe_gt *gt = m->tile->primary_gt;
956 struct xe_device *xe = gt_to_xe(gt);
957 bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) && !IS_DGFX(xe)) ? true : false;
958 struct dma_fence *fence = NULL;
960 struct xe_res_cursor src_it;
961 struct ttm_resource *src = dst;
966 xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it);
968 xe_res_first(src, 0, bo->size, &src_it);
975 struct xe_sched_job *job;
977 u32 batch_size, update_idx;
979 bool usm = xe->info.has_usm;
980 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
982 clear_L0 = xe_migrate_res_sizes(m, &src_it);
984 drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);
986 /* Calculate final sizes and batch size.. */
988 pte_update_size(m, clear_vram, src, &src_it,
989 &clear_L0, &clear_L0_ofs, &clear_L0_pt,
990 clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
993 if (xe_device_has_flat_ccs(xe))
994 batch_size += EMIT_COPY_CCS_DW;
998 if (WARN_ON_ONCE(!clear_L0))
1001 bb = xe_bb_new(gt, batch_size, usm);
1008 /* Preemption is enabled again by the ring ops. */
1009 if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
1010 xe_res_next(&src_it, clear_L0);
1012 emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
1015 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1016 update_idx = bb->len;
1018 if (!clear_system_ccs)
1019 emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);
1021 if (xe_device_has_flat_ccs(xe)) {
1022 emit_copy_ccs(gt, bb, clear_L0_ofs, true,
1023 m->cleared_mem_ofs, false, clear_L0);
1024 flush_flags = MI_FLUSH_DW_CCS;
1027 mutex_lock(&m->job_mutex);
1028 job = xe_bb_create_migration_job(m->q, bb,
1029 xe_migrate_batch_base(m, usm),
1036 xe_sched_job_add_migrate_flush(job, flush_flags);
1039 * There can't be anything userspace related at this
1040 * point, so we just need to respect any potential move
1041 * fences, which are always tracked as
1042 * DMA_RESV_USAGE_KERNEL.
1044 err = job_add_deps(job, bo->ttm.base.resv,
1045 DMA_RESV_USAGE_KERNEL);
1050 xe_sched_job_arm(job);
1051 dma_fence_put(fence);
1052 fence = dma_fence_get(&job->drm.s_fence->finished);
1053 xe_sched_job_push(job);
1055 dma_fence_put(m->fence);
1056 m->fence = dma_fence_get(fence);
1058 mutex_unlock(&m->job_mutex);
1060 xe_bb_free(bb, fence);
1064 xe_sched_job_put(job);
1066 mutex_unlock(&m->job_mutex);
1067 xe_bb_free(bb, NULL);
1069 /* Sync partial copies if any. FIXME: job_mutex? */
1071 dma_fence_wait(m->fence, false);
1072 dma_fence_put(fence);
1075 return ERR_PTR(err);
1078 if (clear_system_ccs)
1079 bo->ccs_cleared = true;
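/*
 * Minimal usage sketch for xe_migrate_clear() (illustrative only, not part of
 * the driver): a caller that wants the clear to complete synchronously could do
 *
 *	struct dma_fence *fence;
 *
 *	fence = xe_migrate_clear(m, bo, bo->ttm.resource);
 *	if (!IS_ERR(fence)) {
 *		dma_fence_wait(fence, false);
 *		dma_fence_put(fence);
 *	}
 */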
1084 static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
1085 const struct xe_vm_pgtable_update *update,
1086 struct xe_migrate_pt_update *pt_update)
1088 const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1090 u32 ofs = update->ofs, size = update->qwords;
1093 * If we have 512 entries (max), we would populate it ourselves,
1094 * and update the PDE above it to the new pointer.
1095 * The only time this can happen is if we have to update the top
1096 * PDE. This requires a BO that is almost vm->size big.
1098 * This shouldn't be possible in practice; it might change when 16K
1099 * pages are used. Hence the assert.
1101 xe_tile_assert(tile, update->qwords <= 0x1ff);
1103 ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
1104 xe_bo_addr(update->pt_bo, 0,
1108 u64 addr = ppgtt_ofs + ofs * 8;
1110 chunk = min(update->qwords, 0x1ffU);
1112 /* Ensure populatefn can do memset64 by aligning bb->cs */
1114 bb->cs[bb->len++] = MI_NOOP;
1116 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
1117 bb->cs[bb->len++] = lower_32_bits(addr);
1118 bb->cs[bb->len++] = upper_32_bits(addr);
1119 ops->populate(pt_update, tile, NULL, bb->cs + bb->len, ofs, chunk,
1122 bb->len += chunk * 2;
1128 struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
1130 return xe_vm_get(m->q->vm);
1133 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
1134 struct migrate_test_params {
1135 struct xe_test_priv base;
1139 #define to_migrate_test_params(_priv) \
1140 container_of(_priv, struct migrate_test_params, base)
1143 static struct dma_fence *
1144 xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
1145 struct xe_vm *vm, struct xe_bo *bo,
1146 const struct xe_vm_pgtable_update *updates,
1147 u32 num_updates, bool wait_vm,
1148 struct xe_migrate_pt_update *pt_update)
1150 XE_TEST_DECLARE(struct migrate_test_params *test =
1151 to_migrate_test_params
1152 (xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
1153 const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1154 struct dma_fence *fence;
1158 if (XE_TEST_ONLY(test && test->force_gpu))
1159 return ERR_PTR(-ETIME);
1161 if (bo && !dma_resv_test_signaled(bo->ttm.base.resv,
1162 DMA_RESV_USAGE_KERNEL))
1163 return ERR_PTR(-ETIME);
1165 if (wait_vm && !dma_resv_test_signaled(xe_vm_resv(vm),
1166 DMA_RESV_USAGE_BOOKKEEP))
1167 return ERR_PTR(-ETIME);
1169 if (ops->pre_commit) {
1170 pt_update->job = NULL;
1171 err = ops->pre_commit(pt_update);
1173 return ERR_PTR(err);
1175 for (i = 0; i < num_updates; i++) {
1176 const struct xe_vm_pgtable_update *update = &updates[i];
1178 ops->populate(pt_update, m->tile, &update->pt_bo->vmap, NULL,
1179 update->ofs, update->qwords, update);
1183 trace_xe_vm_cpu_bind(vm);
1184 xe_device_wmb(vm->xe);
1187 fence = dma_fence_get_stub();
1192 static bool no_in_syncs(struct xe_vm *vm, struct xe_exec_queue *q,
1193 struct xe_sync_entry *syncs, u32 num_syncs)
1195 struct dma_fence *fence;
1198 for (i = 0; i < num_syncs; i++) {
1199 fence = syncs[i].fence;
1201 if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
1206 fence = xe_exec_queue_last_fence_get(q, vm);
1207 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
1215 * xe_migrate_update_pgtables() - Pipelined page-table update
1216 * @m: The migrate context.
1217 * @vm: The vm we'll be updating.
1218 * @bo: The bo whose dma-resv we will await before updating, or NULL if userptr.
1219 * @q: The exec queue to be used for the update or NULL if the default
1220 * migration engine is to be used.
1221 * @updates: An array of update descriptors.
1222 * @num_updates: Number of descriptors in @updates.
1223 * @syncs: Array of xe_sync_entry to await before updating. Note that waits
1224 * will block the engine timeline.
1225 * @num_syncs: Number of entries in @syncs.
1226 * @pt_update: Pointer to a struct xe_migrate_pt_update, which contains
1227 * pointers to callback functions and, if subclassed, private arguments to
1230 * Perform a pipelined page-table update. The update descriptors are typically
1231 * built under the same lock critical section as a call to this function. If
1232 * using the default engine for the updates, they will be performed in the
1233 * order they grab the job_mutex. If different engines are used, external
1234 * synchronization is needed for overlapping updates to maintain page-table
1235 * consistency. Note that the meaning of "overlapping" is that the updates
1236 * touch the same page-table, which might be a higher-level page-directory.
1237 * If no pipelining is needed, then updates may be performed by the cpu.
1239 * Return: A dma_fence that, when signaled, indicates the update completion.
1242 xe_migrate_update_pgtables(struct xe_migrate *m,
1245 struct xe_exec_queue *q,
1246 const struct xe_vm_pgtable_update *updates,
1248 struct xe_sync_entry *syncs, u32 num_syncs,
1249 struct xe_migrate_pt_update *pt_update)
1251 const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1252 struct xe_tile *tile = m->tile;
1253 struct xe_gt *gt = tile->primary_gt;
1254 struct xe_device *xe = tile_to_xe(tile);
1255 struct xe_sched_job *job;
1256 struct dma_fence *fence;
1257 struct drm_suballoc *sa_bo = NULL;
1258 struct xe_vma *vma = pt_update->vma;
1260 u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0;
1263 bool usm = !q && xe->info.has_usm;
1264 bool first_munmap_rebind = vma &&
1265 vma->gpuva.flags & XE_VMA_FIRST_REBIND;
1266 struct xe_exec_queue *q_override = !q ? m->q : q;
1267 u16 pat_index = xe->pat.idx[XE_CACHE_WB];
1269 /* Use the CPU if no in syncs and engine is idle */
1270 if (no_in_syncs(vm, q, syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) {
1271 fence = xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
1273 first_munmap_rebind,
1275 if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN))
1279 /* fixed + PTE entries */
1283 batch_size = 6 + num_updates * 2;
1285 for (i = 0; i < num_updates; i++) {
1286 u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, 0x1ff);
1288 /* align noop + MI_STORE_DATA_IMM cmd prefix */
1289 batch_size += 4 * num_cmds + updates[i].qwords * 2;
1293 * XXX: Create temp bo to copy from, if batch_size becomes too big?
1295 * Worst case: Sum(2 * (each lower level page size) + (top level page size))
1296 * Should be reasonably bounded.
1298 xe_tile_assert(tile, batch_size < SZ_128K);
1300 bb = xe_bb_new(gt, batch_size, !q && xe->info.has_usm);
1302 return ERR_CAST(bb);
1304 /* For sysmem PTEs, we need to map them in our hole. */
1306 ppgtt_ofs = NUM_KERNEL_PDE - 1;
1308 xe_tile_assert(tile, num_updates <= NUM_VMUSA_WRITES_PER_UNIT);
1310 sa_bo = drm_suballoc_new(&m->vm_update_sa, 1,
1311 GFP_KERNEL, true, 0);
1312 if (IS_ERR(sa_bo)) {
1313 err = PTR_ERR(sa_bo);
1317 ppgtt_ofs = NUM_KERNEL_PDE +
1318 (drm_suballoc_soffset(sa_bo) /
1319 NUM_VMUSA_UNIT_PER_PAGE);
1320 page_ofs = (drm_suballoc_soffset(sa_bo) %
1321 NUM_VMUSA_UNIT_PER_PAGE) *
1322 VM_SA_UPDATE_UNIT_SIZE;
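/*
 * Illustrative example: a suballocation at offset 40 yields
 * ppgtt_ofs == 17 + 40 / 32 == 18 and page_ofs == (40 % 32) * 128 == 1024,
 * so the PTE qwords land at ppgtt_ofs * XE_PAGE_SIZE + page_ofs ==
 * 18 * 4 KiB + 1 KiB in the migrate VM.
 */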
1325 /* Map our PTs to the GTT */
1326 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(num_updates);
1327 bb->cs[bb->len++] = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
1328 bb->cs[bb->len++] = 0; /* upper_32_bits */
1330 for (i = 0; i < num_updates; i++) {
1331 struct xe_bo *pt_bo = updates[i].pt_bo;
1333 xe_tile_assert(tile, pt_bo->size == SZ_4K);
1335 addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, pat_index, 0);
1336 bb->cs[bb->len++] = lower_32_bits(addr);
1337 bb->cs[bb->len++] = upper_32_bits(addr);
1340 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1341 update_idx = bb->len;
1343 addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
1344 (page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
1345 for (i = 0; i < num_updates; i++)
1346 write_pgtable(tile, bb, addr + i * XE_PAGE_SIZE,
1347 &updates[i], pt_update);
1349 /* phys pages, no preamble required */
1350 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1351 update_idx = bb->len;
1353 for (i = 0; i < num_updates; i++)
1354 write_pgtable(tile, bb, 0, &updates[i], pt_update);
1358 mutex_lock(&m->job_mutex);
1360 job = xe_bb_create_migration_job(q ?: m->q, bb,
1361 xe_migrate_batch_base(m, usm),
1368 /* Wait on BO move */
1370 err = job_add_deps(job, bo->ttm.base.resv,
1371 DMA_RESV_USAGE_KERNEL);
1377 * Munmap style VM unbind, need to wait for all jobs to be complete /
1378 * trigger preempts before moving forward
1380 if (first_munmap_rebind) {
1381 err = job_add_deps(job, xe_vm_resv(vm),
1382 DMA_RESV_USAGE_BOOKKEEP);
1387 err = xe_sched_job_last_fence_add_dep(job, vm);
1388 for (i = 0; !err && i < num_syncs; i++)
1389 err = xe_sync_entry_add_deps(&syncs[i], job);
1394 if (ops->pre_commit) {
1395 pt_update->job = job;
1396 err = ops->pre_commit(pt_update);
1400 xe_sched_job_arm(job);
1401 fence = dma_fence_get(&job->drm.s_fence->finished);
1402 xe_sched_job_push(job);
1405 mutex_unlock(&m->job_mutex);
1407 xe_bb_free(bb, fence);
1408 drm_suballoc_free(sa_bo, fence);
1413 xe_sched_job_put(job);
1416 mutex_unlock(&m->job_mutex);
1417 xe_bb_free(bb, NULL);
1419 drm_suballoc_free(sa_bo, NULL);
1420 return ERR_PTR(err);
1424 * xe_migrate_wait() - Complete all operations using the xe_migrate context
1425 * @m: Migrate context to wait for.
1427 * Waits until the GPU no longer uses the migrate context's default engine
1428 * or its page-table objects. FIXME: What about separate page-table update engines?
1431 void xe_migrate_wait(struct xe_migrate *m)
1434 dma_fence_wait(m->fence, false);
1437 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
1438 #include "tests/xe_migrate.c"