From ea18a680b9fe3d82deddfdd677dab516ab2eb8d4 Mon Sep 17 00:00:00 2001
From: Banit Agrawal
Date: Wed, 4 Oct 2023 14:40:15 -0700
Subject: [PATCH] Use 4k page instead of 2M for managed tensor (#2058)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2058

This diff changes the page size from 2M to 4k for prefaulting/mapping the pages.

Reviewed By: q10, jasonjk-park, zyan0, jianyuh

Differential Revision: D49924136

fbshipit-source-id: fdee08b9a4da54dce902c98ee3aae62ac0d3ad6c
---
 fbgemm_gpu/src/cumem_utils.cu | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fbgemm_gpu/src/cumem_utils.cu b/fbgemm_gpu/src/cumem_utils.cu
index 9f7ecc308..ce04a34cf 100644
--- a/fbgemm_gpu/src/cumem_utils.cu
+++ b/fbgemm_gpu/src/cumem_utils.cu
@@ -224,11 +224,9 @@ Tensor new_host_mapped_tensor(
   // can minimize the cost while holding this global lock.
   void* const ptr = malloc(size_bytes);
 
-  // advise the kernel to allocate large 2M pages
-  madvise(ptr, size_bytes, MADV_HUGEPAGE);
-
-  // pre-fault/map the pages by setting the first byte of the page
-  size_t pageSize = (1 << 21);
+  // Pre-fault/map the pages by setting the first byte of the page
+  // TODO: parallelize the mapping of pages with a threadpool executor
+  const size_t pageSize = (size_t)sysconf(_SC_PAGESIZE);
   uintptr_t alignedPtr = (((uintptr_t)ptr + pageSize - 1) & ~(pageSize - 1));
   for (uintptr_t p = alignedPtr; p < ((uintptr_t)ptr + size_bytes);
        p += pageSize) {