Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiment with very basic barrier tracker for debug purposes. #2055

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/vkd3d.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ extern "C" {
#define VKD3D_CONFIG_FLAG_DRIVER_VERSION_SENSITIVE_SHADERS (1ull << 48)
#define VKD3D_CONFIG_FLAG_SMALL_VRAM_REBAR (1ull << 49)
#define VKD3D_CONFIG_FLAG_NO_STAGGERED_SUBMIT (1ull << 50)
#define VKD3D_CONFIG_FLAG_CLEAR_UAV_SYNC (1ull << 51)
#define VKD3D_CONFIG_FLAG_COPY_BUFFER_SYNC (1ull << 52)

struct vkd3d_instance;

Expand Down
16 changes: 9 additions & 7 deletions include/vkd3d_shader.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,9 @@ enum vkd3d_shader_meta_flags
VKD3D_SHADER_META_FLAG_EXPORTS_SAMPLE_MASK = 1 << 17,
VKD3D_SHADER_META_FLAG_FORCE_PRE_RASTERIZATION_BEFORE_DISPATCH = 1 << 18,
VKD3D_SHADER_META_FLAG_FORCE_GRAPHICS_BEFORE_DISPATCH = 1 << 19,
VKD3D_SHADER_META_FLAG_USES_DEPTH_STENCIL_WRITE = 1 << 20,
VKD3D_SHADER_META_FLAG_DISABLE_OPTIMIZATIONS = 1 << 21
VKD3D_SHADER_META_FLAG_FORCE_COMPUTE_BARRIER_BEFORE_DISPATCH = 1 << 20,
VKD3D_SHADER_META_FLAG_USES_DEPTH_STENCIL_WRITE = 1 << 21,
VKD3D_SHADER_META_FLAG_DISABLE_OPTIMIZATIONS = 1 << 22
};

struct vkd3d_shader_meta
Expand Down Expand Up @@ -432,19 +433,20 @@ enum vkd3d_shader_quirk
/* Driver workarounds. Force loops to not be unrolled with SPIR-V control masks. */
VKD3D_SHADER_QUIRK_FORCE_LOOP = (1 << 13),

/* Requests META_FLAG_FORCE_COMPUTE_BARRIER_AFTER_DISPATCH to be set in shader meta. */
/* Requests META_FLAG_FORCE_COMPUTE_BARRIER_{AFTER,BEFORE}_DISPATCH to be set in shader meta. */
VKD3D_SHADER_QUIRK_FORCE_COMPUTE_BARRIER = (1 << 14),
VKD3D_SHADER_QUIRK_FORCE_PRE_COMPUTE_BARRIER = (1 << 15),

/* Range check every descriptor heap access with dynamic index and robustness check it. */
VKD3D_SHADER_QUIRK_DESCRIPTOR_HEAP_ROBUSTNESS = (1 << 15),
VKD3D_SHADER_QUIRK_DESCRIPTOR_HEAP_ROBUSTNESS = (1 << 16),

/* Requests META_FLAG_FORCE_PRE_RASTERIZATION_BEFORE_DISPATCH to be set in shader meta. */
VKD3D_SHADER_QUIRK_FORCE_PRE_RASTERIZATION_BARRIER = (1 << 16),
VKD3D_SHADER_QUIRK_FORCE_PRE_RASTERIZATION_BARRIER = (1 << 17),
/* Requests META_FLAG_FORCE_GRAPHICS_BEFORE_DISPATCH to be set in shader meta. */
VKD3D_SHADER_QUIRK_FORCE_GRAPHICS_BARRIER = (1 << 17),
VKD3D_SHADER_QUIRK_FORCE_GRAPHICS_BARRIER = (1 << 18),

/* VK_PIPELINE_CREATE_DISABLE_OPTIMIZATIONS. For driver workarounds where optimizations break stuff. */
VKD3D_SHADER_QUIRK_DISABLE_OPTIMIZATIONS = (1 << 18)
VKD3D_SHADER_QUIRK_DISABLE_OPTIMIZATIONS = (1 << 19)
};

struct vkd3d_shader_quirk_hash
Expand Down
2 changes: 2 additions & 0 deletions libs/vkd3d-shader/dxil.c
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,8 @@ int vkd3d_shader_compile_dxil_export(const struct vkd3d_shader_code *dxil,
quirks = vkd3d_shader_compile_arguments_select_quirks(compiler_args, hash);
if (quirks & VKD3D_SHADER_QUIRK_FORCE_COMPUTE_BARRIER)
spirv->meta.flags |= VKD3D_SHADER_META_FLAG_FORCE_COMPUTE_BARRIER_AFTER_DISPATCH;
if (quirks & VKD3D_SHADER_QUIRK_FORCE_PRE_COMPUTE_BARRIER)
spirv->meta.flags |= VKD3D_SHADER_META_FLAG_FORCE_COMPUTE_BARRIER_BEFORE_DISPATCH;
if (quirks & VKD3D_SHADER_QUIRK_FORCE_PRE_RASTERIZATION_BARRIER)
spirv->meta.flags |= VKD3D_SHADER_META_FLAG_FORCE_PRE_RASTERIZATION_BEFORE_DISPATCH;
if (quirks & VKD3D_SHADER_QUIRK_FORCE_GRAPHICS_BARRIER)
Expand Down
10 changes: 10 additions & 0 deletions libs/vkd3d/acceleration_structure.c
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ static void vkd3d_acceleration_structure_end_barrier(struct d3d12_command_list *
dep_info.pMemoryBarriers = &barrier;

VK_CALL(vkCmdPipelineBarrier2(list->cmd.vk_command_buffer, &dep_info));
d3d12_command_list_debug_mark_barrier(list, &dep_info);
}

static void vkd3d_acceleration_structure_write_postbuild_info(
Expand Down Expand Up @@ -326,6 +327,7 @@ static void vkd3d_acceleration_structure_write_postbuild_info(
* For now, just clear to 0. */
VK_CALL(vkCmdFillBuffer(list->cmd.vk_command_buffer, vk_buffer, offset,
sizeof(uint64_t), 0));
d3d12_command_list_debug_mark_execution(list, VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT);
return;
}

Expand All @@ -344,6 +346,7 @@ static void vkd3d_acceleration_structure_write_postbuild_info(
vk_query_pool, vk_query_index, 1,
vk_buffer, offset, stride,
VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT));
d3d12_command_list_debug_mark_execution(list, VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT);

if (desc->InfoType == D3D12_RAYTRACING_ACCELERATION_STRUCTURE_POSTBUILD_INFO_SERIALIZATION)
{
Expand All @@ -366,12 +369,14 @@ static void vkd3d_acceleration_structure_write_postbuild_info(
vk_query_pool, vk_query_index, 1,
vk_buffer, offset + sizeof(uint64_t), stride,
VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT));
d3d12_command_list_debug_mark_execution(list, VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT);
}
else
{
FIXME("NumBottomLevelPointers will always return 0.\n");
VK_CALL(vkCmdFillBuffer(list->cmd.vk_command_buffer, vk_buffer, offset + sizeof(uint64_t),
sizeof(uint64_t), 0));
d3d12_command_list_debug_mark_execution(list, VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT);
}
}
}
Expand Down Expand Up @@ -402,6 +407,7 @@ void vkd3d_acceleration_structure_emit_postbuild_info(
dep_info.pMemoryBarriers = &barrier;

VK_CALL(vkCmdPipelineBarrier2(list->cmd.vk_command_buffer, &dep_info));
d3d12_command_list_debug_mark_barrier(list, &dep_info);

stride = desc->InfoType == D3D12_RAYTRACING_ACCELERATION_STRUCTURE_POSTBUILD_INFO_SERIALIZATION ?
2 * sizeof(uint64_t) : sizeof(uint64_t);
Expand Down Expand Up @@ -449,6 +455,7 @@ void vkd3d_acceleration_structure_emit_immediate_postbuild_info(
dep_info.pMemoryBarriers = &barrier;

VK_CALL(vkCmdPipelineBarrier2(list->cmd.vk_command_buffer, &dep_info));
d3d12_command_list_debug_mark_barrier(list, &dep_info);

/* Could optimize a bit by batching more aggressively, but no idea if it's going to help in practice. */
for (i = 0; i < count; i++)
Expand Down Expand Up @@ -503,5 +510,8 @@ void vkd3d_acceleration_structure_copy(
info.dst = dst_as;
info.src = src_as;
if (convert_copy_mode(mode, &info.mode))
{
VK_CALL(vkCmdCopyAccelerationStructureKHR(list->cmd.vk_command_buffer, &info));
d3d12_command_list_debug_mark_execution(list, VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT);
}
}
213 changes: 207 additions & 6 deletions libs/vkd3d/breadcrumbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -867,7 +867,10 @@ void vkd3d_breadcrumb_tracer_update_barrier_hashes(struct vkd3d_breadcrumb_trace
while (*end_ptr != '\0' && !isalnum(*end_ptr))
end_ptr++;

hi_hash = strtoull(end_ptr, NULL, 16);
hi_hash = strtoull(end_ptr, &end_ptr, 16);

while (*end_ptr != '\0' && !isalpha(*end_ptr))
end_ptr++;

if (!hi_hash)
hi_hash = lo_hash;
Expand All @@ -879,6 +882,48 @@ void vkd3d_breadcrumb_tracer_update_barrier_hashes(struct vkd3d_breadcrumb_trace

tracer->barrier_hashes[new_count].lo = lo_hash;
tracer->barrier_hashes[new_count].hi = hi_hash;

if (*end_ptr != '\0')
{
char *stray_newline = end_ptr + (strlen(end_ptr) - 1);
if (*stray_newline == '\n')
*stray_newline = '\0';
}

if (*end_ptr == '\0')
{
tracer->barrier_hashes[new_count].shader_meta_flags =
VKD3D_SHADER_META_FLAG_FORCE_COMPUTE_BARRIER_BEFORE_DISPATCH;
end_ptr = "post-compute (default)";
}
else if (strcmp(end_ptr, "pre-compute") == 0)
{
tracer->barrier_hashes[new_count].shader_meta_flags =
VKD3D_SHADER_META_FLAG_FORCE_COMPUTE_BARRIER_BEFORE_DISPATCH;
}
else if (strcmp(end_ptr, "post-compute") == 0)
{
tracer->barrier_hashes[new_count].shader_meta_flags =
VKD3D_SHADER_META_FLAG_FORCE_COMPUTE_BARRIER_AFTER_DISPATCH;
}
else if (strcmp(end_ptr, "pre-raster") == 0)
{
tracer->barrier_hashes[new_count].shader_meta_flags =
VKD3D_SHADER_META_FLAG_FORCE_PRE_RASTERIZATION_BEFORE_DISPATCH;
}
else if (strcmp(end_ptr, "graphics") == 0)
{
tracer->barrier_hashes[new_count].shader_meta_flags =
VKD3D_SHADER_META_FLAG_FORCE_GRAPHICS_BEFORE_DISPATCH;
}
else
end_ptr = "N/A";

INFO("Inserting %s barrier for %016"PRIx64" - %016"PRIx64".\n",
end_ptr,
tracer->barrier_hashes[new_count].lo,
tracer->barrier_hashes[new_count].hi);

new_count++;
}
}
Expand All @@ -891,19 +936,175 @@ void vkd3d_breadcrumb_tracer_update_barrier_hashes(struct vkd3d_breadcrumb_trace
}
}

bool vkd3d_breadcrumb_tracer_shader_hash_forces_barrier(struct vkd3d_breadcrumb_tracer *tracer, vkd3d_shader_hash_t hash)
uint32_t vkd3d_breadcrumb_tracer_shader_hash_forces_barrier(struct vkd3d_breadcrumb_tracer *tracer,
vkd3d_shader_hash_t hash)
{
bool ret = false;
uint32_t flags = 0;
size_t i;

/* Avoid taking lock every dispatch when we're not explicitly using the feature.
* Ordering is not relevant, since if we decide to look at hashes, we take full locks anyway. */
if (vkd3d_atomic_uint32_load_explicit(&tracer->barrier_hashes_count, vkd3d_memory_order_relaxed) != 0)
{
pthread_mutex_lock(&tracer->barrier_hash_lock);
for (i = 0; i < tracer->barrier_hashes_count && !ret; i++)
ret = tracer->barrier_hashes[i].lo <= hash && hash <= tracer->barrier_hashes[i].hi;
for (i = 0; i < tracer->barrier_hashes_count; i++)
if (tracer->barrier_hashes[i].lo <= hash && hash <= tracer->barrier_hashes[i].hi)
flags |= tracer->barrier_hashes[i].shader_meta_flags;
pthread_mutex_unlock(&tracer->barrier_hash_lock);
}
return ret;
return flags;
}

/* Debug-only barrier tracker hook, called when recording a command that executes
 * GPU work covering the given sync2 stages. If any of those stages have not been
 * synchronized against since the last tracked work, emits a debug label and
 * breadcrumb tags flagging a potential missing-barrier hazard, then records the
 * stages as pending execution for subsequent barrier tracking. */
void d3d12_command_list_debug_mark_execution(struct d3d12_command_list *list, VkPipelineStageFlags2 stages)
{
    if ((stages & list->cmd.stages_synced) != stages)
    {
        VkPipelineStageFlags2 unsynced_stages = stages & ~list->cmd.stages_synced;
        bool assume_synchronized = false;
        char buf[256];

        /* Graphics -> Graphics is implicitly synchronized, and we assume that Graphics will never write Indirect data. */
        assume_synchronized =
                list->cmd.stages_pending_execution == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT &&
                (list->cmd.stages_synced & (VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT)) ==
                        list->cmd.stages_synced;

        if (list->state)
        {
            /* Use the sync2 constant here for consistency with the rest of this
             * tracker; the legacy VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT has the
             * same value, so behavior is identical. */
            if ((list->cmd.stages_pending_execution & VK_PIPELINE_STAGE_2_CLEAR_BIT) &&
                    (stages & VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT))
            {
                INFO("Potential UAV clear -> compute hazard for #%"PRIx64" (%s).\n",
                        list->state->compute.code.meta.hash,
                        list->state->compute.code_debug.debug_entry_point_name ?
                                list->state->compute.code_debug.debug_entry_point_name : "N/A");
            }
#if 0
            else if ((list->cmd.stages_pending_execution & VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT) &&
                    (stages & VK_PIPELINE_STAGE_2_CLEAR_BIT))
            {
                INFO("Potential compute -> UAV clear hazard for #%"PRIx64" (%s).\n", list->state->compute.code.meta.hash,
                        list->state->compute.code_debug.debug_entry_point_name ?
                                list->state->compute.code_debug.debug_entry_point_name : "N/A");
            }
#endif
        }

        if (!assume_synchronized)
        {
            if (list->cmd.stages_pending_execution != 0)
            {
                if (list->cmd.last_active_pipeline == list->current_pipeline)
                {
                    /* Back-to-back compute is less likely to be a real hazard since it's probably just
                     * doing a bunch of parallel dispatches over a data set or something like that. */
                    snprintf(buf, sizeof(buf), "Back-to-back (pending #%"PRIx64", unsynced #%"PRIx64")",
                            list->cmd.stages_pending_execution, unsynced_stages);
                    d3d12_command_list_debug_mark_label(list, buf, 0.7f, 0.7f, 1.0f, 1.0f);
                }
                else
                {
                    /* Changing compute pipeline without a barrier is somewhat suspicious. */
                    snprintf(buf, sizeof(buf), "Potential hazard (pending #%"PRIx64", unsynced #%"PRIx64")",
                            list->cmd.stages_pending_execution, unsynced_stages);
                    d3d12_command_list_debug_mark_label(list, buf, 1.0f, 0.7f, 0.7f, 1.0f);
                }
            }
            else
            {
                /* First command of the command buffer. A potential hazard between command lists, but very unlikely. */
                snprintf(buf, sizeof(buf), "Potential cross-cmdlist hazard (unsynced #%"PRIx64")",
                        unsynced_stages);
                d3d12_command_list_debug_mark_label(list, buf, 0.7f, 1.0f, 0.7f, 1.0f);
            }
            VKD3D_BREADCRUMB_TAG("Potential hazard [pending, unsynced stages]");
            VKD3D_BREADCRUMB_AUX64(list->cmd.stages_pending_execution);
            VKD3D_BREADCRUMB_AUX64(unsynced_stages);
        }
    }

    /* Only plain dispatches and draws update the pipeline used for the
     * back-to-back heuristic above. */
    if (stages == VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT || stages == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT)
        list->cmd.last_active_pipeline = list->current_pipeline;

    list->cmd.stages_pending_execution |= stages;
    if (stages != VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT)
    {
        list->cmd.stages_synced = 0;
    }
    else
    {
        /* Graphics -> Graphics is fine. Similarly, Graphics -> Indirect is fine.
         * We only expect that non-graphics work will write to indirect data.
         * Otherwise, we will get false positive spam for every indirect command. */
        list->cmd.stages_synced &= ~(VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT);
    }
}

/* Widens a barrier stage mask into the coarse stage classes tracked by the
 * debug barrier tracker. ALL_COMMANDS expands to every tracked class; any
 * fine-grained graphics/transfer stage pulls in ALL_GRAPHICS/ALL_TRANSFER;
 * compute implies CLEAR since UAV clears are tracked via CLEAR_BIT. */
static VkPipelineStageFlags2 d3d12_command_list_debug_widen_stages(VkPipelineStageFlags2 stages)
{
    static const VkPipelineStageFlags2 graphics_stages =
            VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
            VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
            VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT;

    /* Reserve CLEAR_BIT for UAV clears since they are a bit special. We don't use that for other things. */
    static const VkPipelineStageFlags2 transfer_stages =
            VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT;

    if (stages & VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)
    {
        stages =
                VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
                VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT;
    }
    if (stages & graphics_stages)
        stages |= VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT;
    if (stages & transfer_stages)
        stages |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
    if (stages & VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)
        stages |= VK_PIPELINE_STAGE_2_CLEAR_BIT;

    return stages;
}

/* Debug-only barrier tracker hook, called when recording a pipeline barrier.
 * Accumulates the source/destination stage masks of the barrier, retires
 * pending execution covered by the source stages, and marks destination
 * stages as synchronized only once nothing remains pending. Emits a debug
 * label when unsynchronized work is still outstanding after the barrier. */
void d3d12_command_list_debug_mark_barrier(struct d3d12_command_list *list, const VkDependencyInfo *deps)
{
    VkPipelineStageFlags2 src_stages = 0;
    VkPipelineStageFlags2 dst_stages = 0;
    uint32_t i;

    /* NOTE(review): only memory and image barriers contribute to the masks;
     * pBufferMemoryBarriers are not considered — confirm this is intentional
     * (vkd3d-proton may never emit buffer barriers on this path). */
    for (i = 0; i < deps->memoryBarrierCount; i++)
    {
        src_stages |= deps->pMemoryBarriers[i].srcStageMask;
        dst_stages |= deps->pMemoryBarriers[i].dstStageMask;
    }

    for (i = 0; i < deps->imageMemoryBarrierCount; i++)
    {
        src_stages |= deps->pImageMemoryBarriers[i].srcStageMask;
        dst_stages |= deps->pImageMemoryBarriers[i].dstStageMask;
    }

    /* Source and destination masks get identical widening treatment. */
    src_stages = d3d12_command_list_debug_widen_stages(src_stages);
    dst_stages = d3d12_command_list_debug_widen_stages(dst_stages);

    list->cmd.stages_pending_execution &= ~src_stages;
    /* The only way to be sure is that all previous work has been synchronized in some way. */
    if (list->cmd.stages_pending_execution == 0)
    {
        list->cmd.stages_synced |= dst_stages;
    }
    else
    {
        /* If we fail to mark stages as synced, report it here. */
        char buf[256];
        snprintf(buf, sizeof(buf), "Still unsynchronized stages (pending #%"PRIx64")",
                list->cmd.stages_pending_execution);
        d3d12_command_list_debug_mark_label(list, buf, 0.2f, 0.7f, 0.7f, 1.0f);
    }
}
Loading