Fix memory callback coverage #1163

lacraig2 · 2022-01-11T18:37:43Z

What this PR does:

Makes the decision for memory callback a run-time decision instead of a translate time decision
Removes translate-time installation of qemu_ld_helpers_panda based on panda_use_memcb
Moves original helper_[endian]_[ld/st]_name functions into the inline static functions of the form helper_[endian]_[ld/st]_name_inner (and changes internal calls to this form as well)
Replaces helper_[endian]_[ld/st]_name functions with PANDA equivalent that checks panda_use_memcb at runtime
Keeps helper_[endian]_[ld/st]_name_panda functions for compatibility.
Adds panda memory callbacks to fast case of cpu_ldst_template.h

The issue:

See #1119 for context.

A bug was reported that indicated that the x86 instruction cmpxchg (32 or 64-bit versions) did not generate memory callbacks. The examination in the PR is somewhat different from my analysis. First, the relevant functions are not:

void helper_cmpxchg8b(CPUX86State *env, target_ulong a0)
void helper_cmpxchg16b(CPUX86State *env, target_ulong a0)

because when generating for single core we generate _unlocked versions.

panda/target/i386/translate.c

Lines 5269 to 5273 in 0c15599

    
           if ((s->prefix & PREFIX_LOCK) && parallel_cpus) { 
        
               gen_helper_cmpxchg8b(cpu_env, cpu_A0); 
        
           } else { 
        
               gen_helper_cmpxchg8b_unlocked(cpu_env, cpu_A0); 
        
           }

The unlocked versions use cpu_ldst functions like @AndrewFasano mentioned.

panda/target/i386/mem_helper.c

Lines 28 to 52 in 0c15599

    
           void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0) 
        
           { 
        
               uintptr_t ra = GETPC(); 
        
               uint64_t oldv, cmpv, newv; 
        
               int eflags; 
        
               eflags = cpu_cc_compute_all(env, CC_OP); 
        
               cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]); 
        
               newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]); 
        
               oldv = cpu_ldq_data_ra(env, a0, ra); 
        
               newv = (cmpv == oldv ? newv : oldv); 
        
               /* always do the store */ 
        
               cpu_stq_data_ra(env, a0, newv, ra); 
        
               if (oldv == cmpv) { 
        
                   eflags |= CC_Z; 
        
               } else { 
        
                   env->regs[R_EAX] = (uint32_t)oldv; 
        
                   env->regs[R_EDX] = (uint32_t)(oldv >> 32); 
        
                   eflags &= ~CC_Z; 
        
               } 
        
               CC_SRC = eflags; 
        
           }

It seems to me that there are two problems:

The first is that functions of the form helper_[endian]_st[size]_mmu_panda are only replaced here

panda/tcg/i386/tcg-target.inc.c

Lines 1223 to 1242 in d2ee3d1

    
           static void * const qemu_ld_helpers_normal[16] = { 
        
               [MO_UB]   = helper_ret_ldub_mmu, 
        
               [MO_LEUW] = helper_le_lduw_mmu, 
        
               [MO_LEUL] = helper_le_ldul_mmu, 
        
               [MO_LEQ]  = helper_le_ldq_mmu, 
        
               [MO_BEUW] = helper_be_lduw_mmu, 
        
               [MO_BEUL] = helper_be_ldul_mmu, 
        
               [MO_BEQ]  = helper_be_ldq_mmu, 
        
           }; 
        
           static void * const qemu_ld_helpers_panda[16] = { 
        
               [MO_UB]   = helper_ret_ldub_mmu_panda, 
        
               [MO_LEUW] = helper_le_lduw_mmu_panda, 
        
               [MO_LEUL] = helper_le_ldul_mmu_panda, 
        
               [MO_LEQ]  = helper_le_ldq_mmu_panda, 
        
               [MO_BEUW] = helper_be_lduw_mmu_panda, 
        
               [MO_BEUL] = helper_be_ldul_mmu_panda, 
        
               [MO_BEQ]  = helper_be_ldq_mmu_panda, 
        
           }; 
        
           #define qemu_ld_helpers \ 
        
               (panda_use_memcb ? qemu_ld_helpers_panda : qemu_ld_helpers_normal)

which leaves it open to be called by other APIs, which misses our callback infrastructure. Such is the case for the cpu_ldst framework. https://github.com/panda-re/panda/blob/dev/include/exec/cpu_ldst_template.h#L102-L103

The second is that within the cpu_ldst framework we do not cover the case where ld[size]_p is called directly on translated memory. https://github.com/panda-re/panda/blob/dev/include/exec/cpu_ldst_template.h#L105-L106

Advantages

This PR allows for memory callbacks to be truly dynamically enabled/disabled. Prior to this the decision was made at translate-time. If you wanted to truly disable mem callbacks you'd need to clear the TB cache. This PR changes the decision to runtime.

This PR allows PANDA to cover a wider range of data accesses on x86 that were otherwise missed. This includes the cpu_ldst functions, but also includes direct calls from other instruction such as:

FLDT:

panda/target/i386/fpu_helper.c

Lines 72 to 80 in 6ed3f19

    
           static inline floatx80 helper_fldt(CPUX86State *env, target_ulong ptr, 
        
                                              uintptr_t retaddr) 
        
           { 
        
               CPU_LDoubleU temp; 
        
               temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr); 
        
               temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr); 
        
               return temp.d; 
        
           }

BNDLDX:

panda/target/i386/mpx_helper.c

Lines 138 to 148 in 6ed3f19

    
           void helper_bndstx64(CPUX86State *env, target_ulong base, target_ulong ptr, 
        
                                uint64_t lb, uint64_t ub) 
        
           { 
        
               uintptr_t ra = GETPC(); 
        
               uint64_t bte; 
        
               bte = lookup_bte64(env, base, ra); 
        
               cpu_stq_data_ra(env, bte, lb, ra); 
        
               cpu_stq_data_ra(env, bte + 8, ub, ra); 
        
               cpu_stq_data_ra(env, bte + 16, ptr, ra); 
        
           }

BNDSTX:

panda/target/i386/mpx_helper.c

Lines 105 to 120 in 6ed3f19

    
           uint64_t helper_bndldx64(CPUX86State *env, target_ulong base, target_ulong ptr) 
        
           { 
        
               uintptr_t ra = GETPC(); 
        
               uint64_t bte, lb, ub, pt; 
        
               bte = lookup_bte64(env, base, ra); 
        
               lb = cpu_ldq_data_ra(env, bte, ra); 
        
               ub = cpu_ldq_data_ra(env, bte + 8, ra); 
        
               pt = cpu_ldq_data_ra(env, bte + 16, ra); 
        
               if (pt != ptr) { 
        
                   lb = ub = 0; 
        
               } 
        
               env->mmx_t0.MMX_Q(0) = ub; 
        
               return lb; 
        
           }

Additionally, there are likely benefits to other architectures. There are several examples of other architectures using this style. This one is from PPC:

panda/target/ppc/mem_helper.c

Lines 56 to 66 in 6ed3f19

    
           void helper_lmw(CPUPPCState *env, target_ulong addr, uint32_t reg) 
        
           { 
        
               for (; reg < 32; reg++) { 
        
                   if (needs_byteswap(env)) { 
        
                       env->gpr[reg] = bswap32(cpu_ldl_data_ra(env, addr, GETPC())); 
        
                   } else { 
        
                       env->gpr[reg] = cpu_ldl_data_ra(env, addr, GETPC()); 
        
                   } 
        
                   addr = addr_add(env, addr, 4); 
        
               } 
        
           }

Potential issues

This PR moves the decision to use or not use a memory callback to runtime. That is at least one additional resolution of panda_use_memcb for every memory access. The function makes the case for not using memory callbacks as fast as possible. It inlines the internal version, but it's still a change that could impact performance.

panda/softmmu_template.h

Lines 536 to 543 in 6ed3f19

    
           void helper_be_st_name(CPUArchState *env, target_ulong addr, 
        
                                                DATA_TYPE val, TCGMemOpIdx oi, 
        
                                                uintptr_t retaddr){ 
        
               if (likely(!panda_use_memcb)){ 
        
                   glue(helper_be_st_name,_internal)(env, addr, val, oi, retaddr); 
        
                   return; 
        
               } 
        
               unsigned mmu_idx = get_mmuidx(oi);

Design decisions

Instead of replacing helper_[endian]_[ld/st]_name with their PANDA equivalents and moving the old to the _internal versions I could have instead changed the cpu_[ld/st][size]_data_ret functions to use the _panda versions based on the status of panda_use_memcb. I chose not to because I think covering the default case of helper_[endian]_[ld/st]_name with the PANDA version covers far more cases we may have not considered. I also wanted the ability to dynamically enable/disable memory callbacks. If it turns out that that is hugely detrimental to performance I'm happy to consider other options.

Coverage decisions

This PR changes the PANDA model to cover virtual memory read/writes from all system mmu modes. This means every time data is written/read in a helper it will be recorded. Examples include #1179 and #1119. However, it also includes things not previously considered part of the model such as reads made by the FPU, register store/restore, segment reads, and SIMD.

Performance implications

Does this PR make a difference compare to two systems with memcb disabled?
The performance difference here is almost indistinguishable.
Does this PR make a difference compare to two systems with memcb enabled?
The relatively minimal performance impact here seems to result from additional reads/writes being hit.
In a relatively short recording on x86 there were about 1500 more calls out of around 7.3 million total calls.
In an ARM recording there were exactly the same number of calls. ARM doesn't seem to use these calls.

CLOSES: #1119

pcworld · 2022-02-08T11:28:32Z

Although I am not deep enough into QEMU internals to give a thorough code review, I can confirm that your PR fixes my cmpxchg16b use case (many thanks!).

lacraig2 requested review from moyix and tleek January 11, 2022 18:38

lacraig2 mentioned this pull request Feb 23, 2022

memory callbacks missing on MIPS #1179

Open

lacraig2 linked an issue Feb 23, 2022 that may be closed by this pull request

memory callbacks missing on MIPS #1179

Open

lacraig2 force-pushed the softmmu_rethink branch from 8431d2d to 3747f31 Compare March 9, 2022 16:08

lacraig2 marked this pull request as ready for review March 16, 2022 16:04

lacraig2 force-pushed the softmmu_rethink branch 3 times, most recently from d2a557d to fcd9ff7 Compare March 23, 2022 19:00

change softmmu model

d77210d

lacraig2 force-pushed the softmmu_rethink branch from fcd9ff7 to d77210d Compare March 24, 2022 03:04

fix up extern def for pypanda

d8de219

lacraig2 added bug bugfix and removed bug labels Jun 9, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fix memory callback coverage #1163

Fix memory callback coverage #1163

lacraig2 commented Jan 11, 2022 •

edited

Loading

pcworld commented Feb 8, 2022

	if ((s->prefix & PREFIX_LOCK) && parallel_cpus) {
	gen_helper_cmpxchg8b(cpu_env, cpu_A0);
	} else {
	gen_helper_cmpxchg8b_unlocked(cpu_env, cpu_A0);
	}

	void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
	{
	uintptr_t ra = GETPC();
	uint64_t oldv, cmpv, newv;
	int eflags;

	eflags = cpu_cc_compute_all(env, CC_OP);

	cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]);
	newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]);

	oldv = cpu_ldq_data_ra(env, a0, ra);
	newv = (cmpv == oldv ? newv : oldv);
	/* always do the store */
	cpu_stq_data_ra(env, a0, newv, ra);

	if (oldv == cmpv) {
	eflags \|= CC_Z;
	} else {
	env->regs[R_EAX] = (uint32_t)oldv;
	env->regs[R_EDX] = (uint32_t)(oldv >> 32);
	eflags &= ~CC_Z;
	}
	CC_SRC = eflags;
	}

	static void * const qemu_ld_helpers_normal[16] = {
	[MO_UB] = helper_ret_ldub_mmu,
	[MO_LEUW] = helper_le_lduw_mmu,
	[MO_LEUL] = helper_le_ldul_mmu,
	[MO_LEQ] = helper_le_ldq_mmu,
	[MO_BEUW] = helper_be_lduw_mmu,
	[MO_BEUL] = helper_be_ldul_mmu,
	[MO_BEQ] = helper_be_ldq_mmu,
	};
	static void * const qemu_ld_helpers_panda[16] = {
	[MO_UB] = helper_ret_ldub_mmu_panda,
	[MO_LEUW] = helper_le_lduw_mmu_panda,
	[MO_LEUL] = helper_le_ldul_mmu_panda,
	[MO_LEQ] = helper_le_ldq_mmu_panda,
	[MO_BEUW] = helper_be_lduw_mmu_panda,
	[MO_BEUL] = helper_be_ldul_mmu_panda,
	[MO_BEQ] = helper_be_ldq_mmu_panda,
	};
	#define qemu_ld_helpers \
	(panda_use_memcb ? qemu_ld_helpers_panda : qemu_ld_helpers_normal)

	static inline floatx80 helper_fldt(CPUX86State *env, target_ulong ptr,
	uintptr_t retaddr)
	{
	CPU_LDoubleU temp;

	temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
	temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
	return temp.d;
	}

	void helper_bndstx64(CPUX86State *env, target_ulong base, target_ulong ptr,
	uint64_t lb, uint64_t ub)
	{
	uintptr_t ra = GETPC();
	uint64_t bte;

	bte = lookup_bte64(env, base, ra);
	cpu_stq_data_ra(env, bte, lb, ra);
	cpu_stq_data_ra(env, bte + 8, ub, ra);
	cpu_stq_data_ra(env, bte + 16, ptr, ra);
	}

	uint64_t helper_bndldx64(CPUX86State *env, target_ulong base, target_ulong ptr)
	{
	uintptr_t ra = GETPC();
	uint64_t bte, lb, ub, pt;

	bte = lookup_bte64(env, base, ra);
	lb = cpu_ldq_data_ra(env, bte, ra);
	ub = cpu_ldq_data_ra(env, bte + 8, ra);
	pt = cpu_ldq_data_ra(env, bte + 16, ra);

	if (pt != ptr) {
	lb = ub = 0;
	}
	env->mmx_t0.MMX_Q(0) = ub;
	return lb;
	}

	void helper_lmw(CPUPPCState *env, target_ulong addr, uint32_t reg)
	{
	for (; reg < 32; reg++) {
	if (needs_byteswap(env)) {
	env->gpr[reg] = bswap32(cpu_ldl_data_ra(env, addr, GETPC()));
	} else {
	env->gpr[reg] = cpu_ldl_data_ra(env, addr, GETPC());
	}
	addr = addr_add(env, addr, 4);
	}
	}

	void helper_be_st_name(CPUArchState *env, target_ulong addr,
	DATA_TYPE val, TCGMemOpIdx oi,
	uintptr_t retaddr){
	if (likely(!panda_use_memcb)){
	glue(helper_be_st_name,_internal)(env, addr, val, oi, retaddr);
	return;
	}
	unsigned mmu_idx = get_mmuidx(oi);

Fix memory callback coverage #1163

Are you sure you want to change the base?

Fix memory callback coverage #1163

Conversation

lacraig2 commented Jan 11, 2022 • edited Loading

What this PR does:

The issue:

Advantages

Potential issues

Design decisions

Coverage decisions

Performance implications

pcworld commented Feb 8, 2022

lacraig2 commented Jan 11, 2022 •

edited

Loading