notdirty_write: fix store-related performance problems

Every store would always cause the tb_invalidate_phys_page_fast path to be invoked,
amounting to a 40x slowdown of stores compared to loads.

Change this code to only worry about TB invalidation for regions marked as
executable (i.e. emulated executable).

Even without uc_set_native_thunks, this change fixes most of the performance
issues seen with thunking to native calls.

Signed-off-by: Andrei Warkentin <andrei.warkentin@intel.com>
This commit is contained in:
Andrei Warkentin
2022-12-13 00:13:58 -06:00
committed by mio
parent 9f21566b53
commit d01035767e
20 changed files with 76 additions and 23 deletions

View File

@@ -661,6 +661,25 @@ static void tlb_reset_dirty_range_locked(struct uc_struct *uc, CPUTLBEntry *tlb_
}
}
/*
 * Mark one TLB entry as not-dirty if its write address falls inside the
 * guest-virtual range [start, start + length).
 *
 * Setting TLB_NOTDIRTY on addr_write forces the next store through this
 * entry onto the slow path (notdirty_write), where TB invalidation and
 * dirty tracking are handled.
 *
 * Entries that are already invalid, MMIO, discard-write, or not-dirty are
 * left untouched.
 *
 * The "_locked" suffix indicates the caller holds the TLB lock
 * (NOTE(review): the only visible caller, tlb_reset_dirty_by_vaddr, does
 * not visibly take it — confirm locking at the call sites).
 * The uc parameter is currently unused here.
 */
static void tlb_reset_dirty_range_by_vaddr_locked(struct uc_struct *uc, CPUTLBEntry *tlb_entry,
target_ulong start, target_ulong length)
{
uintptr_t addr = tlb_entry->addr_write;
if ((addr & (TLB_INVALID_MASK | TLB_MMIO |
TLB_DISCARD_WRITE | TLB_NOTDIRTY)) == 0) {
/* Reduce to the page base so the range test is page-granular. */
addr &= TARGET_PAGE_MASK;
/* Unsigned wraparound turns the range test into one comparison. */
if ((addr - start) < length) {
#if TCG_OVERSIZED_GUEST
tlb_entry->addr_write |= TLB_NOTDIRTY;
#else
/*
 * NOTE(review): both branches are plain read-modify-write here;
 * upstream QEMU uses an atomic store for the non-oversized case —
 * confirm this is intentional for single-threaded Unicorn use.
 */
tlb_entry->addr_write = tlb_entry->addr_write | TLB_NOTDIRTY;
#endif
}
}
}
/*
* Called with tlb_c.lock held.
* Called only from the vCPU context, i.e. the TLB's owner thread.
@@ -699,6 +718,30 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
}
}
/*
 * Walk every TLB entry of every MMU mode for this vCPU and flag any entry
 * whose write address lies in the guest-virtual range
 * [start1, start1 + length) as TLB_NOTDIRTY, so subsequent stores take the
 * slow path. Both the main table and the victim TLB are scanned.
 *
 * NOTE(review): no TLB lock is taken here — presumably safe because
 * Unicorn runs the vCPU single-threaded; confirm against callers.
 */
void tlb_reset_dirty_by_vaddr(CPUState *cpu, target_ulong start1, target_ulong length)
{
    struct uc_struct *uc = cpu->uc;
    CPUArchState *env = cpu->env_ptr;
    int mmu_idx;

    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
        unsigned int idx;
        unsigned int entries = tlb_n_entries(&env_tlb(env)->f[mmu_idx]);

        /* Main TLB table for this MMU mode. */
        for (idx = 0; idx < entries; idx++) {
            tlb_reset_dirty_range_by_vaddr_locked(uc, &env_tlb(env)->f[mmu_idx].table[idx],
                                                  start1, length);
        }

        /* Victim TLB (fixed size) for the same mode. */
        for (idx = 0; idx < CPU_VTLB_SIZE; idx++) {
            tlb_reset_dirty_range_by_vaddr_locked(uc, &env_tlb(env)->d[mmu_idx].vtable[idx],
                                                  start1, length);
        }
    }
}
/* Called with tlb_c.lock held */
static inline void tlb_set_dirty1_locked(CPUTLBEntry *tlb_entry,
target_ulong vaddr)
@@ -1144,30 +1187,24 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
}
/*
 * notdirty_write(): slow-path handler invoked for stores that hit a TLB
 * entry flagged TLB_NOTDIRTY.
 *
 * NOTE(review): this span is a diff rendering with the +/- markers
 * stripped — pre-change and post-change lines appear back-to-back below.
 * Annotations mark which lines belong to which version.
 */
static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
/* OLD signature tail (removed by this commit): */
CPUIOTLBEntry *iotlbentry, uintptr_t retaddr)
/* NEW signature tail (added): caller may pass the MemoryRegion it already
 * resolved, or NULL to have it looked up here. */
CPUIOTLBEntry *iotlbentry, uintptr_t retaddr,
MemoryRegion *mr)
{
ram_addr_t ram_addr = mem_vaddr + iotlbentry->addr;
// trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
/* NEW: lazily resolve the region only when the caller did not supply it. */
if (mr == NULL) {
mr = memory_mapping(cpu->uc, mem_vaddr);
}
/* OLD condition (removed): checked the CODE dirty flag, so in practice the
 * TB-invalidation path ran on essentially every store. */
if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
/* NEW condition (added): only pages mapped executable can hold translated
 * code, so only they need TB invalidation — the commit's 40x store fix. */
if ((mr->perms & UC_PROT_EXEC) != 0) {
struct page_collection *pages
= page_collection_lock(cpu->uc, ram_addr, ram_addr + size);
tb_invalidate_phys_page_fast(cpu->uc, pages, ram_addr, size, retaddr);
page_collection_unlock(pages);
}
/*
 * Set both VGA and migration bits for simplicity and to remove
 * the notdirty callback faster.
 */
cpu_physical_memory_set_dirty_range(ram_addr, size, DIRTY_CLIENTS_NOCODE);
/* OLD tail (removed): only cleared TLB_NOTDIRTY once the page was fully
 * clean of code. */
/* We remove the notdirty callback only if the code has been flushed. */
if (!cpu_physical_memory_is_clean(ram_addr)) {
// trace_memory_notdirty_set_dirty(mem_vaddr);
tlb_set_dirty(cpu, mem_vaddr);
}
/* NEW tail (added): unconditionally re-arm the fast path; exec pages get
 * TLB_NOTDIRTY re-set in tb_gen_code. */
/* For exec pages, this is cleared in tb_gen_code. */
tlb_set_dirty(cpu, mem_vaddr);
}
/*
@@ -1244,7 +1281,7 @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
/* Handle clean RAM pages. */
if (tlb_addr & TLB_NOTDIRTY) {
notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr, NULL);
}
}
@@ -1370,7 +1407,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
if (unlikely(tlb_addr & TLB_NOTDIRTY)) {
notdirty_write(env_cpu(env), addr, 1 << s_bits,
&env_tlb(env)->d[mmu_idx].iotlb[index], retaddr);
&env_tlb(env)->d[mmu_idx].iotlb[index], retaddr, NULL);
}
return hostaddr;
@@ -2216,7 +2253,7 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
/* Handle clean RAM pages. */
if (tlb_addr & TLB_NOTDIRTY) {
notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr, mr);
}
haddr = (void *)((uintptr_t)addr + entry->addend);