From 3d5b2643f0af742d9b90b4511d0ee137775c8526 Mon Sep 17 00:00:00 2001
From: mio
Date: Sat, 10 Jun 2023 14:04:56 +0200
Subject: [PATCH] Support demand paging via closures and seh

Reverts 12a79192ee6a4ec5b69130ff3481ac64d93a9b25 which exploits normal tcg mechanism

This uses a trampoline to pass extra data to seh handlers
---
 include/uc_priv.h              |   5 ++
 msvc/config-host.h             |   9 +--
 qemu/accel/tcg/translate-all.c | 132 ++++++++++++++++++++++++++++-----
 qemu/include/sysemu/os-win32.h |   1 +
 qemu/include/tcg/tcg.h         |   5 --
 qemu/tcg/tcg.c                 |  18 +-----
 6 files changed, 123 insertions(+), 47 deletions(-)

diff --git a/include/uc_priv.h b/include/uc_priv.h
index 5b2bfed0..968ab22f 100644
--- a/include/uc_priv.h
+++ b/include/uc_priv.h
@@ -399,6 +399,11 @@ struct uc_struct {
     struct TranslationBlock *last_tb; // The real last tb we executed.
 
     FlatView *empty_view; // Static function variable moved from flatviews_init
+
+#ifdef WIN32
+    PVOID seh_handle;
+    void* seh_closure;
+#endif
 };
 
 // Metadata stub for the variable-size cpu context used with uc_context_*()
diff --git a/msvc/config-host.h b/msvc/config-host.h
index a5e1e190..d07f74ac 100644
--- a/msvc/config-host.h
+++ b/msvc/config-host.h
@@ -6,11 +6,4 @@
 // #define CONFIG_INT128 1
 #define CONFIG_CMPXCHG128 1
 // #define CONFIG_ATOMIC64 1
-#define CONFIG_PLUGIN 1
-
-// QEMU by default allocates (and commits) 1GB memory on Windows, and multiple Unicorn instances will result in OOM error easily.
-// Unfortunately, Windows doesn't have a similar demand paging feature like mmap(), therefore a workaround is to use tcg regions mechanism.
-// Note most Unicorn hacks (and even QEMU!) relies on the assumption that the translation memory won't run out and thus it might result
-// in some unexpected errors. If that is case, define WIN32_QEMU_ALLOC_BUFFER to align with QEMU and Unicorn <= 2.0.1 behavior.
-//
-// #define WIN32_QEMU_ALLOC_BUFFER
\ No newline at end of file
+#define CONFIG_PLUGIN 1
\ No newline at end of file
diff --git a/qemu/accel/tcg/translate-all.c b/qemu/accel/tcg/translate-all.c
index ac71e391..ac7f084c 100644
--- a/qemu/accel/tcg/translate-all.c
+++ b/qemu/accel/tcg/translate-all.c
@@ -869,35 +869,133 @@ static inline void *alloc_code_gen_buffer(struct uc_struct *uc)
     return buf;
 }
 #elif defined(_WIN32)
-#ifdef WIN32_QEMU_ALLOC_BUFFER
-static inline void *alloc_code_gen_buffer(struct uc_struct *uc)
-{
-    TCGContext *tcg_ctx = uc->tcg_ctx;
-    size_t size = tcg_ctx->code_gen_buffer_size;
-    return VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT,
-                        PAGE_EXECUTE_READWRITE);
-}
+#define COMMIT_COUNT (1024) // Commit 4MB per exception
+#define CLOSURE_SIZE (4096)
+
+// Vectored exception handler that implements demand paging for the code
+// buffer: on an access violation inside [initial_buffer, initial_buffer +
+// initial_buffer_size) it commits the next chunk and resumes execution.
+// Any other exception is left to the next handler. uc is supplied by the
+// trampoline built in alloc_code_gen_buffer().
+#ifdef _WIN64
+static LONG code_gen_buffer_handler(PEXCEPTION_POINTERS ptr, struct uc_struct *uc)
 #else
+/*
+The first two DWORD or smaller arguments that are found in the argument list
+from left to right are passed in ECX and EDX registers; all other arguments
+are passed on the stack from right to left.
+*/
+static LONG __fastcall code_gen_buffer_handler(PEXCEPTION_POINTERS ptr, struct uc_struct* uc)
+#endif
+{
+    PEXCEPTION_RECORD record = ptr->ExceptionRecord;
+    if (record->ExceptionCode == EXCEPTION_ACCESS_VIOLATION) {
+        // ExceptionInformation[1] is the address whose access faulted
+        uint8_t* base = (uint8_t*)(record->ExceptionInformation[1]);
+        uint8_t* left = uc->tcg_ctx->initial_buffer;
+        uint8_t* right = left + uc->tcg_ctx->initial_buffer_size;
+        if (left && base >= left && base < right) {
+            // It's our region: commit up to COMMIT_COUNT pages from the fault
+            size_t remaining = (size_t)(right - base);
+            size_t size = COMMIT_COUNT * 4096;
+            if (size > remaining) {
+                // Almost out of reserved memory: commit only what is left
+                size = remaining;
+            }
+            if (VirtualAlloc(base, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE)) {
+                return EXCEPTION_CONTINUE_EXECUTION;
+            } else {
+                return EXCEPTION_CONTINUE_SEARCH;
+            }
+        }
+    }
+    return EXCEPTION_CONTINUE_SEARCH;
+}
+
+// Unregister the exception handler and release its trampoline, if any.
+static inline void may_remove_handler(struct uc_struct *uc) {
+    if (uc->seh_closure) {
+        if (uc->seh_handle) {
+            // Must pair with AddVectoredExceptionHandler below
+            RemoveVectoredExceptionHandler(uc->seh_handle);
+            uc->seh_handle = NULL;
+        }
+        VirtualFree(uc->seh_closure, 0, MEM_RELEASE);
+        // Clear it so a repeated call cannot free the trampoline twice
+        uc->seh_closure = NULL;
+    }
+}
+
 static inline void *alloc_code_gen_buffer(struct uc_struct *uc)
 {
     TCGContext *tcg_ctx = uc->tcg_ctx;
     size_t size = tcg_ctx->code_gen_buffer_size;
+    uint8_t *closure, *data;
+    void* handler = code_gen_buffer_handler;
+    void *buf;
 
-    void* ptr = VirtualAlloc(NULL, size, MEM_RESERVE,
-                        PAGE_EXECUTE_READWRITE);
+    may_remove_handler(uc);
 
-    // for prolog init
-    VirtualAlloc(ptr,
-                uc->qemu_real_host_page_size * UC_TCG_REGION_PAGES_COUNT,
-                MEM_COMMIT,
-                PAGE_EXECUTE_READWRITE);
-    return ptr;
-}
+    // Naive trampoline implementation
+    closure = VirtualAlloc(NULL, CLOSURE_SIZE, MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+    if (!closure) {
+        return NULL;
+    }
+    uc->seh_closure = closure;
+    data = closure + CLOSURE_SIZE /2;
+
+#ifdef _WIN64
+    closure[0] = 0x48; // REX.w
+    closure[1] = 0xb8; // mov rax
+    memcpy(closure + 2, &data, 8); // mov rax, &data
+    // ; rax = &data
+    // mov [rax], rdx ; save rdx
+    // mov rdx, [rax+0x8] ; move uc pointer to 2nd arg
+    // jmp [rax + 0x10] ; go to handler
+    // mov rdx, [rax] ; restore rdx
+    const char tramp[] = "\x48\x89\x10\x48\x8b\x50\x08\xff\x60\x10\x48\x8b\x10";
+    memcpy(closure + 2 + 8, (void*)tramp, sizeof(tramp));
+    memcpy(data + 0x8, (void*)&uc, 8);
+    memcpy(data + 0x10, (void*)&handler, 8);
+#else
+    closure[0] = 0xb8; // mov eax
+    memcpy(closure + 1, &data, 4); // mov eax, &data
+    // ; eax = &data
+    // mov [eax], edx; save edx
+    // mov [eax+0x4], ecx; save ecx
+    // mov ecx, [esp+4]; get ptr to exception because of cdecl
+    // mov edx, [eax+0x8]; get ptr to uc
+    // jmp [eax + 0xC]; get ptr to our handler, it's fastcall so we don't clean stack
+    // mov edx, [eax] ; restore edx
+    // mov ecx, [eax+4] ; restore ecx
+    // NOTE(review): vectored handlers are declared NTAPI (stdcall on x86), so
+    // the callee should pop the one stack argument, but a two-argument
+    // __fastcall function pops nothing - TODO confirm the stack is balanced.
+    const char tramp[] = "\x89\x10\x89\x48\x04\x8b\x4c\x24\x04\x8b\x50\x08\xff\x60\x0c\x8b\x10\x8b\x48\x04";
+    memcpy(closure + 1 + 4, (void*)tramp, sizeof(tramp));
+    memcpy(data + 0x8, (void*)&uc, 4);
+    memcpy(data + 0xC, (void*)&handler, 4);
 #endif
+
+    uc->seh_handle = AddVectoredExceptionHandler(0, (PVECTORED_EXCEPTION_HANDLER)closure);
+    if (!uc->seh_handle) {
+        VirtualFree(uc->seh_closure, 0, MEM_RELEASE);
+        uc->seh_closure = NULL;
+        return NULL;
+    }
+
+    buf = VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+    if (!buf) {
+        // No buffer to page in: don't leave a handler that points at uc
+        may_remove_handler(uc);
+    }
+    return buf;
+}
 void free_code_gen_buffer(struct uc_struct *uc)
 {
     TCGContext *tcg_ctx = uc->tcg_ctx;
     if (tcg_ctx->initial_buffer) {
+        may_remove_handler(uc);
         VirtualFree(tcg_ctx->initial_buffer, 0, MEM_RELEASE);
     }
 }
diff --git a/qemu/include/sysemu/os-win32.h b/qemu/include/sysemu/os-win32.h
index 44e37d7e..d77d0fba 100644
--- a/qemu/include/sysemu/os-win32.h
+++ b/qemu/include/sysemu/os-win32.h
@@ -28,6 +28,7 @@
 
 #include
 #include
+#include // For vectorized handler
 #include
 
 #if defined(_WIN64)
diff --git a/qemu/include/tcg/tcg.h b/qemu/include/tcg/tcg.h
index 1134c75c..f3643fe3 100644
--- a/qemu/include/tcg/tcg.h
+++ b/qemu/include/tcg/tcg.h
@@ -35,11 +35,6 @@
 #include "tcg-apple-jit.h"
 #include "qemu/int128.h"
 
-// Unicorn: Default region size for win32
-#if defined(_WIN32) && !defined(WIN32_QEMU_ALLOC_BUFFER)
-#define UC_TCG_REGION_PAGES_COUNT (128) // Note less pages may cause unexpected and subtle errors.
-#endif
-
 /* XXX: make safe guess about sizes */
 #define MAX_OP_PER_INSTR 266
 
diff --git a/qemu/tcg/tcg.c b/qemu/tcg/tcg.c
index cb76f31a..dcacec7c 100644
--- a/qemu/tcg/tcg.c
+++ b/qemu/tcg/tcg.c
@@ -23,8 +23,6 @@
  */
 
 /* define it to use liveness analysis (better code) */
-#include "tcg/tcg.h"
-#include
 #define USE_TCG_OPTIMIZATIONS
 
 #include "qemu/osdep.h"
@@ -408,13 +406,7 @@ static void tcg_region_assign(TCGContext *s, size_t curr_region)
     s->code_gen_buffer = start;
     s->code_gen_ptr = start;
     s->code_gen_buffer_size = (char *)end - (char *)start;
-#if defined(WIN32) && !defined(WIN32_QEMU_ALLOC_BUFFER)
-    VirtualAlloc(
-        s->code_gen_buffer,
-        ROUND_UP(s->code_gen_buffer_size, s->uc->qemu_real_host_page_size),
-        MEM_COMMIT,
-        PAGE_EXECUTE_READWRITE);
-#endif
+    memset(s->code_gen_buffer, 0x00, s->code_gen_buffer_size);
     s->code_gen_highwater = (char *)end - TCG_HIGHWATER;
 }
 
@@ -509,11 +501,7 @@ void tcg_region_init(TCGContext *tcg_ctx)
     size_t n_regions;
     size_t i;
 
-#if defined(WIN32) && !defined(WIN32_QEMU_ALLOC_BUFFER)
-    n_regions = size / (tcg_ctx->uc->qemu_real_host_page_size * UC_TCG_REGION_PAGES_COUNT);
-#else
     n_regions = 1;
-#endif
 
     /* The first region will be 'aligned - buf' bytes larger than the others */
     aligned = (void *)QEMU_ALIGN_PTR_UP(buf, page_size);
@@ -551,10 +539,6 @@ void tcg_region_init(TCGContext *tcg_ctx)
 
     tcg_ctx->tree = g_tree_new(tb_tc_cmp);
 
-#if defined(WIN32) && !defined(WIN32_QEMU_ALLOC_BUFFER)
-    // Allocate a region immediately, or the highwater is not set correctly.
-    tcg_region_alloc(tcg_ctx);
-#endif
 }
 
 /*