diff --git a/qemu/include/exec/gen-icount.h b/qemu/include/exec/gen-icount.h index 1a579c05..d5e22dff 100644 --- a/qemu/include/exec/gen-icount.h +++ b/qemu/include/exec/gen-icount.h @@ -41,14 +41,25 @@ static inline void gen_tb_start(TCGContext *tcg_ctx, TranslationBlock *tb) tcg_gen_ld_i32(tcg_ctx, count, tcg_ctx->cpu_env, offsetof(ArchCPU, neg.icount_decr.u32) - offsetof(ArchCPU, env)); - + // Unicorn: + // We CANT'T use brcondi_i32 here or we will fail liveness analysis + // because it marks the end of BB + if (tcg_ctx->delay_slot_flag != NULL) { + TCGv_i32 tmp = tcg_const_i32(tcg_ctx, 0); + // dest = (c1 cond c2 ? v1 : v2) + tcg_gen_movcond_i32(tcg_ctx, TCG_COND_GT, count, tcg_ctx->delay_slot_flag, tmp, tcg_ctx->delay_slot_flag, count); + tcg_temp_free_i32(tcg_ctx, tmp); + } tcg_gen_brcondi_i32(tcg_ctx, TCG_COND_LT, count, 0, tcg_ctx->exitreq_label); - tcg_temp_free_i32(tcg_ctx, count); } static inline void gen_tb_end(TCGContext *tcg_ctx, TranslationBlock *tb, int num_insns) { + if (tcg_ctx->delay_slot_flag != NULL){ + tcg_temp_free_i32(tcg_ctx, tcg_ctx->delay_slot_flag); + } + tcg_ctx->delay_slot_flag = NULL; if (tb_cflags(tb) & CF_USE_ICOUNT) { /* Update the num_insn immediate parameter now that we know * the actual insn count. */ diff --git a/qemu/include/tcg/tcg.h b/qemu/include/tcg/tcg.h index c1d16b0b..bf0ea3da 100644 --- a/qemu/include/tcg/tcg.h +++ b/qemu/include/tcg/tcg.h @@ -664,6 +664,7 @@ struct TCGContext { struct TCGLabelPoolData *pool_labels; #endif + TCGv_i32 delay_slot_flag; TCGLabel *exitreq_label; TCGTempSet free_temps[TCG_TYPE_COUNT * 2]; diff --git a/qemu/target/mips/translate.c b/qemu/target/mips/translate.c index 4d0b3eb0..3fab57b2 100644 --- a/qemu/target/mips/translate.c +++ b/qemu/target/mips/translate.c @@ -6006,6 +6006,10 @@ static inline void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest) { TCGContext *tcg_ctx = ctx->uc->tcg_ctx; if (use_goto_tb(ctx, dest)) { + // Unicorn: Force save pc for delay slot instructions, this should be + // harmless either goto_tb is taken or not but will give correct + // pc after execution stops within the delay slot. + gen_save_pc(tcg_ctx, dest); tcg_gen_goto_tb(tcg_ctx, n); gen_save_pc(tcg_ctx, dest); tcg_gen_exit_tb(tcg_ctx, ctx->base.tb, n); @@ -30888,6 +30892,9 @@ static void mips_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs) static void mips_tr_tb_start(DisasContextBase *dcbase, CPUState *cs) { + DisasContext *ctx = container_of(dcbase, DisasContext, base); + TCGContext *tcg_ctx = ctx->uc->tcg_ctx; + tcg_ctx->delay_slot_flag = tcg_temp_local_new_i32(tcg_ctx); } static void mips_tr_insn_start(DisasContextBase *dcbase, CPUState *cs) @@ -30897,6 +30904,7 @@ static void mips_tr_insn_start(DisasContextBase *dcbase, CPUState *cs) tcg_gen_insn_start(tcg_ctx, ctx->base.pc_next, ctx->hflags & MIPS_HFLAG_BMASK, ctx->btarget); + tcg_gen_movi_i32(tcg_ctx, tcg_ctx->delay_slot_flag, 0); } static bool mips_tr_breakpoint_check(DisasContextBase *dcbase, CPUState *cs, @@ -30928,6 +30936,7 @@ static void mips_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs) int insn_bytes; int is_slot; bool hook_insn = false; + TCGv_i32 dyn_is_slot = NULL; is_slot = ctx->hflags & MIPS_HFLAG_BMASK; @@ -30939,6 +30948,10 @@ static void mips_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs) return; } + dyn_is_slot = tcg_const_i32(tcg_ctx, 0); + slot_op = tcg_last_op(tcg_ctx); + tcg_gen_mov_i32(tcg_ctx, tcg_ctx->delay_slot_flag, dyn_is_slot); + // Unicorn: trace this instruction on request if (HOOK_EXISTS_BOUNDED(uc, UC_HOOK_CODE, ctx->base.pc_next)) { @@ -30949,17 +30962,7 @@ static void mips_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs) prev_op = tcg_last_op(tcg_ctx); hook_insn = true; gen_uc_tracecode(tcg_ctx, 4, UC_HOOK_CODE_IDX, uc, ctx->base.pc_next); - - // TODO: Memory hooks, maybe use icount_decr.low? - TCGLabel *skip_label = gen_new_label(tcg_ctx); - TCGv_i32 dyn_is_slot = tcg_const_i32(tcg_ctx, 0); - slot_op = tcg_last_op(tcg_ctx); - // if slot, skip exit_request - // tcg is smart enough to optimize this branch away - tcg_gen_brcondi_i32(tcg_ctx, TCG_COND_GT, dyn_is_slot, 0, skip_label); - tcg_temp_free_i32(tcg_ctx, dyn_is_slot); check_exit_request(tcg_ctx); - gen_set_label(tcg_ctx, skip_label); } if (ctx->insn_flags & ISA_NANOMIPS32) { @@ -30978,6 +30981,7 @@ static void mips_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs) } else { generate_exception_end(ctx, EXCP_RI); g_assert(ctx->base.is_jmp == DISAS_NORETURN); + slot_op->args[1] = is_slot; return; } @@ -31072,6 +31076,10 @@ static void mips_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs) g_assert_not_reached(); } } + if (tcg_ctx->delay_slot_flag) { + tcg_temp_free_i32(tcg_ctx, tcg_ctx->delay_slot_flag); + } + tcg_ctx->delay_slot_flag = NULL; } static void mips_sync_pc(DisasContextBase *db, CPUState *cpu) diff --git a/qemu/tcg/tcg-op.c b/qemu/tcg/tcg-op.c index 86902f03..8a5865df 100644 --- a/qemu/tcg/tcg-op.c +++ b/qemu/tcg/tcg-op.c @@ -2866,9 +2866,17 @@ void check_exit_request(TCGContext *tcg_ctx) tcg_gen_ld_i32(tcg_ctx, count, tcg_ctx->cpu_env, offsetof(ArchCPU, neg.icount_decr.u32) - offsetof(ArchCPU, env)); - + // Unicorn: + // We CANT'T use brcondi_i32 here or we will fail liveness analysis + // because it marks the end of BB + if (tcg_ctx->delay_slot_flag != NULL) { + TCGv_i32 tmp = tcg_const_i32(tcg_ctx, 0); + // dest = (c1 cond c2 ? v1 : v2) + tcg_gen_movcond_i32(tcg_ctx, TCG_COND_GT, count, tcg_ctx->delay_slot_flag, tmp, tcg_ctx->delay_slot_flag, count); + tcg_temp_free_i32(tcg_ctx, tmp); + } + tcg_gen_brcondi_i32(tcg_ctx, TCG_COND_LT, count, 0, tcg_ctx->exitreq_label); - tcg_temp_free_i32(tcg_ctx, count); } diff --git a/tests/regress/tcg_liveness_analysis_bug_issue-287.py b/tests/regress/tcg_liveness_analysis_bug_issue-287.py index d053803c..83f4cc5f 100755 --- a/tests/regress/tcg_liveness_analysis_bug_issue-287.py +++ b/tests/regress/tcg_liveness_analysis_bug_issue-287.py @@ -2,89 +2,88 @@ from unicorn import * from unicorn.arm_const import * -# issue #287 -# Initial Register States: R0=3, R1=24, R2=16, R3=0 -# -# ----- code start ----- -# CMP R0,R1,LSR#3 -# SUBCS R0,R0,R1,LSR#3 # CPU flags got changed in these two instructions, and *REMEMBERED*, now NF == VF == 0 -# CMP R0,#1 # CPU flags changed again, now NF == 1, VF == 0, but they are not properly *REMEMBERED* -# MOV R1,R1,LSR#4 -# SUBGES R2,R2,#4 # according to the result of CMP, we should skip this op -# -# MOVGE R3,#100 # since changed flags are not *REMEMBERED* in CMP, now NF == VF == 0, which result in wrong branch -# # at the end of this code block, should R3 == 0 -# ----- code end ------ -# -# TCG ops are correct, plain op translation is done correctly, -# but there're In-Memory bits invisible from ops that control the host code generation. -# all these codes are in one TCG translation-block, so wrong things could happen. -# detail explanation is given on the right side. -# remember, both set_label and brcond are point to refresh the dead_temps and mem_temps states in TCG -# -# ----- TCG ops ------ -# ld_i32 tmp5,env,$0xfffffffffffffff4 -# movi_i32 tmp6,$0x0 -# brcond_i32 tmp5,tmp6,ne,$0x0 -# mov_i32 tmp5,r1 ------------------------- -# movi_i32 tmp6,$0x3 | -# shr_i32 tmp5,r1,tmp6 | -# mov_i32 tmp6,r0 | -# sub_i32 NF,r0,tmp5 | -# mov_i32 ZF,NF | -# setcond_i32 CF,r0,tmp5,geu | # This part is "CMP R0,R1,LSR#3" -# xor_i32 VF,NF,r0 |-----> # and "SUBCS R0,R0,R1,LSR#3" -# xor_i32 tmp7,r0,tmp5 | # the last op in this block, set_label get a chance to refresh the TCG globals memory states, -# and_i32 VF,VF,tmp7 | # so things get back to normal states -# mov_i32 tmp6,NF | # these codes are not affected by the bug. Let's called this Part-D -# movi_i32 tmp5,$0x0 | -# brcond_i32 CF,tmp5,eq,$0x1 | -# mov_i32 tmp5,r1 | -# movi_i32 tmp6,$0x3 | -# shr_i32 tmp5,r1,tmp6 | -# mov_i32 tmp6,r0 | -# sub_i32 tmp6,r0,tmp5 | -# mov_i32 r0,tmp6 | -# set_label $0x1 ------------------------- -# movi_i32 tmp5,$0x1 ----------------- # Let's called this Part-C -# mov_i32 tmp6,r0 | # NF is used as output operand again! -# sub_i32 NF,r0,tmp5 ----------------|-----> # but it is stated as Not-In-Memory, -# mov_i32 ZF,NF | # no need to sync it after calculation. -# setcond_i32 CF,r0,tmp5,geu | # the generated host code does not write NF -# xor_i32 VF,NF,r0 | # back to its memory location, hence forgot. And the CPU flags after this calculation is not changed. -# xor_i32 tmp7,r0,tmp5 | # Caution: the following SUBGES's condition check is right, even though the generated host code does not *REMEMBER* NF, it will cache the calculated result and serve SUBGES correctly -# and_i32 VF,VF,tmp7 | -# mov_i32 tmp6,NF | -# mov_i32 tmp5,r1 | # this part is "CMP R0,#1" -# movi_i32 tmp6,$0x4 | # and "MOV R1,R1,LSR#4" -# shr_i32 tmp5,r1,tmp6 | # and "SUBGES R2,R2,#4" -# mov_i32 r1,tmp5 |-----> # This is the part where problem start to arise -# xor_i32 tmp5,VF,NF | -# movi_i32 tmp6,$0x0 | -# brcond_i32 tmp5,tmp6,lt,$0x2 --------|-----> # QEMU will refresh the InMemory bit for TCG globals here, but Unicorn won't -# movi_i32 tmp5,$0x4 | -# mov_i32 tmp6,r2 | # this is the 1st bug-related op get analyzed. -# sub_i32 NF,r2,tmp5 ----------------|-----> # here, NF is an output operand, it's flagged dead -# mov_i32 ZF,NF | # and the InMemory bit is clear, tell the previous(above) ops -# setcond_i32 CF,r2,tmp5,geu | # if it is used as output operand again, do not sync it -# xor_i32 VF,NF,r2 | # so the generated host-code for previous ops will not write it back to Memory -# xor_i32 tmp7,r2,tmp5 | # Caution: the CPU flags after this calculation is also right, because the set_label is a point of refresh, make them *REMEMBERED* -# and_i32 VF,VF,tmp7 | # Let's call this Part-B -# mov_i32 tmp6,NF | -# mov_i32 r2,ZF | -# set_label $0x2 ----------------- -# xor_i32 tmp5,VF,NF ----------------- -# movi_i32 tmp6,$0x0 | -# brcond_i32 tmp5,tmp6,lt,$0x3 | # Let's call this Part-A -# movi_i32 tmp5,$0x64 | # if Part-B is not skipped, this part won't go wrong, because we'll check the CPU flags as the result of Part-B, it's *REMEMBERED* -# movi_i32 r3,$0x64 |-----> # but if Part-B is skipped, -# set_label $0x3 | # what should we expected? we will check the condition based on the result of Part-D!!! -# call wfi,$0x0,$0,env | # because result of Part-C is lost. this is why things go wrong. -# set_label $0x0 | -# exit_tb $0x7f6401714013 ----------------- -# ########### -# ----- TCG ends ------ +''' + issue #287 + Initial Register States: R0=3, R1=24, R2=16, R3=0 + ----- code start ----- + CMP R0,R1,LSR#3 + SUBCS R0,R0,R1,LSR#3 # CPU flags got changed in these two instructions, and *REMEMBERED*, now NF == VF == 0 + CMP R0,#1 # CPU flags changed again, now NF == 1, VF == 0, but they are not properly *REMEMBERED* + MOV R1,R1,LSR#4 + SUBGES R2,R2,#4 # according to the result of CMP, we should skip this op + + MOVGE R3,#100 # since changed flags are not *REMEMBERED* in CMP, now NF == VF == 0, which result in wrong branch + # at the end of this code block, should R3 == 0 + ----- code end ------ + # TCG ops are correct, plain op translation is done correctly, + # but there're In-Memory bits invisible from ops that control the host code generation. + # all these codes are in one TCG translation-block, so wrong things could happen. + # detail explanation is given on the right side. + # remember, both set_label and brcond are point to refresh the dead_temps and mem_temps states in TCG + ----- TCG ops ------ + ld_i32 tmp5,env,$0xfffffffffffffff4 + movi_i32 tmp6,$0x0 + brcond_i32 tmp5,tmp6,ne,$0x0 + mov_i32 tmp5,r1 ------------------------- + movi_i32 tmp6,$0x3 | + shr_i32 tmp5,r1,tmp6 | + mov_i32 tmp6,r0 | + sub_i32 NF,r0,tmp5 | + mov_i32 ZF,NF | + setcond_i32 CF,r0,tmp5,geu | # This part is "CMP R0,R1,LSR#3" + xor_i32 VF,NF,r0 |-----> # and "SUBCS R0,R0,R1,LSR#3" + xor_i32 tmp7,r0,tmp5 | # the last op in this block, set_label get a chance to refresh the TCG globals memory states, + and_i32 VF,VF,tmp7 | # so things get back to normal states + mov_i32 tmp6,NF | # these codes are not affected by the bug. Let's called this Part-D + movi_i32 tmp5,$0x0 | + brcond_i32 CF,tmp5,eq,$0x1 | + mov_i32 tmp5,r1 | + movi_i32 tmp6,$0x3 | + shr_i32 tmp5,r1,tmp6 | + mov_i32 tmp6,r0 | + sub_i32 tmp6,r0,tmp5 | + mov_i32 r0,tmp6 | + set_label $0x1 ------------------------- + movi_i32 tmp5,$0x1 ----------------- # Let's called this Part-C + mov_i32 tmp6,r0 | # NF is used as output operand again! + sub_i32 NF,r0,tmp5 ----------------|-----> # but it is stated as Not-In-Memory, + mov_i32 ZF,NF | # no need to sync it after calculation. + setcond_i32 CF,r0,tmp5,geu | # the generated host code does not write NF + xor_i32 VF,NF,r0 | # back to its memory location, hence forgot. And the CPU flags after this calculation is not changed. + xor_i32 tmp7,r0,tmp5 | # Caution: the following SUBGES's condition check is right, even though the generated host code does not *REMEMBER* NF, it will cache the calculated result and serve SUBGES correctly + and_i32 VF,VF,tmp7 | + mov_i32 tmp6,NF | + mov_i32 tmp5,r1 | # this part is "CMP R0,#1" + movi_i32 tmp6,$0x4 | # and "MOV R1,R1,LSR#4" + shr_i32 tmp5,r1,tmp6 | # and "SUBGES R2,R2,#4" + mov_i32 r1,tmp5 |-----> # This is the part where problem start to arise + xor_i32 tmp5,VF,NF | + movi_i32 tmp6,$0x0 | + brcond_i32 tmp5,tmp6,lt,$0x2 --------|-----> # QEMU will refresh the InMemory bit for TCG globals here, but Unicorn won't + movi_i32 tmp5,$0x4 | + mov_i32 tmp6,r2 | # this is the 1st bug-related op get analyzed. + sub_i32 NF,r2,tmp5 ----------------|-----> # here, NF is an output operand, it's flagged dead + mov_i32 ZF,NF | # and the InMemory bit is clear, tell the previous(above) ops + setcond_i32 CF,r2,tmp5,geu | # if it is used as output operand again, do not sync it + xor_i32 VF,NF,r2 | # so the generated host-code for previous ops will not write it back to Memory + xor_i32 tmp7,r2,tmp5 | # Caution: the CPU flags after this calculation is also right, because the set_label is a point of refresh, make them *REMEMBERED* + and_i32 VF,VF,tmp7 | # Let's call this Part-B + mov_i32 tmp6,NF | + mov_i32 r2,ZF | + set_label $0x2 ----------------- + xor_i32 tmp5,VF,NF ----------------- + movi_i32 tmp6,$0x0 | + brcond_i32 tmp5,tmp6,lt,$0x3 | # Let's call this Part-A + movi_i32 tmp5,$0x64 | # if Part-B is not skipped, this part won't go wrong, because we'll check the CPU flags as the result of Part-B, it's *REMEMBERED* + movi_i32 r3,$0x64 |-----> # but if Part-B is skipped, + set_label $0x3 | # what should we expected? we will check the condition based on the result of Part-D!!! + call wfi,$0x0,$0,env | # because result of Part-C is lost. this is why things go wrong. + set_label $0x0 | + exit_tb $0x7f6401714013 ----------------- + ########### + ----- TCG ends ------ +''' CODE = ( b'\xa1\x01\x50\xe1' # cmp r0, r1, lsr #3 diff --git a/tests/unit/test_mips.c b/tests/unit/test_mips.c index d1f98d3a..e0786cc7 100644 --- a/tests/unit/test_mips.c +++ b/tests/unit/test_mips.c @@ -101,22 +101,29 @@ static void test_mips_stop_delay_slot_from_qiling(void) { uc_engine *uc; // 24 06 00 03 addiu $a2, $zero, 3 - // 10 a6 00 79 beq $a1, $a2, 0x47c8da4 + // 10 a6 00 79 beq $a1, $a2, 0x1e8 // 30 42 00 fc andi $v0, $v0, 0xfc + // 10 40 00 32 beqz $v0, 0x47c8c90 + // 24 ab ff da addiu $t3, $a1, -0x26 + // 2d 62 00 02 sltiu $v0, $t3, 2 + // 10 40 00 32 beqz $v0, 0x47c8c9c + // 00 00 00 00 nop char code[] = - "\x24\x06\x00\x03\x10\xa6\x00\x79\x30\x42\x00\xfc"; + "\x24\x06\x00\x03\x10\xa6\x00\x79\x30\x42\x00\xfc\x10\x40\x00\x32\x24\xab\xff\xda\x2d\x62\x00\x02\x10\x40\x00\x32\x00\x00\x00\x00"; uint32_t r_pc = 0x0; - uint32_t r_a2 = 1; - + uint32_t r_v0 = 0xff; + uint32_t r_a1 = 0x3; + uc_common_setup(&uc, UC_ARCH_MIPS, UC_MODE_MIPS32 | UC_MODE_BIG_ENDIAN, code, sizeof(code) - 1); - - OK(uc_reg_write(uc, UC_MIPS_REG_A2, &r_a2)); - - OK(uc_emu_start(uc, code_start, code_start + sizeof(code) - 1, 0, 2)); + OK(uc_reg_write(uc, UC_MIPS_REG_V0, &r_v0)); + OK(uc_reg_write(uc, UC_MIPS_REG_A1, &r_a1)); + OK(uc_emu_start(uc, code_start, code_start + sizeof(code) + 16, 0, 2)); OK(uc_reg_read(uc, UC_MIPS_REG_PC, &r_pc)); - TEST_CHECK(r_pc == code_start + 12); + OK(uc_reg_read(uc, UC_MIPS_REG_V0, &r_v0)); + TEST_CHECK(r_pc == code_start + 4 + 0x1e8); + TEST_CHECK(r_v0 == 0xfc); OK(uc_close(uc)); }