diff --git a/nemu/include/common.h b/nemu/include/common.h
index b6e40c4b650f42fceecb8aa0decadcb4b052155e..d7658173d93d3e3c845fe43e6fbd7ad7608aa846 100644
--- a/nemu/include/common.h
+++ b/nemu/include/common.h
@@ -37,4 +37,6 @@ inline std::string num2hex(uint32_t n) {
     return ss.str();
 }
 
+#include <rlib/stdio.hpp>
+
 #endif
diff --git a/nemu/include/cpu/decode.h b/nemu/include/cpu/decode.h
index e8c61911585dcfa9bc3559f750a01f1f8fc140cf..df6a347449c99e58c281cf196487cd8a0b3b1dac 100644
--- a/nemu/include/cpu/decode.h
+++ b/nemu/include/cpu/decode.h
@@ -22,6 +22,18 @@ typedef struct {
   char str[OP_STR_SIZE];
 } Operand;
 
+// Stream a human-readable dump of an Operand, intended for debugging output.
+// The union field and val are printed in hex. The stream's formatting flags are
+// restored afterwards so the caller's number base is left exactly as it was.
+inline std::ostream & operator<< (std::ostream &os, const Operand &operand) {
+  const std::ios_base::fmtflags saved_flags = os.flags();
+  os << "Operand{type=" << operand.type << ",width=" << operand.width
+     << ",union{reg/addr/imm/simm}=" << std::hex << operand.reg
+     << ",val=" << operand.val << ",str=" << operand.str << "}";
+  os.flags(saved_flags);
+  return os;
+}
+
 typedef struct {
   uint32_t opcode;
   vaddr_t seq_eip;  // sequential eip
diff --git a/nemu/include/cpu/rtl.h b/nemu/include/cpu/rtl.h
index dd2b41acb64a115f02f6e88ba0e2138c1c6f6e3e..135278555630db929751dbd77aa124b13c89dea3 100644
--- a/nemu/include/cpu/rtl.h
+++ b/nemu/include/cpu/rtl.h
@@ -21,6 +21,10 @@ static inline void interpret_rtl_mv(rtlreg_t* dest, const rtlreg_t *src1) {
   *dest = *src1;
 }
 
+// TODO: Optimize: change rtlreg_t* to rtlreg_t&; the pointer form is a barrier to compiler optimization.
+
+// TODO: Optimize: do NOT use the `at` register in `addi`/`subi`/... instructions. It's unnecessarily slow.
+
 #define make_rtl_arith_logic(name) \
   static inline void concat(interpret_rtl_, name) (rtlreg_t* dest, const rtlreg_t* src1, const rtlreg_t* src2) { \
     *dest = concat(c_, name) (*src1, *src2); \
@@ -158,16 +162,29 @@ static inline void rtl_sext(rtlreg_t* dest, const rtlreg_t* src1, int width) {
   TODO();
 }
 
+// Push the value at src1 onto the guest stack: esp is lowered by OperandBytes and
+// OperandBytes bytes of *src1 are stored at the new esp.
+// OperandBytes defaults to 4, matching the original "esp <- esp - 4" contract, so
+// callers may write either rtl_push(src) or rtl_push<2>(src).
+template <int OperandBytes = 4>
 static inline void rtl_push(const rtlreg_t* src1) {
-  // esp <- esp - 4
-  // M[esp] <- src1
-  TODO();
+  static_assert(OperandBytes == 2 || OperandBytes == 4, "stack operands are 2 or 4 bytes wide");
+  // esp <- esp - OperandBytes
+  rtl_subi(&cpu.esp, &cpu.esp, OperandBytes);
+  // M[esp] <- src1
+  interpret_rtl_sm(&cpu.esp, src1, OperandBytes);
 }
 
+// Pop a value from the guest stack: OperandBytes bytes at esp are loaded into *dest
+// and esp is then raised by OperandBytes.
+// OperandBytes defaults to 4, matching the original "esp <- esp + 4" contract.
+template <int OperandBytes = 4>
 static inline void rtl_pop(rtlreg_t* dest) {
-  // dest <- M[esp]
-  // esp <- esp + 4
-  TODO();
+  static_assert(OperandBytes == 2 || OperandBytes == 4, "stack operands are 2 or 4 bytes wide");
+  // dest <- M[esp]
+  interpret_rtl_lm(dest, &cpu.esp, OperandBytes);
+  // esp <- esp + OperandBytes
+  rtl_addi(&cpu.esp, &cpu.esp, OperandBytes);
 }
 
 static inline void rtl_setrelopi(uint32_t relop, rtlreg_t *dest,
diff --git a/nemu/include/macro.h b/nemu/include/macro.h
index a08559bd06a48f4d980fcabcccf2bb60730c4c98..68b9399fb591ac99a8541e769c66d78df12e9d6b 100644
--- a/nemu/include/macro.h
+++ b/nemu/include/macro.h
@@ -9,4 +9,14 @@
 #define concat4(x, y, z, w) concat3(concat(x, y), z, w)
 #define concat5(x, y, z, v, w) concat4(concat(x, y), z, v, w)
 
+// RLIB_MACRO_DEBUG_ASSERT(expr): assert(expr) when DEBUG is defined; otherwise a
+// no-op that does not evaluate expr. The no-op form expands to ((void)0) so the
+// macro is a valid expression and statement in both configurations.
+// The DEBUG form needs <cassert> to be in scope at the point of use.
+#ifdef DEBUG
+#define RLIB_MACRO_DEBUG_ASSERT(expr) assert(expr)
+#else
+#define RLIB_MACRO_DEBUG_ASSERT(expr) ((void)0)
+#endif
+
 #endif
diff --git a/nemu/src/cpu/decode/decode.cc b/nemu/src/cpu/decode/decode.cc
index 036c330174487f6841004d929d43ddf06bd37003..ecd0665f7c24898bd243bad3f792cc5f6f233c0e 100644
--- a/nemu/src/cpu/decode/decode.cc
+++ b/nemu/src/cpu/decode/decode.cc
@@ -42,7 +42,14 @@ static inline make_DopHelper(SI) {
    *
    op->simm = ???
    */
-  TODO();
+  // Fetch op->width bytes and interpret them as a signed immediate: a 1- or 2-byte
+  // value must be sign-extended (e.g. byte 0xff is -1, not 255) before it is widened.
+  const uint32_t raw_imm = instr_fetch(eip, op->width);
+  switch (op->width) {
+    case 1:  op->simm = static_cast<int8_t>(raw_imm);  break;
+    case 2:  op->simm = static_cast<int16_t>(raw_imm); break;
+    default: op->simm = raw_imm;                       break;
+  }
 
   rtl_li(&op->val, op->simm);
 
diff --git a/nemu/src/cpu/exec/all-instr.h b/nemu/src/cpu/exec/all-instr.h
index 613d8f23432ee953a78af611055d1416408b4a15..d63f2ecf1c202e0f9cdadda9268f1ebee76ca4e7 100644
--- a/nemu/src/cpu/exec/all-instr.h
+++ b/nemu/src/cpu/exec/all-instr.h
@@ -6,3 +6,10 @@ make_EHelper(operand_size);
 
 make_EHelper(inv);
 make_EHelper(nemu_trap);
+
+make_EHelper(ret);
+make_EHelper(call);
+make_EHelper(push);
+make_EHelper(pop);
+//make_EHelper();
+//make_EHelper();
diff --git a/nemu/src/cpu/exec/control.cc b/nemu/src/cpu/exec/control.cc
index 676a2bd2eaf68a64fc7332cb7bc9e6fc43b6219a..f58ec10be0a69d7f6263c48272cb43f0e9cb03ca 100644
--- a/nemu/src/cpu/exec/control.cc
+++ b/nemu/src/cpu/exec/control.cc
@@ -26,13 +26,41 @@ make_EHelper(jmp_rm) {
 
 make_EHelper(call) {
   // the target address is calculated at the decode stage
-  TODO();
+  const bool near = true;  // only the near form is handled; far call is still a TODO (see below)
+  if(near) {
+    if(decoding.is_operand_size_16) {
+      throw std::runtime_error("call operand size 16 not implemented.");  // 16-bit operand size is rejected at run time
+    }
+    else {
+      // operand size 32b: push cpu.eip as a 4-byte value, then add id_src->val to cpu.eip
+      rlib::println("debug: call touched.", std::hex);  // NOTE(review): debug trace -- drop or gate before merging?
+      rtl_push<4>(&cpu.eip);  // NOTE(review): the return address should be the instruction after the call; confirm cpu.eip (rather than decoding.seq_eip?) holds that here
+      rlib::println("debug: idsrc.val=", id_src->val, "eip from=", cpu.eip);
+      rtl_add(&cpu.eip, &cpu.eip, &id_src->val);  // NOTE(review): print_asm below reports decoding.jmp_eip; confirm this sum equals it
+      rlib::println("debug: idsrc.val=", id_src->val, "eip to=", cpu.eip);
+    }
+  }
+ // TODO: support far call
+  // TODO();
 
   print_asm("call %x", decoding.jmp_eip);
 }
 
 make_EHelper(ret) {
-  TODO();
+  const bool near = true;  // only near ret is handled; far ret is still a TODO (see below)
+  if(near) {
+    if(decoding.is_operand_size_16) {
+      throw std::runtime_error("ret operand size 16 not implemented.");
+    }
+    else {
+      // operand size 32b: pop 4 bytes from the stack straight into cpu.eip
+      rtl_pop<4>(&cpu.eip);
+    }
+    rlib::println("debug: decoding.src=", decoding.src);  // NOTE(review): debug trace -- drop or gate before merging?
+  }
+  // NOTE(review): opcode 0xc2 (ret imm16) is also routed to this helper by the opcode table, but the imm16 is not added to esp yet.
+ // TODO: support far ret
+  // TODO();
 
   print_asm("ret");
 }
diff --git a/nemu/src/cpu/exec/data-mov.cc b/nemu/src/cpu/exec/data-mov.cc
index 49ba2f12ec964400def4c1f988574e22bc3744e2..9bf9883fc35c264561ff9ea0317a3428ccfad408 100644
--- a/nemu/src/cpu/exec/data-mov.cc
+++ b/nemu/src/cpu/exec/data-mov.cc
@@ -6,13 +6,29 @@ make_EHelper(mov) {
 }
 
 make_EHelper(push) {
-  TODO();
+  static_assert(sizeof(paddr_t) * 8 == 32);  // compile-time check that paddr_t is 32 bits wide
+  if(decoding.is_operand_size_16) {
+    // 16-bit operand size: esp moves down by 2 and 2 bytes of id_src->val are stored there
+    rtl_push<2>(&id_src->val);
+  }
+  else {
+    // 32-bit operand size: esp moves down by 4 and 4 bytes of id_src->val are stored there
+    rtl_push<4>(&id_src->val);
+  }
 
   print_asm_template1(push);
 }
 
 make_EHelper(pop) {
-  TODO();
+  static_assert(sizeof(paddr_t) * 8 == 32);  // compile-time check that paddr_t is 32 bits wide
+  if(decoding.is_operand_size_16) {
+    // 16-bit operand size: load 2 bytes from the top of the stack, then esp moves up by 2
+    rtl_pop<2>(&id_src->val);  // NOTE(review): the value lands in id_src->val only -- confirm it is written back to the destination operand somewhere
+  }
+  else {
+    // 32-bit operand size: load 4 bytes from the top of the stack, then esp moves up by 4
+    rtl_pop<4>(&id_src->val);  // NOTE(review): same write-back question as the 16-bit branch
+  }
 
   print_asm_template1(pop);
 }
diff --git a/nemu/src/cpu/exec/exec.cc b/nemu/src/cpu/exec/exec.cc
index 373d4c38ac3e58bae6684810dff88d3225d9395e..f05726b15aad71e818c3c83d25bd91613e7bcf41 100644
--- a/nemu/src/cpu/exec/exec.cc
+++ b/nemu/src/cpu/exec/exec.cc
@@ -4,7 +4,7 @@
 typedef struct {
   DHelper decode;
   EHelper execute;
-  int width;
+  int width; // Operand width in bytes. 0 means "use the default operand size" (2 or 4 bytes).
 } opcode_entry;
 
 #define IDEXW(id, ex, w)   {make_DHelper_funcname(id), make_EHelper_funcname(ex), w}
@@ -120,7 +120,7 @@ opcode_entry opcode_table [512] = {
   /* 0xb4 */	IDEXW(mov_I2r, mov, 1), IDEXW(mov_I2r, mov, 1), IDEXW(mov_I2r, mov, 1), IDEXW(mov_I2r, mov, 1),
   /* 0xb8 */	IDEX(mov_I2r, mov), IDEX(mov_I2r, mov), IDEX(mov_I2r, mov), IDEX(mov_I2r, mov),
   /* 0xbc */	IDEX(mov_I2r, mov), IDEX(mov_I2r, mov), IDEX(mov_I2r, mov), IDEX(mov_I2r, mov),
-  /* 0xc0 */	IDEXW(gp2_Ib2E, gp2, 1), IDEX(gp2_Ib2E, gp2), EMPTY, EMPTY,
+  /* 0xc0 */	IDEXW(gp2_Ib2E, gp2, 1), IDEX(gp2_Ib2E, gp2), IDEXW(I, ret, 2), EX(ret),  // 0xc2 is ret imm16: its immediate is always 2 bytes wide
   /* 0xc4 */	EMPTY, EMPTY, IDEXW(mov_I2E, mov, 1), IDEX(mov_I2E, mov),
   /* 0xc8 */	EMPTY, EMPTY, EMPTY, EMPTY,
   /* 0xcc */	EMPTY, EMPTY, EMPTY, EMPTY,
@@ -130,7 +130,7 @@ opcode_entry opcode_table [512] = {
   /* 0xdc */	EMPTY, EMPTY, EMPTY, EMPTY,
   /* 0xe0 */	EMPTY, EMPTY, EMPTY, EMPTY,
   /* 0xe4 */	EMPTY, EMPTY, EMPTY, EMPTY,
-  /* 0xe8 */	EMPTY, EMPTY, EMPTY, EMPTY,
+  /* 0xe8 */  IDEX(J, call), EMPTY, EMPTY, EMPTY,
   /* 0xec */	EMPTY, EMPTY, EMPTY, EMPTY,
   /* 0xf0 */	EMPTY, EMPTY, EMPTY, EMPTY,
   /* 0xf4 */	EMPTY, EMPTY, IDEXW(E, gp3, 1), IDEX(E, gp3),
diff --git a/nemu/src/memory/memory.cc b/nemu/src/memory/memory.cc
index 26370567f4ad710a7751645c11ad987128f35abf..0206211aa00cb15402a95ff951e9d64d1652a2d1 100644
--- a/nemu/src/memory/memory.cc
+++ b/nemu/src/memory/memory.cc
@@ -19,10 +19,12 @@ void paddr_write(paddr_t addr, uint32_t data, int len) {
   memcpy(guest_to_host(addr), &data, len);
 }
 
+// len is the access width in bytes.
 uint32_t vaddr_read(vaddr_t addr, int len) {
   return paddr_read(addr, len);
 }
 
+// len is the access width in bytes.
 void vaddr_write(vaddr_t addr, uint32_t data, int len) {
   paddr_write(addr, data, len);
 }