diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1f8878da14e4a6d6d5803466b61314e05468e52b..6e6ffa5bdd506b815a91babdf4f08557087ee7a8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: archlinux/base
+image: recolic/cxx-toolkit
 
 stages:
     - build
@@ -7,9 +7,9 @@ stages:
 build:
     stage: build
     script: 
-        - pacman -Sy && pacman -S --noconfirm bison flex gcc make gettext sdl2 lib32-glibc grep
+        - pacman -Sy && pacman -S --noconfirm bison flex gettext sdl2 lib32-glibc grep
         - export AM_HOME=$(pwd)/nexus-am/
-        - cd nemu && make
+        - cd nemu && make EXTRA_FLAGS='-DDISABLE_MMIO'
         - show_log=1 ./runall.sh &> testcases.log ; echo $? > testres.log
     artifacts:
         paths:
@@ -17,6 +17,12 @@ build:
             - nemu/testcases.log
         expire_in: 1 week
 
+build-icc-pgo:  # Builds nemu with icpc using profile-guided optimization; the steps live in nemu/icc-build.sh
+    stage: build
+    script:  # same package setup and AM_HOME export as the build job, then the PGO script
+        - pacman -Sy && pacman -S --noconfirm bison flex gettext sdl2 lib32-glibc grep
+        - export AM_HOME=$(pwd)/nexus-am/
+        - cd nemu && ./icc-build.sh
 
 test:
     stage: test
diff --git a/nemu/Makefile b/nemu/Makefile
index e058045535d99d11ce3bf744ce435c0ce42fc517..8e0081d5d066a6525e5735a73d38bee0d2edf542 100644
--- a/nemu/Makefile
+++ b/nemu/Makefile
@@ -19,8 +19,9 @@ include Makefile.git
 CXX ?= g++
 LD = $(CXX)
 INCLUDES  = $(addprefix -I, $(INC_DIR))
-CFLAGS   += -O2 -MMD -Wall -ggdb3 $(INCLUDES) -fomit-frame-pointer -std=c++17
-CFLAGS   += -DDIFF_TEST_QEMU
+CFLAGS   += -O3 -MMD -Wall $(INCLUDES) -fomit-frame-pointer -std=c++17 -pthread
+CFLAGS   += $(EXTRA_FLAGS)
+# EXTRA_FLAGS is a command-line hook for extra compile flags; EXTRA_FLAGS=-DDIFF_TEST_QEMU restores the define that used to be hard-coded here.
 
 # Source code generation before any targets.
 SUBDIRS = src/monitor/debug/expr_impl
@@ -53,7 +54,7 @@ NEMU_EXEC := $(BINARY) $(ARGS)
 $(BINARY): $(OBJS)
 	$(call git_commit, "compile")
 	@echo + LD $@
-	@$(LD) -O2 -rdynamic $(SO_LDLAGS) -o $@ $^ -lSDL2 -lreadline -ldl
+	@$(LD) -O2 -rdynamic $(SO_LDLAGS) -o $@ $^ -lSDL2 -lreadline -ldl -pthread
 
 run: $(BINARY)
 	$(call git_commit, "run")
diff --git a/nemu/icc-build.sh b/nemu/icc-build.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3adee30d7dca9b7e4d15d5703504fbb0b6b2c636
--- /dev/null
+++ b/nemu/icc-build.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Profile-guided (PGO) build of nemu with icpc: build instrumented, run microbench to
+# collect a profile, then rebuild using that profile. Run from the nemu directory.
+
+: "${AM_HOME:?AM_HOME must point to the nexus-am directory}"  # abort early if unset or empty
+xflags="-no-ansi-alias -DDISABLE_MMIO -no-complex-limited-range -qopt-prefetch=2"
+cpus=$(nproc 2>/dev/null || grep -c '^processor' /proc/cpuinfo)  # nproc honours CPU limits; grep is the fallback
+
+make clean &&
+make EXTRA_FLAGS="$xflags -prof-gen" CXX=icpc -j"$cpus" &&
+make -C "$AM_HOME/apps/microbench" ARCH=x86-nemu &&
+build/nemu -b "$AM_HOME/apps/microbench/build/microbench-x86-nemu.bin" &&
+make clean &&
+make EXTRA_FLAGS="$xflags -prof-use" CXX=icpc -j"$cpus"  # last command: its status is the script's exit status
diff --git a/nemu/include/common.h b/nemu/include/common.h
index b734ebc0e3100c1c84e49fb8e93a1cb011d8eee5..83d7ae959575662c70fa4d9f20b3afeffd98cc32 100644
--- a/nemu/include/common.h
+++ b/nemu/include/common.h
@@ -1,7 +1,7 @@
 #ifndef __COMMON_H__
 #define __COMMON_H__
 
-#define DEBUG
+//#define DEBUG
 //#define DIFF_TEST
 
 #if _SHARE
diff --git a/nemu/include/macro.h b/nemu/include/macro.h
index 68b9399fb591ac99a8541e769c66d78df12e9d6b..2bc2d2b77ec74ecb6a2dde413b3d531610c84ea9 100644
--- a/nemu/include/macro.h
+++ b/nemu/include/macro.h
@@ -15,4 +15,7 @@
 #define RLIB_MACRO_DEBUG_ASSERT(expr)
 #endif
 
+#define RLIB_MACRO_LIKELY(x)       __builtin_expect(!!(x),1) /* branch hint: x is usually true; !! maps any truthy value to 1, so the result is 0 or 1 */
+#define RLIB_MACRO_UNLIKELY(x)     __builtin_expect(!!(x),0) /* branch hint: x is usually false; the result is 0 or 1 */
+
 #endif
diff --git a/nemu/include/rlib/3rdparty/prettyprint.hpp b/nemu/include/rlib/3rdparty/prettyprint.hpp
index ce2226a92b3d753826442b16ee18264e0a755839..106c275e9a7e2bd8a9af54f884fee0ba9a6efb1a 100644
--- a/nemu/include/rlib/3rdparty/prettyprint.hpp
+++ b/nemu/include/rlib/3rdparty/prettyprint.hpp
@@ -167,11 +167,11 @@ namespace pretty_print
 
     template <typename T, typename TChar, typename TCharTraits, typename TDelimiters>
     template <typename T1, typename T2>
-    struct print_container_helper<T, TChar, TCharTraits, TDelimiters>::printer<std::pair<T1, T2>>
+    struct print_container_helper<T, TChar, TCharTraits, TDelimiters>::printer<::std::pair<T1, T2>>
     {
         using ostream_type = typename print_container_helper<T, TChar, TCharTraits, TDelimiters>::ostream_type;
 
-        static void print_body(const std::pair<T1, T2> & c, ostream_type & stream)
+        static void print_body(const ::std::pair<T1, T2> & c, ostream_type & stream)
         {
             stream << c.first;
             if (print_container_helper<T, TChar, TCharTraits, TDelimiters>::delimiters_type::values.delimiter != NULL)
@@ -184,10 +184,10 @@ namespace pretty_print
 
     template <typename T, typename TChar, typename TCharTraits, typename TDelimiters>
     template <typename ...Args>
-    struct print_container_helper<T, TChar, TCharTraits, TDelimiters>::printer<std::tuple<Args...>>
+    struct print_container_helper<T, TChar, TCharTraits, TDelimiters>::printer<::std::tuple<Args...>>
     {
         using ostream_type = typename print_container_helper<T, TChar, TCharTraits, TDelimiters>::ostream_type;
-        using element_type = std::tuple<Args...>;
+        using element_type = ::std::tuple<Args...>;
 
         template <std::size_t I> struct Int { };
 
@@ -249,10 +249,10 @@ namespace pretty_print
     struct is_container<std::valarray<T>> : std::true_type { };
 
     template <typename T1, typename T2>
-    struct is_container<std::pair<T1, T2>> : std::true_type { };
+    struct is_container< ::std::pair<T1, T2>> : std::true_type { }; // space after '<' keeps "<::" from being lexed as the "<:" digraph by pre-C++11 tools
 
     template <typename ...Args>
-    struct is_container<std::tuple<Args...>> : std::true_type { };
+    struct is_container< ::std::tuple<Args...>> : std::true_type { }; // space after '<' keeps "<::" from being lexed as the "<:" digraph by pre-C++11 tools
 
 
     // Default delimiters
@@ -316,13 +316,13 @@ namespace pretty_print
 
     // Delimiters for pair and tuple
 
-    template <typename T1, typename T2> struct delimiters<std::pair<T1, T2>, char> { static const delimiters_values<char> values; };
-    template <typename T1, typename T2> const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = { "(", ", ", ")" };
+    template <typename T1, typename T2> struct delimiters< ::std::pair<T1, T2>, char> { static const delimiters_values<char> values; }; // "< ::" avoids the "<:" digraph
+    template <typename T1, typename T2> const delimiters_values<char> delimiters< ::std::pair<T1, T2>, char>::values = { "(", ", ", ")" };
     template <typename T1, typename T2> struct delimiters< ::std::pair<T1, T2>, wchar_t> { static const delimiters_values<wchar_t> values; };
     template <typename T1, typename T2> const delimiters_values<wchar_t> delimiters< ::std::pair<T1, T2>, wchar_t>::values = { L"(", L", ", L")" };
 
-    template <typename ...Args> struct delimiters<std::tuple<Args...>, char> { static const delimiters_values<char> values; };
-    template <typename ...Args> const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = { "(", ", ", ")" };
+    template <typename ...Args> struct delimiters< ::std::tuple<Args...>, char> { static const delimiters_values<char> values; }; // "< ::" avoids the "<:" digraph
+    template <typename ...Args> const delimiters_values<char> delimiters< ::std::tuple<Args...>, char>::values = { "(", ", ", ")" };
     template <typename ...Args> struct delimiters< ::std::tuple<Args...>, wchar_t> { static const delimiters_values<wchar_t> values; };
     template <typename ...Args> const delimiters_values<wchar_t> delimiters< ::std::tuple<Args...>, wchar_t>::values = { L"(", L", ", L")" };
 
diff --git a/nemu/include/util/util.h b/nemu/include/util/util.h
index 330a1d97315d45346981fd79c8eb6cfe9bb7e92f..2fe56754c7a2a04a2229baa1902028b4ba93e464 100644
--- a/nemu/include/util/util.h
+++ b/nemu/include/util/util.h
@@ -11,7 +11,7 @@ namespace rlib {
       return (int8_t)val;
     else if constexpr(BytesCount == 2)
       return (int16_t)val;
-    else return val;
+    return val;
   }
 }
 
diff --git a/nemu/src/cpu/exec/exec.cc b/nemu/src/cpu/exec/exec.cc
index 3c67656de60b0c26d801a851b00a8a3dd5de09d2..f6e847abcce4ca5bc94d63abc7c389f9e799c9d6 100644
--- a/nemu/src/cpu/exec/exec.cc
+++ b/nemu/src/cpu/exec/exec.cc
@@ -14,10 +14,8 @@ typedef struct {
 #define EMPTY              EX(inv)
 
 static inline void set_width(int width) {
-  if (width == 0) {
-    width = decoding.is_operand_size_16 ? 2 : 4;
-  }
-  decoding.src.width = decoding.dest.width = decoding.src2.width = width;
+  const int effective = width != 0 ? width : (decoding.is_operand_size_16 ? 2 : 4);  // width 0: pick 2 or 4 from is_operand_size_16
+  decoding.src.width = decoding.dest.width = decoding.src2.width = effective;
 }
 
 /* Instruction Decode and EXecute */
@@ -211,7 +209,7 @@ namespace EHelperImpl {
     idex(eip, &opcode_table[opcode]);
   }
 
-  make_EHelper(real) {
+  __attribute__((hot)) make_EHelper(real) {
     uint32_t opcode = instr_fetch(eip, 1);
     decoding.opcode = opcode;
     set_width(opcode_table[opcode].width);
diff --git a/nemu/src/device/device.cc b/nemu/src/device/device.cc
index 560d1b08f36209fa834ee22aa3cce70b1ede2153..b36ae401cc29a4af046bc59ab548579e0d775e9e 100644
--- a/nemu/src/device/device.cc
+++ b/nemu/src/device/device.cc
@@ -6,13 +6,16 @@
 #include <signal.h>
 #include <SDL2/SDL.h>
 
+#include <thread>
+#include <atomic>
+
 #define TIMER_HZ 100
 #define VGA_HZ 50
 
 static uint64_t jiffy = 0;
 static struct itimerval it;
-static int device_update_flag = false;
-static int update_screen_flag = false;
+static std::atomic<bool> device_update_flag(false);  // consumed (reset to false) by device_update_thread_daemon()
+static std::atomic<bool> update_screen_flag(false);  // when set, device_update_impl() calls update_screen() and clears it
 
 void init_serial();
 void init_timer();
@@ -37,12 +40,9 @@ static void timer_sig_handler(int signum) {
   Assert(ret == 0, "Can not set timer");
 }
 
-void device_update() {
-  if (!device_update_flag) {
-    return;
-  }
-  device_update_flag = false;
+void device_update() {} // Now an independent thread will do it.
 
+void device_update_impl() {
   if (update_screen_flag) {
     update_screen();
     update_screen_flag = false;
@@ -72,6 +72,16 @@ void device_update() {
   }
 }
 
+// Background loop: runs device_update_impl() whenever device_update_flag has been set.
+static void device_update_thread_daemon() {
+  for(;;) {
+    // exchange(false) clears the flag and reports whether it was set, in one atomic step.
+    if(device_update_flag.exchange(false)) device_update_impl();
+    // Sleep 1ms between polls, i.e. at most 1000 updates per second.
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+}
+
 void sdl_clear_event_queue() {
   SDL_Event event;
   while (SDL_PollEvent(&event));
@@ -93,6 +103,8 @@ void init_device() {
   it.it_value.tv_usec = 1000000 / TIMER_HZ;
   ret = setitimer(ITIMER_VIRTUAL, &it, NULL);
   Assert(ret == 0, "Can not set timer");
+
+  std::thread(device_update_thread_daemon).detach();
 }
 #else
 
diff --git a/nemu/src/device/io/mmio.cc b/nemu/src/device/io/mmio.cc
index 529845fe4d038ed724a139a22c0f3f63fbea9ceb..50feaabe59f30028c4afb7d47c0fbc0435a36545 100644
--- a/nemu/src/device/io/mmio.cc
+++ b/nemu/src/device/io/mmio.cc
@@ -33,9 +33,8 @@ void* add_mmio_map(paddr_t addr, int len, mmio_callback_t callback) {
 }
 
 /* bus interface */
-int is_mmio(paddr_t addr) {
-  int i;
-  for (i = 0; i < nr_map; i ++) {
+__attribute__((hot)) int is_mmio(paddr_t addr) {
+  for (int i = 0; i < nr_map; i ++) {
     if (addr >= maps[i].low && addr <= maps[i].high) {
       return i;
     }
diff --git a/nemu/src/device/vga.cc b/nemu/src/device/vga.cc
index 66445841d2b6d50cda986da93c74f0740338f649..8095b786bb84fac392e771fc34680c39a880632c 100644
--- a/nemu/src/device/vga.cc
+++ b/nemu/src/device/vga.cc
@@ -18,16 +18,15 @@ static SDL_Texture *texture;
 static uint32_t (*vmem) [SCREEN_W];
 static uint32_t *screensize_port_base;
 
-void update_screen() {
-  SDL_UpdateTexture(texture, NULL, vmem, SCREEN_W * sizeof(vmem[0][0]));
-  SDL_RenderClear(renderer);
-  SDL_RenderCopy(renderer, texture, NULL, NULL);
-  SDL_RenderPresent(renderer);
+inline void SDL_ErrorCheck(int ret) {
+  // Log any non-zero SDL return code together with SDL_GetError(); execution continues either way.
+  if(ret == 0) return;
+  rlib::println("SDL_Error: ret=", ret, ", GETERR=", SDL_GetError());
 }
 
-void init_vga() {
-  SDL_Init(SDL_INIT_VIDEO);
-  SDL_CreateWindowAndRenderer(SCREEN_W * 2, SCREEN_H * 2, 0, &window, &renderer);
+static void init_vga_impl() {
+  SDL_ErrorCheck(SDL_Init(SDL_INIT_VIDEO));
+  SDL_ErrorCheck(SDL_CreateWindowAndRenderer(SCREEN_W * 2, SCREEN_H * 2, 0, &window, &renderer));
   SDL_SetWindowTitle(window, "NEMU");
   texture = SDL_CreateTexture(renderer, SDL_PIXELFORMAT_ARGB8888,
       SDL_TEXTUREACCESS_STATIC, SCREEN_W, SCREEN_H);
@@ -36,4 +35,20 @@ void init_vga() {
   *screensize_port_base = ((SCREEN_W) << 16) | (SCREEN_H);
   vmem = reinterpret_cast<decltype(vmem)>(add_mmio_map(VMEM, 0x80000, nullptr));
 }
+
+void update_screen() {  // Presents the vmem framebuffer through SDL; the body is compiled out when DISABLE_MMIO is defined.
+#ifndef DISABLE_MMIO
+  if(window == nullptr) init_vga_impl();  // lazy setup: runs on the calling thread while no window exists yet
+  SDL_ErrorCheck(SDL_UpdateTexture(texture, NULL, vmem, SCREEN_W * sizeof(vmem[0][0])));  // last argument is the pitch: bytes per vmem row
+  SDL_ErrorCheck(SDL_RenderClear(renderer));
+  SDL_ErrorCheck(SDL_RenderCopy(renderer, texture, NULL, NULL));  // NULL rects: whole texture onto the whole render target
+  SDL_RenderPresent(renderer);
+#endif
+}
+
+void init_vga() {
+  // Intentionally empty: because of how SDL is designed, VGA setup must happen on the thread
+  // that updates the screen, so update_screen() runs init_vga_impl() lazily instead.
+}
+
 #endif	/* HAS_IOE */
diff --git a/nemu/src/memory/memory.cc b/nemu/src/memory/memory.cc
index 0206211aa00cb15402a95ff951e9d64d1652a2d1..46be639c22a2af66d476b3a3e6b4798dfb48d34b 100644
--- a/nemu/src/memory/memory.cc
+++ b/nemu/src/memory/memory.cc
@@ -1,4 +1,5 @@
 #include "nemu.h"
+#include "device/mmio.h"
 
 #define PMEM_SIZE (128 * 1024 * 1024)
 
@@ -11,14 +12,35 @@ uint8_t pmem[PMEM_SIZE];
 
 /* Memory accessing interfaces */
 
-uint32_t paddr_read(paddr_t addr, int len) {
-  return pmem_rw(addr, uint32_t) & (~0u >> ((4 - len) << 3));
+__attribute__((hot)) uint32_t paddr_read(paddr_t addr, int len) {
+  // Reads `len` bytes at physical address `addr`; with MMIO enabled, mapped addresses go to mmio_read().
+  // len_mask[len] keeps only the low `len` bytes of the 32-bit word obtained via pmem_rw.
+  static const uint32_t len_mask[] = {0, 0xff, 0xffff, 0xffffff, 0xffffffff};
+
+#ifndef DISABLE_MMIO
+  const auto mmio_id = is_mmio(addr);
+  if(RLIB_MACRO_UNLIKELY(-1 != mmio_id)) {
+    return mmio_read(addr, len, mmio_id);
+  }
+#endif
+
+  return pmem_rw(addr, uint32_t) & len_mask[len];
 }
 
 void paddr_write(paddr_t addr, uint32_t data, int len) {
-  memcpy(guest_to_host(addr), &data, len);
+#ifndef DISABLE_MMIO
+  // Addresses that is_mmio() maps to a device are forwarded to mmio_write() instead of pmem.
+  const auto mmio_id = is_mmio(addr);
+  if(RLIB_MACRO_UNLIKELY(-1 != mmio_id)) {
+    mmio_write(addr, len, data, mmio_id);
+    return;
+  }
+#endif
+  // Ordinary memory: copy the first `len` bytes of `data` to the host location backing `addr`.
+  memcpy(guest_to_host(addr), &data, len);
 }
 
+
 // len is Bytes.
 uint32_t vaddr_read(vaddr_t addr, int len) {
   return paddr_read(addr, len);
diff --git a/nexus-am/am/arch/x86-nemu/src/devices/input.c b/nexus-am/am/arch/x86-nemu/src/devices/input.c
index a0634a778d2e26d2d6c72da0998d30e2d0cfebbc..15173f5d216813888cb8f3f5cf22a417e48ff7be 100644
--- a/nexus-am/am/arch/x86-nemu/src/devices/input.c
+++ b/nexus-am/am/arch/x86-nemu/src/devices/input.c
@@ -3,11 +3,15 @@
 #include <amdev.h>
 
 size_t input_read(uintptr_t reg, void *buf, size_t size) {
+  const uint32_t I8042_DATA_PORT = 0x60;
   switch (reg) {
     case _DEVREG_INPUT_KBD: {
       _KbdReg *kbd = (_KbdReg *)buf;
-      kbd->keydown = 0;
-      kbd->keycode = _KEY_NONE;
+      uint32_t press = inl(I8042_DATA_PORT);
+      kbd->keycode = press;
+      if(press != _KEY_NONE){
+          kbd->keydown = !(kbd->keydown);
+      }
       return sizeof(_KbdReg);
     }
   }
diff --git a/nexus-am/am/arch/x86-nemu/src/devices/video.c b/nexus-am/am/arch/x86-nemu/src/devices/video.c
index f56057286afd174c6f843bec4c7418f27da7110f..2baa1f5d5b7c616a550e9e1ac7c18b68f69781b3 100644
--- a/nexus-am/am/arch/x86-nemu/src/devices/video.c
+++ b/nexus-am/am/arch/x86-nemu/src/devices/video.c
@@ -6,11 +6,13 @@
 static uint32_t* const fb __attribute__((used)) = (uint32_t *)0x40000;
 
 size_t video_read(uintptr_t reg, void *buf, size_t size) {
+  const uint32_t SCREEN_PORT = 0x100;
   switch (reg) {
     case _DEVREG_VIDEO_INFO: {
       _VideoInfoReg *info = (_VideoInfoReg *)buf;
-      info->width = 0;
-      info->height = 0;
+      uint32_t screen = inl(SCREEN_PORT);
+      info->width = screen >> 16;
+      info->height = screen << 16 >> 16;
       return sizeof(_VideoInfoReg);
     }
   }
@@ -21,7 +23,9 @@ size_t video_write(uintptr_t reg, void *buf, size_t size) {
   switch (reg) {
     case _DEVREG_VIDEO_FBCTL: {
       _FBCtlReg *ctl = (_FBCtlReg *)buf;
-
+      for(int i = 0; i < ctl->h; ++i) {
+        memcpy(fb+(ctl->y+i)*screen_width()+ctl->x,ctl->pixels+i*ctl->w,ctl->w*4);
+      }
       if (ctl->sync) {
         // do nothing, hardware syncs.
       }