/* Copyright (C) 2021 Mathieu Maret
   Copyright (C) 2005 David Decotigny
   Copyright (C) 2000-2004, The KOS team

   Initially taken from SOS
*/
#include "cpu_context.h"

#include "assert.h"
#include "gdt.h"
#include "klibc.h"
#include "segment.h"
#include "uaccess.h"

/**
 * Here is the definition of a CPU context for IA32 processors. This
 * is a Matos/SOS convention, not a specification given by the IA32
 * spec. However there is a strong constraint related to the x86
 * interrupt handling specification: the top of the stack MUST be
 * compatible with the 'iret' instruction, ie there must be the
 * err_code (might be 0), eip, cs and eflags of the destination
 * context in that order (see Intel x86 specs vol 3, figure 5-4).
 *
 * @note IMPORTANT: This definition MUST be consistent with the way
 * the registers are stored on the stack in
 * irq_wrappers.S/exception_wrappers.S !!! Hence the constraint above.
 */
struct cpu_state {
    /* (Lower addresses) */

    /* These are Matos/SOS convention */
    uint16_t gs;
    uint16_t fs;
    uint16_t es;
    uint16_t ds;
    uint16_t cpl0_ss; /* This is ALWAYS the Stack Segment of the Kernel
                         context (CPL0) of the interrupted thread, even
                         for a user thread */
    uint16_t alignment_padding; /* unused */
    uint32_t edi;
    uint32_t esi;
    uint32_t ebx;
    uint32_t edx;
    uint32_t ecx;
    uint32_t eax;
    uint32_t ebp;

    /* MUST NEVER CHANGE (dependent on the IA32 iret instruction) */
    uint32_t error_code;
    vaddr_t eip;
    uint32_t cs; /* 32 bits according to the specs! However, the CS
                    register is really 16 bits long */
    uint32_t eflags;

    /* (Higher addresses) */
} __attribute__((packed));

/**
 * The CS value pushed on the stack by the CPU upon interrupt, and
 * needed by the iret instruction, is 32 bits long while the real CPU
 * CS register is 16 bits only: this macro simply retrieves the CPU
 * "CS" register value from the CS value pushed on the stack by the
 * CPU upon interrupt.
 *
 * The remaining 16 bits pushed by the CPU should be considered
 * "reserved" and architecture dependent. IMHO, the specs don't say
 * anything about them. Considering that some architectures generate
 * non-zero values for these 16 bits (at least Cyrix), we'd better
 * ignore them.
 */
#define GET_CPU_CS_REGISTER_VALUE(pushed_ui32_cs_value) ((pushed_ui32_cs_value) & 0xffff)

/**
 * Structure of an interrupted Kernel thread's context
 */
struct cpu_kstate {
    struct cpu_state regs;
} __attribute__((packed));

/**
 * Structure of an interrupted User thread's context. This is almost
 * the same as a kernel context, except that 2 additional values are
 * pushed on the stack before the eflags/cs/eip of the interrupted
 * context: the stack configuration of the interrupted user context.
 *
 * @see Section 6.4.1 of Intel x86 vol 1
 */
struct cpu_ustate {
    struct cpu_state regs;
    struct {
        uint32_t cpl3_esp;
        uint16_t cpl3_ss;
    };
} __attribute__((packed));
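
/*
 * Optional compile-time sanity checks (a minimal sketch, assuming the
 * toolchain accepts C11 _Static_assert and GCC's __builtin_offsetof;
 * drop these lines if the kernel is built with an older language
 * standard). They only restate the constraint documented above: the
 * iret frame (error_code, eip, cs, eflags) must be the last fields of
 * struct cpu_state, in that order, to match irq_wrappers.S and
 * exception_wrappers.S.
 */
_Static_assert(__builtin_offsetof(struct cpu_state, error_code) == sizeof(struct cpu_state) - 16,
               "error_code must start the iret frame");
_Static_assert(__builtin_offsetof(struct cpu_state, eip) == sizeof(struct cpu_state) - 12,
               "eip must follow error_code");
_Static_assert(__builtin_offsetof(struct cpu_state, cs) == sizeof(struct cpu_state) - 8,
               "cs must follow eip");
_Static_assert(__builtin_offsetof(struct cpu_state, eflags) == sizeof(struct cpu_state) - 4,
               "eflags must end struct cpu_state");
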
/**
 * THE main operation of a kernel thread. This routine calls the
 * kernel thread function start_func and calls exit_func when
 * start_func returns.
 */
static void core_routine(cpu_kstate_function_arg1_t *start_func, void *start_arg,
                         cpu_kstate_function_arg1_t *exit_func, void *exit_arg)
    __attribute__((noreturn));

static void core_routine(cpu_kstate_function_arg1_t *start_func, void *start_arg,
                         cpu_kstate_function_arg1_t *exit_func, void *exit_arg)
{
    start_func(start_arg);
    exit_func(exit_arg);

    assert(!"The exit function of the thread should NOT return !");
    for (;;)
        ;
}

/*
 * Structure of a Task State Segment on the x86 Architecture.
 *
 * @see Intel x86 spec vol 3, figure 6-2
 *
 * @note Such a data structure should not cross any page boundary (see
 * end of section 6.2.1 of Intel spec vol 3). This is the reason why
 * we tell gcc to align it on a 128B boundary (its size is 104B, which
 * is <= 128).
 */
struct x86_tss {
    /**
     * Intel provides a way for a task to switch to another in an
     * automatic way (call gates). In this case, the back_link field
     * stores the source TSS of the context switch. This makes it easy
     * to implement coroutines, task backtracking, ... In Matos/SOS we
     * don't use the TSS for context switching, so we always ignore
     * this field.
     * (+0)
     */
    uint16_t back_link;
    uint16_t reserved1;

    /* CPL0 saved context. (+4) */
    vaddr_t esp0;
    uint16_t ss0;
    uint16_t reserved2;

    /* CPL1 saved context. (+12) */
    vaddr_t esp1;
    uint16_t ss1;
    uint16_t reserved3;

    /* CPL2 saved context. (+20) */
    vaddr_t esp2;
    uint16_t ss2;
    uint16_t reserved4;

    /* Interrupted context's saved registers. (+28) */
    vaddr_t cr3;
    vaddr_t eip;
    uint32_t eflags;
    uint32_t eax;
    uint32_t ecx;
    uint32_t edx;
    uint32_t ebx;
    uint32_t esp;
    uint32_t ebp;
    uint32_t esi;
    uint32_t edi;

    /* +72 */
    uint16_t es;
    uint16_t reserved5;

    /* +76 */
    uint16_t cs;
    uint16_t reserved6;

    /* +80 */
    uint16_t ss;
    uint16_t reserved7;

    /* +84 */
    uint16_t ds;
    uint16_t reserved8;

    /* +88 */
    uint16_t fs;
    uint16_t reserved9;

    /* +92 */
    uint16_t gs;
    uint16_t reserved10;

    /* +96 */
    uint16_t ldtr;
    uint16_t reserved11;

    /* +100 */
    uint16_t debug_trap_flag : 1;
    uint16_t reserved12 : 15;
    uint16_t iomap_base_addr;

    /* 104 */
} __attribute__((packed, aligned(128)));

static struct x86_tss kernel_tss;
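
/*
 * Optional compile-time sanity checks (a minimal sketch, assuming C11
 * _Static_assert and GCC's __builtin_offsetof are available). They only
 * restate the field offsets annotated in struct x86_tss above, as given
 * by Intel x86 spec vol 3, figure 6-2.
 */
_Static_assert(__builtin_offsetof(struct x86_tss, ldtr) == 96,
               "x86_tss layout must match Intel x86 spec vol 3, figure 6-2");
_Static_assert(__builtin_offsetof(struct x86_tss, iomap_base_addr) == 102,
               "x86_tss layout must match Intel x86 spec vol 3, figure 6-2");
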
int cpu_context_subsystem_setup()
{
    /* Reset the kernel TSS */
    memset(&kernel_tss, 0x0, sizeof(kernel_tss));

    /**
     * Now setup the kernel TSS.
     *
     * Considering the privilege change method we chose (cpl3 -> cpl0
     * through a software interrupt), we don't need to initialize a
     * full-fledged TSS. See section 6.4.1 of Intel x86 vol 1. Actually,
     * only correct values for the kernel esp and ss are required (aka
     * the "ss0" and "esp0" fields). Since esp0 will have to be updated
     * at privilege change time, we don't have to set it up now.
     */
    kernel_tss.ss0 = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA);

    /* Register this TSS into the gdt */
    gdtRegisterTSS((vaddr_t)&kernel_tss);

    return 0;
}

int cpu_kstate_init(struct cpu_state **ctxt, cpu_kstate_function_arg1_t *start_func,
                    vaddr_t start_arg, vaddr_t stack_bottom, size_t stack_size,
                    cpu_kstate_function_arg1_t *exit_func, vaddr_t exit_arg)
{
    /* We are initializing a Kernel thread's context */
    struct cpu_kstate *kctxt;

    /* This is a critical internal function, so it is assumed that the
       caller knows what they are doing: we legitimately assume that the
       values for ctxt, start_func, stack_* and exit_func are always
       VALID! */

    /* Setup the stack.
     *
     * On x86, the stack grows downward. Each frame is laid out this way
     * (higher addresses first):
     *
     *  - (optional unused space; as of gcc 3.3, this space is 24 bytes)
     *  - arg n
     *  - arg n-1
     *  - ...
     *  - arg 1
     *  - return instruction address: the address the function returns to
     *    once finished
     *  - local variables
     *
     * The code below should be read from the end upward to understand
     * how the processor will handle it.
     */
    vaddr_t tmp_vaddr = stack_bottom + stack_size;
    uint32_t *stack = (uint32_t *)tmp_vaddr;

    /* If needed, poison the stack */
#ifdef CPU_STATE_DETECT_UNINIT_KERNEL_VARS
    memset((void *)stack_bottom, CPU_STATE_STACK_POISON, stack_size);
#elif defined(CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW)
    cpu_state_prepare_detect_kernel_stack_overflow(stack_bottom, stack_size);
#endif

    /* Simulate a call to the core_routine() function: prepare its
       arguments */
    *(--stack) = exit_arg;
    *(--stack) = (uint32_t)exit_func;
    *(--stack) = start_arg;
    *(--stack) = (uint32_t)start_func;
    *(--stack) = 0; /* Return address of core_routine => force page fault */

    /*
     * Setup the initial context structure, so that the CPU will execute
     * the function core_routine() once this new context has been
     * restored on CPU
     */

    /* Compute the base address of the structure, which must be located
       below the previous elements */
    tmp_vaddr = ((vaddr_t)stack) - sizeof(struct cpu_kstate);
    kctxt = (struct cpu_kstate *)tmp_vaddr;

    /* Initialize the CPU context structure */
    memset(kctxt, 0x0, sizeof(struct cpu_kstate));

    /* Tell the CPU context structure that the first instruction to
       execute will be that of the core_routine() function */
    kctxt->regs.eip = (uint32_t)core_routine;

    /* Setup the segment registers */
    kctxt->regs.cs = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KCODE);      /* Code */
    kctxt->regs.ds = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA);      /* Data */
    kctxt->regs.es = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA);      /* Data */
    kctxt->regs.cpl0_ss = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA); /* Stack */
    /* fs and gs unused for the moment. */

    /* The newly created context is initially interruptible */
    kctxt->regs.eflags = (1 << 9); /* set IF bit */

    /* Finally, update the generic kernel/user thread context */
    *ctxt = (struct cpu_state *)kctxt;

    return 0;
}
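
/*
 * Usage sketch (hypothetical caller, for illustration only: the
 * alloc_kernel_stack() helper, THREAD_STACK_SIZE, my_thread_func() and
 * thread_exit() names below are assumptions, not provided by this file):
 *
 *   struct cpu_state *ctxt;
 *   vaddr_t stack_bottom = (vaddr_t)alloc_kernel_stack(THREAD_STACK_SIZE);
 *
 *   cpu_kstate_init(&ctxt, my_thread_func, (vaddr_t)my_arg,
 *                   stack_bottom, THREAD_STACK_SIZE,
 *                   thread_exit, (vaddr_t)0);
 *
 * Once the context-switch code (cpu_context_switch.S) restores "ctxt" on
 * the CPU, core_routine() runs my_thread_func(my_arg) and then
 * thread_exit(0).
 */
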
int cpu_ustate_init(struct cpu_state **ctx, uaddr_t startPC, uint32_t arg1, uint32_t arg2,
                    uaddr_t startSP, vaddr_t kernelStackBottom, size_t kernelStackSize)
{
    // The user context is stacked above the usual cpu state by the CPU on
    // context switch. So store it where the CPU expects it (see
    // cpu_kstate_init for more details).
    struct cpu_ustate *uctx =
        (struct cpu_ustate *)(kernelStackBottom + kernelStackSize - sizeof(struct cpu_ustate));

    /* If needed, poison the stack */
#ifdef CPU_STATE_DETECT_UNINIT_KERNEL_VARS
    memset((void *)kernelStackBottom, CPU_STATE_STACK_POISON, kernelStackSize);
#elif defined(CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW)
    cpu_state_prepare_detect_kernel_stack_overflow(kernelStackBottom, kernelStackSize);
#endif

    memset(uctx, 0, sizeof(struct cpu_ustate));

    uctx->regs.eip = startPC;
    uctx->regs.eax = arg1;
    uctx->regs.ebx = arg2;

    uctx->regs.cs = BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UCODE);      // Code
    uctx->regs.ds = BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UDATA);      // Data
    uctx->regs.es = BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UDATA);      // Data
    uctx->regs.cpl0_ss = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA); // Kernel Stack

    uctx->cpl3_ss = BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UDATA);      // User Stack
    uctx->cpl3_esp = startSP;

    /* The newly created context is initially interruptible */
    uctx->regs.eflags = (1 << 9); /* set IF bit */

    *ctx = (struct cpu_state *)uctx;

    return 0;
}
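
/*
 * Usage sketch (hypothetical caller, for illustration only: the process
 * loader names below - prog_entry_point, argc/argv, user_stack_top,
 * kstack_bottom, KSTACK_SIZE - are assumptions, not part of this file):
 *
 *   struct cpu_state *uctx;
 *
 *   cpu_ustate_init(&uctx,
 *                   prog_entry_point,            // user-mode PC
 *                   argc, (uint32_t)argv,        // delivered in eax / ebx
 *                   user_stack_top,              // user-mode SP (CPL3)
 *                   kstack_bottom, KSTACK_SIZE); // kernel stack of this thread
 *
 * When this context is restored, the final iret switches to CPL3 and the
 * thread starts at prog_entry_point with the user stack in cpl3_ss:cpl3_esp.
 */
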
#if defined(CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW)
void cpu_state_prepare_detect_kernel_stack_overflow(vaddr_t stack_bottom, size_t stack_size)
{
    size_t poison_size = CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW;

    if (poison_size > stack_size)
        poison_size = stack_size;

    memset((void *)stack_bottom, CPU_STATE_STACK_POISON, poison_size);
}

void cpu_state_detect_kernel_stack_overflow(const struct cpu_state *ctxt, vaddr_t stack_bottom,
                                            size_t stack_size)
{
    unsigned char *c;
    size_t i;

    /* On Matos/SOS, "ctxt" corresponds to the address of the esp register
       of the saved context in Kernel mode (always, even for the
       interrupted context of a user thread). Here we make sure that this
       stack pointer is within the allowed stack area */
    assert(((vaddr_t)ctxt) >= stack_bottom);
    assert(((vaddr_t)ctxt) + sizeof(struct cpu_kstate) <= stack_bottom + stack_size);

    /* Check that the bottom of the stack has not been altered */
    for (c = (unsigned char *)stack_bottom, i = 0;
         (i < CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW) && (i < stack_size); c++, i++) {
        assert(CPU_STATE_STACK_POISON == *c);
    }
}
#endif

/* =======================================================================
 * Public Accessor functions
 */

int cpu_context_is_in_user_mode(const struct cpu_state *ctxt)
{
    /* An interrupted user thread has its CS register set to that of the
       User code segment */
    switch (GET_CPU_CS_REGISTER_VALUE(ctxt->cs)) {
    case BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UCODE):
        return TRUE;
        break;

    case BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KCODE):
        return FALSE;
        break;

    default:
        pr_err("Invalid saved context Code segment register: 0x%x (k=%x, u=%x) !",
               (unsigned)GET_CPU_CS_REGISTER_VALUE(ctxt->cs),
               BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KCODE),
               BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UCODE));
        break;
    }

    /* Should never get here */
    return -1;
}

vaddr_t cpu_context_get_PC(const struct cpu_state *ctxt)
{
    assert(NULL != ctxt);

    /* This is the PC of the interrupted context (ie kernel or user
       context). */
    return ctxt->eip;
}

vaddr_t cpu_context_get_SP(const struct cpu_state *ctxt)
{
    assert(NULL != ctxt);

    /* On Matos/SOS, "ctxt" corresponds to the address of the esp register
       of the saved context in Kernel mode (always, even for the
       interrupted context of a user thread). */
    return (vaddr_t)ctxt;
}

uint32_t cpu_context_get_EX_err(const struct cpu_state *ctxt)
{
    assert(NULL != ctxt);

    /* This is the err_code of the interrupted context (ie kernel or user
       context). */
    return ctxt->error_code;
}

vaddr_t cpu_context_get_EX_faulting_vaddr(const struct cpu_state *ctxt)
{
    assert(NULL != ctxt);

    /* A page fault has occurred: the faulting address is stored in the
       CR2 register. */
    vaddr_t faulting_address;
    asm volatile("mov %%cr2, %0" : "=r"(faulting_address));

    return faulting_address;
}
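
/*
 * Usage sketch (hypothetical page-fault handler, for illustration only:
 * pagefault_handler() and its wiring into the exception wrappers are
 * assumptions, not part of this file):
 *
 *   void pagefault_handler(struct cpu_state *ctxt)
 *   {
 *       vaddr_t  addr = cpu_context_get_EX_faulting_vaddr(ctxt);
 *       uint32_t err  = cpu_context_get_EX_err(ctxt);
 *
 *       if (cpu_context_is_in_user_mode(ctxt)) {
 *           // User fault at "addr": map the page, grow the stack, or
 *           // kill the offending thread, depending on "err"
 *       } else {
 *           // Kernel fault: this is a kernel bug
 *       }
 *   }
 */
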
void cpu_context_dump(const struct cpu_state *ctxt)
{
    printf("CPU: eip=%x esp=%x eflags=%x cs=%x ds=%x ss=%x err=%x",
           (unsigned)ctxt->eip, (unsigned)ctxt, (unsigned)ctxt->eflags,
           (unsigned)GET_CPU_CS_REGISTER_VALUE(ctxt->cs), (unsigned)ctxt->ds,
           (unsigned)ctxt->cpl0_ss, (unsigned)ctxt->error_code);
}

/* *************************************************************
 * Function to manage the TSS. This function is not really "public":
 * it is reserved to the assembler routines defined in
 * cpu_context_switch.S
 *
 * Update the kernel stack address so that IRQs, syscalls and
 * exceptions return to a correct stack location when coming back into
 * kernel mode.
 */
void cpu_context_update_kernel_tss(struct cpu_state *next_ctxt)
{
    /* Does next_ctxt correspond to an interrupted user thread? */
    if (cpu_context_is_in_user_mode(next_ctxt)) {
        /*
         * Yes: "next_ctxt" is an interrupted user thread => we are
         * going to switch to user mode! Setup the stack address so
         * that the user thread "next_ctxt" can come back to the correct
         * stack location when returning in kernel mode.
         *
         * This stack location corresponds to the SP of the next user
         * thread once its context has been transferred on the CPU, ie
         * once the CPU has executed all the pop/iret instructions of the
         * context switch with privilege change.
         */
        kernel_tss.esp0 = ((vaddr_t)next_ctxt) + sizeof(struct cpu_ustate);

        /* Note: no need to protect this against IRQs because IRQs are
           not allowed to update it by themselves, and they are not
           allowed to block */
    } else {
        /* No: no need to update the kernel TSS when we stay in kernel
           mode */
    }
}

inline int syscallGet3args(const struct cpu_state *user_ctxt,
                           /* out */ unsigned int *arg1,
                           /* out */ unsigned int *arg2,
                           /* out */ unsigned int *arg3)
{
    *arg1 = user_ctxt->ebx;
    *arg2 = user_ctxt->ecx;
    *arg3 = user_ctxt->edx;
    return 0;
}

int syscallGet1arg(const struct cpu_state *user_ctxt,
                   /* out */ unsigned int *arg1)
{
    unsigned int unused;

    return syscallGet3args(user_ctxt, arg1, &unused, &unused);
}

int syscallGet2args(const struct cpu_state *user_ctxt,
                    /* out */ unsigned int *arg1,
                    /* out */ unsigned int *arg2)
{
    unsigned int unused;

    return syscallGet3args(user_ctxt, arg1, arg2, &unused);
}

int syscallGet4args(const struct cpu_state *user_ctxt,
                    /* out */ unsigned int *arg1,
                    /* out */ unsigned int *arg2,
                    /* out */ unsigned int *arg3,
                    /* out */ unsigned int *arg4)
{
    uaddr_t userOtherArgs;
    unsigned int otherArgs[2];
    int ret;

    ret = syscallGet3args(user_ctxt, arg1, arg2, (unsigned int *)&userOtherArgs);
    if (ret)
        return ret;

    ret = memcpyFromUser((vaddr_t)otherArgs, userOtherArgs, sizeof(otherArgs));
    if (ret != sizeof(otherArgs))
        return -EFAULT;

    *arg3 = otherArgs[0];
    *arg4 = otherArgs[1];

    return 0;
}

int syscallGet5args(const struct cpu_state *user_ctxt,
                    /* out */ unsigned int *arg1,
                    /* out */ unsigned int *arg2,
                    /* out */ unsigned int *arg3,
                    /* out */ unsigned int *arg4,
                    /* out */ unsigned int *arg5)
{
    uaddr_t userOtherArgs;
    unsigned int otherArgs[3];
    int ret;

    ret = syscallGet3args(user_ctxt, arg1, arg2, (unsigned int *)&userOtherArgs);
    if (ret)
        return ret;

    ret = memcpyFromUser((vaddr_t)otherArgs, userOtherArgs, sizeof(otherArgs));
    if (ret != sizeof(otherArgs))
        return -EFAULT;

    *arg3 = otherArgs[0];
    *arg4 = otherArgs[1];
    *arg5 = otherArgs[2];

    return 0;
}
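
/*
 * Usage sketch (hypothetical syscall dispatcher, for illustration only:
 * the sys_write() handler and its fd/buf/len argument names are
 * assumptions, not part of this file):
 *
 *   unsigned int fd, buf, len;
 *   int retval = -1;
 *
 *   if (syscallGet3args(user_ctxt, &fd, &buf, &len) == 0)
 *       retval = sys_write((int)fd, (const char *)buf, (size_t)len);
 *
 * Convention implemented above: arguments 1-3 travel in ebx/ecx/edx. For
 * syscalls with 4 or 5 arguments, ebx/ecx still carry arguments 1 and 2
 * while edx holds a user-space pointer to the remaining ones, which
 * syscallGet4args()/syscallGet5args() copy in with memcpyFromUser(),
 * hence the possible -EFAULT return.
 */
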