matos/arch/x86/cpu_context.c

/* Copyright (C) 2021  Mathieu Maret
   Copyright (C) 2005  David Decotigny
   Copyright (C) 2000-2004, The KOS team

   Initially taken from SOS
*/

#include "cpu_context.h"
#include "assert.h"
#include "gdt.h"
#include "klibc.h"
#include "segment.h"
#include "uaccess.h"

/**
 * Definition of a saved CPU context for IA32 processors. This layout
 * is a Matos/SOS convention, not something mandated by the IA32
 * specification. There is however one strong constraint coming from
 * the x86 interrupt handling specification: the top of the saved frame
 * MUST be compatible with the 'iret' instruction, i.e. it must hold the
 * err_code (might be 0), eip, cs and eflags of the destination
 * context, in that order (see Intel x86 specs vol 3, figure 5-4).
 *
 * The structure is declared packed so that the compiler inserts no
 * padding of its own between the fields.
 *
 * @note IMPORTANT: This definition MUST be consistent with the way
 * the registers are stored on the stack in
 * irq_wrappers.S/exception_wrappers.S !!! Hence the constraint above.
 */
struct cpu_state {
    /* (Lower addresses) */

    /* Segment registers: these are a Matos/SOS convention */
    uint16_t gs;
    uint16_t fs;
    uint16_t es;
    uint16_t ds;
    uint16_t cpl0_ss;           /* This is ALWAYS the Stack Segment of the
                                   kernel context (CPL0) of the interrupted
                                   thread, even for a user thread */
    uint16_t alignment_padding; /* unused; follows the five 16-bit segment
                                   fields above */

    /* General purpose registers: also a Matos/SOS convention */
    uint32_t edi;
    uint32_t esi;
    uint32_t ebx;
    uint32_t edx;
    uint32_t ecx;
    uint32_t eax;
    uint32_t ebp;

    /* MUST NEVER CHANGE (dependent on the IA32 iret instruction) */
    uint32_t error_code;
    vaddr_t eip;
    uint32_t cs; /* 32 bits according to the specs! However, the CS
                    register itself is only 16 bits long: read it through
                    GET_CPU_CS_REGISTER_VALUE() */
    uint32_t eflags;

    /* (Higher addresses) */
} __attribute__((packed));

/**
 * Extract the real 16-bit CS selector from the 32-bit CS slot found in
 * a saved context.
 *
 * Upon interrupt the CPU pushes CS as a 32-bit value (and iret expects
 * it that way), although the CS register itself is only 16 bits wide.
 * The upper 16 bits of the pushed value should be considered
 * "reserved" and architecture dependent: the specs do not seem to say
 * anything about them, and some architectures (at least Cyrix)
 * generate non-zero values there. They are therefore masked out.
 */
#define GET_CPU_CS_REGISTER_VALUE(pushed_ui32_cs_value) (0xffff & (pushed_ui32_cs_value))

/**
 * Structure of an interrupted Kernel thread's context.
 *
 * It holds nothing more than the generic register frame: unlike a
 * user context (see struct cpu_ustate), no user stack information
 * follows it.
 */
struct cpu_kstate {
    struct cpu_state regs; /* generic saved register frame */
} __attribute__((packed));

/**
 * Structure of an interrupted User thread's context. This is almost
 * the same as a kernel context, except that 2 additional values are
 * pushed on the stack before the eflags/cs/eip of the interrupted
 * context: the stack configuration (esp and ss) of the interrupted
 * user context.
 *
 * NOTE(review): the CPU stores the user SS in a full 32-bit stack slot
 * while cpl3_ss is declared 16 bits wide. The size of this structure
 * matching the hardware frame therefore presumably relies on the inner
 * anonymous struct not being packed itself (i.e. keeping its 2 bytes of
 * trailing padding) -- confirm that
 * sizeof(struct cpu_ustate) == sizeof(struct cpu_state) + 8 before
 * touching this definition, since the size is used to place the initial
 * user context and to compute the TSS esp0 value.
 *
 * @see Section 6.4.1 of Intel x86 vol 1
 */
struct cpu_ustate {
    struct cpu_state regs; /* generic saved register frame */
    struct {
        uint32_t cpl3_esp; /* user (CPL3) stack pointer */
        uint16_t cpl3_ss;  /* user (CPL3) stack segment selector */
    };
} __attribute__((packed));

/**
 * Entry point executed by every newly created kernel thread (its
 * address is what cpu_kstate_init() stores as the initial eip).
 *
 * It runs start_func(start_arg) and, once that returns,
 * exit_func(exit_arg). The exit function is not supposed to hand
 * control back: if it ever does, the assertion below fires and the
 * routine then spins forever, which is why it is declared noreturn.
 */
static void __attribute__((noreturn))
core_routine(cpu_kstate_function_arg1_t *start_func, void *start_arg,
             cpu_kstate_function_arg1_t *exit_func, void *exit_arg)
{
    /* Body of the thread */
    start_func(start_arg);

    /* Thread termination hook */
    exit_func(exit_arg);

    assert(!"The exit function of the thread should NOT return !");
    while (1) {
        /* never leave */
    }
}

/*
 * Structure of a Task State Segment on the x86 Architecture.
 *
 * @see Intel x86 spec vol 3, figure 6-2
 *
 * The "(+N)" annotations below give the byte offset of the field that
 * follows them inside the TSS.
 *
 * @note Such a data structure should not cross any page boundary (see
 * end of section 6.2.1 of Intel spec vol 3). This is the reason why
 * we tell gcc to align it on a 128B boundary (its size is 104B, which
 * is <= 128).
 */
struct x86_tss {

    /**
     * Intel provides a way for a task to switch to another in an
     * automatic way (call gates). In this case, the back_link field
     * stores the source TSS of the context switch. This makes it easy
     * to implement coroutines, task backtracking, ... In Matos/SOS we
     * don't use the TSS for context switching purposes, so we always
     * ignore this field.
     * (+0)
     */
    uint16_t back_link;

    uint16_t reserved1;

    /* CPL0 saved context. (+4) */
    vaddr_t esp0;
    uint16_t ss0;

    uint16_t reserved2;

    /* CPL1 saved context. (+12) */
    vaddr_t esp1;
    uint16_t ss1;

    uint16_t reserved3;

    /* CPL2 saved context. (+20) */
    vaddr_t esp2;
    uint16_t ss2;

    uint16_t reserved4;

    /* Interrupted context's saved registers. (+28) */
    vaddr_t cr3;
    vaddr_t eip;
    uint32_t eflags;
    uint32_t eax;
    uint32_t ecx;
    uint32_t edx;
    uint32_t ebx;
    uint32_t esp;
    uint32_t ebp;
    uint32_t esi;
    uint32_t edi;

    /* Segment selectors, each followed by 16 reserved bits. */

    /* +72 */
    uint16_t es;
    uint16_t reserved5;

    /* +76 */
    uint16_t cs;
    uint16_t reserved6;

    /* +80 */
    uint16_t ss;
    uint16_t reserved7;

    /* +84 */
    uint16_t ds;
    uint16_t reserved8;

    /* +88 */
    uint16_t fs;
    uint16_t reserved9;

    /* +92 */
    uint16_t gs;
    uint16_t reserved10;

    /* +96 */
    uint16_t ldtr;
    uint16_t reserved11;

    /* +100: debug trap flag (1 bit), 15 reserved bits, then the I/O map
       base address */
    uint16_t debug_trap_flag : 1;
    uint16_t reserved12 : 15;
    uint16_t iomap_base_addr;

    /* 104 */
} __attribute__((packed, aligned(128)));

/* The TSS instance managed by this file: it is zeroed and gets its ss0
   field set in cpu_context_subsystem_setup(), and its esp0 field is
   refreshed by cpu_context_update_kernel_tss(). */
static struct x86_tss kernel_tss;

int cpu_context_subsystem_setup()
{
    /* Reset the kernel TSS */
    memset(&kernel_tss, 0x0, sizeof(kernel_tss));

    /**
     * Now setup the kernel TSS.
     *
     * Considering the privilege change method we choose (cpl3 -> cpl0
     * through a software interrupt), we don't need to initialize a
     * full-fledged TSS. See section 6.4.1 of Intel x86 vol 1. Actually,
     * only a correct value for the kernel esp and ss are required (aka
     * "ss0" and "esp0" fields). Since the esp0 will have to be updated
     * at privilege change time, we don't have to set it up now.
     */
    kernel_tss.ss0 = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA);

    /* Register this TSS into the gdt */
    gdtRegisterTSS((vaddr_t)&kernel_tss);

    return 0;
}

/**
 * Build, at the top of the given kernel stack, the initial context of a
 * new kernel thread.
 *
 * Once this context is restored on the CPU, execution starts in
 * core_routine(), which calls start_func(start_arg) and then
 * exit_func(exit_arg).
 *
 * @param ctxt         (out) receives the address of the prepared context
 * @param start_func   function run by the thread
 * @param start_arg    argument handed to start_func
 * @param stack_bottom lowest address of the thread's kernel stack
 * @param stack_size   size in bytes of the thread's kernel stack
 * @param exit_func    function called when start_func returns
 * @param exit_arg     argument handed to exit_func
 * @return 0 (always)
 */
int cpu_kstate_init(struct cpu_state **ctxt, cpu_kstate_function_arg1_t *start_func,
                    vaddr_t start_arg, vaddr_t stack_bottom, size_t stack_size,
                    cpu_kstate_function_arg1_t *exit_func, vaddr_t exit_arg)
{
    /* We are initializing a Kernel thread's context */
    struct cpu_kstate *kctxt;

    /* This is a critical internal function, so it is assumed that the
       caller knows what he does: we legitimately assume that the values
       of ctxt, start_func, stack_* and exit_func are always VALID ! */

    /* Setup the stack.
     *
     * On x86, the stack goes downward. Each frame is configured this
     * way (higher addresses first):
     *
     *  - (optional unused space. As of gcc 3.3, this space is 24 bytes)
     *  - arg n
     *  - arg n-1
     *  - ...
     *  - arg 1
     *  - return instruction address: The address the function returns to
     *    once finished
     *  - local variables
     *
     * The remaining of the code should be read from the end upward to
     * understand how the processor will handle it.
     */

    vaddr_t tmp_vaddr = stack_bottom + stack_size;
    uint32_t *stack   = (uint32_t *)tmp_vaddr;

    /* If needed, poison the stack. This must happen before anything is
       written into the stack area below. */
#ifdef CPU_STATE_DETECT_UNINIT_KERNEL_VARS
    memset((void *)stack_bottom, CPU_STATE_STACK_POISON, stack_size);
#elif defined(CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW)
    /* The context does not exist yet at this point and the callee
       ignores its first argument, so pass NULL for it */
    cpu_state_prepare_detect_kernel_stack_overflow(NULL, stack_bottom, stack_size);
#endif

    /* Simulate a call to the core_routine() function: prepare its
       arguments (pushed last to first) */
    *(--stack) = exit_arg;
    *(--stack) = (uint32_t)exit_func;
    *(--stack) = start_arg;
    *(--stack) = (uint32_t)start_func;
    *(--stack) = 0; /* Return address of core_routine => force page fault */

    /*
     * Setup the initial context structure, so that the CPU will execute
     * the function core_routine() once this new context has been
     * restored on CPU
     */

    /* Compute the base address of the structure, which must be located
       below the previous elements */
    tmp_vaddr = ((vaddr_t)stack) - sizeof(struct cpu_kstate);
    kctxt     = (struct cpu_kstate *)tmp_vaddr;

    /* Initialize the CPU context structure */
    memset(kctxt, 0x0, sizeof(struct cpu_kstate));

    /* Tell the CPU context structure that the first instruction to
       execute will be that of the core_routine() function */
    kctxt->regs.eip = (uint32_t)core_routine;

    /* Setup the segment registers */
    kctxt->regs.cs      = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KCODE); /* Code */
    kctxt->regs.ds      = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA); /* Data */
    kctxt->regs.es      = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA); /* Data */
    kctxt->regs.cpl0_ss = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA); /* Stack */
    /* fs and gs unused for the moment (left at 0 by the memset). */

    /* The newly created context is initially interruptible */
    kctxt->regs.eflags = (1 << 9); /* set IF bit */

    /* Finally, update the generic kernel/user thread context */
    *ctxt = (struct cpu_state *)kctxt;

    return 0;
}

/**
 * Build, at the top of the given kernel stack, the initial context of a
 * new user thread.
 *
 * Once this context is restored on the CPU, execution starts in user
 * mode at startPC, on the user stack startSP, with arg1 in eax and arg2
 * in ebx.
 *
 * @param ctx               (out) receives the address of the prepared context
 * @param startPC           user address of the first instruction to run
 * @param arg1              value placed in eax
 * @param arg2              value placed in ebx
 * @param startSP           initial user stack pointer
 * @param kernelStackBottom lowest address of the thread's kernel stack
 * @param kernelStackSize   size in bytes of the thread's kernel stack
 * @return 0 (always)
 */
int cpu_ustate_init(struct cpu_state **ctx, uaddr_t startPC, uint32_t arg1, uint32_t arg2,
                    uaddr_t startSP, vaddr_t kernelStackBottom, size_t kernelStackSize)
{

    // The user context is stacked above the usual cpu state by the CPU on context switch.
    // So store it where the cpu expects it (See cpu_kstate_init for more details)
    struct cpu_ustate *uctx =
        (struct cpu_ustate *)(kernelStackBottom + kernelStackSize - sizeof(struct cpu_ustate));

    /* If needed, poison the stack. This must happen before the context
       is written into the stack area. */
#ifdef CPU_STATE_DETECT_UNINIT_KERNEL_VARS
    memset((void *)kernelStackBottom, CPU_STATE_STACK_POISON, kernelStackSize);
#elif defined(CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW)
    /* The callee ignores its first (context) argument, so pass NULL */
    cpu_state_prepare_detect_kernel_stack_overflow(NULL, kernelStackBottom, kernelStackSize);
#endif

    memset(uctx, 0, sizeof(struct cpu_ustate));

    /* Entry point and the two arguments handed over in registers */
    uctx->regs.eip = startPC;
    uctx->regs.eax = arg1;
    uctx->regs.ebx = arg2;

    /* User (CPL3) code/data segments; the kernel stack segment stays at
       CPL0. fs and gs are left at 0 by the memset above. */
    uctx->regs.cs      = BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UCODE); // Code
    uctx->regs.ds      = BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UDATA); // Data
    uctx->regs.es      = BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UDATA); // Data
    uctx->regs.cpl0_ss = BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KDATA); // Kernel Stack
    uctx->cpl3_ss      = BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UDATA); // User Stack

    uctx->cpl3_esp = startSP;

    /* The newly created context is initially interruptible */
    uctx->regs.eflags = (1 << 9); /* set IF bit */

    *ctx = (struct cpu_state *)uctx;
    return 0;
}

#if defined(CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW)
/**
 * Fill the lowest bytes of a kernel stack with the poison value so that
 * cpu_state_detect_kernel_stack_overflow() can later check that they
 * are still intact.
 *
 * At most CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW bytes are poisoned,
 * and never more than the stack itself.
 *
 * @param ctxt         unused
 * @param stack_bottom lowest address of the stack
 * @param stack_size   size in bytes of the stack
 */
void cpu_state_prepare_detect_kernel_stack_overflow(const struct cpu_state *ctxt,
                                                    vaddr_t stack_bottom, size_t stack_size)
{
    const size_t guard_len = CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW;

    (void)ctxt;

    /* Clamp the guard area to the size of the stack */
    memset((void *)stack_bottom, CPU_STATE_STACK_POISON,
           (guard_len > stack_size) ? stack_size : guard_len);
}

/**
 * Assert that a saved context lies inside its kernel stack and that the
 * poisoned area at the bottom of that stack has not been overwritten.
 *
 * @param ctxt         saved context to check
 * @param stack_bottom lowest address of the stack
 * @param stack_size   size in bytes of the stack
 */
void cpu_state_detect_kernel_stack_overflow(const struct cpu_state *ctxt, vaddr_t stack_bottom,
                                            size_t stack_size)
{
    /* Number of poisoned bytes to verify: the guard size, clamped to the
       size of the stack */
    size_t remaining = CPU_STATE_DETECT_KERNEL_STACK_OVERFLOW;
    if (remaining > stack_size) {
        remaining = stack_size;
    }

    /* On Matos/SOS, "ctxt" is the value of the kernel-mode esp of the
       saved context (always, even for the interrupted context of a user
       thread): make sure this stack pointer is within the allowed stack
       area */
    assert(((vaddr_t)ctxt) >= stack_bottom);
    assert(((vaddr_t)ctxt) + sizeof(struct cpu_kstate) <= stack_bottom + stack_size);

    /* Walk the guard area upward from the bottom of the stack: every
       byte must still hold the poison value */
    unsigned char *c = (unsigned char *)stack_bottom;
    while (remaining > 0) {
        assert(CPU_STATE_STACK_POISON == *c);
        c++;
        remaining--;
    }
}
#endif

/* =======================================================================
 * Public Accessor functions
 */
/**
 * Tell whether a saved context belongs to user or kernel mode, based on
 * the code segment selector stored in it.
 *
 * @param ctxt saved context to inspect
 * @return TRUE when the saved CS is the user code segment, FALSE when
 *         it is the kernel code segment, -1 (after logging an error)
 *         for any other value
 */
int cpu_context_is_in_user_mode(const struct cpu_state *ctxt)
{
    const uint32_t saved_cs = GET_CPU_CS_REGISTER_VALUE(ctxt->cs);

    /* An interrupted user thread has its CS register set to that of the
       User code segment */
    if (saved_cs == BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UCODE)) {
        return TRUE;
    }

    if (saved_cs == BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KCODE)) {
        return FALSE;
    }

    /* Neither the kernel nor the user code segment: should never happen */
    pr_err("Invalid saved context Code segment register: 0x%x (k=%x, u=%x) !",
           (unsigned)saved_cs,
           BUILD_SEGMENT_REG_VALUE(0, FALSE, SEG_KCODE),
           BUILD_SEGMENT_REG_VALUE(3, FALSE, SEG_UCODE));

    return -1;
}

/**
 * Return the program counter stored in a saved context, i.e. the eip of
 * the interrupted (kernel or user) context.
 *
 * @param ctxt saved context, must not be NULL
 */
vaddr_t cpu_context_get_PC(const struct cpu_state *ctxt)
{
    assert(NULL != ctxt);

    const vaddr_t interrupted_pc = ctxt->eip;
    return interrupted_pc;
}

vaddr_t cpu_context_get_SP(const struct cpu_state *ctxt)
{
    assert(NULL != ctxt);

    /* On Matos/SOS, "ctxt" corresponds to the address of the esp register of
       the saved context in Kernel mode (always, even for the interrupted
       context of a user thread). */
    return (vaddr_t)ctxt;
}

/**
 * Return the exception error code stored in a saved context (kernel or
 * user context alike).
 *
 * @param ctxt saved context, must not be NULL
 */
uint32_t cpu_context_get_EX_err(const struct cpu_state *ctxt)
{
    assert(NULL != ctxt);

    const uint32_t err_code = ctxt->error_code;
    return err_code;
}

/**
 * Return the address that triggered a page fault.
 *
 * The value is not taken from the saved context: it is read straight
 * from the CR2 register, where the faulting address is stored.
 *
 * @param ctxt saved context, must not be NULL (only checked, not read)
 */
vaddr_t cpu_context_get_EX_faulting_vaddr(const struct cpu_state *ctxt)
{
    vaddr_t cr2_value;

    assert(NULL != ctxt);

    /* Fetch the current content of CR2 */
    asm volatile("mov %%cr2, %0" : "=r"(cr2_value));

    return cr2_value;
}

/**
 * Print the main registers of a saved context (eip, esp, eflags, cs, ds,
 * ss and the error code) in hexadecimal.
 *
 * The value printed as esp is the address of the context itself.
 *
 * @param ctxt saved context to display
 */
void cpu_context_dump(const struct cpu_state *ctxt)
{
    const unsigned eip    = (unsigned)ctxt->eip;
    const unsigned esp    = (unsigned)ctxt;
    const unsigned eflags = (unsigned)ctxt->eflags;
    const unsigned cs     = (unsigned)GET_CPU_CS_REGISTER_VALUE(ctxt->cs);
    const unsigned ds     = (unsigned)ctxt->ds;
    const unsigned ss     = (unsigned)ctxt->cpl0_ss;
    const unsigned err    = (unsigned)ctxt->error_code;

    printf("CPU: eip=%x esp=%x eflags=%x cs=%x ds=%x ss=%x err=%x", eip, esp, eflags, cs, ds,
           ss, err);
}

/* *************************************************************
 * Function to manage the TSS.  This function is not really "public":
 * it is reserved to the assembler routines defined in
 * cpu_context_switch.S
 *
 * Update the kernel stack address so that the IRQ, syscalls and
 * exception return in a correct stack location when coming back into
 * kernel mode.
 */
void cpu_context_update_kernel_tss(struct cpu_state *next_ctxt)
{
    /* next_ctxt corresponds to an interrupted user thread ? */
    if (cpu_context_is_in_user_mode(next_ctxt)) {
        /*
         * Yes: "next_ctxt" is an interrupted user thread => we are
         * going to switch to user mode ! Setup the stack address so
         * that the user thread "next_ctxt" can come back to the correct
         * stack location when returning in kernel mode.
         *
         * This stack location corresponds to the SP of the next user
         * thread once its context has been transferred on the CPU, ie
         * once the CPU has executed all the pop/iret instruction of the
         * context switch with privilege change.
         */
        kernel_tss.esp0 = ((vaddr_t)next_ctxt) + sizeof(struct cpu_ustate);
        /* Note: no need to protect this agains IRQ because IRQs are not
     allowed to update it by themselves, and they are not allowed
     to block */
    } else {
        /* No: No need to update kernel TSS when we stay in kernel
     mode */
    }
}


/**
 * Retrieve the first three syscall arguments from a saved user context:
 * they are read from the ebx, ecx and edx registers respectively.
 *
 * Not declared "inline": a plain inline specifier on an externally
 * visible function may leave the translation unit without an external
 * definition, which breaks every call that does not get inlined.
 *
 * @param user_ctxt saved context to read the registers from
 * @param arg1      (out) receives ebx
 * @param arg2      (out) receives ecx
 * @param arg3      (out) receives edx
 * @return 0 (always)
 */
int syscallGet3args(const struct cpu_state *user_ctxt,
                    /* out */ unsigned int *arg1,
                    /* out */ unsigned int *arg2,
                    /* out */ unsigned int *arg3)
{
    *arg1 = user_ctxt->ebx;
    *arg2 = user_ctxt->ecx;
    *arg3 = user_ctxt->edx;
    return 0;
}


/**
 * Retrieve only the first syscall argument from a saved user context.
 *
 * Thin wrapper around syscallGet3args() that throws the second and
 * third arguments away.
 *
 * @param user_ctxt saved context to read from
 * @param arg1      (out) receives the first argument
 * @return the value returned by syscallGet3args()
 */
int syscallGet1arg(const struct cpu_state *user_ctxt,
                   /* out */ unsigned int *arg1)
{
    unsigned int discarded; /* sink for the arguments we do not want */

    return syscallGet3args(user_ctxt, arg1, &discarded, &discarded);
}


/**
 * Retrieve the first two syscall arguments from a saved user context.
 *
 * Thin wrapper around syscallGet3args() that throws the third argument
 * away.
 *
 * @param user_ctxt saved context to read from
 * @param arg1      (out) receives the first argument
 * @param arg2      (out) receives the second argument
 * @return the value returned by syscallGet3args()
 */
int syscallGet2args(const struct cpu_state *user_ctxt,
                    /* out */ unsigned int *arg1,
                    /* out */ unsigned int *arg2)
{
    unsigned int discarded; /* sink for the argument we do not want */

    return syscallGet3args(user_ctxt, arg1, arg2, &discarded);
}

int syscallGet4args(const struct cpu_state *user_ctxt,
                     /* out */ unsigned int *arg1,
                     /* out */ unsigned int *arg2,
                     /* out */ unsigned int *arg3,
                     /* out */ unsigned int *arg4)
{
    uaddr_t userOtherArgs;
    unsigned int otherArgs[2];
    int ret;

    ret = syscallGet3args(user_ctxt, arg1, arg2, (unsigned int *)&userOtherArgs);
    if (ret)
        return ret;

    ret = memcpyFromUser((vaddr_t)otherArgs, userOtherArgs, sizeof(otherArgs));

    if (ret != sizeof(otherArgs))
        return -EFAULT;

    *arg3 = otherArgs[0];
    *arg4 = otherArgs[1];
    return 0;
}

int syscallGet5args(const struct cpu_state *user_ctxt,
                     /* out */ unsigned int *arg1,
                     /* out */ unsigned int *arg2,
                     /* out */ unsigned int *arg3,
                     /* out */ unsigned int *arg4,
                     /* out */ unsigned int *arg5)
{
    uaddr_t userOtherArgs;
    unsigned int otherArgs[3];
    int ret;

    ret = syscallGet3args(user_ctxt, arg1, arg2, (unsigned int *)&userOtherArgs);
    if (ret)
        return ret;

    ret = memcpyFromUser((vaddr_t)otherArgs, userOtherArgs, sizeof(otherArgs));

    if (ret != sizeof(otherArgs))
        return -EFAULT;

    *arg3 = otherArgs[0];
    *arg4 = otherArgs[1];
    *arg5 = otherArgs[2];
    return 0;
}