Jailhouse内存虚拟化实现详解

概述

Jailhouse采用硬件辅助的内存虚拟化技术,在x86平台使用EPT (Extended Page Tables),在ARM平台使用Stage-2页表。本文详细分析EPT的建立过程、cell镜像加载到GPA空间的完整流程,以及内存管理的核心机制。

1. EPT (Extended Page Tables) 建立机制

1.1 EPT初始化和配置

A. EPT能力检查和初始化

// hypervisor/arch/x86/vmx.c - vmx_check_features()
// Verify that the CPU offers the EPT features this code relies on.
//
// Reads MSR_IA32_VMX_PROCBASED_CTLS2 (upper 32 bits) and
// MSR_IA32_VMX_EPT_VPID_CAP and requires that
//   - SECONDARY_EXEC_ENABLE_EPT is set in the former,
//   - every bit of EPT_MANDATORY_FEATURES is set in the latter,
//   - at least one of EPT_INVEPT_SINGLE / EPT_INVEPT_GLOBAL is set.
//
// Returns 0 if all three conditions hold, trace_error(-EIO) otherwise.
static int vmx_check_features(void)
{
    const unsigned long ctls2_hi =
        read_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
    const unsigned long ept_caps = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);

    if ((ctls2_hi & SECONDARY_EXEC_ENABLE_EPT) &&
        (ept_caps & EPT_MANDATORY_FEATURES) == EPT_MANDATORY_FEATURES &&
        (ept_caps & (EPT_INVEPT_SINGLE | EPT_INVEPT_GLOBAL)))
        return 0;

    return trace_error(-EIO);
}

B. EPT页表结构初始化

// hypervisor/arch/x86/vmx.c - vcpu_vendor_early_init()
// One-time setup of the EPT paging description.
//
// ept_paging starts as a copy of x86_64_paging; every level then gets
// ept_set_next_pt as its set_next_pt hook. Levels whose huge-page size is
// not reported in MSR_IA32_VMX_EPT_VPID_CAP get page_size cleared to 0.
// Finally parking_pt is pointed at ept_paging.
//
// Always returns 0.
int vcpu_vendor_early_init(void)
{
    unsigned int level = 0;

    // Start from the regular x86-64 paging layout.
    memcpy(ept_paging, x86_64_paging, sizeof(ept_paging));

    // Non-terminal entries must be written via the EPT-specific hook.
    while (level < EPT_PAGE_DIR_LEVELS) {
        ept_paging[level].set_next_pt = ept_set_next_pt;
        level++;
    }

    // Index 1 (1 GiB pages): disabled unless EPT_1G_PAGES is reported.
    if ((read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_1G_PAGES) == 0)
        ept_paging[1].page_size = 0;

    // Index 2 (2 MiB pages): disabled unless EPT_2M_PAGES is reported.
    if ((read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_2M_PAGES) == 0)
        ept_paging[2].page_size = 0;

    // The parking page table shares the EPT paging description.
    parking_pt.root_paging = ept_paging;

    return 0;
}

C. EPT页表项设置函数

// hypervisor/arch/x86/vmx.c
// Write a non-terminal EPT entry that refers to the next-level table.
//
// pte:     entry to overwrite
// next_pt: address of the next-level table (paging_create passes the
//          result of paging_hvirt2phys()); it is masked with
//          BIT_MASK(51, 12) before being stored
//
// The entry is written with a single store and carries the read, write
// and execute flags.
static void ept_set_next_pt(pt_entry_t pte, unsigned long next_pt)
{
    const unsigned long table_addr = next_pt & BIT_MASK(51, 12);
    const unsigned long perms =
        EPT_FLAG_READ | EPT_FLAG_WRITE | EPT_FLAG_EXECUTE;

    *pte = table_addr | perms;
}

1.2 Cell的EPT结构建立

A. Cell EPT初始化

// hypervisor/arch/x86/vmx.c - vcpu_vendor_cell_init()
int vcpu_vendor_cell_init(struct cell *cell)
{
    // 建立cell的根EPT结构
    cell->arch.vmx.ept_structs.root_paging = ept_paging;
    cell->arch.vmx.ept_structs.root_table = 
        (page_table_t)cell->arch.root_table_page;

    // 映射特殊的APIC访问页到guest物理地址空间
    return paging_create(&cell->arch.vmx.ept_structs,
                         paging_hvirt2phys(apic_access_page),
                         PAGE_SIZE, XAPIC_BASE,
                         EPT_FLAG_READ | EPT_FLAG_WRITE | EPT_FLAG_WB_TYPE,
                         PAGING_NON_COHERENT | PAGING_NO_HUGE);
}

B. EPT指针设置到VMCS

// hypervisor/arch/x86/vmx.c - vmx_set_cell_config()
// Program the current cell's EPT pointer into the VMCS.
//
// The EPT_POINTER field receives the physical address of the cell's EPT
// root table combined with EPT_TYPE_WRITEBACK and EPT_PAGE_WALK_LEN.
//
// Returns true if the VMCS write succeeded, false otherwise.
static bool vmx_set_cell_config(void)
{
    struct cell *cell = this_cell();
    // Accumulates the outcome of the VMCS writes; must start out true
    // because results are folded in with "&=".
    bool ok = true;

    ok &= vmcs_write64(EPT_POINTER,
        paging_hvirt2phys(cell->arch.vmx.ept_structs.root_table) |
        EPT_TYPE_WRITEBACK | EPT_PAGE_WALK_LEN);

    return ok;
}

2. 内存区域映射机制

2.1 内存区域映射函数

A. 核心映射函数

// hypervisor/arch/x86/vmx.c - vcpu_map_memory_region()
// Map one configured memory region into a cell's EPT.
//
// cell: cell whose EPT structures receive the mapping
// mem:  region descriptor; virt_start is the guest-physical start,
//       phys_start the host-physical start, flags select permissions
//
// Access flags always include EPT_FLAG_WB_TYPE; read/write/execute are
// added according to the JAILHOUSE_MEM_* flags. A region flagged
// JAILHOUSE_MEM_COMM_REGION is backed by the cell's comm_page instead of
// mem->phys_start. Huge pages are allowed unless
// JAILHOUSE_MEM_NO_HUGEPAGES is set.
//
// Returns the result of paging_create().
int vcpu_map_memory_region(struct cell *cell,
                           const struct jailhouse_memory *mem)
{
    unsigned long paging_flags = PAGING_NON_COHERENT | PAGING_HUGE;
    unsigned long access_flags = EPT_FLAG_WB_TYPE;
    u64 phys_start;

    // Translate region permissions into EPT permission bits.
    access_flags |= (mem->flags & JAILHOUSE_MEM_READ) ? EPT_FLAG_READ : 0;
    access_flags |= (mem->flags & JAILHOUSE_MEM_WRITE) ? EPT_FLAG_WRITE : 0;
    access_flags |=
        (mem->flags & JAILHOUSE_MEM_EXECUTE) ? EPT_FLAG_EXECUTE : 0;

    // The communication region is backed by the per-cell comm page.
    phys_start = (mem->flags & JAILHOUSE_MEM_COMM_REGION) ?
        paging_hvirt2phys(&cell->comm_page) : mem->phys_start;

    if (mem->flags & JAILHOUSE_MEM_NO_HUGEPAGES)
        paging_flags &= ~PAGING_HUGE;

    // Create the EPT mapping: guest-physical (virt_start) -> host-physical.
    return paging_create(&cell->arch.vmx.ept_structs,
                         phys_start, mem->size,
                         mem->virt_start, access_flags, paging_flags);
}

2.2 通用页表创建函数

A. paging_create核心实现

// hypervisor/paging.c - paging_create()
// Create a mapping of [virt, virt + size) onto [phys, phys + size) in the
// given paging structures.
//
// pg_structs:   root paging description and root table to modify
// phys:         start of the backing range (rounded down with PAGE_MASK)
// size:         length in bytes (rounded with PAGE_ALIGN)
// virt:         start of the mapped range (rounded down with PAGE_MASK)
// access_flags: passed unchanged to set_terminal() for every mapped page
// paging_flags: PAGING_HUGE permits entries larger than PAGE_SIZE; the
//               value is also handed to flush_pt_entry()
//
// Returns 0 on success, or -ENOMEM if a page-table page cannot be
// allocated. On -ENOMEM the entries written so far are left in place.
int paging_create(const struct paging_structures *pg_structs,
                  unsigned long phys, unsigned long size, unsigned long virt,
                  unsigned long access_flags, unsigned long paging_flags)
{
    // Normalize the request to page granularity.
    phys &= PAGE_MASK;
    virt &= PAGE_MASK;
    size = PAGE_ALIGN(size);

    // Each outer iteration maps exactly one page (of whatever size fits).
    while (size > 0) {
        const struct paging *paging = pg_structs->root_paging;
        page_table_t pt = pg_structs->root_table;
        pt_entry_t pte;

        // Walk down from the root level until a terminal entry is written.
        while (1) {
            pte = paging->get_entry(pt, virt);

            // A terminal entry can be placed at this level if the level
            // supports one (page_size > 0), the remaining size covers it,
            // phys and virt are both aligned to it, and it is either a
            // regular PAGE_SIZE page or huge pages are allowed.
            if (paging->page_size > 0 &&
                paging->page_size <= size &&
                ((phys | virt) & (paging->page_size - 1)) == 0 &&
                (paging_flags & PAGING_HUGE ||
                 paging->page_size == PAGE_SIZE)) {

                // Write the terminal entry and stop descending.
                paging->set_terminal(pte, phys, access_flags);
                flush_pt_entry(pte, paging_flags);
                break;
            }

            // Otherwise a next-level table is required.
            if (paging->entry_valid(pte, PAGE_PRESENT_FLAGS)) {
                // Entry already present: follow it to the next level.
                // NOTE(review): this excerpt does not check whether the
                // present entry is itself a huge-page mapping that would
                // have to be split first - confirm against the full source.
                pt = paging_phys2hvirt(paging->get_next_pt(pte));
            } else {
                // Allocate a fresh page-table page and link it in.
                pt = page_alloc(&mem_pool, 1);
                if (!pt)
                    return -ENOMEM;
                paging->set_next_pt(pte, paging_hvirt2phys(pt));
                flush_pt_entry(pte, paging_flags);
            }
            paging++;
        }

        // Only mappings of the hypervisor itself need a TLB flush here.
        if (pg_structs->hv_paging)
            arch_paging_flush_page_tlbs(virt);

        // Advance by the size of the page that was just mapped; "paging"
        // still points at the level where the terminal entry was written.
        phys += paging->page_size;
        virt += paging->page_size;
        size -= paging->page_size;
    }
    return 0;
}

3. Cell镜像加载到GPA空间的过程

3.1 Cell创建和内存分配

A. Cell创建流程

// hypervisor/control.c - cell_create()
// Abridged outline of cell creation: map the configuration, set up the
// cell, then move every configured memory region from the root cell into
// the new cell.
//
// cpu_data:       per-CPU data of the calling CPU (unused in this excerpt)
// config_address: address of the cell configuration, handed to
//                 paging_get_guest_pages()
//
// Returns 0 on success; on a region error control jumps to
// error_cell_delete.
//
// NOTE(review): this is an excerpt, not compilable code. cfg_mapping,
// cfg_pages, config and cell are not declared here, and the
// error_cell_delete label is not shown.
static int cell_create(struct per_cpu *cpu_data, unsigned long config_address)
{
    const struct jailhouse_memory *mem;
    unsigned int n;
    int err;

    // 1. Map the cell configuration read-only and interpret it as a
    //    cell descriptor.
    cfg_mapping = paging_get_guest_pages(NULL, config_address, cfg_pages,
                                         PAGE_READONLY_FLAGS);
    config = (struct jailhouse_cell_desc *)cfg_mapping;

    // 2. Allocate the cell structure and its memory-region array.
    // NOTE(review): as written this calls cell_create() itself with a
    // single argument, which matches neither the signature above nor the
    // stated intent - presumably a placeholder for the real allocation
    // and initialization calls; verify against the upstream source.
    cell = cell_create(config);

    // 3. For every region: remove it from the root cell, then map it
    //    into the new cell.
    for_each_mem_region(mem, config, n) {
        // Reject regions that conflict with the existing layout.
        err = check_mem_regions(mem);
        if (err)
            goto error_cell_delete;

        // Take the region away from the root cell.
        err = unmap_from_root_cell(mem);
        if (err)
            goto error_cell_delete;

        // Map the region into the new cell's EPT.
        err = arch_map_memory_region(cell, mem);
        if (err)
            goto error_cell_delete;
    }

    return 0;
}

B. 内存区域映射到Cell

// 内存映射的完整流程:
// 1. 物理内存 (HPA) -> 从root cell取消映射
// 2. 物理内存 (HPA) -> 映射到cell的EPT (GPA -> HPA)
// 3. Cell看到的是GPA地址空间

// 示例:Linux demo cell的内存配置
// configs/x86/linux-x86-demo.c
// Example memory-region table of a cell configuration. Each entry maps a
// guest-physical range (virt_start) onto a host-physical range
// (phys_start) of the given size with the given access flags.
static struct jailhouse_memory mem_regions[] = {
    /* RAM */ {
        .phys_start = 0x3a600000,    // host-physical start (HPA)
        .virt_start = 0x0,           // guest-physical start (GPA)
        .size = 0x5a00000,           // 90 MiB; region ends at HPA 0x40000000
        .flags = JAILHOUSE_MEM_READ | JAILHOUSE_MEM_WRITE |
                 JAILHOUSE_MEM_EXECUTE | JAILHOUSE_MEM_LOADABLE,
    },
    // ... further memory regions
};

3.2 Cell镜像加载过程

A. 用户空间镜像加载

// driver/cell.c - jailhouse_cmd_cell_load()
// Copy a cell image from user space into a kernel buffer and hand it to
// the hypervisor.
//
// arg: user-space pointer to a struct jailhouse_cell_load describing the
//      target cell and the image (source address and size)
//
// Returns the hypercall result, -EFAULT if user memory cannot be read,
// or -ENOMEM if the staging buffer cannot be allocated. The staging
// buffer is released on every path.
static long jailhouse_cmd_cell_load(struct jailhouse_cell_load __user *arg)
{
    struct jailhouse_cell_load cell_load;
    void *image_mem;
    long err;

    // 1. Fetch the load parameters from user space.
    if (copy_from_user(&cell_load, arg, sizeof(cell_load)))
        return -EFAULT;

    // 2. Allocate the kernel staging buffer.
    image_mem = vmalloc(cell_load.image.size);
    if (!image_mem)
        return -ENOMEM;

    // 3. Copy the image data from user space.
    if (copy_from_user(image_mem,
                       (void __user *)(unsigned long)cell_load.image.source_address,
                       cell_load.image.size)) {
        err = -EFAULT;
        goto out_free;
    }

    // 4. Ask the hypervisor to load the image.
    // NOTE(review): virt_to_phys() is applied to a vmalloc() address
    // here; vmalloc memory is not guaranteed to be physically contiguous
    // or part of the linear mapping - confirm this is what is intended.
    err = jailhouse_call_arg2(JAILHOUSE_HC_CELL_LOAD,
                              cell_load.cell_id,
                              virt_to_phys(image_mem));

out_free:
    // The buffer is only a staging area; free it once the hypercall
    // has returned (previously it leaked on the success path).
    vfree(image_mem);
    return err;
}

B. Hypervisor镜像加载

// hypervisor/control.c - cell_load()
// Abridged outline of image loading inside the hypervisor: locate the
// cell, map the source image, and copy it into the first region flagged
// JAILHOUSE_MEM_LOADABLE.
//
// cpu_data:      per-CPU data of the calling CPU (unused in this excerpt)
// id:            identifier of the target cell
// image_address: address of the source image, handed to
//                paging_get_guest_pages()
//
// Returns 0 on success (also when no loadable region exists), or -ENOENT
// if no cell with the given id is found.
//
// NOTE(review): this is an excerpt, not compilable code. target_mem,
// offset and copy_size are not declared or computed here, and neither
// mapping result is checked before use.
static int cell_load(struct per_cpu *cpu_data, unsigned long id,
                     unsigned long image_address)
{
    struct cell *cell;
    const struct jailhouse_memory *mem;
    void *image_mapping;
    unsigned int n;

    // 1. Look up the target cell.
    cell = cell_get(id);
    if (!cell)
        return -ENOENT;

    // 2. Map the source image read-only into the hypervisor.
    image_mapping = paging_get_guest_pages(NULL, image_address, 
                                           PAGES(cell->loadable_size),
                                           PAGE_READONLY_FLAGS);

    // 3. Find a loadable memory region.
    for_each_mem_region(mem, cell->config, n) {
        if (!(mem->flags & JAILHOUSE_MEM_LOADABLE))
            continue;

        // 4. Map the region's host-physical memory (mem->phys_start) so
        //    the hypervisor can write to it. The cell later sees the same
        //    memory at its guest-physical address through the EPT
        //    (GPA -> HPA).
        target_mem = paging_get_guest_pages(NULL, mem->phys_start,
                                            PAGES(mem->size),
                                            PAGE_DEFAULT_FLAGS);

        // Copy the image data into the cell's memory.
        memcpy(target_mem, image_mapping + offset, copy_size);

        // Only the first loadable region is used.
        break;
    }

    return 0;
}

3.3 地址空间转换关系

A. 三层地址空间

用户空间应用 (VA) 
    ↓ (guest页表)
Guest物理地址 (GPA)
    ↓ (EPT)
Host物理地址 (HPA)
    ↑ (host页表:HVA → HPA)
Host虚拟地址 (HVA)

B. Cell内存分配和映射建立流程图

graph TD
    A[用户空间: jailhouse cell create] --> B[解析Cell配置文件]
    B --> C[验证内存区域配置]
    C --> D[创建Cell结构体]
    D --> E[分配EPT根页表]
    E --> F[遍历内存区域配置]

    F --> G{内存区域类型}
    G -->|RAM区域| H[从Root Cell取消映射]
    G -->|设备区域| I[验证设备访问权限]
    G -->|通信区域| J[分配通信页面]

    H --> K[建立EPT映射: GPA->HPA]
    I --> K
    J --> K

    K --> L[设置内存访问权限]
    L --> M[刷新EPT TLB]
    M --> N[建立HVA->GPA关系]

    N --> O{还有内存区域?}
    O -->|是| F
    O -->|否| P[Cell创建完成]

    subgraph "内存映射详细过程"
        K --> K1[调用paging_create]
        K1 --> K2[页对齐处理]
        K2 --> K3[遍历EPT页表层次]
        K3 --> K4{可以使用大页?}
        K4 -->|是| K5[设置2MB/1GB页表项]
        K4 -->|否| K6[分配下级页表]
        K6 --> K7[设置4KB页表项]
        K5 --> K8[刷新页表项缓存]
        K7 --> K8
    end

    subgraph "地址关系建立"
        N --> N1[HVA = HPA + page_offset]
        N1 --> N2[GPA通过EPT映射到HPA]
        N2 --> N3[Cell访问GPA自动转换为HPA]
    end

    style A fill:#e1f5fe
    style P fill:#c8e6c9
    style K fill:#fff3e0
    style N fill:#f3e5f5

C. 内存地址映射关系图

graph LR
    subgraph "Root Cell地址空间"
        RootHVA[Root Cell HVA<br/>0xffff888000000000]
        RootHPA[Root Cell HPA<br/>0x00000000-0x3a600000]
    end

    subgraph "New Cell地址空间"
        CellGPA[Cell GPA<br/>0x00000000]
        CellHPA[Cell HPA<br/>0x3a600000]
        CellHVA[Cell HVA<br/>0xffff888000000000+offset]
    end

    subgraph "EPT页表"
        EPT[EPT Root Table<br/>Cell专用]
        EPT1[EPT Level 4]
        EPT2[EPT Level 3] 
        EPT3[EPT Level 2]
        EPT4[EPT Level 1]
    end

    subgraph "物理内存"
        PM1[0x00000000-0x3a600000<br/>Root Cell使用]
        PM2[0x3a600000-0x40000000<br/>New Cell使用]
    end

    RootHVA -.->|page_offset| RootHPA
    CellHVA -.->|page_offset| CellHPA
    CellGPA -->|EPT转换| CellHPA

    EPT --> EPT1
    EPT1 --> EPT2
    EPT2 --> EPT3
    EPT3 --> EPT4
    EPT4 -.->|映射| PM2

    RootHPA --> PM1
    CellHPA --> PM2

    style CellGPA fill:#e3f2fd
    style CellHPA fill:#fff3e0
    style CellHVA fill:#f1f8e9
    style EPT fill:#fce4ec

D. 镜像加载到Cell内存的完整流程图

sequenceDiagram
    participant User as 用户空间
    participant Driver as Jailhouse驱动
    participant HV as Hypervisor
    participant EPT as EPT硬件
    participant Mem as 物理内存

    Note over User,Mem: 镜像加载流程

    User->>Driver: jailhouse cell load cell_id image.bin
    Driver->>Driver: 分配内核缓冲区 (HVA)
    Driver->>Driver: copy_from_user(镜像数据)
    Driver->>HV: JAILHOUSE_HC_CELL_LOAD hypercall

    Note over HV: Hypervisor处理镜像加载

    HV->>HV: 查找目标Cell
    HV->>HV: 映射镜像到HV地址空间

    loop 遍历Cell内存区域
        HV->>HV: 检查LOADABLE标志
        alt 找到可加载区域
            HV->>HV: 计算目标物理地址 (HPA)
            HV->>HV: 映射HPA到HV虚拟地址
            HV->>Mem: memcpy(镜像数据到HPA)
            Note over HV,Mem: 直接写入物理内存
        end
    end

    Note over EPT,Mem: Cell访问镜像数据

    HV->>User: 加载完成

    Note over User,Mem: Cell启动后的内存访问

    User->>HV: jailhouse cell start cell_id
    HV->>EPT: 切换到Cell的EPT

    loop Cell执行过程
        EPT->>EPT: Cell访问GPA 0x1000
        EPT->>Mem: 硬件转换: GPA->HPA
        Mem->>EPT: 返回镜像数据
        Note over EPT: 无VM Exit,硬件自动转换
    end

E. 镜像加载内存布局变化图

graph TD
    subgraph "加载前状态"
        A1[用户空间镜像文件<br/>image.bin]
        A2[物理内存 HPA<br/>0x3a600000: 空白]
        A3[Cell EPT<br/>GPA 0x0 -> HPA 0x3a600000]
    end

    subgraph "加载过程"
        B1[内核缓冲区 HVA<br/>vmalloc分配]
        B2[copy_from_user<br/>用户空间->内核]
        B3[paging_get_guest_pages<br/>映射HPA到HV空间]
        B4[memcpy<br/>复制到物理内存]
    end

    subgraph "加载后状态"
        C1[物理内存 HPA<br/>0x3a600000: 镜像数据]
        C2[Cell EPT映射<br/>GPA 0x0 -> HPA 0x3a600000]
        C3[Cell访问<br/>GPA 0x0读取镜像]
    end

    A1 --> B1
    B1 --> B2
    B2 --> B3
    B3 --> B4
    B4 --> C1

    A2 --> C1
    A3 --> C2
    C2 --> C3

    style A1 fill:#e3f2fd
    style B4 fill:#fff3e0
    style C1 fill:#c8e6c9
    style C3 fill:#f1f8e9

F. 具体的地址转换示例

// 示例:Linux demo cell的地址映射
// 配置:GPA 0x0 -> HPA 0x3a600000, size 90MB

// 1. Cell配置阶段:
//    paging_create(&cell->arch.vmx.ept_structs,
//                  0x3a600000,  // HPA
//                  0x5a00000,   // size
//                  0x0,         // GPA
//                  EPT_FLAGS, PAGING_FLAGS);

// 2. 镜像加载阶段:
//    - 用户空间镜像 -> 内核缓冲区 (HVA)
//    - 内核缓冲区 -> 物理内存 (HPA 0x3a600000)
//    - 通过EPT,cell看到GPA 0x0处有镜像数据

// 3. Cell执行阶段:
//    - Cell访问GPA 0x1000
//    - EPT转换:GPA 0x1000 -> HPA 0x3a601000
//    - 硬件自动完成转换,无VM Exit

4. 内存管理的关键机制

4.1 页表缓存和TLB管理

A. EPT TLB刷新

// hypervisor/arch/x86/vmx.c - vcpu_tlb_flush()
// Invalidate cached EPT translations with the INVEPT instruction.
//
// If MSR_IA32_VMX_EPT_VPID_CAP reports EPT_INVEPT_SINGLE, only the
// context of the EPT pointer currently in the VMCS is invalidated;
// otherwise a global invalidation is issued.
void vcpu_tlb_flush(void)
{
    unsigned long ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
    struct {
        u64 eptp;
        u64 reserved;
    } descriptor;
    // Receives the INVEPT status from "seta": non-zero when neither CF
    // nor ZF was set by the instruction. Previously undeclared.
    unsigned char ok;
    u64 type;

    descriptor.reserved = 0;
    if (ept_cap & EPT_INVEPT_SINGLE) {
        // Single-context invalidation for the active EPT pointer.
        type = VMX_INVEPT_SINGLE;
        descriptor.eptp = vmcs_read64(EPT_POINTER);
    } else {
        // Fall back to invalidating all EPT contexts.
        type = VMX_INVEPT_GLOBAL;
        descriptor.eptp = 0;
    }

    // Execute INVEPT with the descriptor in memory and the type in a
    // register, then capture the flags-based status.
    asm volatile(
        "invept (%1),%2\n\t"
        "seta %0\n\t"
        : "=qm" (ok)
        : "r" (&descriptor), "r" (type)
        : "memory", "cc");

    // NOTE(review): the status in "ok" is not acted upon in this excerpt,
    // so an INVEPT failure goes unnoticed - confirm the intended handling.
    (void)ok;
}

4.2 内存池管理

A. Hypervisor内存池

// hypervisor/paging.c - 内存池结构
// Page pools used by the hypervisor. paging_create() takes page-table
// pages from mem_pool.
struct page_pool mem_pool;      // pool of physical pages
struct page_pool remap_pool;    // pool of pages for remapping

// 页分配函数
// Allocate "num" pages from "pool" (outline only).
// NOTE(review): the implementation is elided in this excerpt; as written
// the body contains no statements and no return value.
void *page_alloc(struct page_pool *pool, unsigned int num)
{
    // Search the bitmap for a run of contiguous free pages.
    // Mark them as used.
    // Return their virtual address.
}

// 页释放函数  
// Return "num" pages starting at "page" to "pool" (outline only).
// NOTE(review): the implementation is elided in this excerpt.
void page_free(struct page_pool *pool, void *page, unsigned int num)
{
    // Zero the page contents (for security).
    // Mark the pages as free in the bitmap.
}

B. 地址转换函数

// hypervisor/paging.c - 地址转换
// Constant distance between a hypervisor virtual address and the
// physical address it maps to: virt = phys + page_offset.
unsigned long page_offset;

// Convert a hypervisor virtual address to the matching physical address
// by subtracting page_offset.
static inline unsigned long paging_hvirt2phys(void *virt)
{
    const unsigned long hv_addr = (unsigned long)virt;

    return hv_addr - page_offset;
}

// Convert a physical address to the matching hypervisor virtual address
// by adding page_offset.
static inline void *paging_phys2hvirt(unsigned long phys)
{
    const unsigned long hv_addr = phys + page_offset;

    return (void *)hv_addr;
}

4.3 Guest物理地址转换

A. GPA到HPA转换

// hypervisor/arch/x86/vmx.c - arch_paging_gphys2phys()
// Translate a guest-physical address of the current cell by walking that
// cell's EPT structures.
//
// gphys: guest-physical address to translate
// flags: passed through to paging_virt2phys()
//
// Returns whatever paging_virt2phys() returns for the lookup.
unsigned long arch_paging_gphys2phys(unsigned long gphys, unsigned long flags)
{
    const struct paging_structures *ept =
        &this_cell()->arch.vmx.ept_structs;

    return paging_virt2phys(ept, gphys, flags);
}

B. Guest页表遍历

// hypervisor/paging.c - paging_virt2phys()
// Look up the address that "virt" maps to in the given paging structures.
//
// pg_structs: root paging description and root table to walk
// virt:       address to translate
// flags:      handed to entry_valid() at every level
//
// Returns the translated address, or INVALID_PHYS_ADDR as soon as an
// entry on the path fails entry_valid().
unsigned long paging_virt2phys(const struct paging_structures *pg_structs,
                                unsigned long virt, unsigned long flags)
{
    const struct paging *level = pg_structs->root_paging;
    page_table_t table = pg_structs->root_table;

    // Descend one level per iteration until a terminal entry is reached.
    for (;; level++) {
        pt_entry_t entry = level->get_entry(table, virt);
        unsigned long result;

        if (!level->entry_valid(entry, flags))
            return INVALID_PHYS_ADDR;

        // get_phys() yields a real address only for terminal entries.
        result = level->get_phys(entry, virt);
        if (result != INVALID_PHYS_ADDR)
            return result;

        // Not terminal: continue in the next-level table.
        table = paging_phys2hvirt(level->get_next_pt(entry));
    }
}

5. 内存虚拟化的性能优化

5.1 大页支持

A. 大页配置

// EPT支持的页大小:
// - 4KB页:基本页大小
// - 2MB页:中等大页
// - 1GB页:超大页

// Condition under which a terminal entry is written at the current
// level: the level supports terminal entries (page_size > 0), the
// remaining size covers one such page, phys and virt are both aligned to
// it, and either huge pages are allowed (PAGING_HUGE) or this is the
// regular PAGE_SIZE level.
if (paging->page_size > 0 &&
    paging->page_size <= size &&
    ((phys | virt) & (paging->page_size - 1)) == 0 &&
    (paging_flags & PAGING_HUGE ||
     paging->page_size == PAGE_SIZE)) {
    // Map the page at this level (a huge page when page_size > PAGE_SIZE).
    paging->set_terminal(pte, phys, access_flags);
}

5.2 缓存优化

A. 缓存一致性

// Select the memory type stored in the EPT entry.
access_flags |= EPT_FLAG_WB_TYPE;  // write-back caching

// When the caller requests PAGING_COHERENT, flush the CPU cache for the
// bytes of the modified entry.
// NOTE(review): presumably needed so that non-cache-coherent page-table
// readers see the update - confirm against the callers.
if (paging_flags & PAGING_COHERENT)
    arch_paging_flush_cpu_caches(pte, sizeof(*pte));

6. 总结

Jailhouse的内存虚拟化实现体现了其简洁高效的设计理念:

6.1 核心特点

6.2 关键流程

  1. EPT初始化:检查硬件能力,设置页表结构
  2. Cell创建:建立独立的EPT,映射内存区域
  3. 镜像加载:将用户镜像复制到分配的物理内存
  4. 地址转换:通过EPT实现GPA到HPA的硬件转换

6.3 技术优势

这种设计使得Jailhouse特别适合实时系统和安全关键应用,在保证性能的同时提供了强大的内存隔离能力。