#include "std_include.hpp"
|
|
#include "hypervisor.hpp"
|
|
|
|
#include "exception.hpp"
|
|
#include "logging.hpp"
|
|
#include "finally.hpp"
|
|
#include "memory.hpp"
|
|
#include "thread.hpp"
|
|
#include "assembly.hpp"
|
|
|
|
#include <ia32.hpp>
|
|
|
|
#define _1GB (1 * 1024 * 1024 * 1024)
|
|
#define _2MB (2 * 1024 * 1024)
|
|
|
|
namespace
{
	hypervisor* instance{nullptr};

	bool is_vmx_supported()
	{
		cpuid_eax_01 data{};
		__cpuid(reinterpret_cast<int*>(&data), CPUID_VERSION_INFORMATION);
		return data.cpuid_feature_information_ecx.virtual_machine_extensions;
	}

	bool is_vmx_available()
	{
		ia32_feature_control_register feature_control{};
		feature_control.flags = __readmsr(IA32_FEATURE_CONTROL);
		return feature_control.lock_bit && feature_control.enable_vmx_outside_smx;
	}

	bool is_virtualization_supported()
	{
		return is_vmx_supported() && is_vmx_available();
	}

#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
#define HYPERV_CPUID_INTERFACE 0x40000001

	bool is_hypervisor_present()
	{
		cpuid_eax_01 data{};
		__cpuid(reinterpret_cast<int*>(&data), CPUID_VERSION_INFORMATION);
		if ((data.cpuid_feature_information_ecx.flags & HYPERV_HYPERVISOR_PRESENT_BIT) == 0)
		{
			return false;
		}

		int32_t cpuid_data[4] = {0};
		__cpuid(cpuid_data, HYPERV_CPUID_INTERFACE);
		return cpuid_data[0] == 'momo';
	}
}
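// The CPUID handshake used throughout this file, summarized here for reference:
// is_hypervisor_present() queries the architectural hypervisor-present bit (CPUID.1:ECX[31])
// and then the vendor leaf 0x40000001, which ShvVmxHandleCpuid() below answers with the
// multi-character constant 'momo' in EAX. Any kernel-mode caller could perform the same
// probe, e.g. (illustrative sketch only, not part of the driver):
//
//     int regs[4]{};
//     __cpuid(regs, HYPERV_CPUID_INTERFACE);
//     const bool loaded = regs[0] == 'momo';
//
// disable_core() later uses a second magic sequence (EAX=0x41414141, ECX=0x42424242)
// to ask the exit handler to tear the hypervisor down on the current core.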
hypervisor::hypervisor()
{
	if (instance != nullptr)
	{
		throw std::runtime_error("Hypervisor already instantiated");
	}

	auto destructor = utils::finally([this]()
	{
		this->free_vm_states();
		instance = nullptr;
	});

	instance = this;

	if (!is_virtualization_supported())
	{
		throw std::runtime_error("VMX not supported on this machine");
	}

	debug_log("VMX supported!\n");
	this->allocate_vm_states();
	this->enable();
	destructor.cancel();
}

hypervisor::~hypervisor()
{
	this->disable();
	this->free_vm_states();
	instance = nullptr;
}
void hypervisor::disable()
{
	thread::dispatch_on_all_cores([this]()
	{
		this->disable_core();
	});
}

void hypervisor::enable()
{
	const auto cr3 = __readcr3();

	bool success = true;
	thread::dispatch_on_all_cores([&]()
	{
		success &= this->try_enable_core(cr3);
	}, true);

	if (!success)
	{
		this->disable();
		//throw std::runtime_error("Hypervisor initialization failed");
	}
}
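// Note on enable(): the CR3 captured above is handed to every core and later written as
// VMCS_HOST_CR3 in ShvVmxSetupVmcsForVp(). The assumption (not enforced here) is that this
// constructor runs in the System process context -- e.g. from DriverEntry -- so __readcr3()
// yields the system directory table base rather than that of an arbitrary user process.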
bool hypervisor::try_enable_core(const uint64_t system_directory_table_base)
{
	try
	{
		this->enable_core(system_directory_table_base);
		return true;
	}
	catch (std::exception& e)
	{
		debug_log("Failed to enable hypervisor on core %d: %s\n", thread::get_processor_index(), e.what());
		return false;
	}
	catch (...)
	{
		debug_log("Failed to enable hypervisor on core %d.\n", thread::get_processor_index());
		return false;
	}
}
void ShvCaptureSpecialRegisters(vmx::special_registers* special_registers)
{
	special_registers->cr0 = __readcr0();
	special_registers->cr3 = __readcr3();
	special_registers->cr4 = __readcr4();
	special_registers->debug_control = __readmsr(IA32_DEBUGCTL);
	special_registers->msr_gs_base = __readmsr(IA32_GS_BASE);
	special_registers->kernel_dr7 = __readdr(7);
	_sgdt(&special_registers->gdtr.limit);
	__sidt(&special_registers->idtr.limit);
	_str(&special_registers->tr);
	_sldt(&special_registers->ldtr);
}
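// SGDT/SIDT store a 10-byte pseudo-descriptor: a 16-bit limit followed by a 64-bit base.
// Passing &gdtr.limit / &idtr.limit above therefore assumes the descriptor-table register
// structs in ia32.hpp lay out "limit" immediately before "base", so the intrinsics fill
// both fields with a single store.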
uintptr_t
FORCEINLINE
ShvVmxRead(
	_In_ UINT32 VmcsFieldId
)
{
	size_t FieldData;

	//
	// Because VMXREAD returns an error code, and not the data, it is painful
	// to use in most circumstances. This simple function simplifies its use.
	//
	__vmx_vmread(VmcsFieldId, &FieldData);
	return FieldData;
}
INT32
ShvVmxLaunch(
	VOID)
{
	INT32 failureCode;

	//
	// Launch the VMCS
	//
	__vmx_vmlaunch();

	//
	// If we got here, either VMCS setup failed in some way, or the launch
	// did not proceed as planned.
	//
	failureCode = (INT32)ShvVmxRead(VMCS_VM_INSTRUCTION_ERROR);
	__vmx_off();

	//
	// Return the error back to the caller
	//
	return failureCode;
}
#define MTRR_PAGE_SIZE 4096
#define MTRR_PAGE_MASK (~(MTRR_PAGE_SIZE-1))

VOID ShvVmxMtrrInitialize(vmx::vm_state* VpData)
{
	UINT32 i;
	ia32_mtrr_capabilities_register mtrrCapabilities;
	ia32_mtrr_physbase_register mtrrBase;
	ia32_mtrr_physmask_register mtrrMask;
	unsigned long bit;

	//
	// Read the capabilities mask
	//
	mtrrCapabilities.flags = __readmsr(IA32_MTRR_CAPABILITIES);

	//
	// Iterate over each variable MTRR
	//
	for (i = 0; i < mtrrCapabilities.variable_range_count; i++)
	{
		//
		// Capture the value
		//
		mtrrBase.flags = __readmsr(IA32_MTRR_PHYSBASE0 + i * 2);
		mtrrMask.flags = __readmsr(IA32_MTRR_PHYSMASK0 + i * 2);

		//
		// Check if the MTRR is enabled
		//
		VpData->mtrr_data[i].type = (UINT32)mtrrBase.type;
		VpData->mtrr_data[i].enabled = (UINT32)mtrrMask.valid;
		if (VpData->mtrr_data[i].enabled != FALSE)
		{
			//
			// Set the base
			//
			VpData->mtrr_data[i].physical_address_min = mtrrBase.page_frame_number * MTRR_PAGE_SIZE;

			//
			// Compute the length
			//
			_BitScanForward64(&bit, mtrrMask.page_frame_number * MTRR_PAGE_SIZE);
			VpData->mtrr_data[i].physical_address_max = VpData->mtrr_data[i].physical_address_min +
				(1ULL << bit) - 1;
		}
	}
}
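// Worked example for the range math above (illustrative values, not read from hardware):
// a 128MB write-combined MTRR at physical 0xC0000000 has PHYSBASE.page_frame_number =
// 0xC0000000 / 4096 = 0xC0000, and a PHYSMASK whose lowest set bit, after multiplying the
// mask PFN by 4096, is bit 27. _BitScanForward64 then yields bit = 27, so the range ends
// at 0xC0000000 + (1 << 27) - 1 = 0xC7FFFFFF, i.e. base + 128MB - 1.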
UINT32
ShvVmxMtrrAdjustEffectiveMemoryType(
	vmx::vm_state* VpData,
	_In_ UINT64 LargePageAddress,
	_In_ UINT32 CandidateMemoryType
)
{
	UINT32 i;

	//
	// Loop each MTRR range
	//
	for (i = 0; i < sizeof(VpData->mtrr_data) / sizeof(VpData->mtrr_data[0]); i++)
	{
		//
		// Check if it's active
		//
		if (VpData->mtrr_data[i].enabled != FALSE)
		{
			//
			// Check if this large page falls within the boundary. If a single
			// physical page (4KB) touches it, we need to override the entire 2MB.
			//
			if (((LargePageAddress + (_2MB - 1)) >= VpData->mtrr_data[i].physical_address_min) &&
				(LargePageAddress <= VpData->mtrr_data[i].physical_address_max))
			{
				//
				// Override candidate type with MTRR type
				//
				CandidateMemoryType = VpData->mtrr_data[i].type;
			}
		}
	}

	//
	// Return the correct type needed
	//
	return CandidateMemoryType;
}
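// ShvVmxEptInitialize() below builds a fixed identity map for the first 512GB of physical
// memory: one EPT PML4 entry points to a single PDPT with 512 1GB entries, and each PDPT
// entry points to a page directory of 512 2MB large pages, so guest-physical == host-physical
// for the whole range. The effective memory type of every 2MB page is derived from the MTRRs
// captured above, so device memory (e.g. UC ranges) is not silently promoted to write-back.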
void ShvVmxEptInitialize(vmx::vm_state* VpData)
{
	UINT32 i, j;
	vmx::pdpte tempEpdpte;

	//
	// Fill out the EPML4E which covers the first 512GB of RAM
	//
	VpData->epml4[0].read = 1;
	VpData->epml4[0].write = 1;
	VpData->epml4[0].execute = 1;
	VpData->epml4[0].page_frame_number = memory::get_physical_address(&VpData->epdpt) / PAGE_SIZE;

	//
	// Fill out a RWX PDPTE
	//
	tempEpdpte.full = 0;
	tempEpdpte.read = tempEpdpte.write = tempEpdpte.execute = 1;

	//
	// Construct EPT identity map for every 1GB of RAM
	//
	__stosq((UINT64*)VpData->epdpt, tempEpdpte.full, PDPTE_ENTRY_COUNT);
	for (i = 0; i < PDPTE_ENTRY_COUNT; i++)
	{
		//
		// Set the page frame number of the PDE table
		//
		VpData->epdpt[i].page_frame_number = memory::get_physical_address(&VpData->epde[i][0]) / PAGE_SIZE;
	}

	//
	// Fill out a RWX Large PDE
	//
	epde_2mb temp_epde;
	temp_epde.flags = 0;
	temp_epde.read_access = 1;
	temp_epde.write_access = 1;
	temp_epde.execute_access = 1;
	temp_epde.large_page = 1;

	//
	// Loop every 1GB of RAM (described by the PDPTE)
	//
	__stosq((UINT64*)VpData->epde, temp_epde.flags, PDPTE_ENTRY_COUNT * PDE_ENTRY_COUNT);
	for (i = 0; i < PDPTE_ENTRY_COUNT; i++)
	{
		//
		// Construct EPT identity map for every 2MB of RAM
		//
		for (j = 0; j < PDE_ENTRY_COUNT; j++)
		{
			VpData->epde[i][j].page_frame_number = (i * 512) + j;
			VpData->epde[i][j].memory_type = ShvVmxMtrrAdjustEffectiveMemoryType(VpData,
				VpData->epde[i][j].page_frame_number * _2MB, MEMORY_TYPE_WRITE_BACK);
		}
	}
}
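// VpData->msr_data[] is filled in ShvVmxLaunchOnVp() by reading consecutive MSRs starting
// at IA32_VMX_BASIC (0x480). The indices used below therefore map to:
//   [0]  IA32_VMX_BASIC              [6]/[7]  IA32_VMX_CR0_FIXED0/FIXED1
//   [8]/[9] IA32_VMX_CR4_FIXED0/FIXED1        [11] IA32_VMX_PROCBASED_CTLS2
//   [12] IA32_VMX_EPT_VPID_CAP       [13..16] IA32_VMX_TRUE_{PINBASED,PROCBASED,EXIT,ENTRY}_CTLS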
UINT8
ShvVmxEnterRootModeOnVp(vmx::vm_state* VpData)
{
	auto* Registers = &VpData->special_registers;

	//
	// Ensure that the VMCS can fit into a single page
	//
	ia32_vmx_basic_register basic_register{};
	basic_register.flags = VpData->msr_data[0].QuadPart;
	if (basic_register.vmcs_size_in_bytes > PAGE_SIZE)
	{
		return FALSE;
	}

	//
	// Ensure that the VMCS is supported in writeback memory
	//
	if (basic_register.memory_type != MEMORY_TYPE_WRITE_BACK)
	{
		return FALSE;
	}

	//
	// Ensure that true MSRs can be used for capabilities
	//
	if (basic_register.must_be_zero)
	{
		return FALSE;
	}

	//
	// Ensure that EPT is available with the needed features SimpleVisor uses
	//
	ia32_vmx_ept_vpid_cap_register ept_vpid_cap_register{};
	ept_vpid_cap_register.flags = VpData->msr_data[12].QuadPart;

	if (ept_vpid_cap_register.page_walk_length_4 &&
		ept_vpid_cap_register.memory_type_write_back &&
		ept_vpid_cap_register.pde_2mb_pages)
	{
		//
		// Enable EPT if these features are supported
		//
		VpData->ept_controls.flags = 0;
		VpData->ept_controls.enable_ept = 1;
		VpData->ept_controls.enable_vpid = 1;
	}

	//
	// Capture the revision ID for the VMXON and VMCS region
	//
	VpData->vmx_on.revision_id = VpData->msr_data[0].LowPart;
	VpData->vmcs.revision_id = VpData->msr_data[0].LowPart;

	//
	// Store the physical addresses of all per-LP structures allocated
	//
	VpData->vmx_on_physical_address = memory::get_physical_address(&VpData->vmx_on);
	VpData->vmcs_physical_address = memory::get_physical_address(&VpData->vmcs);
	VpData->msr_bitmap_physical_address = memory::get_physical_address(VpData->msr_bitmap);
	VpData->ept_pml4_physical_address = memory::get_physical_address(&VpData->epml4);

	//
	// Update CR0 with the must-be-zero and must-be-one requirements
	//
	Registers->cr0 &= VpData->msr_data[7].LowPart;
	Registers->cr0 |= VpData->msr_data[6].LowPart;

	//
	// Do the same for CR4
	//
	Registers->cr4 &= VpData->msr_data[9].LowPart;
	Registers->cr4 |= VpData->msr_data[8].LowPart;

	//
	// Update host CR0 and CR4 based on the requirements above
	//
	__writecr0(Registers->cr0);
	__writecr4(Registers->cr4);

	//
	// Enable VMX Root Mode
	//
	if (__vmx_on(&VpData->vmx_on_physical_address))
	{
		return FALSE;
	}

	//
	// Clear the state of the VMCS, setting it to Inactive
	//
	if (__vmx_vmclear(&VpData->vmcs_physical_address))
	{
		__vmx_off();
		return FALSE;
	}

	//
	// Load the VMCS, setting its state to Active
	//
	if (__vmx_vmptrld(&VpData->vmcs_physical_address))
	{
		__vmx_off();
		return FALSE;
	}

	//
	// VMX Root Mode is enabled, with an active VMCS.
	//
	return TRUE;
}
typedef struct _VMX_GDTENTRY64
{
	UINT64 Base;
	UINT32 Limit;

	union
	{
		struct
		{
			UINT8 Flags1;
			UINT8 Flags2;
			UINT8 Flags3;
			UINT8 Flags4;
		} Bytes;

		struct
		{
			UINT16 SegmentType : 4;
			UINT16 DescriptorType : 1;
			UINT16 Dpl : 2;
			UINT16 Present : 1;

			UINT16 Reserved : 4;
			UINT16 System : 1;
			UINT16 LongMode : 1;
			UINT16 DefaultBig : 1;
			UINT16 Granularity : 1;

			UINT16 Unusable : 1;
			UINT16 Reserved2 : 15;
		} Bits;

		UINT32 AccessRights;
	};

	UINT16 Selector;
} VMX_GDTENTRY64, *PVMX_GDTENTRY64;

typedef union _KGDTENTRY64
{
	struct
	{
		UINT16 LimitLow;
		UINT16 BaseLow;

		union
		{
			struct
			{
				UINT8 BaseMiddle;
				UINT8 Flags1;
				UINT8 Flags2;
				UINT8 BaseHigh;
			} Bytes;

			struct
			{
				UINT32 BaseMiddle : 8;
				UINT32 Type : 5;
				UINT32 Dpl : 2;
				UINT32 Present : 1;
				UINT32 LimitHigh : 4;
				UINT32 System : 1;
				UINT32 LongMode : 1;
				UINT32 DefaultBig : 1;
				UINT32 Granularity : 1;
				UINT32 BaseHigh : 8;
			} Bits;
		};

		UINT32 BaseUpper;
		UINT32 MustBeZero;
	};

	struct
	{
		INT64 DataLow;
		INT64 DataHigh;
	};
} KGDTENTRY64, *PKGDTENTRY64;
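// ShvUtilConvertGdtEntry() below converts a legacy GDT descriptor into the VMX
// segment-access-rights format: descriptor byte 5 (type, S, DPL, P) maps onto bits 0-7 and
// byte 6 (limit[19:16], AVL, L, D/B, G) onto bits 8-15, which is why copying Flags1/Flags2
// and then zeroing Bits.Reserved (bits 8-11, where the stale limit nibble lands) is enough.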
VOID
ShvUtilConvertGdtEntry(
	_In_ VOID* GdtBase,
	_In_ UINT16 Selector,
	_Out_ PVMX_GDTENTRY64 VmxGdtEntry
)
{
	PKGDTENTRY64 gdtEntry;

	//
	// Reject LDT or NULL entries
	//
	if ((Selector == 0) ||
		(Selector & SEGMENT_SELECTOR_TABLE_FLAG) != 0)
	{
		VmxGdtEntry->Limit = VmxGdtEntry->AccessRights = 0;
		VmxGdtEntry->Base = 0;
		VmxGdtEntry->Selector = 0;
		VmxGdtEntry->Bits.Unusable = TRUE;
		return;
	}

	//
	// Read the GDT entry at the given selector, masking out the RPL bits.
	//
	gdtEntry = (PKGDTENTRY64)((uintptr_t)GdtBase + (Selector & ~SEGMENT_ACCESS_RIGHTS_DESCRIPTOR_PRIVILEGE_LEVEL_MASK));

	//
	// Write the selector directly
	//
	VmxGdtEntry->Selector = Selector;

	//
	// Use the LSL intrinsic to read the segment limit
	//
	VmxGdtEntry->Limit = __segmentlimit(Selector);

	//
	// Build the full 64-bit effective address, keeping in mind that the upper
	// 32 bits are only present when the System bit is unset (i.e. for system
	// descriptors such as the TSS).
	//
	// NOTE: The Windows definition of KGDTENTRY64 is WRONG. The "System" field
	// is incorrectly defined at the position of where the AVL bit should be.
	// The actual location of the SYSTEM bit is encoded as the highest bit in
	// the "Type" field.
	//
	VmxGdtEntry->Base = ((gdtEntry->Bytes.BaseHigh << 24) |
		(gdtEntry->Bytes.BaseMiddle << 16) |
		(gdtEntry->BaseLow)) & 0xFFFFFFFF;
	VmxGdtEntry->Base |= ((gdtEntry->Bits.Type & 0x10) == 0) ? ((uintptr_t)gdtEntry->BaseUpper << 32) : 0;

	//
	// Load the access rights
	//
	VmxGdtEntry->AccessRights = 0;
	VmxGdtEntry->Bytes.Flags1 = gdtEntry->Bytes.Flags1;
	VmxGdtEntry->Bytes.Flags2 = gdtEntry->Bytes.Flags2;

	//
	// Finally, handle the VMX-specific bits
	//
	VmxGdtEntry->Bits.Reserved = 0;
	VmxGdtEntry->Bits.Unusable = !gdtEntry->Bits.Present;
}
UINT32
ShvUtilAdjustMsr(
	_In_ LARGE_INTEGER ControlValue,
	_In_ UINT32 DesiredValue
)
{
	//
	// VMX feature/capability MSRs encode the "must be 0" bits in the high word
	// of their value, and the "must be 1" bits in the low word of their value.
	// Adjust any requested capability/feature based on these requirements.
	//
	DesiredValue &= ControlValue.HighPart;
	DesiredValue |= ControlValue.LowPart;
	return DesiredValue;
}
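// Example of the adjustment above (made-up numbers): if a control MSR reports
// LowPart = 0x00000016 and HighPart = 0xFFFFFFFE, then requesting DesiredValue = 0x00000001
// yields (0x00000001 & 0xFFFFFFFE) | 0x00000016 = 0x00000016 -- bit 0 is forced off because
// the high dword marks it "must be 0", and bits 1, 2 and 4 are forced on by the low dword.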
extern "C" VOID
|
|
ShvOsCaptureContext(
|
|
_In_ PCONTEXT ContextRecord
|
|
)
|
|
{
|
|
//
|
|
// Windows provides a nice OS function to do this
|
|
//
|
|
RtlCaptureContext(ContextRecord);
|
|
}
|
|
|
|
extern "C" DECLSPEC_NORETURN
|
|
VOID
|
|
__cdecl
|
|
ShvOsRestoreContext2(
|
|
_In_ PCONTEXT ContextRecord,
|
|
_In_opt_ struct _EXCEPTION_RECORD* ExceptionRecord
|
|
);
|
|
|
|
DECLSPEC_NORETURN
VOID
ShvVpRestoreAfterLaunch(
	VOID)
{
	debug_log("[%d] restore\n", thread::get_processor_index());
	//
	// Get the per-processor data. This routine temporarily executes on the
	// same stack as the hypervisor (using no real stack space except the home
	// registers), so we can retrieve the VP the same way the hypervisor does.
	//
	auto* vpData = (vmx::vm_state*)((uintptr_t)_AddressOfReturnAddress() +
		sizeof(CONTEXT) -
		KERNEL_STACK_SIZE);

	//
	// Record that VMX is now enabled by returning back to ShvVpInitialize with
	// the Alignment Check (AC) bit set.
	//
	vpData->context_frame.EFlags |= EFLAGS_ALIGNMENT_CHECK_FLAG_FLAG;

	//
	// And finally, restore the context, so that all register and stack
	// state is finally restored.
	//
	ShvOsRestoreContext2(&vpData->context_frame, nullptr);
}
VOID
ShvVmxHandleInvd(
	VOID)
{
	//
	// This is the handler for the INVD instruction. Technically it may be more
	// correct to use __invd instead of __wbinvd, but that intrinsic doesn't
	// actually exist. Additionally, the Windows kernel (or HAL) doesn't contain
	// any example of INVD actually ever being used. Finally, Hyper-V itself
	// handles INVD by issuing WBINVD as well, so we'll just do that here too.
	//
	__wbinvd();
}
#define DPL_USER 3
#define DPL_SYSTEM 0

typedef struct _SHV_VP_STATE
{
	PCONTEXT VpRegs;
	uintptr_t GuestRip;
	uintptr_t GuestRsp;
	uintptr_t GuestEFlags;
	UINT16 ExitReason;
	UINT8 ExitVm;
} SHV_VP_STATE, *PSHV_VP_STATE;
VOID
ShvVmxHandleCpuid(
	_In_ PSHV_VP_STATE VpState
)
{
	INT32 cpu_info[4];

	//
	// Check for the magic CPUID sequence, and check that it is coming from
	// Ring 0. Technically we could also check the RIP and see if this falls
	// in the expected function, but we may want to allow a separate "unload"
	// driver or code at some point.
	//
	if ((VpState->VpRegs->Rax == 0x41414141) &&
		(VpState->VpRegs->Rcx == 0x42424242) &&
		((ShvVmxRead(VMCS_GUEST_CS_SELECTOR) & SEGMENT_ACCESS_RIGHTS_DESCRIPTOR_PRIVILEGE_LEVEL_MASK) == DPL_SYSTEM))
	{
		VpState->ExitVm = TRUE;
		return;
	}

	//
	// Otherwise, issue the CPUID to the logical processor based on the indexes
	// on the VP's GPRs.
	//
	__cpuidex(cpu_info, (INT32)VpState->VpRegs->Rax, (INT32)VpState->VpRegs->Rcx);

	//
	// Check if this was CPUID 1h, which is the features request.
	//
	if (VpState->VpRegs->Rax == 1)
	{
		//
		// Set the Hypervisor Present-bit in RCX, which Intel and AMD have both
		// reserved for this indication.
		//
		cpu_info[2] |= HYPERV_HYPERVISOR_PRESENT_BIT;
	}
	else if (VpState->VpRegs->Rax == HYPERV_CPUID_INTERFACE)
	{
		//
		// Return our interface identifier
		//
		cpu_info[0] = 'momo';
	}

	//
	// Copy the values from the logical processor registers into the VP GPRs.
	//
	VpState->VpRegs->Rax = cpu_info[0];
	VpState->VpRegs->Rbx = cpu_info[1];
	VpState->VpRegs->Rcx = cpu_info[2];
	VpState->VpRegs->Rdx = cpu_info[3];
}
VOID
ShvVmxHandleXsetbv(
	_In_ PSHV_VP_STATE VpState
)
{
	//
	// Simply issue the XSETBV instruction on the native logical processor.
	//
	_xsetbv((UINT32)VpState->VpRegs->Rcx,
		VpState->VpRegs->Rdx << 32 |
		VpState->VpRegs->Rax);
}
VOID
ShvVmxHandleVmx(
	_In_ PSHV_VP_STATE VpState
)
{
	//
	// Set the CF flag, which is how VMX instructions indicate failure
	//
	VpState->GuestEFlags |= 0x1; // VM_FAIL_INVALID

	//
	// RFLAGS is actually restored from the VMCS, so update it here
	//
	__vmx_vmwrite(VMCS_GUEST_RFLAGS, VpState->GuestEFlags);
}
VOID
ShvVmxHandleExit(
	_In_ PSHV_VP_STATE VpState
)
{
	//
	// This is the generic VM-Exit handler. Decode the reason for the exit and
	// call the appropriate handler. As per Intel specifications, given that we
	// have requested no optional exits whatsoever, we should only see CPUID,
	// INVD, XSETBV and other VMX instructions. GETSEC cannot happen as we do
	// not run in SMX context.
	//
	switch (VpState->ExitReason)
	{
	case VMX_EXIT_REASON_EXECUTE_CPUID:
		ShvVmxHandleCpuid(VpState);
		break;
	case VMX_EXIT_REASON_EXECUTE_INVD:
		ShvVmxHandleInvd();
		break;
	case VMX_EXIT_REASON_EXECUTE_XSETBV:
		ShvVmxHandleXsetbv(VpState);
		break;
	case VMX_EXIT_REASON_EXECUTE_VMCALL:
	case VMX_EXIT_REASON_EXECUTE_VMCLEAR:
	case VMX_EXIT_REASON_EXECUTE_VMLAUNCH:
	case VMX_EXIT_REASON_EXECUTE_VMPTRLD:
	case VMX_EXIT_REASON_EXECUTE_VMPTRST:
	case VMX_EXIT_REASON_EXECUTE_VMREAD:
	case VMX_EXIT_REASON_EXECUTE_VMRESUME:
	case VMX_EXIT_REASON_EXECUTE_VMWRITE:
	case VMX_EXIT_REASON_EXECUTE_VMXOFF:
	case VMX_EXIT_REASON_EXECUTE_VMXON:
		ShvVmxHandleVmx(VpState);
		break;
	default:
		break;
	}

	//
	// Move the instruction pointer to the next instruction after the one that
	// caused the exit. Since we are not doing any special handling or changing
	// of execution, this can be done for any exit reason.
	//
	VpState->GuestRip += ShvVmxRead(VMCS_VMEXIT_INSTRUCTION_LENGTH);
	__vmx_vmwrite(VMCS_GUEST_RIP, VpState->GuestRip);
}
VOID
ShvOsUnprepareProcessor(
	_In_ vmx::vm_state* VpData
)
{
	//
	// When running in VMX root mode, the processor will set limits of the
	// GDT and IDT to 0xFFFF (notice that there are no Host VMCS fields to
	// set these values). This causes problems with PatchGuard, which will
	// believe that the GDTR and IDTR have been modified by malware, and
	// eventually crash the system. Since we know what the original state
	// of the GDTR and IDTR was, simply restore it now.
	//
	__lgdt(&VpData->special_registers.gdtr.limit);
	__lidt(&VpData->special_registers.idtr.limit);
}
DECLSPEC_NORETURN
VOID
ShvVmxResume()
{
	//
	// Issue a VMXRESUME. The reason that we've defined an entire function for
	// this sole instruction is both so that we can use it as the target of the
	// VMCS when re-entering the VM after a VM-Exit, as well as so that we can
	// decorate it with the DECLSPEC_NORETURN marker, which is not set on the
	// intrinsic (as it can fail in case of an error).
	//
	__vmx_vmresume();
}
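// Layout note for the entry path below (an inference from how the addresses are computed
// here and in ShvVpRestoreAfterLaunch, assuming the per-core stack buffer is the first
// field of vmx::vm_state): HOST_RSP is set to stack_buffer + KERNEL_STACK_SIZE - sizeof(CONTEXT),
// the assembly stub ShvVmxEntry captures a CONTEXT exactly at that address, and
// (uintptr_t)(Context + 1) - KERNEL_STACK_SIZE therefore lands back on the vm_state itself.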
extern "C" DECLSPEC_NORETURN
|
|
VOID
|
|
ShvVmxEntryHandler()
|
|
{
|
|
PCONTEXT Context = (PCONTEXT)_AddressOfReturnAddress();
|
|
SHV_VP_STATE guestContext;
|
|
|
|
//
|
|
// Because we had to use RCX when calling ShvOsCaptureContext, its value
|
|
// was actually pushed on the stack right before the call. Go dig into the
|
|
// stack to find it, and overwrite the bogus value that's there now.
|
|
//
|
|
//Context->Rcx = *(UINT64*)((uintptr_t)Context - sizeof(Context->Rcx));
|
|
|
|
//
|
|
// Get the per-VP data for this processor.
|
|
//
|
|
auto* vpData = (vmx::vm_state*)((uintptr_t)(Context + 1) - KERNEL_STACK_SIZE);
|
|
|
|
//
|
|
// Build a little stack context to make it easier to keep track of certain
|
|
// guest state, such as the RIP/RSP/RFLAGS, and the exit reason. The rest
|
|
// of the general purpose registers come from the context structure that we
|
|
// captured on our own with RtlCaptureContext in the assembly entrypoint.
|
|
//
|
|
guestContext.GuestEFlags = ShvVmxRead(VMCS_GUEST_RFLAGS);
|
|
guestContext.GuestRip = ShvVmxRead(VMCS_GUEST_RIP);
|
|
guestContext.GuestRsp = ShvVmxRead(VMCS_GUEST_RSP);
|
|
guestContext.ExitReason = ShvVmxRead(VMCS_EXIT_REASON) & 0xFFFF;
|
|
guestContext.VpRegs = Context;
|
|
guestContext.ExitVm = FALSE;
|
|
|
|
//
|
|
// Call the generic handler
|
|
//
|
|
ShvVmxHandleExit(&guestContext);
|
|
|
|
//
|
|
// Did we hit the magic exit sequence, or should we resume back to the VM
|
|
// context?
|
|
//
|
|
if (guestContext.ExitVm != FALSE)
|
|
{
|
|
//
|
|
// Return the VP Data structure in RAX:RBX which is going to be part of
|
|
// the CPUID response that the caller (ShvVpUninitialize) expects back.
|
|
// Return confirmation in RCX that we are loaded
|
|
//
|
|
Context->Rax = (uintptr_t)vpData >> 32;
|
|
Context->Rbx = (uintptr_t)vpData & 0xFFFFFFFF;
|
|
Context->Rcx = 0x43434343;
|
|
|
|
//
|
|
// Perform any OS-specific CPU uninitialization work
|
|
//
|
|
ShvOsUnprepareProcessor(vpData);
|
|
|
|
//
|
|
// Our callback routine may have interrupted an arbitrary user process,
|
|
// and therefore not a thread running with a systemwide page directory.
|
|
// Therefore if we return back to the original caller after turning off
|
|
// VMX, it will keep our current "host" CR3 value which we set on entry
|
|
// to the PML4 of the SYSTEM process. We want to return back with the
|
|
// correct value of the "guest" CR3, so that the currently executing
|
|
// process continues to run with its expected address space mappings.
|
|
//
|
|
__writecr3(ShvVmxRead(VMCS_GUEST_CR3));
|
|
|
|
//
|
|
// Finally, restore the stack, instruction pointer and EFLAGS to the
|
|
// original values present when the instruction causing our VM-Exit
|
|
// execute (such as ShvVpUninitialize). This will effectively act as
|
|
// a longjmp back to that location.
|
|
//
|
|
Context->Rsp = guestContext.GuestRsp;
|
|
Context->Rip = (UINT64)guestContext.GuestRip;
|
|
Context->EFlags = (UINT32)guestContext.GuestEFlags;
|
|
|
|
//
|
|
// Turn off VMX root mode on this logical processor. We're done here.
|
|
//
|
|
__vmx_off();
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Because we won't be returning back into assembly code, nothing will
|
|
// ever know about the "pop rcx" that must technically be done (or more
|
|
// accurately "add rsp, 4" as rcx will already be correct thanks to the
|
|
// fixup earlier. In order to keep the stack sane, do that adjustment
|
|
// here.
|
|
//
|
|
//Context->Rsp += sizeof(Context->Rcx);
|
|
|
|
//
|
|
// Return into a VMXRESUME intrinsic, which we broke out as its own
|
|
// function, in order to allow this to work. No assembly code will be
|
|
// needed as RtlRestoreContext will fix all the GPRs, and what we just
|
|
// did to RSP will take care of the rest.
|
|
//
|
|
Context->Rip = (UINT64)ShvVmxResume;
|
|
}
|
|
|
|
//
|
|
// Restore the context to either ShvVmxResume, in which case the CPU's VMX
|
|
// facility will do the "true" return back to the VM (but without restoring
|
|
// GPRs, which is why we must do it here), or to the original guest's RIP,
|
|
// which we use in case an exit was requested. In this case VMX must now be
|
|
// off, and this will look like a longjmp to the original stack and RIP.
|
|
//
|
|
ShvOsRestoreContext2(Context, nullptr);
|
|
}
|
|
|
|
|
|
extern "C" VOID
|
|
ShvVmxEntry(
|
|
VOID);
|
|
|
|
void ShvVmxSetupVmcsForVp(vmx::vm_state* VpData)
|
|
{
|
|
auto* state = &VpData->special_registers;
|
|
PCONTEXT context = &VpData->context_frame;
|
|
VMX_GDTENTRY64 vmxGdtEntry; // vmx_segment_access_rights
|
|
ept_pointer vmxEptp;
|
|
|
|
//
|
|
// Begin by setting the link pointer to the required value for 4KB VMCS.
|
|
//
|
|
__vmx_vmwrite(VMCS_GUEST_VMCS_LINK_POINTER, ~0ULL);
|
|
|
|
//
|
|
// Enable EPT features if supported
|
|
//
|
|
if (VpData->ept_controls.flags != 0)
|
|
{
|
|
//
|
|
// Configure the EPTP
|
|
//
|
|
vmxEptp.flags = 0;
|
|
vmxEptp.page_walk_length = 3;
|
|
vmxEptp.memory_type = MEMORY_TYPE_WRITE_BACK;
|
|
vmxEptp.page_frame_number = VpData->ept_pml4_physical_address / PAGE_SIZE;
|
|
|
|
//
|
|
// Load EPT Root Pointer
|
|
//
|
|
__vmx_vmwrite(VMCS_CTRL_EPT_POINTER, vmxEptp.flags);
|
|
|
|
//
|
|
// Set VPID to one
|
|
//
|
|
__vmx_vmwrite(VMCS_CTRL_VIRTUAL_PROCESSOR_IDENTIFIER, 1);
|
|
}
|
|
|
|
//
|
|
// Load the MSR bitmap. Unlike other bitmaps, not having an MSR bitmap will
|
|
// trap all MSRs, so we allocated an empty one.
|
|
//
|
|
__vmx_vmwrite(VMCS_CTRL_MSR_BITMAP_ADDRESS, VpData->msr_bitmap_physical_address);
|
|
|
|
//
|
|
// Enable support for RDTSCP and XSAVES/XRESTORES in the guest. Windows 10
|
|
// makes use of both of these instructions if the CPU supports it. By using
|
|
// ShvUtilAdjustMsr, these options will be ignored if this processor does
|
|
// not actually support the instructions to begin with.
|
|
//
|
|
// Also enable EPT support, for additional performance and ability to trap
|
|
// memory access efficiently.
|
|
//
|
|
auto ept_controls = VpData->ept_controls;
|
|
ept_controls.enable_rdtscp = 1;
|
|
ept_controls.enable_invpcid = 1;
|
|
ept_controls.enable_xsaves = 1;
|
|
__vmx_vmwrite(VMCS_CTRL_SECONDARY_PROCESSOR_BASED_VM_EXECUTION_CONTROLS,
|
|
ShvUtilAdjustMsr(VpData->msr_data[11], ept_controls.flags));
|
|
|
|
//
|
|
// Enable no pin-based options ourselves, but there may be some required by
|
|
// the processor. Use ShvUtilAdjustMsr to add those in.
|
|
//
|
|
__vmx_vmwrite(VMCS_CTRL_PIN_BASED_VM_EXECUTION_CONTROLS,
|
|
ShvUtilAdjustMsr(VpData->msr_data[13], 0));
|
|
|
|
//
|
|
// In order for our choice of supporting RDTSCP and XSAVE/RESTORES above to
|
|
// actually mean something, we have to request secondary controls. We also
|
|
// want to activate the MSR bitmap in order to keep them from being caught.
|
|
//
|
|
ia32_vmx_procbased_ctls_register procbased_ctls_register{};
|
|
procbased_ctls_register.activate_secondary_controls = 1;
|
|
procbased_ctls_register.use_msr_bitmaps = 1;
|
|
|
|
__vmx_vmwrite(VMCS_CTRL_PROCESSOR_BASED_VM_EXECUTION_CONTROLS,
|
|
ShvUtilAdjustMsr(VpData->msr_data[14],
|
|
procbased_ctls_register.flags));
|
|
|
|
//
|
|
// Make sure to enter us in x64 mode at all times.
|
|
//
|
|
ia32_vmx_exit_ctls_register exit_ctls_register{};
|
|
exit_ctls_register.host_address_space_size = 1;
|
|
__vmx_vmwrite(VMCS_CTRL_VMEXIT_CONTROLS,
|
|
ShvUtilAdjustMsr(VpData->msr_data[15],
|
|
exit_ctls_register.flags));
|
|
|
|
//
|
|
// As we exit back into the guest, make sure to exist in x64 mode as well.
|
|
//
|
|
ia32_vmx_entry_ctls_register entry_ctls_register{};
|
|
entry_ctls_register.ia32e_mode_guest = 1;
|
|
__vmx_vmwrite(VMCS_CTRL_VMENTRY_CONTROLS,
|
|
ShvUtilAdjustMsr(VpData->msr_data[16],
|
|
entry_ctls_register.flags));
|
|
|
|
//
|
|
// Load the CS Segment (Ring 0 Code)
|
|
//
|
|
ShvUtilConvertGdtEntry(state->gdtr.base, context->SegCs, &vmxGdtEntry);
|
|
__vmx_vmwrite(VMCS_GUEST_CS_SELECTOR, vmxGdtEntry.Selector);
|
|
__vmx_vmwrite(VMCS_GUEST_CS_LIMIT, vmxGdtEntry.Limit);
|
|
__vmx_vmwrite(VMCS_GUEST_CS_ACCESS_RIGHTS, vmxGdtEntry.AccessRights);
|
|
__vmx_vmwrite(VMCS_GUEST_CS_BASE, vmxGdtEntry.Base);
|
|
__vmx_vmwrite(VMCS_HOST_CS_SELECTOR, context->SegCs & ~SEGMENT_ACCESS_RIGHTS_DESCRIPTOR_PRIVILEGE_LEVEL_MASK);
|
|
|
|
//
|
|
// Load the SS Segment (Ring 0 Data)
|
|
//
|
|
ShvUtilConvertGdtEntry(state->gdtr.base, context->SegSs, &vmxGdtEntry);
|
|
__vmx_vmwrite(VMCS_GUEST_SS_SELECTOR, vmxGdtEntry.Selector);
|
|
__vmx_vmwrite(VMCS_GUEST_SS_LIMIT, vmxGdtEntry.Limit);
|
|
__vmx_vmwrite(VMCS_GUEST_SS_ACCESS_RIGHTS, vmxGdtEntry.AccessRights);
|
|
__vmx_vmwrite(VMCS_GUEST_SS_BASE, vmxGdtEntry.Base);
|
|
__vmx_vmwrite(VMCS_HOST_SS_SELECTOR, context->SegSs & ~SEGMENT_ACCESS_RIGHTS_DESCRIPTOR_PRIVILEGE_LEVEL_MASK);
|
|
|
|
//
|
|
// Load the DS Segment (Ring 3 Data)
|
|
//
|
|
ShvUtilConvertGdtEntry(state->gdtr.base, context->SegDs, &vmxGdtEntry);
|
|
__vmx_vmwrite(VMCS_GUEST_DS_SELECTOR, vmxGdtEntry.Selector);
|
|
__vmx_vmwrite(VMCS_GUEST_DS_LIMIT, vmxGdtEntry.Limit);
|
|
__vmx_vmwrite(VMCS_GUEST_DS_ACCESS_RIGHTS, vmxGdtEntry.AccessRights);
|
|
__vmx_vmwrite(VMCS_GUEST_DS_BASE, vmxGdtEntry.Base);
|
|
__vmx_vmwrite(VMCS_HOST_DS_SELECTOR, context->SegDs & ~SEGMENT_ACCESS_RIGHTS_DESCRIPTOR_PRIVILEGE_LEVEL_MASK);
|
|
|
|
//
|
|
// Load the ES Segment (Ring 3 Data)
|
|
//
|
|
ShvUtilConvertGdtEntry(state->gdtr.base, context->SegEs, &vmxGdtEntry);
|
|
__vmx_vmwrite(VMCS_GUEST_ES_SELECTOR, vmxGdtEntry.Selector);
|
|
__vmx_vmwrite(VMCS_GUEST_ES_LIMIT, vmxGdtEntry.Limit);
|
|
__vmx_vmwrite(VMCS_GUEST_ES_ACCESS_RIGHTS, vmxGdtEntry.AccessRights);
|
|
__vmx_vmwrite(VMCS_GUEST_ES_BASE, vmxGdtEntry.Base);
|
|
__vmx_vmwrite(VMCS_HOST_ES_SELECTOR, context->SegEs & ~SEGMENT_ACCESS_RIGHTS_DESCRIPTOR_PRIVILEGE_LEVEL_MASK);
|
|
|
|
//
|
|
// Load the FS Segment (Ring 3 Compatibility-Mode TEB)
|
|
//
|
|
ShvUtilConvertGdtEntry(state->gdtr.base, context->SegFs, &vmxGdtEntry);
|
|
__vmx_vmwrite(VMCS_GUEST_FS_SELECTOR, vmxGdtEntry.Selector);
|
|
__vmx_vmwrite(VMCS_GUEST_FS_LIMIT, vmxGdtEntry.Limit);
|
|
__vmx_vmwrite(VMCS_GUEST_FS_ACCESS_RIGHTS, vmxGdtEntry.AccessRights);
|
|
__vmx_vmwrite(VMCS_GUEST_FS_BASE, vmxGdtEntry.Base);
|
|
__vmx_vmwrite(VMCS_HOST_FS_BASE, vmxGdtEntry.Base);
|
|
__vmx_vmwrite(VMCS_HOST_FS_SELECTOR, context->SegFs & ~SEGMENT_ACCESS_RIGHTS_DESCRIPTOR_PRIVILEGE_LEVEL_MASK);
|
|
|
|
//
|
|
// Load the GS Segment (Ring 3 Data if in Compatibility-Mode, MSR-based in Long Mode)
|
|
//
|
|
ShvUtilConvertGdtEntry(state->gdtr.base, context->SegGs, &vmxGdtEntry);
|
|
__vmx_vmwrite(VMCS_GUEST_GS_SELECTOR, vmxGdtEntry.Selector);
|
|
__vmx_vmwrite(VMCS_GUEST_GS_LIMIT, vmxGdtEntry.Limit);
|
|
__vmx_vmwrite(VMCS_GUEST_GS_ACCESS_RIGHTS, vmxGdtEntry.AccessRights);
|
|
__vmx_vmwrite(VMCS_GUEST_GS_BASE, state->msr_gs_base);
|
|
__vmx_vmwrite(VMCS_HOST_GS_BASE, state->msr_gs_base);
|
|
__vmx_vmwrite(VMCS_HOST_GS_SELECTOR, context->SegGs & ~SEGMENT_ACCESS_RIGHTS_DESCRIPTOR_PRIVILEGE_LEVEL_MASK);
|
|
|
|
//
|
|
// Load the Task Register (Ring 0 TSS)
|
|
//
|
|
ShvUtilConvertGdtEntry(state->gdtr.base, state->tr, &vmxGdtEntry);
|
|
__vmx_vmwrite(VMCS_GUEST_TR_SELECTOR, vmxGdtEntry.Selector);
|
|
__vmx_vmwrite(VMCS_GUEST_TR_LIMIT, vmxGdtEntry.Limit);
|
|
__vmx_vmwrite(VMCS_GUEST_TR_ACCESS_RIGHTS, vmxGdtEntry.AccessRights);
|
|
__vmx_vmwrite(VMCS_GUEST_TR_BASE, vmxGdtEntry.Base);
|
|
__vmx_vmwrite(VMCS_HOST_TR_BASE, vmxGdtEntry.Base);
|
|
__vmx_vmwrite(VMCS_HOST_TR_SELECTOR, state->tr & ~SEGMENT_ACCESS_RIGHTS_DESCRIPTOR_PRIVILEGE_LEVEL_MASK);
|
|
|
|
//
|
|
// Load the Local Descriptor Table (Ring 0 LDT on Redstone)
|
|
//
|
|
ShvUtilConvertGdtEntry(state->gdtr.base, state->ldtr, &vmxGdtEntry);
|
|
__vmx_vmwrite(VMCS_GUEST_LDTR_SELECTOR, vmxGdtEntry.Selector);
|
|
__vmx_vmwrite(VMCS_GUEST_LDTR_LIMIT, vmxGdtEntry.Limit);
|
|
__vmx_vmwrite(VMCS_GUEST_LDTR_ACCESS_RIGHTS, vmxGdtEntry.AccessRights);
|
|
__vmx_vmwrite(VMCS_GUEST_LDTR_BASE, vmxGdtEntry.Base);
|
|
|
|
//
|
|
// Now load the GDT itself
|
|
//
|
|
__vmx_vmwrite(VMCS_GUEST_GDTR_BASE, (uintptr_t)state->gdtr.base);
|
|
__vmx_vmwrite(VMCS_GUEST_GDTR_LIMIT, state->gdtr.limit);
|
|
__vmx_vmwrite(VMCS_HOST_GDTR_BASE, (uintptr_t)state->gdtr.base);
|
|
|
|
//
|
|
// And then the IDT
|
|
//
|
|
__vmx_vmwrite(VMCS_GUEST_IDTR_BASE, (uintptr_t)state->idtr.base);
|
|
__vmx_vmwrite(VMCS_GUEST_IDTR_LIMIT, state->idtr.limit);
|
|
__vmx_vmwrite(VMCS_HOST_IDTR_BASE, (uintptr_t)state->idtr.base);
|
|
|
|
//
|
|
// Load CR0
|
|
//
|
|
__vmx_vmwrite(VMCS_CTRL_CR0_READ_SHADOW, state->cr0);
|
|
__vmx_vmwrite(VMCS_HOST_CR0, state->cr0);
|
|
__vmx_vmwrite(VMCS_GUEST_CR0, state->cr0);
|
|
|
|
//
|
|
// Load CR3 -- do not use the current process' address space for the host,
|
|
// because we may be executing in an arbitrary user-mode process right now
|
|
// as part of the DPC interrupt we execute in.
|
|
//
|
|
__vmx_vmwrite(VMCS_HOST_CR3, VpData->system_directory_table_base);
|
|
__vmx_vmwrite(VMCS_GUEST_CR3, state->cr3);
|
|
|
|
//
|
|
// Load CR4
|
|
//
|
|
__vmx_vmwrite(VMCS_HOST_CR4, state->cr4);
|
|
__vmx_vmwrite(VMCS_GUEST_CR4, state->cr4);
|
|
__vmx_vmwrite(VMCS_CTRL_CR4_READ_SHADOW, state->cr4);
|
|
|
|
//
|
|
// Load debug MSR and register (DR7)
|
|
//
|
|
__vmx_vmwrite(VMCS_GUEST_DEBUGCTL, state->debug_control);
|
|
__vmx_vmwrite(VMCS_GUEST_DR7, state->kernel_dr7);
|
|
|
|
//
|
|
// Finally, load the guest stack, instruction pointer, and rflags, which
|
|
// corresponds exactly to the location where RtlCaptureContext will return
|
|
// to inside of ShvVpInitialize.
|
|
//
|
|
__vmx_vmwrite(VMCS_GUEST_RSP, (uintptr_t)VpData->stack_buffer + KERNEL_STACK_SIZE - sizeof(CONTEXT));
|
|
__vmx_vmwrite(VMCS_GUEST_RIP, (uintptr_t)ShvVpRestoreAfterLaunch);
|
|
__vmx_vmwrite(VMCS_GUEST_RFLAGS, context->EFlags);
|
|
|
|
//
|
|
// Load the hypervisor entrypoint and stack. We give ourselves a standard
|
|
// size kernel stack (24KB) and bias for the context structure that the
|
|
// hypervisor entrypoint will push on the stack, avoiding the need for RSP
|
|
// modifying instructions in the entrypoint. Note that the CONTEXT pointer
|
|
// and thus the stack itself, must be 16-byte aligned for ABI compatibility
|
|
// with AMD64 -- specifically, XMM operations will fail otherwise, such as
|
|
// the ones that RtlCaptureContext will perform.
|
|
//
|
|
C_ASSERT((KERNEL_STACK_SIZE - sizeof(CONTEXT)) % 16 == 0);
|
|
__vmx_vmwrite(VMCS_HOST_RSP, (uintptr_t)VpData->stack_buffer + KERNEL_STACK_SIZE - sizeof(CONTEXT));
|
|
__vmx_vmwrite(VMCS_HOST_RIP, (uintptr_t)ShvVmxEntry);
|
|
}
|
|
|
|
INT32 ShvVmxLaunchOnVp(vmx::vm_state* VpData)
{
	//
	// Initialize all the VMX-related MSRs by reading their value
	//
	for (UINT32 i = 0; i < sizeof(VpData->msr_data) / sizeof(VpData->msr_data[0]); i++)
	{
		VpData->msr_data[i].QuadPart = __readmsr(IA32_VMX_BASIC + i);
	}

	debug_log("[%d] mtrr init\n", thread::get_processor_index());

	//
	// Initialize all the MTRR-related MSRs by reading their value and build
	// range structures to describe their settings
	//
	ShvVmxMtrrInitialize(VpData);

	debug_log("[%d] ept init\n", thread::get_processor_index());

	//
	// Initialize the EPT structures
	//
	ShvVmxEptInitialize(VpData);

	debug_log("[%d] entering root mode\n", thread::get_processor_index());

	//
	// Attempt to enter VMX root mode on this processor.
	//
	if (ShvVmxEnterRootModeOnVp(VpData) == FALSE)
	{
		throw std::runtime_error("Not available");
	}

	debug_log("[%d] setting up vmcs\n", thread::get_processor_index());

	//
	// Initialize the VMCS, both guest and host state.
	//
	ShvVmxSetupVmcsForVp(VpData);

	//
	// Launch the VMCS, based on the guest data that was loaded into the
	// various VMCS fields by ShvVmxSetupVmcsForVp. This will cause the
	// processor to jump to ShvVpRestoreAfterLaunch on success, or return
	// back to the caller on failure.
	//
	debug_log("[%d] vmx launch\n", thread::get_processor_index());
	return ShvVmxLaunch();
}
void hypervisor::enable_core(const uint64_t system_directory_table_base)
{
	debug_log("[%d] Enabling hypervisor on core %d\n", thread::get_processor_index(), thread::get_processor_index());
	auto* vm_state = this->get_current_vm_state();

	vm_state->system_directory_table_base = system_directory_table_base;

	debug_log("[%d] Capturing registers\n", thread::get_processor_index());
	ShvCaptureSpecialRegisters(&vm_state->special_registers);

	//
	// Then, capture the entire register state. We will need this, as once we
	// launch the VM, it will begin execution at the defined guest instruction
	// pointer, which we set to ShvVpRestoreAfterLaunch, with the registers set
	// to whatever value they were deep inside the VMCS/VMX initialization code.
	// By using RtlRestoreContext, that function sets the AC flag in EFLAGS and
	// returns here with our registers restored.
	//
	debug_log("[%d] Capturing context\n", thread::get_processor_index());
	RtlCaptureContext(&vm_state->context_frame);
	if ((__readeflags() & EFLAGS_ALIGNMENT_CHECK_FLAG_FLAG) == 0)
	{
		//
		// If the AC bit is not set in EFLAGS, it means that we have not yet
		// launched the VM. Attempt to initialize VMX on this processor.
		//
		debug_log("[%d] Launching\n", thread::get_processor_index());
		ShvVmxLaunchOnVp(vm_state);
	}

	if (!is_hypervisor_present())
	{
		throw std::runtime_error("Hypervisor is not present");
	}
}
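// disable_core() does not talk to the VMCS directly: it issues the magic CPUID sequence
// (EAX=0x41414141, ECX=0x42424242) that ShvVmxHandleCpuid() recognizes from ring 0, causing
// the exit handler to restore GDTR/IDTR and guest CR3, execute VMXOFF and "longjmp" back
// here with the hypervisor unloaded on this core.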
void hypervisor::disable_core()
{
	int32_t cpu_info[4]{0};
	__cpuidex(cpu_info, 0x41414141, 0x42424242);
}
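// allocate_vm_states() reserves one vmx::vm_state per logical processor in a single aligned
// allocation. The VMXON region, VMCS, MSR bitmap and EPT tables embedded in vm_state must be
// page-aligned for the hardware to accept their physical addresses, which is presumably what
// memory::allocate_aligned_memory guarantees here.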
void hypervisor::allocate_vm_states()
{
	if (this->vm_states_)
	{
		throw std::runtime_error("VM states are still in use");
	}

	const auto core_count = thread::get_processor_count();
	const auto allocation_size = sizeof(vmx::vm_state) * core_count;

	this->vm_states_ = static_cast<vmx::vm_state*>(memory::allocate_aligned_memory(allocation_size));
	if (!this->vm_states_)
	{
		throw std::runtime_error("Failed to allocate VM states");
	}

	RtlSecureZeroMemory(this->vm_states_, allocation_size);
}
void hypervisor::free_vm_states()
{
	memory::free_aligned_memory(this->vm_states_);
	this->vm_states_ = nullptr;
}

vmx::vm_state* hypervisor::get_current_vm_state() const
{
	const auto current_core = thread::get_processor_index();
	return &this->vm_states_[current_core];
}