这几天学习了虚拟机在创建和运行过程中QEMU与KVM的核心执行流程。本文只梳理大致过程,并没有对流程中的每个函数逐一分析。
很喜欢侯捷老师的一句话:源码之前,了无秘密。我阅读的源码是qemu-6.2.0和linux-5.15.39。
编译安装qemu的过程很简单,参考官方文档就行。
可以直接用gdb命令行调试qemu,也可以vscode搭配gdb,调试属于基本能力,不多说。
动态调试qemu,并结合qemu源码分析流程。
启动参数如下:
```bash
$ ./qemu-system-x86_64 \
    --enable-kvm \
    -machine q35 \
    -cpu host,+vmx \
    -smp 1 \
    -m 2048 \
    -name ubuntu \
    -hda /opt/vms/ubuntu.qcow2 \
    -cdrom /opt/vms/ubuntu.iso
```
qemu-6.2.0/softmmu/vl.c,line 2765
vl.c文件第2765行的qemu_init函数是入口,负责解析运行程序时传入的命令行参数。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
void qemu_init(
int
argc, char
*
*
argv, char
*
*
envp)
{
/
/
...
/
/
对参数进行解析
for
(;;) {
if
(optind >
=
argc)
break
;
if
(argv[optind][
0
] !
=
'-'
) {
loc_set_cmdline(argv, optind,
1
);
drive_add(IF_DEFAULT,
0
, argv[optind
+
+
], HD_OPTS);
}
else
{
const QEMUOption
*
popt;
popt
=
lookup_opt(argc, argv, &optarg, &optind);
if
(!(popt
-
>arch_mask & arch_type)) {
error_report(
"Option not supported for this target"
);
exit(
1
);
}
switch(popt
-
>index) {
case QEMU_OPTION_cpu:
/
*
hw initialization will check this
*
/
cpu_option
=
optarg;
break
;
/
/
...
/
/
主要关注下面几个参数
case QEMU_OPTION_m:
opts
=
qemu_opts_parse_noisily(qemu_find_opts(
"memory"
),
optarg, true);
if
(!opts) {
exit(EXIT_FAILURE);
}
break
;
case QEMU_OPTION_enable_kvm:
qdict_put_str(machine_opts_dict,
"accel"
,
"kvm"
);
break
;
case QEMU_OPTION_M:
case QEMU_OPTION_machine:
{
bool
help
;
keyval_parse_into(machine_opts_dict, optarg,
"type"
, &
help
, &error_fatal);
if
(
help
) {
machine_help_func(machine_opts_dict);
exit(EXIT_SUCCESS);
}
break
;
}
case QEMU_OPTION_smp:
machine_parse_property_opt(qemu_find_opts(
"smp-opts"
),
"smp"
, optarg);
break
;
}
}
}
/
/
...
/
/
根据accel设置accelerators
=
kvm
qemu_apply_legacy_machine_options(machine_opts_dict);
qemu_apply_machine_options(machine_opts_dict);
/
/
也会根据进程名判断可用的加速类型
configure_accelerators(argv[
0
]);
/
/
内部调用了do_configure_accelerator
-
-
> accel_init_machine
/
/
accel_init_machine
-
-
> kvm_init
/
/
初始化具体的accel类(这里是kvm)
/
/
在qemu
-
6.2
.
0
/
accel
/
kvm
/
kvm
-
all
.c line
3629
/
/
函数kvm_accel_class_init内部找到真正的初始化函数
/
/
ac
-
>init_machine
=
kvm_init;
/
/
...
/
/
在qmp_x_exit_preconfig与虚拟cpu创建有关
if
(!preconfig_requested) {
qmp_x_exit_preconfig(&error_fatal);
}
qemu_init_displays();
/
/
设置accel
accel_setup_post(current_machine);
os_setup_post();
resume_mux_open();
}
|
| 参数 | 描述 |
|---|---|
| QEMU_OPTION_m | 虚拟机内存大小 |
| QEMU_OPTION_enable_kvm | 启用kvm加速 |
| QEMU_OPTION_machine | 虚拟机机器类型 |
| QEMU_OPTION_smp | 虚拟机cpu数量 |
qemu-6.2.0/accel/kvm/kvm-all.c line 2306
在kvm-all.c文件2306行是kvm_init
调用栈:
```
kvm_init(MachineState *ms) (qemu-6.2.0\accel\kvm\kvm-all.c:2308)
accel_init_machine(AccelState *accel, MachineState *ms) (qemu-6.2.0\accel\accel-softmmu.c:39)
do_configure_accelerator(void *opaque, QemuOpts *opts, Error **errp) (qemu-6.2.0\softmmu\vl.c:2348)
qemu_opts_foreach(QemuOptsList *list, qemu_opts_loopfunc func, void *opaque, Error **errp) (qemu-6.2.0\util\qemu-option.c:1135)
configure_accelerators(const char *progname) (qemu-6.2.0\softmmu\vl.c:2414)
qemu_init(int argc, char **argv, char **envp) (qemu-6.2.0\softmmu\vl.c:3724)
main(int argc, char **argv, char **envp) (qemu-6.2.0\softmmu\main.c:49)
```
主要函数kvm_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
static
int
kvm_init(MachineState
*
ms)
{
MachineClass
*
mc
=
MACHINE_GET_CLASS(ms);
static const char upgrade_note[]
=
"Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
"(see http://sourceforge.net/projects/kvm).\n"
;
/
/
...
QLIST_INIT(&s
-
>kvm_parked_vcpus);
/
/
开始使用kvm之前的标准流程
/
/
打开设备
/
dev
/
kvm,检查kvm API版本
/
/
保存了kvm设备描述符s
-
>fd
s
-
>fd
=
qemu_open_old(
"/dev/kvm"
, O_RDWR);
if
(s
-
>fd
=
=
-
1
) {
fprintf(stderr,
"Could not access KVM kernel module: %m\n"
);
ret
=
-
errno;
goto err;
}
ret
=
kvm_ioctl(s, KVM_GET_API_VERSION,
0
);
if
(ret < KVM_API_VERSION) {
if
(ret >
=
0
) {
ret
=
-
EINVAL;
}
fprintf(stderr,
"kvm version too old\n"
);
goto err;
}
if
(ret > KVM_API_VERSION) {
ret
=
-
EINVAL;
fprintf(stderr,
"kvm version not supported\n"
);
goto err;
}
/
/
...
/
/
创建虚拟机,保存虚拟机描述符s
-
>vmfd
do {
ret
=
kvm_ioctl(s, KVM_CREATE_VM,
type
);
}
while
(ret
=
=
-
EINTR);
/
/
...
s
-
>vmfd
=
ret;
/
/
...
}
|
其主要功能是:打开/dev/kvm并保存kvm设备描述符s->fd,检查KVM API版本,然后通过KVM_CREATE_VM创建虚拟机并保存虚拟机描述符s->vmfd。
在2.1节中有提到,qmp_x_exit_preconfig函数与虚拟cpu的创建有关。
动态调试跟踪分析
qmp_x_exit_preconfig qemu-6.2.0\softmmu\vl.c:2740
--> qemu_init_board qemu-6.2.0\softmmu\vl.c:2652
--> machine_run_board_init qemu-6.2.0\hw\core\machine.c:1181
--> pc_q35_init qemu-6.2.0\hw\i386\pc_q35.c:182
--> x86_cpus_init qemu-6.2.0\hw\i386\x86.c:141
--> x86_cpu_new qemu-6.2.0\hw\i386\x86.c:114
在machine_run_board_init函数中,会根据启动参数指定的机器类型,调用对应的机器初始化函数:
machine_class->init(machine),本例中机器类型为q35,对应的初始化函数是pc_q35_init
1
2
3
4
5
6
7
8
|
void x86_cpus_init(X86MachineState
*
x86ms,
int
default_cpu_version)
{
/
/
...
/
/
根据参数smp的值,创建对应数量的虚拟cpu
for
(i
=
0
; i < ms
-
>smp.cpus; i
+
+
) {
x86_cpu_new(x86ms, possible_cpus
-
>cpus[i].arch_id, &error_fatal);
}
}
|
qemu-6.2.0/softmmu/cpus.c line 611
在cpus.c文件611行qemu_init_vcpu中初始化虚拟cpu,创建执行线程。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
void qemu_init_vcpu(CPUState
*
cpu)
{
/
/
...
/
/
调用函数kvm_start_vcpu_thread创建虚拟cpu执行线程
cpus_accel
-
>create_vcpu_thread(cpu);
/
/
...
}
static void kvm_start_vcpu_thread(CPUState
*
cpu)
{
/
/
...
/
/
线程函数kvm_vcpu_thread_fn
qemu_thread_create(cpu
-
>thread, thread_name, kvm_vcpu_thread_fn,
cpu, QEMU_THREAD_JOINABLE);
/
/
...
}
static void
*
kvm_vcpu_thread_fn(void
*
arg)
{
/
/
...
/
/
kvm_init_vcpu中通过kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void
*
)vcpu_id)
/
/
获取了vcpu描述符 cpu
-
>kvm_fd
=
ret;
r
=
kvm_init_vcpu(cpu, &error_fatal);
kvm_init_cpu_signals(cpu);
/
*
signal CPU creation
*
/
cpu_thread_signal_created(cpu);
qemu_guest_random_seed_thread_part2(cpu
-
>random_seed);
/
/
do
while
循环执行kvm_cpu_exec
do {
if
(cpu_can_run(cpu)) {
r
=
kvm_cpu_exec(cpu);
if
(r
=
=
EXCP_DEBUG) {
cpu_handle_guest_debug(cpu);
}
}
qemu_wait_io_event(cpu);
}
while
(!cpu
-
>unplug || cpu_can_run(cpu));
kvm_destroy_vcpu(cpu);
cpu_thread_signal_destroyed(cpu);
qemu_mutex_unlock_iothread();
rcu_unregister_thread();
return
NULL;
}
int
kvm_cpu_exec(CPUState
*
cpu)
{
/
/
...
do {
/
/
kvm_vcpu_ioctl(cpu, KVM_RUN,
0
)
/
/
从这里进入kvm内核阶段,开始运行虚拟机
run_ret
=
kvm_vcpu_ioctl(cpu, KVM_RUN,
0
);
/
/
...
/
/
根据退出原因,分发处理
switch (run
-
>exit_reason) {
case KVM_EXIT_IO:
DPRINTF(
"handle_io\n"
);
/
*
Called outside BQL
*
/
kvm_handle_io(run
-
>io.port, attrs,
(uint8_t
*
)run
+
run
-
>io.data_offset,
run
-
>io.direction,
run
-
>io.size,
run
-
>io.count);
ret
=
0
;
break
;
default:
DPRINTF(
"kvm_arch_handle_exit\n"
);
ret
=
kvm_arch_handle_exit(cpu, run);
break
;
}
}
while
(ret
=
=
0
);
cpu_exec_end(cpu);
/
/
...
qatomic_set(&cpu
-
>exit_request,
0
);
return
ret;
}
|
虚拟机的运行就是kvm_cpu_exec中的do { ... } while (ret == 0)循环:循环体主要通过KVM_RUN启动虚拟机,进入kvm的内核处理阶段,并等待返回结果。
当虚拟机退出并返回到QEMU时,会根据退出原因(run->exit_reason)进行相应处理,最后将处理结果返回。
而kvm_cpu_exec自身也处于vcpu线程函数kvm_vcpu_thread_fn的循环当中,所以虚拟机的运行就是在这两个循环中不断进行。
1. 解析参数,创建虚拟机,创建虚拟cpu,并获取三个最主要的描述符kvmfd、vmfd以及vcpufd。
2. 根据vcpu数量创建具体的执行线程。
3. 在线程中通过KVM_RUN启动虚拟机,进入内核KVM的处理流程。
4. 重复循环KVM_RUN阶段。
在用户层QEMU阶段有提到通过函数kvm_vcpu_ioctl(cpu, KVM_RUN, 0)进入到内核KVM处理阶段。
linux-5.15.39/virt/kvm/kvm_main.c,line 3764
在kvm_main.c文件3764行找到内核中实际的kvm_vcpu_ioctl函数。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
static
long
kvm_vcpu_ioctl(struct
file
*
filp,
unsigned
int
ioctl, unsigned
long
arg)
{
/
/
...
switch (ioctl) {
case KVM_RUN: {
/
/
...
/
/
根据KVM_RUN,调用kvm_arch_vcpu_ioctl_run
r
=
kvm_arch_vcpu_ioctl_run(vcpu);
trace_kvm_userspace_exit(vcpu
-
>run
-
>exit_reason, r);
break
;
}
/
/
...
}
out:
mutex_unlock(&vcpu
-
>mutex);
kfree(fpu);
kfree(kvm_sregs);
return
r;
}
/
/
arch
/
x86
/
kvm
/
x86.c line
10103
int
kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
*
vcpu)
{
/
/
...
if
(kvm_run
-
>immediate_exit)
r
=
-
EINTR;
else
r
=
vcpu_run(vcpu);
/
/
...
return
r;
}
/
/
arch
/
x86
/
kvm
/
x86.c line
9923
static
int
vcpu_run(struct kvm_vcpu
*
vcpu)
{
int
r;
struct kvm
*
kvm
=
vcpu
-
>kvm;
/
/
...
for
(;;) {
if
(kvm_vcpu_running(vcpu)) {
/
/
进入guest模式的入口
r
=
vcpu_enter_guest(vcpu);
}
else
{
r
=
vcpu_block(kvm, vcpu);
}
/
/
当返回值r<
=
0
时,退出循环,一步步返回到用户层QEMU处理
/
/
当返回值r>
0
时,继续guest运行循环
if
(r <
=
0
)
break
;
}
/
/
...
return
r;
}
/
/
arch
/
x86
/
kvm
/
x86.c line
9532
static
int
vcpu_enter_guest(struct kvm_vcpu
*
vcpu)
{
int
r;
/
/
一系列kvm_check_request函数调用
/
/
检查guest请求
/
/
...
/
/
guest内存管理单元
r
=
kvm_mmu_reload(vcpu);
/
/
...
/
/
禁用内核抢占
preempt_disable();
/
/
...
vcpu
-
>mode
=
IN_GUEST_MODE;
/
/
exit_fastpath
=
static_call(kvm_x86_run)(vcpu);
/
/
调用架构相关的run函数进入guest模式运行
for
(;;) {
exit_fastpath
=
static_call(kvm_x86_run)(vcpu);
/
/
...
break
;
}
/
/
能走到这里标志已退出guest模式
vcpu
-
>mode
=
OUTSIDE_GUEST_MODE;
/
/
...
/
/
启用内核抢占
preempt_enable();
/
/
调用架构相关kvm_x86_handle_exit函数
/
/
根据具体退出原因进行处理
r
=
static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
return
r;
}
|
那调用流程就是
kvm_vcpu_ioctl --> kvm_arch_vcpu_ioctl_run
--> vcpu_run --> vcpu_enter_guest
--> static_call(kvm_x86_run)(vcpu)
在arch/x86/kvm/vmx/vmx.c line 7584
定义了一系列架构相关的操作函数
关注运行相关的
.run = vmx_vcpu_run,
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
/
/
arch
/
x86
/
kvm
/
vmx
/
vmx.c line
6628
static fastpath_t vmx_vcpu_run(struct kvm_vcpu
*
vcpu)
{
/
/
检查和准备工作
/
/
...
vmx_vcpu_enter_exit(vcpu, vmx);
/
/
...
}
/
/
arch
/
x86
/
kvm
/
vmx
/
vmx.c line
6606
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu
*
vcpu,
struct vcpu_vmx
*
vmx)
{
/
/
...
/
/
arch
/
x86
/
kvm
/
vmx
/
vmenter.S汇编
vmx
-
>fail
=
__vmx_vcpu_run(vmx, (unsigned
long
*
)&vcpu
-
>arch.regs,
vmx
-
>loaded_vmcs
-
>launched);
/
/
...
}
|
```
arch/x86/kvm/vmx/vmenter.S
1. 保存host状态
2. 加载guest状态
3. 进入guest模式: call vmx_vmenter
   cpu从ROOT模式切换至NON-ROOT模式,进入到guest的世界运行
4. 发生VM Exit时,保存guest状态,加载host状态
   cpu从NON-ROOT模式切换至ROOT模式,返回到host的世界
```
在arch/x86/kvm/vmx/vmx.c line 7584
定义了一系列架构相关的操作函数
关注退出处理相关的
.handle_exit = vmx_handle_exit,
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
|
static
int
vmx_handle_exit(struct kvm_vcpu
*
vcpu, fastpath_t exit_fastpath)
{
int
ret
=
__vmx_handle_exit(vcpu, exit_fastpath);
/
/
...
return
ret;
}
static
int
__vmx_handle_exit(struct kvm_vcpu
*
vcpu, fastpath_t exit_fastpath)
{
/
/
...
exit_handler_index
=
array_index_nospec((u16)exit_reason.basic,
kvm_vmx_max_exit_handlers);
return
kvm_vmx_exit_handlers[exit_handler_index](vcpu);
}
/
/
退出处理例程返回<
=
0
,表示异常需要到用户层qemu进行进一步处理
/
/
退出处理例程返回值>
0
,表示内核层已经处理完,可继续执行
static
int
(
*
kvm_vmx_exit_handlers[])(struct kvm_vcpu
*
vcpu)
=
{
[EXIT_REASON_EXCEPTION_NMI]
=
handle_exception_nmi,
[EXIT_REASON_EXTERNAL_INTERRUPT]
=
handle_external_interrupt,
[EXIT_REASON_TRIPLE_FAULT]
=
handle_triple_fault,
[EXIT_REASON_NMI_WINDOW]
=
handle_nmi_window,
[EXIT_REASON_IO_INSTRUCTION]
=
handle_io,
[EXIT_REASON_CR_ACCESS]
=
handle_cr,
[EXIT_REASON_DR_ACCESS]
=
handle_dr,
[EXIT_REASON_CPUID]
=
kvm_emulate_cpuid,
[EXIT_REASON_MSR_READ]
=
kvm_emulate_rdmsr,
[EXIT_REASON_MSR_WRITE]
=
kvm_emulate_wrmsr,
[EXIT_REASON_INTERRUPT_WINDOW]
=
handle_interrupt_window,
[EXIT_REASON_HLT]
=
kvm_emulate_halt,
[EXIT_REASON_INVD]
=
kvm_emulate_invd,
[EXIT_REASON_INVLPG]
=
handle_invlpg,
[EXIT_REASON_RDPMC]
=
kvm_emulate_rdpmc,
[EXIT_REASON_VMCALL]
=
kvm_emulate_hypercall,
[EXIT_REASON_VMCLEAR]
=
handle_vmx_instruction,
[EXIT_REASON_VMLAUNCH]
=
handle_vmx_instruction,
[EXIT_REASON_VMPTRLD]
=
handle_vmx_instruction,
[EXIT_REASON_VMPTRST]
=
handle_vmx_instruction,
[EXIT_REASON_VMREAD]
=
handle_vmx_instruction,
[EXIT_REASON_VMRESUME]
=
handle_vmx_instruction,
[EXIT_REASON_VMWRITE]
=
handle_vmx_instruction,
[EXIT_REASON_VMOFF]
=
handle_vmx_instruction,
[EXIT_REASON_VMON]
=
handle_vmx_instruction,
[EXIT_REASON_TPR_BELOW_THRESHOLD]
=
handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS]
=
handle_apic_access,
[EXIT_REASON_APIC_WRITE]
=
handle_apic_write,
[EXIT_REASON_EOI_INDUCED]
=
handle_apic_eoi_induced,
[EXIT_REASON_WBINVD]
=
kvm_emulate_wbinvd,
[EXIT_REASON_XSETBV]
=
kvm_emulate_xsetbv,
[EXIT_REASON_TASK_SWITCH]
=
handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY]
=
handle_machine_check,
[EXIT_REASON_GDTR_IDTR]
=
handle_desc,
[EXIT_REASON_LDTR_TR]
=
handle_desc,
[EXIT_REASON_EPT_VIOLATION]
=
handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG]
=
handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION]
=
handle_pause,
[EXIT_REASON_MWAIT_INSTRUCTION]
=
kvm_emulate_mwait,
[EXIT_REASON_MONITOR_TRAP_FLAG]
=
handle_monitor_trap,
[EXIT_REASON_MONITOR_INSTRUCTION]
=
kvm_emulate_monitor,
[EXIT_REASON_INVEPT]
=
handle_vmx_instruction,
[EXIT_REASON_INVVPID]
=
handle_vmx_instruction,
[EXIT_REASON_RDRAND]
=
kvm_handle_invalid_op,
[EXIT_REASON_RDSEED]
=
kvm_handle_invalid_op,
[EXIT_REASON_PML_FULL]
=
handle_pml_full,
[EXIT_REASON_INVPCID]
=
handle_invpcid,
[EXIT_REASON_VMFUNC]
=
handle_vmx_instruction,
[EXIT_REASON_PREEMPTION_TIMER]
=
handle_preemption_timer,
[EXIT_REASON_ENCLS]
=
handle_encls,
[EXIT_REASON_BUS_LOCK]
=
handle_bus_lock_vmexit,
};
|
1. 进入guest世界的准备工作。
2. 正式进入guest执行。
3. 根据guest退出原因进行处理:KVM先自行处理,若KVM不能完全处理,则返回到用户层由QEMU处理。
4. QEMU处理后再次通过KVM_RUN进入到内核KVM流程。
1. 《Intel® 64 and IA-32 Architectures Software Developer's Manual, Volume 3: System Programming Guide》
2. 《系统虚拟化:原理与实现》
3. 《处理器虚拟化技术》
4. https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.39.tar.xz
5. https://download.qemu.org/qemu-6.2.0.tar.xz
更多【QEMU/KVM虚拟机运行核心流程】相关视频教程:www.yxfzedu.com