9pfs KVM QEMU source code flow

This post walks through the source code of the interaction between QEMU, KVM, and Unikraft.

Overall flow

Taking a 9pfs read operation as an example, the rough call chain is:

qemu::kvm_cpu_exec ->
kvm::kvm_arch_vcpu_ioctl_run ->
unikraft::main(9p_read) ->
unikraft::outl ->
kvm::vmx_handle_exit ->
qemu::kvm_handle_io ->
qemu::virtqueue->handle_output ->
qemu::handle_9p_output ->
qemu::v9fs_read ->
unistd.h::read ->
qemu::pdu_complete ->
qemu::transport->push_and_notify ->
qemu::virtio_set_isr ->
unikraft::virtio_pci_handle ->
unikraft::virtqueue->vq_callback ->
unikraft::virtio_9p_recv ->
unikraft::uk_9preq_receive_cb ->
unikraft::main ->
...
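
Step 4 of the chain, unikraft::outl, is the guest's "kick" of the virtio device: once the 9p request descriptors are on the virtqueue, the driver writes the queue index to the legacy virtio-pci notify register, and that port write is exactly the instruction that traps into KVM. A minimal sketch of the idea (the helper name and hard-coded offset are illustrative, not verbatim Unikraft code; the exact I/O helper in Unikraft differs, but the effect is the same):

#include <stdint.h>

/* Hypothetical helper, for illustration only: "kick" the device after the
 * request has been queued. VIRTIO_PCI_QUEUE_NOTIFY is offset 0x10 in the
 * legacy virtio-pci I/O BAR; writing the queue index there is the port I/O
 * instruction that causes the VM exit traced in the chain above. */
static inline void virtio_pci_kick(uint16_t io_base, uint16_t queue_id)
{
    uint16_t port = io_base + 0x10; /* VIRTIO_PCI_QUEUE_NOTIFY */
    __asm__ __volatile__("outw %0, %1" : : "a"(queue_id), "Nd"(port));
}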

KVM side

kvm_arch_vcpu_ioctl_run

// linux::arch/x86/kvm/x86.c    QEMU's ioctl(KVM_RUN) ends up here; this is KVM's entry function
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
    struct kvm_queued_exception *ex = &vcpu->arch.exception;
    struct kvm_run *kvm_run = vcpu->run;

    ... // various preparation steps

    r = vcpu_run(vcpu); // call the vcpu_run main loop

    return r;
}


// linux::arch/x86/kvm/x86.c    the vcpu_run main loop
static int vcpu_run(struct kvm_vcpu *vcpu)
{
    ...
    for (;;) {
        /* main loop */
        vcpu->arch.at_instruction_boundary = false;
        if (kvm_vcpu_running(vcpu)) {
            // vcpu_enter_guest does the real work; r is KVM's handling result after the guest's VM exit
            r = vcpu_enter_guest(vcpu);
        } else {
            r = vcpu_block(vcpu);
        }

        if (r <= 0) // r <= 0 means KVM cannot handle it alone: break the loop and return to QEMU; otherwise keep looping
            break;

        ... // various follow-up steps
    }
    return r;
}

vcpu_enter_guest

// linux::arch/x86/kvm/x86.c    KVM's entry into the guest, i.e. the function reached after QEMU's ioctl
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
    ... // various preparation steps
    static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
    ... // more preparation

    for (;;) {
        /* main loop */
        ...
        // enter the guest via the vcpu_run callback; returning means the guest produced a VM exit
        exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
        ...
        if (unlikely(kvm_vcpu_exit_request(vcpu))) { // the VM exit needs handling, so leave the loop
            exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
            break;
        }
    }

    ... // save the guest state

    // handle the exit
    r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
    return r;
}

The two key static calls here are kvm_x86_vcpu_run and kvm_x86_handle_exit. Where are they registered?

// linux::arch/x86/kvm/x86.c    declare the static calls
#define KVM_X86_OP(func) \
    DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));


// linux::arch/x86/kvm/vmx/vmx.c    register the VMX operations in kvm_x86_ops
static struct kvm_x86_ops vmx_x86_ops __initdata = {
    .name = "kvm_intel",
    ...
    .vcpu_load = vmx_vcpu_load,
    .vcpu_put = vmx_vcpu_put,
    ...
    .vcpu_run = vmx_vcpu_run,       // run
    .handle_exit = vmx_handle_exit, // handle_exit
    ... // many more operations
};
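
How does static_call(kvm_x86_vcpu_run) end up pointing at vmx_vcpu_run? When the kvm_intel module initializes, KVM copies vmx_x86_ops into the global kvm_x86_ops and rebinds every static call to the corresponding entry. Roughly like the sketch below (the exact helper that performs this varies between kernel versions; this is not verbatim kernel code):

/* Sketch of the binding step, assuming the usual static_call machinery */
memcpy(&kvm_x86_ops, &vmx_x86_ops, sizeof(kvm_x86_ops));
static_call_update(kvm_x86_vcpu_run, kvm_x86_ops.vcpu_run);       /* -> vmx_vcpu_run */
static_call_update(kvm_x86_handle_exit, kvm_x86_ops.handle_exit); /* -> vmx_handle_exit */
/* ... one static_call_update() per KVM_X86_OP() ... */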

vmx_vcpu_run

// linux::arch/x86/kvm/vmx/vmx.c    the vcpu_run implementation
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
    struct vcpu_vmx *vmx = to_vmx(vcpu);

    // plenty of VMX-specific preparation
    ...

    /* the core call: enter the guest */
    vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx));

    /* a VM exit has occurred */

    ... // save state and other pre-exit work

    vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); // read the exit reason

    ... // more pre-exit work

    return vmx_exit_handlers_fastpath(vcpu);
}
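
vmx_exit_handlers_fastpath only short-circuits a few exit reasons (certain MSR writes and the preemption timer). An I/O exit such as our 9p notify write is not among them, so it returns EXIT_FASTPATH_NONE and the exit is handled later by vmx_handle_exit. Roughly (a sketch of arch/x86/kvm/vmx/vmx.c; details differ across kernel versions):

static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
    switch (to_vmx(vcpu)->exit_reason.basic) {
    case EXIT_REASON_MSR_WRITE:
        return handle_fastpath_set_msr_irqoff(vcpu);
    case EXIT_REASON_PREEMPTION_TIMER:
        return handle_fastpath_preemption_timer(vcpu);
    default:
        return EXIT_FASTPATH_NONE; /* e.g. I/O exits take the full path */
    }
}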

vmx_vcpu_enter_exit

// linux::arch/x86/kvm/vmx/vmx.c
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                        struct vcpu_vmx *vmx,
                                        unsigned long flags)
{
    ...
    /* call __vmx_vcpu_run */
    vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
                               flags);
    ...
}

__vmx_vcpu_run

__vmx_vcpu_run is written in assembly.

/* linux::arch/x86/kvm/vmx/vmenter.S */
.section .noinstr.text, "ax"

/**
 * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
 * @vmx:    struct vcpu_vmx *
 * @regs:   unsigned long * (to guest registers)
 * @flags:  VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH
 *          VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
 *
 * Returns:
 *  0 on VM-Exit, 1 on VM-Fail
 */
SYM_FUNC_START(__vmx_vcpu_run)
    /* lots of assembly */
    ...
    jz .Lvmlaunch
    ...
    /* resume the guest OS */
.Lvmresume:
    /* Intel VT-x instruction */
    vmresume
    jmp .Lvmfail

    /* launch the guest OS */
.Lvmlaunch:
    /* Intel VT-x instruction */
    vmlaunch
    jmp .Lvmfail
    ...
    /* lots of assembly */
    ...
SYM_FUNC_END(__vmx_vcpu_run)

vmx_handle_exit

// linux::arch/x86/kvm/vmx/vmx.c
static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
    int ret = __vmx_handle_exit(vcpu, exit_fastpath); // the real implementation

    ... // some extra work
    return ret;
}


/* linux::arch/x86/kvm/vmx/vmx.c    the handle_exit implementation
 * When the guest produces a VM exit, KVM first checks whether it can handle it
 * itself or whether it needs QEMU's help; a return value > 0 means QEMU is not needed.
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
    struct vcpu_vmx *vmx = to_vmx(vcpu);
    union vmx_exit_reason exit_reason = vmx->exit_reason;
    ... // lots of handling

    return kvm_vmx_exit_handlers[exit_handler_index](vcpu); // the concrete handler

unexpected_vmexit:
    ...
}
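
For our port write the exit reason is EXIT_REASON_IO_INSTRUCTION, so the table entry invoked is handle_io. A trimmed excerpt of the dispatch table in arch/x86/kvm/vmx/vmx.c (simplified sketch):

static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
    ...
    [EXIT_REASON_IO_INSTRUCTION] = handle_io,
    ...
};

handle_io emulates the port access (via kvm_fast_pio or instruction emulation), typically fills vcpu->run with KVM_EXIT_IO plus the data offset, and returns 0. That 0 is what makes vcpu_run() break out of its loop, so ioctl(KVM_RUN) returns and QEMU sees run->exit_reason == KVM_EXIT_IO.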

QEMU side

kvm_cpu_exec

// qemu::accel/kvm/kvm_all.c    QEMU's main vCPU loop
int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    ...
    do { // main loop
        ...
        // enter KVM's vcpu_enter_guest loop
        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
        ...
        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) { // check why KVM exited
        case KVM_EXIT_IO: // the exit was caused by I/O
            DPRINTF("handle_io\n");
            /* key handler */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case ... // other exit reasons
        }
    } while (ret == 0); // while ret == 0 keep looping, i.e. go back into KVM and re-enter the guest

    // otherwise return to the rest of QEMU
    ...
    return ret;
}


// qemu::accel/kvm/kvm_all.c    calls address_space_rw
static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
                          int size, uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        address_space_rw(&address_space_io, port, attrs,
                         ptr, size,
                         direction == KVM_EXIT_IO_OUT);
        ptr += size;
    }
}


// qemu::softmmu/physmem.c    implementation of the function called above
MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                             void *buf, hwaddr len, bool is_write)
{
    if (is_write) {
        return address_space_write(as, addr, attrs, buf, len); // the write path
    } else {
        return address_space_read_full(as, addr, attrs, buf, len);
    }
}


// qemu::softmmu/physmem.c    the write path
MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
                                MemTxAttrs attrs,
                                const void *buf, hwaddr len)
{
    MemTxResult result = MEMTX_OK;
    FlatView *fv;

    if (len > 0) {
        RCU_READ_LOCK_GUARD();
        fv = address_space_to_flatview(as); // turn the address space into a FlatView
        result = flatview_write(fv, addr, attrs, buf, len); // key function
    }

    return result;
}

// qemu::softmmu/physmem.c
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                  const void *buf, hwaddr len)
{
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;

    l = len;
    mr = flatview_translate(fv, addr, &addr1, &l, true, attrs); // translate the FlatView into a MemoryRegion
    if (!flatview_access_allowed(mr, attrs, addr, len)) {
        return MEMTX_ACCESS_ERROR;
    }
    return flatview_write_continue(fv, addr, attrs, buf, len, // key function
                                   addr1, l, mr);
}


// qemu::softmmu/physmem.c
static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
                                           MemTxAttrs attrs,
                                           const void *ptr,
                                           hwaddr len, hwaddr addr1,
                                           hwaddr l, MemoryRegion *mr)
{
    ...
    for (;;) {
        ...
        /* key function */
        result |= memory_region_dispatch_write(mr, addr1, val,
                                               size_memop(l), attrs);
        ...
    }
    return result;
}


// qemu::softmmu/memory.c
MemTxResult memory_region_dispatch_write(MemoryRegion *mr,
                                         hwaddr addr,
                                         uint64_t data,
                                         MemOp op,
                                         MemTxAttrs attrs)
{
    ...
    if (mr->ops->write) { // if the MemoryRegion's ops registered a write callback
        /* key function */
        return access_with_adjusted_size(addr, &data, size,
                                         mr->ops->impl.min_access_size,
                                         mr->ops->impl.max_access_size,
                                         memory_region_write_accessor, mr,
                                         attrs);
    }
    ...
}

// qemu::softmmu/memory.c
static MemTxResult access_with_adjusted_size(hwaddr addr,
                                             uint64_t *value,
                                             unsigned size,
                                             unsigned access_size_min,
                                             unsigned access_size_max,
                                             MemTxResult (*access_fn)
                                                (MemoryRegion *mr,
                                                 hwaddr addr,
                                                 uint64_t *value,
                                                 unsigned size,
                                                 signed shift,
                                                 uint64_t mask,
                                                 MemTxAttrs attrs),
                                             MemoryRegion *mr,
                                             MemTxAttrs attrs)
{
    ...
    if (memory_region_big_endian(mr)) { // big endian
        for (i = 0; i < size; i += access_size) {
            /* call access_fn, i.e. memory_region_write_accessor from above */
            r |= access_fn(mr, addr + i, value, access_size,
                           (size - access_size - i) * 8, access_mask, attrs);
        }
    } else { // little endian
        for (i = 0; i < size; i += access_size) {
            r |= access_fn(mr, addr + i, value, access_size, i * 8,
                           access_mask, attrs);
        }
    }
    return r;
}

// qemu::softmmu/memory.c    where write is actually invoked
static MemTxResult memory_region_write_accessor(MemoryRegion *mr,
                                                hwaddr addr,
                                                uint64_t *value,
                                                unsigned size,
                                                signed shift,
                                                uint64_t mask,
                                                MemTxAttrs attrs)
{
    uint64_t tmp = memory_region_shift_write_access(value, shift, mask);
    ...
    /* the actual call to the registered write op */
    mr->ops->write(mr->opaque, addr, tmp, size);
    return MEMTX_OK;
}

So where is this write operation registered?

virtio_pci_config_write

// qemu::hw/virtio/virtio-pci.c    register the MemoryRegion ops
static const MemoryRegionOps virtio_pci_config_ops = {
    .read = virtio_pci_config_read,
    .write = virtio_pci_config_write,
    .impl = {
        .min_access_size = 1,
        .max_access_size = 4,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};


// qemu::hw/virtio/virtio-pci.c    the write implementation
static void virtio_pci_config_write(void *opaque, hwaddr addr,
                                    uint64_t val, unsigned size)
{
    VirtIOPCIProxy *proxy = opaque;
    uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev);
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);

    ...
    if (addr < config) {
        /* virtio_ioport_write does the real work */
        virtio_ioport_write(proxy, addr, val);
        return;
    }
    ...
}

// qemu::hw/virtio/virtio-pci.c    virtio_ioport_write
static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
{
    VirtIOPCIProxy *proxy = opaque;
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
    hwaddr pa;

    switch (addr) { // the register offset tells us what kind of write this is
    case ... // various registers
    case VIRTIO_PCI_QUEUE_NOTIFY:
        if (val < VIRTIO_QUEUE_MAX) {
            /* key function */
            virtio_queue_notify(vdev, val);
        }
        break;
    case ...
    }
}


// qemu::hw/virtio/virtio.c    virtio_queue_notify
void virtio_queue_notify(VirtIODevice *vdev, int n)
{
    VirtQueue *vq = &vdev->vq[n];

    if (...) {
        ...
        /* key function */
        vq->handle_output(vdev, vq);
        ...
    }
}

The 9p backend implementation

As we can see, the chain ends by calling the VirtQueue's handle_output method. So where does 9p register this method?

virtio_9p_device

// qemu::hw/9pfs/virtio-9p-device.c    realizing the 9p device
static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
{
    ...
    virtio_init(vdev, VIRTIO_ID_9P, v->config_size);
    /* this is where the handler is registered */
    v->vq = virtio_add_queue(vdev, MAX_REQ, handle_9p_output);
}

// qemu::hw/virtio/virtio.c    the registration function
VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
                            VirtIOHandleOutput handle_output)
{
    ...
    vdev->vq[i].handle_output = handle_output; // register handle_9p_output
    ...
    return &vdev->vq[i];
}

handle_9p_output

// qemu::hw/9pfs/virtio-9p-device.c    handle_9p_output
static void handle_9p_output(VirtIODevice *vdev, VirtQueue *vq)
{
    V9fsPDU *pdu; // protocol data unit
    while (...) {
        elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); // pop an element off the virtqueue
        ...

        v->elems[pdu->idx] = elem;
        pdu_submit(pdu, &out); // key function
        ...
    }
}

// qemu::hw/9pfs/9p.c    pdu_submit
void pdu_submit(V9fsPDU *pdu, P9MsgHeader *hdr)
{
    ...
    V9fsState *s = pdu->s;
    ...
    if (...) {
        ...
    } else {
        handler = pdu_co_handlers[pdu->id]; // pick the handler based on the PDU id, i.e. the 9p message type
    }
    ...
}

// qemu::hw/9pfs/9p.c    the pdu_co_handlers table: the concrete 9p operations
static CoroutineEntry *pdu_co_handlers[] = {
    [P9_TREADDIR] = v9fs_readdir,
    [P9_TSTATFS] = v9fs_statfs,
    [P9_TGETATTR] = v9fs_getattr,
    [P9_TSETATTR] = v9fs_setattr,
    [P9_TXATTRWALK] = v9fs_xattrwalk,
    [P9_TXATTRCREATE] = v9fs_xattrcreate,
    [P9_TMKNOD] = v9fs_mknod,
    [P9_TRENAME] = v9fs_rename,
    [P9_TLOCK] = v9fs_lock,
    [P9_TGETLOCK] = v9fs_getlock,
    [P9_TRENAMEAT] = v9fs_renameat,
    [P9_TREADLINK] = v9fs_readlink,
    [P9_TUNLINKAT] = v9fs_unlinkat,
    [P9_TMKDIR] = v9fs_mkdir,
    [P9_TVERSION] = v9fs_version,
    [P9_TLOPEN] = v9fs_open,
    [P9_TATTACH] = v9fs_attach,
    [P9_TSTAT] = v9fs_stat,
    [P9_TWALK] = v9fs_walk,
    [P9_TCLUNK] = v9fs_clunk,
    [P9_TFSYNC] = v9fs_fsync,
    [P9_TOPEN] = v9fs_open,
    [P9_TREAD] = v9fs_read,
#if 0
    [P9_TAUTH] = v9fs_auth,
#endif
    [P9_TFLUSH] = v9fs_flush,
    [P9_TLINK] = v9fs_link,
    [P9_TSYMLINK] = v9fs_symlink,
    [P9_TCREATE] = v9fs_create,
    [P9_TLCREATE] = v9fs_lcreate,
    [P9_TWRITE] = v9fs_write,
    [P9_TWSTAT] = v9fs_wstat,
    [P9_TREMOVE] = v9fs_remove,
};
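
pdu_submit does not call the chosen handler directly: it wraps it in a coroutine, which is why every handler in the table is declared coroutine_fn. The tail of pdu_submit looks roughly like this (simplified sketch):

/* Run the chosen handler (v9fs_read for P9_TREAD) as a coroutine so that it
 * can yield while waiting on host I/O without blocking QEMU's main loop. */
co = qemu_coroutine_create(handler, pdu);
qemu_coroutine_enter(co);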

v9fs_read implementation

// qemu::hw/9pfs/9p.c    the read implementation
static void coroutine_fn v9fs_read(void *opaque)
{
    int32_t fid;
    uint64_t off;
    ssize_t err = 0;
    int32_t count = 0;
    size_t offset = 7;
    uint32_t max_count;
    V9fsFidState *fidp;
    V9fsPDU *pdu = opaque;
    V9fsState *s = pdu->s;

    err = pdu_unmarshal(pdu, offset, "dqd", &fid, &off, &max_count); // unmarshal the request fields
    ...
    do {
        len = v9fs_co_preadv(pdu, fidp, qiov.iov, qiov.niov, off); // key call
        if (len >= 0) {
            off += len;
            count += len;
        }
    } while (len == -EINTR && !pdu->cancelled);
    ...

    /* once everything is done, notify the guest OS */
    pdu_complete(pdu, err); // notification
}

// qemu::hw/9pfs/cofile.c
int coroutine_fn v9fs_co_preadv(V9fsPDU *pdu, V9fsFidState *fidp,
                                struct iovec *iov, int iovcnt, int64_t offset)
{
    int err;
    V9fsState *s = pdu->s;
    ...
    err = s->ops->preadv(&s->ctx, &fidp->fs, iov, iovcnt, offset); // key call
    ...
    return err;
}


// qemu::hw/9pfs/9p-local.c    one concrete preadv implementation (there are two others)
FileOperations local_ops = {
    ... // many operations
    .open = local_open,
    .opendir = local_opendir,
    .rewinddir = local_rewinddir,
    .telldir = local_telldir,
    .readdir = local_readdir,
    .seekdir = local_seekdir,
    /* here */
    .preadv = local_preadv,
    .pwritev = local_pwritev,
    .chmod = local_chmod,
    .mknod = local_mknod,
    .mkdir = local_mkdir,
    ... // many operations
};


// qemu::hw/9pfs/9p-local.c    local_preadv
static ssize_t local_preadv(FsContext *ctx, V9fsFidOpenState *fs,
                            const struct iovec *iov,
                            int iovcnt, off_t offset)
{
#ifdef CONFIG_PREADV
    return preadv(fs->fd, iov, iovcnt, offset); // call preadv
#else
    int err = lseek(fs->fd, offset, SEEK_SET);
    if (err == -1) {
        return err;
    } else {
        return readv(fs->fd, iov, iovcnt); // call readv
    }
#endif
}


// qemu::util/osdep.c    readv implementation
ssize_t
readv(int fd, const struct iovec *iov, int iov_cnt)
{
    return readv_writev(fd, iov, iov_cnt, false); // call readv_writev
}


// qemu::util/osdep.c    readv_writev
static ssize_t
readv_writev(int fd, const struct iovec *iov, int iov_cnt, bool do_write)
{
    ...
    while (i < iov_cnt) {
        ssize_t r = do_write
            ? write(fd, iov[i].iov_base + off, iov[i].iov_len - off)
            // the POSIX read() system call on the host
            : read(fd, iov[i].iov_base + off, iov[i].iov_len - off);
        ...
    }
    return ret;
}

pdu_complete: raising the interrupt

// qemu::hw/9pfs/9p.c    raise the interrupt once the operation has completed
static void coroutine_fn pdu_complete(V9fsPDU *pdu, ssize_t len)
{
    ...
    /* key function: push_and_notify */
    pdu->s->transport->push_and_notify(pdu);
    ...
}


// qemu::hw/9pfs/virtio-9p-device.c    registering push_and_notify
static const V9fsTransport virtio_9p_transport = {
    .pdu_vmarshal = virtio_pdu_vmarshal,
    .pdu_vunmarshal = virtio_pdu_vunmarshal,
    .init_in_iov_from_pdu = virtio_init_in_iov_from_pdu,
    .init_out_iov_from_pdu = virtio_init_out_iov_from_pdu,
    .push_and_notify = virtio_9p_push_and_notify,
};


// qemu::hw/9pfs/virtio-9p-device.c    the implementation
static void virtio_9p_push_and_notify(V9fsPDU *pdu)
{
    V9fsState *s = pdu->s;
    V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
    VirtQueueElement *elem = v->elems[pdu->idx];

    /* push the completed element back onto the virtqueue and raise an interrupt to notify the guest OS */
    virtqueue_push(v->vq, elem, pdu->size);
    g_free(elem);
    v->elems[pdu->idx] = NULL;

    virtio_notify(VIRTIO_DEVICE(v), v->vq); // notification
}


// qemu::hw/virtio/virtio.c
void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
{
    WITH_RCU_READ_LOCK_GUARD() {
        if (!virtio_should_notify(vdev, vq)) {
            return;
        }
    }
    virtio_irq(vq); // raise the interrupt
}


// qemu::hw/virtio/virtio.c
static void virtio_irq(VirtQueue *vq)
{
    virtio_set_isr(vq->vdev, 0x1); // set the ISR
    virtio_notify_vector(vq->vdev, vq->vector);
}


// qemu::hw/virtio/virtio.c    set the ISR
static void virtio_set_isr(VirtIODevice *vdev, int value)
{
    uint8_t old = qatomic_read(&vdev->isr);
    if ((old & value) != value) {
        qatomic_or(&vdev->isr, value); // write the interrupt status
    }
}
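
The second call in virtio_irq, virtio_notify_vector, is what actually delivers the interrupt to the guest. For a virtio-pci device it ends up in virtio_pci_notify, roughly like the sketch below (simplified; with MSI-X disabled, the ISR bit set above drives the legacy INTx line, which KVM injects into the guest so that Unikraft's interrupt handler runs):

// Rough sketch of qemu::hw/virtio/virtio-pci.c (simplified)
static void virtio_pci_notify(DeviceState *d, uint16_t vector)
{
    VirtIOPCIProxy *proxy = to_virtio_pci_proxy_fast(d);

    if (msix_enabled(&proxy->pci_dev)) {
        msix_notify(&proxy->pci_dev, vector);                       // MSI-X interrupt
    } else {
        VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
        pci_set_irq(&proxy->pci_dev, qatomic_read(&vdev->isr) & 1); // legacy INTx
    }
}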

Unikraft side

virtio_pci_handle: receiving the interrupt


// unikraft::plat/drivers/virtio/virtio_pci.c    handle the interrupt
static int virtio_pci_handle(void *arg)
{
    ...
    /* read the interrupt status */
    isr_status = virtio_cread8((void *)(unsigned long)d->pci_isr_addr, 0);
    ...

    if (isr_status & VIRTIO_PCI_ISR_HAS_INTR) { // there is an interrupt pending
        UK_TAILQ_FOREACH(vq, &d->vdev.vqs, next) { // walk the virtqueues to find the ones QEMU has processed
            rc |= virtqueue_ring_interrupt(vq); // handle the interrupt
        }
    }
    return rc;
}

// unikraft::plat/drivers/virtio/virtio_ring.c    handle the interrupt
int virtqueue_ring_interrupt(void *obj)
{
    struct virtqueue *vq = (struct virtqueue *)obj;
    ...
    if (likely(vq->vq_callback))
        rc = vq->vq_callback(vq, vq->priv); // invoke the callback, i.e. the 9p-specific function
    return rc;
}

Where is this callback registered? It is registered when the virtqueue is created.

Registering the callback

// unikraft::plat/drivers/virtio/virtio_ring.c    register the callback
struct virtqueue *virtqueue_create(__u16 queue_id, __u16 nr_descs, __u16 align,
                                   virtqueue_callback_t callback,
                                   virtqueue_notify_host_t notify,
                                   struct virtio_dev *vdev, struct uk_alloc *a)
{
    ... // some setup
    vq = &vrq->vq;
    vq->queue_id = queue_id;
    vq->vdev = vdev;
    /* registered here */
    vq->vq_callback = callback;
    vq->vq_notify_host = notify;
    return vq;
}



// unikraft::plat/drivers/virtio/virtio_9p.c    this is where the vq is created
static int virtio_9p_vq_alloc(struct virtio_9p_device *d)
{
    ...
    /* this setup function ends up calling virtqueue_create above */
    d->vq = virtio_vqueue_setup(d->vdev,
                                d->hwvq_id,
                                NUM_SEGMENTS,
                                virtio_9p_recv, // this is the callback
                                a);
    ...

    return rc;
}


// unikraft::plat/drivers/virtio/virtio_9p.c    the recv callback
static int virtio_9p_recv(struct virtqueue *vq, void *priv)
{
    struct virtio_9p_device *dev;
    struct uk_9preq *req = NULL;
    uint32_t len;
    int rc = 0;
    int handled = 0;
    ...
    while (1) {
        ...
        /* dequeue the completed buffer from the virtqueue */
        rc = virtqueue_buffer_dequeue(dev->vq, (void **)&req, &len);
        ...

        uk_9preq_receive_cb(req, len); // the real work

        ...
        handled = 1;
        ...

    }
    return handled;
}



// unikraft::lib/uk9p/9preq.c    the real work
int uk_9preq_receive_cb(struct uk_9preq *req, uint32_t recv_size)
{
    uint32_t size;
    uint16_t tag;
    int rc;
    ...

    /* deserialize according to the 9p protocol */
    req->recv.offset = 0;
    req->recv.size = recv_size;
    if ((rc = uk_9preq_read32(req, &size)) < 0 ||
        (rc = uk_9preq_read8(req, &req->recv.type)) < 0 ||
        (rc = uk_9preq_read16(req, &tag)) < 0)
        return rc;

    ...
    /* update the request state */
    UK_WRITE_ONCE(req->state, UK_9PREQ_RECEIVED);
    ...
    return 0;
}
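
Setting req->state to UK_9PREQ_RECEIVED is what wakes up the original caller: the thread that issued the 9p read has been sleeping on the request state since it kicked the device, and once it observes UK_9PREQ_RECEIVED it unmarshals the Rread payload and returns the data to the application, closing the loop back to unikraft::main in the flow at the top. A conceptual sketch of that wait (the helper name is illustrative, not Unikraft's real API; the real implementation uses a wait queue rather than polling):

/* Hypothetical helper, for illustration only */
static int wait_for_9p_reply(struct uk_9preq *req)
{
    /* Woken by uk_9preq_receive_cb() flipping req->state; shown here as a
     * polling loop for brevity. */
    while (UK_READ_ONCE(req->state) != UK_9PREQ_RECEIVED)
        ;
    return 0;
}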