uk_dpdk (driver part)

The uk_dpdk/libs directory

├─librte_eal        // EAL-layer functions
│ └─include
├─librte_ethdev     // Makefile.uk only
├─librte_hash       // Makefile.uk only
├─librte_ip_frag    // Makefile.uk only
├─librte_kvargs     // Makefile.uk only
├─librte_mbuf       // Makefile.uk only
├─librte_mempool    // Makefile.uk only
├─librte_meter      // Makefile.uk only
├─librte_net        // Makefile.uk only
├─librte_ring       // ring operations
├─librte_uknetbuf   // netbuf operations
└─libuk_pmd
  └─include
    └─uk

Overview of the data structures

rte_device

This structure is the DPDK-side abstraction that describes a generic device at runtime.

struct rte_device {
// next device in the linked list
RTE_TAILQ_ENTRY(rte_device) next; /**< Next device */
// device name
const char *name; /**< Device name */
// driver bound to the device
const struct rte_driver *driver; /**< Driver assigned after probing */
// handle to the bus the device sits on
const struct rte_bus *bus; /**< Bus handle assigned on scan */
// NUMA node the device is attached to
int numa_node; /**< NUMA node connection */
// device arguments from the most recent probe
struct rte_devargs *devargs; /**< Arguments for latest probing */
};

rte_mbuf

An mbuf is the structure DPDK uses to hold packet data, analogous to Unikraft's netbuf.

(figure: rte_mbuf structure layout)

struct rte_mbuf {
RTE_MARKER cacheline0;

void *buf_addr; /**< Virtual address of segment buffer. */
/**
* Physical address of segment buffer.
* Force alignment to 8-bytes, so as to ensure we have the exact
* same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
* working on vector drivers easier.
*/
rte_iova_t buf_iova __rte_aligned(sizeof(rte_iova_t));

/* next 8 bytes are initialised on RX descriptor rearm */
RTE_MARKER64 rearm_data;
uint16_t data_off;

/**
* Reference counter. Its size should at least equal to the size
* of port field (16 bits), to support zero-copy broadcast.
* It should only be accessed using the following functions:
* rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
* rte_mbuf_refcnt_set(). The functionality of these functions (atomic,
* or non-atomic) is controlled by the RTE_MBUF_REFCNT_ATOMIC flag.
*/
uint16_t refcnt;

/**
* Number of segments. Only valid for the first segment of an mbuf
* chain.
*/
uint16_t nb_segs;

/** Input port (16 bits to support more than 256 virtual ports).
* The event eth Tx adapter uses this field to specify the output port.
*/
uint16_t port;

uint64_t ol_flags; /**< Offload features. */

/* remaining bytes are set on RX when pulling packet from descriptor */
RTE_MARKER rx_descriptor_fields1;

/*
* The packet type, which is the combination of outer/inner L2, L3, L4
* and tunnel types. The packet_type is about data really present in the
* mbuf. Example: if vlan stripping is enabled, a received vlan packet
* would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
* vlan is stripped from the data.
*/
RTE_STD_C11
union {
uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
__extension__
struct {
uint8_t l2_type:4; /**< (Outer) L2 type. */
uint8_t l3_type:4; /**< (Outer) L3 type. */
uint8_t l4_type:4; /**< (Outer) L4 type. */
uint8_t tun_type:4; /**< Tunnel type. */
RTE_STD_C11
union {
uint8_t inner_esp_next_proto;
/**< ESP next protocol type, valid if
* RTE_PTYPE_TUNNEL_ESP tunnel type is set
* on both Tx and Rx.
*/
__extension__
struct {
uint8_t inner_l2_type:4;
/**< Inner L2 type. */
uint8_t inner_l3_type:4;
/**< Inner L3 type. */
};
};
uint8_t inner_l4_type:4; /**< Inner L4 type. */
};
};

uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
uint16_t data_len; /**< Amount of data in segment buffer. */
/** VLAN TCI (CPU order), valid if RTE_MBUF_F_RX_VLAN is set. */
uint16_t vlan_tci;

RTE_STD_C11
union {
union {
uint32_t rss; /**< RSS hash result if RSS enabled */
struct {
union {
struct {
uint16_t hash;
uint16_t id;
};
uint32_t lo;
/**< Second 4 flexible bytes */
};
uint32_t hi;
/**< First 4 flexible bytes or FD ID, dependent
* on RTE_MBUF_F_RX_FDIR_* flag in ol_flags.
*/
} fdir; /**< Filter identifier if FDIR enabled */
struct rte_mbuf_sched sched;
/**< Hierarchical scheduler : 8 bytes */
struct {
uint32_t reserved1;
uint16_t reserved2;
uint16_t txq;
/**< The event eth Tx adapter uses this field
* to store Tx queue id.
* @see rte_event_eth_tx_adapter_txq_set()
*/
} txadapter; /**< Eventdev ethdev Tx adapter */
/**< User defined tags. See rte_distributor_process() */
uint32_t usr;
} hash; /**< hash information */
};

/** Outer VLAN TCI (CPU order), valid if RTE_MBUF_F_RX_QINQ is set. */
uint16_t vlan_tci_outer;

uint16_t buf_len; /**< Length of segment buffer. */

struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */

/* second cache line - fields only used in slow path or on TX */
RTE_MARKER cacheline1 __rte_cache_min_aligned;

/**
* Next segment of scattered packet. Must be NULL in the last segment or
* in case of non-segmented packet.
*/
struct rte_mbuf *next;

/* fields to support TX offloads */
RTE_STD_C11
union {
uint64_t tx_offload; /**< combined for easy fetch */
__extension__
struct {
uint64_t l2_len:RTE_MBUF_L2_LEN_BITS;
/**< L2 (MAC) Header Length for non-tunneling pkt.
* Outer_L4_len + ... + Inner_L2_len for tunneling pkt.
*/
uint64_t l3_len:RTE_MBUF_L3_LEN_BITS;
/**< L3 (IP) Header Length. */
uint64_t l4_len:RTE_MBUF_L4_LEN_BITS;
/**< L4 (TCP/UDP) Header Length. */
uint64_t tso_segsz:RTE_MBUF_TSO_SEGSZ_BITS;
/**< TCP TSO segment size */

/*
* Fields for Tx offloading of tunnels.
* These are undefined for packets which don't request
* any tunnel offloads (outer IP or UDP checksum,
* tunnel TSO).
*
* PMDs should not use these fields unconditionally
* when calculating offsets.
*
* Applications are expected to set appropriate tunnel
* offload flags when they fill in these fields.
*/
uint64_t outer_l3_len:RTE_MBUF_OUTL3_LEN_BITS;
/**< Outer L3 (IP) Hdr Length. */
uint64_t outer_l2_len:RTE_MBUF_OUTL2_LEN_BITS;
/**< Outer L2 (MAC) Hdr Length. */

/* uint64_t unused:RTE_MBUF_TXOFLD_UNUSED_BITS; */
};
};

/** Shared data for external buffer attached to mbuf. See
* rte_pktmbuf_attach_extbuf().
*/
struct rte_mbuf_ext_shared_info *shinfo;

/** Size of the application private data. In case of an indirect
* mbuf, it stores the direct mbuf private data size.
*/
uint16_t priv_size;

/** Timesync flags for use with IEEE1588. */
uint16_t timesync;

uint32_t dynfield1[9]; /**< Reserved for dynamic fields. */
} __rte_cache_aligned;
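Most of these fields are filled in by the PMD on the receive path; an application normally just allocates mbufs from a mempool, appends data, and frees them again. Below is a minimal usage sketch, assuming a pktmbuf pool mp already created with rte_pktmbuf_pool_create(); the helper name mbuf_example is made up.

#include <string.h>
#include <rte_mbuf.h>

static void mbuf_example(struct rte_mempool *mp)
{
	struct rte_mbuf *m = rte_pktmbuf_alloc(mp); /* take one mbuf from the pool */
	char *payload;

	if (m == NULL)
		return;

	/* append 64 bytes of payload; this grows data_len and pkt_len */
	payload = rte_pktmbuf_append(m, 64);
	if (payload != NULL)
		memset(payload, 0, 64);

	/* rte_pktmbuf_mtod() returns buf_addr + data_off, i.e. the start of the data */
	(void)rte_pktmbuf_mtod(m, void *);

	rte_pktmbuf_free(m); /* drops refcnt and returns the mbuf to its pool */
}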

rte_fbarray

struct rte_fbarray {
char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */
unsigned int count; /**< number of entries stored */
unsigned int len; /**< current length of the array */
unsigned int elt_sz; /**< size of each element */
void *data; /**< data pointer */
rte_rwlock_t rwlock; /**< multiprocess lock */
};

(figure: rte_fbarray structure layout)
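rte_fbarray is the EAL's resizable array of fixed-size elements; the memzone list initialized later in rte_eal_memzone_init lives in one of these. A small sketch of the core API follows, with an illustrative element type that is not part of DPDK:

#include <rte_fbarray.h>

struct my_elem { int value; }; /* illustrative element type */

static int fbarray_example(void)
{
	struct rte_fbarray arr;
	struct my_elem *e;
	int idx;

	/* room for up to 64 elements of sizeof(struct my_elem) each */
	if (rte_fbarray_init(&arr, "example", 64, sizeof(struct my_elem)) < 0)
		return -1;

	/* find a free slot, write to it and mark it as used */
	idx = rte_fbarray_find_next_free(&arr, 0);
	if (idx >= 0) {
		e = rte_fbarray_get(&arr, idx);
		e->value = 42;
		rte_fbarray_set_used(&arr, idx);
	}

	rte_fbarray_destroy(&arr); /* tear the array down again */
	return 0;
}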

Initialization

app/dumpcap.c/dpdk_init

Taking app/dumpcap as an example, the application's main function calls dpdk_init to initialize DPDK. Since rte_eal_init is invoked with argc/argv-style parameters, these two arguments have to be constructed and passed in.

static void dpdk_init(void)
{
// arguments to pass to the EAL
static const char * const args[] = {
"dumpcap", "--proc-type", "secondary",
"--log-level", "notice"

};
// RTE_DIM is a macro that computes the number of elements in an array
const int eal_argc = RTE_DIM(args);
char **eal_argv;
unsigned int i;

/* DPDK API requires mutable versions of command line arguments. */
// allocate memory for the argument array
eal_argv = calloc(eal_argc + 1, sizeof(char *));
if (eal_argv == NULL)
rte_panic("No memory\n");
// copy the arguments
eal_argv[0] = strdup(progname);
for (i = 1; i < RTE_DIM(args); i++)
eal_argv[i] = strdup(args[i]);
// pass the arguments in and initialize the EAL
if (rte_eal_init(eal_argc, eal_argv) < 0)
rte_exit(EXIT_FAILURE, "EAL init failed: is primary process running?\n");
// query the number of available ports
if (rte_eth_dev_count_avail() == 0)
rte_exit(EXIT_FAILURE, "No Ethernet ports found\n");
}
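For reference, RTE_DIM is defined in rte_common.h as the usual array-element-count macro:

/* from rte_common.h */
#define RTE_DIM(a)	(sizeof(a) / sizeof((a)[0]))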

uk-dpdk/libs/librte_eal/uk_eal.c/rte_eal_init

(figure: init_dpdk call flow)

Here the string arguments passed in are converted into fields of the internal_config structure and the EAL-layer configuration is completed; finally, eal_uknetdev_init is called to register every uk_netdev as an eth_dev.

int rte_eal_init(int argc, char **argv)
{
int rc;
// reset the internal configuration
eal_reset_internal_config(&internal_config);
// fetch CPU information and fill in the cpu_info structure
// location: dpdk/lib/eal/common/eal_common_lcore.c
rc = rte_eal_cpu_init();
if (rc < 0) {
uk_pr_err("Failed to initialize the CPU\n");
return rc;
}

uk_pr_debug("initialized the cpu_init\n");
internal_config.no_shconf = 1;
internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
internal_config.legacy_mem = 1;

/**
* Parse the argument.
* Convert the string arguments into internal_config fields.
*/
rc = eal_parse_args(argc, argv);
if (rc < 0) {
uk_pr_err("Failed to parse the argument to library\n");
return rc;
}

uk_pr_debug("parsed argument\n");

/**
* Process the arguments on the device
* On Unikraft this function has no real effect.
*/
rc = eal_option_device_parse();
if (rc < 0) {
uk_pr_err("Failed to parse the device\n");
return rc;
}
uk_pr_debug("dev_args parsed\n");
// initialize the configuration if this is the primary process
rte_config_init();

/**
* Configure the heap based on the huge page information.
* Set up the huge-page information.
*/
rc = eal_hugepage_info_init();
if (rc < 0) {
uk_pr_err("Failed to fetch hugetable info\n");
return rc;
}

/**
* Memzone initialization configure the fbarray.
* Initialize the memzone subsystem.
*/
rc = rte_eal_memzone_init();
if (rc < 0) {
uk_pr_err("Failed to initialize the memory zone\n");
return rc;
}

/**
* TODO:
* Check if we need changes to configure
* - memseg
* - memalloc
* Initialize the memory subsystem.
*/
rc = rte_eal_memory_init();
if (rc < 0) {
uk_pr_err("Failed to initialize the memory\n");
return rc;
}
// initialize the malloc heap
rc = rte_eal_malloc_heap_init();
if (rc < 0) {
uk_pr_err("Failed to initialize heap\n");
return rc;
}
// initialize the tail queues
if (rte_eal_tailqs_init() < 0) {
uk_pr_err("Cannot init tail queues for objects\n");
rte_errno = EFAULT;
return -1;
}
// initialize the master thread and set its CPU affinity
eal_thread_init_master(rte_config.master_lcore);

rc = eal_uknetdev_init();
if (rc < 0) {
uk_pr_err("Failed(%d) to initializes the netdevice\n", rc);
return rc;
}
return 0;
}

eal_reset_internal_config

TODO

void
eal_reset_internal_config(struct internal_config *internal_cfg)
{
int i;

internal_cfg->memory = 0;
internal_cfg->force_nrank = 0;
internal_cfg->force_nchannel = 0;
internal_cfg->hugefile_prefix = NULL;
internal_cfg->hugepage_dir = NULL;
internal_cfg->force_sockets = 0;
/* zero out the NUMA config */
for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
internal_cfg->socket_mem[i] = 0;
internal_cfg->force_socket_limits = 0;
/* zero out the NUMA limits config */
for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
internal_cfg->socket_limit[i] = 0;
/* zero out hugedir descriptors */
for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) {
memset(&internal_cfg->hugepage_info[i], 0,
sizeof(internal_cfg->hugepage_info[0]));
internal_cfg->hugepage_info[i].lock_descriptor = -1;
}
internal_cfg->base_virtaddr = 0;

#ifdef LOG_DAEMON
internal_cfg->syslog_facility = LOG_DAEMON;
#endif

/* if set to NONE, interrupt mode is determined automatically */
internal_cfg->vfio_intr_mode = RTE_INTR_MODE_NONE;
memset(internal_cfg->vfio_vf_token, 0,
sizeof(internal_cfg->vfio_vf_token));

#ifdef RTE_LIBEAL_USE_HPET
internal_cfg->no_hpet = 0;
#else
internal_cfg->no_hpet = 1;
#endif
internal_cfg->vmware_tsc_map = 0;
internal_cfg->create_uio_dev = 0;
internal_cfg->iova_mode = RTE_IOVA_DC;
internal_cfg->user_mbuf_pool_ops_name = NULL;
CPU_ZERO(&internal_cfg->ctrl_cpuset);
internal_cfg->init_complete = 0;
internal_cfg->max_simd_bitwidth.bitwidth = RTE_VECT_DEFAULT_SIMD_BITWIDTH;
internal_cfg->max_simd_bitwidth.forced = 0;
}

rte_eal_cpu_init

TODO

int
rte_eal_cpu_init(void)
{
/* pointer to global configuration */
struct rte_config *config = rte_eal_get_configuration();
unsigned lcore_id;
unsigned count = 0;
unsigned int socket_id, prev_socket_id;
int lcore_to_socket_id[RTE_MAX_LCORE];

/*
* Parse the maximum set of logical cores, detect the subset of running
* ones and enable them by default.
*/
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
lcore_config[lcore_id].core_index = count;

/* init cpuset for per lcore config */
CPU_ZERO(&lcore_config[lcore_id].cpuset);

/* find socket first */
socket_id = eal_cpu_socket_id(lcore_id);
lcore_to_socket_id[lcore_id] = socket_id;

if (eal_cpu_detected(lcore_id) == 0) {
config->lcore_role[lcore_id] = ROLE_OFF;
lcore_config[lcore_id].core_index = -1;
continue;
}

/* By default, lcore 1:1 map to cpu id */
CPU_SET(lcore_id, &lcore_config[lcore_id].cpuset);

/* By default, each detected core is enabled */
config->lcore_role[lcore_id] = ROLE_RTE;
lcore_config[lcore_id].core_role = ROLE_RTE;
lcore_config[lcore_id].core_id = eal_cpu_core_id(lcore_id);
lcore_config[lcore_id].socket_id = socket_id;
RTE_LOG(DEBUG, EAL, "Detected lcore %u as "
"core %u on socket %u\n",
lcore_id, lcore_config[lcore_id].core_id,
lcore_config[lcore_id].socket_id);
count++;
}
for (; lcore_id < CPU_SETSIZE; lcore_id++) {
if (eal_cpu_detected(lcore_id) == 0)
continue;
RTE_LOG(DEBUG, EAL, "Skipped lcore %u as core %u on socket %u\n",
lcore_id, eal_cpu_core_id(lcore_id),
eal_cpu_socket_id(lcore_id));
}

/* Set the count of enabled logical cores of the EAL configuration */
config->lcore_count = count;
RTE_LOG(DEBUG, EAL,
"Maximum logical cores by configuration: %u\n",
RTE_MAX_LCORE);
RTE_LOG(INFO, EAL, "Detected CPU lcores: %u\n", config->lcore_count);

/* sort all socket id's in ascending order */
qsort(lcore_to_socket_id, RTE_DIM(lcore_to_socket_id),
sizeof(lcore_to_socket_id[0]), socket_id_cmp);

prev_socket_id = -1;
config->numa_node_count = 0;
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
socket_id = lcore_to_socket_id[lcore_id];
if (socket_id != prev_socket_id)
config->numa_nodes[config->numa_node_count++] =
socket_id;
prev_socket_id = socket_id;
}
RTE_LOG(INFO, EAL, "Detected NUMA nodes: %u\n", config->numa_node_count);

return 0;
}

eal_parse_args

TODO

static int eal_parse_args(int argc, char **argv)
{
int rc;
char **argvopt;
int option_index = 0;
char *prgname = argv[0];
const int old_optind = optind;
const int old_optopt = optopt;
char * const old_optarg = optarg;
int opt;

argvopt = argv;
optind = 1;
#if !defined(CONFIG_HUGEPAGE_DIR) && !defined(CONFIG_HUGEPAGE_FILE)
internal_config.no_hugetlbfs = 1;
#endif /* !CONFIG_HUGEPAGE_DIR && !CONFIG_HUGEPAGE_FILE */
while ((opt = getopt_long(argc, argvopt, eal_short_options,
eal_long_options, &option_index)) >= 0) {
/* getopt is not happy, stop right now */
if (opt == '?') {
uk_pr_err("Get opt long is not happy\n");
eal_usage(prgname);
rc = -1;
goto out;
}

printf("Opt: %d and optarg: %s\n", opt, optarg);

switch (opt) {
case 'h':
eal_usage(prgname);
exit(EXIT_SUCCESS);
case 'd':
RTE_LOG(ERR, EAL, "Unikraft does not support loading modules\n");
break;
case OPT_PROC_TYPE_NUM:
internal_config.process_type = RTE_PROC_PRIMARY;
break;
#ifdef CONFIG_HUGEPAGE_DIR
case OPT_HUGE_DIR_NUM:
internal_config.hugepage_dir = strdup(optarg);
break;
#endif /* CONFIG_HUGEPAGE_DIR */
#ifdef CONFIG_HUGEPAGE_FILE
case OPT_FILE_PREFIX_NUM:
internal_config.hugefile_prefix = strdup(optarg);
break;
#endif /* CONFIG_HUGEPAGE_FILE */
#ifdef CONFIG_SOCKET_MEM_NUM
case OPT_SOCKET_MEM_NUM:
if (eal_parse_socket_mem(optarg) < 0) {
RTE_LOG(ERR, EAL, "invalid parameters for --"
OPT_SOCKET_MEM "\n");
eal_usage(prgname);
rc = -1;
goto out;
}
break;
#endif /* CONFIG_SOCKET_MEM_NUM */
#ifdef CONFIG_VIRTADDR_NUM
case OPT_BASE_VIRTADDR_NUM:
if (eal_parse_base_virtaddr(optarg) < 0) {
RTE_LOG(ERR, EAL, "invalid parameter for --"
OPT_BASE_VIRTADDR "\n");
eal_usage(prgname);
rc = -1;
goto out;
}
break;
#endif /* CONFIG_VIRTADDR_NUM */
#ifdef CONFIG_VFIO_INTR_NUM
case OPT_VFIO_INTR_NUM:
if (eal_parse_vfio_intr(optarg) < 0) {
RTE_LOG(ERR, EAL, "invalid parameters for --"
OPT_VFIO_INTR "\n");
eal_usage(prgname);
rc = -1;
goto out;
}
break;
#endif /* CONFIG_VFIO_INTR_NUM */

#ifdef CONFIG_UIO_DEV
case OPT_CREATE_UIO_DEV_NUM:
internal_config.create_uio_dev = 1;
break;
#endif /* CONFIG_UIO_DEV_NUM */

#ifdef CONFIG_MBUF_POOL_OPS_NAME
case OPT_MBUF_POOL_OPS_NAME_NUM:
internal_config.mbuf_pool_ops_name = optarg;
break;
#endif /* CONFIG_MBUF_POOL_OPS_NAME */

default:
/**
* Parse common options later to enable overriding
* default common options
*/
rc = eal_parse_common_option(opt, optarg,
&internal_config);

/* common parser handled this option */
if (rc == 0)
continue;
/* common parser is not happy */
if (rc < 0) {
eal_usage(prgname);
rc = -1;
goto out;
} else if (opt < OPT_LONG_MIN_NUM && isprint(opt))
uk_pr_err("Option %c is not supported on Unikraft\n",
opt);
else if (opt >= OPT_LONG_MIN_NUM &&
opt < OPT_LONG_MAX_NUM)
uk_pr_err("Option %s is not supported on Unikraft\n",
eal_long_options[option_index].name);
else
uk_pr_err( "Option %d is not supported on Unikraft\n",
opt);

eal_usage(prgname);
rc = -1;
goto out;
}
}
if (eal_adjust_config(&internal_config) != 0) {
rc = -1;
goto out;
}

/* sanity checks */
if (eal_check_common_options(&internal_config) != 0) {
eal_usage(prgname);
rc = -1;
goto out;
}

if (optind >= 0)
argv[optind-1] = prgname;
rc = optind-1;

out:
/* restore getopt lib */
optind = old_optind;
optopt = old_optopt;
optarg = old_optarg;

return rc;
}

eal_option_device_parse

int eal_option_device_parse(void){
struct device_option *devopt;
void *tmp;
int ret = 0;
RTE_TAILQ_FOREACH_SAFE(devopt, &devopt_list, next, tmp) {
if (ret == 0) {
// on Unikraft this function simply returns 0
ret = rte_devargs_add(devopt->type, devopt->arg);
if (ret)
RTE_LOG(ERR, EAL, "Unable to parse device '%s'\n",
devopt->arg);
}
TAILQ_REMOVE(&devopt_list, devopt, next);
free(devopt);
}
return ret;
}

rte_config_init

static void rte_config_init(void){
rte_config.process_type = internal_config.process_type;
switch (rte_config.process_type){
case RTE_PROC_PRIMARY:
uk_pr_debug("Initializing the primary process config\n");
rte_eal_config_create();
break;
default:
uk_pr_err("Process type(%d) not supported\n",
rte_config.process_type);
}
}

eal_hugepage_info_init

/**
 * RTE_PGSIZE_2M = 1ULL << 21 = 2 MB
 * ULL = unsigned long long
 * (2^10 bytes = 1 KB, 2^20 bytes = 1 MB, so 1ULL << 21 = 2 MB)
 */
static uint32_t huge_page_size = RTE_PGSIZE_2M;
int eal_hugepage_info_init(void){
uint32_t nr_page;
int left_mem = 0;
struct hugepage_info *hpi;
hpi = &internal_config.hugepage_info[0];
// query the remaining free memory
left_mem = uk_alloc_availmem(uk_alloc_get_default());
if (left_mem < 0) {
uk_pr_err("Failed to fetch the available memory\n");
return left_mem;
}
hpi->hugepage_sz = huge_page_size;
// compute the page count (e.g. 512 MB of free memory / 2 MB per page = 256 huge pages)
hpi->num_pages[0] = left_mem / huge_page_size;
internal_config.num_hugepage_sizes = 1;
printf("%s: left memory: %llu huge pages of size: %d and count: %d\n",
__func__, left_mem, huge_page_size, hpi->num_pages[0]);
return 0;
}

rte_eal_memzone_init

int rte_eal_memzone_init(void){
struct rte_mem_config *mcfg;
int ret = 0;
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
rte_rwlock_write_lock(&mcfg->mlock);
// the primary process initializes the fbarray; secondary processes attach to the primary's fbarray
if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
rte_fbarray_init(&mcfg->memzones, "memzone",
RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n");
ret = -1;
} else if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
rte_fbarray_attach(&mcfg->memzones)) {
RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n");
ret = -1;
}

rte_rwlock_write_unlock(&mcfg->mlock);
return ret;
}
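Once this list exists, other DPDK libraries carve named, contiguous regions out of it with rte_memzone_reserve(); a minimal sketch (the zone name and the 1 MB size below are arbitrary):

#include <string.h>
#include <rte_memzone.h>
#include <rte_lcore.h>

static const struct rte_memzone *memzone_example(void)
{
	/* reserve 1 MB on the caller's socket; returns NULL on failure */
	const struct rte_memzone *mz = rte_memzone_reserve("example_mz",
			1 << 20, rte_socket_id(), 0);

	if (mz != NULL)
		memset(mz->addr, 0, mz->len); /* mz->addr is the mapped virtual address */
	return mz;
}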

rte_eal_memory_init

int
rte_eal_memory_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

if (!mcfg)
return -1;

/* lock mem hotplug here, to prevent races while we init */
rte_mcfg_mem_read_lock();
// initialize the memory segments
if (rte_eal_memseg_init() < 0)
goto fail;

if (eal_memalloc_init() < 0)
goto fail;
// the primary process initializes the hugepages; secondary processes attach to the primary's hugepages
retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
goto fail;

if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
goto fail;

return 0;
fail:
rte_mcfg_mem_read_unlock();
return -1;
}

rte_eal_malloc_heap_init

int
rte_eal_malloc_heap_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
unsigned int i;
const struct internal_config *internal_conf =
eal_get_internal_configuration();

if (internal_conf->match_allocations)
RTE_LOG(DEBUG, EAL, "Hugepages will be freed exactly as allocated.\n");

if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
/* assign min socket ID to external heaps */
mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID;

/* assign names to default DPDK heaps */
for (i = 0; i < rte_socket_count(); i++) {
struct malloc_heap *heap = &mcfg->malloc_heaps[i];
char heap_name[RTE_HEAP_NAME_MAX_LEN];
int socket_id = rte_socket_id_by_idx(i);

snprintf(heap_name, sizeof(heap_name),
"socket_%i", socket_id);
strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
heap->socket_id = socket_id;
}
}

if (register_mp_requests()) {
RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
rte_mcfg_mem_read_unlock();
return -1;
}

/* unlock mem hotplug here. it's safe for primary as no requests can
* even come before primary itself is fully initialized, and secondaries
* do not need to initialize the heap.
*/
rte_mcfg_mem_read_unlock();

/* secondary process does not need to initialize anything */
if (rte_eal_process_type() != RTE_PROC_PRIMARY)
return 0;

/* add all IOVA-contiguous areas to the heap */
return rte_memseg_contig_walk(malloc_add_seg, NULL);
}
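Once the heap is set up, allocations made through the rte_malloc family (including the rte_zmalloc_socket() calls in uk_ethdev_create below) are served from these per-socket heaps; a tiny sketch:

#include <rte_malloc.h>

static void heap_alloc_example(void)
{
	/* zeroed allocation from the socket-local heap, default alignment */
	void *p = rte_zmalloc("example", 256, 0);

	if (p != NULL)
		rte_free(p);
}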

rte_eal_tailqs_init

int
rte_eal_tailqs_init(void)
{
struct rte_tailq_elem *t;

rte_tailqs_count = 0;

TAILQ_FOREACH(t, &rte_tailq_elem_head, next) {
/* second part of register job for "early" tailqs, see
* rte_eal_tailq_register and EAL_REGISTER_TAILQ */
rte_eal_tailq_update(t);
if (t->head == NULL) {
RTE_LOG(ERR, EAL,
"Cannot initialize tailq: %s\n", t->name);
/* TAILQ_REMOVE not needed, error is already fatal */
goto fail;
}
}

return 0;

fail:
rte_dump_tailq(stderr);
return -1;
}
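The tailqs walked here were registered earlier, at constructor time, by the individual libraries via EAL_REGISTER_TAILQ; a sketch of such a registration (the list name UK_EXAMPLE is made up):

#include <rte_tailq.h>

static struct rte_tailq_elem uk_example_tailq = {
	.name = "UK_EXAMPLE",
};
/* constructor: puts the element on rte_tailq_elem_head before main() runs,
 * so that rte_eal_tailqs_init() can later assign its head */
EAL_REGISTER_TAILQ(uk_example_tailq)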

eal_thread_init_master

void eal_thread_init_master(unsigned lcore_id)
{
/* set the lcore ID in per-lcore memory area */
RTE_PER_LCORE(_lcore_id) = lcore_id;

/* set CPU affinity */
if (eal_thread_set_affinity() < 0)
UK_CRASH("Failed to set thread affinity\n");
}
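RTE_PER_LCORE(_lcore_id) set here is the per-thread variable that rte_lcore_id() reads on the fast path; in rte_lcore.h it is essentially:

/* each thread returns its own copy of the per-lcore (TLS) variable */
static inline unsigned
rte_lcore_id(void)
{
	return RTE_PER_LCORE(_lcore_id);
}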

uk-dpdk/libs/libuk_pmd/uk_pmd.c/eal_uknetdev_init

A for loop iterates over all uk_netdev instances and converts each of them into a corresponding eth_dev.

int eal_uknetdev_init(void)
{
int cnt = 0, i, rc;
struct uk_netdev *dev;

#define RTE_ETHDEV_NAMESIZE 5
char name[RTE_ETHDEV_NAMESIZE];

cnt = uk_netdev_count();
// register each uk_netdev as an eth_dev
for (i = 0; i < cnt; i++) {
dev = uk_netdev_get(i);
if (!dev)
continue;
uk_pr_info(DRIVER_NAME": Registered netdev id %d @ %p\n", i, dev);
snprintf(name, sizeof(name), "uk%02d", i);
rc = uk_ethdev_create(dev, name, eal_cpu_socket_id(), 0);
if (rc < 0) {
uk_pr_err("Failed to create the ethdev\n");
goto err_exit;
}
}
return 0;

err_exit:
return -1;
}

uk-dpdk/libs/libuk_pmd/uk_pmd.c/uk_ethdev_create

static int uk_ethdev_create(struct uk_netdev *dev, const char *name,
uint8_t socket_id,
uint8_t isr_support)
{
#if CONFIG_PCI_BUS
struct rte_pci_device *pci_dev = NULL;
struct rte_pci_driver *pci_drv = NULL;
struct rte_pci_id *id_table = NULL;
#endif
struct rte_eth_dev *eth_dev = NULL;
struct uk_ethdev_private *dev_private = NULL;
char name_buf[RTE_RING_NAMESIZE];

#ifdef CONFIG_PCI_BUS
/* now do all data allocation - for eth_dev structure, dummy pci driver
* and internal (dev_private) data
*/
pci_dev = rte_zmalloc_socket(name, sizeof(*pci_dev), 0, socket_id);
if (pci_dev == NULL)
goto err;

pci_drv = rte_zmalloc_socket(name, sizeof(*pci_drv), 0, socket_id);
if (pci_drv == NULL)
goto err;

id_table = rte_zmalloc_socket(name, sizeof(*id_table), 0, socket_id);
if (id_table == NULL)
goto err;
id_table->device_id = 0xBEEF;
#endif /* CONFIG_PCI_BUS */
// allocate memory from the heap of the given socket
dev_private = rte_zmalloc_socket(name, sizeof(*dev_private), 0, socket_id);
if (dev_private == NULL)
goto err;

dev_private->netdev = dev;

#ifdef CONFIG_UK_NETDEV_RING
// create the ring queues
snprintf(name_buf, sizeof(name_buf), "%s_rxQ", name);
dev_private->rx_queue = rte_ring_create(name_buf, MAX_PKT_BURST, socket_id,
0);
if (dev_private->rx_queue == NULL)
goto err;

snprintf(name_buf, sizeof(name_buf), "%s_txQ", name);
dev_private->tx_queue = rte_ring_create(name_buf, MAX_PKT_BURST, socket_id,
0);
if (dev_private->tx_queue == NULL)
goto err;
#endif /* CONFIG_UK_NETDEV_RING */

/* reserve an ethdev entry */
eth_dev = rte_eth_dev_allocate(name);
if (eth_dev == NULL)
goto err;

#ifdef CONFIG_PCI_BUS
pci_dev->device.numa_node = socket_id;
pci_dev->device.name = eth_dev->data->name;
pci_drv->driver.name = uk_ethdev_driver_name;
pci_drv->id_table = id_table;

if (isr_support)
pci_drv->drv_flags |= RTE_PCI_DRV_INTR_LSC;
else
pci_drv->drv_flags &= ~RTE_PCI_DRV_INTR_LSC;

#endif /* CONFIG_PCI_BUS */
// configure the eth_dev
eth_dev->device = &dev_private->dev;
eth_dev->device->driver = &uk_netdev_driver;

eth_dev->data->nb_rx_queues = (uint16_t)1;
eth_dev->data->nb_tx_queues = (uint16_t)1;

eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
eth_dev->data->dev_link.link_speed = ETH_SPEED_NUM_10G;
eth_dev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;

eth_dev->data->mac_addrs = rte_zmalloc(name, RTE_ETHER_ADDR_LEN, 0);
if (eth_dev->data->mac_addrs == NULL)
goto err;

#ifdef CONFIG_DEFAULT_HWADDR
memcpy(eth_dev->data->mac_addrs, mac_addr,
sizeof(*eth_dev->data->mac_addrs));
#endif /* CONFIG_DEFAULT_HWADDR */

eth_dev->data->dev_started = 0;
eth_dev->data->promiscuous = 0;
eth_dev->data->scattered_rx = 0;
eth_dev->data->all_multicast = 0;

eth_dev->data->dev_private = dev_private;

/* Copy default device operation functions */
dev_private->dev_ops = uk_ethdev_default_dev_ops;
eth_dev->dev_ops = &dev_private->dev_ops;

#ifdef CONFIG_PCI_BUS
pci_dev->device.driver = &pci_drv->driver;
eth_dev->device = &pci_dev->device;
#endif /* CONFIG_PCI_BUS */
// register the receive/transmit functions
eth_dev->rx_pkt_burst = uk_ethdev_rx_burst;
eth_dev->tx_pkt_burst = uk_ethdev_tx_burst;

rte_eth_dev_probing_finish(eth_dev);

return eth_dev->data->port_id;

err:
#ifdef CONFIG_PCI_BUS
rte_free(pci_dev);
rte_free(pci_drv);
rte_free(id_table);
#endif /* CONFIG_PCI_BUS */
rte_free(dev_private);

return -1;
}

dpdk/lib/ethdev/rte_ethdev.c/rte_eth_dev_probing_finish

The fast-path structure is declared and populated in rte_ethdev.c. Using rte_eth_fp_ops as the base address and the device's port ID as the offset, the corresponding operation functions, once set up, can be fetched quickly by plain array indexing.

/* public fast-path API */
struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
// this function is called by uk_ethdev_create during the init flow
void
rte_eth_dev_probing_finish(struct rte_eth_dev *dev)
{
if (dev == NULL)
return;

/*
* for secondary process, at that point we expect device
* to be already 'usable', so shared data and all function pointers
* for fast-path devops have to be setup properly inside rte_eth_dev.
*/
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
eth_dev_fp_ops_setup(rte_eth_fp_ops + dev->data->port_id, dev);

rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_NEW, NULL);

dev->state = RTE_ETH_DEV_ATTACHED;
}

int
rte_eth_dev_callback_process(struct rte_eth_dev *dev,
enum rte_eth_event_type event, void *ret_param)
{
struct rte_eth_dev_callback *cb_lst;
struct rte_eth_dev_callback dev_cb;
int rc = 0;

rte_spinlock_lock(&eth_dev_cb_lock);
TAILQ_FOREACH(cb_lst, &(dev->link_intr_cbs), next) {
if (cb_lst->cb_fn == NULL || cb_lst->event != event)
continue;
dev_cb = *cb_lst;
cb_lst->active = 1;
if (ret_param != NULL)
dev_cb.ret_param = ret_param;

rte_spinlock_unlock(&eth_dev_cb_lock);
rc = dev_cb.cb_fn(dev->data->port_id, dev_cb.event,
dev_cb.cb_arg, dev_cb.ret_param);
rte_spinlock_lock(&eth_dev_cb_lock);
cb_lst->active = 0;
}
rte_spinlock_unlock(&eth_dev_cb_lock);
return rc;
}
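The callback list iterated above is filled by applications through rte_eth_dev_callback_register(); a minimal sketch for the link-status-change event (the handler name is made up):

#include <stdio.h>
#include <rte_ethdev.h>

/* signature follows rte_eth_dev_cb_fn */
static int lsc_handler(uint16_t port_id, enum rte_eth_event_type event,
		void *cb_arg, void *ret_param)
{
	(void)event; (void)cb_arg; (void)ret_param;
	printf("link status changed on port %u\n", port_id);
	return 0;
}

static int register_lsc(uint16_t port_id)
{
	/* invoked later from rte_eth_dev_callback_process() when the event fires */
	return rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_INTR_LSC,
			lsc_handler, NULL);
}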

The functions registered on the device are handed over to fp_ops:

void
eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
const struct rte_eth_dev *dev)
{
fpo->rx_pkt_burst = dev->rx_pkt_burst;
fpo->tx_pkt_burst = dev->tx_pkt_burst;
fpo->tx_pkt_prepare = dev->tx_pkt_prepare;
fpo->rx_queue_count = dev->rx_queue_count;
fpo->rx_descriptor_status = dev->rx_descriptor_status;
fpo->tx_descriptor_status = dev->tx_descriptor_status;

fpo->rxq.data = dev->data->rx_queues;
fpo->rxq.clbk = (void **)(uintptr_t)dev->post_rx_burst_cbs;

fpo->txq.data = dev->data->tx_queues;
fpo->txq.clbk = (void **)(uintptr_t)dev->pre_tx_burst_cbs;
}

Receiving packets

dpdk/lib/ethdev/rte_ethdev.h/rte_eth_rx_burst

The receive function DPDK exposes to upper-layer applications is rte_eth_rx_burst; given a port ID and a queue ID, it pulls up to nb_pkts packets into rx_pkts.

static inline uint16_t
rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
{
uint16_t nb_rx;
struct rte_eth_fp_ops *p;
void *qd;

#ifdef RTE_ETHDEV_DEBUG_RX
if (port_id >= RTE_MAX_ETHPORTS ||
queue_id >= RTE_MAX_QUEUES_PER_PORT) {
RTE_ETHDEV_LOG(ERR,
"Invalid port_id=%u or queue_id=%u\n",
port_id, queue_id);
return 0;
}
#endif

/* fetch pointer to queue data */
// fetch the corresponding structures via the port ID and queue ID
p = &rte_eth_fp_ops[port_id];
qd = p->rxq.data[queue_id];

#ifdef RTE_ETHDEV_DEBUG_RX
RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);

if (qd == NULL) {
RTE_ETHDEV_LOG(ERR, "Invalid Rx queue_id=%u for port_id=%u\n",
queue_id, port_id);
return 0;
}
#endif
// call the rx_pkt_burst registered in the fast path
nb_rx = p->rx_pkt_burst(qd, rx_pkts, nb_pkts);

#ifdef RTE_ETHDEV_RXTX_CALLBACKS
{
void *cb;

/* __ATOMIC_RELEASE memory order was used when the
* call back was inserted into the list.
* Since there is a clear dependency between loading
* cb and cb->fn/cb->next, __ATOMIC_ACQUIRE memory order is
* not required.
*/
cb = __atomic_load_n((void **)&p->rxq.clbk[queue_id],
__ATOMIC_RELAXED);
if (unlikely(cb != NULL))
nb_rx = rte_eth_call_rx_callbacks(port_id, queue_id,
rx_pkts, nb_rx, nb_pkts, cb);
}
#endif
// emit a trace point
rte_ethdev_trace_rx_burst(port_id, queue_id, (void **)rx_pkts, nb_rx);
return nb_rx;
}
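On top of rte_eth_rx_burst a typical application receive loop looks like the sketch below (queue 0 and a burst size of 32 are arbitrary choices):

#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define BURST_SIZE 32

static void rx_loop(uint16_t port_id)
{
	struct rte_mbuf *bufs[BURST_SIZE];
	uint16_t i, nb_rx;

	for (;;) {
		/* returns the number of mbufs actually filled in */
		nb_rx = rte_eth_rx_burst(port_id, 0, bufs, BURST_SIZE);

		for (i = 0; i < nb_rx; i++) {
			/* ... process bufs[i] ... */
			rte_pktmbuf_free(bufs[i]);
		}
	}
}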

uk-dpdk/libs/libuk_pmd/uk_pmd.c/uk_ethdev_rx_burst

static uint16_t uk_ethdev_rx_burst(void *queue,
struct rte_mbuf **bufs,
uint16_t nb_pkts)
{
struct rte_eth_dev *vrtl_eth_dev;
struct uk_ethdev_private *dev_private;
struct uk_ethdev_queue *rxq;
struct uk_netbuf *nb[MAX_PKT_BURST];
int rx_burst_size = (nb_pkts > MAX_PKT_BURST)? MAX_PKT_BURST:nb_pkts;
int rx_count = 0, i, rc, idx;
struct rte_mbuf *mbuf;

UK_ASSERT(queue && bufs);
// get the queue and the netdev
rxq = (struct uk_ethdev_private *) queue;
vrtl_eth_dev = &rte_eth_devices[rxq->port_id];
dev_private = vrtl_eth_dev->data->dev_private;
UK_ASSERT(dev_private);
// cap the number of packets to receive
nb_pkts = (nb_pkts > MAX_PKT_BURST)? MAX_PKT_BURST:nb_pkts;

if (unlikely(!vrtl_eth_dev->data->dev_link.link_status)) {
return 0;
} else {
while (rx_count < nb_pkts) {
rx_burst_size = nb_pkts - rx_count;
// call the netdev rx function to receive packets
rc = uk_netdev_rx_burst(dev_private->netdev,
rxq->queue_id, &nb[rx_count],
&rx_burst_size);
idx = rx_count;
// fill mbufs with the received packets
for (i = idx; i < idx + rx_burst_size; i++) {
rx_count++;
UK_ASSERT(nb[i]);
mbuf = nb[i]->priv;
//printf("%s: mbuf %p\n", __func__, mbuf);
/**
* TODO: Fill in the mbuf for packet processing
*/
mbuf->port = rxq->port_id;
//mbuf->data_off = nb[i]->data - nb[i]->buf;
mbuf->ol_flags = 0;
mbuf->vlan_tci = 0;

mbuf->pkt_len = nb[i]->len;
mbuf->data_len = nb[i]->len;
bufs[i] = mbuf;

}

if (!uk_netdev_status_more(rc))
break;
}
}
/* increments ipackets count */
// bump the count of successfully received packets
dev_private->eth_stats.ipackets += rx_count;

/* increments ibytes count */
// bump the count of successfully received bytes
for (i = 0; i < rx_count; i++)
dev_private->eth_stats.ibytes += rte_pktmbuf_pkt_len(bufs[i]);

return rx_count;
}