diff --git a/QDMA/DPDK/RELEASE b/QDMA/DPDK/RELEASE index 9409e12195904337ca5728ec884ac92a4966fd42..92c106059f9c6513a443855d7c1093d1938d1719 100755 --- a/QDMA/DPDK/RELEASE +++ b/QDMA/DPDK/RELEASE @@ -1,4 +1,4 @@ -RELEASE: 2023.1.1 +RELEASE: 2023.1.2 ================= This release is based on DPDK v20.11, v21.11 and v22.11 and @@ -121,14 +121,17 @@ CPM5 ---------------- - Optimized the driver code and HW register settings for performance improvements +2023.1.2 Updates +---------------- +- Optimized DPDK PMD and HW register settings for CPM5 performance improvements + KNOWN ISSUE: ============ - CPM5: - - Performance optimizations are not finalized, Updated Performance report with some more optimizations will be available in next patch release. + - Small-packet forwarding performance optimizations are in progress; the performance report will be updated in subsequent releases - All Designs - Function Level Reset(FLR) of PF device when VFs are attached to this PF results in mailbox communication failure - - DPDK C2H and Forwarding performance values for 8 queue is lesser compared to 4 queue case for both PF and VF. DRIVER LIMITATIONS: @@ -139,7 +142,6 @@ DRIVER LIMITATIONS: - All Designs - Big endian systems are not supported - For optimal QDMA streaming performance, packet buffers of the descriptor ring should be aligned to at least 256 bytes. - - Current 2023.1.0 driver which is supporting DPDK 22.11/21.11 is not fully verified for stress, multicard, tandem boot and interop use cases. These will be verified and confirmed in next patch release. /*- diff --git a/QDMA/DPDK/drivers/net/qdma/meson.build b/QDMA/DPDK/drivers/net/qdma/meson.build index 63498a456fb581bd8e805390cd543de3bc228076..0a04d4d9bdf9a91dac85f9eb6d1993206a230a10 100755 --- a/QDMA/DPDK/drivers/net/qdma/meson.build +++ b/QDMA/DPDK/drivers/net/qdma/meson.build @@ -33,10 +33,6 @@ cflags += ['-DRTE_LIBRTE_QDMA_PMD'] cflags += ['-DDMA_BRAM_SIZE=524288'] cflags += ['-DTHROUGHPUT_MEASUREMENT'] -# Enable vectorization in qdma data path to use 128-bit SIMD registers -cflags += ['-DQDMA_RX_VEC_X86_64'] -cflags += ['-DQDMA_TX_VEC_X86_64'] - # Use QDMA_DPDK_22_11 compiler flag for DPDK v22.11 # Use QDMA_DPDK_21_11 compiler flag for DPDK v21.11 # Use QDMA_DPDK_20_11 compiler flag for DPDK v20.11 @@ -77,3 +73,7 @@ sources = files( 'rte_pmd_qdma.c', 'qdma_dpdk_compat.c' ) + +if arch_subdir == 'x86' + sources += files('qdma_rxtx_vec_sse.c') +endif \ No newline at end of file diff --git a/QDMA/DPDK/drivers/net/qdma/qdma.h b/QDMA/DPDK/drivers/net/qdma/qdma.h index 3e139665d2fa8be4da8a005c22c3a5e9e5220ed9..cf7657e18a7eb5440ca448b6be8feda04852764d 100755 --- a/QDMA/DPDK/drivers/net/qdma/qdma.h +++ b/QDMA/DPDK/drivers/net/qdma/qdma.h @@ -345,6 +345,9 @@ struct qdma_pci_dev { int16_t tx_qid_statid_map[RTE_ETHDEV_QUEUE_STAT_CNTRS]; int16_t rx_qid_statid_map[RTE_ETHDEV_QUEUE_STAT_CNTRS]; + + uint8_t rx_vec_allowed:1; + uint8_t tx_vec_allowed:1; }; void qdma_dev_ops_init(struct rte_eth_dev *dev); @@ -376,15 +379,20 @@ int qdma_identify_bars(struct rte_eth_dev *dev); int qdma_get_hw_version(struct rte_eth_dev *dev); /* implemented in rxtx.c */ -uint16_t qdma_recv_pkts_st(struct qdma_rx_queue *rxq, struct rte_mbuf **rx_pkts, +uint16_t qdma_recv_pkts_st(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); -uint16_t qdma_recv_pkts_mm(struct qdma_rx_queue *rxq, struct rte_mbuf **rx_pkts, +uint16_t qdma_recv_pkts_mm(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); -uint16_t qdma_xmit_pkts_st(struct qdma_tx_queue *txq, struct rte_mbuf **tx_pkts, 
+uint16_t qdma_xmit_pkts_st(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); -uint16_t qdma_xmit_pkts_mm(struct qdma_tx_queue *txq, struct rte_mbuf **tx_pkts, +uint16_t qdma_xmit_pkts_mm(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +#ifdef TEST_64B_DESC_BYPASS +uint16_t qdma_xmit_64B_desc_bypass(struct qdma_tx_queue *txq, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +#endif + uint32_t qdma_pci_read_reg(struct rte_eth_dev *dev, uint32_t bar, uint32_t reg); void qdma_pci_write_reg(struct rte_eth_dev *dev, uint32_t bar, uint32_t reg, uint32_t val); @@ -414,4 +422,27 @@ bool is_vf_device_supported(struct rte_eth_dev *dev); bool is_pf_device_supported(struct rte_eth_dev *dev); void qdma_check_errors(void *arg); + +struct rte_mbuf *prepare_segmented_packet(struct qdma_rx_queue *rxq, + uint16_t pkt_length, uint16_t *tail); +int reclaim_tx_mbuf(struct qdma_tx_queue *txq, + uint16_t cidx, uint16_t free_cnt); +int qdma_extract_st_cmpt_info(void *ul_cmpt_entry, void *cmpt_info); +int qdma_ul_extract_st_cmpt_info(void *ul_cmpt_entry, void *cmpt_info); + +/* Transmit API for Streaming mode */ +uint16_t qdma_xmit_pkts_vec(void *tx_queue, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +uint16_t qdma_xmit_pkts_st_vec(void *tx_queue, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts); + +/* Receive API for Streaming mode */ +uint16_t qdma_recv_pkts_vec(void *rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts); +uint16_t qdma_recv_pkts_st_vec(void *rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts); + +void __rte_cold qdma_set_tx_function(struct rte_eth_dev *dev); +void __rte_cold qdma_set_rx_function(struct rte_eth_dev *dev); + #endif /* ifndef __QDMA_H__ */ diff --git a/QDMA/DPDK/drivers/net/qdma/qdma_access/eqdma_cpm5_access/eqdma_cpm5_access.c b/QDMA/DPDK/drivers/net/qdma/qdma_access/eqdma_cpm5_access/eqdma_cpm5_access.c index ee516ee088715167ce36d005b63a2d10f4143ccb..11f9cca14f2ba948c8a7b03d8c722ba6eb21572d 100755 --- a/QDMA/DPDK/drivers/net/qdma/qdma_access/eqdma_cpm5_access/eqdma_cpm5_access.c +++ b/QDMA/DPDK/drivers/net/qdma/qdma_access/eqdma_cpm5_access/eqdma_cpm5_access.c @@ -99,8 +99,9 @@ #define EQDMA_CPM5_GLBL2_FLR_PRESENT_MASK BIT(1) #define EQDMA_CPM5_GLBL2_MAILBOX_EN_MASK BIT(0) -#define EQDMA_CPM5_DEFAULT_C2H_INTR_TIMER_TICK 50 -#define PREFETCH_QUEUE_COUNT_STEP 4 +#define EQDMA_CPM5_DEFAULT_C2H_INTR_TIMER_TICK 50 +#define PREFETCH_QUEUE_COUNT_STEP 4 +#define EQDMA_CPM5_DEFAULT_CMPT_COAL_MAX_BUF_SZ 0x3F /* TODO: This is work around and this needs to be auto generated from ODS */ /** EQDMA_CPM5_IND_REG_SEL_FMAP */ @@ -2399,7 +2400,7 @@ static void eqdma_cpm5_fill_intr_ctxt(struct qdma_indirect_intr_ctxt int eqdma_cpm5_set_default_global_csr(void *dev_hndl) { /* Default values */ - uint32_t cfg_val = 0, reg_val = 0; + uint32_t reg_val = 0; uint32_t rng_sz[QDMA_NUM_RING_SIZES] = {2049, 65, 129, 193, 257, 385, 513, 769, 1025, 1537, 3073, 4097, 6145, 8193, 12289, 16385}; uint32_t tmr_cnt[QDMA_NUM_C2H_TIMERS] = {1, 2, 4, 5, 8, 10, 15, 20, 25, @@ -2451,14 +2452,13 @@ int eqdma_cpm5_set_default_global_csr(void *dev_hndl) 0, QDMA_NUM_C2H_BUFFER_SIZES, buf_sz); /* C2h Completion Coalesce Configuration */ - cfg_val = qdma_reg_read(dev_hndl, - EQDMA_CPM5_C2H_WRB_COAL_BUF_DEPTH_ADDR); reg_val = FIELD_SET(C2H_WRB_COAL_CFG_TICK_CNT_MASK, - DEFAULT_CMPT_COAL_TIMER_CNT) | + DEFAULT_CMPT_COAL_TIMER_CNT) | FIELD_SET(C2H_WRB_COAL_CFG_TICK_VAL_MASK, - DEFAULT_CMPT_COAL_TIMER_TICK) | - FIELD_SET(C2H_WRB_COAL_CFG_MAX_BUF_SZ_MASK, cfg_val); + DEFAULT_CMPT_COAL_TIMER_TICK) | 
+ FIELD_SET(C2H_WRB_COAL_CFG_MAX_BUF_SZ_MASK, + EQDMA_CPM5_DEFAULT_CMPT_COAL_MAX_BUF_SZ); qdma_reg_write(dev_hndl, EQDMA_CPM5_C2H_WRB_COAL_CFG_ADDR, reg_val); } diff --git a/QDMA/DPDK/drivers/net/qdma/qdma_access/eqdma_soft_access/eqdma_soft_access.c b/QDMA/DPDK/drivers/net/qdma/qdma_access/eqdma_soft_access/eqdma_soft_access.c index bf5eab1c00cb824163f400ae04e2a478661fd2f3..03324799ee9f252bade178cbd4634610bf92ab79 100755 --- a/QDMA/DPDK/drivers/net/qdma/qdma_access/eqdma_soft_access/eqdma_soft_access.c +++ b/QDMA/DPDK/drivers/net/qdma/qdma_access/eqdma_soft_access/eqdma_soft_access.c @@ -2560,7 +2560,7 @@ static int dump_eqdma_context(struct qdma_descq_context *queue_context, int n; int len = 0; int rv; - char banner[DEBGFS_LINE_SZ]; + char banner[DEBGFS_LINE_SZ] = ""; if (queue_context == NULL) { qdma_log_error("%s: queue_context is NULL, err:%d\n", diff --git a/QDMA/DPDK/drivers/net/qdma/qdma_access/qdma_access_version.h b/QDMA/DPDK/drivers/net/qdma/qdma_access/qdma_access_version.h index 94bec64b54a7ba14f58444e15414623911406175..72dc4f843b3dff517e23cae1b1794c06b90af082 100755 --- a/QDMA/DPDK/drivers/net/qdma/qdma_access/qdma_access_version.h +++ b/QDMA/DPDK/drivers/net/qdma/qdma_access/qdma_access_version.h @@ -37,7 +37,7 @@ #define QDMA_VERSION_MAJOR 2023 #define QDMA_VERSION_MINOR 1 -#define QDMA_VERSION_PATCH 1 +#define QDMA_VERSION_PATCH 2 #define QDMA_VERSION_STR \ __stringify(QDMA_VERSION_MAJOR) "." \ diff --git a/QDMA/DPDK/drivers/net/qdma/qdma_devops.c b/QDMA/DPDK/drivers/net/qdma/qdma_devops.c index 5b2fbbc1fe7c956fdefdb370a68ae8e85a9c0542..b53027f5b327465e36221dec43daf34c567d2ea1 100755 --- a/QDMA/DPDK/drivers/net/qdma/qdma_devops.c +++ b/QDMA/DPDK/drivers/net/qdma/qdma_devops.c @@ -379,6 +379,13 @@ int qdma_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id, rxq->dev = dev; rxq->st_mode = qdma_dev->q_info[rx_queue_id].queue_mode; + /* Override rx_pkt_burst with direct call based on st or mm mode */ + if (rxq->st_mode) { + dev->rx_pkt_burst = (qdma_dev->rx_vec_allowed) ? + &qdma_recv_pkts_st_vec : &qdma_recv_pkts_st; + } else + dev->rx_pkt_burst = &qdma_recv_pkts_mm; + rxq->nb_rx_desc = (nb_rx_desc + 1); /* <= 2018.2 IP * double the cmpl ring size to avoid run out of cmpl entry while @@ -752,6 +759,13 @@ int qdma_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id, txq->st_mode = qdma_dev->q_info[tx_queue_id].queue_mode; + /* Override tx_pkt_burst with direct call based on st or mm mode */ + if (txq->st_mode) { + dev->tx_pkt_burst = (qdma_dev->tx_vec_allowed) ? + &qdma_xmit_pkts_st_vec : &qdma_xmit_pkts_st; + } else + dev->tx_pkt_burst = &qdma_xmit_pkts_mm; + txq->en_bypass = (qdma_dev->q_info[tx_queue_id].tx_bypass_mode) ? 
1 : 0; txq->bypass_desc_sz = qdma_dev->q_info[tx_queue_id].tx_bypass_desc_sz; @@ -1954,10 +1968,11 @@ void qdma_dev_ops_init(struct rte_eth_dev *dev) { dev->dev_ops = &qdma_eth_dev_ops; if (rte_eal_process_type() == RTE_PROC_PRIMARY) { - dev->rx_pkt_burst = &qdma_recv_pkts; - dev->tx_pkt_burst = &qdma_xmit_pkts; + qdma_set_rx_function(dev); + qdma_set_tx_function(dev); dev->rx_queue_count = &qdma_dev_rx_queue_count; dev->rx_descriptor_status = &qdma_dev_rx_descriptor_status; dev->tx_descriptor_status = &qdma_dev_tx_descriptor_status; } } + diff --git a/QDMA/DPDK/drivers/net/qdma/qdma_rxtx.c b/QDMA/DPDK/drivers/net/qdma/qdma_rxtx.c index e3e55739843a6ea664909f0e0f28c8d2bd497877..bf514110d00f88a59ad3f0f589b406edb5578673 100755 --- a/QDMA/DPDK/drivers/net/qdma/qdma_rxtx.c +++ b/QDMA/DPDK/drivers/net/qdma/qdma_rxtx.c @@ -45,111 +45,7 @@ #include <immintrin.h> #include <emmintrin.h> #define RTE_QDMA_DESCS_PER_LOOP (2) -#endif //RTE_ARCH_X86_64 - -/******** User logic dependent functions start **********/ -static int qdma_ul_extract_st_cmpt_info_v(void *ul_cmpt_entry, void *cmpt_info) -{ - union qdma_ul_st_cmpt_ring *cmpt_data, *cmpt_desc; - - cmpt_desc = (union qdma_ul_st_cmpt_ring *)(ul_cmpt_entry); - cmpt_data = (union qdma_ul_st_cmpt_ring *)(cmpt_info); - - cmpt_data->data = cmpt_desc->data; - if (unlikely(!cmpt_desc->desc_used)) - cmpt_data->length = 0; - - return 0; -} - -#ifdef QDMA_RX_VEC_X86_64 -/* Vector implementation to get packet length from two completion entries */ -static void qdma_ul_get_cmpt_pkt_len_v(void *ul_cmpt_entry, __m128i *data) -{ - union qdma_ul_st_cmpt_ring *cmpt_entry1, *cmpt_entry2; - __m128i pkt_len_shift = _mm_set_epi64x(0, 4); - - cmpt_entry1 = (union qdma_ul_st_cmpt_ring *)(ul_cmpt_entry); - cmpt_entry2 = cmpt_entry1 + 1; - - /* Read desc statuses backwards to avoid race condition */ - /* Load a pkt desc */ - data[1] = _mm_set_epi64x(0, cmpt_entry2->data); - /* Find packet length, currently driver needs - * only packet length from completion info - */ - data[1] = _mm_srl_epi32(data[1], pkt_len_shift); - - /* Load a pkt desc */ - data[0] = _mm_set_epi64x(0, cmpt_entry1->data); - /* Find packet length, currently driver needs - * only packet length from completion info - */ - data[0] = _mm_srl_epi32(data[0], pkt_len_shift); -} -#endif //QDMA_RX_VEC_X86_64 - -#ifdef QDMA_TX_VEC_X86_64 -/* Vector implementation to update H2C descriptor */ -static int qdma_ul_update_st_h2c_desc_v(void *qhndl, uint64_t q_offloads, - struct rte_mbuf *mb) -{ - (void)q_offloads; - int nsegs = mb->nb_segs; - uint16_t flags = S_H2C_DESC_F_SOP | S_H2C_DESC_F_EOP; - uint16_t id; - struct qdma_ul_st_h2c_desc *tx_ring_st; - struct qdma_tx_queue *txq = (struct qdma_tx_queue *)qhndl; - - tx_ring_st = (struct qdma_ul_st_h2c_desc *)txq->tx_ring; - id = txq->q_pidx_info.pidx; - - if (nsegs == 1) { - __m128i descriptor; - uint16_t datalen = mb->data_len; - - descriptor = _mm_set_epi64x(mb->buf_iova + mb->data_off, - (uint64_t)datalen << 16 | - (uint64_t)datalen << 32 | - (uint64_t)flags << 48); - _mm_store_si128((__m128i *)&tx_ring_st[id], descriptor); - - id++; - if (unlikely(id >= (txq->nb_tx_desc - 1))) - id -= (txq->nb_tx_desc - 1); - } else { - int pkt_segs = nsegs; - while (nsegs && mb) { - __m128i descriptor; - uint16_t datalen = mb->data_len; - - flags = 0; - if (nsegs == pkt_segs) - flags |= S_H2C_DESC_F_SOP; - if (nsegs == 1) - flags |= S_H2C_DESC_F_EOP; - - descriptor = _mm_set_epi64x(mb->buf_iova + mb->data_off, - (uint64_t)datalen << 16 | - (uint64_t)datalen << 32 | - (uint64_t)flags << 
48); - _mm_store_si128((__m128i *)&tx_ring_st[id], descriptor); - - nsegs--; - mb = mb->next; - id++; - if (unlikely(id >= (txq->nb_tx_desc - 1))) - id -= (txq->nb_tx_desc - 1); - } - } - - txq->q_pidx_info.pidx = id; - - return 0; -} -#endif //QDMA_TX_VEC_X86_64 - -/******** User logic dependent functions end **********/ +#endif /** * Poll the QDMA engine for transfer completion. @@ -228,17 +124,32 @@ static int dma_wb_monitor(void *xq, uint8_t dir, uint16_t expected_count) return -1; } -static int reclaim_tx_mbuf(struct qdma_tx_queue *txq, +int qdma_extract_st_cmpt_info(void *ul_cmpt_entry, void *cmpt_info) +{ + union qdma_ul_st_cmpt_ring *cmpt_data, *cmpt_desc; + + cmpt_desc = (union qdma_ul_st_cmpt_ring *)(ul_cmpt_entry); + cmpt_data = (union qdma_ul_st_cmpt_ring *)(cmpt_info); + + cmpt_data->data = cmpt_desc->data; + if (unlikely(!cmpt_desc->desc_used)) + cmpt_data->length = 0; + + return 0; +} + +int reclaim_tx_mbuf(struct qdma_tx_queue *txq, uint16_t cidx, uint16_t free_cnt) { int fl_desc = 0; + uint16_t fl_desc_cnt; uint16_t count; int id; id = txq->tx_fl_tail; fl_desc = (int)cidx - id; - if (fl_desc == 0) + if (unlikely(!fl_desc)) return 0; if (fl_desc < 0) @@ -248,33 +159,35 @@ static int reclaim_tx_mbuf(struct qdma_tx_queue *txq, fl_desc = free_cnt; if ((id + fl_desc) < (txq->nb_tx_desc - 1)) { - for (count = 0; count < ((uint16_t)fl_desc & 0xFFFF); - count++) { - rte_pktmbuf_free(txq->sw_ring[id]); + fl_desc_cnt = ((uint16_t)fl_desc & 0xFFFF); + rte_pktmbuf_free_bulk(&txq->sw_ring[id], fl_desc_cnt); + for (count = 0; count < fl_desc_cnt; count++) txq->sw_ring[id++] = NULL; - } - } else { - fl_desc -= (txq->nb_tx_desc - 1 - id); - for (; id < (txq->nb_tx_desc - 1); id++) { - rte_pktmbuf_free(txq->sw_ring[id]); - txq->sw_ring[id] = NULL; - } - id -= (txq->nb_tx_desc - 1); - for (count = 0; count < ((uint16_t)fl_desc & 0xFFFF); - count++) { - rte_pktmbuf_free(txq->sw_ring[id]); - txq->sw_ring[id++] = NULL; - } + txq->tx_fl_tail = id; + return fl_desc; } + + /* Handle Tx queue ring wrap case */ + fl_desc -= (txq->nb_tx_desc - 1 - id); + rte_pktmbuf_free_bulk(&txq->sw_ring[id], (txq->nb_tx_desc - 1 - id)); + for (; id < (txq->nb_tx_desc - 1); id++) + txq->sw_ring[id] = NULL; + + id -= (txq->nb_tx_desc - 1); + fl_desc_cnt = ((uint16_t)fl_desc & 0xFFFF); + rte_pktmbuf_free_bulk(&txq->sw_ring[id], fl_desc_cnt); + for (count = 0; count < fl_desc_cnt; count++) + txq->sw_ring[id++] = NULL; + txq->tx_fl_tail = id; return fl_desc; } #ifdef TEST_64B_DESC_BYPASS -static uint16_t qdma_xmit_64B_desc_bypass(struct qdma_tx_queue *txq, - struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +uint16_t qdma_xmit_64B_desc_bypass(struct qdma_tx_queue *txq, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts) { uint16_t count, id; uint8_t *tx_ring_st_bypass = NULL; @@ -555,7 +468,7 @@ static int process_cmpt_ring(struct qdma_rx_queue *rxq, ((uint64_t)rxq->cmpt_ring + ((uint64_t)rx_cmpt_tail * rxq->cmpt_desc_len)); - ret = qdma_ul_extract_st_cmpt_info_v( + ret = qdma_extract_st_cmpt_info( user_cmpt_entry, &rxq->cmpt_data[count]); if (ret != 0) { @@ -574,7 +487,7 @@ static int process_cmpt_ring(struct qdma_rx_queue *rxq, ((uint64_t)rxq->cmpt_ring + ((uint64_t)rx_cmpt_tail * rxq->cmpt_desc_len)); - ret = qdma_ul_extract_st_cmpt_info_v( + ret = qdma_extract_st_cmpt_info( user_cmpt_entry, &rxq->cmpt_data[count]); if (ret != 0) { @@ -741,7 +654,7 @@ qdma_dev_rx_descriptor_status(void *rx_queue, uint16_t offset) } /* Update mbuf for a segmented packet */ -static struct rte_mbuf *prepare_segmented_packet(struct qdma_rx_queue 
*rxq, +struct rte_mbuf *prepare_segmented_packet(struct qdma_rx_queue *rxq, uint16_t pkt_length, uint16_t *tail) { struct rte_mbuf *mb; @@ -828,147 +741,12 @@ struct rte_mbuf *prepare_single_packet(struct qdma_rx_queue *rxq, return mb; } -#ifdef QDMA_RX_VEC_X86_64 -/* Vector implementation to prepare mbufs for packets. - * Update this API if HW provides more information to be populated in mbuf. - */ -static uint16_t prepare_packets_v(struct qdma_rx_queue *rxq, - struct rte_mbuf **rx_pkts, uint16_t nb_pkts) -{ - struct rte_mbuf *mb; - uint16_t count = 0, count_pkts = 0; - uint16_t n_pkts = nb_pkts & -2; - uint16_t id = rxq->rx_tail; - struct rte_mbuf **sw_ring = rxq->sw_ring; - uint16_t rx_buff_size = rxq->rx_buff_size; - /* mask to shuffle from desc. to mbuf */ - __m128i shuf_msk = _mm_set_epi8( - 0xFF, 0xFF, 0xFF, 0xFF, /* skip 32bits rss */ - 0xFF, 0xFF, /* skip low 16 bits vlan_macip */ - 1, 0, /* octet 0~1, 16 bits data_len */ - 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ - 1, 0, /* octet 0~1, low 16 bits pkt_len */ - 0xFF, 0xFF, /* skip 32 bit pkt_type */ - 0xFF, 0xFF - ); - __m128i mbuf_init, pktlen, zero_data; - - mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer); - pktlen = _mm_setzero_si128(); - zero_data = _mm_setzero_si128(); - - /* compile-time check */ - RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != - offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); - RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != - offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); - RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != - RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); - - for (count = 0; count < n_pkts; - count += RTE_QDMA_DESCS_PER_LOOP) { - __m128i pkt_len[RTE_QDMA_DESCS_PER_LOOP]; - __m128i pkt_mb1, pkt_mb2; - __m128i mbp1; - uint16_t pktlen1, pktlen2; - - qdma_ul_get_cmpt_pkt_len_v( - &rxq->cmpt_data[count], pkt_len); - - pktlen1 = _mm_extract_epi16(pkt_len[0], 0); - pktlen2 = _mm_extract_epi16(pkt_len[1], 0); - - /* Check if packets are segmented across descriptors */ - if ((pktlen1 && (pktlen1 <= rx_buff_size)) && - (pktlen2 && (pktlen2 <= rx_buff_size)) && - ((id + RTE_QDMA_DESCS_PER_LOOP) < - (rxq->nb_rx_desc - 1))) { - /* Load 2 (64 bit) mbuf pointers */ - mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[id]); - - /* Copy 2 64 bit mbuf point into rx_pkts */ - _mm_storeu_si128((__m128i *)&rx_pkts[count_pkts], mbp1); - _mm_storeu_si128((__m128i *)&sw_ring[id], zero_data); - - /* Pkt 1,2 convert format from desc to pktmbuf */ - /* We only have packet length to copy */ - pkt_mb2 = _mm_shuffle_epi8(pkt_len[1], shuf_msk); - pkt_mb1 = _mm_shuffle_epi8(pkt_len[0], shuf_msk); - - /* Write the rearm data and the olflags in one write */ - _mm_store_si128( - (__m128i *)&rx_pkts[count_pkts]->rearm_data, mbuf_init); - _mm_store_si128( - (__m128i *)&rx_pkts[count_pkts + 1]->rearm_data, - mbuf_init); - - /* Write packet length */ - _mm_storeu_si128( - (void *)&rx_pkts[count_pkts]->rx_descriptor_fields1, - pkt_mb1); - _mm_storeu_si128( - (void *)&rx_pkts[count_pkts + 1]->rx_descriptor_fields1, - pkt_mb2); - - /* Accumulate packet length counter */ - pktlen = _mm_add_epi64(pktlen, - _mm_set_epi16(0, 0, 0, 0, - 0, 0, 0, pktlen1)); - pktlen = _mm_add_epi64(pktlen, - _mm_set_epi16(0, 0, 0, 0, - 0, 0, 0, pktlen2)); - - count_pkts += RTE_QDMA_DESCS_PER_LOOP; - id += RTE_QDMA_DESCS_PER_LOOP; - } else { - /* Handle packets segmented - * across multiple descriptors - * or ring wrap - */ - if (pktlen1) { - mb = prepare_segmented_packet(rxq, - pktlen1, &id); - 
rx_pkts[count_pkts++] = mb; - pktlen = _mm_add_epi64(pktlen, - _mm_set_epi16(0, 0, 0, 0, - 0, 0, 0, pktlen1)); - } - - if (pktlen2) { - mb = prepare_segmented_packet(rxq, - pktlen2, &id); - rx_pkts[count_pkts++] = mb; - pktlen = _mm_add_epi64(pktlen, - _mm_set_epi16(0, 0, 0, 0, - 0, 0, 0, pktlen2)); - } - } - } - - rxq->stats.pkts += count_pkts; - rxq->stats.bytes += _mm_extract_epi64(pktlen, 0); - rxq->rx_tail = id; - - /* Handle single packet, if any pending */ - if (nb_pkts & 1) { - mb = prepare_single_packet(rxq, count); - if (mb) - rx_pkts[count_pkts++] = mb; - } - - return count_pkts; -} -#endif //QDMA_RX_VEC_X86_64 - /* Prepare mbufs with packet information */ static uint16_t prepare_packets(struct qdma_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) { uint16_t count_pkts = 0; -#ifdef QDMA_RX_VEC_X86_64 - count_pkts = prepare_packets_v(rxq, rx_pkts, nb_pkts); -#else //QDMA_RX_VEC_X86_64 struct rte_mbuf *mb; uint16_t pkt_length; uint16_t count = 0; @@ -984,7 +762,6 @@ static uint16_t prepare_packets(struct qdma_rx_queue *rxq, } count++; } -#endif //QDMA_RX_VEC_X86_64 return count_pkts; } @@ -1023,47 +800,6 @@ static int rearm_c2h_ring(struct qdma_rx_queue *rxq, uint16_t num_desc) return -1; } -#ifdef QDMA_RX_VEC_X86_64 - int rearm_cnt = rearm_descs & -2; - __m128i head_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, - RTE_PKTMBUF_HEADROOM); - - for (mbuf_index = 0; mbuf_index < ((uint16_t)rearm_cnt & 0xFFFF); - mbuf_index += RTE_QDMA_DESCS_PER_LOOP, - id += RTE_QDMA_DESCS_PER_LOOP) { - __m128i vaddr0, vaddr1; - __m128i dma_addr; - - /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ - RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != - offsetof(struct rte_mbuf, buf_addr) + 8); - - /* Load two mbufs data addresses */ - vaddr0 = _mm_loadu_si128( - (__m128i *)&(rxq->sw_ring[id]->buf_addr)); - vaddr1 = _mm_loadu_si128( - (__m128i *)&(rxq->sw_ring[id+1]->buf_addr)); - - /* Extract physical addresses of two mbufs */ - dma_addr = _mm_unpackhi_epi64(vaddr0, vaddr1); - - /* Add headroom to dma_addr */ - dma_addr = _mm_add_epi64(dma_addr, head_room); - - /* Write C2H desc with physical dma_addr */ - _mm_storeu_si128((__m128i *)&rx_ring_st[id], dma_addr); - } - - if (rearm_descs & 1) { - mb = rxq->sw_ring[id]; - - /* rearm descriptor */ - rx_ring_st[id].dst_addr = - (uint64_t)mb->buf_iova + - RTE_PKTMBUF_HEADROOM; - id++; - } -#else //QDMA_RX_VEC_X86_64 for (mbuf_index = 0; mbuf_index < rearm_descs; mbuf_index++, id++) { mb = rxq->sw_ring[id]; @@ -1074,7 +810,6 @@ static int rearm_c2h_ring(struct qdma_rx_queue *rxq, uint16_t num_desc) (uint64_t)mb->buf_iova + RTE_PKTMBUF_HEADROOM; } -#endif //QDMA_RX_VEC_X86_64 if (unlikely(id >= (rxq->nb_rx_desc - 1))) id -= (rxq->nb_rx_desc - 1); @@ -1132,8 +867,8 @@ static int rearm_c2h_ring(struct qdma_rx_queue *rxq, uint16_t num_desc) } /* Receive API for Streaming mode */ -uint16_t qdma_recv_pkts_st(struct qdma_rx_queue *rxq, - struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +uint16_t qdma_recv_pkts_st(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) { uint16_t count_pkts; struct wb_status *wb_status; @@ -1141,6 +876,7 @@ uint16_t qdma_recv_pkts_st(struct qdma_rx_queue *rxq, uint16_t rx_cmpt_tail = 0; uint16_t cmpt_pidx, c2h_pidx; uint16_t pending_desc; + struct qdma_rx_queue *rxq = rx_queue; #ifdef TEST_64B_DESC_BYPASS int bypass_desc_sz_idx = qmda_get_desc_sz_idx(rxq->bypass_desc_sz); #endif @@ -1234,14 +970,15 @@ uint16_t qdma_recv_pkts_st(struct qdma_rx_queue *rxq, } /* Receive API for Memory mapped mode */ -uint16_t 
qdma_recv_pkts_mm(struct qdma_rx_queue *rxq, - struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +uint16_t qdma_recv_pkts_mm(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) { struct rte_mbuf *mb; uint32_t count, id; struct qdma_ul_mm_desc *desc; uint32_t len; int ret; + struct qdma_rx_queue *rxq = rx_queue; struct qdma_pci_dev *qdma_dev = rxq->dev->data->dev_private; #ifdef TEST_64B_DESC_BYPASS int bypass_desc_sz_idx = qmda_get_desc_sz_idx(rxq->bypass_desc_sz); @@ -1349,9 +1086,9 @@ uint16_t qdma_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint32_t count; if (rxq->st_mode) - count = qdma_recv_pkts_st(rxq, rx_pkts, nb_pkts); + count = qdma_recv_pkts_st(rx_queue, rx_pkts, nb_pkts); else - count = qdma_recv_pkts_mm(rxq, rx_pkts, nb_pkts); + count = qdma_recv_pkts_mm(rx_queue, rx_pkts, nb_pkts); return count; } @@ -1432,14 +1169,15 @@ qdma_dev_tx_descriptor_status(void *tx_queue, uint16_t offset) } /* Transmit API for Streaming mode */ -uint16_t qdma_xmit_pkts_st(struct qdma_tx_queue *txq, - struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +uint16_t qdma_xmit_pkts_st(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) { - struct rte_mbuf *mb; + struct rte_mbuf *mb = NULL; uint64_t pkt_len = 0; int avail, in_use, ret, nsegs; uint16_t cidx = 0; uint16_t count = 0, id; + struct qdma_tx_queue *txq = tx_queue; struct qdma_pci_dev *qdma_dev = txq->dev->data->dev_private; #ifdef TEST_64B_DESC_BYPASS @@ -1496,11 +1234,7 @@ uint16_t qdma_xmit_pkts_st(struct qdma_tx_queue *txq, txq->sw_ring[id] = mb; pkt_len += rte_pktmbuf_pkt_len(mb); -#ifdef QDMA_TX_VEC_X86_64 - ret = qdma_ul_update_st_h2c_desc_v(txq, txq->offloads, mb); -#else ret = qdma_ul_update_st_h2c_desc(txq, txq->offloads, mb); -#endif //RTE_ARCH_X86_64 if (unlikely(ret < 0)) break; } @@ -1532,14 +1266,15 @@ uint16_t qdma_xmit_pkts_st(struct qdma_tx_queue *txq, } /* Transmit API for Memory mapped mode */ -uint16_t qdma_xmit_pkts_mm(struct qdma_tx_queue *txq, - struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +uint16_t qdma_xmit_pkts_mm(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) { struct rte_mbuf *mb; uint32_t count, id; uint64_t len = 0; int avail, in_use; int ret; + struct qdma_tx_queue *txq = tx_queue; struct qdma_pci_dev *qdma_dev = txq->dev->data->dev_private; uint16_t cidx = 0; @@ -1646,9 +1381,43 @@ uint16_t qdma_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, return 0; if (txq->st_mode) - count = qdma_xmit_pkts_st(txq, tx_pkts, nb_pkts); + count = qdma_xmit_pkts_st(tx_queue, tx_pkts, nb_pkts); else - count = qdma_xmit_pkts_mm(txq, tx_pkts, nb_pkts); + count = qdma_xmit_pkts_mm(tx_queue, tx_pkts, nb_pkts); return count; } + +void __rte_cold +qdma_set_tx_function(struct rte_eth_dev *dev) +{ + struct qdma_pci_dev *qdma_dev = dev->data->dev_private; + + if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { + PMD_DRV_LOG(DEBUG, "Using Vector Tx (port %d).", + dev->data->port_id); + qdma_dev->tx_vec_allowed = true; + dev->tx_pkt_burst = qdma_xmit_pkts_vec; + } else { + PMD_DRV_LOG(DEBUG, "Normal Tx will be used on port %d.", + dev->data->port_id); + dev->tx_pkt_burst = qdma_xmit_pkts; + } +} + +void __rte_cold +qdma_set_rx_function(struct rte_eth_dev *dev) +{ + struct qdma_pci_dev *qdma_dev = dev->data->dev_private; + + if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { + PMD_DRV_LOG(DEBUG, "Using Vector Rx (port %d).", + dev->data->port_id); + qdma_dev->rx_vec_allowed = true; + dev->rx_pkt_burst = qdma_recv_pkts_vec; + } else { + PMD_DRV_LOG(DEBUG, "Normal Rx will be used on port 
%d.", + dev->data->port_id); + dev->rx_pkt_burst = qdma_recv_pkts; + } +} diff --git a/QDMA/DPDK/drivers/net/qdma/qdma_rxtx_vec_sse.c b/QDMA/DPDK/drivers/net/qdma/qdma_rxtx_vec_sse.c new file mode 100755 index 0000000000000000000000000000000000000000..7dae459c40dfa20911995862f6b5ee3bf6b8ef92 --- /dev/null +++ b/QDMA/DPDK/drivers/net/qdma/qdma_rxtx_vec_sse.c @@ -0,0 +1,787 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2017-2022 Xilinx, Inc. All rights reserved. + * Copyright (c) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <rte_mbuf.h> +#include <rte_cycles.h> +#include "qdma.h" +#include "qdma_access_common.h" + +#include <fcntl.h> +#include <unistd.h> +#include "qdma_rxtx.h" +#include "qdma_devops.h" + +#if defined RTE_ARCH_X86_64 +#include <immintrin.h> +#include <emmintrin.h> +#define RTE_QDMA_DESCS_PER_LOOP (2) +#endif + +/* Vector implementation to get packet length from two completion entries */ +static void qdma_ul_get_cmpt_pkt_len_vec(void *ul_cmpt_entry, __m128i *data) +{ + union qdma_ul_st_cmpt_ring *cmpt_entry1, *cmpt_entry2; + __m128i pkt_len_shift = _mm_set_epi64x(0, 4); + + cmpt_entry1 = (union qdma_ul_st_cmpt_ring *)(ul_cmpt_entry); + cmpt_entry2 = cmpt_entry1 + 1; + + /* Read desc statuses backwards to avoid race condition */ + /* Load a pkt desc */ + data[1] = _mm_set_epi64x(0, cmpt_entry2->data); + /* Find packet length, currently driver needs + * only packet length from completion info + */ + data[1] = _mm_srl_epi32(data[1], pkt_len_shift); + + /* Load a pkt desc */ + data[0] = _mm_set_epi64x(0, cmpt_entry1->data); + /* Find packet length, currently driver needs + * only packet length from completion info + */ + data[0] = _mm_srl_epi32(data[0], pkt_len_shift); +} + +/* Vector implementation to update H2C descriptor */ +static int qdma_ul_update_st_h2c_desc_vec(void *qhndl, uint64_t q_offloads, + struct rte_mbuf *mb) +{ + (void)q_offloads; + int nsegs = mb->nb_segs; + uint16_t flags = S_H2C_DESC_F_SOP | S_H2C_DESC_F_EOP; + uint16_t id; + struct qdma_ul_st_h2c_desc *tx_ring_st; + struct qdma_tx_queue *txq = (struct qdma_tx_queue *)qhndl; + + tx_ring_st = (struct qdma_ul_st_h2c_desc *)txq->tx_ring; + id = txq->q_pidx_info.pidx; + + if (nsegs == 1) { + __m128i descriptor; + uint16_t datalen = mb->data_len; + + descriptor = _mm_set_epi64x(mb->buf_iova + mb->data_off, + (uint64_t)datalen << 16 | + (uint64_t)datalen << 32 | + (uint64_t)flags << 48); + _mm_store_si128((__m128i *)&tx_ring_st[id], descriptor); + + id++; + if (unlikely(id >= (txq->nb_tx_desc - 1))) + id -= (txq->nb_tx_desc - 1); + } else { + int pkt_segs = nsegs; + while (nsegs && mb) { + __m128i descriptor; + uint16_t datalen = mb->data_len; + + flags = 0; + if (nsegs == pkt_segs) + flags |= S_H2C_DESC_F_SOP; + if (nsegs == 1) + flags |= S_H2C_DESC_F_EOP; + + descriptor = _mm_set_epi64x(mb->buf_iova + mb->data_off, + (uint64_t)datalen << 16 | + (uint64_t)datalen << 32 | + (uint64_t)flags << 48); + _mm_store_si128((__m128i *)&tx_ring_st[id], descriptor); + + nsegs--; + mb = mb->next; + id++; + if (unlikely(id >= (txq->nb_tx_desc - 1))) + id -= (txq->nb_tx_desc - 1); + } + } + + txq->q_pidx_info.pidx = id; + + return 0; +} + +/* Process completion ring */ +static int process_cmpt_ring_vec(struct qdma_rx_queue *rxq, + uint16_t num_cmpt_entries) +{ + struct qdma_pci_dev *qdma_dev = rxq->dev->data->dev_private; + union qdma_ul_st_cmpt_ring *user_cmpt_entry; + uint32_t count = 0; + int ret = 0; + uint16_t rx_cmpt_tail = rxq->cmpt_cidx_info.wrb_cidx; + + if (likely(!rxq->dump_immediate_data)) { + if ((rx_cmpt_tail + num_cmpt_entries) < + (rxq->nb_rx_cmpt_desc - 1)) { + for (count = 0; count < num_cmpt_entries; count++) { + user_cmpt_entry = + (union qdma_ul_st_cmpt_ring *) + ((uint64_t)rxq->cmpt_ring + + ((uint64_t)rx_cmpt_tail * rxq->cmpt_desc_len)); + + ret = qdma_ul_extract_st_cmpt_info( + user_cmpt_entry, + &rxq->cmpt_data[count]); + if (ret != 0) { + PMD_DRV_LOG(ERR, "Error detected on CMPT ring " + "at index %d, queue_id = %d\n", + rx_cmpt_tail, rxq->queue_id); + rxq->err = 1; + return -1; + } + rx_cmpt_tail++; + } 
+ } else { + while (count < num_cmpt_entries) { + user_cmpt_entry = + (union qdma_ul_st_cmpt_ring *) + ((uint64_t)rxq->cmpt_ring + + ((uint64_t)rx_cmpt_tail * rxq->cmpt_desc_len)); + + ret = qdma_ul_extract_st_cmpt_info( + user_cmpt_entry, + &rxq->cmpt_data[count]); + if (ret != 0) { + PMD_DRV_LOG(ERR, "Error detected on CMPT ring " + "at index %d, queue_id = %d\n", + rx_cmpt_tail, rxq->queue_id); + rxq->err = 1; + return -1; + } + + rx_cmpt_tail++; + if (unlikely(rx_cmpt_tail >= + (rxq->nb_rx_cmpt_desc - 1))) + rx_cmpt_tail -= + (rxq->nb_rx_cmpt_desc - 1); + count++; + } + } + } else { + while (count < num_cmpt_entries) { + user_cmpt_entry = + (union qdma_ul_st_cmpt_ring *) + ((uint64_t)rxq->cmpt_ring + + ((uint64_t)rx_cmpt_tail * rxq->cmpt_desc_len)); + + ret = qdma_ul_extract_st_cmpt_info( + user_cmpt_entry, + &rxq->cmpt_data[count]); + if (ret != 0) { + PMD_DRV_LOG(ERR, "Error detected on CMPT ring " + "at CMPT index %d, queue_id = %d\n", + rx_cmpt_tail, rxq->queue_id); + rxq->err = 1; + return -1; + } + + ret = qdma_ul_process_immediate_data_st((void *)rxq, + user_cmpt_entry, rxq->cmpt_desc_len); + if (ret < 0) { + PMD_DRV_LOG(ERR, "Error processing immediate data " + "at CMPT index = %d, queue_id = %d\n", + rx_cmpt_tail, rxq->queue_id); + return -1; + } + + rx_cmpt_tail++; + if (unlikely(rx_cmpt_tail >= + (rxq->nb_rx_cmpt_desc - 1))) + rx_cmpt_tail -= (rxq->nb_rx_cmpt_desc - 1); + count++; + } + } + + // Update the CPMT CIDX + rxq->cmpt_cidx_info.wrb_cidx = rx_cmpt_tail; + qdma_dev->hw_access->qdma_queue_cmpt_cidx_update(rxq->dev, + qdma_dev->is_vf, + rxq->queue_id, &rxq->cmpt_cidx_info); + + return 0; +} + +/* Prepare mbuf for one packet */ +static inline +struct rte_mbuf *prepare_single_packet(struct qdma_rx_queue *rxq, + uint16_t cmpt_idx) +{ + struct rte_mbuf *mb = NULL; + uint16_t id = rxq->rx_tail; + uint16_t pkt_length; + + pkt_length = qdma_ul_get_cmpt_pkt_len(&rxq->cmpt_data[cmpt_idx]); + + if (pkt_length) { + rxq->stats.pkts++; + rxq->stats.bytes += pkt_length; + + if (likely(pkt_length <= rxq->rx_buff_size)) { + mb = rxq->sw_ring[id]; + rxq->sw_ring[id++] = NULL; + + if (unlikely(id >= (rxq->nb_rx_desc - 1))) + id -= (rxq->nb_rx_desc - 1); + + rte_mbuf_refcnt_set(mb, 1); + mb->nb_segs = 1; + mb->port = rxq->port_id; + mb->ol_flags = 0; + mb->packet_type = 0; + mb->pkt_len = pkt_length; + mb->data_len = pkt_length; + } else { + mb = prepare_segmented_packet(rxq, pkt_length, &id); + } + + rxq->rx_tail = id; + } + return mb; +} + +/* Vector implementation to prepare mbufs for packets. + * Update this API if HW provides more information to be populated in mbuf. + */ +static uint16_t prepare_packets_vec(struct qdma_rx_queue *rxq, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + struct rte_mbuf *mb; + uint16_t count = 0, count_pkts = 0; + uint16_t n_pkts = nb_pkts & -2; + uint16_t id = rxq->rx_tail; + struct rte_mbuf **sw_ring = rxq->sw_ring; + uint16_t rx_buff_size = rxq->rx_buff_size; + /* mask to shuffle from desc. 
to mbuf */ + __m128i shuf_msk = _mm_set_epi8( + 0xFF, 0xFF, 0xFF, 0xFF, /* skip 32bits rss */ + 0xFF, 0xFF, /* skip low 16 bits vlan_macip */ + 1, 0, /* octet 0~1, 16 bits data_len */ + 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ + 1, 0, /* octet 0~1, low 16 bits pkt_len */ + 0xFF, 0xFF, /* skip 32 bit pkt_type */ + 0xFF, 0xFF + ); + __m128i mbuf_init, pktlen, zero_data; + + mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer); + pktlen = _mm_setzero_si128(); + zero_data = _mm_setzero_si128(); + + /* compile-time check */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != + RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); + + for (count = 0; count < n_pkts; + count += RTE_QDMA_DESCS_PER_LOOP) { + __m128i pkt_len[RTE_QDMA_DESCS_PER_LOOP]; + __m128i pkt_mb1, pkt_mb2; + __m128i mbp1; + uint16_t pktlen1, pktlen2; + + qdma_ul_get_cmpt_pkt_len_vec( + &rxq->cmpt_data[count], pkt_len); + + pktlen1 = _mm_extract_epi16(pkt_len[0], 0); + pktlen2 = _mm_extract_epi16(pkt_len[1], 0); + + /* Check if packets are segmented across descriptors */ + if ((pktlen1 && (pktlen1 <= rx_buff_size)) && + (pktlen2 && (pktlen2 <= rx_buff_size)) && + ((id + RTE_QDMA_DESCS_PER_LOOP) < + (rxq->nb_rx_desc - 1))) { + /* Load 2 (64 bit) mbuf pointers */ + mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[id]); + + /* Copy 2 64 bit mbuf point into rx_pkts */ + _mm_storeu_si128((__m128i *)&rx_pkts[count_pkts], mbp1); + _mm_storeu_si128((__m128i *)&sw_ring[id], zero_data); + + /* Pkt 1,2 convert format from desc to pktmbuf */ + /* We only have packet length to copy */ + pkt_mb2 = _mm_shuffle_epi8(pkt_len[1], shuf_msk); + pkt_mb1 = _mm_shuffle_epi8(pkt_len[0], shuf_msk); + + /* Write the rearm data and the olflags in one write */ + _mm_store_si128( + (__m128i *)&rx_pkts[count_pkts]->rearm_data, mbuf_init); + _mm_store_si128( + (__m128i *)&rx_pkts[count_pkts + 1]->rearm_data, + mbuf_init); + + /* Write packet length */ + _mm_storeu_si128( + (void *)&rx_pkts[count_pkts]->rx_descriptor_fields1, + pkt_mb1); + _mm_storeu_si128( + (void *)&rx_pkts[count_pkts + 1]->rx_descriptor_fields1, + pkt_mb2); + + /* Accumulate packet length counter */ + pktlen = _mm_add_epi64(pktlen, + _mm_set_epi16(0, 0, 0, 0, + 0, 0, 0, pktlen1)); + pktlen = _mm_add_epi64(pktlen, + _mm_set_epi16(0, 0, 0, 0, + 0, 0, 0, pktlen2)); + + count_pkts += RTE_QDMA_DESCS_PER_LOOP; + id += RTE_QDMA_DESCS_PER_LOOP; + } else { + /* Handle packets segmented + * across multiple descriptors + * or ring wrap + */ + if (pktlen1) { + mb = prepare_segmented_packet(rxq, + pktlen1, &id); + rx_pkts[count_pkts++] = mb; + pktlen = _mm_add_epi64(pktlen, + _mm_set_epi16(0, 0, 0, 0, + 0, 0, 0, pktlen1)); + } + + if (pktlen2) { + mb = prepare_segmented_packet(rxq, + pktlen2, &id); + rx_pkts[count_pkts++] = mb; + pktlen = _mm_add_epi64(pktlen, + _mm_set_epi16(0, 0, 0, 0, + 0, 0, 0, pktlen2)); + } + } + } + + rxq->stats.pkts += count_pkts; + rxq->stats.bytes += _mm_extract_epi64(pktlen, 0); + rxq->rx_tail = id; + + /* Handle single packet, if any pending */ + if (nb_pkts & 1) { + mb = prepare_single_packet(rxq, count); + if (mb) + rx_pkts[count_pkts++] = mb; + } + + return count_pkts; +} + +/* Populate C2H ring with new buffers */ +static int rearm_c2h_ring_vec(struct qdma_rx_queue *rxq, uint16_t num_desc) +{ + struct qdma_pci_dev *qdma_dev = 
rxq->dev->data->dev_private; + struct rte_mbuf *mb; + struct qdma_ul_st_c2h_desc *rx_ring_st = + (struct qdma_ul_st_c2h_desc *)rxq->rx_ring; + uint16_t mbuf_index = 0; + uint16_t id; + int rearm_descs; + + id = rxq->q_pidx_info.pidx; + + /* Split the C2H ring updation in two parts. + * First handle till end of ring and then + * handle from beginning of ring, if ring wraps + */ + if ((id + num_desc) < (rxq->nb_rx_desc - 1)) + rearm_descs = num_desc; + else + rearm_descs = (rxq->nb_rx_desc - 1) - id; + + /* allocate new buffer */ + if (rte_mempool_get_bulk(rxq->mb_pool, (void *)&rxq->sw_ring[id], + rearm_descs) != 0){ + PMD_DRV_LOG(ERR, "%s(): %d: No MBUFS, queue id = %d," + "mbuf_avail_count = %d," + " mbuf_in_use_count = %d, num_desc_req = %d\n", + __func__, __LINE__, rxq->queue_id, + rte_mempool_avail_count(rxq->mb_pool), + rte_mempool_in_use_count(rxq->mb_pool), rearm_descs); + return -1; + } + + int rearm_cnt = rearm_descs & -2; + __m128i head_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, + RTE_PKTMBUF_HEADROOM); + + for (mbuf_index = 0; mbuf_index < ((uint16_t)rearm_cnt & 0xFFFF); + mbuf_index += RTE_QDMA_DESCS_PER_LOOP, + id += RTE_QDMA_DESCS_PER_LOOP) { + __m128i vaddr0, vaddr1; + __m128i dma_addr; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + + /* Load two mbufs data addresses */ + vaddr0 = _mm_loadu_si128( + (__m128i *)&(rxq->sw_ring[id]->buf_addr)); + vaddr1 = _mm_loadu_si128( + (__m128i *)&(rxq->sw_ring[id+1]->buf_addr)); + + /* Extract physical addresses of two mbufs */ + dma_addr = _mm_unpackhi_epi64(vaddr0, vaddr1); + + /* Add headroom to dma_addr */ + dma_addr = _mm_add_epi64(dma_addr, head_room); + + /* Write C2H desc with physical dma_addr */ + _mm_storeu_si128((__m128i *)&rx_ring_st[id], dma_addr); + } + + if (rearm_descs & 1) { + mb = rxq->sw_ring[id]; + + /* rearm descriptor */ + rx_ring_st[id].dst_addr = + (uint64_t)mb->buf_iova + + RTE_PKTMBUF_HEADROOM; + id++; + } + + if (unlikely(id >= (rxq->nb_rx_desc - 1))) + id -= (rxq->nb_rx_desc - 1); + + /* Handle from beginning of ring, if ring wrapped */ + rearm_descs = num_desc - rearm_descs; + if (unlikely(rearm_descs)) { + /* allocate new buffer */ + if (rte_mempool_get_bulk(rxq->mb_pool, + (void *)&rxq->sw_ring[id], rearm_descs) != 0) { + PMD_DRV_LOG(ERR, "%s(): %d: No MBUFS, queue id = %d," + "mbuf_avail_count = %d," + " mbuf_in_use_count = %d, num_desc_req = %d\n", + __func__, __LINE__, rxq->queue_id, + rte_mempool_avail_count(rxq->mb_pool), + rte_mempool_in_use_count(rxq->mb_pool), rearm_descs); + + rxq->q_pidx_info.pidx = id; + qdma_dev->hw_access->qdma_queue_pidx_update(rxq->dev, + qdma_dev->is_vf, + rxq->queue_id, 1, &rxq->q_pidx_info); + + return -1; + } + + for (mbuf_index = 0; + mbuf_index < ((uint16_t)rearm_descs & 0xFFFF); + mbuf_index++, id++) { + mb = rxq->sw_ring[id]; + mb->data_off = RTE_PKTMBUF_HEADROOM; + + /* rearm descriptor */ + rx_ring_st[id].dst_addr = + (uint64_t)mb->buf_iova + + RTE_PKTMBUF_HEADROOM; + } + } + + PMD_DRV_LOG(DEBUG, "%s(): %d: PIDX Update: queue id = %d, " + "num_desc = %d", + __func__, __LINE__, rxq->queue_id, + num_desc); + + /* Make sure writes to the C2H descriptors are + * synchronized before updating PIDX + */ + rte_wmb(); + + rxq->q_pidx_info.pidx = id; + qdma_dev->hw_access->qdma_queue_pidx_update(rxq->dev, + qdma_dev->is_vf, + rxq->queue_id, 1, &rxq->q_pidx_info); + + return 0; +} + +/* Receive API for Streaming mode */ +uint16_t qdma_recv_pkts_st_vec(void 
*rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + uint16_t count_pkts; + struct wb_status *wb_status; + uint16_t nb_pkts_avail = 0; + uint16_t rx_cmpt_tail = 0; + uint16_t cmpt_pidx, c2h_pidx; + uint16_t pending_desc; + struct qdma_rx_queue *rxq = rx_queue; +#ifdef TEST_64B_DESC_BYPASS + int bypass_desc_sz_idx = qmda_get_desc_sz_idx(rxq->bypass_desc_sz); +#endif + + if (unlikely(rxq->err)) + return 0; + + PMD_DRV_LOG(DEBUG, "recv start on rx queue-id :%d, on " + "tail index:%d number of pkts %d", + rxq->queue_id, rxq->rx_tail, nb_pkts); + wb_status = rxq->wb_status; + rx_cmpt_tail = rxq->cmpt_cidx_info.wrb_cidx; + +#ifdef TEST_64B_DESC_BYPASS + if (unlikely(rxq->en_bypass && + bypass_desc_sz_idx == SW_DESC_CNTXT_64B_BYPASS_DMA)) { + PMD_DRV_LOG(DEBUG, "For RX ST-mode, example" + " design doesn't support 64byte descriptor\n"); + return 0; + } +#endif + cmpt_pidx = wb_status->pidx; + + if (rx_cmpt_tail < cmpt_pidx) + nb_pkts_avail = cmpt_pidx - rx_cmpt_tail; + else if (rx_cmpt_tail > cmpt_pidx) + nb_pkts_avail = rxq->nb_rx_cmpt_desc - 1 - rx_cmpt_tail + + cmpt_pidx; + + if (nb_pkts_avail == 0) { + PMD_DRV_LOG(DEBUG, "%s(): %d: nb_pkts_avail = 0\n", + __func__, __LINE__); + return 0; + } + + nb_pkts = RTE_MIN(nb_pkts, RTE_MIN(nb_pkts_avail, QDMA_MAX_BURST_SIZE)); + +#ifdef DUMP_MEMPOOL_USAGE_STATS + PMD_DRV_LOG(DEBUG, "%s(): %d: queue id = %d, mbuf_avail_count = %d, " + "mbuf_in_use_count = %d", + __func__, __LINE__, rxq->queue_id, + rte_mempool_avail_count(rxq->mb_pool), + rte_mempool_in_use_count(rxq->mb_pool)); +#endif //DUMP_MEMPOOL_USAGE_STATS + /* Make sure reads to CMPT ring are synchronized before + * accessing the ring + */ + rte_rmb(); +#ifdef QDMA_LATENCY_OPTIMIZED + adapt_update_counter(rxq, nb_pkts_avail); +#endif //QDMA_LATENCY_OPTIMIZED + + int ret = process_cmpt_ring_vec(rxq, nb_pkts); + if (unlikely(ret)) + return 0; + + if (rxq->status != RTE_ETH_QUEUE_STATE_STARTED) { + PMD_DRV_LOG(DEBUG, "%s(): %d: rxq->status = %d\n", + __func__, __LINE__, rxq->status); + return 0; + } + + count_pkts = prepare_packets_vec(rxq, rx_pkts, nb_pkts); + + c2h_pidx = rxq->q_pidx_info.pidx; + pending_desc = rxq->rx_tail - c2h_pidx - 1; + if (rxq->rx_tail < (c2h_pidx + 1)) + pending_desc = rxq->nb_rx_desc - 2 + rxq->rx_tail - + c2h_pidx; + + /* Batch the PIDX updates, this minimizes overhead on + * descriptor engine + */ + if (pending_desc >= MIN_RX_PIDX_UPDATE_THRESHOLD) + rearm_c2h_ring_vec(rxq, pending_desc); + +#ifdef DUMP_MEMPOOL_USAGE_STATS + PMD_DRV_LOG(DEBUG, "%s(): %d: queue id = %d, mbuf_avail_count = %d," + " mbuf_in_use_count = %d, count_pkts = %d", + __func__, __LINE__, rxq->queue_id, + rte_mempool_avail_count(rxq->mb_pool), + rte_mempool_in_use_count(rxq->mb_pool), count_pkts); +#endif //DUMP_MEMPOOL_USAGE_STATS + + PMD_DRV_LOG(DEBUG, " Recv complete with hw cidx :%d", + rxq->wb_status->cidx); + PMD_DRV_LOG(DEBUG, " Recv complete with hw pidx :%d\n", + rxq->wb_status->pidx); + + return count_pkts; +} + +/** + * DPDK callback for receiving packets in burst. + * + * @param rx_queue + * Generic pointer to Rx queue structure. + * @param[out] rx_pkts + * Array to store received packets. + * @param nb_pkts + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= nb_pkts). 
+ */ +uint16_t qdma_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + struct qdma_rx_queue *rxq = rx_queue; + uint32_t count; + + if (rxq->st_mode) + count = qdma_recv_pkts_st_vec(rx_queue, rx_pkts, nb_pkts); + else + count = qdma_recv_pkts_mm(rx_queue, rx_pkts, nb_pkts); + + return count; +} + +/* Transmit API for Streaming mode */ +uint16_t qdma_xmit_pkts_st_vec(void *tx_queue, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +{ + struct rte_mbuf *mb; + uint64_t pkt_len = 0; + int avail, in_use, ret, nsegs; + uint16_t cidx = 0; + uint16_t count = 0, id; + struct qdma_tx_queue *txq = tx_queue; + struct qdma_pci_dev *qdma_dev = txq->dev->data->dev_private; + +#ifdef TEST_64B_DESC_BYPASS + int bypass_desc_sz_idx = qmda_get_desc_sz_idx(txq->bypass_desc_sz); + + if (unlikely(txq->en_bypass && + bypass_desc_sz_idx == SW_DESC_CNTXT_64B_BYPASS_DMA)) { + return qdma_xmit_64B_desc_bypass(txq, tx_pkts, nb_pkts); + } +#endif + + id = txq->q_pidx_info.pidx; + + /* Make sure reads to Tx ring are synchronized before + * accessing the status descriptor. + */ + rte_rmb(); + + cidx = txq->wb_status->cidx; + PMD_DRV_LOG(DEBUG, "Xmit start on tx queue-id:%d, tail index:%d\n", + txq->queue_id, id); + + /* Free transmitted mbufs back to pool */ + reclaim_tx_mbuf(txq, cidx, 0); + + in_use = (int)id - cidx; + if (in_use < 0) + in_use += (txq->nb_tx_desc - 1); + + /* Make 1 less available, otherwise if we allow all descriptors + * to be filled, when nb_pkts = nb_tx_desc - 1, pidx will be same + * as old pidx and HW will treat this as no new descriptors were added. + * Hence, DMA won't happen with new descriptors. + */ + avail = txq->nb_tx_desc - 2 - in_use; + + if (unlikely(!avail)) { + PMD_DRV_LOG(DEBUG, "Tx queue full, in_use = %d", in_use); + return 0; + } + + for (count = 0; count < nb_pkts; count++) { + mb = tx_pkts[count]; + nsegs = mb->nb_segs; + if (nsegs > avail) { + /* Number of segments in current mbuf are greater + * than number of descriptors available, + * hence update PIDX and return + */ + break; + } + avail -= nsegs; + id = txq->q_pidx_info.pidx; + txq->sw_ring[id] = mb; + pkt_len += rte_pktmbuf_pkt_len(mb); + + ret = qdma_ul_update_st_h2c_desc_vec(txq, txq->offloads, mb); + + if (unlikely(ret < 0)) + break; + } + + txq->stats.pkts += count; + txq->stats.bytes += pkt_len; + +#if (MIN_TX_PIDX_UPDATE_THRESHOLD > 1) + rte_spinlock_lock(&txq->pidx_update_lock); +#endif + txq->tx_desc_pend += count; + + /* Send PIDX update only if pending desc is more than threshold + * Saves frequent Hardware transactions + */ + if (txq->tx_desc_pend >= MIN_TX_PIDX_UPDATE_THRESHOLD) { + qdma_dev->hw_access->qdma_queue_pidx_update(txq->dev, + qdma_dev->is_vf, + txq->queue_id, 0, &txq->q_pidx_info); + + txq->tx_desc_pend = 0; + } +#if (MIN_TX_PIDX_UPDATE_THRESHOLD > 1) + rte_spinlock_unlock(&txq->pidx_update_lock); +#endif + PMD_DRV_LOG(DEBUG, " xmit completed with count:%d\n", count); + + return count; +} + +/** + * DPDK callback for transmitting packets in burst. + * + * @param tx_queue + * Generic pointer to TX queue structure. + * @param[in] tx_pkts + * Packets to transmit. + * @param nb_pkts + * Number of packets in array. + * + * @return + * Number of packets successfully transmitted (<= nb_pkts). 
+ */ +uint16_t qdma_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + struct qdma_tx_queue *txq = tx_queue; + uint16_t count; + + if (txq->status != RTE_ETH_QUEUE_STATE_STARTED) + return 0; + + if (txq->st_mode) + count = qdma_xmit_pkts_st_vec(tx_queue, tx_pkts, nb_pkts); + else + count = qdma_xmit_pkts_mm(tx_queue, tx_pkts, nb_pkts); + + return count; +} diff --git a/QDMA/DPDK/drivers/net/qdma/version.h b/QDMA/DPDK/drivers/net/qdma/version.h index 14dac1fd2d458d5ac4c9c806a8c28bb494344f17..9b4233c8756ba5892c20d9b47f783216db4a3c4c 100755 --- a/QDMA/DPDK/drivers/net/qdma/version.h +++ b/QDMA/DPDK/drivers/net/qdma/version.h @@ -39,7 +39,7 @@ #define QDMA_PMD_MAJOR 2023 #define QDMA_PMD_MINOR 1 -#define QDMA_PMD_PATCHLEVEL 1 +#define QDMA_PMD_PATCHLEVEL 2 #define QDMA_PMD_VERSION \ qdma_stringify(QDMA_PMD_MAJOR) "." \