From 4603d729f9aef2aad341765251a3bc7d24dde7c1 Mon Sep 17 00:00:00 2001 From: Adam Green Date: Thu, 22 Aug 2013 18:11:51 -0700 Subject: [PATCH 1/8] net: Fully disable LWIP_ASSERTs I was doing some debugging that had me looking at the disassembly of lpc_rx_queue() from within the debugger. I was looking for the call to pbuf_alloc() that we see in the following code snippet: p = pbuf_alloc(PBUF_RAW, (u16_t) EMAC_ETH_MAX_FLEN, PBUF_RAM); if (p == NULL) { LWIP_DEBUGF(UDP_LPC_EMAC | LWIP_DBG_TRACE, ("lpc_rx_queue: could not allocate RX pbuf (free desc=%d)\n", lpc_enetif->rx_free_descs)); return queued; } /* pbufs allocated from the RAM pool should be non-chained. */ LWIP_ASSERT("lpc_rx_queue: pbuf is not contiguous (chained)", pbuf_clen(p) <= 1); When I was looking through the disassembly for this code I noticed a call to pbuf_clen() in the actual machine code. => 0x0000bab0 <+24>: bl 0x44c0 0x0000bab4 <+28>: ldr r3, [r4, #112] ; 0x70 0x0000bab6 <+30>: ldrh.w r12, [r5, #10] 0x0000baba <+34>: add.w r2, r3, #9 0x0000babe <+38>: add.w r0, r12, #4294967295 ; 0xffffffff The only call to pbuf_clean made from this function is made from within the LWIP_ASSERT. When I looked more closely at how this macro was defined, I saw that the mbed version of the stack had disabled the LWIP_PLATFORM_ASSERT macro when LWIP_DEBUG was false which means that no action will be taken if the assert is false but it still allows the LWIP_ASSERT macro to potentially evaluate the assert expression. Defining the LWIP_NOASSERT macro will fully disable the LWIP_ASSERT macro. I saw one of my TCP/IP samples shrink about 0.5K when I made this change. --- libraries/net/lwip/lwip/lwipopts.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libraries/net/lwip/lwip/lwipopts.h b/libraries/net/lwip/lwip/lwipopts.h index 10846e24169..463e2c387b2 100644 --- a/libraries/net/lwip/lwip/lwipopts.h +++ b/libraries/net/lwip/lwip/lwipopts.h @@ -99,6 +99,7 @@ #define MEMP_OVERFLOW_CHECK 1 #define MEMP_SANITY_CHECK 1 #else +#define LWIP_NOASSERT 1 #define LWIP_STATS 0 #endif From fa392423c84ac5d5acbf8fc650f8fcca1b322c52 Mon Sep 17 00:00:00 2001 From: Adam Green Date: Fri, 23 Aug 2013 22:14:38 -0700 Subject: [PATCH 2/8] Silence GCC unused variable warning. After making my previous commit to completely disable LWIP_ASSERT macro invocations, I ended up with a warning in pbuf.c where an err variable was set but only checked for success in an assert. I added a "(void)err;" reference to silence this warning. --- libraries/net/lwip/lwip/core/pbuf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libraries/net/lwip/lwip/core/pbuf.c b/libraries/net/lwip/lwip/core/pbuf.c index 818eb620356..cd37c797237 100644 --- a/libraries/net/lwip/lwip/core/pbuf.c +++ b/libraries/net/lwip/lwip/core/pbuf.c @@ -998,6 +998,8 @@ pbuf_coalesce(struct pbuf *p, pbuf_layer layer) } err = pbuf_copy(q, p); LWIP_ASSERT("pbuf_copy failed", err == ERR_OK); + /* next line references err variable even if LWIP_ASSERT is ignored. */ + (void)err; pbuf_free(p); return q; } From f5ec5d3ab2d2ab130dc4ba12a0742da8db4225fb Mon Sep 17 00:00:00 2001 From: Adam Green Date: Sat, 24 Aug 2013 13:13:26 -0700 Subject: [PATCH 3/8] net: Enable SYS_LIGHTWEIGHT_PROT This option actually enables the use of the lwip_sys_mutex for protecting concurrent access to such important lwIP resources as: select_cb_list (this is the one which orig flagged problem) sockets array mem stats (if enabled) heap (if LWIP_ALLOW_MEM_FREE_FROM_OTHER_CONTEXT was non-zero) memp pool allocs/frees netif->loop_last pbuf linked list pbuf reference counts ... I first noticed this issue when I hit a crash while slamming the net stack with a large number of TCP packets (I was actually sending 1k data buffers from the TCPEchoServer mbed sample.) It crashed in the last line of this code snippet from event_callback: for (scb = select_cb_list; scb != NULL; scb = scb->next) { if (scb->sem_signalled == 0) { It was crashing because scb had an invalid address so it generated a bus fault. I figured that memory was either corrupted or there was some kind of concurrency issue. In trying to determine which, I wanted to walk through the select_cb_list linked list and see where it was corrupted: (gdb) p scb $1 = (struct lwip_select_cb *) 0x85100080 (gdb) p select_cb_list $2 = (struct lwip_select_cb *) 0x0 That was interesting, the head of the linked list was now NULL but it must have had a non-NULL value when this loop started running or we would have never gotten to the point where we hit this crash. This was starting to look like a concurrency issue since the linked list was modified out from underneath this thread. Looking through the source code for this function, I saw use of macros like SYS_ARCH_PROTECT and SYS_ARCH_UNPROTECT which looked like they should be providing the thead synchronization. I disassembled the event_callback() function in the debugger and saw no signs of the usage of synchronizition APIs that I expected. A search through the code for the definition of these SYS_ARCH_UN/PROTECT macros led me to discovering that they were actually ignored unless an implementation defined them itself (the mbed version doesn't do so) or the SYS_LIGHTWEIGHT_PROT macro is set to non-zero (the mbed version didn't do this either). Flipping the SYS_LIGHTWEIGHT_PROT macro on in lwipopts.h fixed the crash I kept hitting, increased the size of the code a bit, and unfortunately slows things down a bit since it now actually serializes access to these data structures by making calls to the RTOS sync APIs. --- libraries/net/lwip/lwip/lwipopts.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libraries/net/lwip/lwip/lwipopts.h b/libraries/net/lwip/lwip/lwipopts.h index 463e2c387b2..4d07680dbcf 100644 --- a/libraries/net/lwip/lwip/lwipopts.h +++ b/libraries/net/lwip/lwip/lwipopts.h @@ -27,6 +27,8 @@ #if NO_SYS == 0 #include "cmsis_os.h" +#define SYS_LIGHTWEIGHT_PROT 1 + #define LWIP_RAW 0 #define TCPIP_MBOX_SIZE 8 From acb35785c90a721b0d815f91b38cc4fa9b8d157f Mon Sep 17 00:00:00 2001 From: Adam Green Date: Sat, 24 Aug 2013 14:10:30 -0700 Subject: [PATCH 4/8] net: Only process 1 packet per ethernet RX interrupt Previously the packet_rx() function would wait on the RxSem and when signalled it would process all available inbound packets. This used to cause no problem but once the thread synchronization was turned on via SYS_LIGHTWEIGHT_PROT, the semaphore actually started to overflow its maximum token count of 65535. This caused the mbed_die() flashing LEDs of death. The old code was really breaking the producer/consumer pattern that I typically see with a semaphore since the consumer was written to consume more than 1 produced object per semaphore wait. Before the thread synchronization was enabled, the packet_rx() thread could use a single time slice to process all of these packets and then loop back around a few more times to decrement the semaphore count while skipping the packet processing since it had all been done. Now the packet processing code would cause the thread to give up its time slice as it hit newly enabled critical sections. In the end it was possible for the code to leak 2 semaphore signals for every 1 by which the thread was awaken. After about 10 seconds of load, this would cause a leak of 65535 signals. NOTE: Two potential issues with this change: 1) The LPC_EMAC->RxConsumeIndex != LPC_EMAC->RxProduceIndex check was removed from packet_rx(). I believe that this is Ok since the same condition is later checked in lpc_low_level_input() anyway so it won't now try to process more packets than what exist. 2) What if ENET_IRQHandler(void) ends up not signalling the RxSem for every packet received? When would that happen? I could see it happening if the ethernet hardware would try to pend more than 1 interrupt when the priority was too elevated to process the pending requests. Putting the consumer loop back in packet_rx() and using a Signal instead of a Semaphore might be a better solution? --- libraries/net/eth/lwip-eth/arch/lpc17_emac.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c index 243f9930a04..18d577144dc 100644 --- a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c +++ b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c @@ -809,9 +809,8 @@ static void packet_rx(void* pvParameters) { /* Wait for receive task to wakeup */ sys_arch_sem_wait(&lpc_enetif->RxSem, 0); - /* Process packets until all empty */ - while (LPC_EMAC->RxConsumeIndex != LPC_EMAC->RxProduceIndex) - lpc_enetif_input(lpc_enetif->netif); + /* Process packet for this semaphore signal */ + lpc_enetif_input(lpc_enetif->netif); } } From de8161fde126ba36eaea9bd8f2f44a43d635c2c6 Mon Sep 17 00:00:00 2001 From: Adam Green Date: Sat, 24 Aug 2013 14:48:12 -0700 Subject: [PATCH 5/8] net: Reset pbuf length when re-queueing on error. I recently pulled a NXP crash fix for their ethernet driver which will requeue a pbuf to the ethernet driver rather than sending it to the lwip stack if it can't allocate a new pbuf to keep the ethernet hardware primed with available packet buffers. While recently reviewing this code I noticed that the full size of the pbuf wasn't used on this re-queueing operation but the size of the last received packet. I now reset the pbuf size back to its originally allocated size before doing this requeue operation. --- libraries/net/eth/lwip-eth/arch/lpc17_emac.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c index 18d577144dc..e082fd495fa 100644 --- a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c +++ b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c @@ -346,6 +346,7 @@ static struct pbuf *lpc_low_level_input(struct netif *netif) struct lpc_enetdata *lpc_enetif = netif->state; struct pbuf *p = NULL; u32_t idx, length; + u16_t origLength; #ifdef LOCK_RX_THREAD #if NO_SYS == 0 @@ -428,6 +429,7 @@ static struct pbuf *lpc_low_level_input(struct netif *netif) /* Zero-copy */ p = lpc_enetif->rxb[idx]; + origLength = p->len; p->len = (u16_t) length; /* Free pbuf from descriptor */ @@ -440,6 +442,7 @@ static struct pbuf *lpc_low_level_input(struct netif *netif) LINK_STATS_INC(link.drop); /* Re-queue the pbuf for receive */ + p->len = origLength; lpc_rxqueue_pbuf(lpc_enetif, p); LWIP_DEBUGF(UDP_LPC_EMAC | LWIP_DBG_TRACE, From 2bed9964627c52b4ce54159fa724ed5e61623046 Mon Sep 17 00:00:00 2001 From: Adam Green Date: Tue, 27 Aug 2013 22:24:47 -0700 Subject: [PATCH 6/8] Revert "net: Only process 1 packet per ethernet RX interrupt" This reverts commit acb35785c90a721b0d815f91b38cc4fa9b8d157f. It turns out that this commit actually causes problems if an ethernet interrupt is dropped because a higher privilege task is running, such as LocalFileSystem accesses. If this happens, the semaphore count isn't incremented enough times and the packet_rx() thread will fall behind and end up running as though it had only one ethernet receive buffer. This causes even more lost packets. I plan to fix this by switching the semaphore to be a signal so that the syncronization object is more boolean. It simply indicates if an interrupt has arrived since the last time packet_rx() was awaken to process inbound packets. --- libraries/net/eth/lwip-eth/arch/lpc17_emac.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c index e082fd495fa..9bace5f80b8 100644 --- a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c +++ b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c @@ -812,8 +812,9 @@ static void packet_rx(void* pvParameters) { /* Wait for receive task to wakeup */ sys_arch_sem_wait(&lpc_enetif->RxSem, 0); - /* Process packet for this semaphore signal */ - lpc_enetif_input(lpc_enetif->netif); + /* Process packets until all empty */ + while (LPC_EMAC->RxConsumeIndex != LPC_EMAC->RxProduceIndex) + lpc_enetif_input(lpc_enetif->netif); } } From 8cf3e658d178aab5f0049c634ab83e49516b3a35 Mon Sep 17 00:00:00 2001 From: Adam Green Date: Tue, 27 Aug 2013 23:38:42 -0700 Subject: [PATCH 7/8] Don't use semaphore from ENET_IRQHandler to packet_rx I now use a signal to communicate when a packet has been received by the ethernet hardware and should be processed by the packet_rx thread. Previously the change to make the lwIP stack thread safe introduced enough delay in packet_rx that the semaphore count could lag behind the processed packets and overflow its maximum token count. Now the ISR uses the signal to indicate that >= 1 packet has been received since the last time packet_rx() was awaken. Previously the ethernet driver used generic sys_arch* APIs exposed from lwIP to manipulate the semaphores. I now call CMSIS RTOS APIs directly when using the signals. I think this is acceptable since that same driver source file already contains similar os* calls that talk directly to the RTOS. --- libraries/net/eth/lwip-eth/arch/lpc17_emac.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c index 9bace5f80b8..093f8a6ca34 100644 --- a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c +++ b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c @@ -88,6 +88,10 @@ */ #define TXINTGROUP (EMAC_INT_TX_UNDERRUN | EMAC_INT_TX_ERR | EMAC_INT_TX_DONE) +/** \brief Signal used for ethernet ISR to signal packet_rx() thread. + */ +#define RX_SIGNAL 1 + #else #define RXINTGROUP 0 #define TXINTGROUP 0 @@ -123,7 +127,7 @@ struct lpc_enetdata { struct pbuf *txb[LPC_NUM_BUFF_TXDESCS]; /**< TX pbuf pointer list, zero-copy mode */ u32_t lpc_last_tx_idx; /**< TX last descriptor index, zero-copy mode */ #if NO_SYS == 0 - sys_sem_t RxSem; /**< RX receive thread wakeup semaphore */ + sys_thread_t RxThread; /**< RX receive thread data object pointer */ sys_sem_t TxCleanSem; /**< TX cleanup thread wakeup semaphore */ sys_mutex_t TXLockMutex; /**< TX critical section mutex */ sys_sem_t xTXDCountSem; /**< TX free buffer counting semaphore */ @@ -783,8 +787,8 @@ void ENET_IRQHandler(void) ints = LPC_EMAC->IntStatus; if (ints & RXINTGROUP) { - /* RX group interrupt(s): Give semaphore to wakeup RX receive task.*/ - sys_sem_signal(&lpc_enetdata.RxSem); + /* RX group interrupt(s): Give signal to wakeup RX receive task.*/ + osSignalSet(lpc_enetdata.RxThread->id, RX_SIGNAL); } if (ints & TXINTGROUP) { @@ -810,7 +814,7 @@ static void packet_rx(void* pvParameters) { while (1) { /* Wait for receive task to wakeup */ - sys_arch_sem_wait(&lpc_enetif->RxSem, 0); + osSignalWait(RX_SIGNAL, osWaitForever); /* Process packets until all empty */ while (LPC_EMAC->RxConsumeIndex != LPC_EMAC->RxProduceIndex) @@ -1096,9 +1100,8 @@ err_t lpc_enetif_init(struct netif *netif) LWIP_ASSERT("TXLockMutex creation error", (err == ERR_OK)); /* Packet receive task */ - err = sys_sem_new(&lpc_enetdata.RxSem, 0); - LWIP_ASSERT("RxSem creation error", (err == ERR_OK)); - sys_thread_new("receive_thread", packet_rx, netif->state, DEFAULT_THREAD_STACKSIZE, RX_PRIORITY); + lpc_enetdata.RxThread = sys_thread_new("receive_thread", packet_rx, netif->state, DEFAULT_THREAD_STACKSIZE, RX_PRIORITY); + LWIP_ASSERT("RxThread creation error", (lpc_enetdata.RxThread)); /* Transmit cleanup task */ err = sys_sem_new(&lpc_enetdata.TxCleanSem, 0); From b0c0f47c7d3bb4965e0c3e631e37a416723c2310 Mon Sep 17 00:00:00 2001 From: Adam Green Date: Tue, 27 Aug 2013 23:57:24 -0700 Subject: [PATCH 8/8] Changed leading whitespace back to tab. The leading whitespace preceeding the fields in the lpc_enetdata structure definition were originally a tab and I used 4 spaces when I added RxThread. --- libraries/net/eth/lwip-eth/arch/lpc17_emac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c index 093f8a6ca34..729840f6362 100644 --- a/libraries/net/eth/lwip-eth/arch/lpc17_emac.c +++ b/libraries/net/eth/lwip-eth/arch/lpc17_emac.c @@ -127,7 +127,7 @@ struct lpc_enetdata { struct pbuf *txb[LPC_NUM_BUFF_TXDESCS]; /**< TX pbuf pointer list, zero-copy mode */ u32_t lpc_last_tx_idx; /**< TX last descriptor index, zero-copy mode */ #if NO_SYS == 0 - sys_thread_t RxThread; /**< RX receive thread data object pointer */ + sys_thread_t RxThread; /**< RX receive thread data object pointer */ sys_sem_t TxCleanSem; /**< TX cleanup thread wakeup semaphore */ sys_mutex_t TXLockMutex; /**< TX critical section mutex */ sys_sem_t xTXDCountSem; /**< TX free buffer counting semaphore */