[sr-dev] [kamailio/kamailio] Kamailio freezes with DMQ and dialog (#2535)

pwakano notifications at github.com
Tue Nov 3 02:06:38 CET 2020


The investigation done so far regarding the dead lock shows that 2 process entered a dead lock, a udp worker and the timer process.
The timer process was trying to delete a dialog reference, and got the dialog lock (d_table->entry lock) and then tried to get the DMQ node list lock.
On the other hand, the UDP worker was trying to broadcast a DMQ message, got the DMQ node list lock and then tried to get the d_table entry lock for the same dialog as the timer process.
```
N	21068	main process - attendant
Y	21247	udp receiver child=0 sock=124.47.168.242:5060   TRYING TO GET LOCK  0x7f03675bdc48
Y	21248	udp receiver child=1 sock=124.47.168.242:5060   TRYING TO GET LOCK  0x7f03675bdc48
Y	21249	udp receiver child=2 sock=124.47.168.242:5060   TRYING TO GET LOCK  0x7f03675bdc48
Y	21250	udp receiver child=3 sock=124.47.168.242:5060   TRYING TO GET LOCK  0x7f03675bdc48
Y	21251	udp receiver child=4 sock=124.47.168.242:5060   TRYING TO GET LOCK  0x7f03675bdc48
Y	21252	udp receiver child=5 sock=124.47.168.242:5060   TRYING TO GET LOCK  0x7f03675bdc48
Y	21253	udp receiver child=6 sock=124.47.168.242:5060   TRYING TO GET LOCK  0x7f036822441c		HAS LOCK 0x7f03675bdc48
Y	21254	udp receiver child=7 sock=124.47.168.242:5060
Y	21255	udp receiver child=0 sock=10.51.1.72:5060
Y	21257	udp receiver child=1 sock=10.51.1.72:5060
Y	21258	udp receiver child=2 sock=10.51.1.72:5060
Y	21260	udp receiver child=3 sock=10.51.1.72:5060
Y	21261	udp receiver child=4 sock=10.51.1.72:5060
Y	21262	udp receiver child=5 sock=10.51.1.72:5060
Y	21263	udp receiver child=6 sock=10.51.1.72:5060
Y	21264	udp receiver child=7 sock=10.51.1.72:5060
N	21265	slow timer
Y	21266	timer   TRYING TO GET LOCK  0x7f03675bdc48		HAS LOCK 0x7f036822441c
N	21267	secondary timer
N	21268	ctl handler
N	21269	JSONRPCS FIFO
Y	21270	DMQ WORKER  TRYING TO GET LOCK  0x7f03675bdcd4
Y	21271	DMQ WORKER  TRYING TO GET LOCK  0x7f03675bdcec
Y	21273	Dialog KA Timer
Y	21275	Dialog Clean Timer TRYING TO GET LOCK   0x7f036822441c
N	21278	WEBSOCKET KEEPALIVE
N	21279	WEBSOCKET TIMER
N	21281	TIMER NH
N	21282	TIMER NH
Y	21283	TIMER UAC REG
N	21284	tcp receiver (generic) child=0
N	21285	tcp receiver (generic) child=1
N	21286	tcp receiver (generic) child=2
N	21287	tcp receiver (generic) child=3
N	21289	tcp receiver (generic) child=4
N	21291	tcp receiver (generic) child=5
N	21292	tcp receiver (generic) child=6
N	21293	tcp receiver (generic) child=7
N	21294	tcp main process
```
21266 # timer BT:
```
#0  0x00007f0386d30bf9 in syscall () from /lib64/libc.so.6
#1  0x00007f035d4effa9 in futex_get (lock=0x7f03675bdc48) at ../../core/futexlock.h:121 <- TRYING TO GET LOCK 0x7f03675bdc48
#2  0x00007f035d4f205b in bcast_dmq_message1 (peer=0x7f0368231ff0, body=0x7ffe95bc0920, except=0x0, resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>, incl_inactive=0)
    at dmq_funcs.c:156
#3  0x00007f035d4f28d4 in bcast_dmq_message (peer=0x7f0368231ff0, body=0x7ffe95bc0920, except=0x0, resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>) at dmq_funcs.c:188
#4  0x00007f035c96704d in dlg_dmq_send (body=0x7ffe95bc0920, node=0x0) at dlg_dmq.c:88
#5  0x00007f035c9709d5 in dlg_dmq_replicate_action (action=DLG_DMQ_RM, dlg=0x7f03684b77f8, needlock=0, node=0x0) at dlg_dmq.c:628
#6  0x00007f035c90a39a in destroy_dlg (dlg=0x7f03684b77f8) at dlg_hash.c:367
#7  0x00007f035c914064 in dlg_unref_helper (dlg=0x7f03684b77f8, cnt=2, fname=0x7f035c9b222d "dlg_handlers.c", fline=1212) at dlg_hash.c:1081 << --- GOT LOCK 0x7f036822441c

		(gdb) p *d_entry
		$2 = {first = 0x7f0369205248, last = 0x7f036855c680, next_id = 30442, lock = {val = 2}, locker_pid = {val = 21266}, rec_lock_level = 0}
	
		(gdb) p dlg->h_entry
		$3 = 2340
	
		(gdb) p &d_entry->lock
		$5 = (gen_lock_t *) 0x7f036822441c


#8  0x00007f035c945bd2 in unref_dlg_from_cb (t=0x7f036936b678, type=131072, param=0x7ffe95bc0e20) at dlg_handlers.c:1212
#9  0x00007f03612d3165 in run_trans_callbacks_internal (cb_lst=0x7f036936b6f0, type=131072, trans=0x7f036936b678, params=0x7ffe95bc0e20) at t_hooks.c:258
#10 0x00007f03612d3297 in run_trans_callbacks (type=131072, trans=0x7f036936b678, req=0x0, rpl=0x0, code=0) at t_hooks.c:285
#11 0x00007f03611f9b8f in free_cell_helper (dead_cell=0x7f036936b678, silent=0, fname=0x7f0361313363 "timer.c", fline=643) at h_table.c:165
#12 0x00007f03612bfc4d in wait_handler (ti=1856451739, wait_tl=0x7f036936b700, data=0x7f036936b678) at timer.c:643
#13 0x00000000004ea094 in timer_list_expire (t=1856451739, h=0x7f036587d380, slow_l=0x7f036587f948, slow_mark=5689) at core/timer.c:857
#14 0x00000000004ea53d in timer_handler () at core/timer.c:922
#15 0x00000000004ea9e7 in timer_main () at core/timer.c:961
#16 0x000000000042a838 in main_loop () at main.c:1753
#17 0x0000000000433a96 in main (argc=10, argv=0x7ffe95bc1968) at main.c:2856


```
21253 # udp receiver child=6 sock=124.47.168.242:5060 BT:
```
#0  0x00007f0386d30bf9 in syscall () from /lib64/libc.so.6
#1  0x00007f035c9061a7 in futex_get (lock=0x7f036822441c) at ../../core/futexlock.h:121
#2  0x00007f035c90ee00 in dlg_lookup_mode (h_entry=2340, h_id=30422, lmode=0) at dlg_hash.c:781 <- TRYING TO GET LOCK KHERE

		(gdb) p *d_entry
		$1 = {first = 0x7f0369205248, last = 0x7f036855c680, next_id = 30442, lock = {val = 2}, locker_pid = {val = 21266}, 
		  rec_lock_level = 0}

		(gdb) p h_entry
		$2 = 2340

#3  0x00007f035c90fa27 in dlg_get_by_iuid (diuid=0x7f035cbcbd40 <_dlg_ctx+64>) at dlg_hash.c:849
#4  0x00007f035c9abfd1 in dlg_get_ctx_dialog () at dlg_var.c:940
#5  0x00007f035c945c03 in dlg_lookup_msg_dialog (msg=0x7ffe95bbc8c0, dir=0x7ffe95bbc8bc) at dlg_handlers.c:1227
#6  0x00007f035c956f77 in dlg_cseq_msg_sent (evp=0x7ffe95bbd2d0) at dlg_cseq.c:393
#7  0x00000000004ec14b in sr_event_exec (type=2, evp=0x7ffe95bbd2d0) at core/events.c:240
#8  0x00007f03612c1e88 in msg_send_buffer (dst=0x7f036a0a4870, 
    buf=0x7f036bfa1af8 "KDMQ sip:dialog at 10.51.1.71:5060 SIP/2.0\r\nVia: SIP/2.0/UDP 10.51.1.72;branch=z9hG4bK6492.a407a601", '0' <repeats 24 times>, ".0\r\nTo: <sip:dialog at 10.51.1.71:5060>\r\nFrom: <sip:dialog at 10.51.1.72:5060>;tag=cf4"..., len=703, flags=0) at ../../core/forward.h:148
#9  0x00007f03612c743e in send_pr_buffer (rb=0x7f036a0a4820, buf=0x7f036bfa1af8, len=703) at t_funcs.c:69
#10 0x00007f036128d315 in send_prepared_request_impl (request=0x7f036a0a4820, retransmit=1, branch=0) at uac.c:669
#11 0x00007f036128e89e in t_uac_with_ids (uac_r=0x7ffe95bbd8f0, ret_index=0x0, ret_label=0x0) at uac.c:753
#12 0x00007f036128e7ca in t_uac (uac_r=0x7ffe95bbd8f0) at uac.c:721
#13 0x00007f0361291a62 in request (uac_r=0x7ffe95bbd8f0, ruri=0x7ffe95bbd8c0, to=0x7ffe95bbd8c0, 
    from=0x7ffe95bbd8d0, next_hop=0x0) at uac.c:1089
#14 0x00007f035d4f4053 in dmq_send_message (peer=0x7f0368231ff0, body=0x7ffe95bbdca0, node=0x7f0368464068, 
    resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, 
    content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>) at dmq_funcs.c:251
#15 0x00007f035d4f24ff in bcast_dmq_message1 (peer=0x7f0368231ff0, body=0x7ffe95bbdca0, except=0x0, 
    resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, 
    content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>, incl_inactive=0) at dmq_funcs.c:170   <----- GOT LOCK 0x7f03675bdc48

		(gdb) p &dmq_node_list->lock
		$1 = (gen_lock_t *) 0x7f03675bdc48

#16 0x00007f035d4f28d4 in bcast_dmq_message (peer=0x7f0368231ff0, body=0x7ffe95bbdca0, except=0x0, 
    resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, 
    content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>) at dmq_funcs.c:188
#17 0x00007f035c96704d in dlg_dmq_send (body=0x7ffe95bbdca0, node=0x0) at dlg_dmq.c:88
#18 0x00007f035c9709d5 in dlg_dmq_replicate_action (action=DLG_DMQ_STATE, dlg=0x7f0369205248, needlock=0, node=0x0)
    at dlg_dmq.c:628
#19 0x00007f035c94ba78 in dlg_onroute (req=0x7f03860006e0, route_params=0x7ffe95bbe030, param=0x0)
    at dlg_handlers.c:1559
#20 0x00007f0360b665c6 in run_rr_callbacks (req=0x7f03860006e0, rr_param=0x7ffe95bbe120) at rr_cb.c:96
#21 0x00007f0360b799b2 in after_loose (_m=0x7f03860006e0, preloaded=0) at loose.c:984
#22 0x00007f0360b7a0dc in loose_route (_m=0x7f03860006e0) at loose.c:1018
#23 0x00007f0360b7e240 in w_loose_route (msg=0x7f03860006e0, p1=0x0, p2=0x0) at rr_mod.c:276
#24 0x00000000005d0bc5 in do_action (h=0x7ffe95bbecd0, a=0x7f0385daa880, msg=0x7f03860006e0) at core/action.c:1076
#25 0x00000000005ddf78 in run_actions (h=0x7ffe95bbecd0, a=0x7f0385daa880, msg=0x7f03860006e0) at core/action.c:1581
#26 0x00000000005de5f4 in run_actions_safe (h=0x7ffe95bc09f0, a=0x7f0385daa880, msg=0x7f03860006e0)
    at core/action.c:1645
#27 0x0000000000446834 in rval_get_int (h=0x7ffe95bc09f0, msg=0x7f03860006e0, i=0x7ffe95bbf5fc, rv=0x7f0385daaac0, 
    cache=0x0) at core/rvalue.c:915
#28 0x000000000044b365 in rval_expr_eval_int (h=0x7ffe95bc09f0, msg=0x7f03860006e0, res=0x7ffe95bbf5fc, 
    rve=0x7f0385daaab8) at core/rvalue.c:1913
#29 0x00000000005d06a6 in do_action (h=0x7ffe95bc09f0, a=0x7f0385db3280, msg=0x7f03860006e0) at core/action.c:1052
#30 0x00000000005ddf78 in run_actions (h=0x7ffe95bc09f0, a=0x7f0385da9ee8, msg=0x7f03860006e0) at core/action.c:1581
#31 0x00000000005cd315 in do_action (h=0x7ffe95bc09f0, a=0x7f0385d24ea0, msg=0x7f03860006e0) at core/action.c:700
#32 0x00000000005ddf78 in run_actions (h=0x7ffe95bc09f0, a=0x7f0385d24c68, msg=0x7f03860006e0) at core/action.c:1581
#33 0x00000000005d0b34 in do_action (h=0x7ffe95bc09f0, a=0x7f0385d3e420, msg=0x7f03860006e0) at core/action.c:1067
#34 0x00000000005ddf78 in run_actions (h=0x7ffe95bc09f0, a=0x7f0385d1ae68, msg=0x7f03860006e0) at core/action.c:1581
#35 0x00000000005de6bc in run_top_route (a=0x7f0385d1ae68, msg=0x7f03860006e0, c=0x0) at core/action.c:1666
#36 0x00000000005e5c40 in receive_msg (
    buf=0xabcb80 <buf.7133> "ACK sip:61283214906 at 10.51.1.50:5060;alias=10.51.1.50~5060~1 SIP/2.0\r\nVia: SIP/2.0/UDP 124.47.168.253:6080;branch=z9hG4bK-18807-894-4\r\nFrom: test_894 <sip:61289993111 at 124.47.168.253:6080>;tag=18807SIPp"..., len=724, rcv_info=0x7ffe95bc10e0) at core/receive.c:423
#37 0x00000000004c1442 in udp_rcv_loop () at core/udp_server.c:543
#38 0x0000000000429d47 in main_loop () at main.c:1683
#39 0x0000000000433a96 in main (argc=10, argv=0x7ffe95bc1968) at main.c:2856
```

Full kamctl trap:
[gdb_kamailio_20201102_110450.txt](https://github.com/kamailio/kamailio/files/5478787/gdb_kamailio_20201102_110450.txt)


-- 
You are receiving this because you are subscribed to this thread.
Reply to this email directly or view it on GitHub:
https://github.com/kamailio/kamailio/issues/2535#issuecomment-720814149
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.kamailio.org/pipermail/sr-dev/attachments/20201102/db74425c/attachment-0001.htm>


More information about the sr-dev mailing list