[sr-dev] [kamailio/kamailio] Kamailio freezes with DMQ and dialog (#2535)

pwakano notifications at github.com
Wed Nov 4 05:54:03 CET 2020


Hi Daniel, we've run the test, but unfortunately the problem happened again after about 5 hours of test.
The UDP receiver had the DMQ lock (0x7f28bd321478) and tried to get the dialog lock(0x7f28bdf8b5b4), and the Timer process got the dialog lock (0x7f28bdf8b5b4) and tried to get the DMQ worker lock (0x7f28bd321478)...

Check the backtrace of the processes in deadlock:

1331    udp receiver child=3 sock=45.116.131.61:5060
```
#0  0x00007f28dca971c9 in syscall () from /lib64/libc.so.6
#1  0x00007f28b269e5cd in futex_get (lock=0x7f28bdf8b5b4) at ../../core/mem/../futexlock.h:121
#2  0x00007f28b26a7226 in dlg_lookup_mode (h_entry=2827, h_id=70761, lmode=0) at dlg_hash.c:781
#3  0x00007f28b26a7e4d in dlg_get_by_iuid (diuid=0x7f28b2958d40 <_dlg_ctx+64>) at dlg_hash.c:849
#4  0x00007f28b27393e0 in dlg_get_ctx_dialog () at dlg_var.c:940
#5  0x00007f28b26c44fb in dlg_lookup_msg_dialog (msg=0x7ffe79df6bd0, dir=0x7ffe79df6bcc) at dlg_handlers.c:1227
#6  0x00007f28b26e4386 in dlg_cseq_msg_sent (evp=0x7ffe79df75e0) at dlg_cseq.c:400
(gdb) p ((struct to_body*)(&msg)->to->parsed)->tag_value.len
$1 = 0
(gdb) p msg.first_line.u.request.method_value
$2 = 16384

(gdb) p *(((struct to_body*)(&msg)->to->parsed))
$9 = {error = 1, body = {
    s = 0x7f28dbbb0714 "<sip:dialog at 10.61.187.66:5060>\r\nFrom: <sip:dialog at 10.61.187.75:5060>;tag=9bf3c67f7017dfd69402f08ee6e7069c-d6bc8b8a\r\nCSeq: 10 KDMQ\r\nCall-ID: 4a4bd13913afec15-1331 at 45.116.131.61\r\nContent-Length: 310\r\nUs"..., len = 32}, uri = {
    s = 0x7f28dbbb0715 "sip:dialog at 10.61.187.66:5060>\r\nFrom: <sip:dialog at 10.61.187.75:5060>;tag=9bf3c67f7017dfd69402f08ee6e7069c-d6bc8b8a\r\nCSeq: 10 KDMQ\r\nCall-ID: 4a4bd13913afec15-1331 at 45.116.131.61\r\nContent-Length: 310\r\nUse"..., len = 28}, display = {s = 0x0, len = 0}, 
  tag_value = {s = 0x0, len = 0}, parsed_uri = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0x0, len = 0}, port = {
      s = 0x0, len = 0}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = {s = 0x0, len = 0}, port_no = 0, 
    proto = 0, type = ERROR_URI_T, flags = (unknown: 0), transport = {s = 0x0, len = 0}, ttl = {s = 0x0, len = 0}, user_param = {s = 0x0, 
      len = 0}, maddr = {s = 0x0, len = 0}, method = {s = 0x0, len = 0}, lr = {s = 0x0, len = 0}, r2 = {s = 0x0, len = 0}, gr = {s = 0x0, 
      len = 0}, transport_val = {s = 0x0, len = 0}, ttl_val = {s = 0x0, len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {
      s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}, gr_val = {s = 0x0, 
      len = 0}}, param_lst = 0x0, last_param = 0x0}

#7  0x0000000000503088 in sr_event_exec (type=2, evp=0x7ffe79df75e0) at core/events.c:240
#8  0x00007f28b7035a43 in msg_send_buffer (dst=0x7f28bf3a8c28, 
    buf=0x7f28bee53140 "KDMQ sip:dialog at 10.61.187.66:5060 SIP/2.0\r\nVia: SIP/2.0/UDP 10.61.187.75;branch=z9hG4bK4646.29f4a7c1", '0' <repeats 24 times>, ".0\r\nTo: <sip:dialog at 10.61.187.66:5060>\r\nFrom: <sip:dialog at 10.61.187.75:5060>"..., len=716, flags=0)
    at ../../core/forward.h:148
#9  0x00007f28b703aff9 in send_pr_buffer (rb=0x7f28bf3a8bd8, buf=0x7f28bee53140, len=716) at t_funcs.c:69
#10 0x00007f28b700d8eb in send_prepared_request_impl (request=0x7f28bf3a8bd8, retransmit=1, branch=0) at uac.c:669
#11 0x00007f28b700ee74 in t_uac_with_ids (uac_r=0x7ffe79df7c00, ret_index=0x0, ret_label=0x0) at uac.c:753
#12 0x00007f28b700eda0 in t_uac (uac_r=0x7ffe79df7c00) at uac.c:721
#13 0x00007f28b7012038 in request (uac_r=0x7ffe79df7c00, ruri=0x7ffe79df7bd0, to=0x7ffe79df7bd0, from=0x7ffe79df7be0, next_hop=0x0)
    at uac.c:1089
#14 0x00007f28b3281053 in dmq_send_message (peer=0x7f28bdf954a8, body=0x7ffe79df7fb0, node=0x7f28be1bf270, 
    resp_cback=0x7f28b2956af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f28b2956ab0 <dlg_dmq_content_type>)
    at dmq_funcs.c:251
#15 0x00007f28b327f4ff in bcast_dmq_message1 (peer=0x7f28bdf954a8, body=0x7ffe79df7fb0, except=0x0, 
    resp_cback=0x7f28b2956af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f28b2956ab0 <dlg_dmq_content_type>, incl_inactive=0)
    at dmq_funcs.c:170
        (gdb) p &dmq_node_list->lock
        $1 = (gen_lock_t *) 0x7f28bd321478

#16 0x00007f28b327f8d4 in bcast_dmq_message (peer=0x7f28bdf954a8, body=0x7ffe79df7fb0, except=0x0, 
    resp_cback=0x7f28b2956af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f28b2956ab0 <dlg_dmq_content_type>)
    at dmq_funcs.c:188
#17 0x00007f28b26f445c in dlg_dmq_send (body=0x7ffe79df7fb0, node=0x0) at dlg_dmq.c:88
#18 0x00007f28b26fdde4 in dlg_dmq_replicate_action (action=DLG_DMQ_STATE, dlg=0x7f28be36a058, needlock=0, node=0x0) at dlg_dmq.c:628
#19 0x00007f28b26ca370 in dlg_onroute (req=0x7f28dbee2088, route_params=0x7ffe79df8340, param=0x0) at dlg_handlers.c:1559
#20 0x00007f28b68f35c6 in run_rr_callbacks (req=0x7f28dbee2088, rr_param=0x7ffe79df8430) at rr_cb.c:96
#21 0x00007f28b69069b2 in after_loose (_m=0x7f28dbee2088, preloaded=0) at loose.c:984
#22 0x00007f28b69070dc in loose_route (_m=0x7f28dbee2088) at loose.c:1018
#23 0x00007f28b690b240 in w_loose_route (msg=0x7f28dbee2088, p1=0x0, p2=0x0) at rr_mod.c:276
#24 0x0000000000472e48 in do_action (h=0x7ffe79df8fe0, a=0x7f28dbb11a38, msg=0x7f28dbee2088) at core/action.c:1076
#25 0x00000000004801fb in run_actions (h=0x7ffe79df8fe0, a=0x7f28dbb11a38, msg=0x7f28dbee2088) at core/action.c:1581
#26 0x0000000000480877 in run_actions_safe (h=0x7ffe79dfad00, a=0x7f28dbb11a38, msg=0x7f28dbee2088) at core/action.c:1645
#27 0x0000000000446834 in rval_get_int (h=0x7ffe79dfad00, msg=0x7f28dbee2088, i=0x7ffe79df990c, rv=0x7f28dbb11c78, cache=0x0)
    at core/rvalue.c:915
#28 0x000000000044b365 in rval_expr_eval_int (h=0x7ffe79dfad00, msg=0x7f28dbee2088, res=0x7ffe79df990c, rve=0x7f28dbb11c70)
    at core/rvalue.c:1913
#29 0x0000000000472929 in do_action (h=0x7ffe79dfad00, a=0x7f28dbb1a438, msg=0x7f28dbee2088) at core/action.c:1052
#30 0x00000000004801fb in run_actions (h=0x7ffe79dfad00, a=0x7f28dbb110a0, msg=0x7f28dbee2088) at core/action.c:1581
#31 0x000000000046f598 in do_action (h=0x7ffe79dfad00, a=0x7f28dba8c058, msg=0x7f28dbee2088) at core/action.c:700
#32 0x00000000004801fb in run_actions (h=0x7ffe79dfad00, a=0x7f28dba8be20, msg=0x7f28dbee2088) at core/action.c:1581
#33 0x0000000000472db7 in do_action (h=0x7ffe79dfad00, a=0x7f28dbaa55d8, msg=0x7f28dbee2088) at core/action.c:1067
#34 0x00000000004801fb in run_actions (h=0x7ffe79dfad00, a=0x7f28dba82020, msg=0x7f28dbee2088) at core/action.c:1581
#35 0x000000000048093f in run_top_route (a=0x7f28dba82020, msg=0x7f28dbee2088, c=0x0) at core/action.c:1666
#36 0x00000000005ecc77 in receive_msg (
    buf=0xabca80 <buf.7134> "ACK sip:61283214906 at 10.61.162.48:5090;alias=10.61.162.48~5090~1 SIP/2.0\r\nVia: SIP/2.0/UDP 45.116.131.37:6060;branch=z9hG4bK-13506-36982-4\r\nFrom: test_36982 <sip:61289993111 at 45.116.131.37:6060>;tag=135"..., len=740, rcv_info=0x7ffe79dfb3f0)
    at core/receive.c:423
#37 0x00000000004d837f in udp_rcv_loop () at core/udp_server.c:543
#38 0x0000000000429d47 in main_loop () at main.c:1683
#39 0x0000000000433a96 in main (argc=12, argv=0x7ffe79dfbc78) at main.c:2856

``` 

1345    timer:
```
#0  0x00007f28dca971c9 in syscall () from /lib64/libc.so.6
#1  0x00007f28b327cfa9 in futex_get (lock=0x7f28bd321478) at ../../modules/tm/../../core/mem/../futexlock.h:121
#2  0x00007f28b327f05b in bcast_dmq_message1 (peer=0x7f28bdf954a8, body=0x7ffe79dfac30, except=0x0, 
    resp_cback=0x7f28b2956af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f28b2956ab0 <dlg_dmq_content_type>, incl_inactive=0)
    at dmq_funcs.c:156
#3  0x00007f28b327f8d4 in bcast_dmq_message (peer=0x7f28bdf954a8, body=0x7ffe79dfac30, except=0x0, 
    resp_cback=0x7f28b2956af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f28b2956ab0 <dlg_dmq_content_type>)
    at dmq_funcs.c:188
#4  0x00007f28b26f445c in dlg_dmq_send (body=0x7ffe79dfac30, node=0x0) at dlg_dmq.c:88
#5  0x00007f28b26fdde4 in dlg_dmq_replicate_action (action=DLG_DMQ_RM, dlg=0x7f28be825d20, needlock=0, node=0x0) at dlg_dmq.c:628
#6  0x00007f28b26a27c0 in destroy_dlg (dlg=0x7f28be825d20) at dlg_hash.c:367
#7  0x00007f28b26ac867 in dlg_unref_helper (dlg=0x7f28be825d20, cnt=2, fname=0x7f28b273d21d "dlg_handlers.c", fline=431) at dlg_hash.c:1095
        (gdb) p &d_entry->lock
        $1 = (gen_lock_t *) 0x7f28bdf8b5b4

#8  0x00007f28b26b9b29 in dlg_ontdestroy (t=0x7f28bef7d488, type=131072, param=0x7ffe79dfb130) at dlg_handlers.c:431
#9  0x00007f28b6fb8286 in run_trans_callbacks_internal (cb_lst=0x7f28bef7d500, type=131072, trans=0x7f28bef7d488, params=0x7ffe79dfb130)
    at t_hooks.c:258
#10 0x00007f28b6fb83b8 in run_trans_callbacks (type=131072, trans=0x7f28bef7d488, req=0x0, rpl=0x0, code=0) at t_hooks.c:285
#11 0x00007f28b6f8c96c in free_cell_helper (dead_cell=0x7f28bef7d488, silent=0, fname=0x7f28b709c2d3 "timer.c", fline=643) at h_table.c:165
#12 0x00007f28b7033808 in wait_handler (ti=241915320, wait_tl=0x7f28bef7d510, data=0x7f28bef7d488) at timer.c:643
#13 0x0000000000500fd1 in timer_list_expire (t=241915320, h=0x7f28bb5e4378, slow_l=0x7f28bb5e80e0, slow_mark=14259) at core/timer.c:857
#14 0x000000000050147a in timer_handler () at core/timer.c:922
#15 0x0000000000501924 in timer_main () at core/timer.c:961
#16 0x000000000042a838 in main_loop () at main.c:1753
#17 0x0000000000433a96 in main (argc=12, argv=0x7ffe79dfbc78) at main.c:2856
 ```

Full kamctl trap:
[gdb_kamailio_20201104_035531.txt](https://github.com/kamailio/kamailio/files/5485522/gdb_kamailio_20201104_035531.txt)


-- 
You are receiving this because you are subscribed to this thread.
Reply to this email directly or view it on GitHub:
https://github.com/kamailio/kamailio/issues/2535#issuecomment-721511024
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.kamailio.org/pipermail/sr-dev/attachments/20201103/aa67395a/attachment-0001.htm>


More information about the sr-dev mailing list