I'm on the latest 4.1 clone and seem to occasionally run into this crash:
#0 0x00007f7bd49e15ce in update_dlg_timer (tl=0x58, timeout=10) at dlg_timer.c:203 #1 0x00007f7bd49cf35a in dlg_clean_run (ti=73979985) at dlg_hash.c:253 #2 0x00007f7bd49b90ec in dlg_clean_timer_exec (ticks=73979985, param=0x0) at dialog.c:1246 #3 0x0000000000537091 in fork_sync_timer (child_id=-1, desc=0x7f7bd49ec431 "Dialog Clean Timer", make_sock=1, f=0x7f7bd49b90d3 <dlg_clean_timer_exec>, param=0x0, interval=90) at timer_proc.c:232 #4 0x00007f7bd49b5b7b in child_init (rank=0) at dialog.c:733
The address of 'tl' appears to be bogus; 0x58 doesn't look like a valid 64-bit virtual memory address to me.
I'm investigating further to see if I can track it down, but I don't have much more information right now.
I have about five more crashes like this, and they all have tl=0x58. I don't know where that value is coming from.
On 09/25/2014 01:59 PM, Alex Balashov wrote:
I'm on the latest 4.1 clone and seem to occasionally run into this crash:
#0 0x00007f7bd49e15ce in update_dlg_timer (tl=0x58, timeout=10) at dlg_timer.c:203 #1 0x00007f7bd49cf35a in dlg_clean_run (ti=73979985) at dlg_hash.c:253 #2 0x00007f7bd49b90ec in dlg_clean_timer_exec (ticks=73979985, param=0x0) at dialog.c:1246 #3 0x0000000000537091 in fork_sync_timer (child_id=-1, desc=0x7f7bd49ec431 "Dialog Clean Timer", make_sock=1, f=0x7f7bd49b90d3 <dlg_clean_timer_exec>, param=0x0, interval=90) at timer_proc.c:232 #4 0x00007f7bd49b5b7b in child_init (rank=0) at dialog.c:733
The address of 'tl' appears to be bogus; 0x58 doesn't look like a valid 64-bit virtual memory address to me.
I'm investigating further to see if I can track it down, but I don't have much more information right now.
Can you send the output of 'bt full'?
Cheers, Daniel
On 25/09/14 20:02, Alex Balashov wrote:
I have about five more crashes like this, and they all have tl=0x58. I don't know where that value is coming from.
On 09/25/2014 01:59 PM, Alex Balashov wrote:
I'm on the latest 4.1 clone and seem to occasionally run into this crash:
#0 0x00007f7bd49e15ce in update_dlg_timer (tl=0x58, timeout=10) at dlg_timer.c:203 #1 0x00007f7bd49cf35a in dlg_clean_run (ti=73979985) at dlg_hash.c:253 #2 0x00007f7bd49b90ec in dlg_clean_timer_exec (ticks=73979985, param=0x0) at dialog.c:1246 #3 0x0000000000537091 in fork_sync_timer (child_id=-1, desc=0x7f7bd49ec431 "Dialog Clean Timer", make_sock=1, f=0x7f7bd49b90d3 <dlg_clean_timer_exec>, param=0x0, interval=90) at timer_proc.c:232 #4 0x00007f7bd49b5b7b in child_init (rank=0) at dialog.c:733
The address of 'tl' appears to be bogus; 0x58 doesn't look like a valid 64-bit virtual memory address to me.
I'm investigating further to see if I can track it down, but I don't have much more information right now.
On 09/25/2014 03:48 PM, Daniel-Constantin Mierla wrote:
Can you send the output of 'bt full'?
Sure:
#0 0x00007f7bd49e15ce in update_dlg_timer (tl=0x58, timeout=10) at dlg_timer.c:203 __FUNCTION__ = "update_dlg_timer" #1 0x00007f7bd49cf35a in dlg_clean_run (ti=73979985) at dlg_hash.c:253 i = 900 tm = 1411630026 dlg = 0x0 tdlg = 0x7f7a53ca4f78 __FUNCTION__ = "dlg_clean_run" #2 0x00007f7bd49b90ec in dlg_clean_timer_exec (ticks=73979985, param=0x0) at dialog.c:1246 No locals. #3 0x0000000000537091 in fork_sync_timer (child_id=-1, desc=0x7f7bd49ec431 "Dialog Clean Timer", make_sock=1, f=0x7f7bd49b90d3 <dlg_clean_timer_exec>, param=0x0, interval=90) at timer_proc.c:232 pid = 0 ts1 = 73979985 ts2 = 90 #4 0x00007f7bd49b5b7b in child_init (rank=0) at dialog.c:733 __FUNCTION__ = "child_init" #5 0x00000000004f854a in init_mod_child (m=0x7f7bd8454118, rank=0) at sr_module.c:924 __FUNCTION__ = "init_mod_child" #6 0x00000000004f83ed in init_mod_child (m=0x7f7bd8454ed8, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" #7 0x00000000004f83ed in init_mod_child (m=0x7f7bd8455750, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" #8 0x00000000004f83ed in init_mod_child (m=0x7f7bd8455e60, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" #9 0x00000000004f83ed in init_mod_child (m=0x7f7bd8456298, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" #10 0x00000000004f83ed in init_mod_child (m=0x7f7bd8456958, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" #11 0x00000000004f83ed in init_mod_child (m=0x7f7bd8456dc0, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" #12 0x00000000004f83ed in init_mod_child (m=0x7f7bd84573f0, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" #13 0x00000000004f83ed in init_mod_child (m=0x7f7bd8457ac8, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" #14 0x00000000004f83ed in init_mod_child (m=0x7f7bd84580b8, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" #15 0x00000000004f83ed in init_mod_child (m=0x7f7bd8458568, rank=0) at sr_module.c:921 __FUNCTION__ = "init_mod_child" 
#16 0x00000000004f86d0 in init_child (rank=0) at sr_module.c:948 No locals. #17 0x000000000046d8b8 in main_loop () at main.c:1696 i = 8 pid = 15067 si = 0x0 si_desc = "udp receiver child=7 sock=xxx.xxx.xxx.xxx:5060\000\177\000\000 \220f\330{\177\000\000N\207^\000\000\000\000\000\060y^\000\000\000\000\000\230\217-\027\000\000\000\000\300LA\000\000\000\000\000\200\031\350%\377\177", '\000' <repeats 18 times>"\300, \027\350%\377\177\000\000t\246K\000\000\000\000" nrprocs = 8 __FUNCTION__ = "main_loop" #18 0x00000000004706a7 in main (argc=11, argv=0x7fff25e81988) at main.c:2547 cfg_stream = 0x7f7bd8302010 c = -1 r = 0 tmp = 0x7fff25e828e0 "" tmp_len = 0 port = 0 proto = 0 options = 0x5e0c88 ":f:cm:M:dVIhEeb:l:L:n:vKrRDTN:W:w:t:u:g:P:G:SQ:O:a:A:" ret = -1 seed = 2825371499 rfd = 4 debug_save = 0 debug_flag = 0 dont_fork_cnt = 0 n_lst = 0x34f580fb88 p = 0x5cad30 "H\211l$\330L\211d$\340H\215-\277\242*" __FUNCTION__ = "main"
It seems to me that the real problem is this:
(gdb) frame 1
#1 0x00007f7bd49cf35a in dlg_clean_run (ti=73979985) at dlg_hash.c:253
253 if(update_dlg_timer(&dlg->tl, 10)<0) {
(gdb) print dlg
$1 = (dlg_cell_t *) 0x0
In this loop...
int dlg_clean_run(ticks_t ti) { unsigned int i; unsigned int tm; dlg_cell_t *dlg; dlg_cell_t *tdlg;
tm = (unsigned int)time(NULL); for(i=0; i<d_table->size; i++) { lock_set_get(d_table->locks, d_table->entries[i].lock_idx); dlg = d_table->entries[i].first; while (dlg) { tdlg = dlg; dlg = dlg->next; if(tdlg->state==DLG_STATE_UNCONFIRMED && tdlg->init_ts<tm-300) { /* dialog in early state older than 5min */ LM_NOTICE("dialog in early state is too old (%p ref %d)\n", tdlg, tdlg->ref); unlink_unsafe_dlg(&d_table->entries[i], tdlg); destroy_dlg(tdlg); } if(tdlg->state==DLG_STATE_CONFIRMED_NA && tdlg->start_ts<tm-60) { if(update_dlg_timer(&dlg->tl, 10)<0) { LM_ERR("failed to update dialog lifetime in long non-ack state\n"); } dlg->lifetime = 10; dlg->dflags |= DLG_FLAG_CHANGED; } }
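Should we, perhaps, in that last conditional block (if(tdlg->state==DLG_STATE_CONFIRMED_NA && tdlg->start_ts<tm-60)), be operating on 'tdlg' rather than 'dlg', much as in the previous if block? By that point 'dlg' has already been advanced to dlg->next, so it is NULL whenever 'tdlg' is the last dialog in its hash bucket, which would explain why &dlg->tl evaluates to the bogus address 0x58. Is that a typo/oversight/mistake/legacy code?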
Good spot, a typo/copy&paste oversight ... going to push the patch.
Cheers, Daniel
On 25/09/14 23:57, Alex Balashov wrote:
In this loop...
int dlg_clean_run(ticks_t ti) { unsigned int i; unsigned int tm; dlg_cell_t *dlg; dlg_cell_t *tdlg;
tm = (unsigned int)time(NULL); for(i=0; i<d_table->size; i++) { lock_set_get(d_table->locks,
d_table->entries[i].lock_idx); dlg = d_table->entries[i].first; while (dlg) { tdlg = dlg; dlg = dlg->next; if(tdlg->state==DLG_STATE_UNCONFIRMED && tdlg->init_ts<tm-300) { /* dialog in early state older than 5min */ LM_NOTICE("dialog in early state is too old (%p ref %d)\n", tdlg, tdlg->ref); unlink_unsafe_dlg(&d_table->entries[i], tdlg); destroy_dlg(tdlg); } if(tdlg->state==DLG_STATE_CONFIRMED_NA && tdlg->start_ts<tm-60) { if(update_dlg_timer(&dlg->tl, 10)<0) { LM_ERR("failed to update dialog lifetime in long non-ack state\n"); } dlg->lifetime = 10; dlg->dflags |= DLG_FLAG_CHANGED; } }
Should we, perhaps, in that last conditional block (if(tdlg->state==DLG_STATE_CONFIRMED_NA && tdlg->start_ts<tm-60)), be operating on 'tdlg' rather than 'dlg', much as in the previous if block? Is that a typo/oversight/mistake/legacy code?