version: kamailio 5.5.6 (x86_64/linux) 21a9bc
Operating System: Debian GNU/Linux 11 (bullseye)
Kernel: Linux 5.10.0-22-amd64
We see this core dump; it has occurred several times, on different days.
```
(gdb) bt
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1 0x00007fde2329f537 in __GI_abort () at abort.c:79
#2 0x000055d5fab45995 in sig_alarm_abort (signo=14) at main.c:699
#3 <signal handler called>
#4 syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:37
#5 0x00007fde1c80a015 in futex_get (lock=0x7fdd9f9c9ca4) at ../../core/mem/../futexlock.h:108
#6 0x00007fde1c80c526 in destroy_linkers (linker=0x0) at dlg_profile.c:275
#7 0x00007fde1c7f87b3 in destroy_dlg (dlg=0x7fddad479ea0) at dlg_hash.c:377
#8 0x00007fde1c7f8ca0 in destroy_dlg_table () at dlg_hash.c:438
#9 0x00007fde1c790286 in mod_destroy () at dialog.c:809
#10 0x000055d5fad76dd8 in destroy_modules () at core/sr_module.c:842
#11 0x000055d5fab440e2 in cleanup (show_status=1) at main.c:575
#12 0x000055d5fab45d45 in shutdown_children (sig=15, show_status=1) at main.c:718
#13 0x000055d5fab49129 in handle_sigs () at main.c:816
#14 0x000055d5fab56959 in main_loop () at main.c:1903
#15 0x000055d5fab602e9 in main (argc=15, argv=0x7ffd9451bbb8) at main.c:3061
(gdb) bt full
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
set = {__val = {8192, 0 <repeats 15 times>}}
pid = <optimized out>
tid = <optimized out>
ret = <optimized out>
#1 0x00007fde2329f537 in __GI_abort () at abort.c:79
save_stage = 1
act = {__sigaction_handler = {sa_handler = 0x0, sa_sigaction = 0x0}, sa_mask = {__val = {0 <repeats 16 times>}}, sa_flags = 0, sa_restorer = 0x55d5faf6b088}
sigs = {__val = {32, 0 <repeats 15 times>}}
#2 0x000055d5fab45995 in sig_alarm_abort (signo=14) at main.c:699
__func__ = "sig_alarm_abort"
#3 <signal handler called>
No locals.
#4 syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:37
No locals.
#5 0x00007fde1c80a015 in futex_get (lock=0x7fdd9f9c9ca4) at ../../core/mem/../futexlock.h:108
v = 2
i = 1024
#6 0x00007fde1c80c526 in destroy_linkers (linker=0x0) at dlg_profile.c:275
p_entry = 0x7fdd9f9c9d88
l = 0x7fddd2aca430
lh = 0x55d5faf81a03
__func__ = "destroy_linkers"
#7 0x00007fde1c7f87b3 in destroy_dlg (dlg=0x7fddad479ea0) at dlg_hash.c:377
ret = 1
var = 0x7fde1c719a44 <mod_destroy+859>
__func__ = "destroy_dlg"
#8 0x00007fde1c7f8ca0 in destroy_dlg_table () at dlg_hash.c:438
dlg = 0x0
l_dlg = 0x7fddad479ea0
i = 0
__func__ = "destroy_dlg_table"
#9 0x00007fde1c790286 in mod_destroy () at dialog.c:809
No locals.
#10 0x000055d5fad76dd8 in destroy_modules () at core/sr_module.c:842
t = 0x7fde1f328018
foo = 0x7fde1f327838
__func__ = "destroy_modules"
#11 0x000055d5fab440e2 in cleanup (show_status=1) at main.c:575
memlog = 0
__func__ = "cleanup"
#12 0x000055d5fab45d45 in shutdown_children (sig=15, show_status=1) at main.c:718
__func__ = "shutdown_children"
#13 0x000055d5fab49129 in handle_sigs () at main.c:816
chld = 0
chld_status = 139
any_chld_stopped = 1
memlog = 0
__func__ = "handle_sigs"
#14 0x000055d5fab56959 in main_loop () at main.c:1903
i = 14
pid = 3845341
si = 0x0
si_desc = "udp receiver child=13 sock=87.237.87.28:5060\000\000\000\000\300\272Q\224\375\177\000\000\000\000\000\000\000\000\000\000\003\032\370\372\325U\000\000-\000\000\000\000\000\000\000\200\003\062\037\336\177\000\000F\034\067#\336\177\000\000\060\000\000\000\060\000\000\000x\266Q\224\375\177\000\000\220\265Q\224\375\177\000\000\000\230\026︾\314\""
nrprocs = 14
woneinit = 1
__func__ = "main_loop"
#15 0x000055d5fab602e9 in main (argc=15, argv=0x7ffd9451bbb8) at main.c:3061
cfg_stream = 0x55d5fbd3f2e0
c = -1
r = 0
tmp = 0x7ffd9451ce7c ""
tmp_len = 832
port = 832
proto = 832
ahost = 0x0
aport = 0
options = 0x55d5faf6e0b8 ":f:cm:M:dVIhEeb:l:L:n:vKrRDTN:W:w:t:u:g:P:G:SQ:O:a:A:x:X:Y:"
ret = -1
seed = 4110196155
rfd = 4
debug_save = 0
debug_flag = 0
dont_fork_cnt = 0
n_lst = 0x98000000980
p = 0xc2 <error: Cannot access memory at address 0xc2>
st = {st_dev = 23, st_ino = 946, st_nlink = 2, st_mode = 16832, st_uid = 0, st_gid = 998, __pad0 = 0, st_rdev = 0, st_size = 140, st_blksize = 4096, st_blocks = 0, st_atim = {tv_sec = 1696021490, tv_nsec = 675255852},
st_mtim = {tv_sec = 1696418622, tv_nsec = 168794592}, st_ctim = {tv_sec = 1696418622, tv_nsec = 168794592}, __glibc_reserved = {0, 0, 0}}
tbuf = "P\267Q\224\375\177\000\000\310e)#\336\177\000\000\020\204]#\336\177\000\000\000\000\000\000\000\000\000\000зQ\224\375\177\000\000\000\000\000\000\000\000\000\000зQ\224\375\177", '\000' <repeats 18 times>, "`g^#\336\177\000\000\350$a#\336\177\000\000\204i^#\336\177\000\000\060d^#\336\177\000\000H\020a#\336\177\000\000\000`^#\336\177\000\000\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000@\"(#\336\177", '\000' <repeats 66 times>...
option_index = 0
long_options = {{name = 0x55d5faf70526 "help", has_arg = 0, flag = 0x0, val = 104}, {name = 0x55d5faf6b51c "version", has_arg = 0, flag = 0x0, val = 118}, {name = 0x55d5faf7052b "alias", has_arg = 1, flag = 0x0,
val = 1024}, {name = 0x55d5faf70531 "subst", has_arg = 1, flag = 0x0, val = 1025}, {name = 0x55d5faf70537 "substdef", has_arg = 1, flag = 0x0, val = 1026}, {name = 0x55d5faf70540 "substdefs", has_arg = 1, flag = 0x0,
val = 1027}, {name = 0x55d5faf7054a "server-id", has_arg = 1, flag = 0x0, val = 1028}, {name = 0x55d5faf70554 "loadmodule", has_arg = 1, flag = 0x0, val = 1029}, {name = 0x55d5faf7055f "modparam", has_arg = 1,
flag = 0x0, val = 1030}, {name = 0x55d5faf70568 "log-engine", has_arg = 1, flag = 0x0, val = 1031}, {name = 0x55d5faf70573 "debug", has_arg = 1, flag = 0x0, val = 1032}, {name = 0x55d5faf70579 "cfg-print",
has_arg = 0, flag = 0x0, val = 1033}, {name = 0x55d5faf70583 "atexit", has_arg = 1, flag = 0x0, val = 1034}, {name = 0x0, has_arg = 0, flag = 0x0, val = 0}}
__func__ = "main"
(gdb)
```
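For what it is worth, the trace reads like a secondary failure rather than the original fault: frame #12 shows shutdown_children() with sig=15 (SIGTERM) and frame #2 shows sig_alarm_abort() with signo=14 (SIGALRM), so Kamailio was already shutting down. destroy_linkers() then blocked in futex_get() on a dialog-profile lock (v=2 in frame #5 suggests the lock is still held/contended, possibly by a process that died without releasing it), and the shutdown watchdog alarm aborted the process, producing this core. A minimal sketch of that watchdog pattern, with simplified bodies (only the function names are taken from the backtrace):

```
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>

/* Illustrative sketch only, not the kamailio sources: the shutdown
 * watchdog pattern behind frames #2-#6. An alarm is armed before
 * cleanup; if cleanup blocks forever on a lock whose owner is gone,
 * the alarm fires and the handler aborts, which is what produced
 * this core. */

static void sig_alarm_abort(int signo)
{
    (void)signo;              /* 14 == SIGALRM, as in frame #2 */
    abort();                  /* forces the core dump */
}

int main(void)
{
    signal(SIGALRM, sig_alarm_abort);
    alarm(10);                /* watchdog for the cleanup phase */

    /* Stands in for the stuck futex_get() of frame #5: waiting on a
     * lock that is never released. */
    for (;;)
        pause();
}
```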
Could it be related to some type of message that is making Kamailio crash?
Thanks a lot and regards,
David
GitHub issue: https://github.com/kamailio/kamailio/issues/3593
### Description
I have 2 jsonrpc servers configured with different priorities. For testing, I have the servers configured to always delay the response to any request by more than the module's timeout setting.
The (initial) request is sent to the first server. As this one times out, I would expect a retry to go to the second server, but instead all retries are sent to the same server. The backup server is never contacted. This makes the whole "priority" system seem a bit useless.
### Troubleshooting
#### Reproduction
```
modparam("janssonrpcc", "server", "conn=test;addr=pc1;port=8081;priority=5;weight=10")
modparam("janssonrpcc", "server", "conn=test;addr=pc1;port=8082;priority=5;weight=10")
```
```
janssonrpc_request("test", "Test.Timeout", '[ { "Timout": 1000} ]', "route=JSONRPC_RESPONSE;retry=10;timeout=1000");
```
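For reference: retry=10 and timeout=1000 in this call ask for up to 10 retries with a 1000 ms response timeout, and Test.Timeout is a method on my test server that simply sleeps longer than that, so every attempt times out and exercises the retry logic.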
#### Log Messages
No directly useful error logs are produced (see below); I verified the described behavior on the jsonrpc server side.
```
2023-02-23T16:59:34.585346+01:00 pc1 proxy1[340870]: INFO: janssonrpcc [janssonrpc_connect.c:361]: bev_connect(): Connecting to server pc1:8081 for conn rating.
2023-02-23T16:59:34.585420+01:00 pc1 proxy1[340870]: INFO: janssonrpcc [janssonrpc_connect.c:361]: bev_connect(): Connecting to server pc1:8082 for conn rating.
2023-02-23T16:59:34.585446+01:00 pc1 proxy1[340870]: INFO: janssonrpcc [janssonrpc_connect.c:290]: bev_connect_cb(): Connected to host pc1:8081
2023-02-23T16:59:34.585462+01:00 pc1 proxy1[340870]: INFO: janssonrpcc [janssonrpc_connect.c:290]: bev_connect_cb(): Connected to host pc1:8082
2023-02-23T17:05:10.903398+01:00 pc1 proxy1[340870]: WARNING: janssonrpcc [janssonrpc_request.c:247]: schedule_retry(): Number of retries exceeded. Failing request.
```
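Note the timeline: both connections are established at 16:59:34, yet between then and the final warning at 17:05:10 there is no sign of any attempt on the second server; as verified on the jsonrpc server side, all retries during those roughly 5.5 minutes went to the same host before schedule_retry() gave up.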
### Possible Solutions
Retry in combination with a timeout and priorities is a bit tricky: which rule should apply when? Retrying only within the first priority makes the lower-priority servers completely useless, while moving to the next priority on every retry skips possibly useful high-priority servers and may exhaust the list of candidate servers very quickly.
The best solution IMHO would be to first try every server in the highest priority before going to the next priority, without (exponential) backoff between these steps.
If there are still retries remaining after that, wrap around to the highest priority, applying the exponential backoff delay.
With the above, failover considers all servers, and failover between servers is fast without overloading a single server; a sketch of this selection order follows the next paragraph.
BTW, if I configure multiple servers per priority, it seems to randomly select one of them for every (re)try; it never selects one from the next priority.
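A rough sketch of the selection order proposed above (all type and helper names here are hypothetical, not the module's actual API):

```
#include <stddef.h>

/* Hypothetical server list entry; the janssonrpcc module's real
 * structures differ. The list is assumed sorted by prio, best first. */
typedef struct server {
    int prio;            /* lower value = higher priority */
    int weight;
    struct server *next;
} server_t;

/* Pick the server for attempt number `attempt` (0 = initial try):
 * walk every server of the best priority first, then the next ones,
 * and only once the whole list has been tried wrap around and apply
 * the exponential backoff delay. */
server_t *pick_server(server_t *head, int attempt, int *backoff_ms)
{
    int n = 0;
    server_t *s;

    for (s = head; s != NULL; s = s->next)
        n++;
    if (n == 0)
        return NULL;

    int round = attempt / n;   /* completed full passes over the list */
    /* Backoff only when wrapping around (illustration; a real
     * implementation would cap the shift). */
    *backoff_ms = (round > 0) ? (100 << (round - 1)) : 0;

    int idx = attempt % n;     /* position within this pass */
    for (s = head; idx > 0; s = s->next)
        idx--;
    return s;
}
```

Weighted random selection inside one priority level (the weight parameter) is left out for brevity; it would replace the plain positional walk within a priority group.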
### Additional Information
* **Kamailio Version** - output of `kamailio -v`
```
5.6.1
```
GitHub issue: https://github.com/kamailio/kamailio/issues/3378