### Description
during tls relaying, do tls.reload repeatedly, tcp receiver task will crash ### Troubleshooting
#### Reproduction (1)write a simple request_route logic in kamailio.cfg, as follows: request_route { add_local_rport(); $du = "sip:192.168.131.190:1001;transport=tls"; forward(); } (2)start sipp caller to kamailio tls:0.0.0.0:5061 and kamailio will relay this request to callee (3)at the same time, in another console, run the following cmds: while true; do /opt/kamailio/sbin/kamcmd tls.reload && sleep 0.1&& echo `date`; done;
#### Debugging Data ``` (gdb) bt full #0 0x00007f69569e81f7 in raise () from /lib64/libc.so.6 No symbol table info available. #1 0x00007f69569e98e8 in abort () from /lib64/libc.so.6 No symbol table info available. #2 0x00000000006d59cf in tlsf_free (tlsf=0x7f6920869000, ptr=0x7f6927ca74b8, file=0x7f694ccb73e4 "tls: tls_init.c", function=0x7f694ccb8b7e <__FUNCTION__.23358> "ser_free", line=323, mname=0x7f694ccb73e0 "tls") at core/mem/tlsf_malloc.c:1031 control = 0x7f6920869000 block = 0x7f6927ca7488 __FUNCTION__ = "tlsf_free" #3 0x00000000006da1ee in tlsf_shm_free (tlsfmp=0x7f6920869000, p=0x7f6927ca74b8, file=0x7f694ccb73e4 "tls: tls_init.c", func=0x7f694ccb8b7e <__FUNCTION__.23358> "ser_free", line=323, mname=0x7f694ccb73e0 "tls") at core/mem/tlsf_malloc.c:1544 No locals. #4 0x00007f694cc68cea in ser_free (ptr=0x7f6927ca74b8, fname=0x7f694c6fba75 "crypto/err/err.c", fline=450) at tls_init.c:323 __FUNCTION__ = "ser_free" #5 0x00007f694c617de3 in CRYPTO_free (str=0x7f6927ca74b8, file=0x7f694c6fba75 "crypto/err/err.c", line=450) at crypto/mem.c:299 No locals. #6 0x00007f694c5e656a in ERR_clear_error () at crypto/err/err.c:450 i = 1 es = 0x7f69208d17e8 #7 0x00007f694c9f8fbc in state_machine (s=0x7f6927ca4e58, server=1) at ssl/statem/statem.c:311 buf = 0x0 cb = 0x0 st = 0x7f6927ca4ea0 ret = -1 ssret = 57 #8 0x00007f694c9f8f17 in ossl_statem_accept (s=0x7f6927ca4e58) at ssl/statem/statem.c:255 No locals. ---Type <return> to continue, or q <return> to quit--- #9 0x00007f694c9dfd81 in SSL_do_handshake (s=0x7f6927ca4e58) at ssl/ssl_lib.c:3661 ret = 1 #10 0x00007f694c9dbfb9 in SSL_accept (s=0x7f6927ca4e58) at ssl/ssl_lib.c:1652 No locals. #11 0x00007f694cc9048e in tls_accept (c=0x7f6927c9f2a8, error=0x7ffe3a31e894) at tls_server.c:468 ret = 1 ssl = 0x7f6927ca4e58 cert = 0x0 tls_c = 0x7f6927c9ed50 tls_log = 32617 __FUNCTION__ = "tls_accept" pkey = 0x0 #12 0x00007f694cc9ab97 in tls_h_read_f (c=0x7f6927c9f2a8, flags=0x7ffe3a31ece0) at tls_server.c:1173 r = 0x7f6927c9f3d0 bytes_free = 16383 bytes_read = 194 read_size = 16383 ssl_error = 0 ssl_read = 0 ssl = 0x7f6927ca4e58 rd_buf = "\026\003\001\000\275\001\000\000\271\003\003\357$\303\342a\362C\v\220\n\031Fs=E\267m\215\260\031\341\376\027A\213Ѱ)\\251\036\336\000\000\070\300,\300\060\000\237̨̩̪\300+\300/\000\236\300$\300(\000k\300#\300'\000g\300\n\300\024\000\071\300\t\300\023\000\063\000\235\000\234\000=\000<\000\065\000/\000\377\001\000\000X\000\v\000\004\003\000\001\002\000\n\000\f\000\n\000\035\000\027\000\036\000\031\000\030\000#\000\000\000\026\000\000\000\027\000\000\000\r\000\060\000.\004\003\005\003\006\003\b\a\b\b\b\t\b\n\b\v\b\004\b\005\b\006\004\001\005\001\006\001\003\003\002\003\003\001\002\001\003\002\002\002\004\002\005\002\006\002\232\203\002?\267,\241/\\331"... wr_buf = "\026\003\003\000\272\004\000\000\266\000\000\034 \000\260\201\227\230o\346/%W\246\306o&\210j\017@\036\352\064\240---Type <return> to continue, or q <return> to quit--- 346\351(q\233\374\321\332D1|\rI\235\b`\022v^Xk\265\f\270a\243\260\373\311\t<o\v\324\245\205@\344\vz\016\005Vbud\254f\n\326!\306\nm>r]\271e\351\345w\214\200\037\t\265\307\064)\033.\273{1\351\326W,\262\264\267\241\237M\254\276\372\026\227j8\344#Aw\r\025ULM\021\245\222:\004a\006zm?\374\243\224\275\300\310AG\347\305\376\257\303\035I\215\064\035\215\021\300\210o\036A\373\037\231l\242_\324\017\312^\024\r\312\340yq\375?\317 \024\003\003\000\001\001\026\003\003\000"... rd = {buf = 0x7ffe3a30e890 "\026\003\001", pos = 0, used = 194, size = 65536} wr = {buf = 0x7ffe3a2fe890 "\026\003\003", pos = 0, used = 0, size = 65536} tls_c = 0x7f6927c9ed50 enc_rd_buf = 0x0 n = 0 flush_flags = 1 err_src = 0x7f694ccc1878 "TLS read:" ip_buf = "192.168.131.190", '\000' <repeats 48 times> x = 0 tls_dbg = 0 __FUNCTION__ = "tls_h_read_f" #13 0x00000000006b6509 in tcp_read_headers (c=0x7f6927c9f2a8, read_flags=0x7ffe3a31ece0) at core/tcp_read.c:441 bytes = 65535 remaining = 0 p = 0x7f6925192572 ": 100587 sip:100587@Sipproxy.gs:5062;tag=588\r\nTo: 100 sip:100@sipproxy.gs:5061;transport=TLS;tag=21794\r\nCall-ID: 588-49839@192.168.131.190\r\nCSeq: 1 INVITE\r\nServer: Grandstream UCM6301V1.2C 1.0.13."... r = 0x7f6927c9f3d0 mc = 622404320 body_len = 0 mfline = 0x0 mtransid = {s = 0x7f6925191f40 "\030\002", len = 622403416} __FUNCTION__ = "tcp_read_headers" #14 0x00000000006bdd81 in tcp_read_req (con=0x7f6927c9f2a8, bytes_read=0x7ffe3a31ece4, read_flags=0x7ffe3a31ece0) ---Type <return> to continue, or q <return> to quit--- at core/tcp_read.c:1469 bytes = -1 total_bytes = 0 resp = 1 size = 140729874770784 req = 0x7f6927c9f3d0 dst = {send_sock = 0x400000010, to = {s = {sa_family = 0, sa_data = "\300+\002\000\000\000\001\000\061:\006\000\000"}, sin = {sin_family = 0, sin_port = 11200, sin_addr = {s_addr = 2}, sin_zero = "\001\000\061:\006\000\000"}, sin6 = { sin6_family = 0, sin6_port = 11200, sin6_flowinfo = 2, sin6_addr = {__in6_u = { __u6_addr8 = "\001\000\061:\006\000\000\000\000F\265\000\000\000\000", __u6_addr16 = {1, 14897, 6, 0, 17920, 181, 0, 0}, __u6_addr32 = {976289793, 6, 11879936, 0}}}, sin6_scope_id = 8233784}, sas = {ss_family = 0, __ss_padding = "\300+\002\000\000\000\001\000\061:\006\000\000\000\000F\265\000\000\000\000\000\070\243}\000\004\274{ \020\000\000\000\020\000\000\000 \354\061:\371\273{ \002\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\001\000\000\000\001\000\000\000\a\000\000\000\000\000\000\000\000\000\300+\000\000\000\000\330\354\061:\376\177\000\000\b", '\000' <repeats 22 times>, __ss_align = 140729874770800}}, id = 1, send_flags = {f = 0, blst_imask = 0}, proto = 80 'P', proto_pad0 = -21 '\353', proto_pad1 = 14897} c = 58 ':' ret = 0 __FUNCTION__ = "tcp_read_req" #15 0x00000000006c29e7 in handle_io (fm=0x7f6952f58a58, events=1, idx=-1) at core/tcp_read.c:1780 ret = 8 n = 8 read_flags = RD_CONN_SHORT_READ con = 0x7f6927c9f2a8 s = 7 resp = 1 t = 0 ---Type <return> to continue, or q <return> to quit--- ee = 0x0 __FUNCTION__ = "handle_io" #16 0x00000000006b1729 in io_wait_loop_epoll (h=0xb54600 <io_w>, t=2, repeat=0) at core/io_wait.h:1070 n = 1 r = 0 fm = 0x7f6952f58a58 revents = 1 __FUNCTION__ = "io_wait_loop_epoll" #17 0x00000000006c5519 in tcp_receive_loop (unix_sock=16) at core/tcp_read.c:1976 __FUNCTION__ = "tcp_receive_loop" #18 0x0000000000535888 in tcp_init_children (woneinit=0x7ffe3a31f09c) at core/tcp_main.c:5227 r = 1 i = 5 reader_fd_1 = 16 pid = 0 si_desc = "tcp receiver (generic)\000\000\000\000\000\000\000\000\000\000@\304A\000\000\000\000\000SI\203\000\000\000\000\000\070\243}\000\000\000\000\000\000\000\300+\000\000\000\000\060\360\061:\376\177\000\000\347SW\000\000\000\000\000\060\360\061:\376\177\000\000\n\363e\000\000\000\000\000\340\357\061:\376\177\000\000)\310w\000\000\000\000\000\t\000\000\000\n\000\000\000\000\263\206 i\177\000" si = 0x0 __FUNCTION__ = "tcp_init_children" #19 0x000000000042d562 in main_loop () at main.c:1849 i = 0 pid = 37536 si = 0x0 si_desc = "\v\000\000\000z\000\000\000\004\000\000\000j\001\000\000\000\000\000\000\376\177\000\000\200p\000\000\000\000\000\000p\025\252\002\000\000\000\000 \261\206 i\177\000\000\240\243^Ni\177\000\000\000\000\000\000\000\000\000\000`\366\061:\376\177\00---Type <return> to continue, or q <return> to quit--- 0\000\230Y^Ni\177\000\000SI\203\000\000\000\000\000\070\243}\000\000\000\000\000\250\225^Ni\177\000\000\217U\252Vi\177\000\000\000\362\061:\376\177\000\000\060\000\000\000\060\000\000" nrprocs = 5143176 woneinit = 1 __FUNCTION__ = "main_loop" #20 0x0000000000436b1e in main (argc=7, argv=0x7ffe3a31f748) at main.c:3078 cfg_stream = 0x2a130a0 c = -1 r = 0 tmp = 0x7ffe3a32157d "" tmp_len = 0 port = 0 proto = 0 ahost = 0x0 aport = 0 options = 0x7dd4e0 ":f:cm:M:dVIhEeb:l:L:n:vKrRDTN:W:w:t:u:g:P:G:SQ:O:a:A:x:X:Y:" ret = -1 seed = 2220745850 rfd = 4 debug_save = 0 debug_flag = 0 dont_fork_cnt = 0 n_lst = 0x7ffe3a31f600 p = 0x0 st = {st_dev = 18, st_ino = 104172, st_nlink = 2, st_mode = 16832, st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 40, st_blksize = 4096, st_blocks = 0, st_atim = {tv_sec = 1672026025, tv_nsec = 194902649}, st_mtim = { tv_sec = 1672281302, tv_nsec = 856611068}, st_ctim = {tv_sec = 1672281302, tv_nsec = 856611068}, __unused = {0, 0, 0}} ---Type <return> to continue, or q <return> to quit--- tbuf = "\216\377w\001", '\000' <repeats 12 times>, "\250\061\234Vi\177\000\000\000\360\212Wi\177", '\000' <repeats 90 times>, "\200"\254\000\000\000\000\000\300\310A\000\000\000\000\000@\367\061:\376\177", '\000' <repeats 26 times>, "\336\310jWi\177\000\000\001", '\000' <repeats 23 times>... option_index = 0 long_options = {{name = 0x7df90f "help", has_arg = 0, flag = 0x0, val = 104}, {name = 0x7da7b4 "version", has_arg = 0, flag = 0x0, val = 118}, {name = 0x7df914 "alias", has_arg = 1, flag = 0x0, val = 1024}, {name = 0x7df91a "subst", has_arg = 1, flag = 0x0, val = 1025}, {name = 0x7df920 "substdef", has_arg = 1, flag = 0x0, val = 1026}, { name = 0x7df929 "substdefs", has_arg = 1, flag = 0x0, val = 1027}, {name = 0x7df933 "server-id", has_arg = 1, flag = 0x0, val = 1028}, {name = 0x7df93d "loadmodule", has_arg = 1, flag = 0x0, val = 1029}, { name = 0x7df948 "modparam", has_arg = 1, flag = 0x0, val = 1030}, {name = 0x7df951 "log-engine", has_arg = 1, flag = 0x0, val = 1031}, {name = 0x7df95c "debug", has_arg = 1, flag = 0x0, val = 1032}, {name = 0x7df962 "cfg-print", has_arg = 0, flag = 0x0, val = 1033}, {name = 0x7df96c "atexit", has_arg = 1, flag = 0x0, val = 1034}, {name = 0x0, has_arg = 0, flag = 0x0, val = 0}} __FUNCTION__ = "main" (gdb) (gdb) info locals cfg_stream = 0x2a130a0 c = -1 r = 0 tmp = 0x7ffe3a32157d "" tmp_len = 0 port = 0 proto = 0 ahost = 0x0 aport = 0 options = 0x7dd4e0 ":f:cm:M:dVIhEeb:l:L:n:vKrRDTN:W:w:t:u:g:P:G:SQ:O:a:A:x:X:Y:" ret = -1 seed = 2220745850 rfd = 4 debug_save = 0 debug_flag = 0 dont_fork_cnt = 0 n_lst = 0x7ffe3a31f600 p = 0x0 st = {st_dev = 18, st_ino = 104172, st_nlink = 2, st_mode = 16832, st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 40, st_blksize = 4096, st_blocks = 0, st_atim = {tv_sec = 1672026025, tv_nsec = 194902649}, st_mtim = {tv_sec = 1672281302, tv_nsec = 856611068}, st_ctim = {tv_sec = 1672281302, tv_nsec = 856611068}, __unused = {0, 0, 0}} tbuf = "\216\377w\001", '\000' <repeats 12 times>, "\250\061\234Vi\177\000\000\000\360\212Wi\177", '\000' <repeats 90 times>, "\200"\254\000\000\000\000\000\300\310A\000\000\000\000\000@\367\061:\376\177", '\000' <repeats 26 times>, "\336\310jWi\177\000\000\001", '\000' <repeats 23 times>... option_index = 0 long_options = {{name = 0x7df90f "help", has_arg = 0, flag = 0x0, val = 104}, {name = 0x7da7b4 "version", has_arg = 0, flag = 0x0, val = 118}, {name = 0x7df914 "alias", has_arg = 1, flag = 0x0, val = 1024}, {name = 0x7df91a "subst", has_arg = 1, flag = 0x0, ---Type <return> to continue, or q <return> to quit--- val = 1025}, {name = 0x7df920 "substdef", has_arg = 1, flag = 0x0, val = 1026}, {name = 0x7df929 "substdefs", has_arg = 1, flag = 0x0, val = 1027}, {name = 0x7df933 "server-id", has_arg = 1, flag = 0x0, val = 1028}, {name = 0x7df93d "loadmodule", has_arg = 1, flag = 0x0, val = 1029}, {name = 0x7df948 "modparam", has_arg = 1, flag = 0x0, val = 1030}, { name = 0x7df951 "log-engine", has_arg = 1, flag = 0x0, val = 1031}, {name = 0x7df95c "debug", has_arg = 1, flag = 0x0, val = 1032}, {name = 0x7df962 "cfg-print", has_arg = 0, flag = 0x0, val = 1033}, {name = 0x7df96c "atexit", has_arg = 1, flag = 0x0, val = 1034}, {name = 0x0, has_arg = 0, flag = 0x0, val = 0}} __FUNCTION__ = "main" (gdb) list 1978 int main(int argc, char** argv) 1979 { 1980 1981 FILE* cfg_stream; 1982 int c,r; 1983 char *tmp; 1984 int tmp_len; 1985 int port; 1986 int proto; 1987 char *ahost = NULL; ```
#### Log Messages
#### SIP Traffic
### Possible Solutions
### Additional Information
* **Kamailio Version** - output of `kamailio -v`
``` version: kamailio 5.6.2 (x86_64/linux) flags: USE_TCP, USE_TLS, USE_SCTP, TLS_HOOKS, USE_RAW_SOCKS, DISABLE_NAGLE, USE_MCAST, DNS_IP_HACK, SHM_MMAP, PKG_MALLOC, Q_MALLOC, F_MALLOC, TLSF_MALLOC, DBG_SR_MEMORY, USE_FUTEX, FAST_LOCK-ADAPTIVE_WAIT, USE_DNS_CACHE, USE_DNS_FAILOVER, USE_NAPTR, USE_DST_BLOCKLIST, HAVE_RESOLV_RES ADAPTIVE_WAIT_LOOPS 1024, MAX_RECV_BUFFER_SIZE 262144, MAX_URI_SIZE 1024, BUF_SIZE 65535, DEFAULT PKG_SIZE 8MB poll method support: poll, epoll_lt, epoll_et, sigio_rt, select. id: unknown compiled on 10:33:27 Dec 29 2022 with gcc 4.9.4 ```
* **Operating System**:
<!-- Details about the operating system, the type: Linux (e.g.,: Debian 8.4, Ubuntu 16.04, CentOS 7.1, ...), MacOS, xBSD, Solaris, ...; Kernel details (output of `lsb_release -a` and `uname -a`) -->
``` Linux localhost.localdomain 3.10.0-327.el7.x86_64 #1 SMP Thu Nov 19 22:10:57 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux ```
As mentioned in #3305, you should only do a tls reload when the you change the certificate. Usually this happes not often, like once a month or so.
1.Although it was a low-frequency operation, But we found a serious problem with the code that caused this problem using the openSSL API.
2.reason: The main process first initializes the TLS module, causing the OpenSSL Error queue to initialize, followed by the fork process, the child process does not initialize in the error queue (multiple processes share the error memory), and the OpenSSL API of multiple child processes may have a double free when the error queue is free
main.cp for init TLS #ifdef USE_TCP #ifdef USE_TLS if (!tls_disable){ if (!tls_loaded()){ LM_WARN("tls support enabled, but no tls engine " " available (forgot to load the tls module?)\n"); LM_WARN("disabling tls...\n"); tls_disable=1; } else { if (pre_init_tls()<0){ LM_CRIT("could not pre-initialize tls, exiting...\n"); goto error; } } } #endif /* USE_TLS */ #endif /* USE_TCP */
--------->openssl err.c this state , multiple child processes ,share the error memory,and may have a double free when the error queue is free ERR_STATE *ERR_get_state(void) { ERR_STATE *state; int saveerrno = get_last_sys_error();
if (!OPENSSL_init_crypto(OPENSSL_INIT_BASE_ONLY, NULL)) return NULL;
if (!RUN_ONCE(&err_init, err_do_init)) return NULL;
state = CRYPTO_THREAD_get_local(&err_thread_local); if (state == (ERR_STATE*)-1) return NULL;
if (state == NULL) { if (!CRYPTO_THREAD_set_local(&err_thread_local, (ERR_STATE*)-1)) return NULL;
if ((state = OPENSSL_zalloc(sizeof(*state))) == NULL) { CRYPTO_THREAD_set_local(&err_thread_local, NULL); return NULL; }
if (!ossl_init_thread_start(OPENSSL_INIT_THREAD_ERR_STATE) || !CRYPTO_THREAD_set_local(&err_thread_local, state)) { ERR_STATE_free(state); CRYPTO_THREAD_set_local(&err_thread_local, NULL); return NULL; }
/* Ignore failures from these */ OPENSSL_init_crypto(OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL); }
set_sys_error(saveerrno); return state; }
This kind of issues are unfortunately known from OpenSSL, it does not work well in multi-process environments since some time anymore. We have done several changes to the tls module and also added the tlsa static version as well. In the future the tls_wolfssl module will also provide hopefully a better alternative.
Regarding the issue about sharing the error handler, thanks for the additional information. If you have a potential code fix, just submit a pull request for review and possible applying to the code base. It might be possible to initialize the error handler also in the individual child processes as a bug fix.
I cannot quickly reproduce it, what is your libssl version? As I could guess from the issue description, you run on RHEL 7 with kernel 3.10, so it can be pretty old libssl there.
Also, if you think of some changes that could fix it, just post the patch or make a pull request to analyze and merge if all ok.
Moreover, maybe you can test with master branch and setting `rpc_exec_delta=5` global parameter in kamailio.cfg to see if the behaviour changes in any way. This adds a limit of one tls reload per 5 secs.
[sipp caller using while loop]----------INVITE---------->[kamailio]-----------INVITE------------>[sipp callee using while loop] 10cps at the same time while true; do kill -9 `pidof sipp` && sleep 5; done; and at the same time, while true; do kamcmd tls.reload && sleep 1;done;
request_route { xlog("L_INFO", "req[$rm:$ci:$cs][$pr]"); # message pre-processing add_local_rport(); $du = "sip:192.168.131.190:1001;transport=tls"; forward(); }
while true; do ./caller_only_INVITE_test.sh 10000 10;done; while true; do ./callee_test.sh 1001;done; while true; do /opt/kamailio/sbin/kamcmd tls.reload && echo `date`; done;
caller_only_INVITE_test.sh ----------------------------------------------------------------------------------- ulimit -n 102400 ./sipp -inf csv_user.csv -s 100 -t ln -sf caller_only_INVITE.xml -m $1 -r $2 -rp 1s -mp 50000 -max_socket 10000 -i 192.168.131.190 192.168.131.190:5061 -trace_err -default_behaviors all -watchdog_minor_threshold 10000000 -watchdog_major_threshold 10000000 -watchdog_major_maxtriggers 10000000 -watchdog_minor_maxtriggers 10000000 -l 10000 -recv_timeout 32000 -send_timeout 32000 -T2 4000 -max_retrans 10 -tls_cert ca.crt -tls_key ca.key ---------------------------------------------------------------------------------- callee_test.sh ---------------------------------------------------------------------------------- ulimit -n 102400 ./sipp -t ln -s 11111111 -sf callee.xml -i 192.168.131.190 -p $1 -max_socket 102400 -recv_timeout 32000 -send_timeout 32000 -T2 6000 -max_retrans 10 -tls_cert ca.crt -tls_key ca.key -aa ----------------------------------------------------------------------------------
if needed, i can send the whole set of test scripts by mail
As I said, reproducing might not happen because of different test environments. Describe you system for testing (operating system, libssl version, ...).
Running this under `valgrind` will provide precise information on where memory is allocated, freed, and double-freed. As alternative compiling the software under "address sanitizer" should reveal the same ingotmation.
Unfortunately `valgrind` cannot handle properly the use of shard memory managed by kamailio.
Can you try with git master branch and setting tls moduparam `lock_mode` to `1`?
Adding a note that the `lock_mode` parameter was renamed to `init_mode`, as I plan to use it for other purposes.
@JiangHai2011 any update from your side regarding the lastest comments about reproducing it on git master?
This issue is stale because it has been open 6 weeks with no activity. Remove stale label or comment or this will be closed in 2 weeks.
Closed #3319 as not planned.