Hi All, 
 
We have recently encountered a challenging issue in our environment that I wanted to bring to your attention. A few months ago, we performed updates to our Docker and OpenSSL installations, upgrading OpenSSL to version 1.1.1u. Since then, we have been experiencing intermittent traffic blocking (TLS and UDP both for around 10 minutes and getting recovered) on some of our instances.

Prior to the OpenSSL upgrade, we had not observed any such issues. However, the situation has become more complex due to a substantial increase in traffic, with each instance now handling 4K-5K concurrent calls and a daily total of approximately 200,000 calls per instance.

Our initial suspicion was related to a Docker iptable issue. To investigate further, we conducted a GDB trace using "kamctl pstrap" and discovered specific log entries indicating potential issues within the "pthread_rwlock_wrlock" function. This has prompted us to explore whether the problems are linked to the OpenSSL upgrade.

We are considering upgrading Kamailio to the latest version, although this process typically requires a significant amount of time for proper planning and testing. Given the high volume of calls we are currently handling, we are also exploring the possibility of a temporary solution involving a rollback of the OpenSSL version to see if it resolves the issue.

Curiously, during a recent load test using SIPP, which simulated 10,000 concurrent calls with a rate of 100 calls per second, we did not encounter the same problem. 

any hints or insights you may have that could help us pinpoint the root cause of the problem .

We do have a complete GDB trace available, but its size necessitates sharing a truncated version for now. 


version: kamailio 5.4.3 (x86_64/linux) 0178a5

flags: USE_TCP, USE_TLS, USE_SCTP, TLS_HOOKS, USE_RAW_SOCKS, DISABLE_NAGLE, USE_MCAST, DNS_IP_HACK, SHM_MMAP, PKG_MALLOC, Q_MALLOC, F_MALLOC, TLSF_MALLOC, DBG_SR_MEMORY, USE_FUTEX, FAST_LOCK-ADAPTIVE_WAIT, USE_DNS_CACHE, USE_DNS_FAILOVER, USE_NAPTR, USE_DST_BLACKLIST, HAVE_RESOLV_RES

ADAPTIVE_WAIT_LOOPS 1024, MAX_RECV_BUFFER_SIZE 262144, MAX_URI_SIZE 1024, BUF_SIZE 65535, DEFAULT PKG_SIZE 8MB

poll method support: poll, epoll_lt, epoll_et, sigio_rt, select.

id: 0178a5 

compiled on 12:36:56 Oct 22 2023 with gcc 4.8.5


Operating System Host :
LSB Version: :core-4.1-amd64:core-4.1-noarch

Distributor ID: Rocky

Description: Rocky Linux release 8.8 (Green Obsidian)

Release: 8.8

Codename: GreenObsidian


container os (Centos7) 

Kamctl pstraip output (truncated to last few lines)


---start 154 -----------------------------------------------------

kamailio: No such file or directory.

[Thread debugging using libthread_db enabled]

Using host libthread_db library "/lib64/libthread_db.so.1".

0x00007fbf6f38a0c3 in __epoll_wait_nocancel () from /lib64/libc.so.6

$1 = 82

$2 = {pid = 154, unix_sock = 182, idx = 27, status = 1, rank = 48, desc = "tcp receiver (tcp:xxx.xxx.xxx.xx:5060) child=27", '\000' <repeats 81 times>}

#0  0x00007fbf6f38a0c3 in __epoll_wait_nocancel () from /lib64/libc.so.6

No symbol table info available.

#1  0x0000000000684858 in io_wait_loop_epoll (h=0xb41720 <io_w>, t=2, repeat=0) at core/io_wait.h:1039

        n = 84

        r = 0

        fm = 0x2

        revents = 8201656

        __FUNCTION__ = "io_wait_loop_epoll"

#2  0x0000000000699e7a in tcp_receive_loop (unix_sock=186) at core/tcp_read.c:1997

        __FUNCTION__ = "tcp_receive_loop"

#3  0x00000000005279e7 in tcp_init_children () at core/tcp_main.c:5134

        r = 27

        i = -1

        reader_fd_1 = 186

        pid = 0

        si_desc = "tcp receiver (tcp:xxx.xx.xxx.xxx:5060)\000\000\000\355\217c\000\000\000\000\000\240\213{\001\001\000\000\000\017\000\000\000\000\000\000\000\250\a2o\276\177\000\000\200\b\231o\000\000\000\000\260\006\346\261\376\177\000\000\230\210w\000\000\000\000\000=\000\000\000>\000\000\000\270\016z\350\274\177\000\000\060;8o\002\000\000\000P\224\346\352\001\000\000"

        si = 0x0

        __FUNCTION__ = "tcp_init_children"

#4  0x000000000042ae66 in main_loop () at main.c:1771

        i = 16

        pid = 94

        si = 0x0

        si_desc = "udp receiver child=15 sock=xxxx.xxx.xxx.xx:5060\000\000\000\v\235\200", '\000' <repeats 13 times>, "P\r\346\261\376\177\000\000\270%}\000\000\000\000\000T\000\000\000\000\000\000\000\060;8o\277\177\000\000\343\207\201\000\000\000\000\000\277;8o\277\177\000\000 \271A\000\000\000\000\000@\374Ko\276\177\000"

        nrprocs = 16

        woneinit = 1

        __FUNCTION__ = "main_loop"

#5  0x0000000000433ad6 in main (argc=9, argv=0x7ffeb1e60e38) at main.c:2856

        cfg_stream = 0x16c9010

        c = -1

        r = 0

        tmp = 0x7ffeb1e61f0c ""

        tmp_len = 2496

        port = 2496

        proto = 1472

        ahost = 0x0

        aport = 0

        options = 0x7d5238 ":f:cm:M:dVIhEeb:l:L:n:vKrRDTN:W:w:t:u:g:P:G:SQ:O:a:A:x:X:Y:"

        ret = -1

        seed = 2208520674

        rfd = 4

        debug_save = 0

        debug_flag = 0

        dont_fork_cnt = 0

        n_lst = 0x7fbf6f40f3a0 <intel_02_known>

        p = 0xf0b5ff <Address 0xf0b5ff out of bounds>

        st = {st_dev = 76, st_ino = 120506675, st_nlink = 2, st_mode = 16877, st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 40, st_blksize = 4096, st_blocks = 0, st_atim = {tv_sec = 1698340416, tv_nsec = 618631009}, st_mtim = {tv_sec = 1698340416, tv_nsec = 618631009}, st_ctim = {tv_sec = 1698340416, tv_nsec = 618631009}, __unused = {0, 0, 0}}

        tbuf = '\000' <repeats 376 times>...

        option_index = 0

        long_options = {{name = 0x7d742f "help", has_arg = 0, flag = 0x0, val = 104}, {name = 0x7d2a34 "version", has_arg = 0, flag = 0x0, val = 118}, {name = 0x7d7434 "alias", has_arg = 1, flag = 0x0, val = 1024}, {name = 0x7d743a "subst", has_arg = 1, flag = 0x0, val = 1025}, {name = 0x7d7440 "substdef", has_arg = 1, flag = 0x0, val = 1026}, {name = 0x7d7449 "substdefs", has_arg = 1, flag = 0x0, val = 1027}, {name = 0x7d7453 "server-id", has_arg = 1, flag = 0x0, val = 1028}, {name = 0x7d745d "loadmodule", has_arg = 1, flag = 0x0, val = 1029}, {name = 0x7d7468 "modparam", has_arg = 1, flag = 0x0, val = 1030}, {name = 0x7d7471 "log-engine", has_arg = 1, flag = 0x0, val = 1031}, {name = 0x7d747c "debug", has_arg = 1, flag = 0x0, val = 1032}, {name = 0x0, has_arg = 0, flag = 0x0, val = 0}}

        __FUNCTION__ = "main"

[Inferior 1 (process 154) detached]

---end 154 -------------------------------------------------------


---start 155 -----------------------------------------------------

kamailio: No such file or directory.

[Thread debugging using libthread_db enabled]

Using host libthread_db library "/lib64/libthread_db.so.1".

0x00007fbe6e5bd39e in pthread_rwlock_wrlock () from /lib64/libpthread.so.0

$1 = 83

$2 = {pid = 155, unix_sock = -1, idx = -1, status = 1, rank = -4, desc = "tcp main process", '\000' <repeats 111 times>}

#0  0x00007fbe6e5bd39e in pthread_rwlock_wrlock () from /lib64/libpthread.so.0

No symbol table info available.

#1  0x00007fbe68b745c9 in CRYPTO_THREAD_write_lock () from /lib64/libcrypto.so.1.1

No symbol table info available.

#2  0x00007fbe68afe07e in CRYPTO_free_ex_data () from /lib64/libcrypto.so.1.1

No symbol table info available.

#3  0x00007fbe68ea52c2 in SSL_free () from /lib64/libssl.so.1.1

No symbol table info available.

#4  0x00007fbe69149480 in tls_h_tcpconn_clean_f (c=0x7fbceeae00d0) at tls_server.c:655

        extra = 0x7fbceec3b898

        __FUNCTION__ = "tls_h_tcpconn_clean_f"

#5  0x00000000004fbf2d in _tcpconn_free (c=0x7fbceeae00d0) at core/tcp_main.c:1529

        __FUNCTION__ = "_tcpconn_free"

#6  0x0000000000510912 in tcpconn_put_destroy (tcpconn=0x7fbceeae00d0) at core/tcp_main.c:3284

        __FUNCTION__ = "tcpconn_put_destroy"

#7  0x00000000005156a5 in handle_tcp_child (tcp_c=0x7fbe6f813440, fd_i=-1) at core/tcp_main.c:3723

        tcpconn = 0x7fbceeae00d0

        response = {140449434960080, -2}

        cmd = -2

        bytes = 16

        n = 1865551792

        t = 887186228

        crt_timeout = 410

        con_lifetime = 57680

        __FUNCTION__ = "handle_tcp_child"

#8  0x00000000005208ed in handle_io (fm=0x7fbe6f9d7c78, ev=1, idx=-1) at core/tcp_main.c:4542

        ret = 1

        __FUNCTION__ = "handle_io"

#9  0x00000000004e93ed in io_wait_loop_epoll (h=0xad5740 <io_h>, t=5, repeat=0) at core/io_wait.h:1070

        n = 1

        r = 0

        fm = 0x7fbe6f9d7c78

        revents = 1

        __FUNCTION__ = "io_wait_loop_epoll"

#10 0x0000000000523c99 in tcp_main_loop () at core/tcp_main.c:4827

        si = 0x0

        r = 28

        __FUNCTION__ = "tcp_main_loop"

#11 0x000000000042b1be in main_loop () at main.c:1779

        i = 16

        pid = 0

        si = 0x0

        si_desc = "udp receiver child=15 sock=xxx.xx.xxx.xxx:5060\000\000\000\v\235\200", '\000' <repeats 13 times>, "P\r\346\261\376\177\000\000\270%}\000\000\000\000\000T\000\000\000\000\000\000\000\060;8o\277\177\000\000\343\207\201\000\000\000\000\000\277;8o\277\177\000\000 \271A\000\000\000\000\000@\374Ko\276\177\000"

        nrprocs = 16

        woneinit = 1

        __FUNCTION__ = "main_loop"

#12 0x0000000000433ad6 in main (argc=9, argv=0x7ffeb1e60e38) at main.c:2856

        cfg_stream = 0x16c9010

        c = -1

        r = 0

        tmp = 0x7ffeb1e61f0c ""

        tmp_len = 2496

        port = 2496

        proto = 1472

        ahost = 0x0

        aport = 0

        options = 0x7d5238 ":f:cm:M:dVIhEeb:l:L:n:vKrRDTN:W:w:t:u:g:P:G:SQ:O:a:A:x:X:Y:"

        ret = -1

        seed = 2208520674

        rfd = 4

        debug_save = 0

        debug_flag = 0

        dont_fork_cnt = 0

        n_lst = 0x7fbf6f40f3a0 <intel_02_known>

        p = 0xf0b5ff <Address 0xf0b5ff out of bounds>

        st = {st_dev = 76, st_ino = 120506675, st_nlink = 2, st_mode = 16877, st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 40, st_blksize = 4096, st_blocks = 0, st_atim = {tv_sec = 1698340416, tv_nsec = 618631009}, st_mtim = {tv_sec = 1698340416, tv_nsec = 618631009}, st_ctim = {tv_sec = 1698340416, tv_nsec = 618631009}, __unused = {0, 0, 0}}

        tbuf = '\000' <repeats 376 times>...

        option_index = 0

        long_options = {{name = 0x7d742f "help", has_arg = 0, flag = 0x0, val = 104}, {name = 0x7d2a34 "version", has_arg = 0, flag = 0x0, val = 118}, {name = 0x7d7434 "alias", has_arg = 1, flag = 0x0, val = 1024}, {name = 0x7d743a "subst", has_arg = 1, flag = 0x0, val = 1025}, {name = 0x7d7440 "substdef", has_arg = 1, flag = 0x0, val = 1026}, {name = 0x7d7449 "substdefs", has_arg = 1, flag = 0x0, val = 1027}, {name = 0x7d7453 "server-id", has_arg = 1, flag = 0x0, val = 1028}, {name = 0x7d745d "loadmodule", has_arg = 1, flag = 0x0, val = 1029}, {name = 0x7d7468 "modparam", has_arg = 1, flag = 0x0, val = 1030}, {name = 0x7d7471 "log-engine", has_arg = 1, flag = 0x0, val = 1031}, {name = 0x7d747c "debug", has_arg = 1, flag = 0x0, val = 1032}, {name = 0x0, has_arg = 0, flag = 0x0, val = 0}}

        __FUNCTION__ = "main"

[Inferior 1 (process 155) detached]

---end 155 -------------------------------------------------------





Thanks & Regards 
Pintu Lohar