Hi,
On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc.
A backtrace reveals the following: #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){
The lines around line 176 (which look similar to trunk, except p and p1 are str today) 159 for(i = 0; i < RES_ROW_N(res); i++) { 160 row = RES_ROWS(res) + i; 161 162 /* received */ 163 p = (char*)VAL_STRING(ROW_VALUES(row)); 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { 165 /* contact */ 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { 168 LM_ERR("empty contact -> skipping\n"); 169 continue; 170 } 171 } 172 p_len = strlen(p); 173 174 /* path */ 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ 177 p1 = NULL; 178 p1_len = 0; 179 } else { 180 p1_len = strlen(p1); 181 }
The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. Is it a timing issue between REGISTER and nathelper ping?
I have a trace and the core.
Regards, Kristian Høgh Uni-tel
bt full #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 sock = 0x0 dbflags = 11088 host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} i = <value optimized out> row = 0xa13488 dom = 0x2b5062723278 p1 = 0x2 <Address 0x2 out of bounds> now_s = "'2015-01-13 12:58:51'\177\000\000)" port = 11088 p_len = 73 cp = <value optimized out> needed = 16 res = 0xa2ed38 p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> now_len = 21 proto = 1642847400 p1_len = 0 shortage = 0 query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 No locals. #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 rval = <value optimized out> buf = 0xadd270 cp = <value optimized out> c = {s = 0x0, len = 0} opt = {s = 0x0, len = 0} path = {s = 0x0, len = 0} curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} he = <value optimized out> send_sock = <value optimized out> flags = <value optimized out> proto = <value optimized out> dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} iteration = 1 __FUNCTION__ = "nh_timer" #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 pid = <value optimized out> #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 i = 58055190 #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 No locals. #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 No locals. #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 No locals.
Hello,
the operation done by that code is practically a sql query, fetching the result and packing it for nathelper usage. It is no race between it and handling any sip packet, no shared memory accessed.
Do you get only one corefile?
From the backtrace it looks like row pointer is not ok, can you get the
output for:
p *row
Cheers, Daniel
On 13/01/15 15:08, Kristian F. Høgh wrote:
Hi,
On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc.
A backtrace reveals the following: #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){
The lines around line 176 (which look similar to trunk, except p and p1 are str today) 159 for(i = 0; i < RES_ROW_N(res); i++) { 160 row = RES_ROWS(res) + i; 161 162 /* received */ 163 p = (char*)VAL_STRING(ROW_VALUES(row)); 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { 165 /* contact */ 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { 168 LM_ERR("empty contact -> skipping\n"); 169 continue; 170 } 171 } 172 p_len = strlen(p); 173 174 /* path */ 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ 177 p1 = NULL; 178 p1_len = 0; 179 } else { 180 p1_len = strlen(p1); 181 }
The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. Is it a timing issue between REGISTER and nathelper ping?
I have a trace and the core.
Regards, Kristian Høgh Uni-tel
bt full #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 sock = 0x0 dbflags = 11088 host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} i = <value optimized out> row = 0xa13488 dom = 0x2b5062723278 p1 = 0x2 <Address 0x2 out of bounds> now_s = "'2015-01-13 12:58:51'\177\000\000)" port = 11088 p_len = 73 cp = <value optimized out> needed = 16 res = 0xa2ed38 p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> now_len = 21 proto = 1642847400 p1_len = 0 shortage = 0 query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 No locals. #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 rval = <value optimized out> buf = 0xadd270 cp = <value optimized out> c = {s = 0x0, len = 0} opt = {s = 0x0, len = 0} path = {s = 0x0, len = 0} curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} he = <value optimized out> send_sock = <value optimized out> flags = <value optimized out> proto = <value optimized out> dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} iteration = 1 __FUNCTION__ = "nh_timer" #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 pid = <value optimized out> #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 i = 58055190 #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 No locals. #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 No locals. #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 No locals.
SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list sr-users@lists.sip-router.org http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users
Hi Daniel,
(gdb) p *row $1 = {values = 0x9bd2b8, n = 13}
Regards, Kristian.
On Wednesday 14 January 2015 10:01:41 Daniel-Constantin Mierla wrote:
Hello,
the operation done by that code is practically a sql query, fetching the result and packing it for nathelper usage. It is no race between it and handling any sip packet, no shared memory accessed.
Do you get only one corefile?
From the backtrace it looks like row pointer is not ok, can you get the output for:
p *row
Cheers, Daniel
On 13/01/15 15:08, Kristian F. Høgh wrote:
Hi,
On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc.
A backtrace reveals the following: #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){
The lines around line 176 (which look similar to trunk, except p and p1 are str today) 159 for(i = 0; i < RES_ROW_N(res); i++) { 160 row = RES_ROWS(res) + i; 161 162 /* received */ 163 p = (char*)VAL_STRING(ROW_VALUES(row)); 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { 165 /* contact */ 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { 168 LM_ERR("empty contact -> skipping\n"); 169 continue; 170 } 171 } 172 p_len = strlen(p); 173 174 /* path */ 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ 177 p1 = NULL; 178 p1_len = 0; 179 } else { 180 p1_len = strlen(p1); 181 }
The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. Is it a timing issue between REGISTER and nathelper ping?
I have a trace and the core.
Regards, Kristian Høgh Uni-tel
bt full #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 sock = 0x0 dbflags = 11088 host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} i = <value optimized out> row = 0xa13488 dom = 0x2b5062723278 p1 = 0x2 <Address 0x2 out of bounds> now_s = "'2015-01-13 12:58:51'\177\000\000)" port = 11088 p_len = 73 cp = <value optimized out> needed = 16 res = 0xa2ed38 p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> now_len = 21 proto = 1642847400 p1_len = 0 shortage = 0 query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 No locals. #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 rval = <value optimized out> buf = 0xadd270 cp = <value optimized out> c = {s = 0x0, len = 0} opt = {s = 0x0, len = 0} path = {s = 0x0, len = 0} curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} he = <value optimized out> send_sock = <value optimized out> flags = <value optimized out> proto = <value optimized out> dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} iteration = 1 __FUNCTION__ = "nh_timer" #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 pid = <value optimized out> #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 i = 58055190 #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 No locals. #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 No locals. #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 No locals.
SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list sr-users@lists.sip-router.org http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users
Hello,
get also:
p *row->values p i
Cheers, Daniel
On 14/01/15 10:50, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row $1 = {values = 0x9bd2b8, n = 13}
Regards, Kristian.
On Wednesday 14 January 2015 10:01:41 Daniel-Constantin Mierla wrote:
Hello,
the operation done by that code is practically a sql query, fetching the result and packing it for nathelper usage. It is no race between it and handling any sip packet, no shared memory accessed.
Do you get only one corefile?
From the backtrace it looks like row pointer is not ok, can you get the output for:
p *row
Cheers, Daniel
On 13/01/15 15:08, Kristian F. Høgh wrote:
Hi,
On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc.
A backtrace reveals the following: #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){
The lines around line 176 (which look similar to trunk, except p and p1 are str today) 159 for(i = 0; i < RES_ROW_N(res); i++) { 160 row = RES_ROWS(res) + i; 161 162 /* received */ 163 p = (char*)VAL_STRING(ROW_VALUES(row)); 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { 165 /* contact */ 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { 168 LM_ERR("empty contact -> skipping\n"); 169 continue; 170 } 171 } 172 p_len = strlen(p); 173 174 /* path */ 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ 177 p1 = NULL; 178 p1_len = 0; 179 } else { 180 p1_len = strlen(p1); 181 }
The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. Is it a timing issue between REGISTER and nathelper ping?
I have a trace and the core.
Regards, Kristian Høgh Uni-tel
bt full #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 sock = 0x0 dbflags = 11088 host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} i = <value optimized out> row = 0xa13488 dom = 0x2b5062723278 p1 = 0x2 <Address 0x2 out of bounds> now_s = "'2015-01-13 12:58:51'\177\000\000)" port = 11088 p_len = 73 cp = <value optimized out> needed = 16 res = 0xa2ed38 p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> now_len = 21 proto = 1642847400 p1_len = 0 shortage = 0 query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 No locals. #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 rval = <value optimized out> buf = 0xadd270 cp = <value optimized out> c = {s = 0x0, len = 0} opt = {s = 0x0, len = 0} path = {s = 0x0, len = 0} curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} he = <value optimized out> send_sock = <value optimized out> flags = <value optimized out> proto = <value optimized out> dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} iteration = 1 __FUNCTION__ = "nh_timer" #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 pid = <value optimized out> #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 i = 58055190 #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 No locals. #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 No locals. #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 No locals.
SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list sr-users@lists.sip-router.org http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users
Hi Daniel,
(gdb) p *row->values $1 = {type = DB1_STRING, nul = 0, free = 0, val = {int_val = 58055112, ll_val = 58055112, double_val = 2.8683036404665902e-316, time_val = 58055112, string_val = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", str_val = { s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, blob_val = {s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, bitmap_val = 58055112}}
(gdb) p i $2 = <value optimized out>
Regards, Kristian.
On Wednesday 14 January 2015 11:13:43 Daniel-Constantin Mierla wrote:
Hello,
get also:
p *row->values p i
Cheers, Daniel
On 14/01/15 10:50, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row $1 = {values = 0x9bd2b8, n = 13}
Regards, Kristian.
On Wednesday 14 January 2015 10:01:41 Daniel-Constantin Mierla wrote:
Hello,
the operation done by that code is practically a sql query, fetching the result and packing it for nathelper usage. It is no race between it and handling any sip packet, no shared memory accessed.
Do you get only one corefile?
From the backtrace it looks like row pointer is not ok, can you get the output for:
p *row
Cheers, Daniel
On 13/01/15 15:08, Kristian F. Høgh wrote:
Hi,
On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc.
A backtrace reveals the following: #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){
The lines around line 176 (which look similar to trunk, except p and p1 are str today) 159 for(i = 0; i < RES_ROW_N(res); i++) { 160 row = RES_ROWS(res) + i; 161 162 /* received */ 163 p = (char*)VAL_STRING(ROW_VALUES(row)); 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { 165 /* contact */ 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { 168 LM_ERR("empty contact -> skipping\n"); 169 continue; 170 } 171 } 172 p_len = strlen(p); 173 174 /* path */ 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ 177 p1 = NULL; 178 p1_len = 0; 179 } else { 180 p1_len = strlen(p1); 181 }
The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. Is it a timing issue between REGISTER and nathelper ping?
I have a trace and the core.
Regards, Kristian Høgh Uni-tel
bt full #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 sock = 0x0 dbflags = 11088 host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} i = <value optimized out> row = 0xa13488 dom = 0x2b5062723278 p1 = 0x2 <Address 0x2 out of bounds> now_s = "'2015-01-13 12:58:51'\177\000\000)" port = 11088 p_len = 73 cp = <value optimized out> needed = 16 res = 0xa2ed38 p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> now_len = 21 proto = 1642847400 p1_len = 0 shortage = 0 query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 No locals. #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 rval = <value optimized out> buf = 0xadd270 cp = <value optimized out> c = {s = 0x0, len = 0} opt = {s = 0x0, len = 0} path = {s = 0x0, len = 0} curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} he = <value optimized out> send_sock = <value optimized out> flags = <value optimized out> proto = <value optimized out> dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} iteration = 1 __FUNCTION__ = "nh_timer" #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 pid = <value optimized out> #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 i = 58055190 #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 No locals. #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 No locals. #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 No locals.
SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list sr-users@lists.sip-router.org http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users
Hello,
the string_val field look ok. But p variable which is assigned that value a line before the reported crash is not, as reported by bt full:
p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds>
Do you have enabled core files per process/pid? I still wonder if it is a overwritten core.
Otherwise, there might be physical problems with memory...
By pointer addresses, I guess it is a 32b CPU, right?
Cheers, Daniel
On 14/01/15 11:30, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row->values $1 = {type = DB1_STRING, nul = 0, free = 0, val = {int_val = 58055112, ll_val = 58055112, double_val = 2.8683036404665902e-316, time_val = 58055112, string_val = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", str_val = { s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, blob_val = {s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, bitmap_val = 58055112}}
(gdb) p i $2 = <value optimized out>
Regards, Kristian.
On Wednesday 14 January 2015 11:13:43 Daniel-Constantin Mierla wrote:
Hello,
get also:
p *row->values p i
Cheers, Daniel
On 14/01/15 10:50, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row $1 = {values = 0x9bd2b8, n = 13}
Regards, Kristian.
On Wednesday 14 January 2015 10:01:41 Daniel-Constantin Mierla wrote:
Hello,
the operation done by that code is practically a sql query, fetching the result and packing it for nathelper usage. It is no race between it and handling any sip packet, no shared memory accessed.
Do you get only one corefile?
From the backtrace it looks like row pointer is not ok, can you get the output for:
p *row
Cheers, Daniel
On 13/01/15 15:08, Kristian F. Høgh wrote:
Hi,
On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc.
A backtrace reveals the following: #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){
The lines around line 176 (which look similar to trunk, except p and p1 are str today) 159 for(i = 0; i < RES_ROW_N(res); i++) { 160 row = RES_ROWS(res) + i; 161 162 /* received */ 163 p = (char*)VAL_STRING(ROW_VALUES(row)); 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { 165 /* contact */ 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { 168 LM_ERR("empty contact -> skipping\n"); 169 continue; 170 } 171 } 172 p_len = strlen(p); 173 174 /* path */ 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ 177 p1 = NULL; 178 p1_len = 0; 179 } else { 180 p1_len = strlen(p1); 181 }
The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. Is it a timing issue between REGISTER and nathelper ping?
I have a trace and the core.
Regards, Kristian Høgh Uni-tel
bt full #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 sock = 0x0 dbflags = 11088 host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} i = <value optimized out> row = 0xa13488 dom = 0x2b5062723278 p1 = 0x2 <Address 0x2 out of bounds> now_s = "'2015-01-13 12:58:51'\177\000\000)" port = 11088 p_len = 73 cp = <value optimized out> needed = 16 res = 0xa2ed38 p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> now_len = 21 proto = 1642847400 p1_len = 0 shortage = 0 query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 No locals. #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 rval = <value optimized out> buf = 0xadd270 cp = <value optimized out> c = {s = 0x0, len = 0} opt = {s = 0x0, len = 0} path = {s = 0x0, len = 0} curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} he = <value optimized out> send_sock = <value optimized out> flags = <value optimized out> proto = <value optimized out> dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} iteration = 1 __FUNCTION__ = "nh_timer" #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 pid = <value optimized out> #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 i = 58055190 #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 No locals. #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 No locals. #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 No locals.
SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list sr-users@lists.sip-router.org http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users
Hi Daniel,
# kamailio -V version: kamailio 3.1.1 (x86_64/linux) c274ce flags: STATS: Off, USE_IPV6, USE_TCP, USE_TLS, TLS_HOOKS, USE_RAW_SOCKS, DISABLE_NAGLE, USE_MCAST, DNS_IP_HACK, SHM_MEM, SHM_MMAP, PKG_MALLOC, DBG_QM_MALLOC, USE_FUTEX, FAST_LOCK-ADAPTIVE_WAIT, USE_DNS_CACHE, USE_DNS_FAILOVER, USE_NAPTR, USE_DST_BLACKLIST, HAVE_RESOLV_RES ADAPTIVE_WAIT_LOOPS=1024, MAX_RECV_BUFFER_SIZE 262144, MAX_LISTEN 16, MAX_URI_SIZE 1024, BUF_SIZE 65535, PKG_SIZE 8MB poll method support: poll, epoll_lt, epoll_et, sigio_rt, select. id: c274ce compiled on 22:03:13 Feb 1 2011 with gcc 4.1.2
I got the following in the log Jan 13 12:58:51 sip-3-1 /usr/local/sbin/kamailio[15124]: : <core> [pass_fd.c:293]: ERROR: receive_fd: EOF on 5 Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: ALERT: <core> [main.c:741]: child process 15115 exited by a signal 11 Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: ALERT: <core> [main.c:744]: core was generated Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: INFO: <core> [main.c:756]: INFO: terminating due to SIGCHLD
And the core file is called core.15115
Hope it answers your questions.
Regards, /Kristian.
On Wednesday 14 January 2015 14:09:41 Daniel-Constantin Mierla wrote:
Hello,
the string_val field look ok. But p variable which is assigned that value a line before the reported crash is not, as reported by bt full:
p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds>
Do you have enabled core files per process/pid? I still wonder if it is a overwritten core.
Otherwise, there might be physical problems with memory...
By pointer addresses, I guess it is a 32b CPU, right?
Cheers, Daniel
On 14/01/15 11:30, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row->values $1 = {type = DB1_STRING, nul = 0, free = 0, val = {int_val = 58055112, ll_val = 58055112, double_val = 2.8683036404665902e-316, time_val = 58055112, string_val = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", str_val = { s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, blob_val = {s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, bitmap_val = 58055112}}
(gdb) p i $2 = <value optimized out>
Regards, Kristian.
On Wednesday 14 January 2015 11:13:43 Daniel-Constantin Mierla wrote:
Hello,
get also:
p *row->values p i
Cheers, Daniel
On 14/01/15 10:50, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row $1 = {values = 0x9bd2b8, n = 13}
Regards, Kristian.
On Wednesday 14 January 2015 10:01:41 Daniel-Constantin Mierla wrote:
Hello,
the operation done by that code is practically a sql query, fetching the result and packing it for nathelper usage. It is no race between it and handling any sip packet, no shared memory accessed.
Do you get only one corefile?
From the backtrace it looks like row pointer is not ok, can you get the output for:
p *row
Cheers, Daniel
On 13/01/15 15:08, Kristian F. Høgh wrote:
Hi,
On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc.
A backtrace reveals the following: #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){
The lines around line 176 (which look similar to trunk, except p and p1 are str today) 159 for(i = 0; i < RES_ROW_N(res); i++) { 160 row = RES_ROWS(res) + i; 161 162 /* received */ 163 p = (char*)VAL_STRING(ROW_VALUES(row)); 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { 165 /* contact */ 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { 168 LM_ERR("empty contact -> skipping\n"); 169 continue; 170 } 171 } 172 p_len = strlen(p); 173 174 /* path */ 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ 177 p1 = NULL; 178 p1_len = 0; 179 } else { 180 p1_len = strlen(p1); 181 }
The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. Is it a timing issue between REGISTER and nathelper ping?
I have a trace and the core.
Regards, Kristian Høgh Uni-tel
bt full #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 sock = 0x0 dbflags = 11088 host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} i = <value optimized out> row = 0xa13488 dom = 0x2b5062723278 p1 = 0x2 <Address 0x2 out of bounds> now_s = "'2015-01-13 12:58:51'\177\000\000)" port = 11088 p_len = 73 cp = <value optimized out> needed = 16 res = 0xa2ed38 p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> now_len = 21 proto = 1642847400 p1_len = 0 shortage = 0 query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 No locals. #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 rval = <value optimized out> buf = 0xadd270 cp = <value optimized out> c = {s = 0x0, len = 0} opt = {s = 0x0, len = 0} path = {s = 0x0, len = 0} curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} he = <value optimized out> send_sock = <value optimized out> flags = <value optimized out> proto = <value optimized out> dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} iteration = 1 __FUNCTION__ = "nh_timer" #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 pid = <value optimized out> #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 i = 58055190 #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 No locals. #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 No locals. #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 No locals.
SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list sr-users@lists.sip-router.org http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users
On 14/01/15 14:21, Kristian F. Høgh wrote:
Hi Daniel,
# kamailio -V version: kamailio 3.1.1 (x86_64/linux) c274ce flags: STATS: Off, USE_IPV6, USE_TCP, USE_TLS, TLS_HOOKS, USE_RAW_SOCKS, DISABLE_NAGLE, USE_MCAST, DNS_IP_HACK, SHM_MEM, SHM_MMAP, PKG_MALLOC, DBG_QM_MALLOC, USE_FUTEX, FAST_LOCK-ADAPTIVE_WAIT, USE_DNS_CACHE, USE_DNS_FAILOVER, USE_NAPTR, USE_DST_BLACKLIST, HAVE_RESOLV_RES ADAPTIVE_WAIT_LOOPS=1024, MAX_RECV_BUFFER_SIZE 262144, MAX_LISTEN 16, MAX_URI_SIZE 1024, BUF_SIZE 65535, PKG_SIZE 8MB poll method support: poll, epoll_lt, epoll_et, sigio_rt, select. id: c274ce compiled on 22:03:13 Feb 1 2011 with gcc 4.1.2
I got the following in the log Jan 13 12:58:51 sip-3-1 /usr/local/sbin/kamailio[15124]: : <core> [pass_fd.c:293]: ERROR: receive_fd: EOF on 5 Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: ALERT: <core> [main.c:741]: child process 15115 exited by a signal 11 Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: ALERT: <core> [main.c:744]: core was generated Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: INFO: <core> [main.c:756]: INFO: terminating due to SIGCHLD
And the core file is called core.15115
Hope it answers your questions.
Regards, /Kristian.
On Wednesday 14 January 2015 14:09:41 Daniel-Constantin Mierla wrote:
Hello,
the string_val field look ok. But p variable which is assigned that value a line before the reported crash is not, as reported by bt full:
p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds>
Do you have enabled core files per process/pid? I still wonder if it is a overwritten core.
Otherwise, there might be physical problems with memory...
By pointer addresses, I guess it is a 32b CPU, right?
Cheers, Daniel
On 14/01/15 11:30, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row->values $1 = {type = DB1_STRING, nul = 0, free = 0, val = {int_val = 58055112, ll_val = 58055112, double_val = 2.8683036404665902e-316, time_val = 58055112, string_val = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", str_val = { s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, blob_val = {s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, bitmap_val = 58055112}}
(gdb) p i $2 = <value optimized out>
Regards, Kristian.
On Wednesday 14 January 2015 11:13:43 Daniel-Constantin Mierla wrote:
Hello,
get also:
p *row->values p i
Cheers, Daniel
On 14/01/15 10:50, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row $1 = {values = 0x9bd2b8, n = 13}
Regards, Kristian.
On Wednesday 14 January 2015 10:01:41 Daniel-Constantin Mierla wrote:
Hello,
the operation done by that code is practically a sql query, fetching the result and packing it for nathelper usage. It is no race between it and handling any sip packet, no shared memory accessed.
Do you get only one corefile?
From the backtrace it looks like row pointer is not ok, can you get the output for:
p *row
Cheers, Daniel
On 13/01/15 15:08, Kristian F. Høgh wrote: > Hi, > > On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc. > > A backtrace reveals the following: > #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 > 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ > > > The lines around line 176 (which look similar to trunk, except p and p1 are str today) > 159 for(i = 0; i < RES_ROW_N(res); i++) { > 160 row = RES_ROWS(res) + i; > 161 > 162 /* received */ > 163 p = (char*)VAL_STRING(ROW_VALUES(row)); > 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { > 165 /* contact */ > 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); > 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { > 168 LM_ERR("empty contact -> skipping\n"); > 169 continue; > 170 } > 171 } > 172 p_len = strlen(p); > 173 > 174 /* path */ > 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); > 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ > 177 p1 = NULL; > 178 p1_len = 0; > 179 } else { > 180 p1_len = strlen(p1); > 181 } > > The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. > Is it a timing issue between REGISTER and nathelper ping? > > I have a trace and the core. > > Regards, > Kristian Høgh > Uni-tel > > > > bt full > #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 > sock = 0x0 > dbflags = 11088 > host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} > i = <value optimized out> > row = 0xa13488 > dom = 0x2b5062723278 > p1 = 0x2 <Address 0x2 out of bounds> > now_s = "'2015-01-13 12:58:51'\177\000\000)" > port = 11088 > p_len = 73 > cp = <value optimized out> > needed = 16 > res = 0xa2ed38 > p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> > now_len = 21 > proto = 1642847400 > p1_len = 0 > shortage = 0 > query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> > query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} > #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 > No locals. > #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 > rval = <value optimized out> > buf = 0xadd270 > cp = <value optimized out> > c = {s = 0x0, len = 0} > opt = {s = 0x0, len = 0} > path = {s = 0x0, len = 0} > curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { > s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, > maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { > s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} > he = <value optimized out> > send_sock = <value optimized out> > flags = <value optimized out> > proto = <value optimized out> > dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, > sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} > iteration = 1 > __FUNCTION__ = "nh_timer" > #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 > pid = <value optimized out> > #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 > i = 58055190 > #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 > No locals. > #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 > No locals. > #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 > No locals. > > > > _______________________________________________ > SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list > sr-users@lists.sip-router.org > http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users
Hello,
ok, so it is from the pid.
At this moment I don't see any reason in that part of code to crash, unless hardware problems.
Maybe you can at least upgrade to latest version in 3.1.x series, to rule out the side effects of the bugs found in 3.1 branch at that time. It should be same config and database when using a newer 3.1.x.
Otherwise, that is really hard to remeber what was fixed in 3.1 branch, being quite old by now.
Cheers, Daniel
On 14/01/15 14:21, Kristian F. Høgh wrote:
Hi Daniel,
# kamailio -V version: kamailio 3.1.1 (x86_64/linux) c274ce flags: STATS: Off, USE_IPV6, USE_TCP, USE_TLS, TLS_HOOKS, USE_RAW_SOCKS, DISABLE_NAGLE, USE_MCAST, DNS_IP_HACK, SHM_MEM, SHM_MMAP, PKG_MALLOC, DBG_QM_MALLOC, USE_FUTEX, FAST_LOCK-ADAPTIVE_WAIT, USE_DNS_CACHE, USE_DNS_FAILOVER, USE_NAPTR, USE_DST_BLACKLIST, HAVE_RESOLV_RES ADAPTIVE_WAIT_LOOPS=1024, MAX_RECV_BUFFER_SIZE 262144, MAX_LISTEN 16, MAX_URI_SIZE 1024, BUF_SIZE 65535, PKG_SIZE 8MB poll method support: poll, epoll_lt, epoll_et, sigio_rt, select. id: c274ce compiled on 22:03:13 Feb 1 2011 with gcc 4.1.2
I got the following in the log Jan 13 12:58:51 sip-3-1 /usr/local/sbin/kamailio[15124]: : <core> [pass_fd.c:293]: ERROR: receive_fd: EOF on 5 Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: ALERT: <core> [main.c:741]: child process 15115 exited by a signal 11 Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: ALERT: <core> [main.c:744]: core was generated Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: INFO: <core> [main.c:756]: INFO: terminating due to SIGCHLD
And the core file is called core.15115
Hope it answers your questions.
Regards, /Kristian.
On Wednesday 14 January 2015 14:09:41 Daniel-Constantin Mierla wrote:
Hello,
the string_val field look ok. But p variable which is assigned that value a line before the reported crash is not, as reported by bt full:
p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds>
Do you have enabled core files per process/pid? I still wonder if it is a overwritten core.
Otherwise, there might be physical problems with memory...
By pointer addresses, I guess it is a 32b CPU, right?
Cheers, Daniel
On 14/01/15 11:30, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row->values $1 = {type = DB1_STRING, nul = 0, free = 0, val = {int_val = 58055112, ll_val = 58055112, double_val = 2.8683036404665902e-316, time_val = 58055112, string_val = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", str_val = { s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, blob_val = {s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, bitmap_val = 58055112}}
(gdb) p i $2 = <value optimized out>
Regards, Kristian.
On Wednesday 14 January 2015 11:13:43 Daniel-Constantin Mierla wrote:
Hello,
get also:
p *row->values p i
Cheers, Daniel
On 14/01/15 10:50, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row $1 = {values = 0x9bd2b8, n = 13}
Regards, Kristian.
On Wednesday 14 January 2015 10:01:41 Daniel-Constantin Mierla wrote:
Hello,
the operation done by that code is practically a sql query, fetching the result and packing it for nathelper usage. It is no race between it and handling any sip packet, no shared memory accessed.
Do you get only one corefile?
From the backtrace it looks like row pointer is not ok, can you get the output for:
p *row
Cheers, Daniel
On 13/01/15 15:08, Kristian F. Høgh wrote: > Hi, > > On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc. > > A backtrace reveals the following: > #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 > 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ > > > The lines around line 176 (which look similar to trunk, except p and p1 are str today) > 159 for(i = 0; i < RES_ROW_N(res); i++) { > 160 row = RES_ROWS(res) + i; > 161 > 162 /* received */ > 163 p = (char*)VAL_STRING(ROW_VALUES(row)); > 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { > 165 /* contact */ > 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); > 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { > 168 LM_ERR("empty contact -> skipping\n"); > 169 continue; > 170 } > 171 } > 172 p_len = strlen(p); > 173 > 174 /* path */ > 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); > 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ > 177 p1 = NULL; > 178 p1_len = 0; > 179 } else { > 180 p1_len = strlen(p1); > 181 } > > The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. > Is it a timing issue between REGISTER and nathelper ping? > > I have a trace and the core. > > Regards, > Kristian Høgh > Uni-tel > > > > bt full > #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 > sock = 0x0 > dbflags = 11088 > host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} > i = <value optimized out> > row = 0xa13488 > dom = 0x2b5062723278 > p1 = 0x2 <Address 0x2 out of bounds> > now_s = "'2015-01-13 12:58:51'\177\000\000)" > port = 11088 > p_len = 73 > cp = <value optimized out> > needed = 16 > res = 0xa2ed38 > p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> > now_len = 21 > proto = 1642847400 > p1_len = 0 > shortage = 0 > query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> > query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} > #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 > No locals. > #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 > rval = <value optimized out> > buf = 0xadd270 > cp = <value optimized out> > c = {s = 0x0, len = 0} > opt = {s = 0x0, len = 0} > path = {s = 0x0, len = 0} > curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { > s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, > maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { > s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} > he = <value optimized out> > send_sock = <value optimized out> > flags = <value optimized out> > proto = <value optimized out> > dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, > sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} > iteration = 1 > __FUNCTION__ = "nh_timer" > #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 > pid = <value optimized out> > #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 > i = 58055190 > #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 > No locals. > #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 > No locals. > #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 > No locals. > > > > _______________________________________________ > SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list > sr-users@lists.sip-router.org > http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users
Hi Daniel,
I will see if I can grow my balls big enough to upgrade :-) Thanks for your help and time.
Regards, Kristian.
On Wednesday 14 January 2015 14:58:22 Daniel-Constantin Mierla wrote:
Hello,
ok, so it is from the pid.
At this moment I don't see any reason in that part of code to crash, unless hardware problems.
Maybe you can at least upgrade to latest version in 3.1.x series, to rule out the side effects of the bugs found in 3.1 branch at that time. It should be same config and database when using a newer 3.1.x.
Otherwise, that is really hard to remeber what was fixed in 3.1 branch, being quite old by now.
Cheers, Daniel
On 14/01/15 14:21, Kristian F. Høgh wrote:
Hi Daniel,
# kamailio -V version: kamailio 3.1.1 (x86_64/linux) c274ce flags: STATS: Off, USE_IPV6, USE_TCP, USE_TLS, TLS_HOOKS, USE_RAW_SOCKS, DISABLE_NAGLE, USE_MCAST, DNS_IP_HACK, SHM_MEM, SHM_MMAP, PKG_MALLOC, DBG_QM_MALLOC, USE_FUTEX, FAST_LOCK-ADAPTIVE_WAIT, USE_DNS_CACHE, USE_DNS_FAILOVER, USE_NAPTR, USE_DST_BLACKLIST, HAVE_RESOLV_RES ADAPTIVE_WAIT_LOOPS=1024, MAX_RECV_BUFFER_SIZE 262144, MAX_LISTEN 16, MAX_URI_SIZE 1024, BUF_SIZE 65535, PKG_SIZE 8MB poll method support: poll, epoll_lt, epoll_et, sigio_rt, select. id: c274ce compiled on 22:03:13 Feb 1 2011 with gcc 4.1.2
I got the following in the log Jan 13 12:58:51 sip-3-1 /usr/local/sbin/kamailio[15124]: : <core> [pass_fd.c:293]: ERROR: receive_fd: EOF on 5 Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: ALERT: <core> [main.c:741]: child process 15115 exited by a signal 11 Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: ALERT: <core> [main.c:744]: core was generated Jan 13 12:58:52 sip-3-1 /usr/local/sbin/kamailio[15094]: INFO: <core> [main.c:756]: INFO: terminating due to SIGCHLD
And the core file is called core.15115
Hope it answers your questions.
Regards, /Kristian.
On Wednesday 14 January 2015 14:09:41 Daniel-Constantin Mierla wrote:
Hello,
the string_val field look ok. But p variable which is assigned that value a line before the reported crash is not, as reported by bt full:
p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds>
Do you have enabled core files per process/pid? I still wonder if it is a overwritten core.
Otherwise, there might be physical problems with memory...
By pointer addresses, I guess it is a 32b CPU, right?
Cheers, Daniel
On 14/01/15 11:30, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row->values $1 = {type = DB1_STRING, nul = 0, free = 0, val = {int_val = 58055112, ll_val = 58055112, double_val = 2.8683036404665902e-316, time_val = 58055112, string_val = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", str_val = { s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, blob_val = {s = 0x375d9c8 "sip:44221xxx@192.168.1.157:37612;rinstance=8c613319a30caff6;transport=TCP", len = 0}, bitmap_val = 58055112}}
(gdb) p i $2 = <value optimized out>
Regards, Kristian.
On Wednesday 14 January 2015 11:13:43 Daniel-Constantin Mierla wrote:
Hello,
get also:
p *row->values p i
Cheers, Daniel
On 14/01/15 10:50, Kristian F. Høgh wrote:
Hi Daniel,
(gdb) p *row $1 = {values = 0x9bd2b8, n = 13}
Regards, Kristian.
On Wednesday 14 January 2015 10:01:41 Daniel-Constantin Mierla wrote: > Hello, > > the operation done by that code is practically a sql query, fetching the > result and packing it for nathelper usage. It is no race between it and > handling any sip packet, no shared memory accessed. > > Do you get only one corefile? > > From the backtrace it looks like row pointer is not ok, can you get the > output for: > > p *row > > Cheers, > Daniel > > On 13/01/15 15:08, Kristian F. Høgh wrote: >> Hi, >> >> On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc. >> >> A backtrace reveals the following: >> #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 >> 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ >> >> >> The lines around line 176 (which look similar to trunk, except p and p1 are str today) >> 159 for(i = 0; i < RES_ROW_N(res); i++) { >> 160 row = RES_ROWS(res) + i; >> 161 >> 162 /* received */ >> 163 p = (char*)VAL_STRING(ROW_VALUES(row)); >> 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { >> 165 /* contact */ >> 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); >> 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { >> 168 LM_ERR("empty contact -> skipping\n"); >> 169 continue; >> 170 } >> 171 } >> 172 p_len = strlen(p); >> 173 >> 174 /* path */ >> 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); >> 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ >> 177 p1 = NULL; >> 178 p1_len = 0; >> 179 } else { >> 180 p1_len = strlen(p1); >> 181 } >> >> The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. >> Is it a timing issue between REGISTER and nathelper ping? >> >> I have a trace and the core. >> >> Regards, >> Kristian Høgh >> Uni-tel >> >> >> >> bt full >> #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 >> sock = 0x0 >> dbflags = 11088 >> host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} >> i = <value optimized out> >> row = 0xa13488 >> dom = 0x2b5062723278 >> p1 = 0x2 <Address 0x2 out of bounds> >> now_s = "'2015-01-13 12:58:51'\177\000\000)" >> port = 11088 >> p_len = 73 >> cp = <value optimized out> >> needed = 16 >> res = 0xa2ed38 >> p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> >> now_len = 21 >> proto = 1642847400 >> p1_len = 0 >> shortage = 0 >> query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> >> query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} >> #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 >> No locals. >> #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 >> rval = <value optimized out> >> buf = 0xadd270 >> cp = <value optimized out> >> c = {s = 0x0, len = 0} >> opt = {s = 0x0, len = 0} >> path = {s = 0x0, len = 0} >> curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { >> s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, >> maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { >> s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} >> he = <value optimized out> >> send_sock = <value optimized out> >> flags = <value optimized out> >> proto = <value optimized out> >> dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, >> sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} >> iteration = 1 >> __FUNCTION__ = "nh_timer" >> #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 >> pid = <value optimized out> >> #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 >> i = 58055190 >> #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 >> No locals. >> #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 >> No locals. >> #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 >> No locals. >> >> >> >> _______________________________________________ >> SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list >> sr-users@lists.sip-router.org >> http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users
Hi Daniel,
Sorry. Forgot to answer your question. I am quite sure, I only got one core file.
Regards, Kristian Høgh Uni-tel
On Wednesday 14 January 2015 10:01:41 Daniel-Constantin Mierla wrote:
Hello,
the operation done by that code is practically a sql query, fetching the result and packing it for nathelper usage. It is no race between it and handling any sip packet, no shared memory accessed.
Do you get only one corefile?
From the backtrace it looks like row pointer is not ok, can you get the output for:
p *row
Cheers, Daniel
On 13/01/15 15:08, Kristian F. Høgh wrote:
Hi,
On an ancient kamailio 3.1, we occasionally experience crash in the function get_all_db_ucontacts in usrloc.
A backtrace reveals the following: #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){
The lines around line 176 (which look similar to trunk, except p and p1 are str today) 159 for(i = 0; i < RES_ROW_N(res); i++) { 160 row = RES_ROWS(res) + i; 161 162 /* received */ 163 p = (char*)VAL_STRING(ROW_VALUES(row)); 164 if ( VAL_NULL(ROW_VALUES(row)) || p==0 || p[0]==0 ) { 165 /* contact */ 166 p = (char*)VAL_STRING(ROW_VALUES(row)+1); 167 if (VAL_NULL(ROW_VALUES(row)+1) || p==0 || p[0]==0) { 168 LM_ERR("empty contact -> skipping\n"); 169 continue; 170 } 171 } 172 p_len = strlen(p); 173 174 /* path */ 175 p1 = (char*)VAL_STRING(ROW_VALUES(row)+4); 176 if (VAL_NULL(ROW_VALUES(row)+4) || p1==0 || p1[0]==0){ 177 p1 = NULL; 178 p1_len = 0; 179 } else { 180 p1_len = strlen(p1); 181 }
The client from frame #2 (bt full below), send a REGISTER, which was replied "200 OK" at the same second as the crash. Is it a timing issue between REGISTER and nathelper ping?
I have a trace and the core.
Regards, Kristian Høgh Uni-tel
bt full #0 0x00002b505f2fb4ff in get_all_db_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:176 sock = 0x0 dbflags = 11088 host = {s = 0x375ef8e "PROXY_IP:5060", len = 13} i = <value optimized out> row = 0xa13488 dom = 0x2b5062723278 p1 = 0x2 <Address 0x2 out of bounds> now_s = "'2015-01-13 12:58:51'\177\000\000)" port = 11088 p_len = 73 cp = <value optimized out> needed = 16 res = 0xa2ed38 p = 0xcfcdffaf <Address 0xcfcdffaf out of bounds> now_len = 21 proto = 1642847400 p1_len = 0 shortage = 0 query_buf = "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", '\000' <repeats 376 times> query_str = {s = 0x2b505f515520 "select received, contact, socket, cflags, path from location where expires > '2015-01-13 12:58:51' and cflags & 64 = 64 and id % 30 = 1", len = 135} #1 get_all_ucontacts (buf=0xadd270, len=<value optimized out>, flags=64, part_idx=1, part_max=30) at dlist.c:398 No locals. #2 0x00002b5061eb6b1a in nh_timer (ticks=<value optimized out>, timer_idx=<value optimized out>) at nathelper.c:1623 rval = <value optimized out> buf = 0xadd270 cp = <value optimized out> c = {s = 0x0, len = 0} opt = {s = 0x0, len = 0} path = {s = 0x0, len = 0} curi = {user = {s = 0x0, len = 0}, passwd = {s = 0x0, len = 0}, host = {s = 0xadd836 "CLIENT_IP:62252\250u\220", len = 14}, port = {s = 0xadd845 "62252\250u\220", len = 5}, params = {s = 0x0, len = 0}, sip_params = {s = 0x0, len = 0}, headers = { s = 0x7fff110134b0 "", len = 566862356}, port_no = 50336, proto = 70, type = ERROR_URI_T, flags = 65536, transport = {s = 0x8cbfa0 "", len = 5441948}, ttl = {s = 0x7fff1101352f "", len = 4448297}, user_param = {s = 0x375ef8e "PROXY_IP:5060", len = 13}, maddr = {s = 0x9204b0 "sip3", len = 10677560}, method = {s = 0x0, len = 0}, lr = {s = 0xadd84a "\250u\220", len = 0}, r2 = {s = 0xa2ebc0 "\250\272\252", len = 285291296}, transport_val = {s = 0x40 <Address 0x40 out of bounds>, len = 335544320}, ttl_val = { s = 0x3221c302d0 "H\307\300\017", len = 0}, user_param_val = {s = 0x0, len = 0}, maddr_val = {s = 0x0, len = 0}, method_val = {s = 0x0, len = 0}, lr_val = {s = 0x0, len = 0}, r2_val = {s = 0x0, len = 0}} he = <value optimized out> send_sock = <value optimized out> flags = <value optimized out> proto = <value optimized out> dst = {send_sock = 0x0, to = {s = {sa_family = 0, sa_data = '\000' <repeats 13 times>}, sin = {sin_family = 0, sin_port = 0, sin_addr = {s_addr = 0}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 0, sin6_port = 0, sin6_flowinfo = 0, sin6_addr = {in6_u = {u6_addr8 = '\000' <repeats 15 times>, u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, id = 0, proto = 0 '\000', send_flags = {f = 0 '\000', blst_imask = 0 '\000'}} iteration = 1 __FUNCTION__ = "nh_timer" #3 0x0000000000521c54 in fork_dummy_timer (child_id=<value optimized out>, desc=<value optimized out>, make_sock=<value optimized out>, f=0x2b5061eb6a70 <nh_timer>, param=0x0, interval=1) at timer_proc.c:71 pid = <value optimized out> #4 0x00002b5061eb42ee in child_init (rank=<value optimized out>) at nathelper.c:673 i = 58055190 #5 0x00000000004db1d0 in init_mod_child (m=0x917dc0, rank=0) at sr_module.c:829 No locals. #6 0x00000000004db1a9 in init_mod_child (m=0x918420, rank=0) at sr_module.c:826 No locals. #7 0x00000000004db1a9 in init_mod_child (m=0x918aa0, rank=0) at sr_module.c:826 No locals.
SIP Express Router (SER) and Kamailio (OpenSER) - sr-users mailing list sr-users@lists.sip-router.org http://lists.sip-router.org/cgi-bin/mailman/listinfo/sr-users