Version: 5.3.3 065668
Platform: CentOS 7 (3.10.0-862.14.4.el7.x86_64)

I seem to have a persistent intermittent crash on all five gateways which use http_async_client to query an HTTP API.

In all cases, the crash is on attempt to access or reference an AVP called $avp(is_subscribe):

Program terminated with signal 11, Segmentation fault.
#0  0x00000000005d479b in match_by_name (avp=0xabcdefed, id=45, name=0x7fff9379c668)
    at core/usr_avp.c:378
378		if (id==avp->id && avp->flags&AVP_NAME_STR &&
hMissing separate debuginfos, use: debuginfo-install libuuid-2.23.2-52.el7_5.1.x86_64 nss-pem-1.0.3-4.el7.x86_64
(gdb) where
#0  0x00000000005d479b in match_by_name (avp=0xabcdefed, id=45, name=0x7fff9379c668)
    at core/usr_avp.c:378
#1  0x00000000005d54bf in search_next_avp (s=0x7fff9379c660, val=0x7fff9379c630)
    at core/usr_avp.c:499
#2  0x00000000005d4f04 in search_avp (ident=..., val=0x7fff9379c630, state=0x7fff9379c660)
    at core/usr_avp.c:465
#3  0x00000000005d4913 in search_first_avp (flags=1, name=..., val=0x7fff9379c630, 
    s=0x7fff9379c660) at core/usr_avp.c:414
#4  0x00007f3f6ba529fd in ops_is_avp_set (msg=0xb04f60 <_faked_msg>, ap=0x7f3f73296ed8)
    at avpops_impl.c:1821
#5  0x00007f3f6ba60d47 in w_is_avp_set (msg=0xb04f60 <_faked_msg>, param=0x7f3f73296ed8 "", 
    op=0x0) at avpops.c:1059
#6  0x0000000000480f3d in do_action (h=0x7fff9379ce70, a=0x7f3f732a8b30, 
    msg=0xb04f60 <_faked_msg>) at core/action.c:1077
#7  0x000000000048da7c in run_actions (h=0x7fff9379ce70, a=0x7f3f732a8b30, 
    msg=0xb04f60 <_faked_msg>) at core/action.c:1576
#8  0x000000000048e13d in run_actions_safe (h=0x7fff9379dae0, a=0x7f3f732a8b30, 
    msg=0xb04f60 <_faked_msg>) at core/action.c:1640
#9  0x0000000000438ff3 in rval_get_int (h=0x7fff9379dae0, msg=0xb04f60 <_faked_msg>, 
    i=0x7fff9379d318, rv=0x7f3f732a8d70, cache=0x0) at core/rvalue.c:915
#10 0x000000000043d5a3 in rval_expr_eval_int (h=0x7fff9379dae0, msg=0xb04f60 <_faked_msg>, 
    res=0x7fff9379d318, rve=0x7f3f732a8d68) at core/rvalue.c:1913
#11 0x000000000043d9f6 in rval_expr_eval_int (h=0x7fff9379dae0, msg=0xb04f60 <_faked_msg>, 
    res=0x7fff9379d7cc, rve=0x7f3f732a9598) at core/rvalue.c:1921
#12 0x000000000048097b in do_action (h=0x7fff9379dae0, a=0x7f3f732aa9c0, 
    msg=0xb04f60 <_faked_msg>) at core/action.c:1047
#13 0x000000000048da7c in run_actions (h=0x7fff9379dae0, a=0x7f3f732aa9c0, 
    msg=0xb04f60 <_faked_msg>) at core/action.c:1576
#14 0x000000000048e205 in run_top_route (a=0x7f3f732aa9c0, msg=0xb04f60 <_faked_msg>, c=0x0)
    at core/action.c:1661
#15 0x00007f3f705f7d0e in async_http_cb (reply=0x7f3f2b7ee620, param=0x7f3f2b7eca18)
    at async_http.c:256
#16 0x00007f3f705f0dab in check_multi_info (g=0x7f3f2abeb9e8) at http_multi.c:597
#17 0x00007f3f705e84ec in event_cb (fd=6, kind=2, userp=0x1d7cdf0) at http_multi.c:145
#18 0x00007f3f70131a14 in event_process_active_single_queue (activeq=0x1c6f820, base=0x1d3ad90)
    at event.c:1350
#19 event_process_active (base=<optimized out>) at event.c:1420
#20 event_base_loop (base=0x1d3ad90, flags=0) at event.c:1621
#21 0x00007f3f705f490f in async_http_run_worker (worker=0x7f3f2abd1bb0) at async_http.c:92
#22 0x00007f3f705dcb17 in child_init (rank=0) at http_async_client_mod.c:366
#23 0x000000000057a810 in init_mod_child (m=0x7f3f73245168, rank=0) at core/sr_module.c:780
#24 0x000000000057a4ac in init_mod_child (m=0x7f3f73245d58, rank=0) at core/sr_module.c:776
#25 0x000000000057a4ac in init_mod_child (m=0x7f3f73246340, rank=0) at core/sr_module.c:776
#26 0x000000000057a4ac in init_mod_child (m=0x7f3f732467f8, rank=0) at core/sr_module.c:776
#27 0x000000000057a4ac in init_mod_child (m=0x7f3f73246e38, rank=0) at core/sr_module.c:776
#28 0x000000000057a4ac in init_mod_child (m=0x7f3f73247210, rank=0) at core/sr_module.c:776
#29 0x000000000057a4ac in init_mod_child (m=0x7f3f73247978, rank=0) at core/sr_module.c:776
#30 0x000000000057a4ac in init_mod_child (m=0x7f3f73247da0, rank=0) at core/sr_module.c:776
#31 0x000000000057af5a in init_child (rank=0) at core/sr_module.c:825
#32 0x0000000000426bc1 in main_loop () at main.c:1753
#33 0x000000000042e63a in main (argc=13, argv=0x7fff9379edb8) at main.c:2802

Most recently, it is here:

route {
   $avp(is_subscribe) = 0;

   ...

   $http_req(method) = 'GET';
   $http_req(suspend) = 1;
   $http_req(hdr) = "x-api-key: " + $sel(cfg_get.api.whitelist_key);

   t_newtran();

   http_async_query("https://some_host/check_ip_in_whi
telist?ip=$(si{s.escape.common})", "WHITELIST_CHECK");
}

...

route[WHITELIST_CHECK] {
   if(!is_avp_set("$avp(is_subscribe)")) {
      xlog("L_INFO", "[R-HTTP-REPLY:$ci] !> AVP is_subscribe is undefined -- cannot process further\n");
      sl_send_reply("500", "Internal Server Error");
      exit;
   }

However, adding is_avp_set() was my attempt to put a safe-guard around unsafe access of this AVP. It didn't work:

(gdb) print *name
$1 = {s = 0x7f3f73261835 "is_subscribe)", len = 12}
(gdb) print avp
$2 = (avp_t *) 0xabcdefed
(gdb) print avp->id
Cannot access memory at address 0xabcdefed

Note that I am making the assumption above that AVPs created in the main request route will be available in the http_async_client callback, because it is just a suspended TM transaction underneath -- right? So, transaction-persistent variables such as AVP should be visible throughout the lifetime of the transaction just as for a conventional async TM transaction, I would think.

Digging further into the mesage being handled in async_http_cb(), it is always a normal HTTP 200 OK reply:

(gdb) set print elements 0
(gdb) frame 15
#15 0x00007f3f705f7d0e in async_http_cb (reply=0x7f3f2b7ee620, param=0x7f3f2b7eca18)
    at async_http.c:256
256					if (run_top_route(act, fmsg, 0)<0) {
(gdb) print *reply
$5 = {retcode = 200, result = 0x7f3f2b7ec560, error = '\000' <repeats 255 times>}
(gdb) print *(reply->result)
$6 = {
  s = 0x7f3f2b7ee798 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: 16\r\nConnection: keep-alive\r\nDate: Wed, 15 Apr 2020 15:06:52 GMT\r\nx-amzn-RequestId: d3a44941-e0c4-4cb9-9144-ea54d929c300\r\nx-amz-apigw-id: LCJqBF7CIAMFXCw=\r\nX-Amzn-Trace-Id: Root=1-5e97230c-4fe1af7d072275d95c231a4c;Sampled=0\r\nX-Cache: Miss from cloudfront\r\nHia: 1.1 ade18dc841d2e1cc8ef49611c5d4c93e.cloudfront.net (CloudFront)\r\nX-Amz-Cf-Pop: IAD89-C2\r\nX-Amz-Cf-Id: 3s1S2kvOgL7L4ZKMlWwQnfIxHmWFGz0PHbcNFefYA1tZ1j0AGy3JHg==\r\n\r\n{\"status\":false}\300\300\300\300", len = 504}

Any insights would be appreciated! These crashes are happening basically hourly from nowhere, after over a year of undisturbed operation.


You are receiving this because you are subscribed to this thread.
Reply to this email directly, view it on GitHub, or unsubscribe.