#!/pkg/bin/ksh
#
# May 2010, Huan Le
#
#
# Copyright (c) 2010-2015, 2017 by cisco Systems, Inc.
# All rights reserved.
#

# Name this script was invoked as, with any leading directory removed.
prog=${0##*/}

# Values taken from the command line, or derived from it later on.
typeset client_name="" client_jid="" group="" location="" target_nodeid=""
# File that receives the raw output of the ping commands.
typeset ofile="/dev/null"
# Status flags written by the check functions.
typeset stop_test=0 test_failed=0
# Platform name: an inherited $platform wins, otherwise the part of
# `uname -m` that precedes the first '-'.
typeset platform=${platform:-$(uname -m | cut -d "-" -f 1)}

usage() {
   # Print the two-line help text (description + synopsis) on stdout.
   # Reads the global $prog for the program name.
   typeset synopsis="-g <gid> [-c <client-name>] [-L <nodeid>]"

   echo "$prog - show group communication health status"
   echo "usage: $prog ${synopsis}"
}

check_media_ping() {
   # Run the platform-specific media checks for group id $1.
   #
   # Group ids of 2000 and above select the "lr" plane, everything else
   # the "global" plane.  The handlers to run are looked up in the array
   # named "<platform>_<plane>" and invoked through execute_command.
   #
   # Globals: platform (read), commands (written), stop_test (read),
   #          test_failed (set to 1 when stop_test is 1 afterwards).
   typeset plane="global"
   [[ $1 -ge 2000 ]] && plane="lr"

   commands="${platform}_${plane}"
   execute_command $commands $1

   # NOTE(review): the media handlers visible in this file report failure
   # through test_failed, not stop_test -- confirm this check is intended.
   [[ ${stop_test} -eq 1 ]] || return 0

   echo "   Detect ${plane} plane transport issue."
   test_failed=1
}

check_gsp_sysgrp_ping() {
    # Ping the gsp system group of the plane named by $1:
    # "lr" maps to group 2001, anything else to group 1.
    typeset sysgrp
    case "$1" in
        lr) sysgrp=2001 ;;
        *)  sysgrp=1 ;;
    esac
    check_gsp_ping ${sysgrp}
}

check_gsp_seq_ping() {
   # Run the gsp_ping variant used for the sequence-number check on group
   # $1 (plus the optional "-L <nodeid>" held in $location), append its
   # output to $ofile, and flag a failure when the output contains
   # "gsp ping failed" followed by a non-empty third ':'-separated field.
   #
   # Globals: location, ofile (read); test_failed (set to 1 on mismatch).
   typeset ping_out=$(/pkg/bin/gsp_ping -c 10 -x -e -rv -g $1 ${location})

   {
      echo "GSP sequence number ping:"
      # Unquoted on purpose: the log gets the output folded onto one line.
      echo $ping_out
   } >> ${ofile}

   # The unquoted echo also makes grep/cut see the whole output as one line.
   typeset mismatch=$(echo $ping_out | grep "gsp ping failed" | cut -d ":" -f 3-)
   if [[ -n $mismatch ]]; then
      echo "   Detect group $1 sequence-number mismatch."
      test_failed=1
   fi
}

check_gsp_snv_ping() {
   # Run the gsp_ping variant used for the node/membership check on group
   # $1 (plus the optional "-L <nodeid>" held in $location), append its
   # output to $ofile, and flag a failure when the output contains "NO"
   # or "TIMEOUT".
   #
   # Globals: location, ofile (read); test_failed (set to 1 on mismatch).
   typeset ping_out=$(/pkg/bin/gsp_ping -c 10 -a -v -g $1 ${location})

   {
      echo "GSP node ping:"
      # Unquoted on purpose: the log gets the output folded onto one line.
      echo $ping_out
   } >> ${ofile}

   typeset mismatch=$(echo $ping_out | grep -e "NO" -e "TIMEOUT")
   if [[ -n $mismatch ]]; then
      echo "   Detect group $1 membership mismatch."
      echo "   Please retry a few times to avoid transient mismatch."
      test_failed=1
   fi
}

check_gsp_ping() {
   # Ping gsp group $1 (plus the optional "-L <nodeid>" held in $location)
   # and list every node reported after a "gsp ping failed" marker.
   #
   # Globals: location, ofile, target_nodeid (read);
   #          stop_test (set to 1 when the first failed node is seen);
   #          test_failed (set to 1 whenever stop_test is 1 on exit).
   # Output:  raw ping output is appended to $ofile; findings go to stdout.
   typeset i
   typeset cmdargs="-g $1 ${location}"
   typeset output=$(/pkg/bin/gsp_ping -c 10 -r -e ${cmdargs})
   echo "GSP Ping:" >> ${ofile}
   echo ${output} >> ${ofile}
   typeset failed_nodes=""
   # The unquoted echo folds the output onto one line, so everything after
   # the second ':' of that line is treated as the list of failed nodes.
   failed_nodes=$(echo ${output} | grep "gsp ping failed" | cut -d ":" -f 3-)

   for i in $failed_nodes; do
      # presumably translates the node id into a printable node name --
      # verify against node_conversion's documentation
      typeset rsi_node=$(/pkg/bin/node_conversion -e $i)
      if [[ $stop_test -eq 0 ]]; then
         stop_test=1
         # String test, not arithmetic: "-eq" would treat node id 0 as
         # "no target" and raise an arithmetic error on a non-numeric id.
         if [[ -z ${target_nodeid} ]]; then
             echo "   Gsp ping on group $1 to the following node failed:"
         else
             echo "   Gsp ping on group $1 from ${target_nodeid} to the following node failed:"
         fi
      fi
      echo "     $rsi_node"
      # Placeholder: capture trouble-shooting data on this node
   done

   # NOTE(review): stop_test is not cleared here, so a value left at 1 by
   # an earlier call is reported again -- confirm this is intended.
   if [[ $stop_test -eq 1 ]]; then
       echo "   Detect unresponsive node(s) . "
       test_failed=1
   fi
}

# EnXR media check
check_enxr_global_media() {
    # EnXR global-plane media handler: runs no probe, it only clears the
    # global stop_test flag.  Any arguments are ignored.
    stop_test=0
    return 0
}

check_enxr_lr_media() {
    # EnXR lr-plane media handler: runs no probe, it only clears the
    # global stop_test flag.  Any arguments are ignored.
    stop_test=0
    return 0
}

# CRS media check
check_hfr_fabric_ping() {
    # Ping gsp group $1 over the CRS fabric.
    #
    # The group's media address is the second space-separated field of the
    # `gsp_stats -a -g <gid>` line that starts with "<gid> "; it is handed
    # to diag_ping as the fgid.  The ping passes when its output matches
    # "All ... node... responded".
    #
    # Globals: ofile (read); test_failed (set to 1 on any failure).
    typeset diag_output=$(gsp_stats -a -g $1 | grep "^$1 ")
    typeset grp_addr=$(echo ${diag_output} | cut -d " " -f 2)
    if [[ -z ${grp_addr} ]]; then
        echo "Warning: cannot get media address for group $1"
        test_failed=1
        # Without an address there is nothing to ping; running diag_ping
        # with an empty fgid would only add a second, misleading error.
        return 0
    fi

    diag_output=$(diag_ping fab fgid ${grp_addr} cnt 10)
    echo "CRS fabric ping: " >> ${ofile}
    echo ${diag_output} >> ${ofile}
    typeset ping_output=$(echo ${diag_output} | grep "All.*node.*s*. responded")

    if [[ -z ${ping_output} ]]; then
        # Kept on one line: a backslash continuation inside the quotes
        # would embed the next line's indentation in the message.
        echo "Detect CRS fabric ping error on gsp group $1 fgid ${grp_addr}"
        test_failed=1
    fi
}

check_hfr_ceth_ping() {
    # Multicast-ping gsp group $1 over the CRS control-ethernet.
    #
    # The group's media address is the second space-separated field of the
    # `gsp_stats -a -g <gid>` line that starts with "<gid> "; it is handed
    # to mceping as the destination (-d).
    #
    # Globals: ofile (read); test_failed (set to 1 on any failure).
    typeset diag_output=$(gsp_stats -a -g $1 | grep "^$1 ")
    typeset grp_addr=$(echo ${diag_output} | cut -d " " -f 2)
    if [[ -z ${grp_addr} ]]; then
        echo "Warning: cannot get media address for group $1"
        test_failed=1
        # Without an address there is nothing to ping; do not run mceping
        # with an empty -d argument.
        return 0
    fi

    # NOTE(review): mceping is started with '&' inside the substitution;
    # its output is still captured.  Reason for the '&' is not visible here.
    diag_output=$(mceping -d ${grp_addr} -n 5 -t 2 -l 1 2>&1 &)
    # An error is output that has "(0x... 5" but no second " 5" after the
    # first -- presumably sent/received counters of 5; verify against
    # mceping's output format.
    typeset any_error=$(echo ${diag_output} | \
                        grep -e "(0x.* 5" | grep -v " 5.* 5")
    echo "CRS mcast control-ether ping: " >> ${ofile}
    echo $diag_output >> ${ofile}
    if [[ -n ${any_error} ]]; then
        echo "  Detect unreachable node(s) over control-ethernet."
        test_failed=1
    fi
}

check_hfr_global_media() {
    # CRS global-plane media handler: clear the global stop_test flag,
    # then ping group $1 over the control-ethernet.  The ping's exit
    # status is passed back to the caller.
    stop_test=0

    check_hfr_ceth_ping $1
    return $?
}

check_hfr_lr_media() {
    # CRS lr-plane media handler: clear the global stop_test flag, then
    # ping group $1 over the fabric.
    #
    # The reset matches the other media handlers in this file; without it
    # a stop_test left at 1 by an earlier gsp ping is misreported by the
    # caller as an lr plane transport issue.
    stop_test=0
    check_hfr_fabric_ping $1
}

init_command_list() {
    # Fill the global "<platform>_<plane>" arrays with the media-check
    # handlers that execute_command runs for that platform and plane.

    # Global plane handlers (CRS, then EnXR):
    hfr_global[0]=check_hfr_global_media
    enxr_global[0]=check_enxr_global_media

    # lr plane handlers (CRS, then EnXR):
    hfr_lr[0]=check_hfr_lr_media
    enxr_lr[0]=check_enxr_lr_media
}

execute_command() {
    # Run, in index order, every command stored in the array whose NAME is
    # given in $1, passing $2 to each one.
    #
    # The array is reached by name through eval; the element count comes
    # from ${#name[*]}, so the array is assumed to be densely indexed
    # from 0.
    typeset cmd_list=$1
    typeset cmd_count=$(eval echo "\${#${cmd_list}[*]}")
    typeset cmd_idx=0
    typeset cmd_name

    while [[ $cmd_idx -lt $cmd_count ]] ; do
        cmd_name=$(eval echo "\${${cmd_list}[$cmd_idx]}")
        $cmd_name $2
        cmd_idx=$((cmd_idx + 1))
    done
}

# Parse the command line.
#   -c <client-name>  also check every group of this client
#   -g <gid>          application group to check
#   -L <nodeid>       run the pings from this node
#   -f <file>         save the raw command output in <file>
#   -h | -H           show usage and exit
#
# The help arm must come before the catch-all: an unquoted '?' in a case
# pattern is a glob that matches ANY single character, so a ":|?)" arm
# placed first also swallows 'h' and 'H'.
while getopts ":c:g:L:f:hH-?" option
do case $option in
   c)
      client_name="$OPTARG"
      ;;
   g)
      group="$OPTARG"
      ;;
   L)
      target_nodeid="$OPTARG"
      location="-L ${target_nodeid}"
      ;;
   f)
      ofile="$OPTARG"
      ;;
   h|H)
      usage
      exit 0
      ;;
   :)
      # Silent-mode getopts: ':' means the option in OPTARG lacks its value.
      echo "Error: option -$OPTARG required a value"
      usage
      exit 2
      ;;
   *)
      # '?' (unknown option, its letter is in OPTARG) or a literal -? / --
      if [[ -n ${OPTARG} ]]; then
          echo "Error: invalid option -$OPTARG"
      fi
      usage
      exit 2
      ;;
esac
done

# Verify the client-name is valid on the node: look up its job id with
# sysmgr_show, on the target node when one was given.
if [[ -n ${client_name} ]]; then
    # String tests throughout: "-eq" compares arithmetically, which treats
    # a node id of 0 as empty and fails on a non-numeric value.
    if [[ -z ${target_nodeid} ]]; then
        client_jid=$(sysmgr_show -p ${client_name} | grep "Job Id" | cut -d ":" -f2)
    else
        client_jid=$(sysmgr_show -p ${client_name} -n ${target_nodeid}| grep "Job Id" | cut -d ":" -f2)
    fi

    # Strip the blanks that surround the value after the ':' so that a
    # whitespace-only result still counts as "not found".
    client_jid=$(echo ${client_jid})

    if [[ -z ${client_jid} ]]; then
        echo "Error: cannot find a job id for client ${client_name}"
        usage
        exit -4
    fi
fi
delete_ofile=1
# Save output of executed command(s) into an output file
if [[ -z ${ofile} ]]; then
    delete_ofile=0
    ofile="/dev/null"
else
    # Refuse to append to a regular file that already exists.
    if [[ -f ${ofile} ]]; then
        echo "$prog: cannot write to existing file ${ofile}"
        exit 1
    fi
fi

init_command_list

echo "------------------------------------------------------------"
echo "In case this blocks for more than 6 minutes, Ctrl-C to abort."
if [[ ${target_nodeid} != "" ]]; then
    echo "Target node: $(/pkg/bin/node_conversion -e ${target_nodeid})"
fi
echo "------------------------------------------------------------"

# Every step starts with both status flags cleared, so its verdict reflects
# that step only; overall_failed remembers whether any step failed.
typeset -i overall_failed=0

echo "1) Verifying global transport ..."
stop_test=0
test_failed=0
check_media_ping "1"
if [[ ${test_failed} -ne 1 ]]; then
    echo "   gsp transport is healthy."
else
    echo "   gsp transport is not healthy. "
    overall_failed=1
fi

echo "2) Verifying gsp global system group ..."
stop_test=0
test_failed=0
check_gsp_sysgrp_ping "global"
if [[ ${test_failed} -ne 1 ]]; then
    echo "   gsp global system group transport is healthy."
else
    echo "   gsp global system group transport is not healthy "
    overall_failed=1
fi

echo "3) Verifying SDR transport ..."
stop_test=0
test_failed=0
check_media_ping "2001"
if [[ ${test_failed} -ne 1 ]]; then
    echo "   gsp SDR system group transport is healthy."
else
    echo "   gsp SDR system group transport is not healthy "
    overall_failed=1
fi

echo "4) Verifying gsp SDR system group ..."
stop_test=0
test_failed=0
check_gsp_sysgrp_ping "lr"
if [[ ${test_failed} -ne 1 ]]; then
    echo "   gsp SDR system group transport is healthy."
else
    echo "   gsp SDR system group transport is not healthy "
    overall_failed=1
fi

typeset -i testnum=5
if [[ ${client_name} != "" ]]; then
   echo "${testnum}) Verifying $client_name group communication:"
   # Group ids are the first field of the gsp_show lines that mention
   # "Max Thresh".
   typeset client_groups=$(/pkg/bin/gsp_show -c ${client_jid} ${location} | grep "Max Thresh" | cut -d " " -f 1)

   typeset a_group=""
   for a_group in ${client_groups}; do
       stop_test=0
       test_failed=0
       check_media_ping ${a_group}
       check_gsp_ping ${a_group}
       check_gsp_snv_ping ${a_group}
       # Only claim the group is healthy when none of its checks failed.
       if [[ ${test_failed} -ne 1 ]]; then
           echo "    group $a_group is healthy."
       else
           echo "    group $a_group is not healthy."
           overall_failed=1
       fi
   done
   testnum="$((${testnum} + 1))"
fi

if [[ ${group} != "" ]]; then
   echo "${testnum}) Verifying application group $group ..."
   stop_test=0
   test_failed=0
   check_media_ping $group
   check_gsp_ping $group
   check_gsp_snv_ping $group
#   check_gsp_seq_ping $group
   if [[ ${test_failed} -ne 1 ]]; then
       echo "   group $group communication is healthy."
   else
       echo "   group $group communication is not healthy."
       overall_failed=1
   fi
fi

# Leave the combined result in test_failed and always print a summary.
test_failed=${overall_failed}
echo "GSP health check completed at $(date)."
if [[ ${test_failed} -ne 1 ]]; then
    echo "Summary: gsp is healthy."
else
    echo "Summary: gsp is not healthy."
fi
