#!/bin/bash
#
# Script to collect debug data on Heartbeat loss 
#
# July 2014, Saurabh Gupta
#
# Copyright (c) 2014-2018 by Cisco Systems, Inc.
# All rights reserved.
#
# Ignore SIGHUP for the whole script (presumably so collection survives
# the invoking parent/terminal going away - confirm with the callers).
trap '' HUP

# Inputs to script:
#   $1 - VM name (e.g. default-sdr--1)
#   $2 - HB pending count

# Everything collected by this invocation lands in one directory whose
# name combines the VM name, the current time (minute resolution) and
# the HB pending count.
core_basename="${1}_$(date +%Y-%m-%d-%H_%M)-${2}_hbmiss"
coredir="/misc/scratch/$core_basename"

mkdir -p "$coredir"

# Log of every command issued through dateandrun; start it empty.
LOG="$coredir/cmdlog.txt"
: > "$LOG"

# dateandrun - log a command with a timestamp, then run it with its
# stdout and stderr captured under $coredir.
#
# Globals read:
#   LOG     - command log; one "<timestamp> execute: <command>" line is
#             appended per call.
#   coredir - directory receiving the captured output.
#   OUTF    - output file name relative to $coredir. The file is
#             overwritten. When empty or unset, output is appended to
#             $coredir/cmdoutput.txt instead.
# Arguments:
#   $@ - the command and its arguments. They are executed exactly as
#        passed; they are not re-parsed by the shell, so arguments
#        containing spaces or glob characters reach the command intact.
# Returns:
#   the exit status of the command.
function dateandrun
{
    local stamp
    # Call date directly: keeping the command in a string and expanding
    # it unquoted leaves the format's double quotes as literal
    # characters in the timestamp.
    stamp=$(date +"%Y-%m-%d-%H:%M:%S:%N")
    printf '%s execute: %s\n' "$stamp" "$*" >> "$LOG"
    if [ -z "$OUTF" ]
    then
        # if filename (OUTF) is not provided, collect the command's
        # output and any errors into a shared log file.
        "$@" >> "$coredir/cmdoutput.txt" 2>&1
    else
        "$@" > "$coredir/$OUTF" 2>&1
    fi
}

OUTF= dateandrun echo Start

# Gather the kernel trace buffer (/sys/kernel/debug/tracing)
# Do this first - We are interested in what has already happened.
# We don't want to do this on every iteration from calvados vm_manager,
# so run this based on argument of "3". hushd calls this script with
# argument 3.
# The two conditions are separate tests ("-a" inside [ ] is deprecated
# and ambiguous), and a missing count is treated as 0 so the test does
# not fail with a syntax error.
if [ -x /usr/bin/trace-cmd ] && [ "${2:-0}" -eq 3 ]; then
    # Per trace-cmd-snapshot(1): -s takes the snapshot, no option prints
    # it, -f frees the snapshot buffer again.
    OUTF=                 dateandrun trace-cmd snapshot -s
    OUTF=kernel_trace_buf dateandrun trace-cmd snapshot
    OUTF=                 dateandrun trace-cmd snapshot -f
fi


# PID(s) of the qemu process whose command line mentions VM name $1.
# $tmp is also used further down for the pstack collection. Without a
# VM name there is nothing to match, so leave it empty.
tmp=
if [ -n "$1" ]; then
    tmp=$(ps -eo pid,command | grep qemu | grep -- "$1" | grep -v grep | awk '{print $1}')
fi

# Collect the sched and stack output from /proc for every task of each
# matching process. Iterating over $tmp (deliberately unquoted: one PID
# per word) copes with both no match and several matches.
for qemu_pid in $tmp
do
    for file in /proc/"$qemu_pid"/task/*
    do
        # An unmatched glob stays literal (e.g. the process already
        # exited); skip it rather than creating a bogus directory.
        [ -e "$file" ] || continue
        OUTF= dateandrun mkdir -p "$coredir/$file"
        OUTF=$file/sched dateandrun cat "$file/sched"
        OUTF=$file/stack dateandrun cat "$file/stack"
    done
done

# Collect Niantic driver statistics: interface list, link settings,
# driver counters and register dump for eth-pf1. Each output file name
# carries the HB pending count passed as the first argument.
collect_nic_stats() {
    local log_suffix="_${1}.log"
    OUTF="ifconfig${log_suffix}" dateandrun ifconfig -a
    OUTF="ethtool${log_suffix}"  dateandrun ethtool eth-pf1
    OUTF="ethstats${log_suffix}" dateandrun ethtool --statistics eth-pf1
    OUTF="ethregs${log_suffix}"  dateandrun ethregs
}
collect_nic_stats "$2"

# Sysrq are intrusive, so collect this data only on 3rd HB miss.
# A missing count is treated as 0 so the test does not fail with a
# syntax error.
if [ "${2:-0}" -eq 3 ] ; then

    # We leave this single virsh command in vm_debug.sh. This produces a
    # SysRq message in the VM kernel message log, which gives an easy way
    # to correlate the VM kernel log and timestamps with the host logs.
    OUTF= dateandrun virsh qemu-monitor-command "$1" --hmp 'sendkey alt-sysrq-3'

    # We are not doing any of the following anymore. It is intrusive, of
    # dubious value, and the same information is available from inspection
    # of the kernel core.
    # The VM kernel dump and the VM kernel trace buffer are the best debug
    # information. While these commands are being executed, the VM state is
    # changing and becoming less relevant. So skip this stuff and let hushd
    # or vm-mgr get on with collecting the VM dump.

    #Dump the kernel memory statistic
    # OUTF= dateandrun virsh qemu-monitor-command $1 --hmp 'sendkey alt-sysrq-m'

    #Dump the stack trace of the current process
    # OUTF= dateandrun virsh qemu-monitor-command $1 --hmp 'sendkey alt-sysrq-l'
    # OUTF= dateandrun usleep 10000
    # OUTF= dateandrun virsh qemu-monitor-command $1 --hmp 'sendkey alt-sysrq-l'
    # OUTF= dateandrun usleep 10000
    # OUTF= dateandrun virsh qemu-monitor-command $1 --hmp 'sendkey alt-sysrq-l'
    # OUTF= dateandrun usleep 10000
    # OUTF= dateandrun virsh qemu-monitor-command $1 --hmp 'sendkey alt-sysrq-l'
    # OUTF= dateandrun usleep 10000
    # OUTF= dateandrun virsh qemu-monitor-command $1 --hmp 'sendkey alt-sysrq-l'
fi

# For HB misses, the current state is the most relevant - we want to
# start the VM dump as early as possible. So do all the rest of this
# stuff in a subshell in the background.
(
    # Everything down to the matching "fi" is collected only when the
    # HB pending count is 3. A missing count is treated as 0.
    if [ "${2:-0}" -eq 3 ] ; then
        # Collect the pstack output. $tmp is left unquoted on purpose:
        # it comes from a ps|grep pipeline and may hold more than one
        # PID, one per line.
        OUTF=pstack.txt dateandrun pstack $tmp

        # Collect lspci output on final miss.
        OUTF=lspci.txt dateandrun lspci -tv

        # And collect disk usage - only need to do this once, it
        # should not be very volatile
        OUTF=df.txt dateandrun df -lk

        # And collect the hushd ctrace tarball. hushd does a ctrace_dump_all
        # before calling this script, so the traces will be complete. If this
        # script is executed on host by vm_manager, the ctraces may not be
        # complete.
        OUTF= dateandrun tar -cf "$coredir/hushd.ctrace.tar" /var/log/ctrace/hushd
    fi

    # Collect the interrupt statistics and kvm vm_exit statistics
    # Do it only once since the script is called twice - once on
    # 2nd hb miss and again on 3rd hb miss so we will have two
    # snapshots for this
    OUTF=interrupts_${2}_hbmiss dateandrun cat /proc/interrupts
    OUTF=vm_exits_${2}_hbmiss dateandrun cat /sys/kernel/debug/kvm/exits

    # However, hushd only calls the script 1 time, and since we don't know
    # what is calling the script, do this again if the argument is '3'
    if [ "${2:-0}" -eq 3 ] ; then
        usleep 200000
        OUTF=interrupts_${2}_hbmiss_2 dateandrun cat /proc/interrupts
        OUTF=vm_exits_${2}_hbmiss_2 dateandrun cat /sys/kernel/debug/kvm/exits
    fi

    # Run the top command for 2nd hb miss only as this cmd
    # takes ~600-700msec to run and would be good enough to
    # provide system info by the time 3rd hb miss happens.
    ### hushd invokes this script now, but only with the "3" argument - hence
    ### this output would be missing entirely from hushd takedown output.
    ### if [ $2 -eq 2 ] ; then
    OUTF=top_snapshot dateandrun top -H -n 1 -b
    ### fi

    # Collect /var/log
    OUTF= dateandrun cp /var/log/libvirt/qemu/*.log "$coredir/"
    OUTF= dateandrun cp /var/log/libvirt/libvirtd.log "$coredir/libvirtd.log"
    OUTF= dateandrun cp /var/log/syslog.log "$coredir/syslog.log"
    OUTF= dateandrun cp /var/log/messages "$coredir/messages"
    OUTF= dateandrun cp /var/log/kern.log "$coredir/kern.log"
    OUTF= dateandrun sync

    OUTF= dateandrun echo Finish

    # Now - tar and compress the output directory. This happens inside the
    # background subshell, so vm_debug.sh will complete in a timely fashion.
    # The tar.gz file will be moved to /misc/scratch/core here, and dumper
    # will then move to the confd lead.
    # Use the --remove-files option to delete the files after the tarball is
    # created.
    cd /misc/scratch || exit 1
    tar -c -C /misc/scratch --remove-files -f "${core_basename}.tar" "$core_basename"
    # The trailing slash on the destination makes mv fail, leaving the
    # tarball in /misc/scratch, if the core directory does not exist,
    # instead of renaming the tarball to a plain file called "core".
    gzip "${core_basename}.tar" && mv "${core_basename}.tar.gz" /misc/scratch/core/
)&

