#!/bin/bash
#
# Script to collect debug data on LXC heartbeat (HB) loss
#
# Jan 2015, Rajesh Negi
#
# Copyright (c) 2014-2017 by Cisco Systems, Inc.
# All rights reserved.
#

# Inputs to script: $1 - VM name (e.g. default-sdr--1), $2 - HB pending count

# Source the platform bootstrap configuration.
# NOTE(review): nothing below visibly reads a variable set by this file;
# confirm whether the source is still required.
BOOTSTRAP_FILE="/etc/init.d/calvados_bootstrap.cfg"
source "$BOOTSTRAP_FILE"

# Positional arguments: $1 is the VM name, $2 the heartbeat-miss count.
VM_NAME=$1
HB_MISS_COUNT=$2
# Miss count at which the heavier, one-off data collection is performed.
HB_MISS_COUNT_MAX=3

# Per-invocation output directory, named after the VM, the current time
# (minute resolution) and the miss count. Quoted throughout so that an
# unusual VM name cannot be word-split or glob-expanded.
coredir="/misc/scratch/${VM_NAME}_core_$(date +"%Y-%m-%d-%H:%M")_${HB_MISS_COUNT}_hbmiss"
mkdir -p "$coredir"

# Log of every command executed through dateandrun; created or truncated here.
LOG="$coredir/cmdlog.txt"
: > "$LOG"

# Run a command and record a timestamped entry for it in the command log.
#
# Globals:
#   LOG     - command log file; one "<timestamp> execute: <command>" line is
#             appended per call
#   coredir - directory that receives the captured output
#   OUTF    - optional output file name, relative to $coredir; when empty or
#             unset, stdout and stderr are appended to $coredir/cmdoutput.txt
# Arguments:
#   $@ - the command and its arguments; joined with spaces and handed to eval
# Returns:
#   the exit status of the evaluated command
function dateandrun
{
    local stamp
    # Invoke date directly. Keeping the command in a string and expanding it
    # unquoted left the embedded double quotes in the format, so every
    # timestamp was logged wrapped in literal '"' characters.
    stamp=$(date +"%Y-%m-%d-%H:%M:%S:%N")
    printf '%s execute: %s\n' "$stamp" "$*" >> "$LOG"

    # If no filename (OUTF) is provided, collect the output and any command
    # errors in the catch-all file.
    local dest="$coredir/${OUTF:-cmdoutput.txt}"

    # NOTE(review): eval re-parses the arguments as shell text, so any
    # argument derived from external input (e.g. the VM name) is interpreted
    # by the shell. Kept because callers may rely on it; consider "$@".
    eval "$*" 1>>"$dest" 2>&1
}

OUTF= dateandrun echo Start

# Decide once whether this invocation is for the final heartbeat miss.
# The operands are quoted and the count defaults to 0, so a missing second
# argument skips the final-miss collection without a test syntax error.
final_miss=0
if [ "${HB_MISS_COUNT:-0}" -eq "$HB_MISS_COUNT_MAX" ]; then
    final_miss=1
fi

# Sysrq are intrusive, so collect this data only on 3rd HB miss
if [ "$final_miss" -eq 1 ]; then
    OUTF= dateandrun echo "Running Process Stack Trace"

    #Skip collection of the pstack, since the cores are collected.
    #pstack is causing spinlock lockup (mostly in arm cards, sometimes in x86 cards)
    ## Collect the pstack for all running process
    #while read line; do
    #    OUTF=pstack.txt dateandrun echo "Stack trace for pid" $line
    #    OUTF=pstack.txt dateandrun pstack $line
    #done < /dev/cgroup/memory/machine/sysadmin.libvirt-lxc/tasks

    # Collect lspci output on final miss.
    OUTF=lspci.txt dateandrun lspci -tv

    # And collect disk usage - only need to do this once, it
    # should not be very volatile
    OUTF=df.txt dateandrun df -k

    # And collect the hushd ctrace tarball. hushd does a ctrace_dump_all before
    # calling this script, so the traces will be complete. If this script is
    # executed on host by vm_manager, the ctraces may not be complete.
    OUTF= dateandrun tar -cf "$coredir/hushd.ctrace.tar" /var/log/ctrace/hushd
fi

# Collect the interrupt statistics and kvm vm_exit statistics.
# One snapshot per invocation is enough when the script is called twice
# (on the 2nd and again on the 3rd HB miss), since that already yields
# two snapshots to compare.
OUTF="interrupts_${HB_MISS_COUNT}_hbmiss" dateandrun cat /proc/interrupts

# However, hushd only calls the script 1 time, and since we don't know what
# is calling the script, take a second snapshot on the final miss, after a
# 200000 microsecond pause.
if [ "$final_miss" -eq 1 ]; then
    usleep 200000
    OUTF="interrupts_${HB_MISS_COUNT}_hbmiss_2" dateandrun cat /proc/interrupts
fi

OUTF=top_snapshot dateandrun top -H -n 1 -b
OUTF=pstree dateandrun ps awxgf

# Collect kernel trace buffer
if [ -f /sys/kernel/debug/tracing/trace ]; then
    OUTF=ktrace.log dateandrun cat /sys/kernel/debug/tracing/trace
fi

# Collect /var/log
# OUTF is cleared explicitly on every call so that a value inherited from the
# environment cannot redirect the output of these commands.
OUTF= dateandrun cp /var/log/libvirt/libvirtd.log "$coredir/libvirtd.log"
OUTF= dateandrun cp /var/log/syslog.log "$coredir/syslog.log"
OUTF= dateandrun cp /var/log/messages "$coredir/messages"
OUTF= dateandrun sync

OUTF= dateandrun echo Finish

exit 0

