#!/bin/bash
#
# Copyright (c) 2014-2019, 2021 by cisco Systems, Inc.
#
#---------------------------------------------------------
# cmpressdump.sh - script for compressing core dump
#
# The kernel's core dump handler sets up a pipe between the
# kernel and this script. We use lz4 to compress the
# core file (for speed), then, in the background, decompress
# and recompress it using gzip for a smaller final result. The mv
# of $final_core.tmp to $final_core triggers corehelper to move
# the core to its final destination.
#
# $1 is the core file name passed by the kernel
#
# This script produces the crashinfo.txt file as well. To that end:
# NOTE that dumper in both sysadmin and XR produce a delimiter in the crashinfo
# file - a series of dashes. This delimiter is after the backtrace for the
# first thread, which is taken as the faulting thread. show context depends
# on this delimiter to know when to cease its output - otherwise, show context
# will dump the entire crashinfo file.
#---------------------------------------------------------

# Restrict command lookup to the standard system directories.
PATH=/bin:/usr/bin:/sbin:/usr/sbin

# Adopt the CPU affinity of init (pid 1) so that the kernel isolcpus
# parameter is honored by this script too. The affinity list is the text
# after the colon in taskset's report; it is expanded unquoted on purpose so
# that the surrounding whitespace is dropped.
init_affinity=$(taskset -pc 1 | cut -d: -f2)
taskset -pc $init_affinity $$ > /dev/null 2>&1

# Raise our own oom score so that, when memory is low, the oom killer
# terminates the core dump rather than some other process.
# A value of 14 makes the kernel log the termination (CSCuq31010) if this
# process is picked by lowmemorykiller. Because it is below the "bait"
# score of 15 it is still a less attractive target than the bait.
# The previous value is remembered so that it can be restored later on.
sav_oom_adj=$(cat /proc/$$/oom_adj)
echo -n 14 > /proc/$$/oom_adj

# signal_to_name - print a human readable description of a signal number.
#
# Arguments: $1 - signal number (decimal, optional sign, leading zeros
#                 allowed). A missing or empty argument is treated as 0.
# Outputs:   one line on stdout:
#              0..31   the descriptive name from the table below
#              32..64  "Real-time event <n>"
#              other   "Unknown signal <n>" (also used for non-numeric input,
#                      in which case the raw argument is echoed back)
# Returns:   0
function signal_to_name
{
    local raw=${1:-0}
    local numeric_re='^([+-]?)0*([0-9]+)$'
    local n
    # Keep the table local so that it does not leak into the caller's scope.
    local -a signame=(
        "Signal 0" "Hangup" "Interrupt" "Quit"
        "Illegal instruction" "Trace/breakpoint trap" "Aborted" "Bus error"
        "Arithmetic exception" "Killed" "User defined signal 1"
        "Segmentation fault" "User defined signal 2" "Broken pipe"
        "Alarm clock" "Terminated" "Stack fault on coprocessor"
        "Child status changed" "Continued" "Stopped (signal)"
        "Stopped (user)" "Stopped (tty input)" "Stopped (tty output)"
        "Urgent I/O condition" "CPU time limit exceeded"
        "File size limit exceeded" "Virtual timer expired"
        "Profiling timer expired" "Window size changed" "I/O possible"
        "Power fail/restart" "Bad system call"
    )

    # Never hand unvalidated text to arithmetic expansion: a non-numeric
    # argument would be evaluated as an expression (or raise a syntax error),
    # and a leading zero would be parsed as octal.
    if [[ ! $raw =~ $numeric_re ]]; then
        echo "Unknown signal $raw"
        return 0
    fi
    n=$(( ${BASH_REMATCH[1]}${BASH_REMATCH[2]} ))

    if (( n < 0 || n > 64 )); then
        echo "Unknown signal $n"
    elif (( n < 32 )); then
        echo "${signame[n]}"
    else
        echo "Real-time event $n"
    fi
}

# get_core_info - print the leading section of the crashinfo text on stdout.
#
# Arguments: $1 - process name
#            $2 - pid of the crashed process (its /proc entry is read)
#            $3 - signal number
#            $4 - path of the core file; when empty or absent the report
#                 says that no core was saved
# Globals:   faulting_thread (read)
# Outputs:   pid/process line, core file base name, dump time, command
#            line, build information, signal description, faulting thread
function get_core_info {
    local proc=$1 pid=$2 sig=$3 core=$4
    local cmdline
    local version_info=/var/volatile/tmp/cisco/iosxr/install/version_info_dumper.txt

    # Turn the NUL separators of /proc/<pid>/cmdline into spaces, then drop
    # one trailing space (usually produced by the terminating NUL).
    cmdline=$(tr '\000' ' ' < /proc/$pid/cmdline)
    cmdline=${cmdline% }

    printf '%s\n' "Core for pid = $pid ($proc)"
    case "$core" in
        "") printf '%s\n' "Core for process not saved" ;;
        *)  printf '%s\n' "Core for process: ${core##*/}" ;;
    esac
    printf '%s\n\n' "Core dump time: $(date '+%Y-%m-%d %H:%M:%S.%N %z')"

    printf '%s\n' "Process:" "Core was generated by \`${cmdline}'." ""

    # Prefer the build-info file; otherwise fall back to selected lines of
    # the install version file, if that one is readable.
    printf '%s\n' "Build information:"
    if [[ -r /etc/build-info.txt ]]; then
        cat /etc/build-info.txt
    elif [[ -r $version_info ]]; then
        grep -e Version -e Build -e Architecture $version_info
    else
        printf '%s\n' "Build information not available at this time"
    fi
    printf '\n'

    printf '%s\n' \
        "Signal information:" \
        "Program terminated with signal $sig, $(signal_to_name $sig)." \
        "" \
        "Faulting thread: $faulting_thread" \
        ""
}


# Choose the syslog front end. The plain "logger" found through PATH is the
# default; a platform specific logger takes precedence when it is installed.
LOGGER=logger
if [[ -x /pkg/bin/logger ]]; then
    LOGGER=/pkg/bin/logger
    # Lift the POSIX message queue size limit when this logger is selected.
    ulimit -q unlimited
elif [[ -x /opt/cisco/calvados/bin/cal_logger ]]; then
    LOGGER=/opt/cisco/calvados/bin/cal_logger
fi

# xrenv exists on Lindt-based platforms and prepares the execution
# environment for XR processes; /pkg/bin/logger does not run without it.
# LOGGER therefore may hold two words and must be expanded unquoted.
if [[ -x /pkg/bin/xrenv ]]; then
    LOGGER="/pkg/bin/xrenv $LOGGER"
fi

# logger_with_timeout - run $LOGGER with the given arguments, but give up on
# it after two seconds so that a stuck logger cannot stall the core dump.
#
# Globals:   LOGGER (read) - logger command line; may consist of several
#            words, which is why it is expanded unquoted
# Arguments: passed through unchanged to $LOGGER
# Returns:   0 when the logger finished in time; non-zero when it had to be
#            killed or its pid could not be read
function logger_with_timeout {
    (
        # Writer side: start the logger in the background, report its pid
        # ($!) on the pipe, and report "done" once it has exited.
        $LOGGER "$@" > /dev/null 2>&1 &
        echo $!
        wait $!
        echo done
    ) | (
        # Reader side: the first line on stdin is the logger pid, the second
        # one is the completion marker. "read -t 2" expires after two
        # seconds if the marker has not arrived by then.
        read -r pid || exit 1
        if ! read -r -t 2 ans; then
            # Only signal the logger when it really timed out. After a
            # normal completion the pid has already been reaped and could
            # belong to an unrelated process.
            kill "$pid" 2> /dev/null
            exit 1
        fi
    )
}

# Collapse all arguments into one word (spaces become underscores) and strip
# every character outside the set allowed in a core file name.
p=$(echo "$@" | tr ' ' '_')
p=${p//[^-+0-9a-zA-Z_:.\/]/}

# Expected shape of the name:
#   <dir>/<process>_<pid>.by.<signal>.<utime>.<node>.core[.in_lxc.<pid>][.*z]
# Drop everything up to the first slash and a trailing ".<...>z" suffix.
comp_file=${p#*/}
core=${comp_file%.*z}

# The part in front of ".by." carries the directory, process name and pid.
process_pid=${core%.by.*}
pid=${process_pid##*_}
process=${process_pid##*/}
process=${process%_*}

# The part behind ".by." is consumed one dot-separated field at a time.
core=${core##*.by.}
signal=${core%%.*}; core=${core#*.}
utime=${core%%.*};  core=${core#*.}
node=${core%%.*}

#
# For an LXC, additional information is appended to the core file name
# behind ".core.".
#
core=${core#*.core.}
lxc=${core%%.*}
lxc_pid=${core#*.}

if [[ $p != *.by.* ]]; then
    #
    # <process>.<pid>.core.<lxc>.<in_lxc_pid> format
    # NOTE(review): the dot-separated fields 2 and 3 are used below, so the
    # name presumably carries one more leading field than shown above -
    # confirm against the core_pattern in use.
    #
    process=$(echo "$p" | cut -d. -f2)
    pid=$(echo "$p" | cut -d. -f3)

    #
    # This format carries neither signal nor time: fake them.
    #
    signal=6 # SIGABRT
    utime=0
fi

# Values derived from the parsed core name:
#   time    - the epoch seconds in $utime formatted as YYYYmmdd-HHMMSS
#   md5sum  - characters 28-32 of md5sum's output for the crashed
#             executable, i.e. the last five hex digits of its MD5
#   cmdline - path of the crashed process' command line in /proc
#   root    - directory part of the core name passed in by the kernel
time=$(date -d "@$utime" +%Y%m%d-%H%M%S)
md5sum=$(md5sum "/proc/$pid/exe" | cut -c28-32)
cmdline=/proc/$pid/cmdline
root=${p%/*}

# Make sure the core directory exists (mkdir -p is a no-op if it does).
[[ -d /misc/scratch/core ]] || mkdir -p /misc/scratch/core

#
# If this core file is for a process running within a container, we should
# ideally call this script within the container namespace and pipe the
# core data to that process. The helper is handed the pid and the name the
# core should get inside the container; this script then ends with the
# helper's exit status.
#
if [[ "$lxc" == "in_lxc" ]]; then
    lxc_filename="$root/${process}_$lxc_pid.by.$signal.$utime.$node.core"
    /opt/cisco/hostos/bin/lxc_core_helper_static "$pid" "$lxc_filename"
    exit
fi

# If we're now in the XR LXC, we need to replace the Linux hostname with the
# XR node name, which /pkg/bin/hostname reports when it exists.
if [[ -x /pkg/bin/hostname ]]; then
    node=$(/pkg/bin/hostname)
else
    node=$(hostname)
fi

# Grope out the faulting thread using this rude hack courtesy of <anon>:
# look for the task whose wait channel is pipe_wait, presumably the thread
# blocked on the core pipe feeding this script. Only the first match is
# kept: a multi-line value would break the numeric test below and would be
# split into several words where the thread id is later passed on unquoted.
faulting_thread=$(grep -l '^pipe_wait$' /proc/"$pid"/task/*/wchan \
    | head -n 1 | cut -f 5 -d/)
# No match (empty value evaluates to 0): fall back to the process id.
[[ ${faulting_thread} -eq 0 ]] && faulting_thread=$pid

# Debug log of the decisions taken below.
# NOTE(review): the log name in /tmp is predictable; if /tmp is writable by
# unprivileged users, consider creating the file with mktemp instead.
LOGFILE=/tmp/cmpressdump.$pid.debug.logs
process_pid=${process_pid##*/}
{
    echo "Logging by cmpressdump.sh"
    echo "  passed in arguments $*"
    echo "  cmdline $cmdline"
    echo "  passed in process_pid $process_pid"
} > "$LOGFILE"

# Prefer the process name found in /proc/<pid>/cmdline over the one taken
# from the core name passed in by the kernel.
if [[ -f $cmdline ]]; then
    # First NUL-separated field of cmdline, reduced to its base name.
    cmd1=$(awk -F"\0" '{print $1}' < "$cmdline")
    cmd=${cmd1##*/}
    if [[ -z $cmd ]]; then
        echo "  file $cmdline REMOVED or EMPTY" >> "$LOGFILE"
    else
        if grep -Eq '(bash|ksh|python|perl)$' <<< "$cmd"; then
            # For an interpreter, name the core after the second field of
            # cmdline instead. Keep the interpreter name when there is no
            # such field (e.g. an interactive shell); otherwise the core
            # name would start with a bare "_<pid>".
            cmd2=$(awk -F"\0" '{print $2}' < "$cmdline")
            script=${cmd2##*/}
            if [[ -n $script ]]; then
                cmd=$script
            fi
        fi

        process_pid=${cmd}_${pid}
        process_pid=${process_pid//[^-+0-9a-zA-Z_:.\/]/}
    fi
fi

# Names of the gzip'ed core, the crashinfo text and the intermediate lz4 core.
final_core=$root/$process_pid.by.$signal.$time.$node.$md5sum.core.gz
final_core_crashinfo=$root/$process_pid.by.$signal.$time.$node.$md5sum.core.txt
lz4core=$root/$process_pid.by.$signal.$time.$node.$md5sum.core.lz4

echo "  final process_pid $process_pid" >> "$LOGFILE"

# Main section. It runs in a subshell whose file descriptor 230 is opened on
# /var/lock/dumper.lock (see the redirection on the closing parenthesis).
# The crashinfo text is always produced; the core itself is only saved when
# the lock could be taken.
(
    # Try to take an exclusive lock on fd 230 without blocking (-n). Failure
    # is reported further down as "Too many core dumps in progress".
    flock -xn 230
    if [ $? -eq 0 ]; then
        lock_success=true
        echo "  final_core $final_core" >> $LOGFILE
    else
        lock_success=false
    fi

    echo >> $LOGFILE

    if $lock_success; then
        logger_with_timeout -t dumper Dumping core "$final_core"

        # Here is a hack - dumper moves the core to some final destination. We
        # want to put that final dest into the crashinfo file, so show context
        # will tell us where the core ended up. Leave space at the start of the
        # crashinfo file, so we can simply write the destination in, once the
        # core file is moved.
        # Each of these lines is 40 spaces; only the last one ends the line,
        # so the file starts with one line of 200 spaces.
        echo -n "                                        " > $final_core_crashinfo
        echo -n "                                        " >> $final_core_crashinfo
        echo -n "                                        " >> $final_core_crashinfo
        echo -n "                                        " >> $final_core_crashinfo
        echo    "                                        " >> $final_core_crashinfo

        get_core_info "$process" "$pid" "$signal" "$final_core" >> $final_core_crashinfo
    else
        # No core will be saved: no placeholder line, and no core path is
        # passed, so the report states that the core was not saved.
        get_core_info "$process" "$pid" "$signal" > $final_core_crashinfo
    fi

    # pmap gets -p only where /opt/cisco/thinxr/am_i_thinxr exists; otherwise
    # pmap_opt stays unset and expands to nothing below.
    if [ -f /opt/cisco/thinxr/am_i_thinxr ]; then
        pmap_opt="-p"
    fi

    if [ -x /usr/bin/cbt ]; then
        # The mapping information is collected in a side file (.p2) first and
        # appended only after cbt's output, so that cbt's output comes
        # earlier in the crashinfo file.
        echo "Mapping information" >>  ${final_core_crashinfo}.p2
        pmap $pmap_opt $pid >>  ${final_core_crashinfo}.p2
        echo >> ${final_core_crashinfo}.p2
        ### CSCut85600: don't use RPM here - rpm does file locking on its database, and some
        ### XR/sysadmin programs lock that database as well. It has been seen that if e.g.
        ### sdr_instagt crashes while holding file lock, RPM will hang and no core file will
        ### be produced.
        ##echo "rpm information" >>  ${final_core_crashinfo}.p2
        ##rpm -qa >> ${final_core_crashinfo}.p2 2>&1

        # cbt is given the faulting thread (-f), a shell command (-c) and the
        # pid; its stdout and stderr are appended to the crashinfo file.
        # NOTE(review): presumably cbt reads the core from stdin and feeds it
        # to the -c command (lz4 into $lz4core.tmp, or a discarding cat when
        # the core is not saved) - confirm against cbt's documentation.
        if $lock_success; then
            /usr/bin/cbt -f $faulting_thread -c "/usr/bin/lz4 -1 - > $lz4core.tmp" \
                $pid 1>> $final_core_crashinfo 2>&1
        else
            /usr/bin/cbt -f $faulting_thread -c "/bin/cat > /dev/null" \
                $pid 1>> $final_core_crashinfo 2>&1
        fi

        cat ${final_core_crashinfo}.p2 >> $final_core_crashinfo
        rm ${final_core_crashinfo}.p2
    else
        ### Don't attempt backtraces at this time. The necessary ptrace
        ### support is not working
        ### /usr/bin/cbt -c -f $faulting_thread $pid >>  $final_core_crashinfo
        echo "Mapping information" >>  $final_core_crashinfo
        pmap $pmap_opt $pid >>  $final_core_crashinfo
        echo >> $final_core_crashinfo
        ### CSCut85600: don't use RPM here - rpm does file locking on its database, and some
        ### XR/sysadmin programs lock that database as well. It has been seen that if e.g.
        ### sdr_instagt crashes while holding file lock, RPM will hang and no core file will
        ### be produced.
        ##echo "rpm information" >>  $final_core_crashinfo
        ##rpm -qa >> $final_core_crashinfo 2>&1

        # Without cbt, compress this script's stdin (the core arriving on the
        # pipe from the kernel) directly with the fastest lz4 level. When the
        # lock was not obtained, stdin is not read here at all.
        if $lock_success; then
            lz4 -1 - > $lz4core.tmp
        fi
    fi

    # Append the debug log to the crashinfo file and remove the log.
    cat $LOGFILE >> $final_core_crashinfo
    rm $LOGFILE

    if $lock_success; then
        # The lock is released here, before the recompression below is
        # started in the background.
        flock -u 230
        ( 
            declare -a pstatus

            # Convert the lz4 core into the gzip core; PIPESTATUS is copied
            # right away because the next command would overwrite it.
            lz4 -d -c $lz4core.tmp | gzip - > $final_core.tmp
            pstatus=( ${PIPESTATUS[@]} )
            if [ ${pstatus[0]} -eq 0 -a ${pstatus[1]} -eq 0 ]
            then
                # mv to final core triggers dumper to move core to final destination
                rm $lz4core.tmp
                mv $final_core.tmp $final_core
                # NOTE(review): plain "logger" is used in this subshell, not
                # $LOGGER / logger_with_timeout - confirm this is intended.
                logger md5 `md5sum $final_core`
            else
                # gzip or lz4 -d failed - could be disk space issue
                # Trigger dumper to move .lz4 core file to final destination
                # and syslog the error
                logger -t dumper "Error decompressing $lz4core or recompressing $final_core"
                logger -t dumper "$lz4core will be preserved rather than $final_core"
                rm $final_core.tmp
                mv $lz4core.tmp $lz4core
                logger md5 `md5sum $lz4core`
            fi

            sync

            # Before running vm.drop_caches, restore our oom score
            # This is to prevent lowmemorykiller from choosing this sysctl
            # command as its target.
            # "read" is a builtin, so /proc/self here is this background
            # subshell; the first field of its stat file is its pid.
            read my_subshell_pid _ < /proc/self/stat
            echo -n $sav_oom_adj > /proc/$my_subshell_pid/oom_adj
            sysctl vm.drop_caches=3
            # NOTE(review): the lock was already dropped by "flock -u 230"
            # before this subshell was started; nothing is released here
            # explicitly.
        )&

    else
        logger_with_timeout -t dumper "Too many core dumps in progress. Unable to save $final_core."
    fi
) 230> /var/lock/dumper.lock
