#!/bin/bash
#
# Timestamp of this run; used to build per-run file names (e.g. the temperature log).
DATE=$(date +%Y%m%d%H%M%S)

# Positional arguments: first core count, last core count and step width of the sweep.
startc=${1}
stopc=${2}
incr=${3}

if [[ -z ${startc} || -z ${stopc} || -z ${incr} ]] ; then
	echo "Option missing!" >&2
	echo "Usage: nextcore_allq.sh {Start #Cores} {Stop #Cores} {Increase by #Cores}" >&2
	exit 1
fi

# The three values drive an arithmetic for-loop later on: anything that is not a
# plain decimal number breaks it, and an increment of 0 would never terminate.
# Leading zeros are rejected because bash arithmetic would read them as octal.
count_re='^(0|[1-9][0-9]*)$'
step_re='^[1-9][0-9]*$'
if ! [[ ${startc} =~ ${count_re} && ${stopc} =~ ${count_re} && ${incr} =~ ${step_re} ]] ; then
	echo "Invalid option: core counts must be non-negative integers, the increment a positive integer" >&2
	echo "Usage: nextcore_allq.sh {Start #Cores} {Stop #Cores} {Increase by #Cores}" >&2
	exit 1
fi

# Benchmark directories to run (space-separated list). Each entry is expected to
# be a directory that contains a job script named "<entry>.sh".
dir_selection="Linux-x86_64-icc.net-linux-x86_64-ibverbs-iccAMD"
#dir_selection="Linux-x86_64-icc.mpi-linux-x86_64-mvapich1_2"
#dir_selection="Linux-x86_64-icc.net-linux-x86_64-ibverbs-icc"
#dir_selection="Linux-x86_64-icc.net-linux-x86_64-icc"
#dir_selection="Linux-x86_64-icc.net-linux-x86_64-smp-icc"

# Per-run temperature log and the cluster management shell binary.
templog="/home/blub/temp_${DATE}.log"
cmsh="/cvos/local/apps/cmd/bin/cmsh"

# Temperature at which the cluster is shut down, and the temperature it has to
# fall back to before the nodes are powered on again.
maxtemp=25
roomtemp=23
# Polling interval (seconds) for the temperature while cooling down
cooldown=5
# Refresh interval (seconds) of the progress display while jobs are running
update=5
# Execute reset on missing nodes after X seconds
reset=360

# File collecting the IDs of submitted jobs, one per line. The path is fixed
# (a date-stamped path used to be assigned first but was always overwritten by
# this one); a leftover list from a previous run is removed.
jobfile="/tmp/nextcore_allq.jobs"
rm -f -- "${jobfile}"

# Suspends every job listed in the job file via "qmod -sj".
# Globals:  jobfile (read) - file with one job ID per line
# Outputs:  one "Suspending Job <id>" line per job on stdout
# Returns:  0 when there is no readable job file (nothing to suspend),
#           otherwise the status of the loop over the job IDs
function suspendjobs() {
	local line

	if [[ ! -r ${jobfile} ]] ; then
		echo "No job list at ${jobfile}, nothing to suspend" >&2
		return 0
	fi

	# "|| [[ -n ... ]]" also picks up a last line without trailing newline.
	while read -r line || [[ -n ${line} ]] ; do
		# Blank lines carry no job ID.
		[[ -z ${line} ]] && continue
		echo "Suspending Job ${line}"
		# Keep qmod away from the loop's stdin so it cannot swallow job IDs.
		qmod -sj "${line}" < /dev/null
	done < "${jobfile}"
}

# Writes the requested core count into the "-pe mvapich" / "-pe charm" lines of
# the job script "<i>.sh" in the current directory.
# Globals:  ncores (read) - core count to request
#           i      (read) - basename of the job script (without ".sh")
#           NODES  (written) - see note below
# Returns:  1 if the job script does not exist, otherwise the status of sed
function setcores() {
	local script="${i}.sh"

	# NODES is only consumed by the disabled PPN substitution below.
	# NOTE(review): for 24 or more cores NODES keeps whatever value it had
	# before, and for 13..23 cores the integer division always yields 1 -
	# confirm the intended formula before re-enabling the PPN line.
	if [[ ${ncores} -le 12 ]] ; then
		NODES="1"
	elif [[ ${ncores} -lt 24 ]] ; then
		NODES=$(( ncores / 12 ))
	fi

	if [[ ! -f ${script} ]] ; then
		echo "Job script ${script} not found in $(pwd)" >&2
		return 1
	fi

#	sed -e "s/PPN=[0-9].*/PPN=${NODES}/" -i "${script}"
	# One in-place pass handles both parallel environments; the file name is
	# quoted so directory names with whitespace do not split into several
	# arguments.
	sed -i \
		-e "s/-pe mvapich [0-9].*/-pe mvapich ${ncores}/" \
		-e "s/-pe charm [0-9].*/-pe charm ${ncores}/" \
		"${script}"
}

# Queries iemStatusProbeCurrentTemp.1 on host ups01 via SNMP v2c and stores the
# fourth space-separated field of the answer in the global "temp" (the rest of
# the script prints this value as degrees Celsius).
# Globals:  temp (written)
function get_temp() {
	local -a snmp_query=(
		-v2c
		-c public
		-m "/usr/local/share/powernet396.mib"
		ups01
		iemStatusProbeCurrentTemp.1
	)

	temp=$(snmpget "${snmp_query[@]}" | cut -d " " -f 4)
}

# Reads the current temperature, logs it, and - if it reached maxtemp - runs a
# full cooling cycle: wait (bounded) for running jobs, power all nodes off,
# wait until the temperature is back at roomtemp, then power the nodes on again.
# Globals:  temp (written via get_temp), templog, maxtemp, roomtemp,
#           cooldown (read)
# Calls:    get_temp, qstat, shutdown_nodes, startup_nodes
# Note:     startup_nodes ends up calling check_temp again, so only plain
#           (non-readonly) locals are used here.
function check_temp() {
	# A reading counts as usable only if it is an integer; an empty or
	# garbled SNMP answer would otherwise silently compare as 0.
	local numeric_re='^-?[0-9]+$'
	# Upper bound (seconds) for waiting on running jobs before the shutdown.
	local maxwait=600
	local waitcount=0

	get_temp
	echo "$(date +%Y%m%d%H%M%S) ${temp}" >> "${templog}"

	if ! [[ ${temp} =~ ${numeric_re} ]] ; then
		echo "Warning: no valid temperature reading ('${temp}'), skipping temperature check" >&2
		return 0
	fi
	if [[ ${temp} -lt ${maxtemp} ]] ; then
		return 0
	fi

	echo "Warning temperature is above ${maxtemp}°C. Shutting down cluster after last job finished"
	while [[ -n $(qstat) ]] ; do
		echo -e -n "Waiting for last job to finish (max ${maxwait} seconds). Waited for ${waitcount} seconds \r"
		waitcount=$(( waitcount + 10 ))
		echo "$(date +%Y%m%d%H%M%S) ${temp}" >> "${templog}"
		sleep 10
		if [[ ${waitcount} -ge ${maxwait} ]] ; then
			echo "Job did not finish, shutting down Nodes now"
			break
		fi
	done
	shutdown_nodes

	echo -e "\nSTART Cooling cycle"
	echo -e "\nSTART Cooling cycle $(date +%Y%m%d%H%M%S)" >> "${templog}"
	waitcount=0
	# Keep cooling while the room is too warm OR the reading is unusable: a
	# failed poll must not be mistaken for "cold enough" and power the nodes
	# back on.
	while ! [[ ${temp} =~ ${numeric_re} ]] || [[ ${temp} -gt ${roomtemp} ]] ; do
		echo -e -n "Waiting for current temp (${temp}°C) to drop to ${roomtemp}°C bevor powering up the nodes. (Sleeping since ${waitcount} second) \r"
		waitcount=$(( waitcount + cooldown ))
		echo "$(date +%Y%m%d%H%M%S) ${temp}" >> "${templog}"
		get_temp
		sleep "${cooldown}"
	done
	echo -e "STOP Cooling cycle\n"
	echo -e "STOP Cooling cycle $(date +%Y%m%d%H%M%S) \n" >> "${templog}"
	startup_nodes
}

# Waits until at least required_nodes devices report "UP" in cmsh.
# While waiting, devices that are still DOWN are power-reset after "reset"
# seconds and once more one interval later; if they are still missing another
# interval after the second reset, the script exits.
# Globals:  required_nodes, reset, cmsh, temp (read)
# Calls:    sudo/cmsh, check_temp
# Exits:    1 when the devices did not come up after two resets
# Note:     check_temp can re-enter this function through startup_nodes, so all
#           bookkeeping is kept in plain (non-readonly) locals and the global
#           "reset" interval is never modified.
function get_status() {
	local status=0
	local status_wait_count=0
	local reset_interval=${reset}
	local next_reset=${reset}
	local resets_done=0
	local listing still_down

	while [[ ${status} -lt ${required_nodes} ]] ; do
		clear
		echo "Waiting since ${status_wait_count}s for ${required_nodes} devices to come online"
		echo "Executing reset on missing devices after ${next_reset}s"
		echo "Temperature: ${temp}"
		# Query cmsh once and use the same snapshot for display and counting.
		listing=$(sudo -H "${cmsh}" -c "device ; status" | grep node)
		[[ -n ${listing} ]] && echo "${listing}"
		status=$(grep -c "UP" <<< "${listing}")
		check_temp
		# Everything is up: do not sleep or reset any further.
		if [[ ${status} -ge ${required_nodes} ]] ; then
			break
		fi
		sleep 10
		status_wait_count=$(( status_wait_count + 10 ))

		# Escalation is only due once the current threshold has been reached;
		# a non-positive interval disables it.
		if [[ ${reset_interval} -le 0 || ${status_wait_count} -lt ${next_reset} ]] ; then
			continue
		fi

		# Comma-separated names of the required devices that report DOWN.
		still_down=$(sudo -H "${cmsh}" -c "device ; status" | grep node | head -n"${required_nodes}" | grep DOWN | cut -d " " -f 1 | paste -sd, -)

		if [[ ${resets_done} -ge 2 ]] ; then
			echo "Devices ${still_down} not up after ${status_wait_count} seconds and 2 resets, exiting now" >&2
			exit 1
		fi

		clear
		resets_done=$(( resets_done + 1 ))
		echo "Nodes ${still_down} are still down after ${next_reset} second, executing reset (${resets_done}/2)"
		next_reset=$(( next_reset + reset_interval ))
		if [[ ${resets_done} -eq 1 ]] ; then
			echo "Next and last reset after ${next_reset}s"
		fi
		# Nothing reports DOWN (e.g. devices are still booting): no reset to issue.
		if [[ -n ${still_down} ]] ; then
			sudo -H "${cmsh}" -c "device ; power -n ${still_down} reset"
		fi
	done
}

# Computes how many nodes are needed for ncores cores (12 cores per node,
# rounded up), then checks the temperature and powers the nodes on.
# Globals:  ncores (read), required_nodes (written)
# Outputs:  "Required Nodes to satisfy <n> Cores: <m>" on stdout
# Calls:    check_temp, startup_nodes
function get_num_nodes() {
	local cores_per_node=12

	# Integer ceiling division; no external bc call and no parsing of a
	# decimal fraction needed.
	required_nodes=$(( (ncores + cores_per_node - 1) / cores_per_node ))

	echo "Required Nodes to satisfy ${ncores} Cores: ${required_nodes}"
	check_temp
	startup_nodes
}

# Powers off node001 through node024 via cmsh, one device per call.
# The cluster has to cool down after each run.
# Globals:  cmsh (read)
# Outputs:  an announcement line on stdout, plus whatever cmsh prints
function shutdown_nodes() {
	local node_no node_name

	echo "Shutting down all devices to cool down"
	for (( node_no = 1 ; node_no <= 24 ; node_no++ )) ; do
		# Device names use a three-digit, zero-padded index.
		printf -v node_name 'node%03d' "${node_no}"
		sudo -H "${cmsh}" -c "device ; power -n ${node_name} off"
	done
}
	
# Powers on the first required_nodes devices (node001, node002, ...) via cmsh
# and then waits for them with get_status.
# Globals:  required_nodes, cmsh (read)
# Outputs:  an announcement line on stdout, plus whatever cmsh prints
# Returns:  the status of get_status
function startup_nodes() {
	local idx node_name

	echo "Starting required Nodes (${required_nodes})"
	for (( idx = 1 ; idx <= ${required_nodes} ; idx++ )) ; do
		# Only three-digit device names exist; an index of 1000 or more
		# issues no command.
		(( idx < 1000 )) || continue
		printf -v node_name 'node%03d' "${idx}"
		sudo -H "${cmsh}" -c "device ; power -n ${node_name} on"
		# Disabled: staggered power-up in groups of five.
		#if [[ ${idx} == 5 || ${idx} == 10 || ${idx} == 15 || ${idx} == 20 ]] ; then
		#	echo "Sleeping 120 seconds befor powering up the next 5 Nodes"
		#	sleep 120
		#fi
	done
	get_status
}

# Main benchmark sweep. For every core count from startc to stopc (step incr):
#   1. power on the required nodes,
#   2. submit the job script of every selected benchmark directory,
#   3. show progress until the queue is empty,
#   4. archive the directories to Cores_<n>/ and append the figures of the last
#      "Benchmark time" log line to plot/<dir>.dat.
# All relative paths refer to the directory the script was started in; it is
# remembered here so a failed or nested "cd" cannot make later steps (notably
# the "rm" below) act on the wrong directory.
basedir=$(pwd)

for (( ncores=${startc} ;  ncores <= ${stopc} ; ncores=ncores + ${incr} )) ; do
	get_num_nodes

	# dir_selection is a space-separated list; word splitting is intended.
	for i in ${dir_selection} ; do
		if [[ ! -d ${i} ]] ; then
			echo "Skipping ${i}, not a directory"
			continue
		fi
		if ! cd "${i}" ; then
			echo "Skipping ${i}, cannot change into directory" >&2
			continue
		fi
		setcores
		# Remove the results of the previous run; patterns without a match
		# are expected, hence -f and the discarded output.
		rm -f -- *.log *.o??? *.po??? *~ *-out.* FFTW_NAMD* > /dev/null 2>&1
		check_temp
		# The third word of the qsub answer is recorded as the job ID.
		qsub "${i}.sh" | cut -d " " -f3 >> "${jobfile}"
		cd "${basedir}" || exit 1
	done

	# Progress display until qstat no longer lists anything.
	while (( $(qstat | wc -l) != 0 )) ; do
		clear
		check_temp
		echo "Temperature: ${temp}"
		for j in ${dir_selection} ; do
			t=$(grep TIMING "${j}/${j}.log" 2>/dev/null | tail -n1)
			b=$(grep Benchmark "${j}/${j}.log" 2>/dev/null | tail -n1 )
			echo "Status: ${j} with ${ncores} Cores"
			echo "(Update all ${update}s) $(date)"
			echo "${t}"
			echo "${b}"
			echo "-------------------------------------------------------------"
		done
		sleep "${update}"
	done

	# Archive this run and collect the plot data.
	mkdir -p "Cores_${ncores}" plot > /dev/null 2>&1
	cp -a ${dir_selection} "Cores_${ncores}/"
	for k in ${dir_selection} ; do
		logfile="Cores_${ncores}/${k}/${k}.log"
		btime=$(grep "Benchmark time" "${logfile}" | tail -n1)
		# Without a benchmark line there is nothing to compute; feeding empty
		# operands to bc would only produce errors and an empty data row.
		if [[ -z ${btime} ]] ; then
			echo "No benchmark result found in ${logfile}, skipping plot data for ${k}" >&2
			continue
		fi
		# Space-separated fields 4, 6 and 8 of the benchmark line
		# (presumably CPU count, s/step and days/ns - verify against the log).
		corenum=$(cut -d " " -f4 <<< "${btime}")
		sstep=$(cut -d " " -f6 <<< "${btime}")
		dns=$(cut -d " " -f8 <<< "${btime}")
		corexsstep=$(bc <<< "${corenum}*${sstep}")
		corexdns=$(bc <<< "${corenum}*${dns}")
		echo "${corenum} ${sstep} ${dns} ${corexsstep} ${corexdns}" >> "plot/${k}.dat"
	done
done

echo "Done with Benchmark, killing all Nodes"
shutdown_nodes

