2016-09-26 19:14:51 -04:00
package pcp
2016-09-22 18:34:05 -04:00
import (
2016-09-22 19:54:06 -04:00
"bufio"
2018-02-02 19:24:51 -05:00
"fmt"
2016-09-22 18:34:05 -04:00
"log"
2016-10-13 17:15:09 -04:00
"os/exec"
2016-10-07 20:47:59 -04:00
"syscall"
2016-10-13 17:15:09 -04:00
"time"
2018-09-30 18:23:38 -07:00
"github.com/mesos/mesos-go/api/v0/scheduler"
"github.com/montanaflynn/stats"
2018-10-04 13:45:31 -04:00
elekLogDef "gitlab.com/spdf/elektron/logging/def"
2018-09-30 18:23:38 -07:00
"gitlab.com/spdf/elektron/schedulers"
2016-09-22 18:34:05 -04:00
)
2018-10-04 13:45:31 -04:00
func Start ( quit chan struct { } , logging * bool , logMType chan elekLogDef . LogMessageType , logMsg chan string , s scheduler . Scheduler ) {
2018-02-02 19:24:51 -05:00
baseSchedRef := s . ( * schedulers . BaseScheduler )
2016-09-26 19:14:51 -04:00
const pcpCommand string = "pmdumptext -m -l -f '' -t 1.0 -d , -c config"
2016-09-22 18:34:05 -04:00
cmd := exec . Command ( "sh" , "-c" , pcpCommand )
2016-10-07 20:47:59 -04:00
cmd . SysProcAttr = & syscall . SysProcAttr { Setpgid : true }
2016-09-22 18:34:05 -04:00
2016-09-26 19:14:51 -04:00
pipe , err := cmd . StdoutPipe ( )
if err != nil {
log . Fatal ( err )
}
2016-09-22 18:34:05 -04:00
//cmd.Stdout = stdout
scanner := bufio . NewScanner ( pipe )
2016-09-26 19:14:51 -04:00
go func ( logging * bool ) {
2017-09-28 15:36:47 -04:00
// Get names of the columns.
2016-09-22 18:34:05 -04:00
scanner . Scan ( )
2018-01-19 21:20:43 +00:00
// Write to logfile
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . PCP
2018-01-19 21:20:43 +00:00
logMsg <- scanner . Text ( )
2016-09-26 19:14:51 -04:00
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . DEG_COL
2018-02-05 02:18:11 -05:00
logMsg <- "CPU Variance, CPU Task Share Variance, Memory Variance, Memory Task Share Variance"
2018-02-03 19:48:17 -05:00
// Throw away first set of results
2016-09-22 18:34:05 -04:00
scanner . Scan ( )
seconds := 0
2018-02-02 19:24:51 -05:00
2016-09-22 18:34:05 -04:00
for scanner . Scan ( ) {
2018-02-02 19:24:51 -05:00
text := scanner . Text ( )
2016-09-26 19:14:51 -04:00
2016-10-13 17:15:09 -04:00
if * logging {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . PCP
2018-02-02 19:24:51 -05:00
logMsg <- text
2016-09-26 19:14:51 -04:00
}
2016-09-22 18:34:05 -04:00
seconds ++
2018-02-02 19:24:51 -05:00
memUtils := memUtilPerNode ( text )
memTaskShares := make ( [ ] float64 , len ( memUtils ) )
cpuUtils := cpuUtilPerNode ( text )
cpuTaskShares := make ( [ ] float64 , len ( cpuUtils ) )
for i := 0 ; i < 8 ; i ++ {
host := fmt . Sprintf ( "stratos-00%d.cs.binghamton.edu" , i + 1 )
2018-02-05 02:25:32 -05:00
if slaveID , ok := baseSchedRef . HostNameToSlaveID [ host ] ; ok {
2018-02-05 16:01:48 -05:00
baseSchedRef . TasksRunningMutex . Lock ( )
2018-02-05 02:25:32 -05:00
tasksRunning := len ( baseSchedRef . Running [ slaveID ] )
2018-02-05 16:01:48 -05:00
baseSchedRef . TasksRunningMutex . Unlock ( )
2018-02-05 02:25:32 -05:00
if tasksRunning > 0 {
cpuTaskShares [ i ] = cpuUtils [ i ] / float64 ( tasksRunning )
memTaskShares [ i ] = memUtils [ i ] / float64 ( tasksRunning )
}
2018-02-02 19:24:51 -05:00
}
}
2018-02-05 16:01:48 -05:00
// Variance in resource utilization shows how the current workload has been distributed.
// However, if the number of tasks running are not equally distributed, utilization variance figures become
// less relevant as they do not express the distribution of CPU intensive tasks.
// We thus also calculate `task share variance`, which basically signifies how the workload is distributed
// across each node per share.
2018-02-03 19:37:30 -05:00
cpuVariance , _ := stats . Variance ( cpuUtils )
cpuTaskSharesVariance , _ := stats . Variance ( cpuTaskShares )
memVariance , _ := stats . Variance ( memUtils )
memTaskSharesVariance , _ := stats . Variance ( memTaskShares )
2018-02-02 19:24:51 -05:00
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . DEG_COL
2018-02-02 19:24:51 -05:00
logMsg <- fmt . Sprintf ( "%f, %f, %f, %f" , cpuVariance , cpuTaskSharesVariance , memVariance , memTaskSharesVariance )
2016-09-22 18:34:05 -04:00
}
2016-09-26 19:14:51 -04:00
} ( logging )
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "PCP logging started"
2016-09-22 18:34:05 -04:00
if err := cmd . Start ( ) ; err != nil {
log . Fatal ( err )
}
2016-09-26 19:14:51 -04:00
2016-10-07 20:47:59 -04:00
pgid , err := syscall . Getpgid ( cmd . Process . Pid )
2016-10-13 17:15:09 -04:00
select {
case <- quit :
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "Stopping PCP logging in 5 seconds"
2016-09-26 19:14:51 -04:00
time . Sleep ( 5 * time . Second )
2016-10-07 20:47:59 -04:00
// http://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
2017-09-28 15:36:47 -04:00
// Kill process and all children processes.
2016-10-07 20:47:59 -04:00
syscall . Kill ( - pgid , 15 )
2016-09-26 19:14:51 -04:00
return
}
2016-09-22 18:34:05 -04:00
}