pcp: CPU/MEM utilization & task share variance per node
parent abadb9a328
commit 46be28ef8d

4 changed files with 113 additions and 22 deletions
pcp/pcp.go (34 changes)
@@ -2,14 +2,18 @@ package pcp
 
 import (
     elecLogDef "bitbucket.org/sunybingcloud/elektron/logging/def"
+    "bitbucket.org/sunybingcloud/elektron/schedulers"
     "bufio"
+    "fmt"
+    "github.com/mesos/mesos-go/api/v0/scheduler"
     "log"
     "os/exec"
     "syscall"
     "time"
 )
 
-func Start(quit chan struct{}, logging *bool, logMType chan elecLogDef.LogMessageType, logMsg chan string) {
+func Start(quit chan struct{}, logging *bool, logMType chan elecLogDef.LogMessageType, logMsg chan string, s scheduler.Scheduler) {
+    baseSchedRef := s.(*schedulers.BaseScheduler)
     const pcpCommand string = "pmdumptext -m -l -f '' -t 1.0 -d , -c config"
     cmd := exec.Command("sh", "-c", pcpCommand)
     cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
@@ -34,14 +38,40 @@ func Start(quit chan struct{}, logging *bool, logMType chan elecLogDef.LogMessag
         scanner.Scan()
 
         seconds := 0
 
         for scanner.Scan() {
+            text := scanner.Text()
 
             if *logging {
                 logMType <- elecLogDef.PCP
-                logMsg <- scanner.Text()
+                logMsg <- text
             }
 
             seconds++
 
+            memUtils := memUtilPerNode(text)
+            memTaskShares := make([]float64, len(memUtils))
+
+            cpuUtils := cpuUtilPerNode(text)
+            cpuTaskShares := make([]float64, len(cpuUtils))
+
+            for i := 0; i < 8; i++ {
+                host := fmt.Sprintf("stratos-00%d.cs.binghamton.edu", i+1)
+                slaveID := baseSchedRef.HostNameToSlaveID[host]
+                tasksRunning := len(baseSchedRef.Running[slaveID])
+                if tasksRunning > 0 {
+                    cpuTaskShares[i] = cpuUtils[i] / float64(tasksRunning)
+                    memTaskShares[i] = memUtils[i] / float64(tasksRunning)
+                }
+            }
+
+            cpuVariance := calcVariance(cpuUtils)
+            cpuTaskSharesVariance := calcVariance(cpuTaskShares)
+            memVariance := calcVariance(memUtils)
+            memTaskSharesVariance := calcVariance(memTaskShares)
+
+            logMType <- elecLogDef.DEG_COL
+            logMsg <- fmt.Sprintf("%f, %f, %f, %f", cpuVariance, cpuTaskSharesVariance, memVariance, memTaskSharesVariance)
         }
     }(logging)
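Taken together, the block added above turns each one-second pmdumptext sample into four numbers logged under elecLogDef.DEG_COL: the variance of CPU utilization across the eight stratos nodes, the variance of the CPU share per running task, and the same two figures for memory. A variance of 0 means perfectly even load; the value grows as nodes diverge. For intuition only (this snippet is not part of the commit), here is a self-contained sketch that mirrors the arithmetic of the calcVariance helper added in pcp/utils.go, applied to invented utilization values:

package main

import "fmt"

// calcMean and calcVariance mirror the arithmetic of the helpers added to pcp/utils.go.
func calcMean(a []float64) float64 {
    total := 0.0
    for _, v := range a {
        total += v
    }
    return total / float64(len(a))
}

func calcVariance(a []float64) float64 {
    mean := calcMean(a)
    total := 0.0
    for _, v := range a {
        total += (mean - v) * (mean - v)
    }
    return total / float64(len(a))
}

func main() {
    balanced := []float64{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5} // every node equally busy
    skewed := []float64{0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1}   // half the nodes nearly idle
    fmt.Println(calcVariance(balanced))                           // 0
    fmt.Println(calcVariance(skewed))                             // 0.16
}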
pcp/utils.go (52 changes)
@@ -3,6 +3,8 @@ package pcp
 import (
     "container/ring"
     "math"
+    "strconv"
+    "strings"
 )
 
 var RAPLUnits = math.Pow(2, -32)
@@ -47,3 +49,53 @@ func AverageClusterPowerHistory(history *ring.Ring) float64 {
 
     return (total / count)
 }
+
+func sumAndNormalize(tokenSlice []string, normalizer float64) float64 {
+    sum := 0.0
+    for _, value := range tokenSlice {
+        i, _ := strconv.ParseFloat(value, 64)
+        sum += i
+    }
+    return sum / normalizer
+}
+
+func calcMean(a []float64) float64 {
+    total := 0.0
+    for _, v := range a {
+        total += v
+    }
+    return total / float64(len(a))
+}
+
+func calcVariance(a []float64) float64 {
+    mean := calcMean(a)
+    total := 0.0
+    for _, v := range a {
+        total += math.Pow(mean-v, 2)
+    }
+    return total / float64(len(a))
+}
+
+func utilization(used string, free string) float64 {
+    u, _ := strconv.ParseFloat(used, 64)
+    f, _ := strconv.ParseFloat(free, 64)
+    return u / (u + f)
+}
+
+func cpuUtilPerNode(text string) []float64 {
+    tokenSlice := strings.Split(text, ",")
+    cpuUtil := make([]float64, 8)
+    for i := 0; i < 8; i++ {
+        cpuUtil[i] = utilization(tokenSlice[8+i], tokenSlice[24+i])
+    }
+    return cpuUtil
+}
+
+func memUtilPerNode(text string) []float64 {
+    tokenSlice := strings.Split(text, ",")
+    memUtil := make([]float64, 8)
+    for i := 0; i < 8; i++ {
+        memUtil[i] = utilization(tokenSlice[40+i], tokenSlice[32+i])
+    }
+    return memUtil
+}
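The helpers above do the per-node bookkeeping: utilization(used, free) is simply used / (used + free), and cpuUtilPerNode and memUtilPerNode pull the used and free tokens for the eight nodes out of a comma-separated pmdumptext line (tokens 8 through 15 and 24 through 31 for CPU, 40 through 47 and 32 through 39 for memory). Those hard-coded offsets assume a fixed column order coming from the PCP config file referenced by pcpCommand, which is not part of this diff. A minimal, self-contained illustration of the arithmetic (the sample numbers are invented):

package main

import (
    "fmt"
    "strconv"
)

// utilization mirrors the helper added to pcp/utils.go: used / (used + free).
func utilization(used string, free string) float64 {
    u, _ := strconv.ParseFloat(used, 64)
    f, _ := strconv.ParseFloat(free, 64)
    return u / (u + f)
}

func main() {
    // Invented sample: 3.2 units of busy time against 0.8 units of idle time.
    fmt.Println(utilization("3.2", "0.8")) // 0.8, i.e. the node is 80% utilized
}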
@@ -138,7 +138,7 @@ func main() {
         return
     }
 
-    go pcp.Start(pcpLog, &recordPCP, logMType, logMsg)
+    go pcp.Start(pcpLog, &recordPCP, logMType, logMsg, scheduler)
     //go pcp.StartPCPLogAndExtremaDynamicCap(pcpLog, &recordPCP, *hiThreshold, *loThreshold, logMType, logMsg)
     //go pcp.StartPCPLogAndProgressiveExtremaCap(pcpLog, &recordPCP, *hiThreshold, *loThreshold, logMType, logMsg)
     time.Sleep(1 * time.Second) // Take a second between starting PCP log and continuing
@@ -3,7 +3,6 @@ package schedulers
 import (
     "bitbucket.org/sunybingcloud/elektron/def"
     elecLogDef "bitbucket.org/sunybingcloud/elektron/logging/def"
-    "bitbucket.org/sunybingcloud/elektron/utilities"
     "bitbucket.org/sunybingcloud/elektron/utilities/schedUtils"
     "bytes"
     "fmt"
@@ -22,14 +21,15 @@ type BaseScheduler struct {
     // Current scheduling policy used for resource offer consumption.
     curSchedPolicy SchedPolicyState
 
     tasksCreated int
     tasksRunning int
     tasks []def.Task
     metrics map[string]def.Metric
-    running map[string]map[string]bool
+    Running map[string]map[string]bool
     wattsAsAResource bool
     classMapWatts bool
-    totalResourceAvailabilityRecorded bool
+    TasksRunningMutex sync.Mutex
+    HostNameToSlaveID map[string]string
 
     // First set of PCP values are garbage values, signal to logger to start recording when we're
     // about to schedule a new task
@@ -76,7 +76,8 @@ func (s *BaseScheduler) init(opts ...schedPolicyOption) {
             log.Fatal(err)
         }
     }
-    s.running = make(map[string]map[string]bool)
+    s.Running = make(map[string]map[string]bool)
+    s.HostNameToSlaveID = make(map[string]string)
     s.mutex = sync.Mutex{}
     s.schedWindowResStrategy = schedUtils.SchedWindowResizingCritToStrategy["fillNextOfferCycle"]
 }
@@ -96,12 +97,14 @@ func (s *BaseScheduler) newTask(offer *mesos.Offer, task def.Task) *mesos.TaskIn
     }
 
     // If this is our first time running into this Agent
-    if _, ok := s.running[offer.GetSlaveId().GoString()]; !ok {
-        s.running[offer.GetSlaveId().GoString()] = make(map[string]bool)
+    s.TasksRunningMutex.Lock()
+    if _, ok := s.Running[offer.GetSlaveId().GoString()]; !ok {
+        s.Running[offer.GetSlaveId().GoString()] = make(map[string]bool)
     }
+    s.TasksRunningMutex.Unlock()
 
     // Add task to list of tasks running on node
-    s.running[offer.GetSlaveId().GoString()][taskName] = true
+    s.Running[offer.GetSlaveId().GoString()][taskName] = true
 
     resources := []*mesos.Resource{
         mesosutil.NewScalarResource("cpus", task.CPU),
@@ -177,6 +180,11 @@ func (s *BaseScheduler) Disconnected(sched.SchedulerDriver) {
 }
 
 func (s *BaseScheduler) ResourceOffers(driver sched.SchedulerDriver, offers []*mesos.Offer) {
+    for _, offer := range offers {
+        if _, ok := s.HostNameToSlaveID[offer.GetHostname()]; !ok {
+            s.HostNameToSlaveID[offer.GetHostname()] = offer.GetSlaveId().GoString()
+        }
+    }
     s.curSchedPolicy.ConsumeOffers(s, driver, offers)
 }
 
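ResourceOffers now fills HostNameToSlaveID lazily from incoming offers, which is what lets the PCP goroutine in pcp/pcp.go translate the fixed stratos-00N hostnames into Mesos slave IDs before indexing Running. Until a node has appeared in at least one offer, the lookup falls back to Go's zero values and that node simply contributes a task share of 0. A tiny sketch of that fallback behavior (map contents are hypothetical, not from the commit):

package main

import "fmt"

func main() {
    // Hypothetical state before any offer from stratos-001 has been seen.
    hostNameToSlaveID := map[string]string{}
    running := map[string]map[string]bool{}

    slaveID := hostNameToSlaveID["stratos-001.cs.binghamton.edu"] // "" (zero value)
    fmt.Println(len(running[slaveID]))                            // 0: nil inner map, so no tasks are counted
}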
@@ -185,10 +193,9 @@ func (s *BaseScheduler) StatusUpdate(driver sched.SchedulerDriver, status *mesos
     if *status.State == mesos.TaskState_TASK_RUNNING {
         s.tasksRunning++
     } else if IsTerminal(status.State) {
-        // Update resource availability.
-        utilities.ResourceAvailabilityUpdate("ON_TASK_TERMINAL_STATE",
-            *status.TaskId, *status.SlaveId)
-        delete(s.running[status.GetSlaveId().GoString()], *status.TaskId.Value)
+        s.TasksRunningMutex.Lock()
+        delete(s.Running[status.GetSlaveId().GoString()], *status.TaskId.Value)
+        s.TasksRunningMutex.Unlock()
         s.tasksRunning--
         if s.tasksRunning == 0 {
             select {
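The new TasksRunningMutex guards the now-exported Running map wherever the scheduler mutates or iterates it (newTask, StatusUpdate, LogCoLocatedTasks); note that the PCP loop in pcp/pcp.go reads the map without taking the lock. A minimal sketch of the guarded update pattern, using a stand-in struct rather than the real BaseScheduler:

package main

import (
    "fmt"
    "sync"
)

// A trimmed-down stand-in for BaseScheduler, keeping only the fields this commit touches.
type fakeScheduler struct {
    TasksRunningMutex sync.Mutex
    Running           map[string]map[string]bool
}

// recordTask follows the same lock-guarded pattern as newTask above.
func (s *fakeScheduler) recordTask(slaveID, taskName string) {
    s.TasksRunningMutex.Lock()
    defer s.TasksRunningMutex.Unlock()
    if _, ok := s.Running[slaveID]; !ok {
        s.Running[slaveID] = make(map[string]bool)
    }
    s.Running[slaveID][taskName] = true
}

// dropTask follows the pattern used in StatusUpdate when a task reaches a terminal state.
func (s *fakeScheduler) dropTask(slaveID, taskName string) {
    s.TasksRunningMutex.Lock()
    defer s.TasksRunningMutex.Unlock()
    delete(s.Running[slaveID], taskName)
}

func main() {
    s := &fakeScheduler{Running: make(map[string]map[string]bool)}
    s.recordTask("slave-1", "task-0") // hypothetical IDs
    s.dropTask("slave-1", "task-0")
    fmt.Println(len(s.Running["slave-1"])) // 0
}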
@@ -246,7 +253,7 @@ func (s *BaseScheduler) LogNoPendingTasksDeclineOffers(offer *mesos.Offer) {
 func (s *BaseScheduler) LogNumberOfRunningTasks() {
     lmt := elecLogDef.GENERAL
     msgColor := elecLogDef.LogMessageColors[lmt]
-    msg := msgColor.Sprintf("Number of tasks still running = %d", s.tasksRunning)
+    msg := msgColor.Sprintf("Number of tasks still Running = %d", s.tasksRunning)
     s.Log(lmt, msg)
 }
 
@@ -255,9 +262,11 @@ func (s *BaseScheduler) LogCoLocatedTasks(slaveID string) {
     msgColor := elecLogDef.LogMessageColors[lmt]
     buffer := bytes.Buffer{}
     buffer.WriteString(fmt.Sprintln("Colocated with:"))
-    for taskName := range s.running[slaveID] {
+    s.TasksRunningMutex.Lock()
+    for taskName := range s.Running[slaveID] {
         buffer.WriteString(fmt.Sprintln(taskName))
     }
+    s.TasksRunningMutex.Unlock()
     msg := msgColor.Sprintf(buffer.String())
     s.Log(lmt, msg)
 }