Finished integrating dynamic capping policy that uses a high and a low threshold to start capping and uncapping node via an ssh commands that triggers a python script. Commit has sensitive data, scrub before releasing to public.
This commit is contained in:
parent
1386049d30
commit
cf3d83e712
5 changed files with 245 additions and 186 deletions
214
pcp/loganddynamiccap.go
Normal file
214
pcp/loganddynamiccap.go
Normal file
|
@ -0,0 +1,214 @@
|
||||||
|
package pcp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"container/ring"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
"bitbucket.org/bingcloud/electron/rapl"
|
||||||
|
)
|
||||||
|
|
||||||
|
var RAPLUnits = math.Pow(2, -32)
|
||||||
|
|
||||||
|
func meanPKG(history *ring.Ring) float64 {
|
||||||
|
|
||||||
|
total := 0.0
|
||||||
|
count := 0.0
|
||||||
|
|
||||||
|
history.Do(func(x interface{}) {
|
||||||
|
if val, ok := x.(float64); ok { //Add it if we can get a float
|
||||||
|
total += val
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if count == 0.0 {
|
||||||
|
return 0.0
|
||||||
|
}
|
||||||
|
|
||||||
|
count /= 2
|
||||||
|
|
||||||
|
return (total / count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func meanCluster(history *ring.Ring) float64 {
|
||||||
|
|
||||||
|
total := 0.0
|
||||||
|
count := 0.0
|
||||||
|
|
||||||
|
history.Do(func(x interface{}) {
|
||||||
|
if val, ok := x.(float64); ok { //Add it if we can get a float
|
||||||
|
total += val
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if count == 0.0 {
|
||||||
|
return 0.0
|
||||||
|
}
|
||||||
|
|
||||||
|
return (total / count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func StartLogAndDynamicCap(quit chan struct{}, logging *bool, prefix string, hiThreshold, loThreshold float64) {
|
||||||
|
const pcpCommand string = "pmdumptext -m -l -f '' -t 1.0 -d , -c config"
|
||||||
|
cmd := exec.Command("sh", "-c", pcpCommand)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
startTime := time.Now().Format("20060102150405")
|
||||||
|
|
||||||
|
if hiThreshold < loThreshold {
|
||||||
|
log.Println("High threshold is lower than low threshold!")
|
||||||
|
}
|
||||||
|
|
||||||
|
logFile, err := os.Create("./" + prefix + startTime + ".pcplog")
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
defer logFile.Close()
|
||||||
|
|
||||||
|
pipe, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
//cmd.Stdout = stdout
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(pipe)
|
||||||
|
|
||||||
|
go func(logging *bool, hiThreshold, loThreshold float64) {
|
||||||
|
// Get names of the columns
|
||||||
|
scanner.Scan()
|
||||||
|
|
||||||
|
// Write to logfile
|
||||||
|
logFile.WriteString(scanner.Text() + "\n")
|
||||||
|
|
||||||
|
headers := strings.Split(scanner.Text(), ",")
|
||||||
|
|
||||||
|
powerIndexes := make([]int, 0, 0)
|
||||||
|
powerHistories := make(map[string]*ring.Ring)
|
||||||
|
indexToHost := make(map[int]string)
|
||||||
|
|
||||||
|
for i, hostMetric := range headers {
|
||||||
|
split := strings.Split(hostMetric, ":")
|
||||||
|
//log.Printf("%d Host %s: Metric: %s\n", i, split[0], split[1])
|
||||||
|
|
||||||
|
if strings.Contains(split[1], "RAPL_ENERGY_PKG") {
|
||||||
|
//fmt.Println("Index: ", i)
|
||||||
|
powerIndexes = append(powerIndexes, i)
|
||||||
|
indexToHost[i] = split[0]
|
||||||
|
powerHistories[split[0]] = ring.New(10) // Two PKGS per node, 10 = 5 seconds tracking
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Throw away first set of results
|
||||||
|
scanner.Scan()
|
||||||
|
|
||||||
|
cappedHosts := make(map[string]bool)
|
||||||
|
orderCapped := make([]string, 0, 8)
|
||||||
|
clusterPowerHist := ring.New(5)
|
||||||
|
seconds := 0
|
||||||
|
|
||||||
|
for scanner.Scan() {
|
||||||
|
|
||||||
|
if *logging {
|
||||||
|
log.Println("Logging PCP...")
|
||||||
|
split := strings.Split(scanner.Text(), ",")
|
||||||
|
logFile.WriteString(scanner.Text() + "\n")
|
||||||
|
|
||||||
|
totalPower := 0.0
|
||||||
|
for _, powerIndex := range powerIndexes {
|
||||||
|
power, _ := strconv.ParseFloat(split[powerIndex], 64)
|
||||||
|
|
||||||
|
host := indexToHost[powerIndex]
|
||||||
|
|
||||||
|
powerHistories[host].Value = power
|
||||||
|
powerHistories[host] = powerHistories[host].Next()
|
||||||
|
|
||||||
|
log.Printf("Host: %s, Power: %f", indexToHost[powerIndex], (power * RAPLUnits))
|
||||||
|
|
||||||
|
totalPower += power
|
||||||
|
}
|
||||||
|
clusterPower := totalPower * RAPLUnits
|
||||||
|
|
||||||
|
clusterPowerHist.Value = clusterPower
|
||||||
|
clusterPowerHist = clusterPowerHist.Next()
|
||||||
|
|
||||||
|
clusterMean := meanCluster(clusterPowerHist)
|
||||||
|
|
||||||
|
log.Printf("Total power: %f, %d Sec Avg: %f", clusterPower, clusterPowerHist.Len(), clusterMean)
|
||||||
|
|
||||||
|
if clusterMean > hiThreshold {
|
||||||
|
log.Printf("Need to cap a node")
|
||||||
|
// Create statics for all victims and choose one to cap
|
||||||
|
victims := make([]Victim, 0, 8)
|
||||||
|
|
||||||
|
// TODO: Just keep track of the largest to reduce fron nlogn to n
|
||||||
|
for name, history := range powerHistories {
|
||||||
|
|
||||||
|
histMean := meanPKG(history)
|
||||||
|
// Consider doing mean calculations using go routines if we need to speed up
|
||||||
|
victims = append(victims, Victim{Watts: histMean, Host: name})
|
||||||
|
//log.Printf("host: %s, Avg: %f", name, histMean * RAPLUnits)
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Sort(VictimSorter(victims)) // Sort by average wattage
|
||||||
|
|
||||||
|
// From best victim to worst, if everyone is already capped NOOP
|
||||||
|
for _, victim := range victims {
|
||||||
|
// Only cap if host hasn't been capped yet
|
||||||
|
if !cappedHosts[victim.Host] {
|
||||||
|
cappedHosts[victim.Host] = true
|
||||||
|
orderCapped = append(orderCapped, victim.Host)
|
||||||
|
log.Printf("Capping Victim %s Avg. Wattage: %f", victim.Host, victim.Watts*RAPLUnits)
|
||||||
|
if err := rapl.Cap(victim.Host, "rapl", 50); err != nil {
|
||||||
|
log.Print("Error capping host")
|
||||||
|
}
|
||||||
|
break // Only cap one machine at at time
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if clusterMean < loThreshold {
|
||||||
|
|
||||||
|
if len(orderCapped) > 0 {
|
||||||
|
host := orderCapped[len(orderCapped)-1]
|
||||||
|
orderCapped = orderCapped[:len(orderCapped)-1]
|
||||||
|
cappedHosts[host] = false
|
||||||
|
// User RAPL package to send uncap
|
||||||
|
log.Printf("Uncapping host %s", host)
|
||||||
|
if err := rapl.Cap(host, "rapl", 100); err != nil {
|
||||||
|
log.Print("Error uncapping host")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
seconds++
|
||||||
|
}
|
||||||
|
}(logging, hiThreshold, loThreshold)
|
||||||
|
|
||||||
|
log.Println("PCP logging started")
|
||||||
|
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
pgid, err := syscall.Getpgid(cmd.Process.Pid)
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-quit:
|
||||||
|
log.Println("Stopping PCP logging in 5 seconds")
|
||||||
|
time.Sleep(5 * time.Second)
|
||||||
|
|
||||||
|
// http://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
|
||||||
|
// kill process and all children processes
|
||||||
|
syscall.Kill(-pgid, 15)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,182 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bufio"
|
|
||||||
"fmt"
|
|
||||||
"log"
|
|
||||||
"os"
|
|
||||||
"os/exec"
|
|
||||||
"strings"
|
|
||||||
"syscall"
|
|
||||||
"time"
|
|
||||||
"strconv"
|
|
||||||
"math"
|
|
||||||
"container/ring"
|
|
||||||
"sort"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Victim struct {
|
|
||||||
Watts float64
|
|
||||||
Host string
|
|
||||||
}
|
|
||||||
|
|
||||||
type VictimSorter []Victim
|
|
||||||
|
|
||||||
func (slice VictimSorter) Len() int {
|
|
||||||
return len(slice)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (slice VictimSorter) Less(i, j int) bool {
|
|
||||||
return slice[i].Watts >= slice[j].Watts
|
|
||||||
}
|
|
||||||
|
|
||||||
func (slice VictimSorter) Swap(i, j int) {
|
|
||||||
slice[i], slice[j] = slice[j], slice[i]
|
|
||||||
}
|
|
||||||
|
|
||||||
var RAPLUnits = math.Pow(2, -32)
|
|
||||||
|
|
||||||
func mean(values *ring.Ring) float64 {
|
|
||||||
|
|
||||||
total := 0.0
|
|
||||||
count := 0.0
|
|
||||||
|
|
||||||
values.Do(func(x interface{}){
|
|
||||||
if val, ok := x.(float64); ok { //Add it if we can get a float
|
|
||||||
total += val
|
|
||||||
count++
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
if count == 0.0 {
|
|
||||||
return 0.0
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
count /= 2
|
|
||||||
|
|
||||||
return (total/count)
|
|
||||||
}
|
|
||||||
|
|
||||||
//func median(values *ring.Ring) {
|
|
||||||
|
|
||||||
//}
|
|
||||||
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
|
|
||||||
prefix := "test"
|
|
||||||
logging := new(bool)
|
|
||||||
*logging = true
|
|
||||||
const pcpCommand string = "pmdumptext -m -l -f '' -t 1.0 -d , -c config"
|
|
||||||
cmd := exec.Command("sh", "-c", pcpCommand)
|
|
||||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
|
||||||
startTime := time.Now().Format("20060102150405")
|
|
||||||
|
|
||||||
logFile, err := os.Create("./" + prefix + startTime + ".pcplog")
|
|
||||||
if err != nil {
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
defer logFile.Close()
|
|
||||||
|
|
||||||
pipe, err := cmd.StdoutPipe()
|
|
||||||
if err != nil {
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
//cmd.Stdout = stdout
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(pipe)
|
|
||||||
|
|
||||||
go func(logging *bool) {
|
|
||||||
// Get names of the columns
|
|
||||||
scanner.Scan()
|
|
||||||
|
|
||||||
// Write to logfile
|
|
||||||
logFile.WriteString(scanner.Text() + "\n")
|
|
||||||
|
|
||||||
headers := strings.Split(scanner.Text(), ",")
|
|
||||||
|
|
||||||
powerIndexes := make([]int, 0, 0)
|
|
||||||
powerAverage := make(map[string]*ring.Ring)
|
|
||||||
indexToHost := make(map[int]string)
|
|
||||||
|
|
||||||
for i, hostMetric := range headers {
|
|
||||||
split := strings.Split(hostMetric, ":")
|
|
||||||
fmt.Printf("%d Host %s: Metric: %s\n", i, split[0], split[1])
|
|
||||||
|
|
||||||
if strings.Contains(split[1], "RAPL_ENERGY_PKG") {
|
|
||||||
fmt.Println("Index: ", i)
|
|
||||||
powerIndexes = append(powerIndexes, i)
|
|
||||||
indexToHost[i] = split[0]
|
|
||||||
powerAverage[split[0]] = ring.New(10) // Two PKGS per node, 10 = 5 seconds tracking
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Throw away first set of results
|
|
||||||
scanner.Scan()
|
|
||||||
|
|
||||||
seconds := 0
|
|
||||||
for scanner.Scan() {
|
|
||||||
|
|
||||||
if *logging {
|
|
||||||
log.Println("Logging PCP...")
|
|
||||||
split := strings.Split(scanner.Text(), ",")
|
|
||||||
logFile.WriteString(scanner.Text() + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
totalPower := 0.0
|
|
||||||
for _,powerIndex := range powerIndexes {
|
|
||||||
power, _ := strconv.ParseFloat(split[powerIndex], 64)
|
|
||||||
|
|
||||||
host := indexToHost[powerIndex]
|
|
||||||
|
|
||||||
powerAverage[host].Value = power
|
|
||||||
powerAverage[host] = powerAverage[host].Next()
|
|
||||||
|
|
||||||
log.Printf("Host: %s, Index: %d, Power: %f", indexToHost[powerIndex], powerIndex, (power * RAPLUnits))
|
|
||||||
|
|
||||||
totalPower += power
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Println("Total power: ", totalPower * RAPLUnits)
|
|
||||||
|
|
||||||
victims := make([]Victim, 8, 8)
|
|
||||||
|
|
||||||
// TODO: Just keep track of the largest to reduce fron nlogn to n
|
|
||||||
for name,ring := range powerAverage {
|
|
||||||
victims = append(victims, Victim{mean(ring), name})
|
|
||||||
//log.Printf("host: %s, Avg: %f", name, mean(ring) * RAPLUnits)
|
|
||||||
}
|
|
||||||
sort.Sort(VictimSorter(victims))
|
|
||||||
log.Printf("Current Victim %s Avg. Wattage: %f", victims[0].Host, victims[0].Watts * RAPLUnits)
|
|
||||||
}
|
|
||||||
|
|
||||||
seconds++
|
|
||||||
}
|
|
||||||
}(logging)
|
|
||||||
|
|
||||||
log.Println("PCP logging started")
|
|
||||||
|
|
||||||
if err := cmd.Start(); err != nil {
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := cmd.Wait(); err != nil {
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
pgid, err := syscall.Getpgid(cmd.Process.Pid)
|
|
||||||
|
|
||||||
select {
|
|
||||||
case <-quit:
|
|
||||||
log.Println("Stopping PCP logging in 5 seconds")
|
|
||||||
time.Sleep(5 * time.Second)
|
|
||||||
|
|
||||||
// http://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
|
|
||||||
// kill process and all children processes
|
|
||||||
syscall.Kill(-pgid, 15)
|
|
||||||
return
|
|
||||||
}*/
|
|
||||||
}
|
|
|
@ -1,3 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
|
|
20
pcp/victim.go
Normal file
20
pcp/victim.go
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
package pcp
|
||||||
|
|
||||||
|
type Victim struct {
|
||||||
|
Watts float64
|
||||||
|
Host string
|
||||||
|
}
|
||||||
|
|
||||||
|
type VictimSorter []Victim
|
||||||
|
|
||||||
|
func (slice VictimSorter) Len() int {
|
||||||
|
return len(slice)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (slice VictimSorter) Less(i, j int) bool {
|
||||||
|
return slice[i].Watts >= slice[j].Watts
|
||||||
|
}
|
||||||
|
|
||||||
|
func (slice VictimSorter) Swap(i, j int) {
|
||||||
|
slice[i], slice[j] = slice[j], slice[i]
|
||||||
|
}
|
12
scheduler.go
12
scheduler.go
|
@ -19,6 +19,8 @@ var master = flag.String("master", "xavier:5050", "Location of leading Mesos mas
|
||||||
var tasksFile = flag.String("workload", "", "JSON file containing task definitions")
|
var tasksFile = flag.String("workload", "", "JSON file containing task definitions")
|
||||||
var ignoreWatts = flag.Bool("ignoreWatts", false, "Ignore watts in offers")
|
var ignoreWatts = flag.Bool("ignoreWatts", false, "Ignore watts in offers")
|
||||||
var pcplogPrefix = flag.String("logPrefix", "", "Prefix for pcplog")
|
var pcplogPrefix = flag.String("logPrefix", "", "Prefix for pcplog")
|
||||||
|
var hiThreshold = flag.Float64("hiThreshold", 0.0, "Upperbound for when we should start capping")
|
||||||
|
var loThreshold = flag.Float64("loThreshold", 0.0, "Lowerbound for when we should start uncapping")
|
||||||
|
|
||||||
// Short hand args
|
// Short hand args
|
||||||
func init() {
|
func init() {
|
||||||
|
@ -26,6 +28,8 @@ func init() {
|
||||||
flag.StringVar(tasksFile, "w", "", "JSON file containing task definitions (shorthand)")
|
flag.StringVar(tasksFile, "w", "", "JSON file containing task definitions (shorthand)")
|
||||||
flag.BoolVar(ignoreWatts, "i", false, "Ignore watts in offers (shorthand)")
|
flag.BoolVar(ignoreWatts, "i", false, "Ignore watts in offers (shorthand)")
|
||||||
flag.StringVar(pcplogPrefix, "p", "", "Prefix for pcplog (shorthand)")
|
flag.StringVar(pcplogPrefix, "p", "", "Prefix for pcplog (shorthand)")
|
||||||
|
flag.Float64Var(hiThreshold, "ht", 700.0, "Upperbound for when we should start capping (shorthand)")
|
||||||
|
flag.Float64Var(loThreshold, "lt", 400.0, "Lowerbound for when we should start uncapping (shorthand)")
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
@ -36,6 +40,11 @@ func main() {
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if *hiThreshold < *loThreshold {
|
||||||
|
fmt.Println("High threshold is of a lower value than low threhold.")
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
tasks, err := def.TasksFromJSON(*tasksFile)
|
tasks, err := def.TasksFromJSON(*tasksFile)
|
||||||
if err != nil || len(tasks) == 0 {
|
if err != nil || len(tasks) == 0 {
|
||||||
fmt.Println("Invalid tasks specification file provided")
|
fmt.Println("Invalid tasks specification file provided")
|
||||||
|
@ -61,7 +70,8 @@ func main() {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
go pcp.Start(scheduler.PCPLog, &scheduler.RecordPCP, *pcplogPrefix)
|
//go pcp.Start(scheduler.PCPLog, &scheduler.RecordPCP, *pcplogPrefix)
|
||||||
|
go pcp.StartLogAndDynamicCap(scheduler.PCPLog, &scheduler.RecordPCP, *pcplogPrefix, *hiThreshold, *loThreshold)
|
||||||
time.Sleep(1 * time.Second)
|
time.Sleep(1 * time.Second)
|
||||||
|
|
||||||
// Attempt to handle signint to not leave pmdumptext running
|
// Attempt to handle signint to not leave pmdumptext running
|
||||||
|
|
Reference in a new issue