package pcp

import (
	"bitbucket.org/sunybingcloud/electron/constants"
	"bitbucket.org/sunybingcloud/electron/rapl"
	"bufio"
	"container/ring"
	"log"
	"math"
	"os"
	"os/exec"
	"sort"
	"strconv"
	"strings"
	"syscall"
	"time"
)
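
// round returns num rounded to the nearest integer, with ties rounded away from zero.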
func round(num float64) int {
	return int(math.Floor(num + math.Copysign(0.5, num)))
}
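
// getNextCapValue halves the current cap value and rounds the result to the
// given number of decimal places, yielding the next, more aggressive, cap.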
func getNextCapValue(curCapValue float64, precision int) float64 {
	curCapValue /= 2.0
	output := math.Pow(10, float64(precision))
	return float64(round(curCapValue*output)) / output
}
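
// getNextUncapValue doubles the current cap value and rounds the result to
// the given number of decimal places, retracing the capping steps in reverse.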
func getNextUncapValue(curCapValue float64, precision int) float64 {
	curCapValue *= 2.0
	output := math.Pow(10, float64(precision))
	return float64(round(curCapValue*output)) / output
}
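
// StartPCPLogAndProgressiveExtremaCap tails pmdumptext output (one CSV sample
// per second, per the -t 1.0 -d , flags), logs every sample to <prefix>.pcplog,
// and progressively caps or uncaps the most power-hungry hosts via RAPL
// whenever the cluster's moving-average power crosses hiThreshold or falls
// below loThreshold.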
func StartPCPLogAndProgressiveExtremaCap(quit chan struct{}, logging *bool, prefix string, hiThreshold, loThreshold float64) {
	log.Println("Inside Log and Progressive Extrema")
	const pcpCommand string = "pmdumptext -m -l -f '' -t 1.0 -d , -c config"
	cmd := exec.Command("sh", "-c", pcpCommand)
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

	if hiThreshold < loThreshold {
		log.Println("High threshold is lower than low threshold!")
	}

	logFile, err := os.Create("./" + prefix + ".pcplog")
	if err != nil {
		log.Fatal(err)
	}
	defer logFile.Close()

	pipe, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatal(err)
	}

	scanner := bufio.NewScanner(pipe)
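
	// The goroutine below consumes pmdumptext's CSV stream: the first line
	// carries the column headers, and each subsequent line is one sample.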
	go func(logging *bool, hiThreshold, loThreshold float64) {
		// Get the names of the columns.
		scanner.Scan()

		// Write the header line to the log file.
		logFile.WriteString(scanner.Text() + "\n")

		headers := strings.Split(scanner.Text(), ",")

		powerIndexes := make([]int, 0)
		powerHistories := make(map[string]*ring.Ring)
		indexToHost := make(map[int]string)
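
		// Each header has the form host:metric. Record the column index of
		// every RAPL energy metric and remember which host it belongs to.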
		for i, hostMetric := range headers {
			metricSplit := strings.Split(hostMetric, ":")

			if strings.Contains(metricSplit[1], "RAPL_ENERGY_PKG") ||
				strings.Contains(metricSplit[1], "RAPL_ENERGY_DRAM") {
				powerIndexes = append(powerIndexes, i)
				indexToHost[i] = metricSplit[0]

				// Only create one ring per host.
				if _, ok := powerHistories[metricSplit[0]]; !ok {
					powerHistories[metricSplit[0]] = ring.New(20) // Two PKG and two DRAM metrics per node; 20 values = 5 seconds of tracking.
				}
			}
		}

		// Throw away the first set of results.
		scanner.Scan()

		// Keep track of the capped victims and their corresponding cap values.
		cappedVictims := make(map[string]float64)
		orderCapped := make([]string, 0, 8)
		orderCappedVictims := make(map[string]float64) // Parallel to orderCapped, for faster lookup of victims that are to be uncapped.
		clusterPowerHist := ring.New(5)
		seconds := 0
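
		// cappedVictims holds hosts that can still be capped further, while
		// orderCapped and orderCappedVictims record every capped host (most
		// recently capped last) so that uncapping proceeds in LIFO order.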

		for scanner.Scan() {
			if *logging {
				log.Println("Logging PCP...")
				split := strings.Split(scanner.Text(), ",")
				logFile.WriteString(scanner.Text() + "\n")

				totalPower := 0.0
				for _, powerIndex := range powerIndexes {
					power, _ := strconv.ParseFloat(split[powerIndex], 64)

					host := indexToHost[powerIndex]

					powerHistories[host].Value = power
					powerHistories[host] = powerHistories[host].Next()

					log.Printf("Host: %s, Power: %f", host, power*RAPLUnits)

					totalPower += power
				}
				clusterPower := totalPower * RAPLUnits

				clusterPowerHist.Value = clusterPower
				clusterPowerHist = clusterPowerHist.Next()

				clusterMean := averageClusterPowerHistory(clusterPowerHist)

				log.Printf("Total power: %f, %d Sec Avg: %f", clusterPower, clusterPowerHist.Len(), clusterMean)
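
				// Progressive extrema: when the moving average exceeds the
				// high threshold, cap the hungriest host; when it falls below
				// the low threshold, uncap the most recently capped host.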
				if clusterMean >= hiThreshold {
					log.Println("Need to cap a node")
					// Create statistics for all victims and choose one to cap.
					victims := make([]Victim, 0, 8)

					// TODO: Just keep track of the largest to reduce from O(n log n) to O(n).
					for name, history := range powerHistories {
						histMean := averageNodePowerHistory(history)

						// Consider doing mean calculations using goroutines if we need to speed up.
						victims = append(victims, Victim{Watts: histMean, Host: name})
					}

					sort.Sort(VictimSorter(victims)) // Sort by average wattage.

					// Find the best victim to cap, in a round-robin manner.
					newVictimFound := false
					alreadyCappedHosts := []string{} // Already-capped hosts, in decreasing order of recent power consumption.
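					// First pass: prefer a host that has never been capped. Hosts already
					// present in the bookkeeping maps are queued in alreadyCappedHosts
					// as fallbacks.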
					for i := 0; i < len(victims); i++ {
						// Try to pick a victim that hasn't been capped yet.
						if _, ok := cappedVictims[victims[i].Host]; !ok {
							// If this victim is present in orderCappedVictims, move on to find another victim to cap.
							if _, ok := orderCappedVictims[victims[i].Host]; ok {
								// Add the host to alreadyCappedHosts.
								alreadyCappedHosts = append(alreadyCappedHosts, victims[i].Host)
								continue
							}
							// Cap this victim and add it to cappedVictims.
							if err := rapl.Cap(victims[i].Host, "rapl", 50.0); err != nil {
								log.Printf("Error capping host %s", victims[i].Host)
							} else {
								log.Printf("Capped host[%s] at %f", victims[i].Host, 50.0)
								cappedVictims[victims[i].Host] = 50.0
								newVictimFound = true
								// This node can later be uncapped, so add it to orderCapped.
								orderCapped = append(orderCapped, victims[i].Host)
								orderCappedVictims[victims[i].Host] = 50.0
								break // Break only on a successful cap.
							}
						} else {
							alreadyCappedHosts = append(alreadyCappedHosts, victims[i].Host)
						}
					}
					// If no new victim was found, cap the best victim among the already capped hosts.
					if !newVictimFound {
						for i := 0; i < len(alreadyCappedHosts); i++ {
							// Check whether this node has already been staged for uncapping.
							if capValue, ok := orderCappedVictims[alreadyCappedHosts[i]]; ok {
								// Cap further only while the cap value is above the threshold.
								if capValue > constants.CapThreshold {
									newCapValue := getNextCapValue(capValue, 2)
									if err := rapl.Cap(alreadyCappedHosts[i], "rapl", newCapValue); err != nil {
										log.Printf("Error capping host %s", alreadyCappedHosts[i])
									} else {
										// Successful cap.
										log.Printf("Capped host[%s] at %f", alreadyCappedHosts[i], newCapValue)
										// Check whether this victim can be capped further.
										if newCapValue <= constants.CapThreshold {
											// Delete the victim from cappedVictims.
											delete(cappedVictims, alreadyCappedHosts[i])
											// Update the cap value in orderCappedVictims.
											orderCappedVictims[alreadyCappedHosts[i]] = newCapValue
										} else {
											// Update the cap value in both bookkeeping structures.
											cappedVictims[alreadyCappedHosts[i]] = newCapValue
											orderCappedVictims[alreadyCappedHosts[i]] = newCapValue
										}
										break // Exit only on a successful cap.
									}
								}
							} else {
								// This host can definitely be capped: halve its current cap value
								// and record the new value in cappedVictims and orderCappedVictims.
								// If the new value has hit the threshold, stage the host in
								// orderCapped so that it can be uncapped later.
								newCapValue := getNextCapValue(cappedVictims[alreadyCappedHosts[i]], 2)
								if err := rapl.Cap(alreadyCappedHosts[i], "rapl", newCapValue); err != nil {
									log.Printf("Error capping host %s", alreadyCappedHosts[i])
								} else {
									log.Printf("Capped host[%s] at %f", alreadyCappedHosts[i], newCapValue)
									// Check whether this victim can be capped further.
									if newCapValue <= constants.CapThreshold {
										// Delete the victim from cappedVictims.
										delete(cappedVictims, alreadyCappedHosts[i])
										// Stage the victim for uncapping.
										orderCapped = append(orderCapped, alreadyCappedHosts[i])
										orderCappedVictims[alreadyCappedHosts[i]] = constants.CapThreshold
									} else {
										// Update the cap value of the victim.
										cappedVictims[alreadyCappedHosts[i]] = newCapValue
									}
									break // Exit only on a successful cap.
								}
							}
						}
					}

				} else if clusterMean < loThreshold {
					log.Println("Need to uncap a node")
					if len(orderCapped) > 0 {
						host := orderCapped[len(orderCapped)-1]
						// Read the current cap value from orderCappedVictims: the host may
						// already have been removed from cappedVictims once its cap value
						// hit the threshold.
						// Remove the victim from orderCapped only once it has been completely uncapped to 100%.
						if orderCappedVictims[host] == 100.0 {
							orderCapped = orderCapped[:len(orderCapped)-1]
							delete(orderCappedVictims, host)
						} else {
							newCapValue := getNextUncapValue(orderCappedVictims[host], 2)
							// Uncap the victim.
							if err := rapl.Cap(host, "rapl", newCapValue); err != nil {
								log.Printf("Error uncapping host %s", host)
							} else {
								// Successful uncap.
								log.Printf("Uncapped host[%s] to %f", host, newCapValue)
								// If the new cap value is 100, this node cannot be uncapped any further.
								if newCapValue == 100.0 {
									orderCapped = orderCapped[:len(orderCapped)-1]
									delete(orderCappedVictims, host)
									// Update cappedVictims.
									cappedVictims[host] = newCapValue
								} else if newCapValue > constants.CapThreshold {
									// This host can be capped again.
									cappedVictims[host] = newCapValue
									// Update orderCappedVictims.
									orderCappedVictims[host] = newCapValue
								}
							}
						}
					} else {
						// No node has been capped until now, so there is nothing to uncap.
					}
				}
			}
			seconds++
		}
	}(logging, hiThreshold, loThreshold)

	log.Println("PCP logging started")

	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}

	pgid, err := syscall.Getpgid(cmd.Process.Pid)
	if err != nil {
		log.Fatal(err)
	}
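
	// cmd was started with Setpgid set, so pmdumptext and any children it
	// spawns share one process group; signalling -pgid terminates them all.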
	select {
	case <-quit:
		log.Println("Stopping PCP logging in 5 seconds")
		time.Sleep(5 * time.Second)

		// http://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
		// Kill the process and all of its children.
		syscall.Kill(-pgid, syscall.SIGTERM)
		return
	}
}