2018-10-06 20:03:14 -07:00
// Copyright (C) 2018 spdf
//
// This file is part of Elektron.
//
// Elektron is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Elektron is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Elektron. If not, see <http://www.gnu.org/licenses/>.
//
2018-10-04 13:37:14 -04:00
package powerCap
2017-02-15 19:22:56 -05:00
import (
"bufio"
"container/ring"
2018-01-19 21:20:43 +00:00
"fmt"
2017-02-15 19:22:56 -05:00
"log"
2017-02-20 20:55:06 -05:00
"math"
2017-02-15 19:22:56 -05:00
"os/exec"
"sort"
"strconv"
"strings"
"syscall"
2017-02-20 20:55:06 -05:00
"time"
2018-09-30 18:23:38 -07:00
"gitlab.com/spdf/elektron/constants"
2018-10-04 13:45:31 -04:00
elekLogDef "gitlab.com/spdf/elektron/logging/def"
2018-09-30 18:23:38 -07:00
"gitlab.com/spdf/elektron/pcp"
"gitlab.com/spdf/elektron/rapl"
"gitlab.com/spdf/elektron/utilities"
2017-02-15 19:22:56 -05:00
)
// round returns num rounded to the nearest integer, with halves rounded away
// from zero (0.5 -> 1, -0.5 -> -1).
//
// The previous Floor-based formula was only correct for non-negative input:
// it pushed every negative non-half value one step too far from zero
// (for example -1.2 became -2). math.Round implements the intended rule for
// both signs.
func round(num float64) int {
	return int(math.Round(num))
}
func getNextCapValue ( curCapValue float64 , precision int ) float64 {
2017-02-20 20:55:06 -05:00
curCapValue /= 2.0
output := math . Pow ( 10 , float64 ( precision ) )
return float64 ( round ( curCapValue * output ) ) / output
}
2018-01-19 21:20:43 +00:00
func StartPCPLogAndProgressiveExtremaCap ( quit chan struct { } , logging * bool , hiThreshold , loThreshold float64 ,
2018-10-05 14:11:49 -04:00
logMType chan elekLogDef . LogMessageType , logMsg chan string , pcpConfigFile string ) {
2018-01-19 21:20:43 +00:00
2018-10-04 19:21:45 -04:00
var pcpCommand string = "pmdumptext -m -l -f '' -t 1.0 -d , -c " + pcpConfigFile
2018-10-04 14:10:41 -04:00
cmd := exec . Command ( "sh" , "-c" , pcpCommand , pcpConfigFile )
2017-02-15 19:22:56 -05:00
cmd . SysProcAttr = & syscall . SysProcAttr { Setpgid : true }
if hiThreshold < loThreshold {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "High threshold is lower than low threshold!"
2017-02-15 19:22:56 -05:00
}
pipe , err := cmd . StdoutPipe ( )
if err != nil {
log . Fatal ( err )
}
//cmd.Stdout = stdout
scanner := bufio . NewScanner ( pipe )
go func ( logging * bool , hiThreshold , loThreshold float64 ) {
2017-09-28 15:36:47 -04:00
// Get names of the columns.
2017-02-15 19:22:56 -05:00
scanner . Scan ( )
2018-01-19 21:20:43 +00:00
// Write to logfile
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . PCP
2018-01-19 21:20:43 +00:00
logMsg <- scanner . Text ( )
2017-02-15 19:22:56 -05:00
headers := strings . Split ( scanner . Text ( ) , "," )
powerIndexes := make ( [ ] int , 0 , 0 )
powerHistories := make ( map [ string ] * ring . Ring )
indexToHost := make ( map [ int ] string )
for i , hostMetric := range headers {
metricSplit := strings . Split ( hostMetric , ":" )
if strings . Contains ( metricSplit [ 1 ] , "RAPL_ENERGY_PKG" ) ||
strings . Contains ( metricSplit [ 1 ] , "RAPL_ENERGY_DRAM" ) {
powerIndexes = append ( powerIndexes , i )
indexToHost [ i ] = metricSplit [ 0 ]
2017-09-28 15:36:47 -04:00
// Only create one ring per host.
2017-02-15 19:22:56 -05:00
if _ , ok := powerHistories [ metricSplit [ 0 ] ] ; ! ok {
2017-09-28 15:36:47 -04:00
// Two PKGS, two DRAM per node, 20 = 5 seconds of tracking.
powerHistories [ metricSplit [ 0 ] ] = ring . New ( 20 )
2017-02-15 19:22:56 -05:00
}
}
}
2017-09-28 15:36:47 -04:00
// Throw away first set of results.
2017-02-15 19:22:56 -05:00
scanner . Scan ( )
2017-09-28 15:36:47 -04:00
// To keep track of the capped states of the capped victims.
2017-02-15 19:22:56 -05:00
cappedVictims := make ( map [ string ] float64 )
2017-02-22 20:09:04 -05:00
// TODO: Come with a better name for this.
2017-02-15 19:22:56 -05:00
orderCapped := make ( [ ] string , 0 , 8 )
2017-02-22 20:09:04 -05:00
// TODO: Change this to a priority queue ordered by the cap value. This will get rid of the sorting performed in the code.
2017-09-28 15:36:47 -04:00
// Parallel data structure to orderCapped to keep track of the uncapped states of the uncapped victims.
2017-02-22 20:09:04 -05:00
orderCappedVictims := make ( map [ string ] float64 )
2017-02-15 19:22:56 -05:00
clusterPowerHist := ring . New ( 5 )
seconds := 0
for scanner . Scan ( ) {
if * logging {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "Logging PCP..."
2017-02-15 19:22:56 -05:00
split := strings . Split ( scanner . Text ( ) , "," )
2018-10-05 00:38:24 -04:00
text := scanner . Text ( )
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . PCP
2018-10-05 00:38:24 -04:00
logMsg <- text
2017-02-15 19:22:56 -05:00
totalPower := 0.0
for _ , powerIndex := range powerIndexes {
power , _ := strconv . ParseFloat ( split [ powerIndex ] , 64 )
host := indexToHost [ powerIndex ]
powerHistories [ host ] . Value = power
powerHistories [ host ] = powerHistories [ host ] . Next ( )
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Host: %s, Power %f" ,
indexToHost [ powerIndex ] , ( power * pcp . RAPLUnits ) )
2017-02-15 19:22:56 -05:00
totalPower += power
}
2017-09-26 00:26:01 -04:00
clusterPower := totalPower * pcp . RAPLUnits
2017-02-15 19:22:56 -05:00
clusterPowerHist . Value = clusterPower
clusterPowerHist = clusterPowerHist . Next ( )
2017-09-26 00:26:01 -04:00
clusterMean := pcp . AverageClusterPowerHistory ( clusterPowerHist )
2017-02-15 19:22:56 -05:00
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Total power: %f, %d Sec Avg: %f" , clusterPower , clusterPowerHist . Len ( ) , clusterMean )
2017-02-15 19:22:56 -05:00
if clusterMean >= hiThreshold {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "Need to cap a node"
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Cap values of capped victims: %v" , cappedVictims )
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Cap values of victims to uncap: %v" , orderCappedVictims )
// Create statics for all victims and choose one to cap
2017-09-26 00:26:01 -04:00
victims := make ( [ ] pcp . Victim , 0 , 8 )
2017-02-15 19:22:56 -05:00
// TODO: Just keep track of the largest to reduce fron nlogn to n
for name , history := range powerHistories {
2017-09-26 00:26:01 -04:00
histMean := pcp . AverageNodePowerHistory ( history )
2017-02-15 19:22:56 -05:00
2017-09-28 15:36:47 -04:00
// Consider doing mean calculations using go routines if we need to speed up.
2017-09-26 00:26:01 -04:00
victims = append ( victims , pcp . Victim { Watts : histMean , Host : name } )
2017-02-15 19:22:56 -05:00
}
2017-09-28 15:36:47 -04:00
sort . Sort ( pcp . VictimSorter ( victims ) ) // Sort by average wattage.
2017-02-15 19:22:56 -05:00
2017-09-28 15:36:47 -04:00
// Finding the best victim to cap in a round robin manner.
2017-02-20 20:55:06 -05:00
newVictimFound := false
2017-09-28 15:36:47 -04:00
alreadyCappedHosts := [ ] string { } // Host-names of victims that are already capped.
2017-02-15 19:22:56 -05:00
for i := 0 ; i < len ( victims ) ; i ++ {
2017-09-28 15:36:47 -04:00
// Try to pick a victim that hasn't been capped yet.
2017-02-20 20:55:06 -05:00
if _ , ok := cappedVictims [ victims [ i ] . Host ] ; ! ok {
2017-09-28 15:36:47 -04:00
// If this victim can't be capped further, then we move on to find another victim.
2017-02-20 20:55:06 -05:00
if _ , ok := orderCappedVictims [ victims [ i ] . Host ] ; ok {
continue
}
2017-09-28 15:36:47 -04:00
// Need to cap this victim.
2017-02-20 20:55:06 -05:00
if err := rapl . Cap ( victims [ i ] . Host , "rapl" , 50.0 ) ; err != nil {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Error capping host %s" , victims [ i ] . Host )
2017-02-15 19:22:56 -05:00
} else {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Capped host[%s] at %f" , victims [ i ] . Host , 50.0 )
// Keeping track of this victim and it's cap value
2017-02-20 20:55:06 -05:00
cappedVictims [ victims [ i ] . Host ] = 50.0
newVictimFound = true
2017-09-28 15:36:47 -04:00
// This node can be uncapped and hence adding to orderCapped.
2017-02-15 19:22:56 -05:00
orderCapped = append ( orderCapped , victims [ i ] . Host )
2017-02-20 20:55:06 -05:00
orderCappedVictims [ victims [ i ] . Host ] = 50.0
2017-09-28 15:36:47 -04:00
break // Breaking only on successful cap.
2017-02-20 20:55:06 -05:00
}
} else {
alreadyCappedHosts = append ( alreadyCappedHosts , victims [ i ] . Host )
}
}
2017-09-28 15:36:47 -04:00
// If no new victim found, then we need to cap the best victim among the ones that are already capped.
2017-02-20 20:55:06 -05:00
if ! newVictimFound {
2017-03-09 19:17:06 -05:00
canCapAlreadyCappedVictim := false
2017-02-20 20:55:06 -05:00
for i := 0 ; i < len ( alreadyCappedHosts ) ; i ++ {
2017-09-28 15:36:47 -04:00
// If already capped then the host must be present in orderCappedVictims.
2017-02-21 21:05:47 -05:00
capValue := orderCappedVictims [ alreadyCappedHosts [ i ] ]
2017-09-28 15:36:47 -04:00
// If capValue is greater than the threshold then cap, else continue.
2017-03-09 19:20:13 -05:00
if capValue > constants . LowerCapLimit {
2017-02-21 21:05:47 -05:00
newCapValue := getNextCapValue ( capValue , 2 )
2017-02-20 20:55:06 -05:00
if err := rapl . Cap ( alreadyCappedHosts [ i ] , "rapl" , newCapValue ) ; err != nil {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . ERROR
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Error capping host[%s]" , alreadyCappedHosts [ i ] )
2017-02-20 20:55:06 -05:00
} else {
2017-02-22 20:09:04 -05:00
// Successful cap
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Capped host[%s] at %f" , alreadyCappedHosts [ i ] , newCapValue )
// Checking whether this victim can be capped further
2017-03-09 19:20:13 -05:00
if newCapValue <= constants . LowerCapLimit {
2017-09-28 15:36:47 -04:00
// Deleting victim from cappedVictims.
2017-02-20 20:55:06 -05:00
delete ( cappedVictims , alreadyCappedHosts [ i ] )
2017-09-28 15:36:47 -04:00
// Updating the cap value in orderCappedVictims.
2017-02-21 21:05:47 -05:00
orderCappedVictims [ alreadyCappedHosts [ i ] ] = newCapValue
2017-02-20 20:55:06 -05:00
} else {
2017-09-28 15:36:47 -04:00
// Updating the cap value.
2017-02-20 20:55:06 -05:00
cappedVictims [ alreadyCappedHosts [ i ] ] = newCapValue
2017-02-21 21:05:47 -05:00
orderCappedVictims [ alreadyCappedHosts [ i ] ] = newCapValue
2017-02-20 20:55:06 -05:00
}
2017-03-09 19:17:06 -05:00
canCapAlreadyCappedVictim = true
2017-02-22 20:09:04 -05:00
break // Breaking only on successful cap.
2017-02-20 20:55:06 -05:00
}
2017-02-21 21:05:47 -05:00
} else {
2017-09-28 15:36:47 -04:00
// Do nothing.
2017-02-21 21:05:47 -05:00
// Continue to find another victim to cap.
2017-09-28 15:36:47 -04:00
// If cannot find any victim, then all nodes have been
// capped to the maximum and we stop capping at this point.
2017-02-15 19:22:56 -05:00
}
}
2017-03-23 22:13:29 -04:00
if ! canCapAlreadyCappedVictim {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "No Victim left to cap."
2017-03-09 19:17:06 -05:00
}
2017-02-15 19:22:56 -05:00
}
} else if clusterMean < loThreshold {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "Need to uncap a node"
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Cap values of capped victims: %v" , cappedVictims )
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Cap values of victims to uncap: %v" , orderCappedVictims )
2017-02-15 19:22:56 -05:00
if len ( orderCapped ) > 0 {
2017-09-28 15:36:47 -04:00
// We pick the host that is capped the most to uncap.
2017-02-22 20:09:04 -05:00
orderCappedToSort := utilities . GetPairList ( orderCappedVictims )
2017-09-28 15:36:47 -04:00
sort . Sort ( orderCappedToSort ) // Sorted hosts in non-decreasing order of capped states.
2017-02-22 20:09:04 -05:00
hostToUncap := orderCappedToSort [ 0 ] . Key
2017-03-09 19:20:13 -05:00
// Uncapping the host.
// This is a floating point operation and might suffer from precision loss.
2017-02-22 20:09:04 -05:00
newUncapValue := orderCappedVictims [ hostToUncap ] * 2.0
if err := rapl . Cap ( hostToUncap , "rapl" , newUncapValue ) ; err != nil {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . ERROR
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Error uncapping host[%s]" , hostToUncap )
2017-02-20 20:55:06 -05:00
} else {
2018-01-19 21:20:43 +00:00
// Successful uncap
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- fmt . Sprintf ( "Uncapped host[%s] to %f" , hostToUncap , newUncapValue )
// Can we uncap this host further. If not, then we remove its entry from orderCapped
if newUncapValue >= 100.0 { // can compare using ==
// Deleting entry from orderCapped
2017-02-22 20:09:04 -05:00
for i , victimHost := range orderCapped {
if victimHost == hostToUncap {
orderCapped = append ( orderCapped [ : i ] , orderCapped [ i + 1 : ] ... )
2017-09-28 15:36:47 -04:00
break // We are done removing host from orderCapped.
2017-02-22 20:09:04 -05:00
}
}
2017-09-28 15:36:47 -04:00
// Removing entry for host from the parallel data structure.
2017-02-22 20:09:04 -05:00
delete ( orderCappedVictims , hostToUncap )
2017-09-28 15:36:47 -04:00
// Removing entry from cappedVictims as this host is no longer capped.
2017-02-22 20:09:04 -05:00
delete ( cappedVictims , hostToUncap )
2017-09-28 15:36:47 -04:00
} else if newUncapValue > constants . LowerCapLimit { // This check is unnecessary and can be converted to 'else'.
// Updating the cap value.
2017-02-22 20:09:04 -05:00
orderCappedVictims [ hostToUncap ] = newUncapValue
cappedVictims [ hostToUncap ] = newUncapValue
2017-02-20 20:55:06 -05:00
}
2017-02-15 19:22:56 -05:00
}
2017-02-20 20:55:06 -05:00
} else {
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "No host staged for Uncapped"
2017-02-15 19:22:56 -05:00
}
}
}
seconds ++
}
} ( logging , hiThreshold , loThreshold )
2017-02-20 20:55:06 -05:00
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "PCP logging started"
2017-02-20 20:55:06 -05:00
if err := cmd . Start ( ) ; err != nil {
log . Fatal ( err )
}
pgid , err := syscall . Getpgid ( cmd . Process . Pid )
select {
case <- quit :
2018-10-04 13:45:31 -04:00
logMType <- elekLogDef . GENERAL
2018-01-19 21:20:43 +00:00
logMsg <- "Stopping PCP logging in 5 seconds"
2017-02-20 20:55:06 -05:00
time . Sleep ( 5 * time . Second )
// http://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
2017-09-28 15:36:47 -04:00
// Kill process and all children processes.
2017-02-20 20:55:06 -05:00
syscall . Kill ( - pgid , 15 )
return
}
2017-02-15 19:22:56 -05:00
}