2019-10-31 14:32:46 -04:00
// Copyright (C) 2018 spdfg
2019-12-09 20:15:33 -05:00
//
2018-10-06 20:03:14 -07:00
// This file is part of Elektron.
2019-12-09 20:15:33 -05:00
//
2018-10-06 20:03:14 -07:00
// Elektron is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
2019-12-09 20:15:33 -05:00
//
2018-10-06 20:03:14 -07:00
// Elektron is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
2019-12-09 20:15:33 -05:00
//
2018-10-06 20:03:14 -07:00
// You should have received a copy of the GNU General Public License
// along with Elektron. If not, see <http://www.gnu.org/licenses/>.
2019-12-09 20:15:33 -05:00
//
2018-10-06 20:03:14 -07:00
2018-10-04 13:37:14 -04:00
package powerCap
2016-10-18 17:38:49 -04:00
import (
"bufio"
"container/ring"
2018-01-19 21:20:43 +00:00
"fmt"
2016-10-18 17:38:49 -04:00
"os/exec"
"sort"
"strconv"
"strings"
"syscall"
"time"
2018-09-30 18:23:38 -07:00
2019-12-09 20:15:33 -05:00
log "github.com/sirupsen/logrus"
elekLog "github.com/spdfg/elektron/logging"
. "github.com/spdfg/elektron/logging/types"
2019-10-31 14:32:46 -04:00
"github.com/spdfg/elektron/pcp"
"github.com/spdfg/elektron/rapl"
2016-10-18 17:38:49 -04:00
)
2019-12-09 20:15:33 -05:00
func StartPCPLogAndExtremaDynamicCap ( quit chan struct { } , logging * bool , hiThreshold , loThreshold float64 , pcpConfigFile string ) {
2018-01-19 21:20:43 +00:00
2018-10-04 19:21:45 -04:00
var pcpCommand string = "pmdumptext -m -l -f '' -t 1.0 -d , -c " + pcpConfigFile
2018-10-04 14:10:41 -04:00
cmd := exec . Command ( "sh" , "-c" , pcpCommand , pcpConfigFile )
2016-10-18 17:38:49 -04:00
cmd . SysProcAttr = & syscall . SysProcAttr { Setpgid : true }
if hiThreshold < loThreshold {
2019-12-09 20:15:33 -05:00
elekLog . Log ( CONSOLE , log . InfoLevel , "High threshold is lower than low threshold!" )
2016-10-18 17:38:49 -04:00
}
pipe , err := cmd . StdoutPipe ( )
if err != nil {
log . Fatal ( err )
}
//cmd.Stdout = stdout
scanner := bufio . NewScanner ( pipe )
go func ( logging * bool , hiThreshold , loThreshold float64 ) {
2017-09-28 15:36:47 -04:00
// Get names of the columns.
2016-10-18 17:38:49 -04:00
scanner . Scan ( )
2018-01-19 21:20:43 +00:00
// Write to logfile
2019-12-09 20:15:33 -05:00
elekLog . Log ( PCP , log . InfoLevel , scanner . Text ( ) )
2016-10-18 17:38:49 -04:00
headers := strings . Split ( scanner . Text ( ) , "," )
powerIndexes := make ( [ ] int , 0 , 0 )
powerHistories := make ( map [ string ] * ring . Ring )
indexToHost := make ( map [ int ] string )
for i , hostMetric := range headers {
2017-02-05 14:54:15 -05:00
metricSplit := strings . Split ( hostMetric , ":" )
2016-10-18 17:38:49 -04:00
2017-02-05 14:54:15 -05:00
if strings . Contains ( metricSplit [ 1 ] , "RAPL_ENERGY_PKG" ) ||
strings . Contains ( metricSplit [ 1 ] , "RAPL_ENERGY_DRAM" ) {
2016-10-18 17:38:49 -04:00
powerIndexes = append ( powerIndexes , i )
2017-02-05 14:54:15 -05:00
indexToHost [ i ] = metricSplit [ 0 ]
2017-02-03 23:12:49 -05:00
2017-09-28 15:36:47 -04:00
// Only create one ring per host.
2017-02-05 14:54:15 -05:00
if _ , ok := powerHistories [ metricSplit [ 0 ] ] ; ! ok {
2017-09-28 15:36:47 -04:00
// Two PKGS, two DRAM per node, 20 - 5 seconds of tracking.
powerHistories [ metricSplit [ 0 ] ] = ring . New ( 20 )
2017-02-03 23:12:49 -05:00
}
2016-10-18 17:38:49 -04:00
}
}
2017-09-28 15:36:47 -04:00
// Throw away first set of results.
2016-10-18 17:38:49 -04:00
scanner . Scan ( )
cappedHosts := make ( map [ string ] bool )
orderCapped := make ( [ ] string , 0 , 8 )
clusterPowerHist := ring . New ( 5 )
seconds := 0
for scanner . Scan ( ) {
if * logging {
2019-12-09 20:15:33 -05:00
elekLog . Log ( CONSOLE , log . InfoLevel , "Logging PCP..." )
2018-10-05 00:38:24 -04:00
text := scanner . Text ( )
split := strings . Split ( text , "," )
2019-12-09 20:15:33 -05:00
elekLog . Log ( PCP , log . InfoLevel , text )
2018-10-05 00:38:24 -04:00
2016-10-18 17:38:49 -04:00
totalPower := 0.0
for _ , powerIndex := range powerIndexes {
power , _ := strconv . ParseFloat ( split [ powerIndex ] , 64 )
host := indexToHost [ powerIndex ]
powerHistories [ host ] . Value = power
powerHistories [ host ] = powerHistories [ host ] . Next ( )
2019-12-09 20:15:33 -05:00
elekLog . WithFields ( log . Fields {
"Host" : indexToHost [ powerIndex ] ,
"Power" : fmt . Sprintf ( "%f" , power * pcp . RAPLUnits ) ,
} ) . Log ( CONSOLE , log . InfoLevel , "" )
2016-10-18 17:38:49 -04:00
totalPower += power
}
2017-09-26 00:26:01 -04:00
clusterPower := totalPower * pcp . RAPLUnits
2016-10-18 17:38:49 -04:00
clusterPowerHist . Value = clusterPower
clusterPowerHist = clusterPowerHist . Next ( )
2017-09-26 00:26:01 -04:00
clusterMean := pcp . AverageClusterPowerHistory ( clusterPowerHist )
2016-10-18 17:38:49 -04:00
2019-12-09 20:15:33 -05:00
elekLog . WithFields ( log . Fields {
"Total power" : fmt . Sprintf ( "%f %d Sec" , clusterPower , clusterPowerHist . Len ( ) ) ,
"Avg" : fmt . Sprintf ( "%f" , clusterMean ) ,
} ) . Log ( CONSOLE , log . InfoLevel , "" )
2016-10-18 17:38:49 -04:00
if clusterMean > hiThreshold {
2019-12-09 20:15:33 -05:00
elekLog . Log ( CONSOLE , log . InfoLevel , "Need to cap a node" )
2018-01-19 21:20:43 +00:00
// Create statics for all victims and choose one to cap
2017-09-26 00:26:01 -04:00
victims := make ( [ ] pcp . Victim , 0 , 8 )
2016-10-18 17:38:49 -04:00
// TODO: Just keep track of the largest to reduce fron nlogn to n
for name , history := range powerHistories {
2017-09-26 00:26:01 -04:00
histMean := pcp . AverageNodePowerHistory ( history )
2017-02-05 14:54:15 -05:00
2017-09-28 15:36:47 -04:00
// Consider doing mean calculations using go routines if we need to speed up.
2017-09-26 00:26:01 -04:00
victims = append ( victims , pcp . Victim { Watts : histMean , Host : name } )
2016-10-18 17:38:49 -04:00
}
2017-09-28 15:36:47 -04:00
sort . Sort ( pcp . VictimSorter ( victims ) ) // Sort by average wattage.
2016-10-18 17:38:49 -04:00
2017-09-28 15:36:47 -04:00
// From best victim to worst, if everyone is already capped NOOP.
2016-10-18 17:38:49 -04:00
for _ , victim := range victims {
2017-09-28 15:36:47 -04:00
// Only cap if host hasn't been capped yet.
2016-10-18 17:38:49 -04:00
if ! cappedHosts [ victim . Host ] {
cappedHosts [ victim . Host ] = true
orderCapped = append ( orderCapped , victim . Host )
2019-12-09 20:15:33 -05:00
elekLog . WithField ( "Avg. Wattage" ,
fmt . Sprintf ( "%f" , victim . Watts * pcp . RAPLUnits ) ) . Logf ( CONSOLE , log . InfoLevel , "Capping Victim %s" , victim . Host )
2016-10-18 17:38:49 -04:00
if err := rapl . Cap ( victim . Host , "rapl" , 50 ) ; err != nil {
2019-12-09 20:15:33 -05:00
elekLog . Log ( CONSOLE , log . ErrorLevel , "Error capping host" )
2016-10-18 17:38:49 -04:00
}
2017-09-28 15:36:47 -04:00
break // Only cap one machine at at time.
2016-10-18 17:38:49 -04:00
}
}
} else if clusterMean < loThreshold {
if len ( orderCapped ) > 0 {
host := orderCapped [ len ( orderCapped ) - 1 ]
orderCapped = orderCapped [ : len ( orderCapped ) - 1 ]
cappedHosts [ host ] = false
2017-09-28 15:36:47 -04:00
// User RAPL package to send uncap.
2019-12-09 20:15:33 -05:00
elekLog . Logf ( CONSOLE , log . InfoLevel , "Uncapping host %s" , host )
2016-10-18 17:38:49 -04:00
if err := rapl . Cap ( host , "rapl" , 100 ) ; err != nil {
2019-12-09 20:15:33 -05:00
elekLog . Log ( CONSOLE , log . ErrorLevel , "Error capping host" )
2016-10-18 17:38:49 -04:00
}
}
}
}
seconds ++
}
} ( logging , hiThreshold , loThreshold )
2019-12-09 20:15:33 -05:00
elekLog . Log ( CONSOLE , log . InfoLevel , "PCP logging started" )
2016-10-18 17:38:49 -04:00
if err := cmd . Start ( ) ; err != nil {
log . Fatal ( err )
}
pgid , err := syscall . Getpgid ( cmd . Process . Pid )
select {
case <- quit :
2019-12-09 20:15:33 -05:00
elekLog . Log ( CONSOLE , log . InfoLevel , "Stopping PCP logging in 5 seconds" )
2016-10-18 17:38:49 -04:00
time . Sleep ( 5 * time . Second )
// http://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
2017-09-28 15:36:47 -04:00
// Kill process and all children processes.
2016-10-18 17:38:49 -04:00
syscall . Kill ( - pgid , 15 )
return
}
}