Rapl node capping daemon (#21)

* Initial work on rapl-daemon. Initial server set up. API to read max power per zone and API to write new power cap have both been written.

* Removing python script since this has been ported to go code now.

* Adding test for happy path retrieving max power.

* Change some data types around to avoid too much conversion.

* Add happy path test for cap zone

* Removing unnecessary print statement.

* Change cap node to use a temporary setup.

* Renaming arguments to be more descriptive.

* Changing todo message.

* Changing test structure to only set up mock subsystem once and allowing functions to test on it later.

* Adding some more coverage for unhappy paths and fixing some values to reflect they are no longer floats.

* Keeping the old script around as it should be removed in a different PR.

* Delegating percentage check to capNode function.

* Fixing typo.

* Fixing typos.

* Changing shortWindow to longWindow as constraint_0 actually points to the long window.

* Renaming variable in test.

* capping function now returns which zones were successfully capped and which zones could not be capped. This information is now returned to the caller of the HTTP api.
This commit is contained in:
Renan I. Del Valle 2020-01-19 11:52:30 -08:00 committed by PRADYUMNA KAUSHIK
parent 3543960689
commit e76c1ae972
5 changed files with 311 additions and 0 deletions

1
go.sum
View file

@ -1,5 +1,6 @@
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=

39
rapl-daemon/README.md Normal file
View file

@ -0,0 +1,39 @@
# RAPL Daemon
This runs a server that can change the percentage of maximum power to which
a node is capped using RAPL. This daemon should be installed
on all worker nodes.
### Sample payload for testing:
```
curl --header "Content-Type: application/json" \
--request POST \
--data '{"percentage":75}' \
http://localhost:9090/powercap
```
### Payload
```json
{
"percentage":75
}
```
### Response
The daemon will respond with a json payload containing zones that were
successfully capped as well as the zones that were not capped.
```json
{
"cappedZones": null,
"failedZones": [
"intel-rapl:0",
"intel-rapl:1"
],
"error": "some zones were not able to be powercapped"
}
```
The `error` field is only populated when something went wrong, for example when some zones could not be capped.

60
rapl-daemon/main.go Normal file
View file

@ -0,0 +1,60 @@
package main
import (
"encoding/json"
"fmt"
"html"
"log"
"net/http"
)
// powercapDir is the directory the daemon scans for power zones; it is passed
// to capNode as the base path on every capping request.
const powercapDir = "/sys/class/powercap/"
// Cap is a payload that is expected from Elektron to cap a node.
type Cap struct {
	// Percentage is the share of each zone's maximum power the node should be
	// capped to. capNode rejects values outside the range (0, 100].
	// The explicit tag pins the wire name to the documented "percentage" key
	// instead of relying on encoding/json's case-insensitive field matching.
	Percentage int `json:"percentage"`
}
// CapResponse is the payload sent with information about the capping call.
type CapResponse struct {
	// CappedZones lists the zones whose power limit was written successfully.
	CappedZones []string `json:"cappedZones"`
	// FailedZones lists the zones that could not be capped.
	FailedZones []string `json:"failedZones"`
	// Error describes what went wrong. It is a pointer with omitempty so the
	// field is left out of the JSON entirely when there is no error, rather
	// than being emitted as "error": null.
	Error *string `json:"error,omitempty"`
}
// main registers the HTTP handlers and serves them on port 9090. It only
// returns by terminating the process when the server fails.
func main() {
	// Catch-all for every path that has no dedicated handler. Reply with
	// 404 so clients can detect the mistake from the status code instead of
	// having to parse the body of an apparently successful response.
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusNotFound)
		// The path is attacker-controlled, so escape it before echoing it back.
		fmt.Fprintf(w, "Unsupported endpoint %s", html.EscapeString(r.URL.Path))
	})

	http.HandleFunc("/powercap", powercapEndpoint)

	log.Fatal(http.ListenAndServe(":9090", nil))
}
// powercapEndpoint is the handler for the powercapping HTTP API endpoint.
//
// It decodes a Cap payload from the request body, asks capNode to apply it to
// the zones under powercapDir and replies with a JSON encoded CapResponse.
// A payload that cannot be decoded is answered with 400 Bad Request. Errors
// reported by capNode are returned in the response body with the default
// 200 status, alongside the capped and failed zones.
func powercapEndpoint(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")

	var payload Cap
	var response CapResponse

	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
		errorMsg := "error parsing payload: " + err.Error()
		response.Error = &errorMsg
		// The header must be written before the body is encoded.
		w.WriteHeader(http.StatusBadRequest)
		if err := json.NewEncoder(w).Encode(response); err != nil {
			log.Println("unable to write powercap response:", err)
		}
		return
	}

	cappedZones, failedZones, err := capNode(powercapDir, payload.Percentage)
	if err != nil {
		errorMsg := err.Error()
		response.Error = &errorMsg
	}

	// The zone lists are reported even when an error occurred so the caller
	// can tell which zones were capped and which were not.
	response.CappedZones = cappedZones
	response.FailedZones = failedZones

	if err := json.NewEncoder(w).Encode(response); err != nil {
		log.Println("unable to write powercap response:", err)
	}
}

110
rapl-daemon/util.go Normal file
View file

@ -0,0 +1,110 @@
package main
import (
"fmt"
"io/ioutil"
"math"
"os"
"path/filepath"
"strconv"
"strings"
)
// raplPrefixCPU is the name prefix of the zone directories capNode acts on;
// zones are expected to be named intel-rapl:X.
const raplPrefixCPU = "intel-rapl"
// constraint_0 is usually the longer window while constraint_1 is usually the shorter window
// maxPowerFileLongWindow is the per-zone file capNode reads to obtain the zone's maximum power.
const maxPowerFileLongWindow = "constraint_0_max_power_uw"
// powerLimitFileLongWindow is the per-zone file capNode writes to apply a power cap.
const powerLimitFileLongWindow = "constraint_0_power_limit_uw"
// capNode uses pseudo files made available by the Linux kernel
// in order to cap CPU power. More information is available at:
// https://www.kernel.org/doc/html/latest/power/powercap/powercap.html
//
// base is the directory holding the zone directories and percentage is the
// share of each zone's maximum power to cap it to; it must be in (0, 100].
//
// It returns the names of the zones that were capped, the names of the zones
// that could not be capped, and an error. The error is non-nil when the
// percentage is out of range, when base cannot be listed, or when at least
// one zone failed. In the last case both zone lists are still returned.
func capNode(base string, percentage int) ([]string, []string, error) {
	if percentage <= 0 || percentage > 100 {
		return nil, nil, fmt.Errorf("cap percentage must be between 0 (non-inclusive) and 100 (inclusive): %d", percentage)
	}

	files, err := ioutil.ReadDir(base)
	if err != nil {
		return nil, nil, err
	}

	var capped, failed []string
	for _, file := range files {
		zone := file.Name()

		// Zone names should be in the form intel-rapl:X where X is the power zone.
		// We ignore sub-zones which follow the form intel-rapl:X:Y as well as
		// entries that do not carry the RAPL prefix.
		fields := strings.Split(zone, ":")
		if len(fields) != 2 || fields[0] != raplPrefixCPU {
			continue
		}

		zoneMax, err := maxPower(filepath.Join(base, zone, maxPowerFileLongWindow))
		if err != nil {
			failed = append(failed, zone)
			fmt.Printf("unable to retrieve max power for zone %s: %v\n", zone, err)
			continue
		}

		// We use floats to mitigate the possibility of an integer overflow.
		powercap := uint64(math.Ceil(float64(zoneMax) * (float64(percentage) / 100)))

		if err := capZone(filepath.Join(base, zone, powerLimitFileLongWindow), powercap); err != nil {
			failed = append(failed, zone)
			fmt.Printf("unable to write powercap value for zone %s: %v\n", zone, err)
			continue
		}

		capped = append(capped, zone)
	}

	if len(failed) > 0 {
		return capped, failed, fmt.Errorf("some zones were not able to be powercapped")
	}

	return capped, nil, nil
}
// maxPower reads maxFile and returns the unsigned base-10 integer stored in
// it, ignoring surrounding whitespace. capNode points it at a zone's
// *_max_power_uw file, so the value is presumably expressed in microwatts.
// An error is returned when the file cannot be read or does not hold a
// valid unsigned 64-bit integer.
func maxPower(maxFile string) (uint64, error) {
	raw, readErr := ioutil.ReadFile(maxFile)
	if readErr != nil {
		return 0, readErr
	}

	text := strings.TrimSpace(string(raw))

	limit, parseErr := strconv.ParseUint(text, 10, 64)
	if parseErr != nil {
		return 0, parseErr
	}

	return limit, nil
}
// capZone caps a power zone by writing value, formatted as a base-10
// integer, to limitFile. capNode points it at a zone's *_power_limit_uw
// file, so the value is presumably expressed in microwatts.
//
// limitFile must already exist. The file is opened without O_CREATE, so a
// missing file is reported as an error (os.IsNotExist holds for it) and is
// never created. Doing this in a single open avoids the window between a
// separate existence check and the write.
func capZone(limitFile string, value uint64) error {
	f, err := os.OpenFile(limitFile, os.O_WRONLY|os.O_TRUNC, 0)
	if err != nil {
		return err
	}

	if _, err := f.WriteString(strconv.FormatUint(value, 10)); err != nil {
		// The write error is the one worth reporting; the close error is
		// deliberately dropped on this path.
		f.Close()
		return err
	}

	// A failed close can mean the data never reached the file, so report it.
	return f.Close()
}
// currentCap reads the file at limit and returns the unsigned base-10
// integer stored in it, ignoring surrounding whitespace. An error is returned
// when the file cannot be read or does not hold a valid unsigned 64-bit
// integer.
func currentCap(limit string) (uint64, error) {
	contents, readErr := ioutil.ReadFile(limit)
	if readErr != nil {
		return 0, readErr
	}

	trimmed := strings.TrimSpace(string(contents))

	value, parseErr := strconv.ParseUint(trimmed, 10, 64)
	if parseErr != nil {
		return 0, parseErr
	}

	return value, nil
}

101
rapl-daemon/util_test.go Normal file
View file

@ -0,0 +1,101 @@
package main
import (
"io/ioutil"
"log"
"math"
"os"
"path/filepath"
"strconv"
"testing"
"github.com/stretchr/testify/assert"
)
// raplDir is the temporary directory, created in TestMain, that mocks the
// powercap subsystem for every test in this file.
var raplDir string
// maxWattage is the value TestMain writes to the mock zone's max power and
// power limit files.
const maxWattage uint64 = 1500000
// TestMain sets up a mock powercap subsystem once, runs every test against
// it and exits with the result of the test run.
func TestMain(m *testing.M) {
	// os.Exit does not run deferred calls, so setup, the test run and the
	// cleanup all happen inside runWithMockPowercap, which returns normally.
	os.Exit(runWithMockPowercap(m))
}

// runWithMockPowercap creates a temporary directory that mocks the powercap
// subsystem with a single zone, runs the tests and removes the directory
// again. It returns the exit code for the test binary: the result of m.Run,
// or 1 when the mock could not be set up.
func runWithMockPowercap(m *testing.M) int {
	var err error

	raplDir, err = ioutil.TempDir("", raplPrefixCPU)
	if err != nil {
		log.Print(err)
		return 1
	}
	defer os.RemoveAll(raplDir)

	// Create the single zone directory of the mock powercap subsystem.
	// The mode is octal: a decimal 755 would yield unrelated permission bits.
	zonePath := filepath.Join(raplDir, raplPrefixCPU+":0")
	if err := os.Mkdir(zonePath, 0755); err != nil {
		log.Print(err)
		return 1
	}

	initialWatts := strconv.FormatUint(maxWattage, 10)

	// The max power file is read-only, the power limit file is writable.
	if err := ioutil.WriteFile(filepath.Join(zonePath, maxPowerFileLongWindow), []byte(initialWatts), 0444); err != nil {
		log.Print(err)
		return 1
	}

	if err := ioutil.WriteFile(filepath.Join(zonePath, powerLimitFileLongWindow), []byte(initialWatts), 0644); err != nil {
		log.Print(err)
		return 1
	}

	return m.Run()
}
// TODO(rdelvalle): Add tests where capping fails
func TestCapNode(t *testing.T) {
capped, failed, err := capNode(raplDir, 95)
assert.NoError(t, err)
assert.Len(t, capped, 1)
assert.Nil(t, failed)
t.Run("bad-percentage", func(t *testing.T) {
capped, failed, err := capNode(raplDir, 1000)
assert.Error(t, err)
assert.Nil(t, capped)
assert.Nil(t, failed)
})
t.Run("zero-percent", func(t *testing.T) {
capped, failed, err := capNode(raplDir, 0)
assert.Error(t, err)
assert.Nil(t, capped)
assert.Nil(t, failed)
})
}
// TestMaxPower checks that maxPower returns the value stored in the mock
// zone's max power file and fails for a file that does not exist.
func TestMaxPower(t *testing.T) {
	zoneDir := filepath.Join(raplDir, raplPrefixCPU+":0")

	got, err := maxPower(filepath.Join(zoneDir, maxPowerFileLongWindow))
	assert.NoError(t, err)
	assert.Equal(t, maxWattage, got)

	t.Run("name-does-not-exist", func(t *testing.T) {
		_, missingErr := maxPower("madeupname")
		assert.Error(t, missingErr)
	})
}
// TestCapZone checks that capZone writes the requested value to the mock
// zone's power limit file, verified by reading it back with currentCap, and
// that it fails for a file that does not exist.
func TestCapZone(t *testing.T) {
	const half float64 = .50
	want := uint64(math.Ceil(float64(maxWattage) * half))

	zoneLimitFile := filepath.Join(raplDir, raplPrefixCPU+":0", powerLimitFileLongWindow)

	writeErr := capZone(zoneLimitFile, want)
	assert.NoError(t, writeErr)

	got, readErr := currentCap(zoneLimitFile)
	assert.NoError(t, readErr)
	assert.Equal(t, want, got)

	t.Run("name-does-not-exist", func(t *testing.T) {
		missingErr := capZone("madeupname", want)
		assert.Error(t, missingErr)
	})
}