From e76c1ae9723bd064993027428fd9ba61a02db7b0 Mon Sep 17 00:00:00 2001 From: "Renan I. Del Valle" Date: Sun, 19 Jan 2020 11:52:30 -0800 Subject: [PATCH 1/2] Rapl node capping daemon (#21) * Initial work on rapl-daemon. Initial server set up. API to read max power per zone and API to write new power cap have both been written. * Removing python script since this has been ported to go code now. * Adding test for happy path retrieving max power. * Change some data types around to avoid too much conversion. * Add happy path test for cap zone * Removing unnecessary print statement. * Change cap node to use a temporary setup. * Renaming arguments to be more descriptive. * Changing todo message. * Changing test structure to only set up mock subsystem once and allowing functions to test on it later. * Adding some more coverage for unhappy paths and fixing some values to reflect they are no longer floats. * Keeping the old script around as it should be removed in a different PR. * Delegating percentage check to capNode function. * Fixing typo. * Fixing typos. * Changing shortWindow to longWindow as constraint_0 actually points to the long window. * Renaming variable in test. * capping function now returns which zones were successfully capped and which zones could not be capped. This information is now returned to the caller of the HTTP api. 
--- go.sum | 1 + rapl-daemon/README.md | 39 ++++++++++++++ rapl-daemon/main.go | 60 +++++++++++++++++++++ rapl-daemon/util.go | 110 +++++++++++++++++++++++++++++++++++++++ rapl-daemon/util_test.go | 101 +++++++++++++++++++++++++++++++++++ 5 files changed, 311 insertions(+) create mode 100644 rapl-daemon/README.md create mode 100644 rapl-daemon/main.go create mode 100644 rapl-daemon/util.go create mode 100644 rapl-daemon/util_test.go diff --git a/go.sum b/go.sum index f2fecd3..411d53c 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,6 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= diff --git a/rapl-daemon/README.md b/rapl-daemon/README.md new file mode 100644 index 0000000..415a31c --- /dev/null +++ b/rapl-daemon/README.md @@ -0,0 +1,39 @@ +# RAPL Daemon + +This runs a server that is capable of changing the percentage at which +a node is being throttled to using RAPL. This daemon should be installed +on all worker nodes. + +### Sample payload for testing: +``` + curl --header "Content-Type: application/json" \ + --request POST \ + --data '{"percentage":75}' \ + http://localhost:9090/powercap + ``` + +### Payload + +```json +{ + "percentage":75 +} +``` + +### Response + +The daemon will respond with a json payload containing zones that were +successfully capped as well as the zones that were not capped. + +```json +{ + "cappedZones": null, + "failedZones": [ + "intel-rapl:0", + "intel-rapl:1" + ], + "error": "some zones were not able to be powercapped" +} +``` + +Field error will not exist if failed zones is empty. 
\ No newline at end of file diff --git a/rapl-daemon/main.go b/rapl-daemon/main.go new file mode 100644 index 0000000..e88d3eb --- /dev/null +++ b/rapl-daemon/main.go @@ -0,0 +1,60 @@ +package main + +import ( + "encoding/json" + "fmt" + "html" + "log" + "net/http" +) + +const powercapDir = "/sys/class/powercap/" + +// Cap is a payload that is expected from Elektron to cap a node. +type Cap struct { + Percentage int +} + +// CapResponse is the payload sent with information about the capping call +type CapResponse struct { + CappedZones []string `json:"cappedZones"` + FailedZones []string `json:"failedZones"` + Error *string `json:"error"` +} + +func main() { + http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, "Unsupported endpoint %s", html.EscapeString(r.URL.Path)) + }) + + http.HandleFunc("/powercap", powercapEndpoint) + log.Fatal(http.ListenAndServe(":9090", nil)) +} + +// Handler for the powercapping HTTP API endpoint. +func powercapEndpoint(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + + var payload Cap + var response CapResponse + + decoder := json.NewDecoder(r.Body) + err := decoder.Decode(&payload) + if err != nil { + errorMsg := "error parsing payload: " + err.Error() + response.Error = &errorMsg + json.NewEncoder(w).Encode(response) + return + } + + cappedZones, failedZones, err := capNode(powercapDir, payload.Percentage) + if err != nil { + errorMsg := err.Error() + response.Error = &errorMsg + } + + response.CappedZones = cappedZones + response.FailedZones = failedZones + + json.NewEncoder(w).Encode(response) +} diff --git a/rapl-daemon/util.go b/rapl-daemon/util.go new file mode 100644 index 0000000..daf8dff --- /dev/null +++ b/rapl-daemon/util.go @@ -0,0 +1,110 @@ +package main + +import ( + "fmt" + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "strings" +) + +const raplPrefixCPU = "intel-rapl" + +// constraint_0 is usually the longer window while 
constraint_1 is usually the shorter window +const maxPowerFileLongWindow = "constraint_0_max_power_uw" +const powerLimitFileLongWindow = "constraint_0_power_limit_uw" + +// capNode uses pseudo files made available by the Linux kernel +// in order to capNode CPU power. More information is available at: +// https://www.kernel.org/doc/html/latest/power/powercap/powercap.html +func capNode(base string, percentage int) ([]string, []string, error) { + + if percentage <= 0 || percentage > 100 { + return nil, nil, fmt.Errorf("cap percentage must be between 0 (non-inclusive) and 100 (inclusive): %d", percentage) + } + + files, err := ioutil.ReadDir(base) + if err != nil { + return nil, nil, err + } + + var capped, failed []string + for _, file := range files { + fields := strings.Split(file.Name(), ":") + + // Fields should be in the form intel-rapl:X where X is the power zone + // We ignore sub-zones which follow the form intel-rapl:X:Y + if len(fields) != 2 { + continue + } + + if fields[0] == raplPrefixCPU { + maxPower, err := maxPower(filepath.Join(base, file.Name(), maxPowerFileLongWindow)) + if err != nil { + failed = append(failed, file.Name()) + fmt.Println("unable to retreive max power for zone ", err) + continue + } + + // We use floats to mitigate the possibility of an integer overflow. + powercap := uint64(math.Ceil(float64(maxPower) * (float64(percentage) / 100))) + + if err := capZone(filepath.Join(base, file.Name(), powerLimitFileLongWindow), powercap); err != nil { + failed = append(failed, file.Name()) + fmt.Println("unable to write powercap value: ", err) + continue + } + capped = append(capped, file.Name()) + } + } + + if len(failed) > 0 { + return capped, failed, fmt.Errorf("some zones were not able to be powercapped") + } + + return capped, nil, nil +} + +// maxPower returns the value in float of the maximum watts a power zone can use. 
+func maxPower(maxFile string) (uint64, error) { + maxPower, err := ioutil.ReadFile(maxFile) + if err != nil { + return 0, err + } + + maxPoweruW, err := strconv.ParseUint(strings.TrimSpace(string(maxPower)), 10, 64) + if err != nil { + return 0, err + } + + return maxPoweruW, nil +} + +// capZone caps a power zone to a specific amount of watts specified by value +func capZone(limitFile string, value uint64) error { + if _, err := os.Stat(limitFile); os.IsNotExist(err) { + return err + } + + err := ioutil.WriteFile(limitFile, []byte(strconv.FormatUint(value, 10)), 0644) + if err != nil { + return err + } + return nil +} + +func currentCap(limit string) (uint64, error) { + powercap, err := ioutil.ReadFile(limit) + if err != nil { + return 0, err + } + + powercapuW, err := strconv.ParseUint(strings.TrimSpace(string(powercap)), 10, 64) + if err != nil { + return 0, err + } + + return powercapuW, nil +} diff --git a/rapl-daemon/util_test.go b/rapl-daemon/util_test.go new file mode 100644 index 0000000..58065bd --- /dev/null +++ b/rapl-daemon/util_test.go @@ -0,0 +1,101 @@ +package main + +import ( + "io/ioutil" + "log" + "math" + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/stretchr/testify/assert" +) + +var raplDir string + +const maxWattage uint64 = 1500000 + +func TestMain(m *testing.M) { + var err error + raplDir, err = ioutil.TempDir("", raplPrefixCPU) + if err != nil { + log.Fatal(err) + } + + defer os.RemoveAll(raplDir) + + // Create temporary directory that mocks powercap subsytem + zonePath := filepath.Join(raplDir, raplPrefixCPU+":0") + err = os.Mkdir(zonePath, 755) + if err != nil { + log.Fatal(err) + } + + initialWatts := strconv.FormatUint(maxWattage, 10) + + err = ioutil.WriteFile(filepath.Join(zonePath, maxPowerFileLongWindow), []byte(initialWatts), 0444) + if err != nil { + log.Fatal(err) + } + + err = ioutil.WriteFile(filepath.Join(zonePath, powerLimitFileLongWindow), []byte(initialWatts), 0644) + if err != nil { + log.Fatal(err) + } 
+ + os.Exit(m.Run()) +} + +// TODO(rdelvalle): Add tests where capping fails +func TestCapNode(t *testing.T) { + capped, failed, err := capNode(raplDir, 95) + assert.NoError(t, err) + assert.Len(t, capped, 1) + assert.Nil(t, failed) + + t.Run("bad-percentage", func(t *testing.T) { + capped, failed, err := capNode(raplDir, 1000) + assert.Error(t, err) + assert.Nil(t, capped) + assert.Nil(t, failed) + }) + + t.Run("zero-percent", func(t *testing.T) { + capped, failed, err := capNode(raplDir, 0) + assert.Error(t, err) + assert.Nil(t, capped) + assert.Nil(t, failed) + }) +} + +func TestMaxPower(t *testing.T) { + maxFile := filepath.Join(raplDir, raplPrefixCPU+":0", maxPowerFileLongWindow) + + maxWatts, err := maxPower(maxFile) + assert.NoError(t, err) + assert.Equal(t, maxWattage, maxWatts) + + t.Run("name-does-not-exist", func(t *testing.T) { + _, err := maxPower("madeupname") + assert.Error(t, err) + }) +} + +func TestCapZone(t *testing.T) { + const percentage float64 = .50 + + powercap := uint64(math.Ceil(float64(maxWattage) * percentage)) + limitFile := filepath.Join(raplDir, raplPrefixCPU+":0", powerLimitFileLongWindow) + err := capZone(limitFile, powercap) + assert.NoError(t, err) + + newCap, err := currentCap(limitFile) + assert.NoError(t, err) + assert.Equal(t, powercap, newCap) + + t.Run("name-does-not-exist", func(t *testing.T) { + err := capZone("madeupname", powercap) + assert.Error(t, err) + }) +} From 73a184b8a8465035e1e1007e45e89a8ac8ab815f Mon Sep 17 00:00:00 2001 From: "Renan I. Del Valle" Date: Mon, 20 Jan 2020 23:47:38 -0800 Subject: [PATCH 2/2] Revert "Rapl node capping daemon (#21)" (#22) This reverts commit e76c1ae9723bd064993027428fd9ba61a02db7b0 as the rapl-daemon will exist in its own repository. 
--- go.sum | 1 - rapl-daemon/README.md | 39 -------------- rapl-daemon/main.go | 60 --------------------- rapl-daemon/util.go | 110 --------------------------------------- rapl-daemon/util_test.go | 101 ----------------------------------- 5 files changed, 311 deletions(-) delete mode 100644 rapl-daemon/README.md delete mode 100644 rapl-daemon/main.go delete mode 100644 rapl-daemon/util.go delete mode 100644 rapl-daemon/util_test.go diff --git a/go.sum b/go.sum index 411d53c..f2fecd3 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,5 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= diff --git a/rapl-daemon/README.md b/rapl-daemon/README.md deleted file mode 100644 index 415a31c..0000000 --- a/rapl-daemon/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# RAPL Daemon - -This runs a server that is capable of changing the percentage at which -a node is being throttled to using RAPL. This daemon should be installed -on all worker nodes. - -### Sample payload for testing: -``` - curl --header "Content-Type: application/json" \ - --request POST \ - --data '{"percentage":75}' \ - http://localhost:9090/powercap - ``` - -### Payload - -```json -{ - "percentage":75 -} -``` - -### Response - -The daemon will respond with a json payload containing zones that were -successfully capped as well as the zones that were not capped. - -```json -{ - "cappedZones": null, - "failedZones": [ - "intel-rapl:0", - "intel-rapl:1" - ], - "error": "some zones were not able to be powercapped" -} -``` - -Field error will not exist if failed zones is empty. 
\ No newline at end of file diff --git a/rapl-daemon/main.go b/rapl-daemon/main.go deleted file mode 100644 index e88d3eb..0000000 --- a/rapl-daemon/main.go +++ /dev/null @@ -1,60 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - "html" - "log" - "net/http" -) - -const powercapDir = "/sys/class/powercap/" - -// Cap is a payload that is expected from Elektron to cap a node. -type Cap struct { - Percentage int -} - -// CapResponse is the payload sent with information about the capping call -type CapResponse struct { - CappedZones []string `json:"cappedZones"` - FailedZones []string `json:"failedZones"` - Error *string `json:"error"` -} - -func main() { - http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { - fmt.Fprintf(w, "Unsupported endpoint %s", html.EscapeString(r.URL.Path)) - }) - - http.HandleFunc("/powercap", powercapEndpoint) - log.Fatal(http.ListenAndServe(":9090", nil)) -} - -// Handler for the powercapping HTTP API endpoint. -func powercapEndpoint(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - - var payload Cap - var response CapResponse - - decoder := json.NewDecoder(r.Body) - err := decoder.Decode(&payload) - if err != nil { - errorMsg := "error parsing payload: " + err.Error() - response.Error = &errorMsg - json.NewEncoder(w).Encode(response) - return - } - - cappedZones, failedZones, err := capNode(powercapDir, payload.Percentage) - if err != nil { - errorMsg := err.Error() - response.Error = &errorMsg - } - - response.CappedZones = cappedZones - response.FailedZones = failedZones - - json.NewEncoder(w).Encode(response) -} diff --git a/rapl-daemon/util.go b/rapl-daemon/util.go deleted file mode 100644 index daf8dff..0000000 --- a/rapl-daemon/util.go +++ /dev/null @@ -1,110 +0,0 @@ -package main - -import ( - "fmt" - "io/ioutil" - "math" - "os" - "path/filepath" - "strconv" - "strings" -) - -const raplPrefixCPU = "intel-rapl" - -// constraint_0 is usually the longer window 
while constraint_1 is usually the shorter window -const maxPowerFileLongWindow = "constraint_0_max_power_uw" -const powerLimitFileLongWindow = "constraint_0_power_limit_uw" - -// capNode uses pseudo files made available by the Linux kernel -// in order to capNode CPU power. More information is available at: -// https://www.kernel.org/doc/html/latest/power/powercap/powercap.html -func capNode(base string, percentage int) ([]string, []string, error) { - - if percentage <= 0 || percentage > 100 { - return nil, nil, fmt.Errorf("cap percentage must be between 0 (non-inclusive) and 100 (inclusive): %d", percentage) - } - - files, err := ioutil.ReadDir(base) - if err != nil { - return nil, nil, err - } - - var capped, failed []string - for _, file := range files { - fields := strings.Split(file.Name(), ":") - - // Fields should be in the form intel-rapl:X where X is the power zone - // We ignore sub-zones which follow the form intel-rapl:X:Y - if len(fields) != 2 { - continue - } - - if fields[0] == raplPrefixCPU { - maxPower, err := maxPower(filepath.Join(base, file.Name(), maxPowerFileLongWindow)) - if err != nil { - failed = append(failed, file.Name()) - fmt.Println("unable to retreive max power for zone ", err) - continue - } - - // We use floats to mitigate the possibility of an integer overflow. - powercap := uint64(math.Ceil(float64(maxPower) * (float64(percentage) / 100))) - - if err := capZone(filepath.Join(base, file.Name(), powerLimitFileLongWindow), powercap); err != nil { - failed = append(failed, file.Name()) - fmt.Println("unable to write powercap value: ", err) - continue - } - capped = append(capped, file.Name()) - } - } - - if len(failed) > 0 { - return capped, failed, fmt.Errorf("some zones were not able to be powercapped") - } - - return capped, nil, nil -} - -// maxPower returns the value in float of the maximum watts a power zone can use. 
-func maxPower(maxFile string) (uint64, error) { - maxPower, err := ioutil.ReadFile(maxFile) - if err != nil { - return 0, err - } - - maxPoweruW, err := strconv.ParseUint(strings.TrimSpace(string(maxPower)), 10, 64) - if err != nil { - return 0, err - } - - return maxPoweruW, nil -} - -// capZone caps a power zone to a specific amount of watts specified by value -func capZone(limitFile string, value uint64) error { - if _, err := os.Stat(limitFile); os.IsNotExist(err) { - return err - } - - err := ioutil.WriteFile(limitFile, []byte(strconv.FormatUint(value, 10)), 0644) - if err != nil { - return err - } - return nil -} - -func currentCap(limit string) (uint64, error) { - powercap, err := ioutil.ReadFile(limit) - if err != nil { - return 0, err - } - - powercapuW, err := strconv.ParseUint(strings.TrimSpace(string(powercap)), 10, 64) - if err != nil { - return 0, err - } - - return powercapuW, nil -} diff --git a/rapl-daemon/util_test.go b/rapl-daemon/util_test.go deleted file mode 100644 index 58065bd..0000000 --- a/rapl-daemon/util_test.go +++ /dev/null @@ -1,101 +0,0 @@ -package main - -import ( - "io/ioutil" - "log" - "math" - "os" - "path/filepath" - "strconv" - "testing" - - "github.com/stretchr/testify/assert" -) - -var raplDir string - -const maxWattage uint64 = 1500000 - -func TestMain(m *testing.M) { - var err error - raplDir, err = ioutil.TempDir("", raplPrefixCPU) - if err != nil { - log.Fatal(err) - } - - defer os.RemoveAll(raplDir) - - // Create temporary directory that mocks powercap subsytem - zonePath := filepath.Join(raplDir, raplPrefixCPU+":0") - err = os.Mkdir(zonePath, 755) - if err != nil { - log.Fatal(err) - } - - initialWatts := strconv.FormatUint(maxWattage, 10) - - err = ioutil.WriteFile(filepath.Join(zonePath, maxPowerFileLongWindow), []byte(initialWatts), 0444) - if err != nil { - log.Fatal(err) - } - - err = ioutil.WriteFile(filepath.Join(zonePath, powerLimitFileLongWindow), []byte(initialWatts), 0644) - if err != nil { - log.Fatal(err) 
- } - - os.Exit(m.Run()) -} - -// TODO(rdelvalle): Add tests where capping fails -func TestCapNode(t *testing.T) { - capped, failed, err := capNode(raplDir, 95) - assert.NoError(t, err) - assert.Len(t, capped, 1) - assert.Nil(t, failed) - - t.Run("bad-percentage", func(t *testing.T) { - capped, failed, err := capNode(raplDir, 1000) - assert.Error(t, err) - assert.Nil(t, capped) - assert.Nil(t, failed) - }) - - t.Run("zero-percent", func(t *testing.T) { - capped, failed, err := capNode(raplDir, 0) - assert.Error(t, err) - assert.Nil(t, capped) - assert.Nil(t, failed) - }) -} - -func TestMaxPower(t *testing.T) { - maxFile := filepath.Join(raplDir, raplPrefixCPU+":0", maxPowerFileLongWindow) - - maxWatts, err := maxPower(maxFile) - assert.NoError(t, err) - assert.Equal(t, maxWattage, maxWatts) - - t.Run("name-does-not-exist", func(t *testing.T) { - _, err := maxPower("madeupname") - assert.Error(t, err) - }) -} - -func TestCapZone(t *testing.T) { - const percentage float64 = .50 - - powercap := uint64(math.Ceil(float64(maxWattage) * percentage)) - limitFile := filepath.Join(raplDir, raplPrefixCPU+":0", powerLimitFileLongWindow) - err := capZone(limitFile, powercap) - assert.NoError(t, err) - - newCap, err := currentCap(limitFile) - assert.NoError(t, err) - assert.Equal(t, powercap, newCap) - - t.Run("name-does-not-exist", func(t *testing.T) { - err := capZone("madeupname", powercap) - assert.Error(t, err) - }) -}