diff --git a/go.sum b/go.sum index f2fecd3..411d53c 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,6 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= diff --git a/rapl-daemon/README.md b/rapl-daemon/README.md new file mode 100644 index 0000000..415a31c --- /dev/null +++ b/rapl-daemon/README.md @@ -0,0 +1,39 @@ +# RAPL Daemon + +This runs a server that is capable of changing the percentage at which +a node is being throttled to using RAPL. This daemon should be installed +on all worker nodes. + +### Sample payload for testing: +``` + curl --header "Content-Type: application/json" \ + --request POST \ + --data '{"percentage":75}' \ + http://localhost:9090/powercap + ``` + +### Payload + +```json +{ + "percentage":75 +} +``` + +### Response + +The daemon will respond with a json payload containing zones that were +successfully capped as well as the zones that were not capped. + +```json +{ + "cappedZones": null, + "failedZones": [ + "intel-rapl:0", + "intel-rapl:1" + ], + "error": "some zones were not able to be powercapped" +} +``` + +Field error will not exist if failed zones is empty. \ No newline at end of file diff --git a/rapl-daemon/main.go b/rapl-daemon/main.go new file mode 100644 index 0000000..e88d3eb --- /dev/null +++ b/rapl-daemon/main.go @@ -0,0 +1,60 @@ +package main + +import ( + "encoding/json" + "fmt" + "html" + "log" + "net/http" +) + +const powercapDir = "/sys/class/powercap/" + +// Cap is a payload that is expected from Elektron to cap a node. 
+type Cap struct {
+	// Percentage is the share of each zone's maximum power to allow; capNode
+	// accepts values greater than 0 and up to 100.
+	Percentage int `json:"percentage"`
+}
+
+// CapResponse is the payload sent with information about the capping call.
+type CapResponse struct {
+	// CappedZones lists the zones that were capped.
+	CappedZones []string `json:"cappedZones"`
+	// FailedZones lists the zones that could not be capped.
+	FailedZones []string `json:"failedZones"`
+	// Error describes what went wrong. It is left out of the JSON when nil.
+	Error *string `json:"error,omitempty"`
+}
+
+// main registers the HTTP handlers and serves on port 9090 until the server
+// fails.
+func main() {
+	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
+		fmt.Fprintf(w, "Unsupported endpoint %s", html.EscapeString(r.URL.Path))
+	})
+
+	http.HandleFunc("/powercap", powercapEndpoint)
+	log.Fatal(http.ListenAndServe(":9090", nil))
+}
+
+// powercapEndpoint handles the powercapping HTTP API endpoint. It decodes a
+// Cap payload from the request body, asks capNode to apply it to the zones
+// under powercapDir and writes a CapResponse as JSON. A body that cannot be
+// decoded is answered with 400 Bad Request. Errors from capNode are reported
+// through the error field next to the zones that were and were not capped.
+func powercapEndpoint(w http.ResponseWriter, r *http.Request) {
+	w.Header().Set("Content-Type", "application/json")
+
+	var payload Cap
+	var response CapResponse
+
+	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
+		errorMsg := "error parsing payload: " + err.Error()
+		response.Error = &errorMsg
+		// The status must be written before the body.
+		w.WriteHeader(http.StatusBadRequest)
+		writeCapResponse(w, response)
+		return
+	}
+
+	cappedZones, failedZones, err := capNode(powercapDir, payload.Percentage)
+	if err != nil {
+		errorMsg := err.Error()
+		response.Error = &errorMsg
+	}
+
+	response.CappedZones = cappedZones
+	response.FailedZones = failedZones
+
+	writeCapResponse(w, response)
+}
+
+// writeCapResponse encodes response as JSON onto w and logs an encoding or
+// write failure, since nothing more can be sent to the client at that point.
+func writeCapResponse(w http.ResponseWriter, response CapResponse) {
+	if err := json.NewEncoder(w).Encode(response); err != nil {
+		log.Printf("unable to write powercap response: %v", err)
+	}
+}
diff --git a/rapl-daemon/util.go b/rapl-daemon/util.go
new file mode 100644
index 0000000..daf8dff
--- /dev/null
+++ b/rapl-daemon/util.go
@@ -0,0 +1,110 @@
+package main
+
+import (
+	"fmt"
+	"io/ioutil"
+	"math"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+const raplPrefixCPU = "intel-rapl"
+
+// constraint_0 is usually the longer window while constraint_1 is usually the shorter window
+const maxPowerFileLongWindow = "constraint_0_max_power_uw"
+const powerLimitFileLongWindow = "constraint_0_power_limit_uw"
+
+// capNode uses pseudo files made available by the Linux kernel
+// in order to capNode CPU power.
+// More information is available at:
+// https://www.kernel.org/doc/html/latest/power/powercap/powercap.html
+//
+// base is the powercap directory to scan and percentage is the share of each
+// zone's maximum power to allow; it must be greater than 0 and at most 100.
+// capNode returns the names of the zones that were capped, the names of the
+// zones that could not be capped, and a non-nil error when the percentage is
+// out of range, base cannot be read, or at least one zone failed.
+func capNode(base string, percentage int) ([]string, []string, error) {
+
+	if percentage <= 0 || percentage > 100 {
+		return nil, nil, fmt.Errorf("cap percentage must be between 0 (non-inclusive) and 100 (inclusive): %d", percentage)
+	}
+
+	files, err := ioutil.ReadDir(base)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	var capped, failed []string
+	for _, file := range files {
+		zone := file.Name()
+
+		// Fields should be in the form intel-rapl:X where X is the power zone
+		// We ignore sub-zones which follow the form intel-rapl:X:Y
+		fields := strings.Split(zone, ":")
+		if len(fields) != 2 || fields[0] != raplPrefixCPU {
+			continue
+		}
+
+		zoneMax, err := maxPower(filepath.Join(base, zone, maxPowerFileLongWindow))
+		if err != nil {
+			failed = append(failed, zone)
+			fmt.Printf("unable to retrieve max power for zone %s: %v\n", zone, err)
+			continue
+		}
+
+		// We use floats to mitigate the possibility of an integer overflow.
+		powercap := uint64(math.Ceil(float64(zoneMax) * (float64(percentage) / 100)))
+
+		if err := capZone(filepath.Join(base, zone, powerLimitFileLongWindow), powercap); err != nil {
+			failed = append(failed, zone)
+			fmt.Printf("unable to write powercap value for zone %s: %v\n", zone, err)
+			continue
+		}
+		capped = append(capped, zone)
+	}
+
+	if len(failed) > 0 {
+		return capped, failed, fmt.Errorf("some zones were not able to be powercapped")
+	}
+
+	return capped, nil, nil
+}
+
+// maxPower reads maxFile and returns the maximum power a power zone can use
+// as an unsigned integer (microwatts, going by the "_uw" suffix of the file).
+func maxPower(maxFile string) (uint64, error) {
+	return readUintFile(maxFile)
+}
+
+// capZone caps a power zone by writing value, formatted as a base-10 integer,
+// to limitFile. The value is in the unit of the limit file (microwatts, going
+// by its "_uw" suffix). The file is opened without O_CREATE so that a missing
+// limit file is reported as an error instead of being created.
+func capZone(limitFile string, value uint64) error {
+	f, err := os.OpenFile(limitFile, os.O_WRONLY|os.O_TRUNC, 0)
+	if err != nil {
+		return err
+	}
+
+	_, err = f.WriteString(strconv.FormatUint(value, 10))
+	// Always close the file, but let a write error take precedence.
+	if closeErr := f.Close(); err == nil {
+		err = closeErr
+	}
+	return err
+}
+
+// currentCap reads limit and returns the power cap currently stored in it.
+func currentCap(limit string) (uint64, error) {
+	return readUintFile(limit)
+}
+
+// readUintFile reads the file at path and parses its contents, with
+// surrounding whitespace trimmed, as a base-10 unsigned 64-bit integer.
+func readUintFile(path string) (uint64, error) {
+	raw, err := ioutil.ReadFile(path)
+	if err != nil {
+		return 0, err
+	}
+
+	value, err := strconv.ParseUint(strings.TrimSpace(string(raw)), 10, 64)
+	if err != nil {
+		return 0, err
+	}
+
+	return value, nil
+}
diff --git a/rapl-daemon/util_test.go b/rapl-daemon/util_test.go
new file mode 100644
index 0000000..58065bd
--- /dev/null
+++ b/rapl-daemon/util_test.go
@@ -0,0 +1,101 @@
+package main
+
+import (
+	"io/ioutil"
+	"log"
+	"math"
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// raplDir is the temporary directory that mocks the powercap subsystem.
+var raplDir string
+
+// maxWattage is the value written to the mock zone's max power and power
+// limit files.
+const maxWattage uint64 = 1500000
+
+// TestMain runs the tests against a mock powercap tree and exits with their
+// status code.
+func TestMain(m *testing.M) {
+	os.Exit(runTests(m))
+}
+
+// runTests creates the mock powercap tree, runs the tests and returns their
+// exit code. It is separate from TestMain so that the deferred cleanup runs
+// before os.Exit is called; os.Exit and log.Fatal both skip deferred calls.
+func runTests(m *testing.M) int {
+	var err error
+	raplDir, err = ioutil.TempDir("", raplPrefixCPU)
+	if err != nil {
+		log.Print(err)
+		return 1
+	}
+
+	defer os.RemoveAll(raplDir)
+
+	// Create temporary directory that mocks powercap subsystem
+	zonePath := filepath.Join(raplDir, raplPrefixCPU+":0")
+	// The mode has to be an octal literal: decimal 755 is a different set of
+	// permission bits.
+	err = os.Mkdir(zonePath, 0755)
+	if err != nil {
+		log.Print(err)
+		return 1
+	}
+
+	initialWatts := strconv.FormatUint(maxWattage, 10)
+
+	err = ioutil.WriteFile(filepath.Join(zonePath, maxPowerFileLongWindow), []byte(initialWatts), 0444)
+	if err != nil {
+		log.Print(err)
+		return 1
+	}
+
+	err = ioutil.WriteFile(filepath.Join(zonePath, powerLimitFileLongWindow), []byte(initialWatts), 0644)
+	if err != nil {
+		log.Print(err)
+		return 1
+	}
+
+	return m.Run()
+}
+
+// TODO(rdelvalle): Add tests where writing the power limit fails
+func TestCapNode(t *testing.T) {
+	capped, failed, err := capNode(raplDir, 95)
+	assert.NoError(t, err)
+	assert.Len(t, capped, 1)
+	assert.Nil(t, failed)
+
+	t.Run("bad-percentage", func(t *testing.T) {
+		capped, failed, err := capNode(raplDir, 1000)
+		assert.Error(t, err)
+		assert.Nil(t, capped)
+		assert.Nil(t, failed)
+	})
+
+	t.Run("zero-percent", func(t *testing.T) {
+		capped, failed, err := capNode(raplDir, 0)
+		assert.Error(t, err)
+		assert.Nil(t, capped)
+		assert.Nil(t, failed)
+	})
+
+	// A zone directory without its max power file cannot be capped.
+	t.Run("missing-zone-files", func(t *testing.T) {
+		dir, err := ioutil.TempDir("", raplPrefixCPU)
+		assert.NoError(t, err)
+		defer os.RemoveAll(dir)
+
+		assert.NoError(t, os.Mkdir(filepath.Join(dir, raplPrefixCPU+":0"), 0755))
+
+		capped, failed, err := capNode(dir, 50)
+		assert.Error(t, err)
+		assert.Nil(t, capped)
+		assert.Equal(t, []string{raplPrefixCPU + ":0"}, failed)
+	})
+}
+
+func TestMaxPower(t *testing.T) {
+	maxFile := filepath.Join(raplDir, raplPrefixCPU+":0", maxPowerFileLongWindow)
+
+	maxWatts, err := maxPower(maxFile)
+	assert.NoError(t, err)
+	assert.Equal(t, maxWattage, maxWatts)
+
+	t.Run("name-does-not-exist", func(t *testing.T) {
+		_, err := maxPower("madeupname")
+		assert.Error(t, err)
+	})
+}
+
+func TestCapZone(t *testing.T) {
+	const percentage float64 = .50
+
+	powercap := uint64(math.Ceil(float64(maxWattage) * percentage))
+	limitFile := filepath.Join(raplDir, raplPrefixCPU+":0", powerLimitFileLongWindow)
+	err := capZone(limitFile, powercap)
+	assert.NoError(t, err)
+
+	newCap, err := currentCap(limitFile)
+	assert.NoError(t, err)
+	assert.Equal(t, powercap, newCap)
+
+	t.Run("name-does-not-exist", func(t *testing.T) {
+		err := capZone("madeupname", powercap)
+		assert.Error(t, err)
+	})
+}