Moved schedulers from the main programs to schedulers package. Can now choose different scheduelrs to use. Work on code sharing between schedulers remains to be done.
This commit is contained in:
parent
87892ba13b
commit
fce62981da
9 changed files with 575 additions and 323 deletions
241
schedulers/binpackwatts.go
Normal file
241
schedulers/binpackwatts.go
Normal file
|
@ -0,0 +1,241 @@
|
|||
package schedulers
|
||||
|
||||
import (
|
||||
"bitbucket.org/bingcloud/electron/def"
|
||||
"fmt"
|
||||
"github.com/golang/protobuf/proto"
|
||||
mesos "github.com/mesos/mesos-go/mesosproto"
|
||||
"github.com/mesos/mesos-go/mesosutil"
|
||||
sched "github.com/mesos/mesos-go/scheduler"
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Decides if to take an offer or not
|
||||
func (*BinPackWatts) takeOffer(offer *mesos.Offer, task def.Task) bool {
|
||||
|
||||
cpus, mem, watts := OfferAgg(offer)
|
||||
|
||||
//TODO: Insert watts calculation here instead of taking them as a parameter
|
||||
|
||||
if cpus >= task.CPU && mem >= task.RAM && watts >= task.Watts {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
type BinPackWatts struct {
|
||||
tasksCreated int
|
||||
tasksRunning int
|
||||
tasks []def.Task
|
||||
metrics map[string]def.Metric
|
||||
running map[string]map[string]bool
|
||||
ignoreWatts bool
|
||||
|
||||
// First set of PCP values are garbage values, signal to logger to start recording when we're
|
||||
// about to schedule a new task
|
||||
RecordPCP bool
|
||||
|
||||
// This channel is closed when the program receives an interrupt,
|
||||
// signalling that the program should shut down.
|
||||
Shutdown chan struct{}
|
||||
// This channel is closed after shutdown is closed, and only when all
|
||||
// outstanding tasks have been cleaned up
|
||||
Done chan struct{}
|
||||
|
||||
// Controls when to shutdown pcp logging
|
||||
PCPLog chan struct{}
|
||||
}
|
||||
|
||||
// New electron scheduler
|
||||
func NewBinPackWatts(tasks []def.Task, ignoreWatts bool) *BinPackWatts {
|
||||
|
||||
s := &BinPackWatts{
|
||||
tasks: tasks,
|
||||
ignoreWatts: ignoreWatts,
|
||||
Shutdown: make(chan struct{}),
|
||||
Done: make(chan struct{}),
|
||||
PCPLog: make(chan struct{}),
|
||||
running: make(map[string]map[string]bool),
|
||||
RecordPCP: false,
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func (s *BinPackWatts) newTask(offer *mesos.Offer, task def.Task) *mesos.TaskInfo {
|
||||
taskName := fmt.Sprintf("%s-%d", task.Name, *task.Instances)
|
||||
s.tasksCreated++
|
||||
|
||||
if !s.RecordPCP {
|
||||
// Turn on logging
|
||||
s.RecordPCP = true
|
||||
time.Sleep(1 * time.Second) // Make sure we're recording by the time the first task starts
|
||||
}
|
||||
|
||||
// If this is our first time running into this Agent
|
||||
if _, ok := s.running[offer.GetSlaveId().GoString()]; !ok {
|
||||
s.running[offer.GetSlaveId().GoString()] = make(map[string]bool)
|
||||
}
|
||||
|
||||
// Add task to list of tasks running on node
|
||||
s.running[offer.GetSlaveId().GoString()][taskName] = true
|
||||
|
||||
resources := []*mesos.Resource{
|
||||
mesosutil.NewScalarResource("cpus", task.CPU),
|
||||
mesosutil.NewScalarResource("mem", task.RAM),
|
||||
}
|
||||
|
||||
if !s.ignoreWatts {
|
||||
resources = append(resources, mesosutil.NewScalarResource("watts", task.Watts))
|
||||
}
|
||||
|
||||
return &mesos.TaskInfo{
|
||||
Name: proto.String(taskName),
|
||||
TaskId: &mesos.TaskID{
|
||||
Value: proto.String("electron-" + taskName),
|
||||
},
|
||||
SlaveId: offer.SlaveId,
|
||||
Resources: resources,
|
||||
Command: &mesos.CommandInfo{
|
||||
Value: proto.String(task.CMD),
|
||||
},
|
||||
Container: &mesos.ContainerInfo{
|
||||
Type: mesos.ContainerInfo_DOCKER.Enum(),
|
||||
Docker: &mesos.ContainerInfo_DockerInfo{
|
||||
Image: proto.String(task.Image),
|
||||
Network: mesos.ContainerInfo_DockerInfo_BRIDGE.Enum(), // Run everything isolated
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *BinPackWatts) Registered(
|
||||
_ sched.SchedulerDriver,
|
||||
frameworkID *mesos.FrameworkID,
|
||||
masterInfo *mesos.MasterInfo) {
|
||||
log.Printf("Framework %s registered with master %s", frameworkID, masterInfo)
|
||||
}
|
||||
|
||||
func (s *BinPackWatts) Reregistered(_ sched.SchedulerDriver, masterInfo *mesos.MasterInfo) {
|
||||
log.Printf("Framework re-registered with master %s", masterInfo)
|
||||
}
|
||||
|
||||
func (s *BinPackWatts) Disconnected(sched.SchedulerDriver) {
|
||||
log.Println("Framework disconnected with master")
|
||||
}
|
||||
|
||||
func (s *BinPackWatts) ResourceOffers(driver sched.SchedulerDriver, offers []*mesos.Offer) {
|
||||
log.Printf("Received %d resource offers", len(offers))
|
||||
|
||||
for _, offer := range offers {
|
||||
select {
|
||||
case <-s.Shutdown:
|
||||
log.Println("Done scheduling tasks: declining offer on [", offer.GetHostname(), "]")
|
||||
driver.DeclineOffer(offer.Id, longFilter)
|
||||
|
||||
log.Println("Number of tasks still running: ", s.tasksRunning)
|
||||
continue
|
||||
default:
|
||||
}
|
||||
|
||||
tasks := []*mesos.TaskInfo{}
|
||||
|
||||
// First fit strategy
|
||||
|
||||
taken := false
|
||||
for i, task := range s.tasks {
|
||||
|
||||
// Check host if it exists
|
||||
if task.Host != "" {
|
||||
// Don't take offer if it doesn't match our task's host requirement
|
||||
if !strings.HasPrefix(*offer.Hostname, task.Host) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Decision to take the offer or not
|
||||
if s.takeOffer(offer, task) {
|
||||
|
||||
log.Println("Co-Located with: ")
|
||||
coLocated(s.running[offer.GetSlaveId().GoString()])
|
||||
|
||||
tasks = append(tasks, s.newTask(offer, task))
|
||||
|
||||
log.Printf("Starting %s on [%s]\n", task.Name, offer.GetHostname())
|
||||
driver.LaunchTasks([]*mesos.OfferID{offer.Id}, tasks, defaultFilter)
|
||||
|
||||
taken = true
|
||||
|
||||
fmt.Println("Inst: ", *task.Instances)
|
||||
*task.Instances--
|
||||
|
||||
if *task.Instances <= 0 {
|
||||
// All instances of task have been scheduled, remove it
|
||||
s.tasks[i] = s.tasks[len(s.tasks)-1]
|
||||
s.tasks = s.tasks[:len(s.tasks)-1]
|
||||
|
||||
if len(s.tasks) <= 0 {
|
||||
log.Println("Done scheduling all tasks")
|
||||
close(s.Shutdown)
|
||||
}
|
||||
}
|
||||
break // Offer taken, move on
|
||||
}
|
||||
}
|
||||
|
||||
// If there was no match for the task
|
||||
if !taken {
|
||||
fmt.Println("There is not enough resources to launch a task:")
|
||||
cpus, mem, watts := OfferAgg(offer)
|
||||
|
||||
log.Printf("<CPU: %f, RAM: %f, Watts: %f>\n", cpus, mem, watts)
|
||||
driver.DeclineOffer(offer.Id, defaultFilter)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func (s *BinPackWatts) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
|
||||
log.Printf("Received task status [%s] for task [%s]", NameFor(status.State), *status.TaskId.Value)
|
||||
|
||||
if *status.State == mesos.TaskState_TASK_RUNNING {
|
||||
s.tasksRunning++
|
||||
} else if IsTerminal(status.State) {
|
||||
delete(s.running[status.GetSlaveId().GoString()], *status.TaskId.Value)
|
||||
s.tasksRunning--
|
||||
if s.tasksRunning == 0 {
|
||||
select {
|
||||
case <-s.Shutdown:
|
||||
close(s.Done)
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
log.Printf("DONE: Task status [%s] for task [%s]", NameFor(status.State), *status.TaskId.Value)
|
||||
}
|
||||
|
||||
func (s *BinPackWatts) FrameworkMessage(
|
||||
driver sched.SchedulerDriver,
|
||||
executorID *mesos.ExecutorID,
|
||||
slaveID *mesos.SlaveID,
|
||||
message string) {
|
||||
|
||||
log.Println("Getting a framework message: ", message)
|
||||
log.Printf("Received a framework message from some unknown source: %s", *executorID.Value)
|
||||
}
|
||||
|
||||
func (s *BinPackWatts) OfferRescinded(_ sched.SchedulerDriver, offerID *mesos.OfferID) {
|
||||
log.Printf("Offer %s rescinded", offerID)
|
||||
}
|
||||
func (s *BinPackWatts) SlaveLost(_ sched.SchedulerDriver, slaveID *mesos.SlaveID) {
|
||||
log.Printf("Slave %s lost", slaveID)
|
||||
}
|
||||
func (s *BinPackWatts) ExecutorLost(_ sched.SchedulerDriver, executorID *mesos.ExecutorID, slaveID *mesos.SlaveID, status int) {
|
||||
log.Printf("Executor %s on slave %s was lost", executorID, slaveID)
|
||||
}
|
||||
|
||||
func (s *BinPackWatts) Error(_ sched.SchedulerDriver, err string) {
|
||||
log.Printf("Receiving an error: %s", err)
|
||||
}
|
242
schedulers/firstfit.go
Normal file
242
schedulers/firstfit.go
Normal file
|
@ -0,0 +1,242 @@
|
|||
package schedulers
|
||||
|
||||
import (
|
||||
"bitbucket.org/bingcloud/electron/def"
|
||||
"fmt"
|
||||
"github.com/golang/protobuf/proto"
|
||||
mesos "github.com/mesos/mesos-go/mesosproto"
|
||||
"github.com/mesos/mesos-go/mesosutil"
|
||||
sched "github.com/mesos/mesos-go/scheduler"
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Decides if to take an offer or not
|
||||
func (*FirstFit) takeOffer(offer *mesos.Offer, task def.Task) bool {
|
||||
|
||||
cpus, mem, watts := OfferAgg(offer)
|
||||
|
||||
//TODO: Insert watts calculation here instead of taking them as a parameter
|
||||
|
||||
if cpus >= task.CPU && mem >= task.RAM && watts >= task.Watts {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// electronScheduler implements the Scheduler interface
|
||||
type FirstFit struct {
|
||||
tasksCreated int
|
||||
tasksRunning int
|
||||
tasks []def.Task
|
||||
metrics map[string]def.Metric
|
||||
running map[string]map[string]bool
|
||||
ignoreWatts bool
|
||||
|
||||
// First set of PCP values are garbage values, signal to logger to start recording when we're
|
||||
// about to schedule a new task
|
||||
RecordPCP bool
|
||||
|
||||
// This channel is closed when the program receives an interrupt,
|
||||
// signalling that the program should shut down.
|
||||
Shutdown chan struct{}
|
||||
// This channel is closed after shutdown is closed, and only when all
|
||||
// outstanding tasks have been cleaned up
|
||||
Done chan struct{}
|
||||
|
||||
// Controls when to shutdown pcp logging
|
||||
PCPLog chan struct{}
|
||||
}
|
||||
|
||||
// New electron scheduler
|
||||
func NewFirstFit(tasks []def.Task, ignoreWatts bool) *FirstFit {
|
||||
|
||||
s := &FirstFit{
|
||||
tasks: tasks,
|
||||
ignoreWatts: ignoreWatts,
|
||||
Shutdown: make(chan struct{}),
|
||||
Done: make(chan struct{}),
|
||||
PCPLog: make(chan struct{}),
|
||||
running: make(map[string]map[string]bool),
|
||||
RecordPCP: false,
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func (s *FirstFit) newTask(offer *mesos.Offer, task def.Task) *mesos.TaskInfo {
|
||||
taskName := fmt.Sprintf("%s-%d", task.Name, *task.Instances)
|
||||
s.tasksCreated++
|
||||
|
||||
if !s.RecordPCP {
|
||||
// Turn on logging
|
||||
s.RecordPCP = true
|
||||
time.Sleep(1 * time.Second) // Make sure we're recording by the time the first task starts
|
||||
}
|
||||
|
||||
// If this is our first time running into this Agent
|
||||
if _, ok := s.running[offer.GetSlaveId().GoString()]; !ok {
|
||||
s.running[offer.GetSlaveId().GoString()] = make(map[string]bool)
|
||||
}
|
||||
|
||||
// Add task to list of tasks running on node
|
||||
s.running[offer.GetSlaveId().GoString()][taskName] = true
|
||||
|
||||
resources := []*mesos.Resource{
|
||||
mesosutil.NewScalarResource("cpus", task.CPU),
|
||||
mesosutil.NewScalarResource("mem", task.RAM),
|
||||
}
|
||||
|
||||
if !s.ignoreWatts {
|
||||
resources = append(resources, mesosutil.NewScalarResource("watts", task.Watts))
|
||||
}
|
||||
|
||||
return &mesos.TaskInfo{
|
||||
Name: proto.String(taskName),
|
||||
TaskId: &mesos.TaskID{
|
||||
Value: proto.String("electron-" + taskName),
|
||||
},
|
||||
SlaveId: offer.SlaveId,
|
||||
Resources: resources,
|
||||
Command: &mesos.CommandInfo{
|
||||
Value: proto.String(task.CMD),
|
||||
},
|
||||
Container: &mesos.ContainerInfo{
|
||||
Type: mesos.ContainerInfo_DOCKER.Enum(),
|
||||
Docker: &mesos.ContainerInfo_DockerInfo{
|
||||
Image: proto.String(task.Image),
|
||||
Network: mesos.ContainerInfo_DockerInfo_BRIDGE.Enum(), // Run everything isolated
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *FirstFit) Registered(
|
||||
_ sched.SchedulerDriver,
|
||||
frameworkID *mesos.FrameworkID,
|
||||
masterInfo *mesos.MasterInfo) {
|
||||
log.Printf("Framework %s registered with master %s", frameworkID, masterInfo)
|
||||
}
|
||||
|
||||
func (s *FirstFit) Reregistered(_ sched.SchedulerDriver, masterInfo *mesos.MasterInfo) {
|
||||
log.Printf("Framework re-registered with master %s", masterInfo)
|
||||
}
|
||||
|
||||
func (s *FirstFit) Disconnected(sched.SchedulerDriver) {
|
||||
log.Println("Framework disconnected with master")
|
||||
}
|
||||
|
||||
func (s *FirstFit) ResourceOffers(driver sched.SchedulerDriver, offers []*mesos.Offer) {
|
||||
log.Printf("Received %d resource offers", len(offers))
|
||||
|
||||
for _, offer := range offers {
|
||||
select {
|
||||
case <-s.Shutdown:
|
||||
log.Println("Done scheduling tasks: declining offer on [", offer.GetHostname(), "]")
|
||||
driver.DeclineOffer(offer.Id, longFilter)
|
||||
|
||||
log.Println("Number of tasks still running: ", s.tasksRunning)
|
||||
continue
|
||||
default:
|
||||
}
|
||||
|
||||
tasks := []*mesos.TaskInfo{}
|
||||
|
||||
// First fit strategy
|
||||
|
||||
taken := false
|
||||
for i, task := range s.tasks {
|
||||
|
||||
// Check host if it exists
|
||||
if task.Host != "" {
|
||||
// Don't take offer if it doesn't match our task's host requirement
|
||||
if !strings.HasPrefix(*offer.Hostname, task.Host) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Decision to take the offer or not
|
||||
if s.takeOffer(offer, task) {
|
||||
|
||||
log.Println("Co-Located with: ")
|
||||
coLocated(s.running[offer.GetSlaveId().GoString()])
|
||||
|
||||
tasks = append(tasks, s.newTask(offer, task))
|
||||
|
||||
log.Printf("Starting %s on [%s]\n", task.Name, offer.GetHostname())
|
||||
driver.LaunchTasks([]*mesos.OfferID{offer.Id}, tasks, defaultFilter)
|
||||
|
||||
taken = true
|
||||
|
||||
fmt.Println("Inst: ", *task.Instances)
|
||||
*task.Instances--
|
||||
|
||||
if *task.Instances <= 0 {
|
||||
// All instances of task have been scheduled, remove it
|
||||
s.tasks[i] = s.tasks[len(s.tasks)-1]
|
||||
s.tasks = s.tasks[:len(s.tasks)-1]
|
||||
|
||||
if len(s.tasks) <= 0 {
|
||||
log.Println("Done scheduling all tasks")
|
||||
close(s.Shutdown)
|
||||
}
|
||||
}
|
||||
break // Offer taken, move on
|
||||
}
|
||||
}
|
||||
|
||||
// If there was no match for the task
|
||||
if !taken {
|
||||
fmt.Println("There is not enough resources to launch a task:")
|
||||
cpus, mem, watts := OfferAgg(offer)
|
||||
|
||||
log.Printf("<CPU: %f, RAM: %f, Watts: %f>\n", cpus, mem, watts)
|
||||
driver.DeclineOffer(offer.Id, defaultFilter)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func (s *FirstFit) StatusUpdate(driver sched.SchedulerDriver, status *mesos.TaskStatus) {
|
||||
log.Printf("Received task status [%s] for task [%s]", NameFor(status.State), *status.TaskId.Value)
|
||||
|
||||
if *status.State == mesos.TaskState_TASK_RUNNING {
|
||||
s.tasksRunning++
|
||||
} else if IsTerminal(status.State) {
|
||||
delete(s.running[status.GetSlaveId().GoString()], *status.TaskId.Value)
|
||||
s.tasksRunning--
|
||||
if s.tasksRunning == 0 {
|
||||
select {
|
||||
case <-s.Shutdown:
|
||||
close(s.Done)
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
log.Printf("DONE: Task status [%s] for task [%s]", NameFor(status.State), *status.TaskId.Value)
|
||||
}
|
||||
|
||||
func (s *FirstFit) FrameworkMessage(
|
||||
driver sched.SchedulerDriver,
|
||||
executorID *mesos.ExecutorID,
|
||||
slaveID *mesos.SlaveID,
|
||||
message string) {
|
||||
|
||||
log.Println("Getting a framework message: ", message)
|
||||
log.Printf("Received a framework message from some unknown source: %s", *executorID.Value)
|
||||
}
|
||||
|
||||
func (s *FirstFit) OfferRescinded(_ sched.SchedulerDriver, offerID *mesos.OfferID) {
|
||||
log.Printf("Offer %s rescinded", offerID)
|
||||
}
|
||||
func (s *FirstFit) SlaveLost(_ sched.SchedulerDriver, slaveID *mesos.SlaveID) {
|
||||
log.Printf("Slave %s lost", slaveID)
|
||||
}
|
||||
func (s *FirstFit) ExecutorLost(_ sched.SchedulerDriver, executorID *mesos.ExecutorID, slaveID *mesos.SlaveID, status int) {
|
||||
log.Printf("Executor %s on slave %s was lost", executorID, slaveID)
|
||||
}
|
||||
|
||||
func (s *FirstFit) Error(_ sched.SchedulerDriver, err string) {
|
||||
log.Printf("Receiving an error: %s", err)
|
||||
}
|
39
schedulers/helpers.go
Normal file
39
schedulers/helpers.go
Normal file
|
@ -0,0 +1,39 @@
|
|||
package schedulers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/golang/protobuf/proto"
|
||||
mesos "github.com/mesos/mesos-go/mesosproto"
|
||||
"log"
|
||||
)
|
||||
|
||||
var (
|
||||
defaultFilter = &mesos.Filters{RefuseSeconds: proto.Float64(1)}
|
||||
longFilter = &mesos.Filters{RefuseSeconds: proto.Float64(1000)}
|
||||
)
|
||||
|
||||
func OfferAgg(offer *mesos.Offer) (float64, float64, float64) {
|
||||
var cpus, mem, watts float64
|
||||
|
||||
for _, resource := range offer.Resources {
|
||||
switch resource.GetName() {
|
||||
case "cpus":
|
||||
cpus += *resource.GetScalar().Value
|
||||
case "mem":
|
||||
mem += *resource.GetScalar().Value
|
||||
case "watts":
|
||||
watts += *resource.GetScalar().Value
|
||||
}
|
||||
}
|
||||
|
||||
return cpus, mem, watts
|
||||
}
|
||||
|
||||
func coLocated(tasks map[string]bool) {
|
||||
|
||||
for task := range tasks {
|
||||
log.Println(task)
|
||||
}
|
||||
|
||||
fmt.Println("---------------------")
|
||||
}
|
45
schedulers/states.go
Normal file
45
schedulers/states.go
Normal file
|
@ -0,0 +1,45 @@
|
|||
package schedulers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
mesos "github.com/mesos/mesos-go/mesosproto"
|
||||
)
|
||||
|
||||
// NameFor returns the string name for a TaskState.
|
||||
func NameFor(state *mesos.TaskState) string {
|
||||
switch *state {
|
||||
case mesos.TaskState_TASK_STAGING:
|
||||
return "TASK_STAGING"
|
||||
case mesos.TaskState_TASK_STARTING:
|
||||
return "TASK_STARTING"
|
||||
case mesos.TaskState_TASK_RUNNING:
|
||||
return "TASK_RUNNING"
|
||||
case mesos.TaskState_TASK_FINISHED:
|
||||
return "TASK_FINISHED" // TERMINAL
|
||||
case mesos.TaskState_TASK_FAILED:
|
||||
return "TASK_FAILED" // TERMINAL
|
||||
case mesos.TaskState_TASK_KILLED:
|
||||
return "TASK_KILLED" // TERMINAL
|
||||
case mesos.TaskState_TASK_LOST:
|
||||
return "TASK_LOST" // TERMINAL
|
||||
case mesos.TaskState_TASK_ERROR:
|
||||
return "TASK_ERROR"
|
||||
default:
|
||||
return fmt.Sprintf("UNKNOWN: %d", *state)
|
||||
}
|
||||
}
|
||||
|
||||
// IsTerminal determines if a TaskState is a terminal state, i.e. if it singals
|
||||
// that the task has stopped running.
|
||||
func IsTerminal(state *mesos.TaskState) bool {
|
||||
switch *state {
|
||||
case mesos.TaskState_TASK_FINISHED,
|
||||
mesos.TaskState_TASK_FAILED,
|
||||
mesos.TaskState_TASK_KILLED,
|
||||
mesos.TaskState_TASK_LOST,
|
||||
mesos.TaskState_TASK_ERROR:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
Reference in a new issue