From c2c793d8600b968a7dfabc808898623588a35744 Mon Sep 17 00:00:00 2001
From: Mark Ryan
Date: Tue, 15 Nov 2016 16:22:02 +0000
Subject: [PATCH 1/4] ciao-launcher: Don't share docker client among go routines

Launcher currently creates a single docker client which is used from all
instance go routines that manage docker containers.  There was some concern
as to whether this client was re-entrant or not but it has never caused
problems up until now.  Since installing docker 1.12.3 on my SingleVM I'm
seeing client.ContainerStats hanging when called from multiple go routines
at the same time.  As it's cheap to create these client objects, this commit
modifies launcher to ensure that docker clients are never used concurrently.

Signed-off-by: Mark Ryan
---
 ciao-launcher/docker.go | 62 +++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/ciao-launcher/docker.go b/ciao-launcher/docker.go
index 148aaa4b0..a53fee8f4 100644
--- a/ciao-launcher/docker.go
+++ b/ciao-launcher/docker.go
@@ -67,6 +67,7 @@ type docker struct {
 	prevSampleTime time.Time
 	storageDriver  storage.BlockDriver
 	mount          mounter
+	cli            *client.Client
 }
 
 type mounter interface {
@@ -74,25 +75,13 @@ type mounter interface {
 	Unmount(path string, flags int) error
 }
 
-// It's not entirely clear that it's safe to call a client.Client object from
-// multiple go routines simulataneously. The code looks like it is re-entrant
-// but this doesn't seem to be documented anywhere. Need to check this.
-
-// There's no real way to return an error from init at the moment, so we'll
-// try to retrieve the client object at each new invocation of the virtualizer.
-
 // BUG(markus): We shouldn't report ssh ports for docker instances
 
 func getDockerClient() (cli *client.Client, err error) {
-	dockerClient.Lock()
-	if dockerClient.cli == nil {
-		defaultHeaders := map[string]string{"User-Agent": "ciao-1.0"}
-		dockerClient.cli, err = client.NewClient("unix:///var/run/docker.sock",
-			"v1.22", nil, defaultHeaders)
-	}
-	cli = dockerClient.cli
-	dockerClient.Unlock()
-	return cli, err
+	return client.NewClient("unix:///var/run/docker.sock", "v1.22", nil,
+		map[string]string{
+			"User-Agent": "ciao-1.0",
+		})
 }
 
 func (d *docker) init(cfg *vmConfig, instanceDir string) {
@@ -101,18 +90,31 @@ func (d *docker) init(cfg *vmConfig, instanceDir string) {
 	if d.mount == nil {
 		d.mount = dockerMounter{}
 	}
+	_ = d.initDockerClient()
+}
+
+func (d *docker) initDockerClient() error {
+	if d.cli != nil {
+		return nil
+	}
+	cli, err := getDockerClient()
+	if err != nil {
+		return fmt.Errorf("Unable to init docker client: %v", err)
+	}
+	d.cli = cli
+	return nil
 }
 
 func (d *docker) checkBackingImage() error {
 	glog.Infof("Checking backing docker image %s", d.cfg.Image)
 
-	cli, err := getDockerClient()
+	err := d.initDockerClient()
 	if err != nil {
 		return err
 	}
 
 	args := filters.NewArgs()
-	images, err := cli.ImageList(context.Background(),
+	images, err := d.cli.ImageList(context.Background(),
 		types.ImageListOptions{
 			MatchName: d.cfg.Image,
 			All:       false,
@@ -137,12 +139,12 @@ func (d *docker) checkBackingImage() error {
 func (d *docker) downloadBackingImage() error {
 	glog.Infof("Downloading backing docker image %s", d.cfg.Image)
 
-	cli, err := getDockerClient()
+	err := d.initDockerClient()
 	if err != nil {
 		return err
 	}
 
-	prog, err := cli.ImagePull(context.Background(), types.ImagePullOptions{ImageID: d.cfg.Image}, nil)
+	prog, err := d.cli.ImagePull(context.Background(), types.ImagePullOptions{ImageID: d.cfg.Image}, nil)
 	if err != nil {
 		glog.Errorf("Unable to download image %s: %v\n", d.cfg.Image, err)
 		return err
@@ -300,7 +302,7 @@ func (d *docker) prepareVolumes() ([]string, error) {
 }
 
 func (d *docker) createImage(bridge string, userData, metaData []byte) error {
-	cli, err := getDockerClient()
+	err := d.initDockerClient()
 	if err != nil {
 		return err
 	}
@@ -313,7 +315,7 @@ func (d *docker) createImage(bridge string, userData, metaData []byte) error {
 
 	config, hostConfig, networkConfig := d.createConfigs(bridge, userData, metaData, volumes)
 
-	resp, err := cli.ContainerCreate(context.Background(), config, hostConfig, networkConfig,
+	resp, err := d.cli.ContainerCreate(context.Background(), config, hostConfig, networkConfig,
 		d.cfg.Instance)
 	if err != nil {
 		glog.Errorf("Unable to create container %v", err)
@@ -341,12 +343,12 @@ func (d *docker) deleteImage() error {
 		return nil
 	}
 
-	cli, err := getDockerClient()
+	err := d.initDockerClient()
 	if err != nil {
 		return err
 	}
 
-	err = cli.ContainerRemove(context.Background(),
+	err = d.cli.ContainerRemove(context.Background(),
 		types.ContainerRemoveOptions{
 			ContainerID: d.dockerID,
 			Force:       true})
@@ -360,7 +362,7 @@ func (d *docker) deleteImage() error {
 }
 
 func (d *docker) startVM(vnicName, ipAddress, cephID string) error {
-	cli, err := getDockerClient()
+	err := d.initDockerClient()
 	if err != nil {
 		return err
 	}
@@ -371,7 +373,7 @@ func (d *docker) startVM(vnicName, ipAddress, cephID string) error {
 		return err
 	}
 
-	err = cli.ContainerStart(context.Background(), d.dockerID)
+	err = d.cli.ContainerStart(context.Background(), d.dockerID)
 	if err != nil {
 		d.umountVolumes(d.cfg.Volumes)
 		d.unmapVolumes()
@@ -482,12 +484,12 @@ func (d *docker) computeInstanceDiskspace() int {
 		return -1
 	}
 
-	cli, err := getDockerClient()
+	err := d.initDockerClient()
 	if err != nil {
 		return -1
 	}
 
-	con, _, err := cli.ContainerInspectWithRaw(context.Background(), d.dockerID, true)
+	con, _, err := d.cli.ContainerInspectWithRaw(context.Background(), d.dockerID, true)
 	if err != nil {
 		glog.Errorf("Unable to determine status of instance %s:%s: %v",
 			d.cfg.Instance, d.dockerID, err)
@@ -510,13 +512,13 @@ func (d *docker) stats() (disk, memory, cpu int) {
 		return
 	}
 
-	cli, err := getDockerClient()
+	err := d.initDockerClient()
 	if err != nil {
 		glog.Errorf("Unable to get docker client: %v", err)
 		return
 	}
 
-	resp, err := cli.ContainerStats(context.Background(), d.dockerID, false)
+	resp, err := d.cli.ContainerStats(context.Background(), d.dockerID, false)
 	if err != nil {
 		glog.Errorf("Unable to get stats from container: %s:%s %v", d.cfg.Instance, d.dockerID, err)
 		return
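The pattern the patch above adopts, shown as a minimal standalone sketch: each virtualizer instance lazily creates and keeps its own docker client, so no *client.Client object ever crosses a goroutine boundary. The dockerVirtualizer type and ensureClient name are illustrative, and the import path assumes the docker engine-api client package vendored by launcher at the time; only client.NewClient and its arguments are taken from the patch itself.

package main

import (
	"fmt"

	"github.com/docker/engine-api/client"
)

// dockerVirtualizer stands in for launcher's docker virtualizer; the real
// struct lives in ciao-launcher/docker.go.
type dockerVirtualizer struct {
	cli *client.Client // owned by a single instance goroutine, never shared
}

// ensureClient lazily creates a client the first time this virtualizer needs
// one, mirroring initDockerClient in the patch above.
func (d *dockerVirtualizer) ensureClient() error {
	if d.cli != nil {
		return nil
	}
	cli, err := client.NewClient("unix:///var/run/docker.sock", "v1.22", nil,
		map[string]string{"User-Agent": "ciao-1.0"})
	if err != nil {
		return fmt.Errorf("unable to init docker client: %v", err)
	}
	d.cli = cli
	return nil
}

func main() {
	// Each instance goroutine would own its own dockerVirtualizer, and so
	// its own client.
	d := &dockerVirtualizer{}
	if err := d.ensureClient(); err != nil {
		fmt.Println(err)
	}
}

Creating a client per instance costs little, and it sidesteps the open question of whether client.Client is safe for concurrent use.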
From c12608ae8ed5d4d8ab18884d9477198f00238813 Mon Sep 17 00:00:00 2001
From: Mark Ryan
Date: Tue, 15 Nov 2016 16:48:28 +0000
Subject: [PATCH 2/4] bat: Improve TestDeleteAllInstances robustness

TestDeleteAllInstances periodically fails if the cluster still contains
some instances after the ciao-cli instance delete --all command has
executed.  As this command doesn't wait until the instances have actually
been deleted, this is not surprising.

This commit modifies the test case to give controller 5 seconds to delete
all the instances after the ciao-cli instance delete command has returned.

Signed-off-by: Mark Ryan
---
 _release/bat/base_bat/base_bat_test.go | 27 +++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/_release/bat/base_bat/base_bat_test.go b/_release/bat/base_bat/base_bat_test.go
index 0afa51c47..248f3f351 100644
--- a/_release/bat/base_bat/base_bat_test.go
+++ b/_release/bat/base_bat/base_bat_test.go
@@ -266,6 +266,8 @@ func TestGetAllInstances(t *testing.T) {
 // The instance should be started and scheduled, the DeleteAllInstances command should
 // succeed and GetAllInstances command should return 0 instances.
 func TestDeleteAllInstances(t *testing.T) {
+	const retryCount = 5
+
 	ctx, cancelFunc := context.WithTimeout(context.Background(), standardTimeout)
 	defer cancelFunc()
 
@@ -284,13 +286,28 @@ func TestDeleteAllInstances(t *testing.T) {
 		t.Fatalf("Failed to delete all instances: %v", err)
 	}
 
-	instanceDetails, err := bat.GetAllInstances(ctx, "")
-	if err != nil {
-		t.Fatalf("Failed to retrieve instances: %v", err)
+	// TODO: The correct thing to do here is to wait for the Delete Events
+	// But these aren't correctly reported yet, see
+	// https://github.com/01org/ciao/issues/792
+
+	var i int
+	var instancesFound int
+	for ; i < retryCount; i++ {
+		instanceDetails, err := bat.GetAllInstances(ctx, "")
+		if err != nil {
+			t.Fatalf("Failed to retrieve instances: %v", err)
+		}
+
+		instancesFound = len(instanceDetails)
+		if instancesFound == 0 {
+			break
+		}
+
+		time.Sleep(time.Second)
 	}
 
-	if len(instanceDetails) != 0 {
-		t.Fatalf("0 instances expected. Found %d", len(instanceDetails))
+	if instancesFound != 0 {
+		t.Fatalf("0 instances expected. Found %d", instancesFound)
 	}
 }
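A minimal sketch of the polling pattern the test now uses: query the instance list once a second, up to a fixed retry count, and fail only if instances are still present after the last attempt. waitForZeroInstances and countInstances are illustrative names; countInstances stands in for bat.GetAllInstances, which is not reproduced here.

package main

import (
	"context"
	"fmt"
	"time"
)

// waitForZeroInstances polls countInstances once a second, up to retryCount
// times, and reports an error only if instances remain after the final attempt.
func waitForZeroInstances(ctx context.Context, retryCount int,
	countInstances func(context.Context) (int, error)) error {
	remaining := 0
	for i := 0; i < retryCount; i++ {
		n, err := countInstances(ctx)
		if err != nil {
			return err
		}
		remaining = n
		if remaining == 0 {
			return nil
		}
		time.Sleep(time.Second)
	}
	return fmt.Errorf("%d instances still present after %d attempts", remaining, retryCount)
}

func main() {
	// Simulate a cluster that needs a few polls to finish deleting.
	left := 3
	count := func(context.Context) (int, error) {
		if left > 0 {
			left--
		}
		return left, nil
	}
	fmt.Println(waitForZeroInstances(context.Background(), 5, count))
}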
From e31c08079fcac0835da2ca4e0a578422e6d0ae4d Mon Sep 17 00:00:00 2001
From: Mark Ryan
Date: Tue, 15 Nov 2016 17:26:40 +0000
Subject: [PATCH 3/4] ciao-launcher: Timeout ContainerStats

The call to ContainerStats seems to be hanging on recent versions of
docker.  This commit times the call out after 5 seconds.

Signed-off-by: Mark Ryan
---
 ciao-launcher/docker.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ciao-launcher/docker.go b/ciao-launcher/docker.go
index a53fee8f4..cea47b795 100644
--- a/ciao-launcher/docker.go
+++ b/ciao-launcher/docker.go
@@ -518,7 +518,9 @@ func (d *docker) stats() (disk, memory, cpu int) {
 		return
 	}
 
-	resp, err := d.cli.ContainerStats(context.Background(), d.dockerID, false)
+	ctx, cancelFunc := context.WithTimeout(context.Background(), 5*time.Second)
+	resp, err := d.cli.ContainerStats(ctx, d.dockerID, false)
+	cancelFunc()
 	if err != nil {
 		glog.Errorf("Unable to get stats from container: %s:%s %v", d.cfg.Instance, d.dockerID, err)
 		return
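A minimal sketch of the timeout pattern applied above: derive a context with a five-second deadline, hand it to the potentially blocking call, and cancel the context as soon as the call returns. slowCall is a stand-in for d.cli.ContainerStats, simulating a daemon that never answers.

package main

import (
	"context"
	"fmt"
	"time"
)

// slowCall simulates a docker API call that hangs; it only returns when the
// caller's context expires.
func slowCall(ctx context.Context) (string, error) {
	select {
	case <-time.After(time.Hour): // pretend the daemon never replies
		return "stats", nil
	case <-ctx.Done():
		return "", ctx.Err()
	}
}

func main() {
	ctx, cancelFunc := context.WithTimeout(context.Background(), 5*time.Second)
	resp, err := slowCall(ctx)
	cancelFunc() // release the timer as soon as the call has returned
	if err != nil {
		fmt.Println("stats call abandoned:", err)
		return
	}
	fmt.Println(resp)
}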
From 55513fbfda2a03278b287b2f2dca1097e7b5a073 Mon Sep 17 00:00:00 2001
From: Mark Ryan
Date: Wed, 16 Nov 2016 11:12:30 +0000
Subject: [PATCH 4/4] ciao-launcher: Send InstanceDeleted after instance deletion

Launcher was incorrectly sending the InstanceDeleted event before the
instance and its associated network assets had been deleted.  This caused
a race condition in which controller re-used, for new instances, some of
the resources assigned to an instance that had not actually been deleted
yet.  When this happened the new instances typically failed to start.

This commit delays the sending of the InstanceDeleted event until after
the instance has been deleted.

Fixes #808

Signed-off-by: Mark Ryan
---
 ciao-launcher/instance.go | 21 +++++++++++++++++++++
 ciao-launcher/main.go     |  1 -
 ciao-launcher/overseer.go | 22 ----------------------
 3 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/ciao-launcher/instance.go b/ciao-launcher/instance.go
index aac728ab0..35b4638ac 100644
--- a/ciao-launcher/instance.go
+++ b/ciao-launcher/instance.go
@@ -21,6 +21,8 @@ import (
 	"sync"
 	"time"
 
+	yaml "gopkg.in/yaml.v2"
+
 	storage "github.com/01org/ciao/ciao-storage"
 	"github.com/01org/ciao/payloads"
 	"github.com/01org/ciao/ssntp"
@@ -192,6 +194,24 @@ func (id *instanceData) stopCommand(cmd *insStopCmd) {
 	id.monitorCh <- virtualizerStopCmd{}
 }
 
+func (id *instanceData) sendInstanceDeletedEvent() {
+	var event payloads.EventInstanceDeleted
+
+	event.InstanceDeleted.InstanceUUID = id.instance
+
+	payload, err := yaml.Marshal(&event)
+	if err != nil {
+		glog.Errorf("Unable to Marshall STATS %v", err)
+		return
+	}
+
+	_, err = id.ac.conn.SendEvent(ssntp.InstanceDeleted, payload)
+	if err != nil {
+		glog.Errorf("Failed to send event command %v", err)
+		return
+	}
+}
+
 func (id *instanceData) deleteCommand(cmd *insDeleteCmd) bool {
 	if id.shuttingDown && !cmd.suicide {
 		deleteErr := &deleteError{nil, payloads.DeleteNoInstance}
@@ -211,6 +231,7 @@ func (id *instanceData) deleteCommand(cmd *insDeleteCmd) bool {
 	id.unmapVolumes()
 
 	if !cmd.suicide {
+		id.sendInstanceDeletedEvent()
 		id.ovsCh <- &ovsStatusCmd{}
 	}
 	return true
diff --git a/ciao-launcher/main.go b/ciao-launcher/main.go
index d229f8ca7..7787fa8ad 100644
--- a/ciao-launcher/main.go
+++ b/ciao-launcher/main.go
@@ -362,7 +362,6 @@ func processCommand(conn serverConn, cmd *cmdWrapper, ovsCh chan<- interface{})
 			errCh := make(chan error)
 			ovsCh <- &ovsRemoveCmd{
 				cmd.instance,
-				delCmd.suicide,
 				errCh}
 			<-errCh
 		}
diff --git a/ciao-launcher/overseer.go b/ciao-launcher/overseer.go
index 2b26d21e3..e0d2427fb 100644
--- a/ciao-launcher/overseer.go
+++ b/ciao-launcher/overseer.go
@@ -58,7 +58,6 @@ type ovsGetCmd struct {
 
 type ovsRemoveCmd struct {
 	instance string
-	suicide  bool
 	errCh    chan<- error
 }
 
@@ -494,24 +493,6 @@ func getStats(instancesDir string) *cnStats {
 	return &s
 }
 
-func (ovs *overseer) sendInstanceDeletedEvent(instance string) {
-	var event payloads.EventInstanceDeleted
-
-	event.InstanceDeleted.InstanceUUID = instance
-
-	payload, err := yaml.Marshal(&event)
-	if err != nil {
-		glog.Errorf("Unable to Marshall STATS %v", err)
-		return
-	}
-
-	_, err = ovs.ac.conn.SendEvent(ssntp.InstanceDeleted, payload)
-	if err != nil {
-		glog.Errorf("Failed to send event command %v", err)
-		return
-	}
-}
-
 func (ovs *overseer) processGetCommand(cmd *ovsGetCmd) {
 	glog.Infof("Overseer: looking for instance %s", cmd.instance)
 	var insState ovsGetResult
@@ -579,9 +560,6 @@ func (ovs *overseer) processRemoveCommand(cmd *ovsRemoveCmd) {
 	}
 
 	delete(ovs.instances, cmd.instance)
-	if !cmd.suicide {
-		ovs.sendInstanceDeletedEvent(cmd.instance)
-	}
 	cmd.errCh <- nil
 }
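A minimal sketch of the ordering the patch enforces: tear the instance down and release its network assets first, and only then marshal and send the InstanceDeleted event, so controller cannot hand the instance's resources to a new instance while they are still in use. The struct layout and yaml tags only approximate payloads.EventInstanceDeleted, and cleanup/sendEvent are stand-ins for the real teardown and SSNTP SendEvent calls.

package main

import (
	"fmt"

	yaml "gopkg.in/yaml.v2"
)

// eventInstanceDeleted approximates payloads.EventInstanceDeleted; the yaml
// tags here are assumptions, not copied from the ciao payloads package.
type eventInstanceDeleted struct {
	InstanceDeleted struct {
		InstanceUUID string `yaml:"instance_uuid"`
	} `yaml:"instance_deleted"`
}

// deleteInstance tears the instance down first and only then announces the
// deletion, mirroring the ordering introduced by the patch above.
func deleteInstance(uuid string, cleanup func() error, sendEvent func([]byte) error) error {
	// 1. Delete the instance and release its network assets.
	if err := cleanup(); err != nil {
		return err
	}

	// 2. Only now tell controller the instance is gone.
	var event eventInstanceDeleted
	event.InstanceDeleted.InstanceUUID = uuid
	payload, err := yaml.Marshal(&event)
	if err != nil {
		return err
	}
	return sendEvent(payload)
}

func main() {
	err := deleteInstance("00000000-0000-0000-0000-000000000000",
		func() error { fmt.Println("instance and vnic removed"); return nil },
		func(p []byte) error { fmt.Printf("InstanceDeleted sent:\n%s", p); return nil })
	if err != nil {
		fmt.Println(err)
	}
}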