joyent/node-triton#129 triton reboot --wait INST doesn't wait

Reviewed by: Julien Gilli <julien.gilli@joyent.com>
2017-02-08 16:49:00 -08:00 · 2017-02-08 16:49:00 -08:00 · 3dd59ab109
commit 3dd59ab109
parent 1503088abd
6 changed files with 271 additions and 14 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -7,6 +7,11 @@ Known issues:

 ## not yet released

+- [joyent/node-triton#129] Fix `triton reboot --wait` to properly wait. Before
+  it would often return immediately, before the instance started rebooting.
+  Add `--wait-timeout N` option to `triton reboot`.
+  Also add `TritonApi#rebootInstance()` api method.
+
 - [joyent/node-triton#166] Update sshpk to fix issue with the TLS client cert
  created by `triton profile docker-setup` so that it doesn't create a cert
  that Go's TLS library doesn't like.
--- a/lib/do_instance/do_reboot.js
+++ b/lib/do_instance/do_reboot.js
@ -5,13 +5,100 @@
 */

 /*
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
 *
 * `triton instance reboot ...`
 */

-var gen_do_ACTION = require('./gen_do_ACTION');
+var assert = require('assert-plus');
+var vasync = require('vasync');
+
+var common = require('../common');
+var errors = require('../errors');
+
+
+function do_reboot(subcmd, opts, args, cb) {
+    if (opts.help) {
+        this.do_help('help', {}, [subcmd], cb);
+        return;
+    } else if (args.length < 1) {
+        cb(new errors.UsageError('missing INST arg(s)'));
+        return;
+    }
+
+    var tritonapi = this.top.tritonapi;
+    common.cliSetupTritonApi({cli: this.top}, function onSetup(setupErr) {
+        if (setupErr) {
+            cb(setupErr);
+        }
+
+        var rebootErrs = [];
+
+        vasync.forEachParallel({
+            inputs: args,
+            func: function rebootOne(arg, nextInst) {
+                console.log('Rebooting instance %s', arg);
+                tritonapi.rebootInstance({
+                    id: arg,
+                    wait: opts.wait,
+                    waitTimeout: opts.wait_timeout * 1000
+                }, function (rebootErr) {
+                    if (rebootErr) {
+                        rebootErrs.push(rebootErr);
+                        console.log('Error rebooting instance %s: %s', arg,
+                            rebootErr.message);
+                    } else if (opts.wait) {
+                        console.log('Rebooted instance %s', arg);
+                    }
+                    nextInst();
+                });
+
+            }
+        }, function doneReboots(err) {
+            assert.ok(!err, '"err" should be impossible as written');
+            if (rebootErrs.length === 1) {
+                cb(rebootErrs[0]);
+            } else if (rebootErrs.length > 1) {
+                cb(new errors.MultiError(rebootErrs));
+            } else {
+                cb();
+            }
+        });
+    });
+}
+
+
+do_reboot.synopses = ['{{name}} reboot [OPTIONS] INST [INST ...]'];
+do_reboot.help = [
+    'Reboot one or more instances.',
+    '',
+    '{{usage}}',
+    '',
+    '{{options}}',
+    'Where "INST" is an instance name, id, or short id.'
+].join('\n');
+do_reboot.options = [
+    {
+        names: ['help', 'h'],
+        type: 'bool',
+        help: 'Show this help.'
+    },
+    {
+        names: ['wait', 'w'],
+        type: 'bool',
+        help: 'Wait until the instance(s) have rebooted.'
+    },
+    {
+        names: ['wait-timeout'],
+        type: 'positiveInteger',
+        default: 120,
+        help: 'The number of seconds to wait before timing out with an error. '
+            + 'The default is 120 seconds.'
+    }
+];
+
+do_reboot.completionArgtypes = ['tritoninstance'];
+


-var do_reboot = gen_do_ACTION({action: 'reboot'});
 module.exports = do_reboot;
--- a/lib/do_instance/gen_do_ACTION.js
+++ b/lib/do_instance/gen_do_ACTION.js
@ -10,7 +10,6 @@
 * Shared support for:
 * `triton instance start ...`
 * `triton instance stop ...`
- * `triton instance reboot ...`
 * `triton instance delete ...`
 */

@ -32,7 +31,7 @@ function gen_do_ACTION(opts) {
    assert.optionalArrayOfString(opts.aliases, 'opts.aliases');
    var action = opts.action;

-    assert.ok(['start', 'stop', 'reboot', 'delete'].indexOf(action) >= 0,
+    assert.ok(['start', 'stop', 'delete'].indexOf(action) >= 0,
        'invalid action');

    function do_ACTION(subcmd, _opts, args, callback) {
@ -94,10 +93,6 @@ function _doTheAction(action, subcmd, opts, args, callback) {
            command = 'stopMachine';
            state = 'stopped';
            break;
-        case 'reboot':
-            command = 'rebootMachine';
-            state = 'running';
-            break;
        case 'delete':
            command = 'deleteMachine';
            state = 'deleted';
--- a/lib/errors.js
+++ b/lib/errors.js
@ -261,9 +261,13 @@ function MultiError(errs) {
    var lines = [format('multiple (%d) errors', errs.length)];
    for (var i = 0; i < errs.length; i++) {
        var err = errs[i];
+        if (err.code) {
            lines.push(format('    error (%s): %s', err.code, err.message));
+        } else {
+            lines.push(format('    error: %s', err.message));
        }
-    _TritonBaseVError.call(this, {
+    }
+    _TritonBaseWError.call(this, {
        cause: errs[0],
        message: lines.join('\n'),
        code: 'MultiError',
@ -271,7 +275,7 @@ function MultiError(errs) {
    });
 }
 MultiError.description = 'Multiple errors.';
-util.inherits(MultiError, _TritonBaseVError);
+util.inherits(MultiError, _TritonBaseWError);



--- a/lib/tritonapi.js
+++ b/lib/tritonapi.js
@ -2261,6 +2261,155 @@ TritonApi.prototype.deletePolicy = function deletePolicy(opts, cb) {
    });
 };

+
+/**
+ * Reboot an instance by id.
+ *
+ * @param {Object} opts
+ *      - {String} id: Required. The instance name, short id, or id (a UUID).
+ *      - {Boolean} wait: Wait (via polling) until the reboot is complete.
+ *        Warning: Time skew (between the cloudapi server and the CN on
+ *        which the instance resides) or a concurrent reboot can result in this
+ *        polling being unable to notice the change properly. Use `waitTimeout`
+ *        to put an upper bound.
+ *      - {Number} waitTimeout: The number of milliseconds after which to
+ *        timeout (call `cb` with a timeout error) waiting. Only relevant if
+ *        `opts.wait === true`. Default is Infinity (i.e. it doesn't timeout).
+ * @param {Function} callback of the form `function (err, _, res)`
+ *
+ * Dev Note: This polls on MachineAudit... which might be heavy on TritonDC's
+ * currently implementation of that. PUBAPI-1347 is a better solution.
+ */
+TritonApi.prototype.rebootInstance = function rebootInstance(opts, cb) {
+    assert.string(opts.id, 'opts.id');
+    assert.optionalBool(opts.wait, 'opts.wait');
+    assert.optionalNumber(opts.waitTimeout, 'opts.waitTimeout');
+    assert.func(cb, 'cb');
+
+    var self = this;
+    var res;
+
+    function randrange(min, max) {
+        return Math.floor(Math.random() * (max - min + 1)) + min;
+    }
+    function timeDiffMs(relativeTo) {
+        var diff = process.hrtime(relativeTo);
+        var ms = (diff[0] * 1e3) + (diff[1] / 1e6); // in milliseconds
+        return ms;
+    }
+
+    vasync.pipeline({arg: {client: self, id: opts.id}, funcs: [
+        _stepInstId,
+
+        function rebootIt(arg, next) {
+            self.cloudapi.rebootMachine(arg.instId, function (err, _, _res) {
+                res = _res;
+                next(err);
+            });
+        },
+
+        function waitForIt(arg, next) {
+            if (!opts.wait) {
+                next();
+                return;
+            }
+
+            /*
+             * Polling on the instance `state` doesn't work for a reboot,
+             * because a first poll value of "running" is ambiguous: was it
+             * a fast reboot, or has the instance not yet left the running
+             * state?
+             *
+             * Lacking PUBAPI-1347, we'll use the MachineAudit endpoint to
+             * watch for a 'reboot' action that finished after the server time
+             * for the RebootMachine response (i.e. the "Date" header), e.g.:
+             *      date: Wed, 08 Feb 2017 20:55:35 GMT
+             * Example reboot audit entry:
+             *      {"success":"yes",
+             *       "time":"2017-02-08T20:55:44.045Z",
+             *       "action":"reboot",
+             *       ...}
+             *
+             * Hardcoded 2s poll interval for now (randomized for the first
+             * poll). Not yet configurable, being mindful of avoiding lots of
+             * clients naively swamping a CloudAPI and hitting throttling.
+             */
+            var POLL_INTERVAL = 2 * 1000;
+            var startTime = process.hrtime();
+            var dateHeader = res.headers['date'];
+            var resTime = Date.parse(dateHeader);
+            if (!dateHeader) {
+                next(new errors.InternalError(format(
+                    'cannot wait for reboot: CloudAPI RebootMachine response '
+                    + 'did not include a "Date" header (req %s)',
+                    res.headers['request-id'])));
+                return;
+            } else if (isNaN(resTime)) {
+                next(new errors.InternalError(format(
+                    'cannot wait for reboot: could not parse CloudAPI '
+                    + 'RebootMachine response "Date" header: "%s" (req %s)',
+                    dateHeader, res.headers['request-id'])));
+                return;
+            }
+            self.log.trace({id: arg.instId, resTime: resTime},
+                'wait for reboot audit record');
+
+            var pollMachineAudit = function () {
+                self.cloudapi.machineAudit(arg.instId, function (aErr, audit) {
+                    if (aErr) {
+                        next(aErr);
+                        return;
+                    }
+
+                    /*
+                     * Search the top few audit records, in case some other
+                     * action slipped in.
+                     */
+                    var theRecord = null;
+                    for (var i = 0; i < audit.length; i++) {
+                        if (audit[i].action === 'reboot' &&
+                            Date.parse(audit[i].time) > resTime)
+                        {
+                            theRecord = audit[i];
+                            break;
+                        }
+                    }
+
+                    if (!theRecord) {
+                        if (opts.waitTimeout) {
+                            var elapsedMs = timeDiffMs(startTime);
+                            if (elapsedMs > opts.waitTimeout) {
+                                next(new errors.TimeoutError(format('timeout '
+                                    + 'waiting for instance %s reboot '
+                                    + '(elapsed %ds)',
+                                    arg.instId,
+                                    Math.round(elapsedMs / 1000))));
+                                return;
+                            }
+                        }
+                        setTimeout(pollMachineAudit, POLL_INTERVAL);
+                    } else if (theRecord.success !== 'yes') {
+                        next(new errors.TritonError(format(
+                            'reboot failed (audit id %s)', theRecord.id)));
+                    } else {
+                        next();
+                    }
+                });
+            };
+
+            /*
+             * Add a random start delay to avoid a number of concurrent reboots
+             * all polling at the same time.
+             */
+            setTimeout(pollMachineAudit,
+                (POLL_INTERVAL / 2) + randrange(0, POLL_INTERVAL));
+        }
+    ]}, function (err) {
+        cb(err, null, res);
+    });
+};
+
+
 /**
 * rename a machine by id.
 *
@ -2324,7 +2473,6 @@ TritonApi.prototype.renameInstance = function renameInstance(opts, cb) {
 *        Default is Infinity (i.e. it doesn't timeout).
 * @param {Function} cb: `function (err)`
 */
-
 TritonApi.prototype._waitForInstanceRename =
 function _waitForInstanceRename(opts, cb) {
    var self = this;
--- a/test/integration/cli-manage-workflow.test.js
+++ b/test/integration/cli-manage-workflow.test.js
@ -226,8 +226,26 @@ test('triton manage workflow', opts, function (tt) {
            t.end();
        });
    });
+    tt.test('  confirm running', function (t) {
+        h.safeTriton(t, {json: true, args: ['inst', 'get', '-j', INST_ALIAS]},
+                function (err, d) {
+            instance = d;
+            t.equal(d.state, 'running', 'machine running');
+            t.end();
+        });
+    });

-    // wait for the machine to start
+    // reboot the machine
+    tt.test('  triton reboot', function (t) {
+        h.safeTriton(t, ['reboot', '-w', INST_ALIAS],
+                function (err, stdout) {
+            t.ok(stdout.match(/^Rebooting instance/),
+                '"Rebooting ..." in stdout');
+            t.ok(stdout.match(/^Rebooted instance/m),
+                '"Rebooted ..." in stdout');
+            t.end();
+        });
+    });
    tt.test('  confirm running', function (t) {
        h.safeTriton(t, {json: true, args: ['inst', 'get', '-j', INST_ALIAS]},
                function (err, d) {