Need better diagnostics/warnings/reporting when locks time out #6

nickminutello · 2013-05-02T17:00:13Z

I have found that when trying to diagnose issues with lock timeouts in a multi-process/multi-host environment, it was kind of essential to know "who" took the lock that timed out (that way you can then know where to start looking as to why that lock timed out).

To do this I made the lock value slightly more sophisticated than a timeout value - I added the host/pid.

I cant do a pull-request - but I can paste the code here: (this has a configurable delay patch mixed in too)

var hostName;
try {
    var OS = require('os');
    hostName = OS.hostname();
} catch (e) {
    hostName = process.env.HOST || process.env.HOSTNAME;
}
hostName = hostName ? hostName.split('.')[0] : 'unknown-host';

function createLockValue(lockTimeoutValue) {
    var callerId = hostName + ":" + process.pid;
    return callerId + "|" + lockTimeoutValue;
}

function getTimeoutValue(lockValue) {
    if (lockValue) {
        var parts = lockValue.split("|");
        if (parts.length ===2) {
            return parseFloat(parts[1]);
        }
    }
    return 0;
}

function getCallerId(lockValue) {
    if (lockValue) {
        var parts = lockValue.split("|");
        if (parts.length ===2) {
            return parts[0];
        }
    }
    return "unknown-caller-id";
}

function acquireLock(client, lockName, timeout, retryDelay, onLockAquired) {
    function retry() {
        acquireLock(client, lockName, timeout, retryDelay, onLockAquired);
    }

    var lockTimeoutValue = (Date.now() + timeout + 1);

    client.setnx(lockName, createLockValue(lockTimeoutValue), function(err, result) {
        if(err) {
            return setTimeout(retry, retryDelay);
        }
        if(result === 0) {
            client.get(lockName, function(err, lockValue) {
                if(err) {
                    return setTimeout(retry, retryDelay);
                }

                var timeStamp = getTimeoutValue(lockValue);

                var now = Date.now();
                if(timeStamp > now) {
                    setTimeout(retry, retryDelay);
                } else {
                    lockTimeoutValue = (Date.now() + timeout + 1);
                    client.getset(lockName, createLockValue(lockTimeoutValue), function(err, result) {
                        if(err) {
                            return setTimeout(retry, retryDelay);
                        }

                        if(result === lockValue) {
                            if (lockValue) {
                                console.log("[PID %s] [%s] Warning : Lock '%s' timed out by %dms - stealing lock from %s ...", process.pid, new Date().toISOString(), lockName, (now - timeStamp), getCallerId(lockValue));
                            }
                            onLockAquired(lockTimeoutValue);
                        } else {
                            setTimeout(retry, retryDelay);
                        }
                    });
                }
            });
        } else {
            onLockAquired(lockTimeoutValue);
        }
    });
}

module.exports = function(client, retryDelay) {
    if(!(client && client.setnx)) {
        throw new Error("You must specify a client instance of http://github.com/mranney/node_redis");
    }

    retryDelay = retryDelay || 50;

    return function(lockName, timeout, taskToPerform) {
        if(!lockName) {
            throw new Error("You must specify a lock string. It is on the basis on this the lock is acquired.");
        }

        if(!taskToPerform) {
            taskToPerform = timeout;
            timeout = 5000;
        }

        lockName = "lock." + lockName;

        acquireLock(client, lockName, timeout, retryDelay, function(lockTimeoutValue) {
            var start  = Date.now();
            taskToPerform(function(done) {
                if(lockTimeoutValue > Date.now()) {
                    client.del(lockName, done);
                } else {
                    console.log("[PID %s] [%s] Warning : Lock '%s' timed out before lock could be released. Locked operation took %dms", process.pid, new Date().toISOString(),lockName, Date.now() - start);
                    done();
                }
            });
        });
    };
};

rakeshpai · 2013-05-02T17:51:20Z

@nickminutello Thanks for this.

Couple of questions.

Why have a try/catch around os.hostname()? The docs don't say anything about error conditions when fetching the hostname.
Why use only the stuff before the "." in the hostname?
Why bother with unknown-hostname? Is that a possibility?
Ditto for unknown-caller-id.
Why have two console.logs? The first one seems redundant. Also, how would the (now - timestamp) in the console.log be useful?

Rest looks fine, though I shall break these up into multiple commits. Also, tests broken. Will fix. Thanks again.

nickminutello · 2013-05-03T17:10:52Z

Try/Catch around os.hostname() - I have to admit that I ripped that code from another module (config) without much thought (may have been a hangover from early nodejs days - may be complete nonsense?) :-/ By all means nuke it.
Hostname: Taking the first part is just a matter of taste. I found it a bit easier on the eye just seeing the first part of the hostname... the fully-qualified name seemed a bit noisy (and usually repetitive). Also I justified it to myself that sending 0.1% fewer chars over the wire to redis would be "better" :)
The unknown-hostname and unknown-caller-id are there purely for graceful handling of "old" locks left in redis. (e.g. upgrade redis-lock, now any locks laying around in redis will be incompatible).
The first logging output is from the lock-stealer. The second logging output is from the lock-theft-victim. These two will likely be in different processes. Also, the lock-theft-victim records exactly how long its task took - which is useful when diagnosing whether the timeout is just a little too low - or there is a serious issue and its not even remotely close.

nickminutello · 2013-05-03T17:14:17Z

Also. The unknown-hostname and unknown-caller-id also handle the case when the lock value happens to be undefined (e.g. the lock has been deleted after the call to .setnx has failed)

nickminutello · 2013-05-07T17:22:53Z

Make sense?

toblerpwn · 2013-07-17T22:17:37Z

just to add: had a long, long, long troubleshooting session where it ended up that this was the issue (timeout && my callbacks and promise chains not completing). very hard to track down.

i'd suggest even creating an error condition/handler that is triggered when timeout occurs and may optionally be handled in its own function/etc. perhaps like:

lock(lockName, timeout, callback(), timeoutHandler())

to summarize: i wish there was a way to receive a notification & react appropriately to a timeout condition

boxbag · 2015-08-04T19:09:40Z

Any update on this issue? I'm evaluating this library and would love a way to be notified when the callback times out for debugging purposes

LoicMahieu · 2015-08-05T09:49:50Z

It is a really old issue, created long before I join the project. I don't think anybody is working on this anymore. If you think that is interesting and valuable, pull request or discussion are welcome 🚀

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Need better diagnostics/warnings/reporting when locks time out #6

Need better diagnostics/warnings/reporting when locks time out #6

nickminutello commented May 2, 2013

rakeshpai commented May 2, 2013

nickminutello commented May 3, 2013

nickminutello commented May 3, 2013

nickminutello commented May 7, 2013

toblerpwn commented Jul 17, 2013

boxbag commented Aug 4, 2015

LoicMahieu commented Aug 5, 2015

Need better diagnostics/warnings/reporting when locks time out #6

Need better diagnostics/warnings/reporting when locks time out #6

Comments

nickminutello commented May 2, 2013

rakeshpai commented May 2, 2013

nickminutello commented May 3, 2013

nickminutello commented May 3, 2013

nickminutello commented May 7, 2013

toblerpwn commented Jul 17, 2013

boxbag commented Aug 4, 2015

LoicMahieu commented Aug 5, 2015