From 449de46f0d8aa59e9633729447d2273bec113746 Mon Sep 17 00:00:00 2001
From: Nikolas Komonen <118216176+nkomonen-amazon@users.noreply.github.com>
Date: Wed, 16 Oct 2024 23:33:10 -0400
Subject: [PATCH] fix(crash): handle sleep/wake appropriately (#5787)

## Problem:

When a user puts their computer to sleep and later wakes it, the heartbeat goes stale because
it cannot be sent while the computer is asleep. On wake there is therefore a race condition
between the next fresh heartbeat being sent and the crash checker inspecting heartbeats to
decide whether they are stale (i.e. a crash). If the crash checker runs before a new heartbeat
can be sent, the instance is incorrectly reported as crashed.

## Solution:

Use a TimeLag class that detects this kind of time discrepancy. It updates a timestamp every
second, and if the next update took noticeably longer than a second, we conclude there was a
lag (e.g. the computer was asleep). When a lag is detected we simply skip the next crash check,
allowing a fresh heartbeat to be sent.
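In essence (a standalone sketch of the approach, not the shipped class; `tickMs` and
`lagToleranceMs` are illustrative names, though the implementation below uses the same 1 s
cadence and a ~5 s tolerance):

```ts
// Tick once per second and compare the wall clock against the expected tick time.
// A gap much larger than the tick interval implies the host was suspended.
const tickMs = 1_000
const lagToleranceMs = 5_000

let lastTick = Date.now()
let sawLag = false

setInterval(() => {
    const expected = lastTick + tickMs
    const lag = Date.now() - expected
    if (lag > lagToleranceMs) {
        sawLag = true // the machine likely slept; skip the next crash check
    }
    lastTick = Date.now()
}, tickMs)
```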
---

License: I confirm that my contribution is made under the terms of the Apache 2.0 license.

---------

Signed-off-by: nkomonen-amazon
---
 packages/core/src/shared/constants.ts         |   6 +-
 packages/core/src/shared/crashMonitoring.ts   | 224 +++++++++---------
 packages/core/src/shared/errors.ts            |   6 +-
 .../core/src/shared/utilities/timeoutUtils.ts |  67 ++++++
 .../src/test/shared/crashMonitoring.test.ts   |   3 +-
 5 files changed, 180 insertions(+), 126 deletions(-)

diff --git a/packages/core/src/shared/constants.ts b/packages/core/src/shared/constants.ts
index 98e61d220f0..908624383dc 100644
--- a/packages/core/src/shared/constants.ts
+++ b/packages/core/src/shared/constants.ts
@@ -164,10 +164,6 @@ export const amazonQVscodeMarketplace =
  *
  * Moved here to resolve circular dependency issues.
  */
-export const crashMonitoringDirNames = {
-    root: 'crashMonitoring',
-    running: 'running',
-    shutdown: 'shutdown',
-} as const
+export const crashMonitoringDirName = 'crashMonitoring'
 
 export const amazonQTabSuffix = '(Generated by Amazon Q)'

diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts
index 00fbb162862..b99bbef9436 100644
--- a/packages/core/src/shared/crashMonitoring.ts
+++ b/packages/core/src/shared/crashMonitoring.ts
@@ -15,9 +15,10 @@ import { isNewOsSession } from './utilities/osUtils'
 import nodeFs from 'fs/promises'
 import fs from './fs/fs'
 import { getLogger } from './logger/logger'
-import { crashMonitoringDirNames } from './constants'
+import { crashMonitoringDirName } from './constants'
 import { throwOnUnstableFileSystem } from './filesystemUtilities'
 import { withRetries } from './utilities/functionUtils'
+import { TimeLag } from './utilities/timeoutUtils'
 
 const className = 'CrashMonitoring'
@@ -112,15 +113,17 @@ export class CrashMonitoring {
         try {
             this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode)
+            this.heartbeat.onFailure(() => this.cleanup())
+
             this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+            this.crashChecker.onFailure(() => this.cleanup())
 
             await this.heartbeat.start()
             await this.crashChecker.start()
         } catch (error) {
             emitFailure({ functionName: 'start', error })

             try {
-                this.crashChecker?.cleanup()
-                await this.heartbeat?.cleanup()
+                await this.cleanup()
             } catch {}

             // Surface errors during development, otherwise it can be missed.
@@ -146,6 +149,11 @@ export class CrashMonitoring {
             }
         }
     }
+
+    public async cleanup() {
+        this.crashChecker?.cleanup()
+        await this.heartbeat?.shutdown()
+    }
 }

 /**
@@ -154,15 +162,19 @@
  */
 class Heartbeat {
     private intervalRef: NodeJS.Timer | undefined
+    private _onFailure = new vscode.EventEmitter<void>()
+    public onFailure: vscode.Event<void> = this._onFailure.event
+    private readonly heartbeatInterval: number
+
     constructor(
         private readonly state: FileSystemState,
-        private readonly checkInterval: number,
+        checkInterval: number,
         private readonly isDevMode: boolean
-    ) {}
+    ) {
+        this.heartbeatInterval = checkInterval / 2
+    }

     public async start() {
-        const heartbeatInterval = this.checkInterval / 2
-
         // Send an initial heartbeat immediately
         await withFailCtx('initialSendHeartbeat', () => this.state.sendHeartbeat())
@@ -172,38 +184,22 @@ class Heartbeat {
                 await this.state.sendHeartbeat()
             } catch (e) {
                 try {
-                    await this.cleanup()
+                    await this.shutdown()
                     emitFailure({ functionName: 'sendHeartbeatInterval', error: e })
                 } catch {}

                 if (this.isDevMode) {
                     throw e
                 }
+                this._onFailure.fire()
             }
-        }, heartbeatInterval)
+        }, this.heartbeatInterval)
     }

     /** Stops everything, signifying a graceful shutdown */
     public async shutdown() {
         globals.clock.clearInterval(this.intervalRef)
-        return this.state.indicateGracefulShutdown()
-    }
-
-    /**
-     * Safely attempts to clean up this heartbeat from the state to try and avoid
-     * an incorrectly indicated crash. Use this on failures.
-     *
-     * ---
-     *
-     * IMPORTANT: This function must not throw as this function is run within a catch
-     */
-    public async cleanup() {
-        try {
-            await this.shutdown()
-        } catch {}
-        try {
-            await this.state.clearHeartbeat()
-        } catch {}
+        await this.state.indicateGracefulShutdown()
     }

     /** Mimics a crash, only for testing */
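The failure wiring above follows a simple owner/worker pattern: each worker exposes an
`onFailure` event and the owning `CrashMonitoring` instance reacts by cleaning everything up.
A minimal sketch of that pattern, with a hypothetical `IntervalWorker` standing in for
`Heartbeat`/`CrashChecker`:

```ts
import * as vscode from 'vscode'

// Hypothetical worker: it runs a task on an interval and reports unrecoverable
// failures to its owner instead of tearing everything down itself.
class IntervalWorker {
    private readonly _onFailure = new vscode.EventEmitter<void>()
    public readonly onFailure: vscode.Event<void> = this._onFailure.event
    private timer: NodeJS.Timeout | undefined

    public start(task: () => Promise<void>, intervalMs: number) {
        this.timer = setInterval(async () => {
            try {
                await task()
            } catch {
                this._onFailure.fire() // let the owner decide how to shut things down
            }
        }, intervalMs)
    }

    public stop() {
        clearInterval(this.timer)
    }
}

// Owner side, mirroring `this.heartbeat.onFailure(() => this.cleanup())` above:
// const worker = new IntervalWorker()
// worker.onFailure(() => worker.stop())
```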
@@ -217,34 +213,55 @@
  */
 class CrashChecker {
     private intervalRef: NodeJS.Timer | undefined
+    private _onFailure = new vscode.EventEmitter<void>()
+    public onFailure = this._onFailure.event

     constructor(
         private readonly state: FileSystemState,
         private readonly checkInterval: number,
         private readonly isDevMode: boolean,
-        private readonly devLogger: Logger | undefined
+        private readonly devLogger: Logger | undefined,
+        /**
+         * This class is required for the following edge case:
+         * 1. Heartbeat is sent
+         * 2. Computer goes to sleep for X minutes
+         * 3. Wake up computer. But before a new heartbeat can be sent, a crash checker (can be from another ext instance) runs
+         * and sees a stale heartbeat. It assumes a crash.
+         *
+         * Why? Intervals do not run while the computer is asleep, so the latest heartbeat has a "lag" since it wasn't able to send
+         * a new heartbeat.
+         * Then on wake, there is a race condition for the next heartbeat to be sent before the next crash check. If the crash checker
+         * runs first it will incorrectly conclude a crash.
+         *
+         * Solution: Keep track of the lag, and then skip the next crash check if there was a lag. This will give time for the
+         * next heartbeat to be sent.
+         */
+        private readonly timeLag: TimeLag = new TimeLag()
     ) {}

     public async start() {
         {
             this.devLogger?.debug(`crashMonitoring: checkInterval ${this.checkInterval}`)

+            this.timeLag.start()
+
             // do an initial check
             await withFailCtx('initialCrashCheck', () =>
-                tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+                tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
             )

             // check on an interval
             this.intervalRef = globals.clock.setInterval(async () => {
                 try {
-                    await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+                    await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
                 } catch (e) {
                     emitFailure({ functionName: 'checkCrashInterval', error: e })
-                    this.cleanup()
                     if (this.isDevMode) {
                         throw e
                     }
+
+                    this._onFailure.fire()
                 }
             }, this.checkInterval)
         }
@@ -255,46 +272,46 @@ class CrashChecker {
         state: FileSystemState,
         checkInterval: number,
         isDevMode: boolean,
-        devLogger: Logger | undefined
+        devLogger: Logger | undefined,
+        timeLag: TimeLag
     ) {
-        // Iterate all known extensions and for each check if they have crashed
+        if (await timeLag.didLag()) {
+            timeLag.reset()
+            devLogger?.warn('crashMonitoring: SKIPPED check crash due to time lag')
+            return
+        }
+
+        // check each extension if it crashed
         const knownExts = await state.getAllExts()
         const runningExts: ExtInstanceHeartbeat[] = []

         for (const ext of knownExts) {
+            // is still running
             if (!isStoppedHeartbeats(ext, checkInterval)) {
                 runningExts.push(ext)
                 continue
             }

-            // Ext is not running anymore, handle appropriately depending on why it stopped running
-            await state.handleExtNotRunning(ext, {
-                onShutdown: async () => {
-                    // Nothing to do, just log info if necessary
-                    devLogger?.debug(
-                        `crashMonitoring: SHUTDOWN: following has gracefully shutdown: pid ${ext.extHostPid} + sessionId: ${ext.sessionId}`
-                    )
-                },
-                onCrash: async () => {
-                    // Debugger instances may incorrectly look like they crashed, so don't emit.
-                    // Example is if I hit the red square in the debug menu, it is a non-graceful shutdown. But the regular
-                    // 'x' button in the Debug IDE instance is a graceful shutdown.
-                    if (ext.isDebug) {
-                        devLogger?.debug(`crashMonitoring: DEBUG instance crashed: ${JSON.stringify(ext)}`)
-                        return
-                    }
-
-                    // This is the metric to let us know the extension crashed
-                    telemetry.session_end.emit({
-                        result: 'Failed',
-                        proxiedSessionId: ext.sessionId,
-                        reason: 'ExtHostCrashed',
-                        passive: true,
-                    })
-
-                    devLogger?.debug(
-                        `crashMonitoring: CRASH: following has crashed: pid ${ext.extHostPid} + sessionId: ${ext.sessionId}`
-                    )
-                },
+            // did crash
+            await state.handleCrashedExt(ext, () => {
+                // Debugger instances may incorrectly look like they crashed, so don't emit.
+                // Example is if I hit the red square in the debug menu, it is a non-graceful shutdown. But the regular
+                // 'x' button in the Debug IDE instance is a graceful shutdown.
+                if (ext.isDebug) {
+                    devLogger?.debug(`crashMonitoring: DEBUG instance crashed: ${JSON.stringify(ext)}`)
+                    return
+                }
+
+                // This is the metric to let us know the extension crashed
+                telemetry.session_end.emit({
+                    result: 'Failed',
+                    proxiedSessionId: ext.sessionId,
+                    reason: 'ExtHostCrashed',
+                    passive: true,
+                })
+
+                devLogger?.debug(
+                    `crashMonitoring: CRASH: following has crashed: pid ${ext.extHostPid} + sessionId: ${ext.sessionId}`
+                )
             })
         }
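The guard at the top of `tryCheckCrash` is the heart of the fix. Reduced to a sketch
(`didLag()`/`reset()` are the TimeLag methods added in timeoutUtils.ts; `runCrashCheck` is a
hypothetical stand-in for the rest of the function):

```ts
async function checkOnce(
    timeLag: { didLag(): Promise<boolean>; reset(): void },
    runCrashCheck: () => Promise<void>
): Promise<void> {
    if (await timeLag.didLag()) {
        // Heartbeats could not be written while the machine was asleep, so a stale heartbeat
        // right now is not evidence of a crash. Skip this round and let a fresh heartbeat land.
        timeLag.reset()
        return
    }
    await runCrashCheck()
}
```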
@@ -320,11 +337,12 @@ class CrashChecker {
     /** Use this on failures to terminate the crash checker */
     public cleanup() {
         globals.clock.clearInterval(this.intervalRef)
+        this.timeLag.cleanup()
     }

     /** Mimics a crash, only for testing */
     public testCrash() {
-        globals.clock.clearInterval(this.intervalRef)
+        this.cleanup()
     }
 }
@@ -379,7 +397,7 @@ export class FileSystemState {
      * IMORTANT: Use {@link crashMonitoringStateFactory} to make an instance
      */
     constructor(protected readonly deps: MementoStateDependencies) {
-        this.stateDirPath = path.join(this.deps.workDirPath, crashMonitoringDirNames.root)
+        this.stateDirPath = path.join(this.deps.workDirPath, crashMonitoringDirName)

         this.deps.devLogger?.debug(`crashMonitoring: pid: ${this.deps.pid}`)
         this.deps.devLogger?.debug(`crashMonitoring: sessionId: ${this.deps.sessionId.slice(0, 8)}-...`)
@@ -405,17 +423,16 @@
         if (await this.deps.isStateStale()) {
             await this.clearState()
         }
+
+        await withFailCtx('init', () => fs.mkdir(this.stateDirPath))
     }

     // ------------------ Heartbeat methods ------------------
     public async sendHeartbeat() {
-        const extId = this.createExtId(this.ext)
-
         try {
             const func = async () => {
-                const dir = await this.runningExtsDir()
                 await fs.writeFile(
-                    path.join(dir, extId),
+                    this.makeStateFilePath(this.extId),
                     JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4)
                 )
                 this.deps.devLogger?.debug(
@@ -437,7 +454,7 @@
     }

     /**
-     * Signal that this extension is gracefully shutting down. This will prevent the IDE from thinking it crashed.
+     * Indicates that this extension instance has gracefully shut down.
      *
      * IMPORTANT: This code is being run in `deactivate()` where VS Code api is not available. Due to this we cannot
      * easily update the state to indicate a graceful shutdown. So the next best option is to write to a file on disk,
@@ -447,46 +464,26 @@
      * function touches.
      */
    public async indicateGracefulShutdown(): Promise<void> {
-        const dir = await this.shutdownExtsDir()
-        await withFailCtx('writeShutdownFile', () => nodeFs.writeFile(path.join(dir, this.extId), ''))
+        // By removing the heartbeat entry, the crash checkers will not be able to find this entry anymore, making it
+        // impossible to report on since the file system is the source of truth
+        await withFailCtx('indicateGracefulShutdown', () =>
+            nodeFs.rm(this.makeStateFilePath(this.extId), { force: true })
+        )
     }

     // ------------------ Checker Methods ------------------

-    /**
-     * Signals the state that the given extension is not running, allowing the state to appropriately update
-     * depending on a graceful shutdown or crash.
-     *
-     * NOTE: This does NOT run in the `deactivate()` method, so it CAN reliably use the VS Code FS api
-     *
-     * @param opts - functions to run depending on why the extension stopped running
-     */
-    public async handleExtNotRunning(
-        ext: ExtInstance,
-        opts: { onShutdown: () => Promise<void>; onCrash: () => Promise<void> }
-    ): Promise<void> {
-        const extId = this.createExtId(ext)
-        const shutdownFilePath = path.join(await this.shutdownExtsDir(), extId)
-
-        if (await withFailCtx('existsShutdownFile', () => fs.exists(shutdownFilePath))) {
-            await opts.onShutdown()
-            // We intentionally do not clean up the file in shutdown since there may be another
-            // extension may be doing the same thing in parallel, and would read the extension as
-            // crashed since the file was missing. The file will be cleared on computer restart though.
-
-            // TODO: Be smart and clean up the file after some time.
-        } else {
-            await opts.onCrash()
-        }
-
-        // Clean up the running extension file since it no longer exists
-        await this.deleteHeartbeatFile(extId)
+    public async handleCrashedExt(ext: ExtInstance, fn: () => void) {
+        await withFailCtx('handleCrashedExt', async () => {
+            await this.deleteHeartbeatFile(ext)
+            fn()
+        })
     }
-    public async deleteHeartbeatFile(extId: ExtInstanceId) {
-        const dir = await this.runningExtsDir()
+
+    private async deleteHeartbeatFile(ext: ExtInstanceId | ExtInstance) {
         // Retry file deletion to prevent incorrect crash reports. Common Windows errors seen in telemetry: EPERM/EBUSY.
         // See: https://github.com/aws/aws-toolkit-vscode/pull/5335
-        await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))), {
+        await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(this.makeStateFilePath(ext))), {
             maxRetries: 8,
             delay: 100,
             backoff: 2,
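With the state flattened to one file per instance, the whole protocol reduces to: write your
heartbeat file on a cadence, delete it on graceful shutdown, and let a stale leftover file mean
"crashed". A rough model using plain `fs/promises` (the directory and helper names here are
illustrative, not the toolkit's own `fs` wrapper):

```ts
import * as path from 'path'
import * as fs from 'fs/promises'

const stateDir = '/tmp/crashMonitoring' // hypothetical location of the shared state directory

// Heartbeat: (re)write this instance's file with a fresh timestamp.
async function writeHeartbeat(extId: string, instance: Record<string, unknown>) {
    await fs.mkdir(stateDir, { recursive: true })
    await fs.writeFile(path.join(stateDir, extId), JSON.stringify({ ...instance, lastHeartbeat: Date.now() }))
}

// Graceful shutdown: removing the file is the signal itself. A crash checker that cannot
// find the file has nothing to report, so no separate shutdown marker directory is needed.
async function gracefulShutdown(extId: string) {
    await fs.rm(path.join(stateDir, extId), { force: true })
}
```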
@@ -515,17 +512,9 @@ export class FileSystemState {
     protected createExtId(ext: ExtInstance): ExtInstanceId {
         return `${ext.extHostPid}_${ext.sessionId}`
     }
-    private async runningExtsDir(): Promise<string> {
-        const p = path.join(this.stateDirPath, crashMonitoringDirNames.running)
-        // ensure the dir exists
-        await withFailCtx('ensureRunningExtsDir', () => nodeFs.mkdir(p, { recursive: true }))
-        return p
-    }
-    private async shutdownExtsDir() {
-        const p = path.join(this.stateDirPath, crashMonitoringDirNames.shutdown)
-        // Since this runs in `deactivate()` it cannot use the VS Code FS api
-        await withFailCtx('ensureShutdownExtsDir', () => nodeFs.mkdir(p, { recursive: true }))
-        return p
+    private makeStateFilePath(ext: ExtInstance | ExtInstanceId) {
+        const extId = typeof ext === 'string' ? ext : this.createExtId(ext)
+        return path.join(this.stateDirPath, extId)
     }
     public async clearState(): Promise<void> {
         this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Started')
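Concretely, each instance now maps to a single file named after its id directly under the
crashMonitoring directory (a sketch of the mapping; `stateFilePath` is a stand-in for the
private helper above):

```ts
import * as path from 'path'

function stateFilePath(stateDirPath: string, ext: { extHostPid: number; sessionId: string }): string {
    const extId = `${ext.extHostPid}_${ext.sessionId}` // same shape as createExtId() above
    return path.join(stateDirPath, extId)
}

// e.g. stateFilePath('/workdir/crashMonitoring', { extHostPid: 1234, sessionId: 'abc-123' })
//   -> '/workdir/crashMonitoring/1234_abc-123'
```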
@@ -538,7 +527,7 @@
         const res = await withFailCtx('getAllExts', async () => {
             // The file names are intentionally the IDs for easy mapping
             const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () =>
-                (await fs.readdir(await this.runningExtsDir())).map((k) => k[0])
+                (await fs.readdir(this.stateDirPath)).map((k) => k[0])
             )

             const allExts = allExtIds.map<Promise<ExtInstanceHeartbeat | undefined>>(async (extId: string) => {
@@ -549,7 +538,7 @@
                 () =>
                     withFailCtx('parseRunningExtFile', async () =>
                         ignoreBadFileError(async () => {
-                            const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId))
+                            const text = await fs.readFileText(this.makeStateFilePath(extId))

                             if (!text) {
                                 return undefined
@@ -617,7 +606,10 @@ export type ExtInstance = {
     isDebug?: boolean
 }

-type ExtInstanceHeartbeat = ExtInstance & { lastHeartbeat: number }
+type ExtInstanceHeartbeat = ExtInstance & {
+    /** Timestamp of the last heartbeat in milliseconds */
+    lastHeartbeat: number
+}

 function isExtHeartbeat(ext: unknown): ext is ExtInstanceHeartbeat {
     return typeof ext === 'object' && ext !== null && 'lastHeartbeat' in ext && ext.lastHeartbeat !== undefined
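`isStoppedHeartbeats` itself is not part of this diff; given that `lastHeartbeat` is a
millisecond timestamp and checks run every `checkInterval`, a predicate along these lines is
assumed:

```ts
// Assumed shape of the staleness check (not shown in this patch): a heartbeat counts as
// stopped once it is older than one full check interval.
function isHeartbeatStale(lastHeartbeat: number, checkIntervalMs: number, now = Date.now()): boolean {
    return now - lastHeartbeat >= checkIntervalMs
}

// e.g. isHeartbeatStale(Date.now() - 30_000, 10_000) === true
```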
diff --git a/packages/core/src/shared/errors.ts b/packages/core/src/shared/errors.ts
index 09597a8285f..95cebb03111 100644
--- a/packages/core/src/shared/errors.ts
+++ b/packages/core/src/shared/errors.ts
@@ -15,7 +15,7 @@ import type * as os from 'os'
 import { CodeWhispererStreamingServiceException } from '@amzn/codewhisperer-streaming'
 import { driveLetterRegex } from './utilities/pathUtils'
 import { getLogger } from './logger/logger'
-import { crashMonitoringDirNames } from './constants'
+import { crashMonitoringDirName } from './constants'
 
 let _username = 'unknown-user'
 let _isAutomation = false
@@ -397,9 +397,7 @@ export function scrubNames(s: string, username?: string) {
         'tmp',
         'aws-toolkit-vscode',
         'globalStorage', // from vscode globalStorageUri
-        crashMonitoringDirNames.root,
-        crashMonitoringDirNames.running,
-        crashMonitoringDirNames.shutdown,
+        crashMonitoringDirName,
     ])
 
     if (username && username.length > 2) {

diff --git a/packages/core/src/shared/utilities/timeoutUtils.ts b/packages/core/src/shared/utilities/timeoutUtils.ts
index f009b0676c9..18922067946 100644
--- a/packages/core/src/shared/utilities/timeoutUtils.ts
+++ b/packages/core/src/shared/utilities/timeoutUtils.ts
@@ -292,3 +292,70 @@ export function sleep(duration: number = 0): Promise<void> {
     const schedule = globals?.clock?.setTimeout ?? setTimeout
     return new Promise((r) => schedule(r, Math.max(duration, 0)))
 }
+
+/**
+ * Time lag occurs when the computer goes to sleep, and intervals cannot run on the expected
+ * cadence. This class keeps track of that lag in the cadence.
+ */
+export class TimeLag {
+    private intervalRef: NodeJS.Timer | undefined
+    private isCompleted: Promise<void> | undefined
+    /** Resolves {@link isCompleted} when the next interval of lag checking is completed */
+    private setCompleted: (() => void) | undefined
+
+    /** The last timestamp we remember. If this is more than 1 {@link lagCheckInterval} we probably did a sleep+wake */
+    private latestTimestamp: number = 0
+    /** The accumulation of lag before the next crash checker interval, since the user can sleep+wake multiple times in between */
+    private totalLag: number = 0
+    private readonly lagCheckInterval = 1000
+
+    public start() {
+        this.reset() // initialize
+
+        // Every interval calculate the lag
+        this.intervalRef = globals.clock.setInterval(() => {
+            const expectedNow = this.latestTimestamp + this.lagCheckInterval
+            const actualNow = globals.clock.Date.now()
+            const lag = actualNow - expectedNow
+
+            // interval callback execution is not exact, so this buffer avoids micro lag from being
+            // considered actual lag
+            if (lag > 5000) {
+                this.totalLag += lag
+            }
+
+            this.latestTimestamp = Date.now()
+
+            // race condition between crash checker and lag checker on computer wake. So we have
+            // the crash checker block until this is completed.
+            this.setCompleted!()
+            this.setInProgress()
+        }, this.lagCheckInterval)
+    }
+
+    /** True if there is a time lag */
+    public async didLag() {
+        await this.isCompleted!
+        return this.totalLag > 0
+    }
+
+    /** Indicates the next time lag interval check is not completed */
+    private setInProgress() {
+        this.isCompleted = new Promise<void>((resolve) => {
+            this.setCompleted = resolve
+        })
+    }
+
+    /**
+     * Call this once the user of this instance has handled the lag. If not done, {@link didLag} can
+     * be permanently true.
+     */
+    public reset() {
+        this.totalLag = 0
+        this.latestTimestamp = globals.clock.Date.now()
+        this.setInProgress()
+    }
+    public cleanup() {
+        globals.clock.clearInterval(this.intervalRef)
+    }
+}

diff --git a/packages/core/src/test/shared/crashMonitoring.test.ts b/packages/core/src/test/shared/crashMonitoring.test.ts
index 5802592ea8c..88c448a6cf4 100644
--- a/packages/core/src/test/shared/crashMonitoring.test.ts
+++ b/packages/core/src/test/shared/crashMonitoring.test.ts
@@ -30,7 +30,7 @@ export const crashMonitoringTest = async () => {
 
     // Add some buffer since after 1 interval the work is actually done, including file i/o which may be slow.
     // **IF FLAKY**, see if increasing the buffer helps.
-    const oneInterval = checkInterval + 500
+    const oneInterval = checkInterval + 1000
 
     /**
      * Makes N "extension instances" that can be used for testing.
@@ -122,6 +122,7 @@ export const crashMonitoringTest = async () => {
             await exts[i].ext.start()
         }
 
+        // Crash all exts except the 0th one
         for (let i = 1; i < extCount; i++) {
             await exts[i].ext.crash()
             latestCrashedExts.push(exts[i])
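Putting it together, the lifecycle of the new `TimeLag` helper as the crash checker drives it
looks roughly like this (a condensed sketch; `checkOnce` stands in for `tryCheckCrash`, and the
plain `setInterval` replaces the clock wrapper the extension actually uses):

```ts
import { TimeLag } from './utilities/timeoutUtils' // path as in this patch; adjust for your layout

function crashCheckLoop(checkIntervalMs: number, checkOnce: () => Promise<void>): NodeJS.Timeout {
    const timeLag = new TimeLag()
    timeLag.start()

    const timer = setInterval(async () => {
        if (await timeLag.didLag()) {
            // A heartbeat could not be written while the machine slept, so skip this round.
            timeLag.reset() // mark the lag as handled, otherwise didLag() stays true
            return
        }
        await checkOnce()
    }, checkIntervalMs)

    // On shutdown or failure:
    // clearInterval(timer)
    // timeLag.cleanup()
    return timer
}
```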