feat: Implement comprehensive job recovery and resume process improvements

- Added a startup recovery script to handle interrupted jobs before application startup.
- Enhanced recovery system with database connection validation and stale job cleanup.
- Improved middleware to check for recovery needs and handle recovery during requests.
- Updated health check endpoint to include recovery system status and metrics.
- Introduced test scripts for verifying recovery functionality and job state management.
- Enhanced logging and error handling throughout the recovery process.
This commit is contained in:
Arunavo Ray
2025-05-24 13:45:25 +05:30
parent 98610482ae
commit a988be1028
10 changed files with 1006 additions and 111 deletions

View File

@@ -118,6 +118,27 @@ bun scripts/fix-interrupted-jobs.ts <userId>
Use this script if you're having trouble cleaning up activities due to "interrupted" jobs that won't delete.
### Startup Recovery (startup-recovery.ts)
Runs job recovery during application startup to handle any interrupted jobs from previous runs.
```bash
# Run startup recovery (normal mode)
bun scripts/startup-recovery.ts
# Force recovery even if recent attempt was made
bun scripts/startup-recovery.ts --force
# Set custom timeout in milliseconds (default: 30000, minimum: 1000)
bun scripts/startup-recovery.ts --timeout=60000
# Using npm scripts
bun run startup-recovery
bun run startup-recovery-force
```
This script is automatically run by the Docker entrypoint during container startup. It ensures that any jobs interrupted by container restarts or application crashes are properly recovered or marked as failed.
## Deployment Scripts
### Docker Deployment

113
scripts/startup-recovery.ts Normal file
View File

@@ -0,0 +1,113 @@
#!/usr/bin/env bun
/**
* Startup recovery script
* This script runs job recovery before the application starts serving requests
* It ensures that any interrupted jobs from previous runs are properly handled
*
* Usage:
* bun scripts/startup-recovery.ts [--force] [--timeout=30000]
*
* Options:
* --force: Force recovery even if a recent attempt was made
* --timeout: Maximum time to wait for recovery (in milliseconds, default: 30000)
*/
import { initializeRecovery, hasJobsNeedingRecovery, getRecoveryStatus } from "../src/lib/recovery";
// Read CLI flags: everything after the runtime and script path.
const args = process.argv.slice(2);
const forceRecovery = args.some((arg) => arg === '--force');
const timeoutArg = args.find((arg) => arg.startsWith('--timeout='));
// Fall back to 30 seconds when no --timeout=<ms> flag was supplied.
const timeout = timeoutArg === undefined ? 30000 : parseInt(timeoutArg.split('=')[1], 10);

// Reject unparsable values and anything shorter than one second.
if (Number.isNaN(timeout) || timeout < 1000) {
  console.error("Error: Timeout must be at least 1000ms");
  process.exit(1);
}
/**
 * Runs job recovery and then terminates the process with the outcome.
 *
 * Flow:
 *  1. Ask `hasJobsNeedingRecovery()` whether anything needs attention.
 *  2. If so, run `initializeRecovery()`; a recent attempt is skipped unless
 *     `--force` was passed.
 *
 * Both steps share one deadline of `timeout` milliseconds, measured from the
 * moment the timer is armed at the top of this function.
 *
 * Exit codes: 0 when nothing needed recovery or recovery finished (including
 * the "completed with some failures" case); 1 on timeout or on any thrown
 * error. This function never returns normally.
 */
async function runStartupRecovery() {
  console.log('=== Gitea Mirror Startup Recovery ===');
  console.log(`Timeout: ${timeout}ms`);
  console.log(`Force recovery: ${forceRecovery}`);
  console.log('');

  const startTime = Date.now();

  // Sentinel compared by identity in the catch block. Matching on the message
  // text would misreport unrelated failures whose message happens to contain
  // the word "timeout" (e.g. a connection timeout) as a recovery timeout.
  const timeoutError = new Error(`Recovery timeout after ${timeout}ms`);
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
  let exitCode = 0;

  try {
    // Arm the overall deadline.
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(() => reject(timeoutError), timeout);
    });
    // The deadline may fire at a moment when no race is awaiting it; a no-op
    // handler keeps that from surfacing as an unhandled promise rejection.
    // Races below still observe the rejection.
    timeoutPromise.catch(() => undefined);

    // Check if recovery is needed first (also bounded by the deadline).
    console.log('Checking if recovery is needed...');
    const needsRecovery = await Promise.race([hasJobsNeedingRecovery(), timeoutPromise]);

    if (!needsRecovery) {
      console.log('✅ No jobs need recovery. Startup can proceed.');
    } else {
      console.log('⚠️ Jobs found that need recovery. Starting recovery process...');

      // Run recovery against the same deadline.
      const recoveryResult = await Promise.race([
        initializeRecovery({
          skipIfRecentAttempt: !forceRecovery,
          maxRetries: 3,
          retryDelay: 5000,
        }),
        timeoutPromise,
      ]);

      const duration = Date.now() - startTime;
      if (recoveryResult) {
        console.log(`✅ Recovery completed successfully in ${duration}ms`);
        console.log('Application startup can proceed.');
      } else {
        // A falsy result is treated as a partial failure, not a fatal one.
        console.log(`⚠️ Recovery completed with some failures in ${duration}ms`);
        console.log('Application startup can proceed, but some jobs may have failed.');
      }
    }
  } catch (error) {
    // Non-zero exit signals the problem; whether startup continues is up to
    // whatever invoked this script.
    exitCode = 1;
    const duration = Date.now() - startTime;

    if (error === timeoutError) {
      console.error(`❌ Recovery timed out after ${duration}ms`);
      console.error('Application will start anyway, but some jobs may remain interrupted.');
      // Get current recovery status
      const status = getRecoveryStatus();
      console.log('Recovery status:', status);
    } else {
      console.error(`❌ Recovery failed after ${duration}ms:`, error);
      console.error('Application will start anyway, but recovery was unsuccessful.');
    }
  } finally {
    // Disarm the deadline so it cannot fire after the outcome is decided.
    clearTimeout(timeoutHandle);
  }

  process.exit(exitCode);
}
// Termination signals paired with the exit code reported for each
// (the conventional 128 + signal number).
const signalExitCodes = [
  ['SIGINT', 130],
  ['SIGTERM', 143],
] as const;

// Log which signal arrived, then exit with its code.
for (const [signalName, exitCode] of signalExitCodes) {
  process.on(signalName, () => {
    console.log(`\n⚠ Recovery interrupted by ${signalName}`);
    process.exit(exitCode);
  });
}

// Kick off startup recovery
runStartupRecovery();

183
scripts/test-recovery.ts Normal file
View File

@@ -0,0 +1,183 @@
#!/usr/bin/env bun
/**
* Test script for the recovery system
* This script creates test jobs and verifies that the recovery system can handle them
*
* Usage:
* bun scripts/test-recovery.ts [--cleanup]
*
* Options:
* --cleanup: Clean up test jobs after testing
*/
import { db, mirrorJobs } from "../src/lib/db";
import { createMirrorJob } from "../src/lib/helpers";
import { initializeRecovery, hasJobsNeedingRecovery, getRecoveryStatus } from "../src/lib/recovery";
import { eq } from "drizzle-orm";
import { v4 as uuidv4 } from "uuid";
// Command-line flags: argv entries after the runtime and script path
const [, , ...args] = process.argv;
const cleanup = args.some((flag) => flag === '--cleanup');

// Identifiers attached to every job this script creates, and used to find
// those jobs again when inspecting or deleting them
const TEST_USER_ID = "test-user-recovery";
const TEST_BATCH_ID = "test-batch-recovery";
/**
 * Drives the recovery system end to end using the project's `db`.
 *
 * With `--cleanup` it only deletes the test jobs and returns. Otherwise it:
 *  1. creates two backdated in-progress jobs,
 *  2. checks that `hasJobsNeedingRecovery()` reports work to do,
 *  3. prints `getRecoveryStatus()`,
 *  4. runs `initializeRecovery()` with the recent-attempt skip disabled,
 *  5. re-checks `hasJobsNeedingRecovery()`,
 *  6. prints the final state of the test jobs.
 *
 * The process exits with code 1 when the jobs are not detected, when recovery
 * returns a falsy result, when jobs still need recovery afterwards, or when
 * any step throws. Success is only reported if none of those happen.
 */
async function runRecoveryTest() {
  console.log('=== Recovery System Test ===');
  console.log(`Cleanup mode: ${cleanup}`);
  console.log('');

  try {
    if (cleanup) {
      await cleanupTestJobs();
      return;
    }

    // Step 1: Create test jobs that simulate interrupted state
    console.log('Step 1: Creating test interrupted jobs...');
    await createTestInterruptedJobs();

    // Step 2: Check if recovery system detects them
    console.log('Step 2: Checking if recovery system detects interrupted jobs...');
    const needsRecovery = await hasJobsNeedingRecovery();
    console.log(`Jobs needing recovery: ${needsRecovery}`);
    if (!needsRecovery) {
      console.log('❌ Recovery system did not detect interrupted jobs');
      // The jobs from step 1 are still in the database at this point.
      console.log('Run with --cleanup to remove test jobs');
      // A failed check must not look like a passing run to the caller.
      process.exit(1);
    }

    // Step 3: Get recovery status
    console.log('Step 3: Getting recovery status...');
    const status = getRecoveryStatus();
    console.log('Recovery status:', status);

    // Step 4: Run recovery
    console.log('Step 4: Running recovery...');
    const recoveryResult = await initializeRecovery({
      skipIfRecentAttempt: false,
      maxRetries: 2,
      retryDelay: 2000,
    });
    console.log(`Recovery result: ${recoveryResult}`);

    // Step 5: Verify recovery completed
    console.log('Step 5: Verifying recovery completed...');
    const stillNeedsRecovery = await hasJobsNeedingRecovery();
    console.log(`Jobs still needing recovery: ${stillNeedsRecovery}`);

    // Step 6: Check final job states
    console.log('Step 6: Checking final job states...');
    await checkTestJobStates();

    console.log('');
    // Only claim success when recovery reported success and left nothing behind.
    if (!recoveryResult || stillNeedsRecovery) {
      console.log('❌ Recovery test failed: recovery reported failures or jobs still need recovery');
      console.log('Run with --cleanup to remove test jobs');
      process.exit(1);
    }

    console.log('✅ Recovery test completed successfully!');
    console.log('Run with --cleanup to remove test jobs');
  } catch (error) {
    console.error('❌ Recovery test failed:', error);
    process.exit(1);
  }
}
/**
 * Inserts two in-progress jobs (one "mirroring", one "syncing") for the test
 * user and batch, then rewrites their `startedAt` and `lastCheckpoint` to
 * 15 minutes in the past so they look interrupted.
 */
async function createTestInterruptedJobs() {
  const fixtures = [
    {
      repositoryId: uuidv4(),
      repositoryName: "test-repo-1",
      message: "Test mirror job 1",
      status: "mirroring" as const,
      jobType: "mirror" as const,
    },
    {
      repositoryId: uuidv4(),
      repositoryName: "test-repo-2",
      message: "Test sync job 2",
      status: "syncing" as const,
      jobType: "sync" as const,
    },
  ];

  for (const { repositoryId, repositoryName, message, status, jobType } of fixtures) {
    // Five items in total: the repository itself plus four random ids.
    const extraItemIds = Array.from({ length: 4 }, () => uuidv4());

    const createdJobId = await createMirrorJob({
      userId: TEST_USER_ID,
      repositoryId,
      repositoryName,
      message,
      status,
      jobType,
      batchId: TEST_BATCH_ID,
      totalItems: 5,
      itemIds: [repositoryId, ...extraItemIds],
      inProgress: true,
      skipDuplicateEvent: true,
    });

    // Backdate the job by 15 minutes so it appears stale.
    const backdated = new Date();
    backdated.setMinutes(backdated.getMinutes() - 15);
    const staleTimestamps = { startedAt: backdated, lastCheckpoint: backdated };

    await db.update(mirrorJobs).set(staleTimestamps).where(eq(mirrorJobs.id, createdJobId));

    console.log(`Created test job: ${createdJobId} (${repositoryName})`);
  }
}
/**
 * Prints every job belonging to the test user: id, status, in-progress flag,
 * message, and start/completion times ('never' when a time is unset).
 */
async function checkTestJobStates() {
  const rows = await db.select().from(mirrorJobs).where(eq(mirrorJobs.userId, TEST_USER_ID));

  console.log(`Found ${rows.length} test jobs:`);

  rows.forEach((row) => {
    const { id, status, inProgress, message, startedAt, completedAt } = row;
    console.log(`- Job ${id}: ${status} (inProgress: ${inProgress})`);
    console.log(` Message: ${message}`);
    console.log(` Started: ${startedAt ? new Date(startedAt).toISOString() : 'never'}`);
    console.log(` Completed: ${completedAt ? new Date(completedAt).toISOString() : 'never'}`);
    console.log('');
  });
}
/**
 * Deletes every row in `mirrorJobs` whose `userId` is the test user, i.e. the
 * jobs created by this script.
 */
async function cleanupTestJobs() {
  console.log('Cleaning up test jobs...');
  // The delete result was never read, so it is no longer bound to a local.
  await db.delete(mirrorJobs).where(eq(mirrorJobs.userId, TEST_USER_ID));
  console.log('✅ Test jobs cleaned up successfully');
}
// Termination signals paired with the exit code reported for each
// (the conventional 128 + signal number).
const signalExitCodes = [
  ['SIGINT', 130],
  ['SIGTERM', 143],
] as const;

// Log which signal arrived, then exit with its code.
for (const [signalName, exitCode] of signalExitCodes) {
  process.on(signalName, () => {
    console.log(`\n⚠ Test interrupted by ${signalName}`);
    process.exit(exitCode);
  });
}

// Kick off the test run
runRecoveryTest();