mirror of
https://github.com/RayLabsHQ/gitea-mirror.git
synced 2026-01-27 12:50:54 +03:00
feat: Implement comprehensive job recovery and resume process improvements
- Added a startup recovery script to handle interrupted jobs before application startup.
- Enhanced the recovery system with database connection validation and stale job cleanup.
- Improved the middleware to check for recovery needs and handle recovery during requests.
- Updated the health check endpoint to include recovery system status and metrics.
- Introduced test scripts for verifying recovery functionality and job state management.
- Enhanced logging and error handling throughout the recovery process.
This commit is contained in:
@@ -118,6 +118,27 @@ bun scripts/fix-interrupted-jobs.ts <userId>
|
||||
|
||||
Use this script if you're having trouble cleaning up activities due to "interrupted" jobs that won't delete.
|
||||
|
||||
### Startup Recovery (startup-recovery.ts)
|
||||
|
||||
Runs job recovery during application startup to handle any interrupted jobs from previous runs.
|
||||
|
||||
```bash
|
||||
# Run startup recovery (normal mode)
|
||||
bun scripts/startup-recovery.ts
|
||||
|
||||
# Force recovery even if recent attempt was made
|
||||
bun scripts/startup-recovery.ts --force
|
||||
|
||||
# Set custom timeout (default: 30000ms)
|
||||
bun scripts/startup-recovery.ts --timeout=60000
|
||||
|
||||
# Using npm scripts
|
||||
bun run startup-recovery
|
||||
bun run startup-recovery-force
|
||||
```
|
||||
|
||||
This script is automatically run by the Docker entrypoint during container startup. It ensures that any jobs interrupted by container restarts or application crashes are properly recovered or marked as failed.
|
||||
|
||||
## Deployment Scripts
|
||||
|
||||
### Docker Deployment
|
||||
|
||||
113
scripts/startup-recovery.ts
Normal file
113
scripts/startup-recovery.ts
Normal file
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* Startup recovery script
|
||||
* This script runs job recovery before the application starts serving requests
|
||||
* It ensures that any interrupted jobs from previous runs are properly handled
|
||||
*
|
||||
* Usage:
|
||||
* bun scripts/startup-recovery.ts [--force] [--timeout=30000]
|
||||
*
|
||||
* Options:
|
||||
* --force: Force recovery even if a recent attempt was made
|
||||
* --timeout: Maximum time to wait for recovery (in milliseconds, default: 30000, minimum: 1000)
|
||||
*/
|
||||
|
||||
import { initializeRecovery, hasJobsNeedingRecovery, getRecoveryStatus } from "../src/lib/recovery";
|
||||
|
||||
// Parse command line arguments
const DEFAULT_TIMEOUT_MS = 30000;
const MIN_TIMEOUT_MS = 1000;
// Largest delay setTimeout accepts (signed 32-bit); Node documents that larger
// values are replaced by a 1ms delay, which would make recovery time out at once.
const MAX_TIMEOUT_MS = 2147483647;

/**
 * Read the `--timeout=<ms>` option from a CLI argument list.
 *
 * @param argv - Arguments to scan; defaults to the current process arguments.
 * @returns The requested timeout in milliseconds, DEFAULT_TIMEOUT_MS when the
 *   option is absent, or NaN when the value is not a plain decimal integer.
 */
function parseTimeoutArg(argv = process.argv.slice(2)) {
  const prefix = '--timeout=';
  const option = argv.find((arg) => arg.startsWith(prefix));
  if (option === undefined) {
    return DEFAULT_TIMEOUT_MS;
  }

  const rawValue = option.slice(prefix.length);
  // parseInt would silently accept "5000abc" (as 5000) or "1e4" (as 1);
  // require the whole value to be digits instead.
  return /^\d+$/.test(rawValue) ? Number(rawValue) : Number.NaN;
}

const args = process.argv.slice(2);
const forceRecovery = args.includes('--force');
const timeout = parseTimeoutArg(args);

if (Number.isNaN(timeout) || timeout < MIN_TIMEOUT_MS) {
  console.error("Error: Timeout must be at least 1000ms");
  process.exit(1);
}

if (timeout > MAX_TIMEOUT_MS) {
  console.error(`Error: Timeout must be at most ${MAX_TIMEOUT_MS}ms`);
  process.exit(1);
}
|
||||
|
||||
/**
 * Run job recovery once and terminate the process with a status code.
 *
 * Steps: check whether any jobs need recovery, run `initializeRecovery` if so,
 * and report the outcome. Both steps share a single deadline of `timeout`
 * milliseconds measured from the start of this function.
 *
 * Exit codes: 0 when nothing needed recovery or recovery finished (with or
 * without per-job failures); 1 when the deadline expired or an error was thrown.
 * Every path ends in `process.exit`, so the returned promise normally never
 * settles.
 */
async function runStartupRecovery() {
  console.log('=== Gitea Mirror Startup Recovery ===');
  console.log(`Timeout: ${timeout}ms`);
  console.log(`Force recovery: ${forceRecovery}`);
  console.log('');

  const startTime = Date.now();

  // Sentinel used to recognise our own deadline in the catch block. Comparing
  // by identity avoids treating unrelated errors whose message happens to
  // contain "timeout" (e.g. a database connection timeout) as a deadline expiry.
  const timeoutError = new Error(`Recovery timeout after ${timeout}ms`);

  try {
    // Set up timeout
    const timeoutPromise = new Promise<never>((_, reject) => {
      setTimeout(() => reject(timeoutError), timeout);
    });

    // Check if recovery is needed first. The check is raced against the
    // deadline too: otherwise a slow check is unbounded, and the timer could
    // reject while nothing is listening (an unhandled rejection).
    console.log('Checking if recovery is needed...');
    const needsRecovery = await Promise.race([hasJobsNeedingRecovery(), timeoutPromise]);

    if (!needsRecovery) {
      console.log('✅ No jobs need recovery. Startup can proceed.');
      process.exit(0);
    }

    console.log('⚠️ Jobs found that need recovery. Starting recovery process...');

    // Run recovery with timeout
    const recoveryPromise = initializeRecovery({
      skipIfRecentAttempt: !forceRecovery,
      maxRetries: 3,
      retryDelay: 5000,
    });

    const recoveryResult = await Promise.race([recoveryPromise, timeoutPromise]);
    const duration = Date.now() - startTime;

    if (recoveryResult) {
      console.log(`✅ Recovery completed successfully in ${duration}ms`);
      console.log('Application startup can proceed.');
    } else {
      console.log(`⚠️ Recovery completed with some failures in ${duration}ms`);
      console.log('Application startup can proceed, but some jobs may have failed.');
    }

    // A falsy result is still reported with exit code 0.
    process.exit(0);
  } catch (error) {
    const duration = Date.now() - startTime;

    if (error === timeoutError) {
      console.error(`❌ Recovery timed out after ${duration}ms`);
      console.error('Application will start anyway, but some jobs may remain interrupted.');

      // Get current recovery status
      const status = getRecoveryStatus();
      console.log('Recovery status:', status);
    } else {
      console.error(`❌ Recovery failed after ${duration}ms:`, error);
      console.error('Application will start anyway, but recovery was unsuccessful.');
    }

    // Non-zero exit code reports the problem; whether the application still
    // starts is decided by whatever invoked this script, not here.
    process.exit(1);
  }
}
|
||||
|
||||
// Handle process signals gracefully: log which signal arrived, then exit with
// the matching status code (130 for SIGINT, 143 for SIGTERM).
for (const [signalName, exitCode] of Object.entries({ SIGINT: 130, SIGTERM: 143 })) {
  process.on(signalName, () => {
    console.log(`\n⚠️ Recovery interrupted by ${signalName}`);
    process.exit(exitCode);
  });
}
|
||||
|
||||
// Run the startup recovery. runStartupRecovery handles its own errors, but its
// catch block still calls code that could throw; the terminal handler turns
// such an escape into a logged failure instead of an unhandled rejection.
runStartupRecovery().catch((error) => {
  console.error('❌ Unexpected error during startup recovery:', error);
  process.exit(1);
});
|
||||
183
scripts/test-recovery.ts
Normal file
183
scripts/test-recovery.ts
Normal file
@@ -0,0 +1,183 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* Test script for the recovery system
|
||||
* This script creates test jobs and verifies that the recovery system can handle them
|
||||
*
|
||||
* Usage:
|
||||
* bun scripts/test-recovery.ts [--cleanup]
|
||||
*
|
||||
* Options:
|
||||
* --cleanup: Clean up test jobs after testing
|
||||
*/
|
||||
|
||||
import { db, mirrorJobs } from "../src/lib/db";
|
||||
import { createMirrorJob } from "../src/lib/helpers";
|
||||
import { initializeRecovery, hasJobsNeedingRecovery, getRecoveryStatus } from "../src/lib/recovery";
|
||||
import { eq } from "drizzle-orm";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
|
||||
// Command line flags: `--cleanup` switches the script to cleanup-only mode.
const args = process.argv.slice(2);
const cleanup = args.indexOf('--cleanup') !== -1;

// Test configuration: identifiers stamped on every job this script creates.
// TEST_USER_ID is also the key used to look the jobs up and delete them again.
const TEST_USER_ID = "test-user-recovery";
const TEST_BATCH_ID = "test-batch-recovery";
|
||||
|
||||
/**
 * Exercise the recovery system end to end and report the outcome.
 *
 * With `--cleanup` it only deletes the test jobs. Otherwise it creates test
 * jobs, checks that they are detected, runs recovery, re-checks, and prints
 * the final job states.
 *
 * The process exit code is set to 1 when the jobs are not detected or when
 * jobs still need recovery afterwards, so callers can tell a failed run from
 * a passing one. A thrown error exits immediately with code 1.
 */
async function runRecoveryTest() {
  console.log('=== Recovery System Test ===');
  console.log(`Cleanup mode: ${cleanup}`);
  console.log('');

  try {
    if (cleanup) {
      await cleanupTestJobs();
      return;
    }

    // Step 1: Create test jobs that simulate interrupted state
    console.log('Step 1: Creating test interrupted jobs...');
    await createTestInterruptedJobs();

    // Step 2: Check if recovery system detects them
    console.log('Step 2: Checking if recovery system detects interrupted jobs...');
    const needsRecovery = await hasJobsNeedingRecovery();
    console.log(`Jobs needing recovery: ${needsRecovery}`);

    if (!needsRecovery) {
      console.log('❌ Recovery system did not detect interrupted jobs');
      // Report the failure through the exit status instead of ending with 0.
      process.exitCode = 1;
      return;
    }

    // Step 3: Get recovery status
    console.log('Step 3: Getting recovery status...');
    const status = getRecoveryStatus();
    console.log('Recovery status:', status);

    // Step 4: Run recovery
    console.log('Step 4: Running recovery...');
    const recoveryResult = await initializeRecovery({
      skipIfRecentAttempt: false,
      maxRetries: 2,
      retryDelay: 2000,
    });

    console.log(`Recovery result: ${recoveryResult}`);

    // Step 5: Verify recovery completed
    console.log('Step 5: Verifying recovery completed...');
    const stillNeedsRecovery = await hasJobsNeedingRecovery();
    console.log(`Jobs still needing recovery: ${stillNeedsRecovery}`);

    // Step 6: Check final job states
    console.log('Step 6: Checking final job states...');
    await checkTestJobStates();

    console.log('');

    if (stillNeedsRecovery) {
      // Step 5 is the actual verification; do not claim success when it fails.
      console.log('❌ Recovery test failed: jobs still need recovery after the recovery run');
      process.exitCode = 1;
    } else {
      console.log('✅ Recovery test completed successfully!');
    }

    console.log('Run with --cleanup to remove test jobs');
  } catch (error) {
    console.error('❌ Recovery test failed:', error);
    process.exit(1);
  }
}
|
||||
|
||||
/**
 * Insert two in-progress test jobs (one "mirroring", one "syncing") for
 * TEST_USER_ID, then backdate each job's start and checkpoint times by
 * 15 minutes so that it looks interrupted.
 */
async function createTestInterruptedJobs() {
  const jobSpecs = [
    {
      repositoryId: uuidv4(),
      repositoryName: "test-repo-1",
      message: "Test mirror job 1",
      status: "mirroring" as const,
      jobType: "mirror" as const,
    },
    {
      repositoryId: uuidv4(),
      repositoryName: "test-repo-2",
      message: "Test sync job 2",
      status: "syncing" as const,
      jobType: "sync" as const,
    },
  ];

  for (const { repositoryId, repositoryName, message, status, jobType } of jobSpecs) {
    // Each job claims five items: its own repository plus four random ids.
    const fillerItemIds = Array.from({ length: 4 }, () => uuidv4());

    const createdJobId = await createMirrorJob({
      userId: TEST_USER_ID,
      repositoryId,
      repositoryName,
      message,
      status,
      jobType,
      batchId: TEST_BATCH_ID,
      totalItems: 5,
      itemIds: [repositoryId, ...fillerItemIds],
      inProgress: true,
      skipDuplicateEvent: true,
    });

    // Manually age the job: move both timestamps 15 minutes into the past.
    const staleTime = new Date();
    staleTime.setMinutes(staleTime.getMinutes() - 15);

    const backdatedFields = {
      startedAt: staleTime,
      lastCheckpoint: staleTime,
    };

    await db
      .update(mirrorJobs)
      .set(backdatedFields)
      .where(eq(mirrorJobs.id, createdJobId));

    console.log(`Created test job: ${createdJobId} (${repositoryName})`);
  }
}
|
||||
|
||||
/**
 * Print every job stored for TEST_USER_ID: id, status, in-progress flag,
 * message, and start/completion times (or "never" when a time is unset).
 */
async function checkTestJobStates() {
  const ownedByTestUser = eq(mirrorJobs.userId, TEST_USER_ID);
  const rows = await db.select().from(mirrorJobs).where(ownedByTestUser);

  console.log(`Found ${rows.length} test jobs:`);

  for (const { id, status, inProgress, message, startedAt, completedAt } of rows) {
    console.log(`- Job ${id}: ${status} (inProgress: ${inProgress})`);
    console.log(` Message: ${message}`);
    console.log(` Started: ${startedAt ? new Date(startedAt).toISOString() : 'never'}`);
    console.log(` Completed: ${completedAt ? new Date(completedAt).toISOString() : 'never'}`);
    console.log('');
  }
}
|
||||
|
||||
/**
 * Delete every job row that belongs to TEST_USER_ID.
 *
 * The delete result is not inspected, so the success message is printed
 * whenever the query does not throw, including when no rows matched.
 */
async function cleanupTestJobs() {
  console.log('Cleaning up test jobs...');

  await db
    .delete(mirrorJobs)
    .where(eq(mirrorJobs.userId, TEST_USER_ID));

  console.log('✅ Test jobs cleaned up successfully');
}
|
||||
|
||||
// Handle process signals gracefully: log which signal arrived, then exit with
// the matching status code (130 for SIGINT, 143 for SIGTERM).
for (const [signalName, exitCode] of Object.entries({ SIGINT: 130, SIGTERM: 143 })) {
  process.on(signalName, () => {
    console.log(`\n⚠️ Test interrupted by ${signalName}`);
    process.exit(exitCode);
  });
}
|
||||
|
||||
// Run the test. The promise is deliberately not awaited: runRecoveryTest
// catches its own errors and exits with code 1 on failure.
void runRecoveryTest();
|
||||
Reference in New Issue
Block a user