Mirror of https://github.com/RayLabsHQ/gitea-mirror.git (synced 2025-12-13 23:16:45 +03:00)
feat: Implement comprehensive job recovery and resume process improvements
- Added a startup recovery script to handle interrupted jobs before application startup.
- Enhanced recovery system with database connection validation and stale job cleanup.
- Improved middleware to check for recovery needs and handle recovery during requests.
- Updated health check endpoint to include recovery system status and metrics.
- Introduced test scripts for verifying recovery functionality and job state management.
- Enhanced logging and error handling throughout the recovery process.
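The startup recovery script itself is not part of the excerpt below. As a rough sketch of how that step could be wired up (the file name, import path, and runner are assumptions, not taken from this commit), it would call the exported recovery entry points once before the server starts accepting traffic:

```ts
// scripts/startup-recovery.ts — hypothetical wiring, not part of this diff.
// Runs one recovery pass before the application starts serving requests,
// so the middleware fallback only has to act when this step was skipped.
import { initializeRecovery, hasJobsNeedingRecovery } from "../src/lib/recovery";

async function main(): Promise<void> {
  // Only spend startup time on recovery when interrupted jobs actually exist.
  if (!(await hasJobsNeedingRecovery())) {
    console.log("Startup recovery: nothing to do.");
    return;
  }

  // Retry a few times; a cold database may not be reachable immediately.
  const ok = await initializeRecovery({ maxRetries: 3, retryDelay: 5000 });
  if (!ok) {
    // Do not block startup: the middleware fallback will retry later.
    console.warn("Startup recovery did not complete; middleware will retry.");
  }
}

main().catch((error) => {
  console.error("Startup recovery failed:", error);
  process.exitCode = 1;
});
```

Running this (for example with bun or tsx) before launching the server keeps the middleware path as a fallback rather than the primary recovery mechanism.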
@@ -1,5 +1,6 @@
import type { RepoStatus } from "@/types/Repository";
import { db, mirrorJobs } from "./db";
import { eq, and, or, lt, isNull } from "drizzle-orm";
import { v4 as uuidv4 } from "uuid";
import { publishEvent } from "./events";

@@ -120,7 +121,7 @@ export async function updateMirrorJobProgress({
const [job] = await db
.select()
.from(mirrorJobs)
.where(mirrorJobs.id === jobId);
.where(eq(mirrorJobs.id, jobId));

if (!job) {
throw new Error(`Mirror job with ID ${jobId} not found`);
@@ -170,7 +171,7 @@ export async function updateMirrorJobProgress({
await db
.update(mirrorJobs)
.set(updates)
.where(mirrorJobs.id === jobId);
.where(eq(mirrorJobs.id, jobId));

// Publish the event with deduplication
const updatedJob = {
@@ -203,7 +204,7 @@ export async function updateMirrorJobProgress({
}

/**
* Finds interrupted jobs that need to be resumed
* Finds interrupted jobs that need to be resumed with enhanced criteria
*/
export async function findInterruptedJobs() {
try {
@@ -211,15 +212,35 @@ export async function findInterruptedJobs() {
const cutoffTime = new Date();
cutoffTime.setMinutes(cutoffTime.getMinutes() - 10); // Consider jobs inactive after 10 minutes without updates

// Also check for jobs that have been running for too long (over 2 hours)
const staleCutoffTime = new Date();
staleCutoffTime.setHours(staleCutoffTime.getHours() - 2);

const interruptedJobs = await db
.select()
.from(mirrorJobs)
.where(
mirrorJobs.inProgress === true &&
(mirrorJobs.lastCheckpoint === null ||
mirrorJobs.lastCheckpoint < cutoffTime)
and(
eq(mirrorJobs.inProgress, true),
or(
// Jobs with no recent checkpoint
or(isNull(mirrorJobs.lastCheckpoint), lt(mirrorJobs.lastCheckpoint, cutoffTime)),
// Jobs that started too long ago (likely stale)
lt(mirrorJobs.startedAt, staleCutoffTime)
)
)
);

// Log details about found jobs for debugging
if (interruptedJobs.length > 0) {
console.log(`Found ${interruptedJobs.length} interrupted jobs:`);
interruptedJobs.forEach(job => {
const lastCheckpoint = job.lastCheckpoint ? new Date(job.lastCheckpoint).toISOString() : 'never';
const startedAt = job.startedAt ? new Date(job.startedAt).toISOString() : 'unknown';
console.log(`- Job ${job.id}: ${job.jobType} (started: ${startedAt}, last checkpoint: ${lastCheckpoint})`);
});
}

return interruptedJobs;
} catch (error) {
console.error("Error finding interrupted jobs:", error);
@@ -4,101 +4,274 @@
*/

import { findInterruptedJobs, resumeInterruptedJob } from './helpers';
import { db, repositories, organizations } from './db';
import { eq } from 'drizzle-orm';
import { db, repositories, organizations, mirrorJobs } from './db';
import { eq, and, lt } from 'drizzle-orm';
import { mirrorGithubRepoToGitea, mirrorGitHubOrgRepoToGiteaOrg, syncGiteaRepo } from './gitea';
import { createGitHubClient } from './github';
import { processWithResilience } from './utils/concurrency';
import { repositoryVisibilityEnum, repoStatusEnum } from '@/types/Repository';
import type { Repository } from './db/schema';

// Recovery state tracking
let recoveryInProgress = false;
let lastRecoveryAttempt: Date | null = null;

/**
* Initialize the recovery system
* This should be called when the application starts
* Validates database connection before attempting recovery
*/
export async function initializeRecovery() {
console.log('Initializing recovery system...');

async function validateDatabaseConnection(): Promise<boolean> {
try {
// Find interrupted jobs
const interruptedJobs = await findInterruptedJobs();

if (interruptedJobs.length === 0) {
console.log('No interrupted jobs found.');
return;
}

console.log(`Found ${interruptedJobs.length} interrupted jobs. Starting recovery...`);

// Process each interrupted job
for (const job of interruptedJobs) {
const resumeData = await resumeInterruptedJob(job);

if (!resumeData) {
console.log(`Job ${job.id} could not be resumed.`);
continue;
}

const { job: updatedJob, remainingItemIds } = resumeData;

// Handle different job types
switch (updatedJob.jobType) {
case 'mirror':
await recoverMirrorJob(updatedJob, remainingItemIds);
break;
case 'sync':
await recoverSyncJob(updatedJob, remainingItemIds);
break;
case 'retry':
await recoverRetryJob(updatedJob, remainingItemIds);
break;
default:
console.log(`Unknown job type: ${updatedJob.jobType}`);
}
}

console.log('Recovery process completed.');
// Simple query to test database connectivity
await db.select().from(mirrorJobs).limit(1);
return true;
} catch (error) {
console.error('Error during recovery process:', error);
console.error('Database connection validation failed:', error);
return false;
}
}

/**
* Recover a mirror job
* Cleans up stale jobs that are too old to recover
*/
async function cleanupStaleJobs(): Promise<void> {
try {
const staleThreshold = new Date();
staleThreshold.setHours(staleThreshold.getHours() - 24); // Jobs older than 24 hours

const staleJobs = await db
.select()
.from(mirrorJobs)
.where(
and(
eq(mirrorJobs.inProgress, true),
lt(mirrorJobs.startedAt, staleThreshold)
)
);

if (staleJobs.length > 0) {
console.log(`Found ${staleJobs.length} stale jobs to clean up`);

// Mark stale jobs as failed
await db
.update(mirrorJobs)
.set({
inProgress: false,
completedAt: new Date(),
status: "failed",
message: "Job marked as failed due to being stale (older than 24 hours)"
})
.where(
and(
eq(mirrorJobs.inProgress, true),
lt(mirrorJobs.startedAt, staleThreshold)
)
);

console.log(`Cleaned up ${staleJobs.length} stale jobs`);
}
} catch (error) {
console.error('Error cleaning up stale jobs:', error);
}
}

/**
* Initialize the recovery system with enhanced error handling and resilience
* This should be called when the application starts
*/
export async function initializeRecovery(options: {
maxRetries?: number;
retryDelay?: number;
skipIfRecentAttempt?: boolean;
} = {}): Promise<boolean> {
const { maxRetries = 3, retryDelay = 5000, skipIfRecentAttempt = true } = options;

// Prevent concurrent recovery attempts
if (recoveryInProgress) {
console.log('Recovery already in progress, skipping...');
return false;
}

// Skip if recent attempt (within last 5 minutes) unless forced
if (skipIfRecentAttempt && lastRecoveryAttempt) {
const timeSinceLastAttempt = Date.now() - lastRecoveryAttempt.getTime();
if (timeSinceLastAttempt < 5 * 60 * 1000) {
console.log('Recent recovery attempt detected, skipping...');
return false;
}
}

recoveryInProgress = true;
lastRecoveryAttempt = new Date();

console.log('Initializing recovery system...');

let attempt = 0;
while (attempt < maxRetries) {
try {
attempt++;
console.log(`Recovery attempt ${attempt}/${maxRetries}`);

// Validate database connection first
const dbConnected = await validateDatabaseConnection();
if (!dbConnected) {
throw new Error('Database connection validation failed');
}

// Clean up stale jobs first
await cleanupStaleJobs();

// Find interrupted jobs
const interruptedJobs = await findInterruptedJobs();

if (interruptedJobs.length === 0) {
console.log('No interrupted jobs found.');
recoveryInProgress = false;
return true;
}

console.log(`Found ${interruptedJobs.length} interrupted jobs. Starting recovery...`);

// Process each interrupted job with individual error handling
let successCount = 0;
let failureCount = 0;

for (const job of interruptedJobs) {
try {
const resumeData = await resumeInterruptedJob(job);

if (!resumeData) {
console.log(`Job ${job.id} could not be resumed.`);
failureCount++;
continue;
}

const { job: updatedJob, remainingItemIds } = resumeData;

// Handle different job types
switch (updatedJob.jobType) {
case 'mirror':
await recoverMirrorJob(updatedJob, remainingItemIds);
break;
case 'sync':
await recoverSyncJob(updatedJob, remainingItemIds);
break;
case 'retry':
await recoverRetryJob(updatedJob, remainingItemIds);
break;
default:
console.log(`Unknown job type: ${updatedJob.jobType}`);
failureCount++;
continue;
}

successCount++;
} catch (jobError) {
console.error(`Error recovering individual job ${job.id}:`, jobError);
failureCount++;

// Mark the job as failed if recovery fails
try {
await db
.update(mirrorJobs)
.set({
inProgress: false,
completedAt: new Date(),
status: "failed",
message: `Job recovery failed: ${jobError instanceof Error ? jobError.message : String(jobError)}`
})
.where(eq(mirrorJobs.id, job.id));
} catch (updateError) {
console.error(`Failed to mark job ${job.id} as failed:`, updateError);
}
}
}

console.log(`Recovery process completed. Success: ${successCount}, Failures: ${failureCount}`);
recoveryInProgress = false;
return true;

} catch (error) {
console.error(`Recovery attempt ${attempt} failed:`, error);

if (attempt < maxRetries) {
console.log(`Retrying in ${retryDelay}ms...`);
await new Promise(resolve => setTimeout(resolve, retryDelay));
} else {
console.error('All recovery attempts failed');
recoveryInProgress = false;
return false;
}
}
}

recoveryInProgress = false;
return false;
}

/**
* Recover a mirror job with enhanced error handling
*/
async function recoverMirrorJob(job: any, remainingItemIds: string[]) {
console.log(`Recovering mirror job ${job.id} with ${remainingItemIds.length} remaining items`);

try {
// Get the config for this user
const [config] = await db
// Get the config for this user with better error handling
const configs = await db
.select()
.from(repositories)
.where(eq(repositories.userId, job.userId))
.limit(1);

if (!config || !config.configId) {
throw new Error('Config not found for user');

if (configs.length === 0) {
throw new Error(`No configuration found for user ${job.userId}`);
}

// Get repositories to process

const config = configs[0];
if (!config.configId) {
throw new Error(`Configuration missing configId for user ${job.userId}`);
}

// Get repositories to process with validation
const repos = await db
.select()
.from(repositories)
.where(eq(repositories.id, remainingItemIds));

if (repos.length === 0) {
throw new Error('No repositories found for the remaining item IDs');
console.warn(`No repositories found for remaining item IDs: ${remainingItemIds.join(', ')}`);
// Mark job as completed since there's nothing to process
await db
.update(mirrorJobs)
.set({
inProgress: false,
completedAt: new Date(),
status: "mirrored",
message: "Job completed - no repositories found to process"
})
.where(eq(mirrorJobs.id, job.id));
return;
}

// Create GitHub client
const octokit = createGitHubClient(config.githubConfig.token);

// Process repositories with resilience

console.log(`Found ${repos.length} repositories to process for recovery`);

// Validate GitHub configuration before creating client
if (!config.githubConfig?.token) {
throw new Error('GitHub token not found in configuration');
}

// Create GitHub client with error handling
let octokit;
try {
octokit = createGitHubClient(config.githubConfig.token);
} catch (error) {
throw new Error(`Failed to create GitHub client: ${error instanceof Error ? error.message : String(error)}`);
}

// Process repositories with resilience and reduced concurrency for recovery
await processWithResilience(
repos,
async (repo) => {
// Prepare repository data
// Prepare repository data with validation
const repoData = {
...repo,
status: repoStatusEnum.parse("imported"),
@@ -106,10 +279,10 @@ async function recoverMirrorJob(job: any, remainingItemIds: string[]) {
lastMirrored: repo.lastMirrored ?? undefined,
errorMessage: repo.errorMessage ?? undefined,
forkedFrom: repo.forkedFrom ?? undefined,
visibility: repositoryVisibilityEnum.parse(repo.visibility),
visibility: repositoryVisibilityEnum.parse(repo.visibility || "public"),
mirroredLocation: repo.mirroredLocation || "",
};

// Mirror the repository based on whether it's in an organization
if (repo.organization && config.githubConfig.preserveOrgStructure) {
await mirrorGitHubOrgRepoToGiteaOrg({
@@ -125,7 +298,7 @@ async function recoverMirrorJob(job: any, remainingItemIds: string[]) {
config,
});
}

return repo;
},
{
@@ -134,66 +307,102 @@ async function recoverMirrorJob(job: any, remainingItemIds: string[]) {
getItemId: (repo) => repo.id,
getItemName: (repo) => repo.name,
resumeFromJobId: job.id,
concurrencyLimit: 3,
maxRetries: 2,
retryDelay: 2000,
concurrencyLimit: 2, // Reduced concurrency for recovery to be more stable
maxRetries: 3, // Increased retries for recovery
retryDelay: 3000, // Longer delay for recovery
}
);

console.log(`Successfully recovered mirror job ${job.id}`);
} catch (error) {
console.error(`Error recovering mirror job ${job.id}:`, error);

// Mark the job as failed
try {
await db
.update(mirrorJobs)
.set({
inProgress: false,
completedAt: new Date(),
status: "failed",
message: `Mirror job recovery failed: ${error instanceof Error ? error.message : String(error)}`
})
.where(eq(mirrorJobs.id, job.id));
} catch (updateError) {
console.error(`Failed to mark mirror job ${job.id} as failed:`, updateError);
}

throw error; // Re-throw to be handled by the caller
}
}

/**
* Recover a sync job
* Recover a sync job with enhanced error handling
*/
async function recoverSyncJob(job: any, remainingItemIds: string[]) {
// Implementation similar to recoverMirrorJob but for sync operations
console.log(`Recovering sync job ${job.id} with ${remainingItemIds.length} remaining items`);

try {
// Get the config for this user
const [config] = await db
// Get the config for this user with better error handling
const configs = await db
.select()
.from(repositories)
.where(eq(repositories.userId, job.userId))
.limit(1);

if (!config || !config.configId) {
throw new Error('Config not found for user');

if (configs.length === 0) {
throw new Error(`No configuration found for user ${job.userId}`);
}

// Get repositories to process

const config = configs[0];
if (!config.configId) {
throw new Error(`Configuration missing configId for user ${job.userId}`);
}

// Get repositories to process with validation
const repos = await db
.select()
.from(repositories)
.where(eq(repositories.id, remainingItemIds));

if (repos.length === 0) {
throw new Error('No repositories found for the remaining item IDs');
console.warn(`No repositories found for remaining item IDs: ${remainingItemIds.join(', ')}`);
// Mark job as completed since there's nothing to process
await db
.update(mirrorJobs)
.set({
inProgress: false,
completedAt: new Date(),
status: "mirrored",
message: "Job completed - no repositories found to process"
})
.where(eq(mirrorJobs.id, job.id));
return;
}

// Process repositories with resilience

console.log(`Found ${repos.length} repositories to process for sync recovery`);

// Process repositories with resilience and reduced concurrency for recovery
await processWithResilience(
repos,
async (repo) => {
// Prepare repository data
// Prepare repository data with validation
const repoData = {
...repo,
status: repoStatusEnum.parse(repo.status),
status: repoStatusEnum.parse(repo.status || "imported"),
organization: repo.organization ?? undefined,
lastMirrored: repo.lastMirrored ?? undefined,
errorMessage: repo.errorMessage ?? undefined,
forkedFrom: repo.forkedFrom ?? undefined,
visibility: repositoryVisibilityEnum.parse(repo.visibility),
visibility: repositoryVisibilityEnum.parse(repo.visibility || "public"),
};

// Sync the repository
await syncGiteaRepo({
config,
repository: repoData,
});

return repo;
},
{
@@ -202,23 +411,94 @@ async function recoverSyncJob(job: any, remainingItemIds: string[]) {
getItemId: (repo) => repo.id,
getItemName: (repo) => repo.name,
resumeFromJobId: job.id,
concurrencyLimit: 5,
maxRetries: 2,
retryDelay: 2000,
concurrencyLimit: 3, // Reduced concurrency for recovery
maxRetries: 3, // Increased retries for recovery
retryDelay: 3000, // Longer delay for recovery
}
);

console.log(`Successfully recovered sync job ${job.id}`);
} catch (error) {
console.error(`Error recovering sync job ${job.id}:`, error);

// Mark the job as failed
try {
await db
.update(mirrorJobs)
.set({
inProgress: false,
completedAt: new Date(),
status: "failed",
message: `Sync job recovery failed: ${error instanceof Error ? error.message : String(error)}`
})
.where(eq(mirrorJobs.id, job.id));
} catch (updateError) {
console.error(`Failed to mark sync job ${job.id} as failed:`, updateError);
}

throw error; // Re-throw to be handled by the caller
}
}

/**
* Recover a retry job
* Recover a retry job with enhanced error handling
*/
async function recoverRetryJob(job: any, remainingItemIds: string[]) {
// Implementation similar to recoverMirrorJob but for retry operations
console.log(`Recovering retry job ${job.id} with ${remainingItemIds.length} remaining items`);

// This would be similar to recoverMirrorJob but with retry-specific logic
console.log('Retry job recovery not yet implemented');

try {
// For now, retry jobs are treated similarly to mirror jobs
// In the future, this could have specific retry logic
await recoverMirrorJob(job, remainingItemIds);
console.log(`Successfully recovered retry job ${job.id}`);
} catch (error) {
console.error(`Error recovering retry job ${job.id}:`, error);

// Mark the job as failed
try {
await db
.update(mirrorJobs)
.set({
inProgress: false,
completedAt: new Date(),
status: "failed",
message: `Retry job recovery failed: ${error instanceof Error ? error.message : String(error)}`
})
.where(eq(mirrorJobs.id, job.id));
} catch (updateError) {
console.error(`Failed to mark retry job ${job.id} as failed:`, updateError);
}

throw error; // Re-throw to be handled by the caller
}
}

/**
* Get recovery system status
*/
export function getRecoveryStatus() {
return {
inProgress: recoveryInProgress,
lastAttempt: lastRecoveryAttempt,
};
}

/**
* Force recovery to run (bypassing recent attempt check)
*/
export async function forceRecovery(): Promise<boolean> {
return initializeRecovery({ skipIfRecentAttempt: false });
}

/**
* Check if there are any jobs that need recovery
*/
export async function hasJobsNeedingRecovery(): Promise<boolean> {
try {
const interruptedJobs = await findInterruptedJobs();
return interruptedJobs.length > 0;
} catch (error) {
console.error('Error checking for jobs needing recovery:', error);
return false;
}
}

@@ -1,22 +1,58 @@
import { defineMiddleware } from 'astro:middleware';
import { initializeRecovery } from './lib/recovery';
import { initializeRecovery, hasJobsNeedingRecovery, getRecoveryStatus } from './lib/recovery';

// Flag to track if recovery has been initialized
let recoveryInitialized = false;
let recoveryAttempted = false;

export const onRequest = defineMiddleware(async (context, next) => {
// Initialize recovery system only once when the server starts
if (!recoveryInitialized) {
console.log('Initializing recovery system from middleware...');
// This is a fallback in case the startup script didn't run
if (!recoveryInitialized && !recoveryAttempted) {
recoveryAttempted = true;

try {
await initializeRecovery();
console.log('Recovery system initialized successfully');
// Check if recovery is actually needed before attempting
const needsRecovery = await hasJobsNeedingRecovery();

if (needsRecovery) {
console.log('⚠️ Middleware detected jobs needing recovery (startup script may not have run)');
console.log('Attempting recovery from middleware...');

// Run recovery with a shorter timeout since this is during request handling
const recoveryResult = await Promise.race([
initializeRecovery({
skipIfRecentAttempt: true,
maxRetries: 2,
retryDelay: 3000,
}),
new Promise<boolean>((_, reject) => {
setTimeout(() => reject(new Error('Middleware recovery timeout')), 15000);
})
]);

if (recoveryResult) {
console.log('✅ Middleware recovery completed successfully');
} else {
console.log('⚠️ Middleware recovery completed with some issues');
}
} else {
console.log('✅ No recovery needed (startup script likely handled it)');
}

recoveryInitialized = true;
} catch (error) {
console.error('Error initializing recovery system:', error);
console.error('⚠️ Middleware recovery failed or timed out:', error);
console.log('Application will continue, but some jobs may remain interrupted');

// Log recovery status for debugging
const status = getRecoveryStatus();
console.log('Recovery status:', status);

recoveryInitialized = true; // Mark as attempted to avoid retries
}
recoveryInitialized = true;
}

// Continue with the request
return next();
});

@@ -2,6 +2,7 @@ import type { APIRoute } from "astro";
import { jsonResponse } from "@/lib/utils";
import { db } from "@/lib/db";
import { ENV } from "@/lib/config";
import { getRecoveryStatus, hasJobsNeedingRecovery } from "@/lib/recovery";
import os from "os";
import axios from "axios";

@@ -38,9 +39,20 @@ export const GET: APIRoute = async () => {
const currentVersion = process.env.npm_package_version || "unknown";
const latestVersion = await checkLatestVersion();

// Get recovery system status
const recoveryStatus = await getRecoverySystemStatus();

// Determine overall health status
let overallStatus = "ok";
if (!dbStatus.connected) {
overallStatus = "error";
} else if (recoveryStatus.jobsNeedingRecovery > 0 && !recoveryStatus.inProgress) {
overallStatus = "degraded";
}

// Build response
const healthData = {
status: "ok",
status: overallStatus,
timestamp: new Date().toISOString(),
version: currentVersion,
latestVersion: latestVersion,
@@ -48,6 +60,7 @@ export const GET: APIRoute = async () => {
currentVersion !== "unknown" &&
latestVersion !== currentVersion,
database: dbStatus,
recovery: recoveryStatus,
system: systemInfo,
};

@@ -94,6 +107,36 @@ async function checkDatabaseConnection() {
}
}

/**
* Get recovery system status
*/
async function getRecoverySystemStatus() {
try {
const recoveryStatus = getRecoveryStatus();
const needsRecovery = await hasJobsNeedingRecovery();

return {
status: needsRecovery ? 'jobs-pending' : 'healthy',
inProgress: recoveryStatus.inProgress,
lastAttempt: recoveryStatus.lastAttempt?.toISOString() || null,
jobsNeedingRecovery: needsRecovery ? 1 : 0, // Simplified count for health check
message: needsRecovery
? 'Jobs found that need recovery'
: 'No jobs need recovery',
};
} catch (error) {
console.error('Recovery system status check failed:', error);

return {
status: 'error',
inProgress: false,
lastAttempt: null,
jobsNeedingRecovery: -1,
message: error instanceof Error ? error.message : 'Recovery status check failed',
};
}
}

/**
* Get server uptime information
*/