From 502796371ffd2bdeb6c232ac971a471d45a4e86e Mon Sep 17 00:00:00 2001 From: Arunavo Ray Date: Sun, 7 Sep 2025 13:55:20 +0530 Subject: [PATCH] Attempt to address #84 --- README.md | 14 ++- docs/ENVIRONMENT_VARIABLES.md | 17 +++- src/lib/gitea.ts | 119 +++++++++++++++++++++++--- src/lib/repository-cleanup-service.ts | 48 ++++++++--- 4 files changed, 174 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index c57e432..93ea66b 100644 --- a/README.md +++ b/README.md @@ -235,11 +235,21 @@ AUTO_IMPORT_REPOS=true # Auto-cleanup orphaned repositories CLEANUP_DELETE_IF_NOT_IN_GITHUB=true -CLEANUP_ORPHANED_REPO_ACTION=archive # or 'delete' +CLEANUP_ORPHANED_REPO_ACTION=archive # 'archive' (recommended) or 'delete' CLEANUP_DRY_RUN=false # Set to true to test without changes ``` -**Important**: The scheduler checks every minute for tasks to run. The `GITEA_MIRROR_INTERVAL` determines how often each repository is actually synced. For example, with `8h`, each repo syncs every 8 hours from its last successful sync. +**Important Notes**: +- The scheduler checks every minute for tasks to run. The `GITEA_MIRROR_INTERVAL` determines how often each repository is actually synced. For example, with `8h`, each repo syncs every 8 hours from its last successful sync. + +**🛡️ Backup Protection Features**: +- **No Accidental Deletions**: Repository cleanup is automatically skipped if GitHub is inaccessible (account deleted, banned, or API errors) +- **Archive Never Deletes Data**: The `archive` action preserves all repository data: + - Regular repositories: Made read-only using Gitea's archive feature + - Mirror repositories: Renamed with `[ARCHIVED]` prefix (Gitea API limitation prevents archiving mirrors) + - Failed operations: Repository remains fully accessible even if marking as archived fails +- **The Whole Point of Backups**: Your Gitea mirrors are preserved even when GitHub sources disappear - that's why you have backups! 
+- **Strongly Recommended**: Always use `CLEANUP_ORPHANED_REPO_ACTION=archive` (default) instead of `delete` ## Troubleshooting diff --git a/docs/ENVIRONMENT_VARIABLES.md b/docs/ENVIRONMENT_VARIABLES.md index 182342f..d841bb9 100644 --- a/docs/ENVIRONMENT_VARIABLES.md +++ b/docs/ENVIRONMENT_VARIABLES.md @@ -206,10 +206,25 @@ Configure automatic cleanup of old events and data. |----------|-------------|---------|---------| | `CLEANUP_DELETE_FROM_GITEA` | Delete repositories from Gitea | `false` | `true`, `false` | | `CLEANUP_DELETE_IF_NOT_IN_GITHUB` | Delete repos not found in GitHub (automatically enables cleanup) | `true` | `true`, `false` | -| `CLEANUP_ORPHANED_REPO_ACTION` | Action for orphaned repositories | `archive` | `skip`, `archive`, `delete` | +| `CLEANUP_ORPHANED_REPO_ACTION` | Action for orphaned repositories. **Note**: `archive` is recommended to preserve backups | `archive` | `skip`, `archive`, `delete` | | `CLEANUP_DRY_RUN` | Test mode without actual deletion | `true` | `true`, `false` | | `CLEANUP_PROTECTED_REPOS` | Comma-separated list of protected repository names | - | Comma-separated strings | +**🛡️ Safety Features (Backup Protection)**: +- **GitHub Failures Don't Delete Backups**: Cleanup is automatically skipped if the GitHub API returns errors (404, 403, connection issues) +- **Archive Never Deletes**: The `archive` action ALWAYS preserves repository data; it never deletes anything +- **Graceful Degradation**: If marking as archived fails, the repository remains fully accessible in Gitea +- **The Purpose of Backups**: Your mirrors are preserved even when GitHub sources disappear - that's the whole point! 
+ +**Archive Behavior (Aligned with Gitea API)**: +- **Regular repositories**: Uses Gitea's native archive feature (PATCH `/repos/{owner}/{repo}` with `archived: true`) + - Makes repository read-only while preserving all data +- **Mirror repositories**: Uses rename strategy (Gitea API returns 422 for archiving mirrors) + - Renamed with `[ARCHIVED]` prefix for clear identification + - Description updated with preservation notice and timestamp + - Mirror interval set to 8760h (1 year) to minimize sync attempts + - Repository remains fully accessible and cloneable + ### Execution Settings | Variable | Description | Default | Options | diff --git a/src/lib/gitea.ts b/src/lib/gitea.ts index f7a1a92..211ad42 100644 --- a/src/lib/gitea.ts +++ b/src/lib/gitea.ts @@ -7,7 +7,7 @@ import { membershipRoleEnum } from "@/types/organizations"; import { Octokit } from "@octokit/rest"; import type { Config } from "@/types/config"; import type { Organization, Repository } from "./db/schema"; -import { httpPost, httpGet, httpDelete, httpPut } from "./http-client"; +import { httpPost, httpGet, httpDelete, httpPut, httpPatch } from "./http-client"; import { createMirrorJob } from "./helpers"; import { db, organizations, repositories } from "./db"; import { eq, and } from "drizzle-orm"; @@ -2016,6 +2016,12 @@ export async function deleteGiteaRepo( /** * Archive a repository in Gitea + * + * IMPORTANT: This function NEVER deletes data. It only marks repositories as archived. + * - For regular repos: Uses Gitea's archive feature (makes read-only) + * - For mirror repos: Renames with [ARCHIVED] prefix (Gitea doesn't allow archiving mirrors) + * + * This ensures backups are preserved even when the GitHub source disappears. 
*/ export async function archiveGiteaRepo( client: { url: string; token: string }, @@ -2023,24 +2029,115 @@ export async function archiveGiteaRepo( repo: string ): Promise { try { - const response = await httpPut( + // First, check if this is a mirror repository + const repoResponse = await httpGet( `${client.url}/api/v1/repos/${owner}/${repo}`, - { - archived: true, - }, { Authorization: `token ${client.token}`, - 'Content-Type': 'application/json', } ); - if (response.status >= 400) { - throw new Error(`Failed to archive repository ${owner}/${repo}: ${response.status} ${response.statusText}`); + if (!repoResponse.data) { + console.warn(`[Archive] Repository ${owner}/${repo} not found in Gitea. Skipping.`); + return; } - console.log(`Successfully archived repository ${owner}/${repo} in Gitea`); + if (repoResponse.data?.mirror) { + console.log(`[Archive] Repository ${owner}/${repo} is a mirror. Using safe rename strategy.`); + + // IMPORTANT: Gitea API doesn't allow archiving mirror repositories + // According to Gitea source code, attempting to archive a mirror returns: + // "repo is a mirror, cannot archive/un-archive" (422 Unprocessable Entity) + // + // Our solution: Rename the repo to clearly mark it as orphaned + // This preserves all data while indicating the repo is no longer actively synced + + const currentName = repoResponse.data.name; + + // Skip if already marked as archived + if (currentName.startsWith('[ARCHIVED]')) { + console.log(`[Archive] Repository ${owner}/${repo} already marked as archived. Skipping.`); + return; + } + + const archivedName = `[ARCHIVED] ${currentName}`; + const currentDesc = repoResponse.data.description || ''; + const archiveNotice = `\n\n⚠️ ARCHIVED: Original GitHub repository no longer exists. Preserved as backup on ${new Date().toISOString()}`; + + // Only add notice if not already present + const newDescription = currentDesc.includes('⚠️ ARCHIVED:') + ? 
currentDesc + : currentDesc + archiveNotice; + + const renameResponse = await httpPatch( + `${client.url}/api/v1/repos/${owner}/${repo}`, + { + name: archivedName, + description: newDescription, + }, + { + Authorization: `token ${client.token}`, + 'Content-Type': 'application/json', + } + ); + + if (renameResponse.status >= 400) { + // If rename fails, log but don't throw - data is still preserved + console.error(`[Archive] Failed to rename mirror repository ${owner}/${repo}: ${renameResponse.status}`); + console.log(`[Archive] Repository ${owner}/${repo} remains accessible but not marked as archived`); + return; + } + + console.log(`[Archive] Successfully marked mirror repository ${owner}/${repo} as archived (renamed to ${archivedName})`); + + // Also try to reduce sync frequency to prevent unnecessary API calls + // This is optional - if it fails, the repo is still preserved + try { + await httpPatch( + `${client.url}/api/v1/repos/${owner}/${archivedName}`, + { + mirror_interval: "8760h", // 1 year - minimizes sync attempts + }, + { + Authorization: `token ${client.token}`, + 'Content-Type': 'application/json', + } + ); + console.log(`[Archive] Reduced sync frequency for ${owner}/${archivedName} to yearly`); + } catch (intervalError) { + // Non-critical - repo is still preserved even if we can't change interval + console.debug(`[Archive] Could not update mirror interval (non-critical):`, intervalError); + } + } else { + // For non-mirror repositories, use Gitea's native archive feature + // This makes the repository read-only but preserves all data + console.log(`[Archive] Archiving regular repository ${owner}/${repo}`); + + const response = await httpPatch( + `${client.url}/api/v1/repos/${owner}/${repo}`, + { + archived: true, + }, + { + Authorization: `token ${client.token}`, + 'Content-Type': 'application/json', + } + ); + + if (response.status >= 400) { + // If archive fails, log but data is still preserved in Gitea + console.error(`[Archive] Failed to 
archive repository ${owner}/${repo}: ${response.status}`); + console.log(`[Archive] Repository ${owner}/${repo} remains accessible but not marked as archived`); + return; + } + + console.log(`[Archive] Successfully archived repository ${owner}/${repo} (now read-only)`); + } } catch (error) { - console.error(`Error archiving repository ${owner}/${repo}:`, error); - throw error; + // Even on error, the repository data is preserved in Gitea + // We just couldn't mark it as archived + console.error(`[Archive] Could not mark repository ${owner}/${repo} as archived:`, error); + console.log(`[Archive] Repository ${owner}/${repo} data is preserved but not marked as archived`); + // Don't throw - we want cleanup to continue for other repos } } diff --git a/src/lib/repository-cleanup-service.ts b/src/lib/repository-cleanup-service.ts index e68ae14..93b6698 100644 --- a/src/lib/repository-cleanup-service.ts +++ b/src/lib/repository-cleanup-service.ts @@ -27,15 +27,37 @@ async function identifyOrphanedRepositories(config: any): Promise { const decryptedToken = getDecryptedGitHubToken(config); const octokit = createGitHubClient(decryptedToken); - // Fetch GitHub data - const [basicAndForkedRepos, starredRepos] = await Promise.all([ - getGithubRepositories({ octokit, config }), - config.githubConfig?.includeStarred - ? getGithubStarredRepositories({ octokit, config }) - : Promise.resolve([]), - ]); + let allGithubRepos = []; + let githubApiAccessible = true; + + try { + // Fetch GitHub data + const [basicAndForkedRepos, starredRepos] = await Promise.all([ + getGithubRepositories({ octokit, config }), + config.githubConfig?.includeStarred + ? 
getGithubStarredRepositories({ octokit, config }) + : Promise.resolve([]), + ]); + + allGithubRepos = [...basicAndForkedRepos, ...starredRepos]; + } catch (githubError: any) { + // Handle GitHub API errors gracefully + console.warn(`[Repository Cleanup] GitHub API error for user ${userId}: ${githubError.message}`); + + // Check if it's a critical error (like account deleted/banned) + if (githubError.status === 404 || githubError.status === 403) { + console.error(`[Repository Cleanup] CRITICAL: GitHub account may be deleted/banned. Skipping cleanup to prevent data loss.`); + console.error(`[Repository Cleanup] Consider using CLEANUP_ORPHANED_REPO_ACTION=archive instead of delete for safety.`); + + // Return empty array to skip cleanup entirely when GitHub account is inaccessible + return []; + } + + // For other errors, also skip cleanup to be safe + console.error(`[Repository Cleanup] Skipping cleanup due to GitHub API error. This prevents accidental deletion of backups.`); + return []; + } - const allGithubRepos = [...basicAndForkedRepos, ...starredRepos]; const githubRepoFullNames = new Set(allGithubRepos.map(repo => repo.fullName)); // Get all repositories from our database @@ -44,13 +66,19 @@ async function identifyOrphanedRepositories(config: any): Promise { .from(repositories) .where(eq(repositories.userId, userId)); - // Identify orphaned repositories + // Only identify repositories as orphaned if we successfully accessed GitHub + // This prevents false positives when GitHub is down or account is inaccessible const orphanedRepos = dbRepos.filter(repo => !githubRepoFullNames.has(repo.fullName)); + if (orphanedRepos.length > 0) { + console.log(`[Repository Cleanup] Found ${orphanedRepos.length} orphaned repositories for user ${userId}`); + } + return orphanedRepos; } catch (error) { console.error(`[Repository Cleanup] Error identifying orphaned repositories for user ${userId}:`, error); - throw error; + // Return empty array on error to prevent accidental 
deletions + return []; } }