From 01ad57973cfeeeb7f581713d07823091d1c08acf Mon Sep 17 00:00:00 2001 From: francy51 Date: Sun, 1 Mar 2026 19:05:53 -0500 Subject: [PATCH] Add one-time filing metrics backfill script --- package.json | 1 + scripts/backfill-filing-metrics.ts | 339 +++++++++++++++++++++++++++++ 2 files changed, 340 insertions(+) create mode 100644 scripts/backfill-filing-metrics.ts diff --git a/package.json b/package.json index de4eede..b70e917 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ "build": "bun --bun next build --turbopack", "start": "bun --bun next start", "lint": "bun --bun tsc --noEmit", + "backfill:filing-metrics": "bun run scripts/backfill-filing-metrics.ts", "db:generate": "bun x drizzle-kit generate", "db:migrate": "bun x drizzle-kit migrate" }, diff --git a/scripts/backfill-filing-metrics.ts b/scripts/backfill-filing-metrics.ts new file mode 100644 index 0000000..cddd7eb --- /dev/null +++ b/scripts/backfill-filing-metrics.ts @@ -0,0 +1,339 @@ +import { asc, and, eq, gt, inArray } from 'drizzle-orm'; +import { db } from '@/lib/server/db'; +import { filing } from '@/lib/server/db/schema'; +import { fetchFilingMetricsForFilings } from '@/lib/server/sec'; + +const FINANCIAL_FORMS = ['10-K', '10-Q'] as const; +const PAGE_SIZE = 500; +const REQUEST_DELAY_MS = 220; + +type FinancialForm = (typeof FINANCIAL_FORMS)[number]; + +type FilingMetrics = { + revenue: number | null; + netIncome: number | null; + totalAssets: number | null; + cash: number | null; + debt: number | null; +}; + +type FilingRow = { + id: number; + ticker: string; + cik: string; + accessionNumber: string; + filingDate: string; + filingType: FinancialForm; + metrics: FilingMetrics | null; +}; + +type ScriptOptions = { + apply: boolean; + ticker: string | null; + limitCiks: number | null; + allowNullOverwrite: boolean; +}; + +type ScriptSummary = { + scannedRows: number; + scannedCiks: number; + changedRows: number; + unchangedRows: number; + skippedRows: number; + skippedCiks: number; +}; + +function parseOptions(argv: string[]): ScriptOptions { + const options: ScriptOptions = { + apply: false, + ticker: null, + limitCiks: null, + allowNullOverwrite: false + }; + + for (const arg of argv) { + if (arg === '--apply') { + options.apply = true; + continue; + } + + if (arg === '--allow-null-overwrite') { + options.allowNullOverwrite = true; + continue; + } + + if (arg.startsWith('--ticker=')) { + const raw = arg.slice('--ticker='.length).trim(); + options.ticker = raw ? raw.toUpperCase() : null; + continue; + } + + if (arg.startsWith('--limit-ciks=')) { + const raw = Number(arg.slice('--limit-ciks='.length)); + if (Number.isFinite(raw) && raw > 0) { + options.limitCiks = Math.trunc(raw); + } + continue; + } + + if (arg === '--help' || arg === '-h') { + printUsage(); + process.exit(0); + } + } + + return options; +} + +function printUsage() { + console.log( + [ + 'One-time backfill of filing metrics using per-filing SEC mapping.', + '', + 'Usage:', + ' bun run scripts/backfill-filing-metrics.ts [--apply] [--ticker=SYMBOL] [--limit-ciks=N] [--allow-null-overwrite]', + '', + 'Flags:', + ' --apply Persist updates. Without this flag, script runs as a dry run.', + ' --ticker=SYMBOL Restrict processing to one ticker (e.g. AAPL).', + ' --limit-ciks=N Process only first N CIK groups (useful for staged rollouts).', + ' --allow-null-overwrite Allow replacing non-null metrics with all-null API results.', + ' --help, -h Show help.' + ].join('\n') + ); +} + +function toMetrics(value: FilingMetrics | null | undefined): FilingMetrics | null { + if (!value) { + return null; + } + + return { + revenue: value.revenue ?? null, + netIncome: value.netIncome ?? null, + totalAssets: value.totalAssets ?? null, + cash: value.cash ?? null, + debt: value.debt ?? null + }; +} + +function hasAnyMetricValue(metrics: FilingMetrics | null | undefined) { + const normalized = toMetrics(metrics); + if (!normalized) { + return false; + } + + return Object.values(normalized).some((value) => value !== null && Number.isFinite(value)); +} + +function isAllNull(metrics: FilingMetrics | null | undefined) { + const normalized = toMetrics(metrics); + if (!normalized) { + return true; + } + + return Object.values(normalized).every((value) => value === null); +} + +function metricsEqual(left: FilingMetrics | null | undefined, right: FilingMetrics | null | undefined) { + const a = toMetrics(left); + const b = toMetrics(right); + + if (!a && !b) { + return true; + } + + if (!a || !b) { + return false; + } + + return a.revenue === b.revenue + && a.netIncome === b.netIncome + && a.totalAssets === b.totalAssets + && a.cash === b.cash + && a.debt === b.debt; +} + +async function loadFinancialRows(tickerFilter: string | null): Promise { + const rows: FilingRow[] = []; + let cursor = 0; + + for (;;) { + const conditions = [ + inArray(filing.filing_type, [...FINANCIAL_FORMS]), + gt(filing.id, cursor) + ]; + + if (tickerFilter) { + conditions.push(eq(filing.ticker, tickerFilter)); + } + + const page = await db + .select({ + id: filing.id, + ticker: filing.ticker, + cik: filing.cik, + accessionNumber: filing.accession_number, + filingDate: filing.filing_date, + filingType: filing.filing_type, + metrics: filing.metrics + }) + .from(filing) + .where(and(...conditions)) + .orderBy(asc(filing.id)) + .limit(PAGE_SIZE); + + if (page.length === 0) { + break; + } + + for (const row of page) { + if (row.filingType !== '10-K' && row.filingType !== '10-Q') { + continue; + } + + rows.push({ + ...row, + filingType: row.filingType, + metrics: toMetrics(row.metrics) + }); + } + + cursor = page[page.length - 1]?.id ?? cursor; + } + + return rows; +} + +function groupByCik(rows: FilingRow[]) { + const grouped = new Map(); + + for (const row of rows) { + const group = grouped.get(row.cik); + if (group) { + group.push(row); + continue; + } + + grouped.set(row.cik, [row]); + } + + return grouped; +} + +async function runBackfill(options: ScriptOptions): Promise { + const startedAt = new Date(); + const rows = await loadFinancialRows(options.ticker); + const grouped = groupByCik(rows); + const entries = [...grouped.entries()]; + const limitedEntries = options.limitCiks + ? entries.slice(0, options.limitCiks) + : entries; + + const summary: ScriptSummary = { + scannedRows: 0, + scannedCiks: 0, + changedRows: 0, + unchangedRows: 0, + skippedRows: 0, + skippedCiks: 0 + }; + + console.log(`[backfill-filing-metrics] mode=${options.apply ? 'apply' : 'dry-run'} rows=${rows.length} ciks=${limitedEntries.length}`); + if (options.ticker) { + console.log(`[backfill-filing-metrics] ticker filter=${options.ticker}`); + } + + for (const [index, [cik, filingsForCik]] of limitedEntries.entries()) { + summary.scannedCiks += 1; + summary.scannedRows += filingsForCik.length; + + const ticker = filingsForCik[0]?.ticker ?? 'UNKNOWN'; + console.log(`[backfill-filing-metrics] [${index + 1}/${limitedEntries.length}] CIK=${cik} ticker=${ticker} filings=${filingsForCik.length}`); + + const metricsMap = await fetchFilingMetricsForFilings( + cik, + ticker, + filingsForCik.map((entry) => ({ + accessionNumber: entry.accessionNumber, + filingDate: entry.filingDate, + filingType: entry.filingType + })) + ); + + const fetchedMetrics = filingsForCik.map((entry) => metricsMap.get(entry.accessionNumber) ?? null); + const apiReturnedAllNull = fetchedMetrics.every((entry) => isAllNull(entry)); + const cikHasExistingValues = filingsForCik.some((entry) => hasAnyMetricValue(entry.metrics)); + + if (apiReturnedAllNull && cikHasExistingValues && !options.allowNullOverwrite) { + summary.skippedCiks += 1; + summary.skippedRows += filingsForCik.length; + console.warn(`[backfill-filing-metrics] skipped CIK=${cik}: API returned all-null metrics; pass --allow-null-overwrite to force replacement`); + await Bun.sleep(REQUEST_DELAY_MS); + continue; + } + + const pendingUpdates: Array<{ id: number; metrics: FilingMetrics | null }> = []; + + for (const entry of filingsForCik) { + const nextMetrics = toMetrics(metricsMap.get(entry.accessionNumber) ?? null); + if (metricsEqual(entry.metrics, nextMetrics)) { + summary.unchangedRows += 1; + continue; + } + + summary.changedRows += 1; + pendingUpdates.push({ + id: entry.id, + metrics: nextMetrics + }); + } + + if (!options.apply || pendingUpdates.length === 0) { + console.log(`[backfill-filing-metrics] changes=${pendingUpdates.length} (not persisted)`); + await Bun.sleep(REQUEST_DELAY_MS); + continue; + } + + for (const update of pendingUpdates) { + await db + .update(filing) + .set({ + metrics: update.metrics, + updated_at: new Date().toISOString() + }) + .where(eq(filing.id, update.id)); + } + + console.log(`[backfill-filing-metrics] updated rows=${pendingUpdates.length}`); + await Bun.sleep(REQUEST_DELAY_MS); + } + + const elapsedMs = Date.now() - startedAt.getTime(); + console.log( + [ + '[backfill-filing-metrics] complete', + ` mode=${options.apply ? 'apply' : 'dry-run'}`, + ` scanned_ciks=${summary.scannedCiks}`, + ` scanned_rows=${summary.scannedRows}`, + ` changed_rows=${summary.changedRows}`, + ` unchanged_rows=${summary.unchangedRows}`, + ` skipped_ciks=${summary.skippedCiks}`, + ` skipped_rows=${summary.skippedRows}`, + ` elapsed_ms=${elapsedMs}` + ].join('\n') + ); + + return summary; +} + +async function main() { + const options = parseOptions(Bun.argv.slice(2)); + await runBackfill(options); +} + +main().catch((error) => { + const message = error instanceof Error ? error.stack ?? error.message : String(error); + console.error('[backfill-filing-metrics] failed\n' + message); + process.exitCode = 1; +});