import { readFileSync, readdirSync } from 'node:fs';
import { join } from 'node:path';
import { and, desc, eq, gte, inArray, lte, type SQL } from 'drizzle-orm';
import { db } from '@/lib/server/db';
import { filingTaxonomyConcept, filingTaxonomySnapshot } from '@/lib/server/db/schema';

/** Command-line options accepted by this script. */
type ScriptOptions = {
  ticker: string | null;
  from: string | null;
  to: string | null;
  sampleLimit: number;
  failOnResiduals: boolean;
};

/** Columns selected from `filingTaxonomySnapshot` by `loadRows`. */
type SnapshotRow = {
  id: number;
  filing_id: number;
  ticker: string;
  filing_date: string;
  filing_type: '10-K' | '10-Q';
  parse_status: string;
  parse_error: string | null;
  parser_engine: string;
  parser_version: string;
  fiscal_pack: string | null;
  normalization_summary: {
    issuerOverlayMatchCount?: number;
    residualDisclosureCount?: number;
    residualPrimaryCount?: number;
    unsupportedConceptCount?: number;
    warnings?: string[];
  } | null;
  // NOTE(review): element shape reconstructed from usage in main() (only `.key`
  // is read); confirm against the schema's declared JSON type.
  surface_rows: Record<string, Array<{ key: string }>> | null;
  updated_at: string;
};

/** Columns selected from `filingTaxonomyConcept` by `loadResidualConceptRows`. */
type ResidualConceptRow = {
  snapshot_id: number;
  qname: string;
  statement_kind: string | null;
  role_uri: string | null;
};

/** Shape read from each `*.surface.json` taxonomy file. */
type SurfacePackFile = {
  surfaces: Array<{
    statement: string;
    allowed_source_concepts?: string[];
    allowed_authoritative_concepts?: string[];
  }>;
};

/**
 * Maximum number of snapshot ids bound into a single `IN (...)` list.
 * Kept well below common database bound-parameter limits.
 */
const RESIDUAL_QUERY_CHUNK_SIZE = 500;

/**
 * Parses command-line arguments into {@link ScriptOptions}.
 *
 * `--help` / `-h` prints usage and exits the process with status 0.
 * Unrecognized arguments and invalid `--sample-limit` values are reported on
 * stderr and otherwise ignored, so a typo does not silently widen the report.
 *
 * @param argv Arguments after the script path (e.g. `process.argv.slice(2)`).
 * @returns The parsed options, with defaults for anything not supplied.
 */
function parseOptions(argv: string[]): ScriptOptions {
  const options: ScriptOptions = {
    ticker: null,
    from: null,
    to: null,
    sampleLimit: 5,
    failOnResiduals: false
  };

  for (const arg of argv) {
    if (arg === '--help' || arg === '-h') {
      console.log('Report taxonomy snapshot health from the local database.');
      console.log('');
      console.log('Usage:');
      console.log(' bun run scripts/report-taxonomy-health.ts [--ticker=SYMBOL] [--from=YYYY-MM-DD] [--to=YYYY-MM-DD] [--sample-limit=N] [--fail-on-residuals]');
      process.exit(0);
    }

    if (arg === '--fail-on-residuals') {
      options.failOnResiduals = true;
      continue;
    }

    if (arg.startsWith('--ticker=')) {
      const value = arg.slice('--ticker='.length).trim().toUpperCase();
      options.ticker = value.length > 0 ? value : null;
      continue;
    }

    if (arg.startsWith('--from=')) {
      const value = arg.slice('--from='.length).trim();
      options.from = value.length > 0 ? value : null;
      continue;
    }

    if (arg.startsWith('--to=')) {
      const value = arg.slice('--to='.length).trim();
      options.to = value.length > 0 ? value : null;
      continue;
    }

    if (arg.startsWith('--sample-limit=')) {
      const raw = arg.slice('--sample-limit='.length);
      const parsed = Number.parseInt(raw, 10);
      if (Number.isFinite(parsed) && parsed > 0) {
        options.sampleLimit = parsed;
      } else {
        console.warn(
          `[report-taxonomy-health] ignoring invalid --sample-limit value "${raw}"; using ${options.sampleLimit}`
        );
      }
      continue;
    }

    console.warn(`[report-taxonomy-health] ignoring unrecognized argument: ${arg}`);
  }

  return options;
}

/**
 * Adds `amount` (default 1) to the counter stored under `key`, creating the
 * entry at 0 first when it does not exist yet.
 */
function incrementCount(map: Map<string, number>, key: string, amount = 1) {
  map.set(key, (map.get(key) ?? 0) + amount);
}

/**
 * Prints a labelled `key=count` listing, ordered by descending count and then
 * by key; prints `(none)` when the map is empty.
 */
function printCountMap(label: string, counts: Map<string, number>) {
  console.log(`[report-taxonomy-health] ${label}`);
  if (counts.size === 0) {
    console.log(' (none)');
    return;
  }

  const ordered = [...counts.entries()].sort(
    (left, right) => right[1] - left[1] || left[0].localeCompare(right[0])
  );
  for (const [key, count] of ordered) {
    console.log(` ${key}=${count}`);
  }
}

/**
 * Prints one summary line per snapshot row under `label`; prints `(none)`
 * when there are no rows.
 */
function printSamples(label: string, rows: SnapshotRow[]) {
  console.log(`[report-taxonomy-health] ${label}`);
  if (rows.length === 0) {
    console.log(' (none)');
    return;
  }

  for (const row of rows) {
    const warnings = row.normalization_summary?.warnings ?? [];
    console.log(
      ` ${row.ticker} ${row.filing_type} ${row.filing_date} filing_id=${row.filing_id} status=${row.parse_status} parser=${row.parser_engine} pack=${row.fiscal_pack ?? 'null'} warnings=${warnings.join(',') || '-'} error=${row.parse_error ?? '-'}`
    );
  }
}

/**
 * Loads taxonomy snapshots, newest `updated_at` first, optionally filtered by
 * ticker and by an inclusive `filing_date` range.
 */
async function loadRows(options: ScriptOptions): Promise<SnapshotRow[]> {
  const conditions: SQL[] = [];

  if (options.ticker) {
    conditions.push(eq(filingTaxonomySnapshot.ticker, options.ticker));
  }
  if (options.from) {
    conditions.push(gte(filingTaxonomySnapshot.filing_date, options.from));
  }
  if (options.to) {
    conditions.push(lte(filingTaxonomySnapshot.filing_date, options.to));
  }

  const whereClause = conditions.length > 0 ? and(...conditions) : undefined;

  const baseQuery = db
    .select({
      id: filingTaxonomySnapshot.id,
      filing_id: filingTaxonomySnapshot.filing_id,
      ticker: filingTaxonomySnapshot.ticker,
      filing_date: filingTaxonomySnapshot.filing_date,
      filing_type: filingTaxonomySnapshot.filing_type,
      parse_status: filingTaxonomySnapshot.parse_status,
      parse_error: filingTaxonomySnapshot.parse_error,
      parser_engine: filingTaxonomySnapshot.parser_engine,
      parser_version: filingTaxonomySnapshot.parser_version,
      fiscal_pack: filingTaxonomySnapshot.fiscal_pack,
      normalization_summary: filingTaxonomySnapshot.normalization_summary,
      surface_rows: filingTaxonomySnapshot.surface_rows,
      updated_at: filingTaxonomySnapshot.updated_at
    })
    .from(filingTaxonomySnapshot)
    .orderBy(desc(filingTaxonomySnapshot.updated_at));

  if (whereClause) {
    return await baseQuery.where(whereClause);
  }
  return await baseQuery;
}

/**
 * Loads concept rows whose `residual_flag` is true for the given snapshots.
 *
 * Ids are queried in batches of {@link RESIDUAL_QUERY_CHUNK_SIZE} so that a
 * large, unfiltered run does not bind every id into one `IN (...)` list.
 * Returns an empty array when `snapshotIds` is empty.
 */
async function loadResidualConceptRows(snapshotIds: number[]): Promise<ResidualConceptRow[]> {
  const collected: ResidualConceptRow[] = [];

  for (let offset = 0; offset < snapshotIds.length; offset += RESIDUAL_QUERY_CHUNK_SIZE) {
    const batch = snapshotIds.slice(offset, offset + RESIDUAL_QUERY_CHUNK_SIZE);
    const batchRows = await db
      .select({
        snapshot_id: filingTaxonomyConcept.snapshot_id,
        qname: filingTaxonomyConcept.qname,
        statement_kind: filingTaxonomyConcept.statement_kind,
        role_uri: filingTaxonomyConcept.role_uri
      })
      .from(filingTaxonomyConcept)
      .where(and(
        inArray(filingTaxonomyConcept.snapshot_id, batch),
        eq(filingTaxonomyConcept.residual_flag, true)
      ));

    // Push one by one: spreading a very large array into push() can exceed
    // the engine's argument-count limit.
    for (const row of batchRows) {
      collected.push(row);
    }
  }

  return collected;
}

/** Canonical lookup form of a concept name: trimmed and lower-cased. */
function normalizeConcept(value: string) {
  return value.trim().toLowerCase();
}

/**
 * Builds an index from normalized concept name to the set of statements whose
 * surfaces list that concept (as a source or authoritative concept).
 *
 * Reads every `*.surface.json` file in `rust/taxonomy/fiscal/v1` under the
 * current working directory, except `universal_income.surface.json`.
 */
function loadTaxonomyStatementIndex() {
  const index = new Map<string, Set<string>>();
  const taxonomyDir = join(process.cwd(), 'rust', 'taxonomy', 'fiscal', 'v1');
  const fileNames = readdirSync(taxonomyDir)
    .filter((fileName) => fileName.endsWith('.surface.json') && fileName !== 'universal_income.surface.json')
    .sort((left, right) => left.localeCompare(right));

  for (const fileName of fileNames) {
    // The file contents are trusted to match SurfacePackFile; no runtime validation is done.
    const file = JSON.parse(readFileSync(join(taxonomyDir, fileName), 'utf8')) as SurfacePackFile;

    for (const surface of file.surfaces ?? []) {
      const concepts = [
        ...(surface.allowed_source_concepts ?? []),
        ...(surface.allowed_authoritative_concepts ?? [])
      ];

      for (const concept of concepts) {
        const normalized = normalizeConcept(concept);
        const statements = index.get(normalized) ?? new Set<string>();
        statements.add(surface.statement);
        index.set(normalized, statements);
      }
    }
  }

  return index;
}

/**
 * Returns the sorted list of statements the taxonomy index associates with
 * `qname` (matched after normalization); empty when the concept is unknown.
 */
function statementsForConcept(index: Map<string, Set<string>>, qname: string) {
  return [...(index.get(normalizeConcept(qname)) ?? new Set<string>())]
    .sort((left, right) => left.localeCompare(right));
}

/**
 * Script entry point: loads snapshots and their residual concepts, aggregates
 * counts, and prints the report to stdout.
 *
 * @throws Error when `--fail-on-residuals` is set and any residual concept
 *   row was found.
 */
async function main() {
  const options = parseOptions(process.argv.slice(2));
  const rows = await loadRows(options);
  const residualRows = await loadResidualConceptRows(rows.map((row) => row.id));
  const taxonomyStatementIndex = loadTaxonomyStatementIndex();

  const statusCounts = new Map<string, number>();
  const parserCounts = new Map<string, number>();
  const packCounts = new Map<string, number>();
  const warningCounts = new Map<string, number>();
  const parserVersionCounts = new Map<string, number>();
  const residualCounts = new Map<string, number>();
  const residualNoRoleCounts = new Map<string, number>();
  const residualDisclosureOnlyCounts = new Map<string, number>();
  const residualEquityMatchCounts = new Map<string, number>();
  const residualDifferentStatementCounts = new Map<string, number>();
  const residualAbsentCounts = new Map<string, number>();
  const disclosureSurfaceCounts = new Map<string, number>();
  const issuerOverlayMatchCounts = new Map<string, number>();
  const unsupportedSurfacedConcepts = new Map<string, number>();

  // Per-snapshot aggregates.
  for (const row of rows) {
    incrementCount(statusCounts, row.parse_status);
    incrementCount(parserCounts, row.parser_engine);
    incrementCount(parserVersionCounts, row.parser_version);
    incrementCount(packCounts, row.fiscal_pack ?? 'null');

    for (const warning of row.normalization_summary?.warnings ?? []) {
      incrementCount(warningCounts, warning);
    }

    for (const disclosureRow of row.surface_rows?.disclosure ?? []) {
      incrementCount(disclosureSurfaceCounts, disclosureRow.key);
    }

    // Per-ticker sums; tickers with a zero total are left out of the report.
    const issuerOverlayMatches = row.normalization_summary?.issuerOverlayMatchCount ?? 0;
    if (issuerOverlayMatches > 0) {
      incrementCount(issuerOverlayMatchCounts, row.ticker, issuerOverlayMatches);
    }

    const unsupportedConcepts = row.normalization_summary?.unsupportedConceptCount ?? 0;
    if (unsupportedConcepts > 0) {
      incrementCount(unsupportedSurfacedConcepts, row.ticker, unsupportedConcepts);
    }
  }

  // Per-residual-concept aggregates, classified against the taxonomy index.
  for (const row of residualRows) {
    incrementCount(residualCounts, row.qname);

    if (!row.role_uri || row.role_uri.trim().length === 0) {
      incrementCount(residualNoRoleCounts, row.qname);
    }

    const matchedStatements = statementsForConcept(taxonomyStatementIndex, row.qname);
    const primaryStatements = matchedStatements.filter((statement) => statement !== 'disclosure');

    // Known to the taxonomy, but only under the 'disclosure' statement.
    if (matchedStatements.length > 0 && primaryStatements.length === 0) {
      incrementCount(residualDisclosureOnlyCounts, row.qname);
    }

    if (matchedStatements.includes('equity')) {
      incrementCount(residualEquityMatchCounts, row.qname);
    }

    // The taxonomy places the concept under non-disclosure statements, none
    // of which is the statement kind recorded on the residual row.
    if (
      row.statement_kind
      && primaryStatements.length > 0
      && !primaryStatements.includes(row.statement_kind)
    ) {
      incrementCount(
        residualDifferentStatementCounts,
        `${row.qname}::${row.statement_kind}->${primaryStatements.join('|')}`
      );
    }

    if (matchedStatements.length === 0) {
      incrementCount(residualAbsentCounts, row.qname);
    }
  }

  const failedRows = rows
    .filter((row) => row.parse_status === 'failed')
    .slice(0, options.sampleLimit);
  const warningRows = rows
    .filter((row) => (row.normalization_summary?.warnings?.length ?? 0) > 0)
    .slice(0, options.sampleLimit);
  const legacyCount = rows.filter((row) => row.parser_engine === 'legacy-ts').length;
  const deferredCount = rows.filter(
    (row) => (row.normalization_summary?.warnings ?? []).includes('surface_rows_deferred_to_typescript')
  ).length;
  const fallbackCount = rows.filter(
    (row) => (row.normalization_summary?.warnings ?? []).includes('ts_compact_surface_fallback_used')
  ).length;

  console.log(`[report-taxonomy-health] snapshots=${rows.length}`);
  if (options.ticker) {
    console.log(`[report-taxonomy-health] ticker=${options.ticker}`);
  }
  if (options.from || options.to) {
    console.log(`[report-taxonomy-health] range=${options.from ?? 'min'}..${options.to ?? 'max'}`);
  }
  console.log(`[report-taxonomy-health] legacy_ts=${legacyCount}`);
  console.log(`[report-taxonomy-health] deferred_to_typescript=${deferredCount}`);
  console.log(`[report-taxonomy-health] ts_compact_surface_fallback=${fallbackCount}`);
  console.log(`[report-taxonomy-health] residual_rows=${residualRows.length}`);

  printCountMap('parse_status', statusCounts);
  printCountMap('parser_engine', parserCounts);
  printCountMap('parser_version', parserVersionCounts);
  printCountMap('fiscal_pack', packCounts);
  printCountMap('warnings', warningCounts);
  printCountMap('residual_top_concepts', residualCounts);
  printCountMap('residual_missing_role_uri', residualNoRoleCounts);
  printCountMap('residual_disclosure_only', residualDisclosureOnlyCounts);
  printCountMap('residual_matching_equity_taxonomy', residualEquityMatchCounts);
  printCountMap('residual_different_primary_statement', residualDifferentStatementCounts);
  printCountMap('residual_absent_from_taxonomy', residualAbsentCounts);
  printCountMap('disclosure_surface_counts', disclosureSurfaceCounts);
  printCountMap('issuer_overlay_match_counts', issuerOverlayMatchCounts);
  printCountMap('unsupported_surfaced_concepts', unsupportedSurfacedConcepts);
  printSamples('failed_samples', failedRows);
  printSamples('warning_samples', warningRows);

  if (options.failOnResiduals && residualRows.length > 0) {
    throw new Error(`strict mode failed: surfaced residual_rows=${residualRows.length}`);
  }
}

// Report any failure on stderr and signal it through the exit code.
void main().catch((error) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error(`[report-taxonomy-health] fatal: ${message}`);
  process.exitCode = 1;
});