Files
Neon-Desk/scripts/report-taxonomy-health.ts

362 lines
12 KiB
TypeScript

import { readFileSync, readdirSync } from 'node:fs';
import { join } from 'node:path';
import { and, desc, eq, gte, inArray, lte } from 'drizzle-orm';
import { db } from '@/lib/server/db';
import { filingTaxonomyConcept, filingTaxonomySnapshot } from '@/lib/server/db/schema';
/**
 * Options controlling one run of the report, as produced by `parseOptions`.
 */
type ScriptOptions = {
// Ticker filter from `--ticker=` (upper-cased by the parser); null means no ticker filter.
ticker: string | null;
// Inclusive lower bound compared against `filing_date` (from `--from=`); null means unbounded.
from: string | null;
// Inclusive upper bound compared against `filing_date` (from `--to=`); null means unbounded.
to: string | null;
// Maximum number of rows printed in each sample section; the parser defaults it to 5.
sampleLimit: number;
// When true, `main` throws after printing the report if any residual concept rows were loaded.
failOnResiduals: boolean;
};
/**
 * The `filingTaxonomySnapshot` columns selected by `loadRows`, one object per snapshot.
 */
type SnapshotRow = {
// Snapshot id; `main` passes these ids to `loadResidualConceptRows`.
id: number;
filing_id: number;
ticker: string;
// Compared against the `--from=` / `--to=` option strings in `loadRows`.
filing_date: string;
filing_type: '10-K' | '10-Q';
// `main` treats the value 'failed' as a failed parse when picking failure samples.
parse_status: string;
parse_error: string | null;
// `main` counts rows whose engine is 'legacy-ts' separately.
parser_engine: string;
parser_version: string;
fiscal_pack: string | null;
// Optional counters and warning codes. This script reads `issuerOverlayMatchCount`,
// `unsupportedConceptCount` and `warnings`; the other fields are not used here.
normalization_summary: {
issuerOverlayMatchCount?: number;
residualDisclosureCount?: number;
residualPrimaryCount?: number;
unsupportedConceptCount?: number;
warnings?: string[];
} | null;
// Arrays of surface rows grouped under string keys (presumably statement kinds; verify against
// the writer). This script only reads the `disclosure` group and each row's `key`.
surface_rows: Record<string, Array<{ key: string }>> | null;
// `loadRows` orders its result by this column, descending.
updated_at: string;
};
/**
 * The `filingTaxonomyConcept` columns selected by `loadResidualConceptRows`, which only returns
 * rows whose `residual_flag` is true.
 */
type ResidualConceptRow = {
// Id of the snapshot this concept row belongs to.
snapshot_id: number;
// Concept name; `main` looks it up (trimmed, lower-cased) in the taxonomy statement index.
qname: string;
// Compared in `main` against the statement names the taxonomy lists for the concept.
statement_kind: string | null;
// A null or blank value is counted by `main` as a missing role URI.
role_uri: string | null;
};
/**
 * Shape this script assumes for each `*.surface.json` file read by `loadTaxonomyStatementIndex`.
 * The parsed JSON is cast to this type, not validated at runtime.
 */
type SurfacePackFile = {
surfaces: Array<{
// Statement name recorded in the index for every concept listed below.
statement: string;
// Both concept lists are optional and are merged when building the index.
allowed_source_concepts?: string[];
allowed_authoritative_concepts?: string[];
}>;
};
/**
 * Parses the script's command-line arguments (without the runtime and script path) into options.
 *
 * Recognized arguments:
 * - `--help` / `-h`: print usage and exit the process with status 0.
 * - `--fail-on-residuals`: enable strict mode.
 * - `--ticker=SYMBOL`: trimmed and upper-cased; an empty value leaves the filter unset.
 * - `--from=...` / `--to=...`: trimmed; an empty value leaves the bound unset.
 * - `--sample-limit=N`: positive integer as understood by `Number.parseInt`; otherwise the
 *   default of 5 is kept.
 *
 * Unrecognized arguments and unusable `--sample-limit` values never abort the run, but they are
 * reported on stderr so that a typo (for example `--fail-on-residual`) does not silently change
 * what the report covers or whether strict mode is active.
 *
 * @param argv Arguments to parse, typically `process.argv.slice(2)`.
 * @returns The options, with defaults applied for everything not supplied.
 */
function parseOptions(argv: string[]): ScriptOptions {
  const options: ScriptOptions = {
    ticker: null,
    from: null,
    to: null,
    sampleLimit: 5,
    failOnResiduals: false
  };

  for (const arg of argv) {
    if (arg === '--help' || arg === '-h') {
      console.log('Report taxonomy snapshot health from the local database.');
      console.log('');
      console.log('Usage:');
      console.log(' bun run scripts/report-taxonomy-health.ts [--ticker=SYMBOL] [--from=YYYY-MM-DD] [--to=YYYY-MM-DD] [--sample-limit=N] [--fail-on-residuals]');
      process.exit(0);
    }

    if (arg === '--fail-on-residuals') {
      options.failOnResiduals = true;
      continue;
    }

    if (arg.startsWith('--ticker=')) {
      const value = arg.slice('--ticker='.length).trim().toUpperCase();
      options.ticker = value.length > 0 ? value : null;
      continue;
    }

    if (arg.startsWith('--from=')) {
      const value = arg.slice('--from='.length).trim();
      options.from = value.length > 0 ? value : null;
      continue;
    }

    if (arg.startsWith('--to=')) {
      const value = arg.slice('--to='.length).trim();
      options.to = value.length > 0 ? value : null;
      continue;
    }

    if (arg.startsWith('--sample-limit=')) {
      const raw = arg.slice('--sample-limit='.length);
      const parsed = Number.parseInt(raw, 10);
      if (Number.isFinite(parsed) && parsed > 0) {
        options.sampleLimit = parsed;
      } else {
        // Keep the current limit, but say so instead of dropping the value silently.
        console.warn(`[report-taxonomy-health] ignoring invalid --sample-limit value: ${JSON.stringify(raw)}`);
      }
      continue;
    }

    // Nothing matched: most likely a typo or a space-separated value such as `--ticker AAPL`.
    console.warn(`[report-taxonomy-health] ignoring unrecognized argument: ${arg}`);
  }

  return options;
}
/**
 * Adds one to the tally stored under `key`, starting from zero when the key is not present yet.
 */
function incrementCount(map: Map<string, number>, key: string) {
  const previous = map.get(key) ?? 0;
  map.set(key, previous + 1);
}
/**
 * Prints one labelled tally section to stdout.
 *
 * The header line is always printed. Entries follow as `key=count`, highest count first, with
 * ties ordered by `localeCompare` on the key. An empty tally prints a `(none)` placeholder.
 */
function printCountMap(label: string, counts: Map<string, number>) {
  console.log(`[report-taxonomy-health] ${label}`);

  const entries = Array.from(counts.entries());
  if (entries.length === 0) {
    console.log(' (none)');
    return;
  }

  entries.sort(([keyA, countA], [keyB, countB]) => {
    const byCount = countB - countA;
    if (byCount) {
      return byCount;
    }
    return keyA.localeCompare(keyB);
  });

  entries.forEach(([key, count]) => {
    console.log(` ${key}=${count}`);
  });
}
/**
 * Prints one labelled section of sample snapshots to stdout, one line per row.
 *
 * Each line lists ticker, filing type, filing date, filing id, parse status, parser engine,
 * fiscal pack, comma-joined warnings and parse error. Missing pack prints as `null`; missing
 * warnings or error print as `-`. An empty list prints a `(none)` placeholder.
 */
function printSamples(label: string, rows: SnapshotRow[]) {
  console.log(`[report-taxonomy-health] ${label}`);

  if (rows.length === 0) {
    console.log(' (none)');
    return;
  }

  rows.forEach((sample) => {
    const warningText = (sample.normalization_summary?.warnings ?? []).join(',') || '-';
    const parts = [
      ` ${sample.ticker} ${sample.filing_type} ${sample.filing_date}`,
      `filing_id=${sample.filing_id}`,
      `status=${sample.parse_status}`,
      `parser=${sample.parser_engine}`,
      `pack=${sample.fiscal_pack ?? 'null'}`,
      `warnings=${warningText}`,
      `error=${sample.parse_error ?? '-'}`
    ];
    console.log(parts.join(' '));
  });
}
/**
 * Loads taxonomy snapshots, ordered by `updated_at` descending.
 *
 * Each option that is set narrows the result: `ticker` by equality, `from` and `to` as inclusive
 * lower and upper bounds on `filing_date`. With no option set, every snapshot is returned.
 *
 * @param options Parsed script options; only `ticker`, `from` and `to` are read.
 * @returns The selected snapshot columns, one object per snapshot.
 */
async function loadRows(options: ScriptOptions): Promise<SnapshotRow[]> {
  const snapshot = filingTaxonomySnapshot;

  // Each option contributes zero or one filter.
  const filters = [
    ...(options.ticker ? [eq(snapshot.ticker, options.ticker)] : []),
    ...(options.from ? [gte(snapshot.filing_date, options.from)] : []),
    ...(options.to ? [lte(snapshot.filing_date, options.to)] : [])
  ];

  const orderedQuery = db
    .select({
      id: snapshot.id,
      filing_id: snapshot.filing_id,
      ticker: snapshot.ticker,
      filing_date: snapshot.filing_date,
      filing_type: snapshot.filing_type,
      parse_status: snapshot.parse_status,
      parse_error: snapshot.parse_error,
      parser_engine: snapshot.parser_engine,
      parser_version: snapshot.parser_version,
      fiscal_pack: snapshot.fiscal_pack,
      normalization_summary: snapshot.normalization_summary,
      surface_rows: snapshot.surface_rows,
      updated_at: snapshot.updated_at
    })
    .from(snapshot)
    .orderBy(desc(snapshot.updated_at));

  if (filters.length === 0) {
    return await orderedQuery;
  }

  const combined = and(...filters);
  if (combined) {
    return await orderedQuery.where(combined);
  }
  return await orderedQuery;
}
/**
 * Loads the concept rows flagged as residual for the given snapshots.
 *
 * The ids are de-duplicated and queried in fixed-size batches rather than one `IN (...)` list:
 * every id becomes a bound parameter, and databases cap the number of parameters per statement,
 * so an unbounded list can fail once the snapshot table grows large enough.
 *
 * @param snapshotIds Snapshot ids to look up; an empty list returns `[]` without querying.
 * @returns Residual concept rows for those snapshots. No ordering is requested from the database.
 */
async function loadResidualConceptRows(snapshotIds: number[]): Promise<ResidualConceptRow[]> {
  // Conservative batch size, kept well below common bound-parameter limits.
  const batchSize = 500;
  const uniqueIds = [...new Set(snapshotIds)];
  const residualRows: ResidualConceptRow[] = [];

  // Batches run one after another on purpose: this is a reporting script, not a hot path.
  for (let start = 0; start < uniqueIds.length; start += batchSize) {
    const batchIds = uniqueIds.slice(start, start + batchSize);
    const batchRows = await db.select({
      snapshot_id: filingTaxonomyConcept.snapshot_id,
      qname: filingTaxonomyConcept.qname,
      statement_kind: filingTaxonomyConcept.statement_kind,
      role_uri: filingTaxonomyConcept.role_uri
    }).from(filingTaxonomyConcept).where(and(
      inArray(filingTaxonomyConcept.snapshot_id, batchIds),
      eq(filingTaxonomyConcept.residual_flag, true)
    ));

    // Push row by row; spreading a very large array into `push` can overflow the call stack.
    for (const row of batchRows) {
      residualRows.push(row);
    }
  }

  return residualRows;
}
/**
 * Produces the lookup form of a concept name: surrounding whitespace removed, then lower-cased.
 */
function normalizeConcept(value: string) {
  const trimmed = value.trim();
  return trimmed.toLowerCase();
}
/**
 * Builds a lookup from normalized concept name to the set of statement names that list it.
 *
 * Reads every `*.surface.json` file, except `universal_income.surface.json`, from
 * `rust/taxonomy/fiscal/v1` under the current working directory. Both the source and the
 * authoritative concept lists of each surface contribute to the index. Files are cast to
 * `SurfacePackFile` without validation; a file without `surfaces` contributes nothing.
 *
 * @returns Map keyed by `normalizeConcept(concept)`.
 */
function loadTaxonomyStatementIndex() {
  const packDirectory = join(process.cwd(), 'rust', 'taxonomy', 'fiscal', 'v1');
  const statementsByConcept = new Map<string, Set<string>>();

  const packFileNames = readdirSync(packDirectory)
    .filter((name) => name !== 'universal_income.surface.json' && name.endsWith('.surface.json'))
    .sort((a, b) => a.localeCompare(b));

  packFileNames.forEach((name) => {
    const rawJson = readFileSync(join(packDirectory, name), 'utf8');
    const pack = JSON.parse(rawJson) as SurfacePackFile;

    (pack.surfaces ?? []).forEach((surface) => {
      const listedConcepts = [
        ...(surface.allowed_source_concepts ?? []),
        ...(surface.allowed_authoritative_concepts ?? [])
      ];

      listedConcepts.forEach((concept) => {
        const conceptKey = normalizeConcept(concept);
        let statements = statementsByConcept.get(conceptKey);
        if (statements === undefined) {
          statements = new Set<string>();
          statementsByConcept.set(conceptKey, statements);
        }
        statements.add(surface.statement);
      });
    });
  });

  return statementsByConcept;
}
/**
 * Returns the statement names the index lists for a concept, sorted with `localeCompare`.
 * The concept name is normalized before the lookup; an unknown concept yields an empty array.
 */
function statementsForConcept(index: Map<string, Set<string>>, qname: string) {
  const listed = index.get(normalizeConcept(qname));
  if (!listed) {
    return [];
  }
  return Array.from(listed).sort((a, b) => a.localeCompare(b));
}
/**
 * Runs the report end to end: parses the command-line options, loads the matching snapshots and
 * their residual concept rows, tallies them, and prints every section to stdout.
 *
 * Output order: summary lines, then the tally sections, then the failed and warning samples.
 *
 * @throws Error after printing, when `--fail-on-residuals` was given and at least one residual
 *   concept row was loaded.
 */
async function main() {
  const options = parseOptions(process.argv.slice(2));
  const rows = await loadRows(options);
  const residualRows = await loadResidualConceptRows(rows.map((row) => row.id));
  const taxonomyStatementIndex = loadTaxonomyStatementIndex();

  // Small local helpers shared by the tallies below.
  const warningsOf = (row: SnapshotRow): string[] => row.normalization_summary?.warnings ?? [];
  const addPositive = (totals: Map<string, number>, key: string, amount: number) => {
    if (amount > 0) {
      totals.set(key, (totals.get(key) ?? 0) + amount);
    }
  };
  const countRowsWithWarning = (code: string) => rows.filter((row) => warningsOf(row).includes(code)).length;

  // Per-snapshot tallies.
  const statusCounts = new Map<string, number>();
  const parserCounts = new Map<string, number>();
  const parserVersionCounts = new Map<string, number>();
  const packCounts = new Map<string, number>();
  const warningCounts = new Map<string, number>();
  const disclosureSurfaceCounts = new Map<string, number>();
  const issuerOverlayMatchCounts = new Map<string, number>();
  const unsupportedSurfacedConcepts = new Map<string, number>();

  for (const snapshot of rows) {
    const summary = snapshot.normalization_summary;

    incrementCount(statusCounts, snapshot.parse_status);
    incrementCount(parserCounts, snapshot.parser_engine);
    incrementCount(parserVersionCounts, snapshot.parser_version);
    incrementCount(packCounts, snapshot.fiscal_pack ?? 'null');

    warningsOf(snapshot).forEach((warning) => incrementCount(warningCounts, warning));
    (snapshot.surface_rows?.disclosure ?? []).forEach((surfaceRow) => {
      incrementCount(disclosureSurfaceCounts, surfaceRow.key);
    });

    // These two are summed per ticker rather than counted per snapshot.
    addPositive(issuerOverlayMatchCounts, snapshot.ticker, summary?.issuerOverlayMatchCount ?? 0);
    addPositive(unsupportedSurfacedConcepts, snapshot.ticker, summary?.unsupportedConceptCount ?? 0);
  }

  // Per-residual-concept tallies, classified against the taxonomy statement index.
  const residualCounts = new Map<string, number>();
  const residualNoRoleCounts = new Map<string, number>();
  const residualDisclosureOnlyCounts = new Map<string, number>();
  const residualEquityMatchCounts = new Map<string, number>();
  const residualDifferentStatementCounts = new Map<string, number>();
  const residualAbsentCounts = new Map<string, number>();

  for (const residual of residualRows) {
    const { qname, role_uri: roleUri, statement_kind: statementKind } = residual;

    incrementCount(residualCounts, qname);

    if ((roleUri ?? '').trim().length === 0) {
      incrementCount(residualNoRoleCounts, qname);
    }

    const matchedStatements = statementsForConcept(taxonomyStatementIndex, qname);
    const primaryStatements = matchedStatements.filter((statement) => statement !== 'disclosure');
    const knownToTaxonomy = matchedStatements.length > 0;
    const hasPrimaryStatement = primaryStatements.length > 0;

    if (knownToTaxonomy && !hasPrimaryStatement) {
      incrementCount(residualDisclosureOnlyCounts, qname);
    }

    if (matchedStatements.includes('equity')) {
      incrementCount(residualEquityMatchCounts, qname);
    }

    // The taxonomy lists the concept on a primary statement other than the one recorded here.
    if (statementKind && hasPrimaryStatement && !primaryStatements.includes(statementKind)) {
      incrementCount(
        residualDifferentStatementCounts,
        `${qname}::${statementKind}->${primaryStatements.join('|')}`
      );
    }

    if (!knownToTaxonomy) {
      incrementCount(residualAbsentCounts, qname);
    }
  }

  // Samples and headline numbers.
  const failedRows = rows.filter((row) => row.parse_status === 'failed').slice(0, options.sampleLimit);
  const warningRows = rows.filter((row) => warningsOf(row).length > 0).slice(0, options.sampleLimit);
  const legacyCount = rows.reduce((total, row) => total + (row.parser_engine === 'legacy-ts' ? 1 : 0), 0);
  const deferredCount = countRowsWithWarning('surface_rows_deferred_to_typescript');
  const fallbackCount = countRowsWithWarning('ts_compact_surface_fallback_used');

  const summaryLines = [`[report-taxonomy-health] snapshots=${rows.length}`];
  if (options.ticker) {
    summaryLines.push(`[report-taxonomy-health] ticker=${options.ticker}`);
  }
  if (options.from || options.to) {
    summaryLines.push(`[report-taxonomy-health] range=${options.from ?? 'min'}..${options.to ?? 'max'}`);
  }
  summaryLines.push(
    `[report-taxonomy-health] legacy_ts=${legacyCount}`,
    `[report-taxonomy-health] deferred_to_typescript=${deferredCount}`,
    `[report-taxonomy-health] ts_compact_surface_fallback=${fallbackCount}`,
    `[report-taxonomy-health] residual_rows=${residualRows.length}`
  );
  for (const line of summaryLines) {
    console.log(line);
  }

  // Tally sections, in the order they are printed.
  const sections: Array<[string, Map<string, number>]> = [
    ['parse_status', statusCounts],
    ['parser_engine', parserCounts],
    ['parser_version', parserVersionCounts],
    ['fiscal_pack', packCounts],
    ['warnings', warningCounts],
    ['residual_top_concepts', residualCounts],
    ['residual_missing_role_uri', residualNoRoleCounts],
    ['residual_disclosure_only', residualDisclosureOnlyCounts],
    ['residual_matching_equity_taxonomy', residualEquityMatchCounts],
    ['residual_different_primary_statement', residualDifferentStatementCounts],
    ['residual_absent_from_taxonomy', residualAbsentCounts],
    ['disclosure_surface_counts', disclosureSurfaceCounts],
    ['issuer_overlay_match_counts', issuerOverlayMatchCounts],
    ['unsupported_surfaced_concepts', unsupportedSurfacedConcepts]
  ];
  for (const [label, counts] of sections) {
    printCountMap(label, counts);
  }

  printSamples('failed_samples', failedRows);
  printSamples('warning_samples', warningRows);

  // Strict mode: the full report is printed first, then the run is failed.
  if (!options.failOnResiduals || residualRows.length === 0) {
    return;
  }
  throw new Error(`strict mode failed: surfaced residual_rows=${residualRows.length}`);
}
// Run the report. A rejection is reported on stderr and recorded as exit code 1 via
// `process.exitCode`, without calling `process.exit`.
void main().catch((reason) => {
  let detail: string;
  if (reason instanceof Error) {
    detail = reason.message;
  } else {
    detail = String(reason);
  }
  console.error(`[report-taxonomy-health] fatal: ${detail}`);
  process.exitCode = 1;
});