Fix financial taxonomy snapshot normalization

This commit is contained in:
2026-03-13 19:01:56 -04:00
parent b1c9c0ef08
commit 30977dc15f
16 changed files with 1273 additions and 156 deletions

View File

@@ -1,5 +1,6 @@
import { and, desc, eq, gte, inArray, lt, sql } from 'drizzle-orm';
import type {
DetailFinancialRow,
Filing,
FinancialStatementKind,
MetricValidationResult,
@@ -283,6 +284,18 @@ export type UpsertFilingTaxonomySnapshotInput = {
}>;
};
/**
 * Every statement kind the normalizers iterate over, in a fixed order.
 *
 * `as const` produces a readonly tuple, so the `satisfies` check has to target a
 * readonly array type; a mutable `FinancialStatementKind[]` target rejects it.
 */
const FINANCIAL_STATEMENT_KINDS = [
  'income',
  'balance',
  'cash_flow',
  'equity',
  'comprehensive_income'
] as const satisfies readonly FinancialStatementKind[];
/** One list of taxonomy statement rows per financial statement kind. */
type StatementRowMap = Record<FinancialStatementKind, TaxonomyStatementRow[]>;
/** One list of surface rows per financial statement kind. */
type SurfaceRowMap = Record<FinancialStatementKind, SurfaceFinancialRow[]>;
/** One surface-detail map per financial statement kind. */
type DetailRowMap = Record<FinancialStatementKind, SurfaceDetailMap>;
function tenYearsAgoIso() {
const date = new Date();
date.setUTCFullYear(date.getUTCFullYear() - 10);
@@ -310,7 +323,394 @@ function asNumericText(value: number | null) {
return String(value);
}
function emptyStatementRows(): Record<FinancialStatementKind, TaxonomyStatementRow[]> {
/**
 * Narrows an arbitrary value to a string-keyed record.
 *
 * @param value - Anything, typically a parsed JSON fragment.
 * @returns The same reference typed as `Record<string, unknown>` when `value` is a
 *          non-null, non-array object; otherwise `null`.
 */
function asObject(value: unknown) {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return null;
  }

  return value as Record<string, unknown>;
}
/**
 * Returns `value` unchanged when it is a string (including the empty string),
 * and `null` for every other type.
 */
function asString(value: unknown) {
  if (typeof value !== 'string') {
    return null;
  }

  return value;
}
/**
 * Coerces a value for a nullable string field.
 *
 * Strings pass through unchanged; `null`, `undefined` and every non-string
 * value all become `null`. The previous nested ternary had a separate
 * `value === null` branch that produced the same `null` as its fallback, so it
 * is collapsed here.
 */
function asNullableString(value: unknown) {
  return typeof value === 'string' ? value : null;
}
/**
 * Coerces a loosely-typed flag to a boolean.
 *
 * - Real booleans are returned unchanged.
 * - The strings `"false"` and `"0"` (case-insensitive, surrounding whitespace
 *   ignored) are treated as `false`; plain truthiness would turn them into
 *   `true` because any non-empty string is truthy.
 * - Everything else falls back to JavaScript truthiness, so `0`, `''`, `null`
 *   and `undefined` are `false` and other values are `true`.
 */
function asBoolean(value: unknown) {
  if (typeof value === 'boolean') {
    return value;
  }

  if (typeof value === 'string') {
    const text = value.trim().toLowerCase();
    if (text === 'false' || text === '0') {
      return false;
    }
  }

  return Boolean(value);
}
/**
 * Validates that `value` is one of the five known statement-kind strings.
 *
 * @returns The value itself when it is a recognised kind, otherwise `null`.
 */
function asStatementKind(value: unknown): FinancialStatementKind | null {
  const knownKinds: readonly unknown[] = [
    'income',
    'balance',
    'cash_flow',
    'equity',
    'comprehensive_income'
  ];

  // Membership was just verified at runtime, so the assertion is sound.
  return knownKinds.includes(value) ? (value as FinancialStatementKind) : null;
}
/**
 * Rebuilds a key/value map, passing every value through `asNumber`
 * (defined elsewhere in this file).
 *
 * @param value - Candidate map; anything `asObject` rejects yields an empty object.
 * @returns A new object with the same keys and `asNumber`-coerced values.
 */
function normalizeNumberMap(value: unknown) {
  const source = asObject(value);
  if (source === null) {
    return {};
  }

  const pairs = Object.entries(source).map(
    ([name, raw]) => [name, asNumber(raw)] as const
  );

  return Object.fromEntries(pairs);
}
/**
 * Rebuilds a key/value map, passing every value through `asNullableString`.
 *
 * @param value - Candidate map; anything `asObject` rejects yields an empty object.
 * @returns A new object with the same keys and `asNullableString`-coerced values.
 */
function normalizeNullableStringMap(value: unknown) {
  const source = asObject(value);
  if (source === null) {
    return {};
  }

  const pairs = Object.entries(source).map(
    ([name, raw]) => [name, asNullableString(raw)] as const
  );

  return Object.fromEntries(pairs);
}
/**
 * Extracts the string elements of an array, preserving their order.
 *
 * @param value - Candidate array; a non-array input yields an empty list.
 * @returns A new array containing only the entries whose type is `string`.
 */
function normalizeStringArray(value: unknown) {
  const strings: string[] = [];

  if (Array.isArray(value)) {
    for (const entry of value) {
      if (typeof entry === 'string') {
        strings.push(entry);
      }
    }
  }

  return strings;
}
/**
 * Coerces every element of an array with `asNumber` (defined elsewhere in this
 * file) and keeps only the non-null results, in their original order.
 *
 * @param value - Candidate array; a non-array input yields an empty list.
 */
function normalizeNumberArray(value: unknown) {
  if (!Array.isArray(value)) {
    return [];
  }

  return value.reduce<number[]>((numbers, entry) => {
    const parsed = asNumber(entry);
    if (parsed !== null) {
      numbers.push(parsed);
    }
    return numbers;
  }, []);
}
/**
 * Normalizes a list of filing periods.
 *
 * Each entry may use camelCase or snake_case keys; the camelCase key is read
 * first. An entry is dropped when it is not an object or when any of `id`,
 * `filingId`, `accessionNumber`, `filingDate`, `filingType` or `periodLabel`
 * is missing or invalid. `periodStart` and `periodEnd` are optional and
 * default to `null`.
 *
 * @param value - Candidate array; a non-array input yields an empty list.
 * @returns The valid periods, in their original order.
 */
function normalizePeriods(value: unknown): FilingTaxonomyPeriod[] {
  if (!Array.isArray(value)) {
    return [];
  }

  // Only these two form types are accepted; anything else counts as missing.
  const asFilingType = (raw: unknown): '10-K' | '10-Q' | null =>
    raw === '10-K' || raw === '10-Q' ? raw : null;

  return value
    .map((entry) => {
      const row = asObject(entry);
      if (!row) {
        return null;
      }

      const id = asString(row.id);
      const filingId = asNumber(row.filingId ?? row.filing_id);
      const accessionNumber = asString(row.accessionNumber ?? row.accession_number);
      const filingDate = asString(row.filingDate ?? row.filing_date);
      // A valid camelCase value wins over the snake_case one, matching the
      // precedence used for every other field. Previously a '10-K' under either
      // key overrode a '10-Q' stored under `filingType`.
      const filingType = asFilingType(row.filingType) ?? asFilingType(row.filing_type);
      const periodLabel = asString(row.periodLabel ?? row.period_label);

      if (!id || filingId === null || !accessionNumber || !filingDate || !filingType || !periodLabel) {
        return null;
      }

      return {
        id,
        filingId,
        accessionNumber,
        filingDate,
        periodStart: asNullableString(row.periodStart ?? row.period_start),
        periodEnd: asNullableString(row.periodEnd ?? row.period_end),
        filingType,
        periodLabel
      } satisfies FilingTaxonomyPeriod;
    })
    .filter((entry): entry is FilingTaxonomyPeriod => entry !== null);
}
/**
 * Normalizes a per-statement map of taxonomy statement rows.
 *
 * For every known statement kind, the matching array in `value` is walked and
 * each entry is rebuilt with camelCase keys (snake_case keys are accepted as a
 * fallback). An entry is dropped when it is not an object or when any of
 * `key`, `label`, `conceptKey`, `qname`, `namespaceUri` or `localName` is
 * missing or empty. `key` falls back to the concept key.
 *
 * @param value - Candidate map keyed by statement kind.
 * @param fallbackRows - Returned as-is (same reference) when `value` is not an object.
 * @returns A fresh map with one (possibly empty) row list per statement kind.
 */
function normalizeStatementRows(
  value: unknown,
  fallbackRows: StatementRowMap = emptyStatementRows()
): StatementRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  const result = emptyStatementRows();

  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const candidate = source[statement];
    const collected: TaxonomyStatementRow[] = [];

    for (const entry of Array.isArray(candidate) ? candidate : []) {
      const raw = asObject(entry);
      if (raw === null) {
        continue;
      }

      const conceptKey = asString(raw.conceptKey ?? raw.concept_key);
      const key = asString(raw.key) ?? conceptKey;
      const label = asString(raw.label);
      const qname = asString(raw.qname);
      const namespaceUri = asString(raw.namespaceUri ?? raw.namespace_uri);
      const localName = asString(raw.localName ?? raw.local_name);

      const hasIdentity = key && label && conceptKey && qname && namespaceUri && localName;
      if (!hasIdentity) {
        continue;
      }

      collected.push({
        key,
        label,
        conceptKey,
        qname,
        namespaceUri,
        localName,
        isExtension: asBoolean(raw.isExtension ?? raw.is_extension),
        // A row without a valid statement of its own inherits the bucket it sits in.
        statement: asStatementKind(raw.statement) ?? statement,
        roleUri: asNullableString(raw.roleUri ?? raw.role_uri),
        // Rows without an explicit order get the largest safe integer as their order.
        order: asNumber(raw.order) ?? Number.MAX_SAFE_INTEGER,
        depth: asNumber(raw.depth) ?? 0,
        parentKey: asNullableString(raw.parentKey ?? raw.parent_key),
        values: normalizeNumberMap(raw.values),
        units: normalizeNullableStringMap(raw.units),
        hasDimensions: asBoolean(raw.hasDimensions ?? raw.has_dimensions),
        sourceFactIds: normalizeNumberArray(raw.sourceFactIds ?? raw.source_fact_ids)
      });
    }

    result[statement] = collected;
  }

  return result;
}
/**
 * Normalizes a per-statement map of surface financial rows.
 *
 * Each entry is rebuilt with camelCase keys (snake_case accepted as a
 * fallback). An entry is dropped when it is not an object or when `key`,
 * `label`, `category` or `unit` is missing or empty. Optional fields
 * (`templateSection`, `statement`, `detailCount`, `resolutionMethod`,
 * `confidence`, `warningCodes`) are only set when the input holds a usable
 * value, so absent fields stay absent in the output.
 *
 * @param value - Candidate map keyed by statement kind.
 * @param fallbackRows - Returned as-is (same reference) when `value` is not an object.
 * @returns A fresh map with one (possibly empty) row list per statement kind.
 */
function normalizeSurfaceRows(
  value: unknown,
  fallbackRows: SurfaceRowMap = emptySurfaceRows()
): SurfaceRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  const result = emptySurfaceRows();

  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const candidate = source[statement];
    const collected: SurfaceFinancialRow[] = [];

    for (const entry of Array.isArray(candidate) ? candidate : []) {
      const raw = asObject(entry);
      if (raw === null) {
        continue;
      }

      const key = asString(raw.key);
      const label = asString(raw.label);
      const category = asString(raw.category);
      const unit = asString(raw.unit);
      if (!key || !label || !category || !unit) {
        continue;
      }

      const surfaceRow: SurfaceFinancialRow = {
        key,
        label,
        category: category as SurfaceFinancialRow['category'],
        order: asNumber(raw.order) ?? Number.MAX_SAFE_INTEGER,
        unit: unit as SurfaceFinancialRow['unit'],
        values: normalizeNumberMap(raw.values),
        sourceConcepts: normalizeStringArray(raw.sourceConcepts ?? raw.source_concepts),
        sourceRowKeys: normalizeStringArray(raw.sourceRowKeys ?? raw.source_row_keys),
        sourceFactIds: normalizeNumberArray(raw.sourceFactIds ?? raw.source_fact_ids),
        formulaKey: asNullableString(raw.formulaKey ?? raw.formula_key),
        hasDimensions: asBoolean(raw.hasDimensions ?? raw.has_dimensions),
        resolvedSourceRowKeys: normalizeNullableStringMap(raw.resolvedSourceRowKeys ?? raw.resolved_source_row_keys)
      };

      const templateSection = asString(raw.templateSection ?? raw.template_section);
      if (templateSection) {
        surfaceRow.templateSection = templateSection as SurfaceFinancialRow['templateSection'];
      }

      // Only these three statement kinds are copied onto a surface row.
      const rowStatement = asStatementKind(raw.statement);
      if (rowStatement === 'income' || rowStatement === 'balance' || rowStatement === 'cash_flow') {
        surfaceRow.statement = rowStatement;
      }

      const detailCount = asNumber(raw.detailCount ?? raw.detail_count);
      if (detailCount !== null) {
        surfaceRow.detailCount = detailCount;
      }

      const resolutionMethod = raw.resolutionMethod ?? raw.resolution_method;
      if (
        resolutionMethod === 'direct'
        || resolutionMethod === 'surface_bridge'
        || resolutionMethod === 'formula_derived'
        || resolutionMethod === 'not_meaningful'
      ) {
        surfaceRow.resolutionMethod = resolutionMethod;
      }

      const confidence = raw.confidence;
      if (confidence === 'high' || confidence === 'medium' || confidence === 'low') {
        surfaceRow.confidence = confidence;
      }

      const warningCodes = normalizeStringArray(raw.warningCodes ?? raw.warning_codes);
      if (warningCodes.length > 0) {
        surfaceRow.warningCodes = warningCodes;
      }

      collected.push(surfaceRow);
    }

    result[statement] = collected;
  }

  return result;
}
/**
 * Normalizes a per-statement map of detail rows grouped by surface key.
 *
 * For every known statement kind, each group (surface key → array) is rebuilt.
 * A group whose value is not an array becomes an empty list but keeps its key.
 * An entry is dropped when it is not an object or when any of `key`, `label`,
 * `conceptKey`, `qname`, `namespaceUri` or `localName` is missing or empty.
 * `parentSurfaceKey` defaults to the group's own key.
 *
 * @param value - Candidate map keyed by statement kind.
 * @param fallbackRows - Returned as-is (same reference) when `value` is not an object.
 * @returns A fresh map with one group map per statement kind.
 */
function normalizeDetailRows(
  value: unknown,
  fallbackRows: DetailRowMap = emptyDetailRows()
): DetailRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  const result = emptyDetailRows();

  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const groups = asObject(source[statement]) ?? {};
    const groupEntries: Array<[string, DetailFinancialRow[]]> = [];

    for (const [surfaceKey, candidate] of Object.entries(groups)) {
      const collected: DetailFinancialRow[] = [];

      for (const entry of Array.isArray(candidate) ? candidate : []) {
        const raw = asObject(entry);
        if (raw === null) {
          continue;
        }

        const conceptKey = asString(raw.conceptKey ?? raw.concept_key);
        const key = asString(raw.key) ?? conceptKey;
        const label = asString(raw.label);
        const qname = asString(raw.qname);
        const namespaceUri = asString(raw.namespaceUri ?? raw.namespace_uri);
        const localName = asString(raw.localName ?? raw.local_name);

        const hasIdentity = key && label && conceptKey && qname && namespaceUri && localName;
        if (!hasIdentity) {
          continue;
        }

        collected.push({
          key,
          parentSurfaceKey: asString(raw.parentSurfaceKey ?? raw.parent_surface_key) ?? surfaceKey,
          label,
          conceptKey,
          qname,
          namespaceUri,
          localName,
          unit: asNullableString(raw.unit),
          values: normalizeNumberMap(raw.values),
          sourceFactIds: normalizeNumberArray(raw.sourceFactIds ?? raw.source_fact_ids),
          isExtension: asBoolean(raw.isExtension ?? raw.is_extension),
          dimensionsSummary: normalizeStringArray(raw.dimensionsSummary ?? raw.dimensions_summary),
          residualFlag: asBoolean(raw.residualFlag ?? raw.residual_flag)
        });
      }

      groupEntries.push([surfaceKey, collected]);
    }

    // Object.fromEntries defines own properties, so unusual keys are stored verbatim.
    result[statement] = Object.fromEntries(groupEntries);
  }

  return result;
}
/**
 * Normalizes a list of structured KPI rows.
 *
 * Each entry is rebuilt with camelCase keys (snake_case accepted as a
 * fallback). An entry is dropped when it is not an object, when `key`,
 * `label`, `category` or `unit` is missing or empty, or when its provenance
 * type is neither `'taxonomy'` nor `'structured_note'`.
 *
 * @param value - Candidate array; a non-array input yields an empty list.
 * @returns The valid KPI rows, in their original order.
 */
function normalizeKpiRows(value: unknown) {
  const kpiRows: StructuredKpiRow[] = [];
  if (!Array.isArray(value)) {
    return kpiRows;
  }

  for (const entry of value) {
    const raw = asObject(entry);
    if (raw === null) {
      continue;
    }

    const key = asString(raw.key);
    const label = asString(raw.label);
    const category = asString(raw.category);
    const unit = asString(raw.unit);
    if (!key || !label || !category || !unit) {
      continue;
    }

    const provenanceType = raw.provenanceType ?? raw.provenance_type;
    if (provenanceType !== 'taxonomy' && provenanceType !== 'structured_note') {
      continue;
    }

    kpiRows.push({
      key,
      label,
      category: category as StructuredKpiRow['category'],
      unit: unit as StructuredKpiRow['unit'],
      order: asNumber(raw.order) ?? Number.MAX_SAFE_INTEGER,
      segment: asNullableString(raw.segment),
      axis: asNullableString(raw.axis),
      member: asNullableString(raw.member),
      values: normalizeNumberMap(raw.values),
      sourceConcepts: normalizeStringArray(raw.sourceConcepts ?? raw.source_concepts),
      sourceFactIds: normalizeNumberArray(raw.sourceFactIds ?? raw.source_fact_ids),
      provenanceType,
      hasDimensions: asBoolean(raw.hasDimensions ?? raw.has_dimensions)
    } satisfies StructuredKpiRow);
  }

  return kpiRows;
}
/**
 * Normalizes a normalization summary.
 *
 * Counts are read from the camelCase key first, then the snake_case key, and
 * default to `0` when `asNumber` (defined elsewhere in this file) yields
 * `null`. `warnings` keeps only string entries.
 *
 * @param value - Candidate summary object.
 * @returns The rebuilt summary, or `null` when `value` is not an object.
 */
function normalizeNormalizationSummary(value: unknown) {
  const summary = asObject(value);
  if (summary === null) {
    return null;
  }

  // Shared "camelCase, else snake_case, else 0" rule for the count fields.
  const count = (camel: unknown, snake: unknown) => asNumber(camel ?? snake) ?? 0;

  return {
    surfaceRowCount: count(summary.surfaceRowCount, summary.surface_row_count),
    detailRowCount: count(summary.detailRowCount, summary.detail_row_count),
    kpiRowCount: count(summary.kpiRowCount, summary.kpi_row_count),
    unmappedRowCount: count(summary.unmappedRowCount, summary.unmapped_row_count),
    materialUnmappedRowCount: count(summary.materialUnmappedRowCount, summary.material_unmapped_row_count),
    warnings: normalizeStringArray(summary.warnings)
  } satisfies NormalizationSummary;
}
/**
 * Normalizes every JSON-shaped section of a filing taxonomy snapshot.
 *
 * Each section is passed through its dedicated normalizer, so the result
 * always has the expected containers even when the input holds `null` or
 * malformed data.
 *
 * `faithful_rows` and `statement_rows` fall back to each other:
 * - when `faithful_rows` is not an object, the normalized `statement_rows` are used;
 * - when `statement_rows` is not an object, the normalized `faithful_rows` are used;
 * - when neither is an object, both are the same empty map.
 * In the fallback cases both properties share one object reference.
 *
 * @param input - Raw snapshot sections (from a database row or an upsert request).
 * @returns The normalized sections under the same snake_case property names.
 */
export function normalizeFilingTaxonomySnapshotPayload(input: {
  periods: unknown;
  faithful_rows: unknown;
  statement_rows: unknown;
  surface_rows: unknown;
  detail_rows: unknown;
  kpi_rows: unknown;
  normalization_summary: unknown;
}) {
  const normalizedStatementRows = normalizeStatementRows(input.statement_rows);
  // Snapshots that only carry `statement_rows` keep a populated faithful view
  // instead of an empty one.
  const faithfulRows = normalizeStatementRows(input.faithful_rows, normalizedStatementRows);
  // Use `statement_rows` when supplied; otherwise mirror the faithful rows.
  const statementRows = asObject(input.statement_rows) !== null
    ? normalizedStatementRows
    : faithfulRows;

  return {
    periods: normalizePeriods(input.periods),
    faithful_rows: faithfulRows,
    statement_rows: statementRows,
    surface_rows: normalizeSurfaceRows(input.surface_rows),
    detail_rows: normalizeDetailRows(input.detail_rows),
    kpi_rows: normalizeKpiRows(input.kpi_rows),
    normalization_summary: normalizeNormalizationSummary(input.normalization_summary)
  };
}
function emptyStatementRows(): StatementRowMap {
return {
income: [],
balance: [],
@@ -320,7 +720,7 @@ function emptyStatementRows(): Record<FinancialStatementKind, TaxonomyStatementR
};
}
function emptySurfaceRows(): Record<FinancialStatementKind, SurfaceFinancialRow[]> {
function emptySurfaceRows(): SurfaceRowMap {
return {
income: [],
balance: [],
@@ -330,7 +730,7 @@ function emptySurfaceRows(): Record<FinancialStatementKind, SurfaceFinancialRow[
};
}
function emptyDetailRows(): Record<FinancialStatementKind, SurfaceDetailMap> {
function emptyDetailRows(): DetailRowMap {
return {
income: {},
balance: {},
@@ -341,7 +741,15 @@ function emptyDetailRows(): Record<FinancialStatementKind, SurfaceDetailMap> {
}
function toSnapshotRecord(row: typeof filingTaxonomySnapshot.$inferSelect): FilingTaxonomySnapshotRecord {
const faithfulRows = row.faithful_rows ?? row.statement_rows ?? emptyStatementRows();
const normalized = normalizeFilingTaxonomySnapshotPayload({
periods: row.periods,
faithful_rows: row.faithful_rows,
statement_rows: row.statement_rows,
surface_rows: row.surface_rows,
detail_rows: row.detail_rows,
kpi_rows: row.kpi_rows,
normalization_summary: row.normalization_summary
});
return {
id: row.id,
@@ -356,15 +764,15 @@ function toSnapshotRecord(row: typeof filingTaxonomySnapshot.$inferSelect): Fili
parser_version: row.parser_version,
taxonomy_regime: row.taxonomy_regime,
fiscal_pack: row.fiscal_pack,
periods: row.periods ?? [],
faithful_rows: faithfulRows,
statement_rows: faithfulRows,
surface_rows: row.surface_rows ?? emptySurfaceRows(),
detail_rows: row.detail_rows ?? emptyDetailRows(),
kpi_rows: row.kpi_rows ?? [],
periods: normalized.periods,
faithful_rows: normalized.faithful_rows,
statement_rows: normalized.statement_rows,
surface_rows: normalized.surface_rows,
detail_rows: normalized.detail_rows,
kpi_rows: normalized.kpi_rows,
derived_metrics: row.derived_metrics ?? null,
validation_result: row.validation_result ?? null,
normalization_summary: row.normalization_summary ?? null,
normalization_summary: normalized.normalization_summary,
facts_count: row.facts_count,
concepts_count: row.concepts_count,
dimensions_count: row.dimensions_count,
@@ -552,6 +960,7 @@ export async function listFilingTaxonomyMetricValidations(snapshotId: number) {
export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySnapshotInput) {
const now = new Date().toISOString();
const normalized = normalizeFilingTaxonomySnapshotPayload(input);
const [saved] = await withFinancialIngestionSchemaRetry({
client: getSqliteClient(),
@@ -570,15 +979,15 @@ export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySn
parser_version: input.parser_version,
taxonomy_regime: input.taxonomy_regime,
fiscal_pack: input.fiscal_pack,
periods: input.periods,
faithful_rows: input.faithful_rows,
statement_rows: input.statement_rows,
surface_rows: input.surface_rows,
detail_rows: input.detail_rows,
kpi_rows: input.kpi_rows,
periods: normalized.periods,
faithful_rows: normalized.faithful_rows,
statement_rows: normalized.statement_rows,
surface_rows: normalized.surface_rows,
detail_rows: normalized.detail_rows,
kpi_rows: normalized.kpi_rows,
derived_metrics: input.derived_metrics,
validation_result: input.validation_result,
normalization_summary: input.normalization_summary,
normalization_summary: normalized.normalization_summary,
facts_count: input.facts_count,
concepts_count: input.concepts_count,
dimensions_count: input.dimensions_count,
@@ -598,15 +1007,15 @@ export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySn
parser_version: input.parser_version,
taxonomy_regime: input.taxonomy_regime,
fiscal_pack: input.fiscal_pack,
periods: input.periods,
faithful_rows: input.faithful_rows,
statement_rows: input.statement_rows,
surface_rows: input.surface_rows,
detail_rows: input.detail_rows,
kpi_rows: input.kpi_rows,
periods: normalized.periods,
faithful_rows: normalized.faithful_rows,
statement_rows: normalized.statement_rows,
surface_rows: normalized.surface_rows,
detail_rows: normalized.detail_rows,
kpi_rows: normalized.kpi_rows,
derived_metrics: input.derived_metrics,
validation_result: input.validation_result,
normalization_summary: input.normalization_summary,
normalization_summary: normalized.normalization_summary,
facts_count: input.facts_count,
concepts_count: input.concepts_count,
dimensions_count: input.dimensions_count,
@@ -906,3 +1315,8 @@ export async function listTaxonomyAssetsBySnapshotIds(snapshotIds: number[]) {
return rows.map(toAssetRecord);
}
/**
 * Bundles two module-level helpers under a single export, including
 * `toSnapshotRecord`, which is not exported on its own.
 * NOTE(review): the double-underscore prefix suggests this is intended for
 * tests rather than production callers — confirm.
 */
export const __filingTaxonomyInternals = {
normalizeFilingTaxonomySnapshotPayload,
toSnapshotRecord
};