Fix financial taxonomy snapshot normalization
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import { and, desc, eq, gte, inArray, lt, sql } from 'drizzle-orm';
|
||||
import type {
|
||||
DetailFinancialRow,
|
||||
Filing,
|
||||
FinancialStatementKind,
|
||||
MetricValidationResult,
|
||||
@@ -283,6 +284,18 @@ export type UpsertFilingTaxonomySnapshotInput = {
|
||||
}>;
|
||||
};
|
||||
|
||||
/**
 * Statement kinds iterated by the row normalizers in this module.
 *
 * `as const` keeps the literal tuple type. The `satisfies` target is a
 * `readonly` array because a const-asserted tuple is readonly and therefore
 * not assignable to a mutable `FinancialStatementKind[]`.
 */
const FINANCIAL_STATEMENT_KINDS = [
  'income',
  'balance',
  'cash_flow',
  'equity',
  'comprehensive_income'
] as const satisfies readonly FinancialStatementKind[];

/** Taxonomy statement rows keyed by statement kind. */
type StatementRowMap = Record<FinancialStatementKind, TaxonomyStatementRow[]>;

/** Surface rows keyed by statement kind. */
type SurfaceRowMap = Record<FinancialStatementKind, SurfaceFinancialRow[]>;

/** Per-surface detail-row maps keyed by statement kind. */
type DetailRowMap = Record<FinancialStatementKind, SurfaceDetailMap>;
|
||||
|
||||
function tenYearsAgoIso() {
|
||||
const date = new Date();
|
||||
date.setUTCFullYear(date.getUTCFullYear() - 10);
|
||||
@@ -310,7 +323,394 @@ function asNumericText(value: number | null) {
|
||||
return String(value);
|
||||
}
|
||||
|
||||
function emptyStatementRows(): Record<FinancialStatementKind, TaxonomyStatementRow[]> {
|
||||
/**
 * Views an arbitrary value as a plain key/value record.
 *
 * @param value - Any value.
 * @returns The same object reference typed as `Record<string, unknown>`, or
 *   `null` when the value is `null`, a primitive, or an array.
 */
function asObject(value: unknown) {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return null;
  }

  return value as Record<string, unknown>;
}
|
||||
|
||||
/**
 * Returns the value when it is a string (including the empty string),
 * otherwise `null`.
 */
function asString(value: unknown) {
  if (typeof value !== 'string') {
    return null;
  }

  return value;
}
|
||||
|
||||
/**
 * Returns the value when it is a string, otherwise `null`.
 *
 * `null`, `undefined`, and every other non-string input all collapse to
 * `null`, so a single check is sufficient.
 */
function asNullableString(value: unknown) {
  return typeof value === 'string' ? value : null;
}
|
||||
|
||||
/**
 * Coerces an arbitrary value to a boolean.
 *
 * - Real booleans are returned unchanged.
 * - Strings are trimmed and lower-cased first: `''`, `'false'`, and `'0'` are
 *   false, and any other text is true. Plain `Boolean('false')` would be
 *   `true`, which would invert a flag that arrives as text.
 * - Everything else follows JavaScript truthiness (`0`, `null`, `undefined`,
 *   `NaN` are false).
 *
 * @param value - Any value.
 * @returns The coerced boolean.
 */
function asBoolean(value: unknown) {
  if (typeof value === 'boolean') {
    return value;
  }

  if (typeof value === 'string') {
    const text = value.trim().toLowerCase();
    return text !== '' && text !== 'false' && text !== '0';
  }

  return Boolean(value);
}
|
||||
|
||||
/**
 * Returns the value when it is exactly one of the five statement-kind
 * literals, otherwise `null`. Matching is strict: no trimming or case folding.
 */
function asStatementKind(value: unknown): FinancialStatementKind | null {
  switch (value) {
    case 'income':
    case 'balance':
    case 'cash_flow':
    case 'equity':
    case 'comprehensive_income':
      return value;
    default:
      return null;
  }
}
|
||||
|
||||
/**
 * Rebuilds a key/value object with every value passed through `asNumber`.
 *
 * @param value - Any value; anything `asObject` rejects yields `{}`.
 * @returns A new object with the same keys and `asNumber`-converted values.
 */
function normalizeNumberMap(value: unknown) {
  const source = asObject(value);

  return source === null
    ? {}
    : Object.fromEntries(
        Object.entries(source).map(([name, raw]) => [name, asNumber(raw)])
      );
}
|
||||
|
||||
/**
 * Rebuilds a key/value object with every value passed through
 * `asNullableString`, so non-string values become `null`.
 *
 * @param value - Any value; anything `asObject` rejects yields `{}`.
 * @returns A new object with the same keys and string-or-null values.
 */
function normalizeNullableStringMap(value: unknown) {
  const source = asObject(value);

  return source === null
    ? {}
    : Object.fromEntries(
        Object.entries(source).map(([name, raw]) => [name, asNullableString(raw)])
      );
}
|
||||
|
||||
/**
 * Returns a new array holding only the string elements of `value`, in their
 * original order. Non-array input yields an empty array.
 */
function normalizeStringArray(value: unknown) {
  const strings: string[] = [];

  if (Array.isArray(value)) {
    // forEach skips holes in sparse arrays, as filter does.
    value.forEach((entry) => {
      if (typeof entry === 'string') {
        strings.push(entry);
      }
    });
  }

  return strings;
}
|
||||
|
||||
/**
 * Converts each element of `value` with `asNumber` and keeps the results that
 * are not `null`, in their original order. Non-array input yields an empty
 * array.
 */
function normalizeNumberArray(value: unknown) {
  if (!Array.isArray(value)) {
    return [];
  }

  const numbers: number[] = [];
  // forEach skips holes in sparse arrays, as map/filter do.
  value.forEach((entry) => {
    const parsed = asNumber(entry);
    if (parsed !== null) {
      numbers.push(parsed);
    }
  });

  return numbers;
}
|
||||
|
||||
/**
 * Normalizes a list of period entries into `FilingTaxonomyPeriod` records.
 *
 * Each field is read from its camelCase key first and its snake_case key
 * second. An entry is dropped when it is not a plain object or when any of
 * `id`, `filingId`, `accessionNumber`, `filingDate`, `filingType`
 * (`'10-K'` or `'10-Q'`) or `periodLabel` is missing or invalid.
 * `periodStart` and `periodEnd` are optional and default to `null`.
 *
 * @param value - Any value; non-array input yields an empty array.
 * @returns The valid periods, in input order.
 */
function normalizePeriods(value: unknown): FilingTaxonomyPeriod[] {
  const periods: FilingTaxonomyPeriod[] = [];
  if (!Array.isArray(value)) {
    return periods;
  }

  for (const entry of value) {
    const row = asObject(entry);
    if (row === null) {
      continue;
    }

    const id = asString(row.id);
    const filingId = asNumber(row.filingId ?? row.filing_id);
    const accessionNumber = asString(row.accessionNumber ?? row.accession_number);
    const filingDate = asString(row.filingDate ?? row.filing_date);

    // '10-K' under either spelling of the key takes precedence over '10-Q'.
    let filingType: '10-K' | '10-Q' | null = null;
    if (row.filingType === '10-K' || row.filing_type === '10-K') {
      filingType = '10-K';
    } else if (row.filingType === '10-Q' || row.filing_type === '10-Q') {
      filingType = '10-Q';
    }

    const periodLabel = asString(row.periodLabel ?? row.period_label);

    if (!id || filingId === null || !accessionNumber || !filingDate || !filingType || !periodLabel) {
      continue;
    }

    periods.push({
      id,
      filingId,
      accessionNumber,
      filingDate,
      periodStart: asNullableString(row.periodStart ?? row.period_start),
      periodEnd: asNullableString(row.periodEnd ?? row.period_end),
      filingType,
      periodLabel
    });
  }

  return periods;
}
|
||||
|
||||
/**
 * Normalizes a per-statement map of taxonomy statement rows.
 *
 * When `value` is not a plain object, `fallbackRows` is returned as-is (the
 * same reference, not a copy). Otherwise every kind in
 * `FINANCIAL_STATEMENT_KINDS` receives a fresh array; a kind whose entry is
 * not an array ends up empty. A row is dropped unless `key` (falling back to
 * the concept key), `label`, `conceptKey`, `qname`, `namespaceUri` and
 * `localName` are all non-empty strings.
 *
 * @param value - Raw rows payload.
 * @param fallbackRows - Result used when `value` is not an object.
 * @returns The normalized map.
 */
function normalizeStatementRows(
  value: unknown,
  fallbackRows: StatementRowMap = emptyStatementRows()
): StatementRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  // Converts one raw entry; `statement` is the bucket being filled and is
  // used when the row does not carry a valid statement kind of its own.
  const parseRow = (entry: unknown, statement: FinancialStatementKind) => {
    const row = asObject(entry);
    if (row === null) {
      return null;
    }

    const conceptKey = asString(row.conceptKey ?? row.concept_key);
    const key = asString(row.key) ?? conceptKey;
    const label = asString(row.label);
    const qname = asString(row.qname);
    const namespaceUri = asString(row.namespaceUri ?? row.namespace_uri);
    const localName = asString(row.localName ?? row.local_name);
    if (!(key && label && conceptKey && qname && namespaceUri && localName)) {
      return null;
    }

    return {
      key,
      label,
      conceptKey,
      qname,
      namespaceUri,
      localName,
      isExtension: asBoolean(row.isExtension ?? row.is_extension),
      statement: asStatementKind(row.statement) ?? statement,
      roleUri: asNullableString(row.roleUri ?? row.role_uri),
      order: asNumber(row.order) ?? Number.MAX_SAFE_INTEGER,
      depth: asNumber(row.depth) ?? 0,
      parentKey: asNullableString(row.parentKey ?? row.parent_key),
      values: normalizeNumberMap(row.values),
      units: normalizeNullableStringMap(row.units),
      hasDimensions: asBoolean(row.hasDimensions ?? row.has_dimensions),
      sourceFactIds: normalizeNumberArray(row.sourceFactIds ?? row.source_fact_ids)
    };
  };

  const result = emptyStatementRows();
  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const candidate = source[statement];
    const entries = Array.isArray(candidate) ? candidate : [];
    result[statement] = entries
      .map((entry) => parseRow(entry, statement))
      .filter((row): row is TaxonomyStatementRow => row !== null);
  }

  return result;
}
|
||||
|
||||
/**
 * Normalizes a per-statement map of surface financial rows.
 *
 * When `value` is not a plain object, `fallbackRows` is returned as-is.
 * Otherwise every kind in `FINANCIAL_STATEMENT_KINDS` receives a fresh array.
 * A row is dropped unless `key`, `label`, `category` and `unit` are non-empty
 * strings. `category`, `unit` and `templateSection` are cast, not validated.
 * Optional fields (`templateSection`, `statement`, `detailCount`,
 * `resolutionMethod`, `confidence`, `warningCodes`) are set only when the
 * input holds an accepted value.
 *
 * @param value - Raw rows payload.
 * @param fallbackRows - Result used when `value` is not an object.
 * @returns The normalized map.
 */
function normalizeSurfaceRows(
  value: unknown,
  fallbackRows: SurfaceRowMap = emptySurfaceRows()
): SurfaceRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  const result = emptySurfaceRows();
  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const candidate = source[statement];
    const parsedRows: SurfaceFinancialRow[] = [];

    for (const entry of Array.isArray(candidate) ? candidate : []) {
      const row = asObject(entry);
      if (row === null) {
        continue;
      }

      const key = asString(row.key);
      const label = asString(row.label);
      const category = asString(row.category);
      const unit = asString(row.unit);
      if (!(key && label && category && unit)) {
        continue;
      }

      const surfaceRow: SurfaceFinancialRow = {
        key,
        label,
        category: category as SurfaceFinancialRow['category'],
        order: asNumber(row.order) ?? Number.MAX_SAFE_INTEGER,
        unit: unit as SurfaceFinancialRow['unit'],
        values: normalizeNumberMap(row.values),
        sourceConcepts: normalizeStringArray(row.sourceConcepts ?? row.source_concepts),
        sourceRowKeys: normalizeStringArray(row.sourceRowKeys ?? row.source_row_keys),
        sourceFactIds: normalizeNumberArray(row.sourceFactIds ?? row.source_fact_ids),
        formulaKey: asNullableString(row.formulaKey ?? row.formula_key),
        hasDimensions: asBoolean(row.hasDimensions ?? row.has_dimensions),
        resolvedSourceRowKeys: normalizeNullableStringMap(row.resolvedSourceRowKeys ?? row.resolved_source_row_keys)
      };

      // Optional fields are attached in a fixed order so the resulting
      // property order stays stable.
      const templateSection = asString(row.templateSection ?? row.template_section);
      if (templateSection) {
        surfaceRow.templateSection = templateSection as SurfaceFinancialRow['templateSection'];
      }

      // Only these three statement kinds are copied onto a surface row.
      const declaredStatement = asStatementKind(row.statement);
      if (
        declaredStatement === 'income'
        || declaredStatement === 'balance'
        || declaredStatement === 'cash_flow'
      ) {
        surfaceRow.statement = declaredStatement;
      }

      const detailCount = asNumber(row.detailCount ?? row.detail_count);
      if (detailCount !== null) {
        surfaceRow.detailCount = detailCount;
      }

      const resolutionMethod = row.resolutionMethod ?? row.resolution_method;
      if (
        resolutionMethod === 'direct'
        || resolutionMethod === 'surface_bridge'
        || resolutionMethod === 'formula_derived'
        || resolutionMethod === 'not_meaningful'
      ) {
        surfaceRow.resolutionMethod = resolutionMethod;
      }

      const confidence = row.confidence;
      if (confidence === 'high' || confidence === 'medium' || confidence === 'low') {
        surfaceRow.confidence = confidence;
      }

      const warningCodes = normalizeStringArray(row.warningCodes ?? row.warning_codes);
      if (warningCodes.length > 0) {
        surfaceRow.warningCodes = warningCodes;
      }

      parsedRows.push(surfaceRow);
    }

    result[statement] = parsedRows;
  }

  return result;
}
|
||||
|
||||
/**
 * Normalizes a per-statement map of detail rows grouped by surface key.
 *
 * When `value` is not a plain object, `fallbackRows` is returned as-is.
 * Otherwise, for every kind in `FINANCIAL_STATEMENT_KINDS`, each surface key
 * of the group object is kept; a key whose value is not an array maps to
 * `[]`. A row is dropped unless `key` (falling back to the concept key),
 * `label`, `conceptKey`, `qname`, `namespaceUri` and `localName` are all
 * non-empty strings. `parentSurfaceKey` defaults to the group's surface key.
 *
 * @param value - Raw rows payload.
 * @param fallbackRows - Result used when `value` is not an object.
 * @returns The normalized map.
 */
function normalizeDetailRows(
  value: unknown,
  fallbackRows: DetailRowMap = emptyDetailRows()
): DetailRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  // Converts one raw entry belonging to the group stored under `surfaceKey`.
  const parseRow = (entry: unknown, surfaceKey: string) => {
    const row = asObject(entry);
    if (row === null) {
      return null;
    }

    const conceptKey = asString(row.conceptKey ?? row.concept_key);
    const key = asString(row.key) ?? conceptKey;
    const label = asString(row.label);
    const qname = asString(row.qname);
    const namespaceUri = asString(row.namespaceUri ?? row.namespace_uri);
    const localName = asString(row.localName ?? row.local_name);
    if (!(key && label && conceptKey && qname && namespaceUri && localName)) {
      return null;
    }

    return {
      key,
      parentSurfaceKey: asString(row.parentSurfaceKey ?? row.parent_surface_key) ?? surfaceKey,
      label,
      conceptKey,
      qname,
      namespaceUri,
      localName,
      unit: asNullableString(row.unit),
      values: normalizeNumberMap(row.values),
      sourceFactIds: normalizeNumberArray(row.sourceFactIds ?? row.source_fact_ids),
      isExtension: asBoolean(row.isExtension ?? row.is_extension),
      dimensionsSummary: normalizeStringArray(row.dimensionsSummary ?? row.dimensions_summary),
      residualFlag: asBoolean(row.residualFlag ?? row.residual_flag)
    };
  };

  const result = emptyDetailRows();
  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const groups = asObject(source[statement]) ?? {};
    result[statement] = Object.fromEntries(
      Object.entries(groups).map(([surfaceKey, entries]) => {
        const parsedRows = Array.isArray(entries)
          ? entries
            .map((entry) => parseRow(entry, surfaceKey))
            .filter((row): row is DetailFinancialRow => row !== null)
          : [];

        return [surfaceKey, parsedRows];
      })
    );
  }

  return result;
}
|
||||
|
||||
/**
 * Normalizes a list of structured KPI rows.
 *
 * A row is dropped unless `key`, `label`, `category` and `unit` are non-empty
 * strings and the provenance type (camelCase or snake_case key) is
 * `'taxonomy'` or `'structured_note'`. `category` and `unit` are cast, not
 * validated.
 *
 * @param value - Any value; non-array input yields an empty array.
 * @returns The valid KPI rows, in input order.
 */
function normalizeKpiRows(value: unknown) {
  if (!Array.isArray(value)) {
    return [];
  }

  const kpiRows: StructuredKpiRow[] = [];
  for (const entry of value) {
    const row = asObject(entry);
    if (row === null) {
      continue;
    }

    const key = asString(row.key);
    const label = asString(row.label);
    const category = asString(row.category);
    const unit = asString(row.unit);
    const provenanceType = row.provenanceType ?? row.provenance_type;

    if (!key || !label || !category || !unit) {
      continue;
    }
    if (provenanceType !== 'taxonomy' && provenanceType !== 'structured_note') {
      continue;
    }

    kpiRows.push({
      key,
      label,
      category: category as StructuredKpiRow['category'],
      unit: unit as StructuredKpiRow['unit'],
      order: asNumber(row.order) ?? Number.MAX_SAFE_INTEGER,
      segment: asNullableString(row.segment),
      axis: asNullableString(row.axis),
      member: asNullableString(row.member),
      values: normalizeNumberMap(row.values),
      sourceConcepts: normalizeStringArray(row.sourceConcepts ?? row.source_concepts),
      sourceFactIds: normalizeNumberArray(row.sourceFactIds ?? row.source_fact_ids),
      provenanceType,
      hasDimensions: asBoolean(row.hasDimensions ?? row.has_dimensions)
    });
  }

  return kpiRows;
}
|
||||
|
||||
/**
 * Normalizes a normalization-summary payload.
 *
 * Each counter is read from its camelCase key first and its snake_case key
 * second, and becomes `0` when `asNumber` yields `null`. `warnings` keeps
 * only string elements.
 *
 * @param value - Any value.
 * @returns The summary, or `null` when `value` is not a plain object.
 */
function normalizeNormalizationSummary(value: unknown) {
  const row = asObject(value);
  if (row === null) {
    return null;
  }

  // Reads one counter under either spelling of its key.
  const count = (camelKey: string, snakeKey: string) => asNumber(row[camelKey] ?? row[snakeKey]) ?? 0;

  return {
    surfaceRowCount: count('surfaceRowCount', 'surface_row_count'),
    detailRowCount: count('detailRowCount', 'detail_row_count'),
    kpiRowCount: count('kpiRowCount', 'kpi_row_count'),
    unmappedRowCount: count('unmappedRowCount', 'unmapped_row_count'),
    materialUnmappedRowCount: count('materialUnmappedRowCount', 'material_unmapped_row_count'),
    warnings: normalizeStringArray(row.warnings)
  } satisfies NormalizationSummary;
}
|
||||
|
||||
/**
 * Normalizes every JSON-shaped section of a filing taxonomy snapshot.
 *
 * `faithful_rows` is normalized first and then serves as the fallback for
 * `statement_rows`. When `statement_rows` is not a plain object, both fields
 * of the result therefore refer to the same object.
 *
 * @param input - Raw snapshot sections of unknown shape.
 * @returns An object with the same seven keys, each holding normalized data.
 */
export function normalizeFilingTaxonomySnapshotPayload(input: {
  periods: unknown;
  faithful_rows: unknown;
  statement_rows: unknown;
  surface_rows: unknown;
  detail_rows: unknown;
  kpi_rows: unknown;
  normalization_summary: unknown;
}) {
  const faithful_rows = normalizeStatementRows(input.faithful_rows);
  const statement_rows = normalizeStatementRows(input.statement_rows, faithful_rows);
  const periods = normalizePeriods(input.periods);
  const surface_rows = normalizeSurfaceRows(input.surface_rows);
  const detail_rows = normalizeDetailRows(input.detail_rows);
  const kpi_rows = normalizeKpiRows(input.kpi_rows);
  const normalization_summary = normalizeNormalizationSummary(input.normalization_summary);

  return {
    periods,
    faithful_rows,
    statement_rows,
    surface_rows,
    detail_rows,
    kpi_rows,
    normalization_summary
  };
}
|
||||
|
||||
function emptyStatementRows(): StatementRowMap {
|
||||
return {
|
||||
income: [],
|
||||
balance: [],
|
||||
@@ -320,7 +720,7 @@ function emptyStatementRows(): Record<FinancialStatementKind, TaxonomyStatementR
|
||||
};
|
||||
}
|
||||
|
||||
function emptySurfaceRows(): Record<FinancialStatementKind, SurfaceFinancialRow[]> {
|
||||
function emptySurfaceRows(): SurfaceRowMap {
|
||||
return {
|
||||
income: [],
|
||||
balance: [],
|
||||
@@ -330,7 +730,7 @@ function emptySurfaceRows(): Record<FinancialStatementKind, SurfaceFinancialRow[
|
||||
};
|
||||
}
|
||||
|
||||
function emptyDetailRows(): Record<FinancialStatementKind, SurfaceDetailMap> {
|
||||
function emptyDetailRows(): DetailRowMap {
|
||||
return {
|
||||
income: {},
|
||||
balance: {},
|
||||
@@ -341,7 +741,15 @@ function emptyDetailRows(): Record<FinancialStatementKind, SurfaceDetailMap> {
|
||||
}
|
||||
|
||||
function toSnapshotRecord(row: typeof filingTaxonomySnapshot.$inferSelect): FilingTaxonomySnapshotRecord {
|
||||
const faithfulRows = row.faithful_rows ?? row.statement_rows ?? emptyStatementRows();
|
||||
const normalized = normalizeFilingTaxonomySnapshotPayload({
|
||||
periods: row.periods,
|
||||
faithful_rows: row.faithful_rows,
|
||||
statement_rows: row.statement_rows,
|
||||
surface_rows: row.surface_rows,
|
||||
detail_rows: row.detail_rows,
|
||||
kpi_rows: row.kpi_rows,
|
||||
normalization_summary: row.normalization_summary
|
||||
});
|
||||
|
||||
return {
|
||||
id: row.id,
|
||||
@@ -356,15 +764,15 @@ function toSnapshotRecord(row: typeof filingTaxonomySnapshot.$inferSelect): Fili
|
||||
parser_version: row.parser_version,
|
||||
taxonomy_regime: row.taxonomy_regime,
|
||||
fiscal_pack: row.fiscal_pack,
|
||||
periods: row.periods ?? [],
|
||||
faithful_rows: faithfulRows,
|
||||
statement_rows: faithfulRows,
|
||||
surface_rows: row.surface_rows ?? emptySurfaceRows(),
|
||||
detail_rows: row.detail_rows ?? emptyDetailRows(),
|
||||
kpi_rows: row.kpi_rows ?? [],
|
||||
periods: normalized.periods,
|
||||
faithful_rows: normalized.faithful_rows,
|
||||
statement_rows: normalized.statement_rows,
|
||||
surface_rows: normalized.surface_rows,
|
||||
detail_rows: normalized.detail_rows,
|
||||
kpi_rows: normalized.kpi_rows,
|
||||
derived_metrics: row.derived_metrics ?? null,
|
||||
validation_result: row.validation_result ?? null,
|
||||
normalization_summary: row.normalization_summary ?? null,
|
||||
normalization_summary: normalized.normalization_summary,
|
||||
facts_count: row.facts_count,
|
||||
concepts_count: row.concepts_count,
|
||||
dimensions_count: row.dimensions_count,
|
||||
@@ -552,6 +960,7 @@ export async function listFilingTaxonomyMetricValidations(snapshotId: number) {
|
||||
|
||||
export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySnapshotInput) {
|
||||
const now = new Date().toISOString();
|
||||
const normalized = normalizeFilingTaxonomySnapshotPayload(input);
|
||||
|
||||
const [saved] = await withFinancialIngestionSchemaRetry({
|
||||
client: getSqliteClient(),
|
||||
@@ -570,15 +979,15 @@ export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySn
|
||||
parser_version: input.parser_version,
|
||||
taxonomy_regime: input.taxonomy_regime,
|
||||
fiscal_pack: input.fiscal_pack,
|
||||
periods: input.periods,
|
||||
faithful_rows: input.faithful_rows,
|
||||
statement_rows: input.statement_rows,
|
||||
surface_rows: input.surface_rows,
|
||||
detail_rows: input.detail_rows,
|
||||
kpi_rows: input.kpi_rows,
|
||||
periods: normalized.periods,
|
||||
faithful_rows: normalized.faithful_rows,
|
||||
statement_rows: normalized.statement_rows,
|
||||
surface_rows: normalized.surface_rows,
|
||||
detail_rows: normalized.detail_rows,
|
||||
kpi_rows: normalized.kpi_rows,
|
||||
derived_metrics: input.derived_metrics,
|
||||
validation_result: input.validation_result,
|
||||
normalization_summary: input.normalization_summary,
|
||||
normalization_summary: normalized.normalization_summary,
|
||||
facts_count: input.facts_count,
|
||||
concepts_count: input.concepts_count,
|
||||
dimensions_count: input.dimensions_count,
|
||||
@@ -598,15 +1007,15 @@ export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySn
|
||||
parser_version: input.parser_version,
|
||||
taxonomy_regime: input.taxonomy_regime,
|
||||
fiscal_pack: input.fiscal_pack,
|
||||
periods: input.periods,
|
||||
faithful_rows: input.faithful_rows,
|
||||
statement_rows: input.statement_rows,
|
||||
surface_rows: input.surface_rows,
|
||||
detail_rows: input.detail_rows,
|
||||
kpi_rows: input.kpi_rows,
|
||||
periods: normalized.periods,
|
||||
faithful_rows: normalized.faithful_rows,
|
||||
statement_rows: normalized.statement_rows,
|
||||
surface_rows: normalized.surface_rows,
|
||||
detail_rows: normalized.detail_rows,
|
||||
kpi_rows: normalized.kpi_rows,
|
||||
derived_metrics: input.derived_metrics,
|
||||
validation_result: input.validation_result,
|
||||
normalization_summary: input.normalization_summary,
|
||||
normalization_summary: normalized.normalization_summary,
|
||||
facts_count: input.facts_count,
|
||||
concepts_count: input.concepts_count,
|
||||
dimensions_count: input.dimensions_count,
|
||||
@@ -906,3 +1315,8 @@ export async function listTaxonomyAssetsBySnapshotIds(snapshotIds: number[]) {
|
||||
|
||||
return rows.map(toAssetRecord);
|
||||
}
|
||||
|
||||
/**
 * Bundle exposing `normalizeFilingTaxonomySnapshotPayload` and
 * `toSnapshotRecord` under a single export.
 * NOTE(review): presumably intended for tests only — confirm before relying
 * on it from production code.
 */
export const __filingTaxonomyInternals = {
  normalizeFilingTaxonomySnapshotPayload: normalizeFilingTaxonomySnapshotPayload,
  toSnapshotRecord: toSnapshotRecord
};
|
||||
|
||||
Reference in New Issue
Block a user