Files
Neon-Desk/lib/server/repos/filing-taxonomy.ts
francy51 4313058d65
Some checks failed
PR Checks / typecheck-and-build (pull_request) Has been cancelled
Taxonomy Sidecar / taxonomy-sidecar (pull_request) Has been cancelled
Fix P0 issues in financial ingestion architecture
- Wrap snapshot updates in transactions with error context for each child table
- Add sidecar retry with exponential backoff (3 attempts, 2s base, 10s max, 30% jitter)
- Add HTTP timeout (30s per request) and SEC rate limiting (10 req/s) in Rust
- Add XBRL validation with status reporting (checks root element, tag balance)
2026-03-15 16:51:32 -04:00

1345 lines
43 KiB
TypeScript

import { and, desc, eq, gte, inArray, lt, sql } from 'drizzle-orm';
import type {
DetailFinancialRow,
Filing,
FinancialStatementKind,
MetricValidationResult,
NormalizationSummary,
StructuredKpiRow,
SurfaceDetailMap,
SurfaceFinancialRow,
TaxonomyDimensionMember,
TaxonomyFactRow,
TaxonomyStatementRow
} from '@/lib/types';
import { db, getSqliteClient } from '@/lib/server/db';
import { withFinancialIngestionSchemaRetry } from '@/lib/server/db/financial-ingestion-schema';
import {
filingTaxonomyAsset,
filingTaxonomyConcept,
filingTaxonomyContext,
filingTaxonomyFact,
filingTaxonomyMetricValidation,
filingTaxonomySnapshot
} from '@/lib/server/db/schema';
/** Parse outcome stored in a snapshot's `parse_status` field. */
export type FilingTaxonomyParseStatus = 'ready' | 'partial' | 'failed';
/** Allowed values for a snapshot's `source` field. */
export type FilingTaxonomySource = 'xbrl_instance' | 'xbrl_instance_with_linkbase' | 'legacy_html_fallback';
/** Allowed values for the `asset_type` field of a snapshot asset row. */
export type FilingTaxonomyAssetType =
| 'instance'
| 'schema'
| 'presentation'
| 'label'
| 'calculation'
| 'definition'
| 'pdf'
| 'other';
/**
 * One entry of a snapshot's `periods` list, in the camelCase shape that
 * `normalizePeriods` produces.
 */
export type FilingTaxonomyPeriod = {
id: string;
filingId: number;
accessionNumber: string;
filingDate: string;
periodStart: string | null;
periodEnd: string | null;
filingType: '10-K' | '10-Q';
periodLabel: string;
};
/**
 * Snapshot shape returned by this module's read and upsert functions
 * (built by `toSnapshotRecord`). The JSON payload fields (`periods`, the
 * `*_rows` maps, `kpi_rows`, `normalization_summary`) are in normalized form.
 */
export type FilingTaxonomySnapshotRecord = {
id: number;
filing_id: number;
ticker: string;
filing_date: string;
filing_type: '10-K' | '10-Q';
parse_status: FilingTaxonomyParseStatus;
parse_error: string | null;
source: FilingTaxonomySource;
parser_engine: string;
parser_version: string;
taxonomy_regime: 'us-gaap' | 'ifrs-full' | 'unknown';
fiscal_pack: string | null;
periods: FilingTaxonomyPeriod[];
faithful_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
statement_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
surface_rows: Record<FinancialStatementKind, SurfaceFinancialRow[]>;
detail_rows: Record<FinancialStatementKind, SurfaceDetailMap>;
kpi_rows: StructuredKpiRow[];
derived_metrics: Filing['metrics'];
validation_result: MetricValidationResult | null;
normalization_summary: NormalizationSummary | null;
facts_count: number;
concepts_count: number;
dimensions_count: number;
created_at: string;
updated_at: string;
};
/** Context row belonging to a snapshot, as returned by `toContextRecord`. */
export type FilingTaxonomyContextRecord = {
id: number;
snapshot_id: number;
context_id: string;
entity_identifier: string | null;
entity_scheme: string | null;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
segment_json: Record<string, unknown> | null;
scenario_json: Record<string, unknown> | null;
created_at: string;
};
/**
 * Asset row belonging to a snapshot, as returned by `toAssetRecord`.
 * `score` is exposed as a number here; the upsert writes it as numeric text.
 */
export type FilingTaxonomyAssetRecord = {
id: number;
snapshot_id: number;
asset_type: FilingTaxonomyAssetType;
name: string;
url: string;
size_bytes: number | null;
score: number | null;
is_selected: boolean;
created_at: string;
};
/**
 * Concept row belonging to a snapshot, as returned by `toConceptRecord`.
 * `presentation_order` is exposed as a number; the upsert writes it as
 * numeric text.
 */
export type FilingTaxonomyConceptRecord = {
id: number;
snapshot_id: number;
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
label: string | null;
is_extension: boolean;
balance: string | null;
period_type: string | null;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
presentation_order: number | null;
presentation_depth: number | null;
parent_concept_key: string | null;
is_abstract: boolean;
created_at: string;
};
/**
 * Fact row belonging to a snapshot, as returned by `toFactRecord`.
 * `value_num` is always a finite number: `toFactRecord` throws when the
 * stored value cannot be parsed as one.
 */
export type FilingTaxonomyFactRecord = {
id: number;
snapshot_id: number;
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
context_id: string;
unit: string | null;
decimals: string | null;
precision: string | null;
nil: boolean;
value_num: number;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
dimensions: TaxonomyDimensionMember[];
is_dimensionless: boolean;
source_file: string | null;
created_at: string;
};
/**
 * Metric validation row belonging to a snapshot, as returned by
 * `toMetricValidationRecord`. The numeric fields are exposed as numbers;
 * the upsert writes them as numeric text.
 */
export type FilingTaxonomyMetricValidationRecord = {
id: number;
snapshot_id: number;
metric_key: keyof NonNullable<Filing['metrics']>;
taxonomy_value: number | null;
llm_value: number | null;
absolute_diff: number | null;
relative_diff: number | null;
status: 'not_run' | 'matched' | 'mismatch' | 'error';
evidence_pages: number[];
pdf_url: string | null;
provider: string | null;
model: string | null;
error: string | null;
created_at: string;
updated_at: string;
};
/**
 * Input accepted by `upsertFilingTaxonomySnapshot`.
 *
 * The scalar and JSON fields describe the snapshot row itself (keyed by
 * `filing_id` on conflict). The `contexts`, `assets`, `concepts`, `facts` and
 * `metric_validations` arrays are the child rows: on every upsert the existing
 * children of the snapshot are deleted and these are inserted in their place.
 */
export type UpsertFilingTaxonomySnapshotInput = {
filing_id: number;
ticker: string;
filing_date: string;
filing_type: '10-K' | '10-Q';
parse_status: FilingTaxonomyParseStatus;
parse_error: string | null;
source: FilingTaxonomySource;
parser_engine: string;
parser_version: string;
taxonomy_regime: 'us-gaap' | 'ifrs-full' | 'unknown';
fiscal_pack: string | null;
// JSON payload fields; these are passed through
// `normalizeFilingTaxonomySnapshotPayload` before being written.
periods: FilingTaxonomyPeriod[];
faithful_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
statement_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
surface_rows: Record<FinancialStatementKind, SurfaceFinancialRow[]>;
detail_rows: Record<FinancialStatementKind, SurfaceDetailMap>;
kpi_rows: StructuredKpiRow[];
// Written as provided (not normalized by the upsert).
derived_metrics: Filing['metrics'];
validation_result: MetricValidationResult | null;
normalization_summary: NormalizationSummary | null;
facts_count: number;
concepts_count: number;
dimensions_count: number;
// Child rows for the context table.
contexts: Array<{
context_id: string;
entity_identifier: string | null;
entity_scheme: string | null;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
segment_json: Record<string, unknown> | null;
scenario_json: Record<string, unknown> | null;
}>;
// Child rows for the asset table; `score` is written as numeric text.
assets: Array<{
asset_type: FilingTaxonomyAssetType;
name: string;
url: string;
size_bytes: number | null;
score: number | null;
is_selected: boolean;
}>;
// Child rows for the concept table; `presentation_order` is written as numeric text.
concepts: Array<{
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
label: string | null;
is_extension: boolean;
balance: string | null;
period_type: string | null;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
presentation_order: number | null;
presentation_depth: number | null;
parent_concept_key: string | null;
is_abstract: boolean;
}>;
// Child rows for the fact table; `value_num` is written via `String(...)`.
facts: Array<{
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
context_id: string;
unit: string | null;
decimals: string | null;
precision: string | null;
nil: boolean;
value_num: number;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
dimensions: TaxonomyDimensionMember[];
is_dimensionless: boolean;
source_file: string | null;
}>;
// Child rows for the metric validation table; numeric fields are written as numeric text.
metric_validations: Array<{
metric_key: keyof NonNullable<Filing['metrics']>;
taxonomy_value: number | null;
llm_value: number | null;
absolute_diff: number | null;
relative_diff: number | null;
status: 'not_run' | 'matched' | 'mismatch' | 'error';
evidence_pages: number[];
pdf_url: string | null;
provider: string | null;
model: string | null;
error: string | null;
}>;
};
/**
 * Statement kinds iterated by the row normalizers in this module; each
 * normalized row map gets exactly these keys.
 */
const FINANCIAL_STATEMENT_KINDS = [
'income',
'balance',
'cash_flow',
'equity',
'comprehensive_income'
] as const satisfies FinancialStatementKind[];
/** Per-statement list of taxonomy statement rows. */
type StatementRowMap = Record<FinancialStatementKind, TaxonomyStatementRow[]>;
/** Per-statement list of surface rows. */
type SurfaceRowMap = Record<FinancialStatementKind, SurfaceFinancialRow[]>;
/** Per-statement map of detail rows (see `SurfaceDetailMap`). */
type DetailRowMap = Record<FinancialStatementKind, SurfaceDetailMap>;
/**
 * Returns the UTC calendar date exactly ten years before now as a
 * `YYYY-MM-DD` string.
 */
function tenYearsAgoIso() {
  const cutoff = new Date();
  const cutoffYear = cutoff.getUTCFullYear() - 10;
  cutoff.setUTCFullYear(cutoffYear);

  // The ISO timestamp starts with the 10-character date portion.
  return cutoff.toISOString().substring(0, 10);
}
/**
 * Coerces a loosely-typed value into a finite number.
 *
 * @param value - A number, numeric text, or anything else.
 * @returns The finite number, or `null` when `value` is not a number or a
 *   string, is NaN or infinite, is blank text, or does not parse as a number.
 */
function asNumber(value: unknown) {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }

  if (typeof value === 'string') {
    // `Number('')` and `Number('   ')` evaluate to 0, which would turn a
    // missing value into a fabricated zero; treat blank text as "no number".
    if (value.trim() === '') {
      return null;
    }

    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }

  return null;
}
/**
 * Serializes a number to text for storage.
 *
 * @param value - The number to serialize, or `null`.
 * @returns `String(value)` for finite numbers; `null` for `null`, NaN and
 *   infinities.
 */
function asNumericText(value: number | null) {
  const isUsable = value !== null && Number.isFinite(value);
  return isUsable ? String(value) : null;
}
/**
 * Narrows a value to a plain string-keyed record.
 *
 * @returns The same reference typed as `Record<string, unknown>` when `value`
 *   is a non-null, non-array object; otherwise `null`.
 */
function asObject(value: unknown) {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return null;
  }

  return value as Record<string, unknown>;
}
/** Returns `value` when it is a string (including the empty string), otherwise `null`. */
function asString(value: unknown) {
  if (typeof value !== 'string') {
    return null;
  }

  return value;
}
/**
 * Returns `value` when it is a string, otherwise `null`.
 *
 * Every non-string input (including `null` and `undefined`) maps to `null`,
 * so a single conditional is sufficient.
 */
function asNullableString(value: unknown) {
  return typeof value === 'string' ? value : null;
}
/**
 * Converts a value to a boolean using JavaScript truthiness.
 *
 * Booleans pass through unchanged; everything else is coerced, so any
 * non-empty string (including `'false'`) yields `true`.
 */
function asBoolean(value: unknown) {
  if (typeof value === 'boolean') {
    return value;
  }

  return Boolean(value);
}
/**
 * Validates that a value is one of the known statement kinds.
 *
 * @returns The value itself when it is a recognized kind, otherwise `null`.
 */
function asStatementKind(value: unknown): FinancialStatementKind | null {
  switch (value) {
    case 'income':
    case 'balance':
    case 'cash_flow':
    case 'equity':
    case 'comprehensive_income':
      return value;
    default:
      return null;
  }
}
/**
 * Rebuilds a keyed map so every value is a finite number or `null`.
 *
 * @param value - Expected to be a plain object; anything else yields `{}`.
 * @returns A new object with the same keys, each value passed through `asNumber`.
 */
function normalizeNumberMap(value: unknown) {
  const source = asObject(value) ?? {};

  return Object.fromEntries(
    Object.entries(source).map(([name, raw]) => [name, asNumber(raw)])
  );
}
/**
 * Rebuilds a keyed map so every value is a string or `null`.
 *
 * @param value - Expected to be a plain object; anything else yields `{}`.
 * @returns A new object with the same keys, each value passed through
 *   `asNullableString`.
 */
function normalizeNullableStringMap(value: unknown) {
  const source = asObject(value) ?? {};

  return Object.fromEntries(
    Object.entries(source).map(([name, raw]) => [name, asNullableString(raw)])
  );
}
/**
 * Keeps only the string entries of an array, preserving their order.
 *
 * @returns A new array of strings; empty when `value` is not an array.
 */
function normalizeStringArray(value: unknown) {
  const strings: string[] = [];

  if (Array.isArray(value)) {
    for (const item of value) {
      if (typeof item === 'string') {
        strings.push(item);
      }
    }
  }

  return strings;
}
/**
 * Converts array entries with `asNumber` and drops those that do not yield
 * a number, preserving order.
 *
 * @returns A new array of numbers; empty when `value` is not an array.
 */
function normalizeNumberArray(value: unknown) {
  const numbers: number[] = [];

  if (Array.isArray(value)) {
    for (const item of value) {
      const parsed = asNumber(item);
      if (parsed !== null) {
        numbers.push(parsed);
      }
    }
  }

  return numbers;
}
/**
 * Normalizes a stored or incoming `periods` payload.
 *
 * Each entry may use camelCase or snake_case keys (camelCase wins when both
 * are present and non-nullish). Entries that are not objects, or that lack any
 * of id, filing id, accession number, filing date, a '10-K'/'10-Q' filing type
 * or a period label, are dropped.
 *
 * @returns The valid periods in their original order; empty when `value` is
 *   not an array.
 */
function normalizePeriods(value: unknown): FilingTaxonomyPeriod[] {
  const periods: FilingTaxonomyPeriod[] = [];
  if (!Array.isArray(value)) {
    return periods;
  }

  for (const candidate of value) {
    const fields = asObject(candidate);
    if (fields === null) {
      continue;
    }

    const id = asString(fields.id);
    const filingId = asNumber(fields.filingId ?? fields.filing_id);
    const accessionNumber = asString(fields.accessionNumber ?? fields.accession_number);
    const filingDate = asString(fields.filingDate ?? fields.filing_date);

    // '10-K' takes precedence if the two spellings of the key disagree.
    let filingType: FilingTaxonomyPeriod['filingType'] | null = null;
    if (fields.filingType === '10-K' || fields.filing_type === '10-K') {
      filingType = '10-K';
    } else if (fields.filingType === '10-Q' || fields.filing_type === '10-Q') {
      filingType = '10-Q';
    }

    const periodLabel = asString(fields.periodLabel ?? fields.period_label);

    // Empty strings count as missing for the required text fields.
    if (!id || filingId === null || !accessionNumber || !filingDate || !filingType || !periodLabel) {
      continue;
    }

    periods.push({
      id,
      filingId,
      accessionNumber,
      filingDate,
      periodStart: asNullableString(fields.periodStart ?? fields.period_start),
      periodEnd: asNullableString(fields.periodEnd ?? fields.period_end),
      filingType,
      periodLabel
    });
  }

  return periods;
}
/**
 * Normalizes a per-statement map of taxonomy statement rows.
 *
 * Rows may use camelCase or snake_case keys. A row is dropped when it is not
 * an object or lacks key, label, concept key, qname, namespace URI or local
 * name (the key falls back to the concept key).
 *
 * @param value - Expected to be an object keyed by statement kind.
 * @param fallbackRows - Returned as-is when `value` is not an object.
 * @returns A fresh map containing every known statement kind.
 */
function normalizeStatementRows(
  value: unknown,
  fallbackRows: StatementRowMap = emptyStatementRows()
): StatementRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  // Converts one raw entry; `statement` is the bucket the entry was found in
  // and is used when the entry does not carry a valid statement of its own.
  const toRow = (entry: unknown, statement: FinancialStatementKind) => {
    const raw = asObject(entry);
    if (raw === null) {
      return null;
    }

    const conceptKey = asString(raw.conceptKey ?? raw.concept_key);
    const key = asString(raw.key) ?? conceptKey;
    const label = asString(raw.label);
    const qname = asString(raw.qname);
    const namespaceUri = asString(raw.namespaceUri ?? raw.namespace_uri);
    const localName = asString(raw.localName ?? raw.local_name);

    if (!key || !label || !conceptKey || !qname || !namespaceUri || !localName) {
      return null;
    }

    return {
      key,
      label,
      conceptKey,
      qname,
      namespaceUri,
      localName,
      isExtension: asBoolean(raw.isExtension ?? raw.is_extension),
      statement: asStatementKind(raw.statement) ?? statement,
      roleUri: asNullableString(raw.roleUri ?? raw.role_uri),
      order: asNumber(raw.order) ?? Number.MAX_SAFE_INTEGER,
      depth: asNumber(raw.depth) ?? 0,
      parentKey: asNullableString(raw.parentKey ?? raw.parent_key),
      values: normalizeNumberMap(raw.values),
      units: normalizeNullableStringMap(raw.units),
      hasDimensions: asBoolean(raw.hasDimensions ?? raw.has_dimensions),
      sourceFactIds: normalizeNumberArray(raw.sourceFactIds ?? raw.source_fact_ids)
    };
  };

  const result = emptyStatementRows();
  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const bucket = source[statement];
    const entries = Array.isArray(bucket) ? bucket : [];
    result[statement] = entries
      .map((entry) => toRow(entry, statement))
      .filter((entry): entry is TaxonomyStatementRow => entry !== null);
  }

  return result;
}
/**
 * Normalizes a per-statement map of surface financial rows.
 *
 * Rows may use camelCase or snake_case keys. A row is dropped when it is not
 * an object or lacks a key, label, category or unit. Optional fields
 * (`templateSection`, `statement`, `detailCount`, `resolutionMethod`,
 * `confidence`, `warningCodes`) are only set when the raw value is valid.
 *
 * @param value - Expected to be an object keyed by statement kind.
 * @param fallbackRows - Returned as-is when `value` is not an object.
 * @returns A fresh map containing every known statement kind.
 */
function normalizeSurfaceRows(
value: unknown,
fallbackRows: SurfaceRowMap = emptySurfaceRows()
): SurfaceRowMap {
const object = asObject(value);
if (!object) {
return fallbackRows;
}
const normalized = emptySurfaceRows();
for (const statement of FINANCIAL_STATEMENT_KINDS) {
// A missing or non-array bucket is treated as having no rows.
const rows = Array.isArray(object[statement]) ? object[statement] : [];
normalized[statement] = rows
.map((entry) => {
const row = asObject(entry);
if (!row) {
return null;
}
const key = asString(row.key);
const label = asString(row.label);
const category = asString(row.category);
const unit = asString(row.unit);
// Empty strings count as missing here.
if (!key || !label || !category || !unit) {
return null;
}
const normalizedStatement = asStatementKind(row.statement);
const resolutionMethod = row.resolutionMethod ?? row.resolution_method;
const confidence = row.confidence;
const normalizedRow: SurfaceFinancialRow = {
key,
label,
// `category` and `unit` are only checked to be non-empty strings; the casts
// do not validate them against the allowed values.
category: category as SurfaceFinancialRow['category'],
order: asNumber(row.order) ?? Number.MAX_SAFE_INTEGER,
unit: unit as SurfaceFinancialRow['unit'],
values: normalizeNumberMap(row.values),
sourceConcepts: normalizeStringArray(row.sourceConcepts ?? row.source_concepts),
sourceRowKeys: normalizeStringArray(row.sourceRowKeys ?? row.source_row_keys),
sourceFactIds: normalizeNumberArray(row.sourceFactIds ?? row.source_fact_ids),
formulaKey: asNullableString(row.formulaKey ?? row.formula_key),
hasDimensions: asBoolean(row.hasDimensions ?? row.has_dimensions),
resolvedSourceRowKeys: normalizeNullableStringMap(row.resolvedSourceRowKeys ?? row.resolved_source_row_keys)
};
const templateSection = asString(row.templateSection ?? row.template_section);
if (templateSection) {
normalizedRow.templateSection = templateSection as SurfaceFinancialRow['templateSection'];
}
// Only these three statement kinds are copied onto the row; 'equity' and
// 'comprehensive_income' leave `statement` unset.
if (normalizedStatement === 'income' || normalizedStatement === 'balance' || normalizedStatement === 'cash_flow') {
normalizedRow.statement = normalizedStatement;
}
const detailCount = asNumber(row.detailCount ?? row.detail_count);
if (detailCount !== null) {
normalizedRow.detailCount = detailCount;
}
if (
resolutionMethod === 'direct'
|| resolutionMethod === 'surface_bridge'
|| resolutionMethod === 'formula_derived'
|| resolutionMethod === 'not_meaningful'
) {
normalizedRow.resolutionMethod = resolutionMethod;
}
if (confidence === 'high' || confidence === 'medium' || confidence === 'low') {
normalizedRow.confidence = confidence;
}
// `warningCodes` is omitted entirely when there are none.
const warningCodes = normalizeStringArray(row.warningCodes ?? row.warning_codes);
if (warningCodes.length > 0) {
normalizedRow.warningCodes = warningCodes;
}
return normalizedRow;
})
.filter((entry): entry is SurfaceFinancialRow => entry !== null);
}
return normalized;
}
/**
 * Normalizes a per-statement map of detail rows grouped by surface key.
 *
 * Rows may use camelCase or snake_case keys. A row is dropped when it is not
 * an object or lacks key, label, concept key, qname, namespace URI or local
 * name (the key falls back to the concept key). A group whose value is not an
 * array is kept with an empty row list.
 *
 * @param value - Expected to be an object keyed by statement kind.
 * @param fallbackRows - Returned as-is when `value` is not an object.
 * @returns A fresh map containing every known statement kind.
 */
function normalizeDetailRows(
  value: unknown,
  fallbackRows: DetailRowMap = emptyDetailRows()
): DetailRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  // Converts one raw entry; `surfaceKey` is the group the entry was found in
  // and is used when the entry does not name its own parent surface key.
  const toDetailRow = (entry: unknown, surfaceKey: string) => {
    const raw = asObject(entry);
    if (raw === null) {
      return null;
    }

    const conceptKey = asString(raw.conceptKey ?? raw.concept_key);
    const key = asString(raw.key) ?? conceptKey;
    const label = asString(raw.label);
    const qname = asString(raw.qname);
    const namespaceUri = asString(raw.namespaceUri ?? raw.namespace_uri);
    const localName = asString(raw.localName ?? raw.local_name);

    if (!key || !label || !conceptKey || !qname || !namespaceUri || !localName) {
      return null;
    }

    return {
      key,
      parentSurfaceKey: asString(raw.parentSurfaceKey ?? raw.parent_surface_key) ?? surfaceKey,
      label,
      conceptKey,
      qname,
      namespaceUri,
      localName,
      unit: asNullableString(raw.unit),
      values: normalizeNumberMap(raw.values),
      sourceFactIds: normalizeNumberArray(raw.sourceFactIds ?? raw.source_fact_ids),
      isExtension: asBoolean(raw.isExtension ?? raw.is_extension),
      dimensionsSummary: normalizeStringArray(raw.dimensionsSummary ?? raw.dimensions_summary),
      residualFlag: asBoolean(raw.residualFlag ?? raw.residual_flag)
    };
  };

  const toDetailRows = (entries: unknown, surfaceKey: string) => {
    if (!Array.isArray(entries)) {
      return [];
    }

    return entries
      .map((entry) => toDetailRow(entry, surfaceKey))
      .filter((entry): entry is DetailFinancialRow => entry !== null);
  };

  const result = emptyDetailRows();
  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const groups = asObject(source[statement]) ?? {};
    result[statement] = Object.fromEntries(
      Object.entries(groups).map(([surfaceKey, entries]) => [surfaceKey, toDetailRows(entries, surfaceKey)])
    );
  }

  return result;
}
/**
 * Normalizes a stored or incoming `kpi_rows` payload.
 *
 * Rows may use camelCase or snake_case keys. A row is dropped when it is not
 * an object, lacks a key, label, category or unit, or has a provenance type
 * other than 'taxonomy' / 'structured_note'.
 *
 * @returns The valid rows in their original order; empty when `value` is not
 *   an array.
 */
function normalizeKpiRows(value: unknown) {
  const kpiRows: StructuredKpiRow[] = [];
  if (!Array.isArray(value)) {
    return kpiRows;
  }

  for (const candidate of value) {
    const raw = asObject(candidate);
    if (raw === null) {
      continue;
    }

    const key = asString(raw.key);
    const label = asString(raw.label);
    const category = asString(raw.category);
    const unit = asString(raw.unit);
    const provenanceType = raw.provenanceType ?? raw.provenance_type;

    if (!key || !label || !category || !unit) {
      continue;
    }
    if (provenanceType !== 'taxonomy' && provenanceType !== 'structured_note') {
      continue;
    }

    kpiRows.push({
      key,
      label,
      // `category` and `unit` are only checked to be non-empty strings.
      category: category as StructuredKpiRow['category'],
      unit: unit as StructuredKpiRow['unit'],
      order: asNumber(raw.order) ?? Number.MAX_SAFE_INTEGER,
      segment: asNullableString(raw.segment),
      axis: asNullableString(raw.axis),
      member: asNullableString(raw.member),
      values: normalizeNumberMap(raw.values),
      sourceConcepts: normalizeStringArray(raw.sourceConcepts ?? raw.source_concepts),
      sourceFactIds: normalizeNumberArray(raw.sourceFactIds ?? raw.source_fact_ids),
      provenanceType,
      hasDimensions: asBoolean(raw.hasDimensions ?? raw.has_dimensions)
    } satisfies StructuredKpiRow);
  }

  return kpiRows;
}
/**
 * Normalizes a stored or incoming `normalization_summary` payload.
 *
 * Accepts camelCase or snake_case keys. Counts that are missing or not
 * numeric default to 0; `warnings` keeps only string entries.
 *
 * @returns The normalized summary, or `null` when `value` is not an object.
 */
function normalizeNormalizationSummary(value: unknown) {
  const raw = asObject(value);
  if (raw === null) {
    return null;
  }

  const surfaceRowCount = asNumber(raw.surfaceRowCount ?? raw.surface_row_count) ?? 0;
  const detailRowCount = asNumber(raw.detailRowCount ?? raw.detail_row_count) ?? 0;
  const kpiRowCount = asNumber(raw.kpiRowCount ?? raw.kpi_row_count) ?? 0;
  const unmappedRowCount = asNumber(raw.unmappedRowCount ?? raw.unmapped_row_count) ?? 0;
  const materialUnmappedRowCount = asNumber(raw.materialUnmappedRowCount ?? raw.material_unmapped_row_count) ?? 0;
  const warnings = normalizeStringArray(raw.warnings);

  return {
    surfaceRowCount,
    detailRowCount,
    kpiRowCount,
    unmappedRowCount,
    materialUnmappedRowCount,
    warnings
  } satisfies NormalizationSummary;
}
/**
 * Normalizes the JSON payload fields of a taxonomy snapshot.
 *
 * Each field is validated independently by its own normalizer. When
 * `statement_rows` is not an object, the normalized `faithful_rows` are used
 * in its place.
 *
 * @param input - The raw payload fields, typically read from storage or
 *   supplied to the upsert.
 * @returns The normalized payload with the same field names.
 */
export function normalizeFilingTaxonomySnapshotPayload(input: {
  periods: unknown;
  faithful_rows: unknown;
  statement_rows: unknown;
  surface_rows: unknown;
  detail_rows: unknown;
  kpi_rows: unknown;
  normalization_summary: unknown;
}) {
  const faithful_rows = normalizeStatementRows(input.faithful_rows);
  // Faithful rows double as the fallback for missing statement rows.
  const statement_rows = normalizeStatementRows(input.statement_rows, faithful_rows);
  const periods = normalizePeriods(input.periods);
  const surface_rows = normalizeSurfaceRows(input.surface_rows);
  const detail_rows = normalizeDetailRows(input.detail_rows);
  const kpi_rows = normalizeKpiRows(input.kpi_rows);
  const normalization_summary = normalizeNormalizationSummary(input.normalization_summary);

  return {
    periods,
    faithful_rows,
    statement_rows,
    surface_rows,
    detail_rows,
    kpi_rows,
    normalization_summary
  };
}
/** Creates a fresh statement-row map with an empty list for every statement kind. */
function emptyStatementRows(): StatementRowMap {
  const rows: StatementRowMap = {
    income: [],
    balance: [],
    cash_flow: [],
    equity: [],
    comprehensive_income: []
  };

  return rows;
}
/** Creates a fresh surface-row map with an empty list for every statement kind. */
function emptySurfaceRows(): SurfaceRowMap {
  const rows: SurfaceRowMap = {
    income: [],
    balance: [],
    cash_flow: [],
    equity: [],
    comprehensive_income: []
  };

  return rows;
}
/** Creates a fresh detail-row map with an empty group object for every statement kind. */
function emptyDetailRows(): DetailRowMap {
  const rows: DetailRowMap = {
    income: {},
    balance: {},
    cash_flow: {},
    equity: {},
    comprehensive_income: {}
  };

  return rows;
}
/**
 * Maps a snapshot table row to the record shape exposed by this module.
 *
 * The JSON payload columns are re-normalized on every read; `derived_metrics`
 * and `validation_result` are passed through with missing values mapped to
 * `null`.
 */
function toSnapshotRecord(row: typeof filingTaxonomySnapshot.$inferSelect): FilingTaxonomySnapshotRecord {
  const {
    periods,
    faithful_rows,
    statement_rows,
    surface_rows,
    detail_rows,
    kpi_rows,
    normalization_summary
  } = normalizeFilingTaxonomySnapshotPayload({
    periods: row.periods,
    faithful_rows: row.faithful_rows,
    statement_rows: row.statement_rows,
    surface_rows: row.surface_rows,
    detail_rows: row.detail_rows,
    kpi_rows: row.kpi_rows,
    normalization_summary: row.normalization_summary
  });

  const record: FilingTaxonomySnapshotRecord = {
    id: row.id,
    filing_id: row.filing_id,
    ticker: row.ticker,
    filing_date: row.filing_date,
    filing_type: row.filing_type,
    parse_status: row.parse_status,
    parse_error: row.parse_error,
    source: row.source,
    parser_engine: row.parser_engine,
    parser_version: row.parser_version,
    taxonomy_regime: row.taxonomy_regime,
    fiscal_pack: row.fiscal_pack,
    periods,
    faithful_rows,
    statement_rows,
    surface_rows,
    detail_rows,
    kpi_rows,
    derived_metrics: row.derived_metrics ?? null,
    validation_result: row.validation_result ?? null,
    normalization_summary,
    facts_count: row.facts_count,
    concepts_count: row.concepts_count,
    dimensions_count: row.dimensions_count,
    created_at: row.created_at,
    updated_at: row.updated_at
  };

  return record;
}
/**
 * Maps a context table row to the record shape exposed by this module;
 * missing segment/scenario JSON becomes `null`.
 */
function toContextRecord(row: typeof filingTaxonomyContext.$inferSelect): FilingTaxonomyContextRecord {
  const segmentJson = row.segment_json ?? null;
  const scenarioJson = row.scenario_json ?? null;

  const record: FilingTaxonomyContextRecord = {
    id: row.id,
    snapshot_id: row.snapshot_id,
    context_id: row.context_id,
    entity_identifier: row.entity_identifier,
    entity_scheme: row.entity_scheme,
    period_start: row.period_start,
    period_end: row.period_end,
    period_instant: row.period_instant,
    segment_json: segmentJson,
    scenario_json: scenarioJson,
    created_at: row.created_at
  };

  return record;
}
/**
 * Maps an asset table row to the record shape exposed by this module,
 * converting the stored `score` to a number (or `null`).
 */
function toAssetRecord(row: typeof filingTaxonomyAsset.$inferSelect): FilingTaxonomyAssetRecord {
  const score = asNumber(row.score);

  const record: FilingTaxonomyAssetRecord = {
    id: row.id,
    snapshot_id: row.snapshot_id,
    asset_type: row.asset_type,
    name: row.name,
    url: row.url,
    size_bytes: row.size_bytes,
    score,
    is_selected: row.is_selected,
    created_at: row.created_at
  };

  return record;
}
/**
 * Maps a concept table row to the record shape exposed by this module,
 * converting the stored `presentation_order` to a number (or `null`).
 */
function toConceptRecord(row: typeof filingTaxonomyConcept.$inferSelect): FilingTaxonomyConceptRecord {
  const statementKind = row.statement_kind ?? null;
  const presentationOrder = asNumber(row.presentation_order);

  const record: FilingTaxonomyConceptRecord = {
    id: row.id,
    snapshot_id: row.snapshot_id,
    concept_key: row.concept_key,
    qname: row.qname,
    namespace_uri: row.namespace_uri,
    local_name: row.local_name,
    label: row.label,
    is_extension: row.is_extension,
    balance: row.balance,
    period_type: row.period_type,
    data_type: row.data_type,
    statement_kind: statementKind,
    role_uri: row.role_uri,
    authoritative_concept_key: row.authoritative_concept_key,
    mapping_method: row.mapping_method,
    surface_key: row.surface_key,
    detail_parent_surface_key: row.detail_parent_surface_key,
    kpi_key: row.kpi_key,
    residual_flag: row.residual_flag,
    presentation_order: presentationOrder,
    presentation_depth: row.presentation_depth,
    parent_concept_key: row.parent_concept_key,
    is_abstract: row.is_abstract,
    created_at: row.created_at
  };

  return record;
}
/**
 * Maps a fact table row to the record shape exposed by this module.
 *
 * @throws Error when the stored `value_num` cannot be converted to a finite
 *   number.
 */
function toFactRecord(row: typeof filingTaxonomyFact.$inferSelect): FilingTaxonomyFactRecord {
  const numericValue = asNumber(row.value_num);
  if (numericValue === null) {
    throw new Error(`Invalid value_num for taxonomy fact row ${row.id}`);
  }

  const statementKind = row.statement_kind ?? null;

  const record: FilingTaxonomyFactRecord = {
    id: row.id,
    snapshot_id: row.snapshot_id,
    concept_key: row.concept_key,
    qname: row.qname,
    namespace_uri: row.namespace_uri,
    local_name: row.local_name,
    data_type: row.data_type,
    statement_kind: statementKind,
    role_uri: row.role_uri,
    authoritative_concept_key: row.authoritative_concept_key,
    mapping_method: row.mapping_method,
    surface_key: row.surface_key,
    detail_parent_surface_key: row.detail_parent_surface_key,
    kpi_key: row.kpi_key,
    residual_flag: row.residual_flag,
    context_id: row.context_id,
    unit: row.unit,
    decimals: row.decimals,
    precision: row.precision,
    nil: row.nil,
    value_num: numericValue,
    period_start: row.period_start,
    period_end: row.period_end,
    period_instant: row.period_instant,
    dimensions: row.dimensions,
    is_dimensionless: row.is_dimensionless,
    source_file: row.source_file,
    created_at: row.created_at
  };

  return record;
}
/**
 * Maps a metric validation table row to the record shape exposed by this
 * module. Stored numeric values are converted to numbers (or `null`) and a
 * missing `evidence_pages` becomes an empty list.
 */
function toMetricValidationRecord(row: typeof filingTaxonomyMetricValidation.$inferSelect): FilingTaxonomyMetricValidationRecord {
  const taxonomyValue = asNumber(row.taxonomy_value);
  const llmValue = asNumber(row.llm_value);
  const absoluteDiff = asNumber(row.absolute_diff);
  const relativeDiff = asNumber(row.relative_diff);

  const record: FilingTaxonomyMetricValidationRecord = {
    id: row.id,
    snapshot_id: row.snapshot_id,
    metric_key: row.metric_key,
    taxonomy_value: taxonomyValue,
    llm_value: llmValue,
    absolute_diff: absoluteDiff,
    relative_diff: relativeDiff,
    status: row.status,
    evidence_pages: row.evidence_pages ?? [],
    pdf_url: row.pdf_url,
    provider: row.provider,
    model: row.model,
    error: row.error,
    created_at: row.created_at,
    updated_at: row.updated_at
  };

  return record;
}
/**
 * Loads the taxonomy snapshot stored for a filing.
 *
 * @param filingId - Value matched against the snapshot's `filing_id`.
 * @returns The snapshot record, or `null` when none exists.
 */
export async function getFilingTaxonomySnapshotByFilingId(filingId: number) {
  const matches = await db
    .select()
    .from(filingTaxonomySnapshot)
    .where(eq(filingTaxonomySnapshot.filing_id, filingId))
    .limit(1);

  const first = matches[0];
  if (!first) {
    return null;
  }

  return toSnapshotRecord(first);
}
/**
 * Lists the asset rows of a snapshot, newest row id first.
 *
 * @param snapshotId - Value matched against `snapshot_id`.
 */
export async function listFilingTaxonomyAssets(snapshotId: number) {
  const belongsToSnapshot = eq(filingTaxonomyAsset.snapshot_id, snapshotId);

  const assetRows = await db
    .select()
    .from(filingTaxonomyAsset)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyAsset.id));

  return assetRows.map((assetRow) => toAssetRecord(assetRow));
}
/**
 * Lists the context rows of a snapshot, newest row id first.
 *
 * @param snapshotId - Value matched against `snapshot_id`.
 */
export async function listFilingTaxonomyContexts(snapshotId: number) {
  const belongsToSnapshot = eq(filingTaxonomyContext.snapshot_id, snapshotId);

  const contextRows = await db
    .select()
    .from(filingTaxonomyContext)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyContext.id));

  return contextRows.map((contextRow) => toContextRecord(contextRow));
}
/**
 * Lists the concept rows of a snapshot, newest row id first.
 *
 * @param snapshotId - Value matched against `snapshot_id`.
 */
export async function listFilingTaxonomyConcepts(snapshotId: number) {
  const belongsToSnapshot = eq(filingTaxonomyConcept.snapshot_id, snapshotId);

  const conceptRows = await db
    .select()
    .from(filingTaxonomyConcept)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyConcept.id));

  return conceptRows.map((conceptRow) => toConceptRecord(conceptRow));
}
/**
 * Lists the fact rows of a snapshot, newest row id first.
 *
 * @param snapshotId - Value matched against `snapshot_id`.
 * @throws Error (from `toFactRecord`) when any stored fact has a `value_num`
 *   that is not a finite number.
 */
export async function listFilingTaxonomyFacts(snapshotId: number) {
  const belongsToSnapshot = eq(filingTaxonomyFact.snapshot_id, snapshotId);

  const factRows = await db
    .select()
    .from(filingTaxonomyFact)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyFact.id));

  return factRows.map((factRow) => toFactRecord(factRow));
}
/**
 * Lists the metric validation rows of a snapshot, newest row id first.
 *
 * @param snapshotId - Value matched against `snapshot_id`.
 */
export async function listFilingTaxonomyMetricValidations(snapshotId: number) {
  const belongsToSnapshot = eq(filingTaxonomyMetricValidation.snapshot_id, snapshotId);

  const validationRows = await db
    .select()
    .from(filingTaxonomyMetricValidation)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyMetricValidation.id));

  return validationRows.map((validationRow) => toMetricValidationRecord(validationRow));
}
/**
 * Creates or replaces the taxonomy snapshot for a filing together with all of
 * its child rows.
 *
 * The snapshot row is inserted, or updated in place when a row with the same
 * `filing_id` already exists (its id and `created_at` are kept). All existing
 * context, asset, concept, fact and metric validation rows of the snapshot are
 * then deleted and replaced by the rows in `input`.
 *
 * Child rows are written in batches so that a filing with thousands of facts
 * does not exceed SQLite's limit on bound parameters per statement.
 *
 * NOTE(review): the work runs inside `db.transaction` with an async callback;
 * this assumes the configured Drizzle driver supports async transaction
 * callbacks — confirm, otherwise the child writes may not be atomic.
 *
 * @param input - Snapshot fields plus the complete set of child rows.
 * @returns The saved snapshot in record form.
 * @throws Error when the snapshot upsert returns no row, or when deleting or
 *   inserting child rows fails (the message names the table and snapshot id).
 */
export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySnapshotInput) {
  const now = new Date().toISOString();
  const normalized = normalizeFilingTaxonomySnapshotPayload(input);

  // Conservative per-statement budget of bound parameters; chosen to stay
  // below the 999 default of older SQLite builds.
  const maxBoundParametersPerInsert = 900;

  // Columns written both on insert and on conflict-update. Sharing one object
  // keeps the two lists from drifting apart.
  const snapshotFields = {
    ticker: input.ticker,
    filing_date: input.filing_date,
    filing_type: input.filing_type,
    parse_status: input.parse_status,
    parse_error: input.parse_error,
    source: input.source,
    parser_engine: input.parser_engine,
    parser_version: input.parser_version,
    taxonomy_regime: input.taxonomy_regime,
    fiscal_pack: input.fiscal_pack,
    periods: normalized.periods,
    faithful_rows: normalized.faithful_rows,
    statement_rows: normalized.statement_rows,
    surface_rows: normalized.surface_rows,
    detail_rows: normalized.detail_rows,
    kpi_rows: normalized.kpi_rows,
    derived_metrics: input.derived_metrics,
    validation_result: input.validation_result,
    normalization_summary: normalized.normalization_summary,
    facts_count: input.facts_count,
    concepts_count: input.concepts_count,
    dimensions_count: input.dimensions_count,
    updated_at: now
  };

  // Inserts `rows` in batches sized from the number of columns per row, and
  // wraps any failure with the table label and snapshot id.
  async function insertInBatches<Row extends Record<string, unknown>>(
    rows: Row[],
    label: string,
    snapshotId: number,
    insertBatch: (batch: Row[]) => PromiseLike<unknown>
  ) {
    const firstRow = rows[0];
    if (!firstRow) {
      return;
    }

    const columnsPerRow = Math.max(Object.keys(firstRow).length, 1);
    const batchSize = Math.max(Math.floor(maxBoundParametersPerInsert / columnsPerRow), 1);

    try {
      for (let offset = 0; offset < rows.length; offset += batchSize) {
        await insertBatch(rows.slice(offset, offset + batchSize));
      }
    } catch (error) {
      throw new Error(`Failed to insert ${rows.length} ${label} for snapshot ${snapshotId}: ${error}`);
    }
  }

  return db.transaction(async (tx) => {
    const [saved] = await tx
      .insert(filingTaxonomySnapshot)
      .values({
        filing_id: input.filing_id,
        ...snapshotFields,
        created_at: now
      })
      .onConflictDoUpdate({
        target: filingTaxonomySnapshot.filing_id,
        set: snapshotFields
      })
      .returning();

    if (!saved) {
      throw new Error(`Failed to upsert taxonomy snapshot for filing ${input.filing_id}: no row returned`);
    }

    const snapshotId = saved.id;

    // Replace-all semantics: clear every child table before re-inserting.
    try {
      await tx.delete(filingTaxonomyAsset).where(eq(filingTaxonomyAsset.snapshot_id, snapshotId));
      await tx.delete(filingTaxonomyContext).where(eq(filingTaxonomyContext.snapshot_id, snapshotId));
      await tx.delete(filingTaxonomyConcept).where(eq(filingTaxonomyConcept.snapshot_id, snapshotId));
      await tx.delete(filingTaxonomyFact).where(eq(filingTaxonomyFact.snapshot_id, snapshotId));
      await tx.delete(filingTaxonomyMetricValidation).where(eq(filingTaxonomyMetricValidation.snapshot_id, snapshotId));
    } catch (error) {
      throw new Error(`Failed to delete child records for snapshot ${snapshotId}: ${error}`);
    }

    await insertInBatches(
      input.contexts.map((context) => ({
        snapshot_id: snapshotId,
        context_id: context.context_id,
        entity_identifier: context.entity_identifier,
        entity_scheme: context.entity_scheme,
        period_start: context.period_start,
        period_end: context.period_end,
        period_instant: context.period_instant,
        segment_json: context.segment_json,
        scenario_json: context.scenario_json,
        created_at: now
      })),
      'contexts',
      snapshotId,
      (batch) => tx.insert(filingTaxonomyContext).values(batch)
    );

    await insertInBatches(
      input.assets.map((asset) => ({
        snapshot_id: snapshotId,
        asset_type: asset.asset_type,
        name: asset.name,
        url: asset.url,
        size_bytes: asset.size_bytes,
        score: asNumericText(asset.score),
        is_selected: asset.is_selected,
        created_at: now
      })),
      'assets',
      snapshotId,
      (batch) => tx.insert(filingTaxonomyAsset).values(batch)
    );

    await insertInBatches(
      input.concepts.map((concept) => ({
        snapshot_id: snapshotId,
        concept_key: concept.concept_key,
        qname: concept.qname,
        namespace_uri: concept.namespace_uri,
        local_name: concept.local_name,
        label: concept.label,
        is_extension: concept.is_extension,
        balance: concept.balance,
        period_type: concept.period_type,
        data_type: concept.data_type,
        statement_kind: concept.statement_kind,
        role_uri: concept.role_uri,
        authoritative_concept_key: concept.authoritative_concept_key,
        mapping_method: concept.mapping_method,
        surface_key: concept.surface_key,
        detail_parent_surface_key: concept.detail_parent_surface_key,
        kpi_key: concept.kpi_key,
        residual_flag: concept.residual_flag,
        presentation_order: asNumericText(concept.presentation_order),
        presentation_depth: concept.presentation_depth,
        parent_concept_key: concept.parent_concept_key,
        is_abstract: concept.is_abstract,
        created_at: now
      })),
      'concepts',
      snapshotId,
      (batch) => tx.insert(filingTaxonomyConcept).values(batch)
    );

    await insertInBatches(
      input.facts.map((fact) => ({
        snapshot_id: snapshotId,
        concept_key: fact.concept_key,
        qname: fact.qname,
        namespace_uri: fact.namespace_uri,
        local_name: fact.local_name,
        data_type: fact.data_type,
        statement_kind: fact.statement_kind,
        role_uri: fact.role_uri,
        authoritative_concept_key: fact.authoritative_concept_key,
        mapping_method: fact.mapping_method,
        surface_key: fact.surface_key,
        detail_parent_surface_key: fact.detail_parent_surface_key,
        kpi_key: fact.kpi_key,
        residual_flag: fact.residual_flag,
        context_id: fact.context_id,
        unit: fact.unit,
        decimals: fact.decimals,
        precision: fact.precision,
        nil: fact.nil,
        value_num: String(fact.value_num),
        period_start: fact.period_start,
        period_end: fact.period_end,
        period_instant: fact.period_instant,
        dimensions: fact.dimensions,
        is_dimensionless: fact.is_dimensionless,
        source_file: fact.source_file,
        created_at: now
      })),
      'facts',
      snapshotId,
      (batch) => tx.insert(filingTaxonomyFact).values(batch)
    );

    await insertInBatches(
      input.metric_validations.map((check) => ({
        snapshot_id: snapshotId,
        metric_key: check.metric_key,
        taxonomy_value: asNumericText(check.taxonomy_value),
        llm_value: asNumericText(check.llm_value),
        absolute_diff: asNumericText(check.absolute_diff),
        relative_diff: asNumericText(check.relative_diff),
        status: check.status,
        evidence_pages: check.evidence_pages,
        pdf_url: check.pdf_url,
        provider: check.provider,
        model: check.model,
        error: check.error,
        created_at: now,
        updated_at: now
      })),
      'metric validations',
      snapshotId,
      (batch) => tx.insert(filingTaxonomyMetricValidation).values(batch)
    );

    return toSnapshotRecord(saved);
  });
}
/**
 * Lists taxonomy snapshots for a ticker, one page at a time, ordered by
 * filing date (newest first) and then by snapshot id (highest first).
 *
 * @param input.ticker Ticker symbol; trimmed and upper-cased before matching.
 * @param input.window `'10y'` keeps only filings dated on/after `tenYearsAgoIso()`;
 *   `'all'` applies no date filter.
 * @param input.filingTypes Optional filing-type filter; ignored when empty or omitted.
 * @param input.limit Requested page size. Truncated to an integer and clamped to
 *   1..120. Defaults to 40 when omitted or NaN.
 * @param input.cursor `nextCursor` from a previous page (a snapshot id as a string).
 *   Only snapshots with a smaller id are returned. Unparsable or non-positive
 *   values are ignored.
 * @returns The page of snapshot records plus `nextCursor`, which is the id of the
 *   last returned row as a string, or `null` when there are no further rows.
 */
export async function listFilingTaxonomySnapshotsByTicker(input: {
  ticker: string;
  window: '10y' | 'all';
  filingTypes?: Array<'10-K' | '10-Q'>;
  limit?: number;
  cursor?: string | null;
}) {
  const defaultLimit = 40;
  const maxLimit = 120;

  // NaN would otherwise pass straight through Math.trunc/Math.max/Math.min and
  // reach `.limit(NaN)`; treat it the same as an omitted limit.
  const requestedLimit = input.limit ?? defaultLimit;
  const safeLimit = Number.isNaN(requestedLimit)
    ? defaultLimit
    : Math.min(Math.max(Math.trunc(requestedLimit), 1), maxLimit);

  const cursorId = input.cursor ? Number.parseInt(input.cursor, 10) : null;

  const constraints = [eq(filingTaxonomySnapshot.ticker, input.ticker.trim().toUpperCase())];

  if (input.window === '10y') {
    constraints.push(gte(filingTaxonomySnapshot.filing_date, tenYearsAgoIso()));
  }

  // NOTE(review): rows are ordered by filing_date first, but the cursor filters on
  // id alone. If snapshot ids are not monotonic with filing_date, consecutive pages
  // may repeat or skip rows — confirm, and consider a (filing_date, id) keyset cursor.
  if (cursorId && Number.isFinite(cursorId) && cursorId > 0) {
    constraints.push(lt(filingTaxonomySnapshot.id, cursorId));
  }

  if (input.filingTypes && input.filingTypes.length > 0) {
    constraints.push(inArray(filingTaxonomySnapshot.filing_type, input.filingTypes));
  }

  // Fetch one extra row so we can tell whether another page exists.
  const rows = await db
    .select()
    .from(filingTaxonomySnapshot)
    .where(and(...constraints))
    .orderBy(desc(filingTaxonomySnapshot.filing_date), desc(filingTaxonomySnapshot.id))
    .limit(safeLimit + 1);

  const hasMore = rows.length > safeLimit;
  const usedRows = hasMore ? rows.slice(0, safeLimit) : rows;
  const nextCursor = hasMore
    ? String(usedRows[usedRows.length - 1]?.id ?? '')
    : null;

  return {
    snapshots: usedRows.map(toSnapshotRecord),
    nextCursor
  };
}
/**
 * Counts a ticker's taxonomy snapshots grouped by parse status.
 *
 * @param ticker Ticker symbol; trimmed and upper-cased before matching.
 * @returns A record with a count for each of `ready`, `partial` and `failed`;
 *   statuses with no matching snapshots stay at 0.
 */
export async function countFilingTaxonomySnapshotStatuses(ticker: string) {
  const normalizedTicker = ticker.trim().toUpperCase();

  const grouped = await db
    .select({
      status: filingTaxonomySnapshot.parse_status,
      count: sql<string>`count(*)`
    })
    .from(filingTaxonomySnapshot)
    .where(eq(filingTaxonomySnapshot.ticker, normalizedTicker))
    .groupBy(filingTaxonomySnapshot.parse_status);

  // Start every known status at zero so absent groups are still reported.
  const totals: Record<FilingTaxonomyParseStatus, number> = {
    ready: 0,
    partial: 0,
    failed: 0
  };

  for (const entry of grouped) {
    totals[entry.status] = Number(entry.count);
  }

  return totals;
}
/**
 * Lists taxonomy facts for a ticker, one page at a time, ordered by fact id
 * (highest first). Facts are joined to their snapshot so ticker, date window
 * and filing-type filters can be applied.
 *
 * @param input.ticker Ticker symbol; trimmed and upper-cased before matching.
 * @param input.window `'10y'` keeps only filings dated on/after `tenYearsAgoIso()`;
 *   `'all'` applies no date filter.
 * @param input.statement Optional statement-kind filter on the fact.
 * @param input.filingTypes Optional filing-type filter; ignored when empty or omitted.
 * @param input.cursor `nextCursor` from a previous page (a fact id as a string).
 *   Only facts with a smaller id are returned. Unparsable or non-positive
 *   values are ignored.
 * @param input.limit Requested page size. Truncated to an integer and clamped to
 *   1..10000. Defaults to 500 when omitted or NaN.
 * @returns The page of facts plus `nextCursor`, which is the id of the last
 *   returned fact as a string, or `null` when there are no further rows.
 * @throws Error when a stored `value_num` cannot be converted by `asNumber`.
 */
export async function listTaxonomyFactsByTicker(input: {
  ticker: string;
  window: '10y' | 'all';
  statement?: FinancialStatementKind;
  filingTypes?: Array<'10-K' | '10-Q'>;
  cursor?: string | null;
  limit?: number;
}) {
  const defaultLimit = 500;
  const maxLimit = 10000;

  // NaN would otherwise pass straight through Math.trunc/Math.max/Math.min and
  // reach `.limit(NaN)`; treat it the same as an omitted limit.
  const requestedLimit = input.limit ?? defaultLimit;
  const safeLimit = Number.isNaN(requestedLimit)
    ? defaultLimit
    : Math.min(Math.max(Math.trunc(requestedLimit), 1), maxLimit);

  const cursorId = input.cursor ? Number.parseInt(input.cursor, 10) : null;

  const conditions = [eq(filingTaxonomySnapshot.ticker, input.ticker.trim().toUpperCase())];

  if (input.window === '10y') {
    conditions.push(gte(filingTaxonomySnapshot.filing_date, tenYearsAgoIso()));
  }

  if (input.statement) {
    conditions.push(eq(filingTaxonomyFact.statement_kind, input.statement));
  }

  if (input.filingTypes && input.filingTypes.length > 0) {
    conditions.push(inArray(filingTaxonomySnapshot.filing_type, input.filingTypes));
  }

  // The ordering below is by fact id alone, so an id-based cursor pages consistently.
  if (cursorId && Number.isFinite(cursorId) && cursorId > 0) {
    conditions.push(lt(filingTaxonomyFact.id, cursorId));
  }

  // Fetch one extra row so we can tell whether another page exists.
  const rows = await db
    .select({
      id: filingTaxonomyFact.id,
      snapshot_id: filingTaxonomyFact.snapshot_id,
      filing_id: filingTaxonomySnapshot.filing_id,
      filing_date: filingTaxonomySnapshot.filing_date,
      statement_kind: filingTaxonomyFact.statement_kind,
      role_uri: filingTaxonomyFact.role_uri,
      concept_key: filingTaxonomyFact.concept_key,
      qname: filingTaxonomyFact.qname,
      namespace_uri: filingTaxonomyFact.namespace_uri,
      local_name: filingTaxonomyFact.local_name,
      value_num: filingTaxonomyFact.value_num,
      context_id: filingTaxonomyFact.context_id,
      unit: filingTaxonomyFact.unit,
      decimals: filingTaxonomyFact.decimals,
      period_start: filingTaxonomyFact.period_start,
      period_end: filingTaxonomyFact.period_end,
      period_instant: filingTaxonomyFact.period_instant,
      dimensions: filingTaxonomyFact.dimensions,
      is_dimensionless: filingTaxonomyFact.is_dimensionless,
      source_file: filingTaxonomyFact.source_file
    })
    .from(filingTaxonomyFact)
    .innerJoin(filingTaxonomySnapshot, eq(filingTaxonomyFact.snapshot_id, filingTaxonomySnapshot.id))
    .where(and(...conditions))
    .orderBy(desc(filingTaxonomyFact.id))
    .limit(safeLimit + 1);

  const hasMore = rows.length > safeLimit;
  const used = hasMore ? rows.slice(0, safeLimit) : rows;
  const nextCursor = hasMore ? String(used[used.length - 1]?.id ?? '') : null;

  // Convert snake_case DB rows into the camelCase API shape.
  const facts: TaxonomyFactRow[] = used.map((row) => {
    const value = asNumber(row.value_num);
    if (value === null) {
      // A fact without a usable numeric value fails the whole page rather than
      // being silently dropped.
      throw new Error(`Invalid value_num in taxonomy fact ${row.id}`);
    }

    return {
      id: row.id,
      snapshotId: row.snapshot_id,
      filingId: row.filing_id,
      filingDate: row.filing_date,
      statement: row.statement_kind,
      roleUri: row.role_uri,
      conceptKey: row.concept_key,
      qname: row.qname,
      namespaceUri: row.namespace_uri,
      localName: row.local_name,
      value,
      contextId: row.context_id,
      unit: row.unit,
      decimals: row.decimals,
      periodStart: row.period_start,
      periodEnd: row.period_end,
      periodInstant: row.period_instant,
      dimensions: row.dimensions,
      isDimensionless: row.is_dimensionless,
      sourceFile: row.source_file
    };
  });

  return {
    facts,
    nextCursor
  };
}
/**
 * Loads the taxonomy assets belonging to any of the given snapshots, ordered
 * by asset id (highest first).
 *
 * @param snapshotIds Snapshot ids to look up. An empty list returns an empty
 *   array without querying the database.
 * @returns The matching assets mapped through `toAssetRecord`.
 */
export async function listTaxonomyAssetsBySnapshotIds(snapshotIds: number[]) {
  const nothingRequested = snapshotIds.length === 0;
  if (nothingRequested) {
    return [];
  }

  const belongsToRequestedSnapshot = inArray(filingTaxonomyAsset.snapshot_id, snapshotIds);

  const assetRows = await db
    .select()
    .from(filingTaxonomyAsset)
    .where(belongsToRequestedSnapshot)
    .orderBy(desc(filingTaxonomyAsset.id));

  return assetRows.map(toAssetRecord);
}
/**
 * Module-private helpers exposed under a single export.
 * NOTE(review): presumably intended for unit tests only — confirm before
 * depending on it from application code.
 */
export const __filingTaxonomyInternals = {
  normalizeFilingTaxonomySnapshotPayload: normalizeFilingTaxonomySnapshotPayload,
  toSnapshotRecord: toSnapshotRecord
};