// Files
// Neon-Desk/lib/server/repos/filing-taxonomy.ts
//
// 1636 lines
// 49 KiB
// TypeScript

import { and, desc, eq, gte, inArray, lt, sql } from "drizzle-orm";
import type { ComputedDefinition } from "@/lib/generated";
import type {
DetailFinancialRow,
Filing,
FinancialStatementKind,
MetricValidationResult,
NormalizationSummary,
StructuredKpiRow,
SurfaceDetailMap,
SurfaceFinancialRow,
TaxonomyDimensionMember,
TaxonomyFactRow,
TaxonomyStatementRow,
} from "@/lib/types";
import { db, getSqliteClient } from "@/lib/server/db";
import { withFinancialIngestionSchemaRetry } from "@/lib/server/db/financial-ingestion-schema";
import {
filingTaxonomyAsset,
filingTaxonomyConcept,
filingTaxonomyContext,
filingTaxonomyFact,
filingTaxonomyMetricValidation,
filingTaxonomySnapshot,
} from "@/lib/server/db/schema";
/** Allowed values of a snapshot's `parse_status` field. */
export type FilingTaxonomyParseStatus = "ready" | "partial" | "failed";
/** Allowed values of a snapshot's `source` field. */
export type FilingTaxonomySource =
| "xbrl_instance"
| "xbrl_instance_with_linkbase"
| "legacy_html_fallback";
/** Allowed values of an asset row's `asset_type` field. */
export type FilingTaxonomyAssetType =
| "instance"
| "schema"
| "presentation"
| "label"
| "calculation"
| "definition"
| "pdf"
| "other";
/**
 * One period entry stored in a snapshot's `periods` array.
 *
 * `normalizePeriods` builds these from untyped input and accepts either the
 * camelCase keys below or their snake_case equivalents.
 */
export type FilingTaxonomyPeriod = {
id: string;
filingId: number;
accessionNumber: string;
filingDate: string;
periodStart: string | null;
periodEnd: string | null;
filingType: "10-K" | "10-Q";
periodLabel: string;
};
/**
 * Snapshot row as exposed to callers; produced by `toSnapshotRecord`.
 *
 * The JSON payload fields (`periods`, the `*_rows` maps, `kpi_rows`,
 * `computed_definitions`, `normalization_summary`) have been passed through
 * `normalizeFilingTaxonomySnapshotPayload` before they appear here.
 */
export type FilingTaxonomySnapshotRecord = {
id: number;
filing_id: number;
ticker: string;
filing_date: string;
filing_type: "10-K" | "10-Q";
parse_status: FilingTaxonomyParseStatus;
parse_error: string | null;
source: FilingTaxonomySource;
parser_engine: string;
parser_version: string;
taxonomy_regime: "us-gaap" | "ifrs-full" | "unknown";
fiscal_pack: string | null;
periods: FilingTaxonomyPeriod[];
faithful_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
statement_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
surface_rows: Record<FinancialStatementKind, SurfaceFinancialRow[]>;
detail_rows: Record<FinancialStatementKind, SurfaceDetailMap>;
kpi_rows: StructuredKpiRow[];
computed_definitions: ComputedDefinition[];
derived_metrics: Filing["metrics"];
validation_result: MetricValidationResult | null;
normalization_summary: NormalizationSummary | null;
issuer_overlay_revision_id: number | null;
facts_count: number;
concepts_count: number;
dimensions_count: number;
created_at: string;
updated_at: string;
};
/** Context row belonging to a snapshot; produced by `toContextRecord`. */
type FilingTaxonomyContextRecord = {
id: number;
snapshot_id: number;
context_id: string;
entity_identifier: string | null;
entity_scheme: string | null;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
segment_json: Record<string, unknown> | null;
scenario_json: Record<string, unknown> | null;
created_at: string;
};
/**
 * Asset row belonging to a snapshot; produced by `toAssetRecord`.
 *
 * `score` is converted with `asNumber` when read, so it is a number (or null)
 * here even though the upsert writes it as numeric text.
 */
type FilingTaxonomyAssetRecord = {
id: number;
snapshot_id: number;
asset_type: FilingTaxonomyAssetType;
name: string;
url: string;
size_bytes: number | null;
score: number | null;
is_selected: boolean;
created_at: string;
};
/**
 * Concept row belonging to a snapshot; produced by `toConceptRecord`.
 *
 * `presentation_order` is converted with `asNumber` when read.
 */
export type FilingTaxonomyConceptRecord = {
id: number;
snapshot_id: number;
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
label: string | null;
is_extension: boolean;
balance: string | null;
period_type: string | null;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
presentation_order: number | null;
presentation_depth: number | null;
parent_concept_key: string | null;
is_abstract: boolean;
created_at: string;
};
/**
 * Fact row belonging to a snapshot; produced by `toFactRecord`.
 *
 * `value_num` is always a finite number here: `toFactRecord` throws when the
 * stored value cannot be converted by `asNumber`.
 */
type FilingTaxonomyFactRecord = {
id: number;
snapshot_id: number;
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
context_id: string;
unit: string | null;
decimals: string | null;
precision: string | null;
nil: boolean;
value_num: number;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
dimensions: TaxonomyDimensionMember[];
is_dimensionless: boolean;
source_file: string | null;
created_at: string;
};
/**
 * Metric-validation row belonging to a snapshot; produced by
 * `toMetricValidationRecord`.
 *
 * The four numeric fields are converted with `asNumber` when read, and
 * `evidence_pages` defaults to an empty array when the stored value is nullish.
 */
type FilingTaxonomyMetricValidationRecord = {
id: number;
snapshot_id: number;
metric_key: keyof NonNullable<Filing["metrics"]>;
taxonomy_value: number | null;
llm_value: number | null;
absolute_diff: number | null;
relative_diff: number | null;
status: "not_run" | "matched" | "mismatch" | "error";
evidence_pages: number[];
pdf_url: string | null;
provider: string | null;
model: string | null;
error: string | null;
created_at: string;
updated_at: string;
};
/**
 * Input accepted by `upsertFilingTaxonomySnapshot`.
 *
 * The leading scalar and JSON fields describe the snapshot row itself; the
 * trailing arrays (`contexts`, `assets`, `concepts`, `facts`,
 * `metric_validations`) describe child rows keyed by the snapshot id. The
 * upsert deletes existing child rows for the snapshot before inserting the
 * supplied ones.
 */
type UpsertFilingTaxonomySnapshotInput = {
filing_id: number;
ticker: string;
filing_date: string;
filing_type: "10-K" | "10-Q";
parse_status: FilingTaxonomyParseStatus;
parse_error: string | null;
source: FilingTaxonomySource;
parser_engine: string;
parser_version: string;
taxonomy_regime: "us-gaap" | "ifrs-full" | "unknown";
fiscal_pack: string | null;
periods: FilingTaxonomyPeriod[];
faithful_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
statement_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
surface_rows: Record<FinancialStatementKind, SurfaceFinancialRow[]>;
detail_rows: Record<FinancialStatementKind, SurfaceDetailMap>;
kpi_rows: StructuredKpiRow[];
computed_definitions: ComputedDefinition[];
derived_metrics: Filing["metrics"];
validation_result: MetricValidationResult | null;
normalization_summary: NormalizationSummary | null;
// Optional: the upsert stores null when this is omitted.
issuer_overlay_revision_id?: number | null;
facts_count: number;
concepts_count: number;
dimensions_count: number;
// Child context rows (same fields as FilingTaxonomyContextRecord minus id, snapshot_id, created_at).
contexts: Array<{
context_id: string;
entity_identifier: string | null;
entity_scheme: string | null;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
segment_json: Record<string, unknown> | null;
scenario_json: Record<string, unknown> | null;
}>;
// Child asset rows; `score` is written through `asNumericText`.
assets: Array<{
asset_type: FilingTaxonomyAssetType;
name: string;
url: string;
size_bytes: number | null;
score: number | null;
is_selected: boolean;
}>;
// Child concept rows.
concepts: Array<{
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
label: string | null;
is_extension: boolean;
balance: string | null;
period_type: string | null;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
presentation_order: number | null;
presentation_depth: number | null;
parent_concept_key: string | null;
is_abstract: boolean;
}>;
// Child fact rows.
facts: Array<{
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
context_id: string;
unit: string | null;
decimals: string | null;
precision: string | null;
nil: boolean;
value_num: number;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
dimensions: TaxonomyDimensionMember[];
is_dimensionless: boolean;
source_file: string | null;
}>;
// Child metric-validation rows.
metric_validations: Array<{
metric_key: keyof NonNullable<Filing["metrics"]>;
taxonomy_value: number | null;
llm_value: number | null;
absolute_diff: number | null;
relative_diff: number | null;
status: "not_run" | "matched" | "mismatch" | "error";
evidence_pages: number[];
pdf_url: string | null;
provider: string | null;
model: string | null;
error: string | null;
}>;
};
/**
 * Statement kinds iterated by the row normalizers in this file when they
 * rebuild per-statement maps. The same six values are the ones accepted by
 * `asStatementKind`.
 */
const FINANCIAL_STATEMENT_KINDS = [
"income",
"balance",
"cash_flow",
"disclosure",
"equity",
"comprehensive_income",
] as const satisfies FinancialStatementKind[];
/** Per-statement lists of taxonomy statement rows (`faithful_rows` / `statement_rows`). */
type StatementRowMap = Record<FinancialStatementKind, TaxonomyStatementRow[]>;
/** Per-statement lists of surface rows (`surface_rows`). */
type SurfaceRowMap = Record<FinancialStatementKind, SurfaceFinancialRow[]>;
/** Per-statement detail maps (`detail_rows`). */
type DetailRowMap = Record<FinancialStatementKind, SurfaceDetailMap>;
/**
 * Returns the date ten UTC calendar years before the current instant,
 * formatted as the first ten characters of the ISO timestamp (`YYYY-MM-DD`).
 */
function tenYearsAgoIso() {
  const cutoff = new Date();
  const targetYear = cutoff.getUTCFullYear() - 10;
  cutoff.setUTCFullYear(targetYear);
  const isoTimestamp = cutoff.toISOString();
  return isoTimestamp.substring(0, 10);
}
/**
 * Coerces a loosely typed value to a finite number.
 *
 * - Numbers are returned as-is when finite; `NaN` and `±Infinity` yield null.
 * - Strings are parsed with `Number()`. Empty and whitespace-only strings are
 *   rejected explicitly, because `Number("")` evaluates to 0 and would turn a
 *   missing value into a real zero.
 * - Every other type yields null.
 *
 * @param value Value of unknown type, e.g. a numeric-text column or JSON field.
 * @returns The finite number, or null when the value is not numeric.
 */
function asNumber(value: unknown): number | null {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value === "string") {
    // Guard against Number("") === 0 and Number("   ") === 0.
    if (value.trim() === "") {
      return null;
    }
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}
/**
 * Renders a finite number as text via `String()`; null and non-finite
 * numbers become null.
 */
function asNumericText(value: number | null) {
  const isUsable = value !== null && Number.isFinite(value);
  return isUsable ? String(value) : null;
}
/**
 * Returns the value typed as a string-keyed record when it is a non-null,
 * non-array object; otherwise returns null. The same reference is returned.
 */
function asObject(value: unknown) {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return null;
  }
  return value as Record<string, unknown>;
}
/** Returns the value when it is a string (including the empty string), else null. */
function asString(value: unknown) {
  if (typeof value === "string") {
    return value;
  }
  return null;
}
/**
 * Returns the value when it is a string; every other input, including null
 * and undefined, maps to null.
 */
function asNullableString(value: unknown) {
  if (typeof value !== "string") {
    return null;
  }
  return value;
}
/**
 * Returns booleans unchanged and converts anything else by JavaScript
 * truthiness, so any non-empty string (even `"false"`) becomes true.
 */
function asBoolean(value: unknown) {
  if (typeof value === "boolean") {
    return value;
  }
  return Boolean(value);
}
/**
 * Returns the value when it is one of the six recognised statement-kind
 * strings; any other input yields null.
 */
function asStatementKind(value: unknown): FinancialStatementKind | null {
  switch (value) {
    case "income":
    case "balance":
    case "cash_flow":
    case "disclosure":
    case "equity":
    case "comprehensive_income":
      return value;
    default:
      return null;
  }
}
/**
 * Converts an object into a map of the same keys whose values are run through
 * `asNumber` (non-numeric values become null). Non-object input yields `{}`.
 */
function normalizeNumberMap(value: unknown) {
  const source = asObject(value);
  if (source === null) {
    return {};
  }
  return Object.fromEntries(
    Object.entries(source).map(([name, raw]) => [name, asNumber(raw)]),
  );
}
/**
 * Converts an object into a map of the same keys whose values are run through
 * `asNullableString` (non-string values become null). Non-object input
 * yields `{}`.
 */
function normalizeNullableStringMap(value: unknown) {
  const source = asObject(value);
  if (source === null) {
    return {};
  }
  return Object.fromEntries(
    Object.entries(source).map(([name, raw]) => [name, asNullableString(raw)]),
  );
}
/**
 * Keeps only the string elements of an array, in their original order.
 * Non-array input yields an empty array.
 */
function normalizeStringArray(value: unknown) {
  if (!Array.isArray(value)) {
    return [];
  }
  return value.filter((item): item is string => typeof item === "string");
}
/**
 * Converts each array element with `asNumber` and drops the ones that do not
 * convert, preserving order. Non-array input yields an empty array.
 */
function normalizeNumberArray(value: unknown) {
  if (!Array.isArray(value)) {
    return [];
  }
  const converted = value.map((item) => asNumber(item));
  return converted.filter((item): item is number => item !== null);
}
/**
 * Rebuilds a list of `FilingTaxonomyPeriod` objects from untyped input.
 *
 * Each entry may use camelCase or snake_case keys. Entries that are not
 * objects, or that lack a usable id, filing id, accession number, filing
 * date, filing type ("10-K" / "10-Q") or period label, are dropped.
 * Non-array input yields an empty array.
 */
function normalizePeriods(value: unknown): FilingTaxonomyPeriod[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const periods: FilingTaxonomyPeriod[] = [];
  for (const candidate of value) {
    const raw = asObject(candidate);
    if (raw === null) {
      continue;
    }
    // "10-K" wins over "10-Q" when the two key spellings disagree.
    let filingType: FilingTaxonomyPeriod["filingType"] | null = null;
    if (raw.filingType === "10-K" || raw.filing_type === "10-K") {
      filingType = "10-K";
    } else if (raw.filingType === "10-Q" || raw.filing_type === "10-Q") {
      filingType = "10-Q";
    }
    const id = asString(raw.id);
    const filingId = asNumber(raw.filingId ?? raw.filing_id);
    const accessionNumber = asString(
      raw.accessionNumber ?? raw.accession_number,
    );
    const filingDate = asString(raw.filingDate ?? raw.filing_date);
    const periodLabel = asString(raw.periodLabel ?? raw.period_label);
    const isComplete =
      Boolean(id) &&
      filingId !== null &&
      Boolean(accessionNumber) &&
      Boolean(filingDate) &&
      filingType !== null &&
      Boolean(periodLabel);
    if (
      !isComplete ||
      !id ||
      filingId === null ||
      !accessionNumber ||
      !filingDate ||
      !filingType ||
      !periodLabel
    ) {
      continue;
    }
    periods.push({
      id,
      filingId,
      accessionNumber,
      filingDate,
      periodStart: asNullableString(raw.periodStart ?? raw.period_start),
      periodEnd: asNullableString(raw.periodEnd ?? raw.period_end),
      filingType,
      periodLabel,
    } satisfies FilingTaxonomyPeriod);
  }
  return periods;
}
/**
 * Rebuilds a per-statement map of taxonomy statement rows from untyped input.
 *
 * Rows may use camelCase or snake_case keys. A row is dropped when it is not
 * an object or lacks a key, label, concept key, qname, namespace URI or local
 * name. A missing `key` falls back to the concept key, a missing `statement`
 * to the map key being processed, a missing `order` to
 * `Number.MAX_SAFE_INTEGER` and a missing `depth` to 0.
 *
 * @param value Untyped payload expected to be an object keyed by statement kind.
 * @param fallbackRows Returned unchanged (same reference) when `value` is not an object.
 */
function normalizeStatementRows(
  value: unknown,
  fallbackRows: StatementRowMap = emptyStatementRows(),
): StatementRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  // Converts one untyped entry into a statement row, or null when unusable.
  const toStatementRow = (
    candidate: unknown,
    statement: FinancialStatementKind,
  ) => {
    const raw = asObject(candidate);
    if (raw === null) {
      return null;
    }
    const conceptKey = asString(raw.conceptKey ?? raw.concept_key);
    const key = asString(raw.key) ?? conceptKey;
    const label = asString(raw.label);
    const qname = asString(raw.qname);
    const namespaceUri = asString(raw.namespaceUri ?? raw.namespace_uri);
    const localName = asString(raw.localName ?? raw.local_name);
    if (
      !key ||
      !label ||
      !conceptKey ||
      !qname ||
      !namespaceUri ||
      !localName
    ) {
      return null;
    }
    return {
      key,
      label,
      conceptKey,
      qname,
      namespaceUri,
      localName,
      isExtension: asBoolean(raw.isExtension ?? raw.is_extension),
      statement: asStatementKind(raw.statement) ?? statement,
      roleUri: asNullableString(raw.roleUri ?? raw.role_uri),
      order: asNumber(raw.order) ?? Number.MAX_SAFE_INTEGER,
      depth: asNumber(raw.depth) ?? 0,
      parentKey: asNullableString(raw.parentKey ?? raw.parent_key),
      values: normalizeNumberMap(raw.values),
      units: normalizeNullableStringMap(raw.units),
      hasDimensions: asBoolean(raw.hasDimensions ?? raw.has_dimensions),
      sourceFactIds: normalizeNumberArray(
        raw.sourceFactIds ?? raw.source_fact_ids,
      ),
    };
  };

  const result = emptyStatementRows();
  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const rawRows = source[statement];
    const candidates = Array.isArray(rawRows) ? rawRows : [];
    result[statement] = candidates
      .map((candidate) => toStatementRow(candidate, statement))
      .filter((item): item is TaxonomyStatementRow => item !== null);
  }
  return result;
}
/**
 * Rebuilds a per-statement map of surface rows from untyped input.
 *
 * Rows may use camelCase or snake_case keys. A row is dropped when it is not
 * an object or lacks a string key, label, category or unit. The optional
 * fields (`templateSection`, `statement`, `detailCount`, `resolutionMethod`,
 * `confidence`, `warningCodes`) are only set when the input carries a
 * recognised value; `statement` is not set for "comprehensive_income".
 *
 * @param value Untyped payload expected to be an object keyed by statement kind.
 * @param fallbackRows Returned unchanged (same reference) when `value` is not an object.
 */
function normalizeSurfaceRows(
  value: unknown,
  fallbackRows: SurfaceRowMap = emptySurfaceRows(),
): SurfaceRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }
  const result = emptySurfaceRows();
  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const rawRows = source[statement];
    const collected: SurfaceFinancialRow[] = [];
    for (const candidate of Array.isArray(rawRows) ? rawRows : []) {
      const raw = asObject(candidate);
      if (raw === null) {
        continue;
      }
      const key = asString(raw.key);
      const label = asString(raw.label);
      const category = asString(raw.category);
      const unit = asString(raw.unit);
      if (!key || !label || !category || !unit) {
        continue;
      }

      // Required fields first; optional fields are attached below in a fixed order.
      const surfaceRow: SurfaceFinancialRow = {
        key,
        label,
        category: category as SurfaceFinancialRow["category"],
        order: asNumber(raw.order) ?? Number.MAX_SAFE_INTEGER,
        unit: unit as SurfaceFinancialRow["unit"],
        values: normalizeNumberMap(raw.values),
        sourceConcepts: normalizeStringArray(
          raw.sourceConcepts ?? raw.source_concepts,
        ),
        sourceRowKeys: normalizeStringArray(
          raw.sourceRowKeys ?? raw.source_row_keys,
        ),
        sourceFactIds: normalizeNumberArray(
          raw.sourceFactIds ?? raw.source_fact_ids,
        ),
        formulaKey: asNullableString(raw.formulaKey ?? raw.formula_key),
        hasDimensions: asBoolean(raw.hasDimensions ?? raw.has_dimensions),
        resolvedSourceRowKeys: normalizeNullableStringMap(
          raw.resolvedSourceRowKeys ?? raw.resolved_source_row_keys,
        ),
      };

      const templateSection = asString(
        raw.templateSection ?? raw.template_section,
      );
      if (templateSection) {
        surfaceRow.templateSection =
          templateSection as SurfaceFinancialRow["templateSection"];
      }

      const rowStatement = asStatementKind(raw.statement);
      if (
        rowStatement === "income" ||
        rowStatement === "balance" ||
        rowStatement === "cash_flow" ||
        rowStatement === "equity" ||
        rowStatement === "disclosure"
      ) {
        surfaceRow.statement = rowStatement;
      }

      const detailCount = asNumber(raw.detailCount ?? raw.detail_count);
      if (detailCount !== null) {
        surfaceRow.detailCount = detailCount;
      }

      const resolutionMethod = raw.resolutionMethod ?? raw.resolution_method;
      if (
        resolutionMethod === "direct" ||
        resolutionMethod === "surface_bridge" ||
        resolutionMethod === "formula_derived" ||
        resolutionMethod === "not_meaningful"
      ) {
        surfaceRow.resolutionMethod = resolutionMethod;
      }

      const confidence = raw.confidence;
      if (
        confidence === "high" ||
        confidence === "medium" ||
        confidence === "low"
      ) {
        surfaceRow.confidence = confidence;
      }

      const warningCodes = normalizeStringArray(
        raw.warningCodes ?? raw.warning_codes,
      );
      if (warningCodes.length > 0) {
        surfaceRow.warningCodes = warningCodes;
      }

      collected.push(surfaceRow);
    }
    result[statement] = collected;
  }
  return result;
}
/**
 * Rebuilds a per-statement map of detail rows, grouped by surface key, from
 * untyped input.
 *
 * Rows may use camelCase or snake_case keys. A row is dropped when it is not
 * an object or lacks a key, label, concept key, qname, namespace URI or local
 * name. A missing `key` falls back to the concept key and a missing
 * `parentSurfaceKey` to the group key. Groups whose value is not an array
 * are kept with an empty row list.
 *
 * @param value Untyped payload expected to be an object keyed by statement kind.
 * @param fallbackRows Returned unchanged (same reference) when `value` is not an object.
 */
function normalizeDetailRows(
  value: unknown,
  fallbackRows: DetailRowMap = emptyDetailRows(),
): DetailRowMap {
  const source = asObject(value);
  if (source === null) {
    return fallbackRows;
  }

  // Converts one untyped entry into a detail row, or null when unusable.
  const toDetailRow = (candidate: unknown, surfaceKey: string) => {
    const raw = asObject(candidate);
    if (raw === null) {
      return null;
    }
    const conceptKey = asString(raw.conceptKey ?? raw.concept_key);
    const key = asString(raw.key) ?? conceptKey;
    const label = asString(raw.label);
    const qname = asString(raw.qname);
    const namespaceUri = asString(raw.namespaceUri ?? raw.namespace_uri);
    const localName = asString(raw.localName ?? raw.local_name);
    if (
      !key ||
      !label ||
      !conceptKey ||
      !qname ||
      !namespaceUri ||
      !localName
    ) {
      return null;
    }
    return {
      key,
      parentSurfaceKey:
        asString(raw.parentSurfaceKey ?? raw.parent_surface_key) ?? surfaceKey,
      label,
      conceptKey,
      qname,
      namespaceUri,
      localName,
      unit: asNullableString(raw.unit),
      values: normalizeNumberMap(raw.values),
      sourceFactIds: normalizeNumberArray(
        raw.sourceFactIds ?? raw.source_fact_ids,
      ),
      isExtension: asBoolean(raw.isExtension ?? raw.is_extension),
      dimensionsSummary: normalizeStringArray(
        raw.dimensionsSummary ?? raw.dimensions_summary,
      ),
      residualFlag: asBoolean(raw.residualFlag ?? raw.residual_flag),
    };
  };

  const result = emptyDetailRows();
  for (const statement of FINANCIAL_STATEMENT_KINDS) {
    const groups = asObject(source[statement]) ?? {};
    result[statement] = Object.fromEntries(
      Object.entries(groups).map(([surfaceKey, groupRows]) => {
        const detailRows = Array.isArray(groupRows)
          ? groupRows
              .map((candidate) => toDetailRow(candidate, surfaceKey))
              .filter((item): item is DetailFinancialRow => item !== null)
          : [];
        return [surfaceKey, detailRows];
      }),
    );
  }
  return result;
}
/**
 * Rebuilds a list of structured KPI rows from untyped input.
 *
 * Rows may use camelCase or snake_case keys. A row is dropped when it is not
 * an object, lacks a string key, label, category or unit, or has a provenance
 * type other than "taxonomy" / "structured_note". Non-array input yields an
 * empty array.
 */
function normalizeKpiRows(value: unknown) {
  if (!Array.isArray(value)) {
    return [];
  }
  const kpiRows: StructuredKpiRow[] = [];
  for (const candidate of value) {
    const raw = asObject(candidate);
    if (raw === null) {
      continue;
    }
    const key = asString(raw.key);
    const label = asString(raw.label);
    const category = asString(raw.category);
    const unit = asString(raw.unit);
    const provenanceType = raw.provenanceType ?? raw.provenance_type;
    if (
      !key ||
      !label ||
      !category ||
      !unit ||
      (provenanceType !== "taxonomy" && provenanceType !== "structured_note")
    ) {
      continue;
    }
    kpiRows.push({
      key,
      label,
      category: category as StructuredKpiRow["category"],
      unit: unit as StructuredKpiRow["unit"],
      order: asNumber(raw.order) ?? Number.MAX_SAFE_INTEGER,
      segment: asNullableString(raw.segment),
      axis: asNullableString(raw.axis),
      member: asNullableString(raw.member),
      values: normalizeNumberMap(raw.values),
      sourceConcepts: normalizeStringArray(
        raw.sourceConcepts ?? raw.source_concepts,
      ),
      sourceFactIds: normalizeNumberArray(
        raw.sourceFactIds ?? raw.source_fact_ids,
      ),
      provenanceType,
      hasDimensions: asBoolean(raw.hasDimensions ?? raw.has_dimensions),
    } satisfies StructuredKpiRow);
  }
  return kpiRows;
}
/**
 * Rebuilds a list of computed-metric definitions from untyped input.
 *
 * A definition is dropped when it is not an object, lacks a string key,
 * label, category or unit, or when its `computation` is missing, has an
 * unrecognised `type`, or lacks the operands that type requires. Recognised
 * types are "ratio", "yoy_growth", "cagr", "per_share" and "simple".
 * Non-array input yields an empty array.
 */
function normalizeComputedDefinitions(value: unknown): ComputedDefinition[] {
  if (!Array.isArray(value)) {
    return [];
  }

  // Validates the operands for one computation type; null means unusable.
  const parseComputation = (kind: string, spec: Record<string, unknown>) => {
    switch (kind) {
      case "ratio": {
        const numerator = asString(spec.numerator);
        const denominator = asString(spec.denominator);
        if (!numerator || !denominator) {
          return null;
        }
        return { type: "ratio", numerator, denominator } as const;
      }
      case "yoy_growth": {
        const source = asString(spec.source);
        if (!source) {
          return null;
        }
        return { type: "yoy_growth", source } as const;
      }
      case "cagr": {
        const source = asString(spec.source);
        const years = asNumber(spec.years);
        if (!source || years === null) {
          return null;
        }
        return { type: "cagr", source, years } as const;
      }
      case "per_share": {
        const source = asString(spec.source);
        const shares_key = asString(spec.shares_key ?? spec.sharesKey);
        if (!source || !shares_key) {
          return null;
        }
        return { type: "per_share", source, shares_key } as const;
      }
      case "simple": {
        const formula = asString(spec.formula);
        if (!formula) {
          return null;
        }
        return { type: "simple", formula } as const;
      }
      default:
        return null;
    }
  };

  const definitions: ComputedDefinition[] = [];
  for (const candidate of value) {
    const raw = asObject(candidate);
    if (raw === null) {
      continue;
    }
    const key = asString(raw.key);
    const label = asString(raw.label);
    const category = asString(raw.category);
    const unit = asString(raw.unit);
    const computationSpec = asObject(raw.computation);
    const computationType = asString(computationSpec?.type);
    if (
      !key ||
      !label ||
      !category ||
      !unit ||
      !computationSpec ||
      !computationType
    ) {
      continue;
    }
    const computation = parseComputation(computationType, computationSpec);
    if (computation === null) {
      continue;
    }
    const definition: ComputedDefinition = {
      key,
      label,
      category,
      order: asNumber(raw.order) ?? Number.MAX_SAFE_INTEGER,
      unit: unit as ComputedDefinition["unit"],
      computation,
      supported_cadences: normalizeStringArray(
        raw.supported_cadences ?? raw.supportedCadences,
      ) as ComputedDefinition["supported_cadences"],
      requires_external_data: normalizeStringArray(
        raw.requires_external_data ?? raw.requiresExternalData,
      ),
    };
    definitions.push(definition);
  }
  return definitions;
}
/**
 * Rebuilds a `NormalizationSummary` from untyped input.
 *
 * Each counter is read from its camelCase key, falling back to the
 * snake_case key, and defaults to 0 when neither converts via `asNumber`.
 * Returns null when the input is not an object.
 */
function normalizeNormalizationSummary(value: unknown) {
  const raw = asObject(value);
  if (raw === null) {
    return null;
  }
  // Reads one counter under either key spelling, defaulting to 0.
  const count = (camelKey: string, snakeKey: string) =>
    asNumber(raw[camelKey] ?? raw[snakeKey]) ?? 0;
  return {
    surfaceRowCount: count("surfaceRowCount", "surface_row_count"),
    detailRowCount: count("detailRowCount", "detail_row_count"),
    kpiRowCount: count("kpiRowCount", "kpi_row_count"),
    unmappedRowCount: count("unmappedRowCount", "unmapped_row_count"),
    materialUnmappedRowCount: count(
      "materialUnmappedRowCount",
      "material_unmapped_row_count",
    ),
    residualPrimaryCount: count(
      "residualPrimaryCount",
      "residual_primary_count",
    ),
    residualDisclosureCount: count(
      "residualDisclosureCount",
      "residual_disclosure_count",
    ),
    unsupportedConceptCount: count(
      "unsupportedConceptCount",
      "unsupported_concept_count",
    ),
    issuerOverlayMatchCount: count(
      "issuerOverlayMatchCount",
      "issuer_overlay_match_count",
    ),
    warnings: normalizeStringArray(raw.warnings),
  } satisfies NormalizationSummary;
}
/**
 * Normalizes every JSON payload field of a snapshot in one call.
 *
 * `statement_rows` falls back to the normalized `faithful_rows` (the same
 * object reference) when the supplied `statement_rows` is not an object.
 *
 * @param input Untyped payload fields, as read from a row or passed to the upsert.
 * @returns The normalized fields under the same snake_case keys.
 */
export function normalizeFilingTaxonomySnapshotPayload(input: {
  periods: unknown;
  faithful_rows: unknown;
  statement_rows: unknown;
  surface_rows: unknown;
  detail_rows: unknown;
  kpi_rows: unknown;
  computed_definitions: unknown;
  normalization_summary: unknown;
}) {
  const faithful_rows = normalizeStatementRows(input.faithful_rows);
  const statement_rows = normalizeStatementRows(
    input.statement_rows,
    faithful_rows,
  );
  const periods = normalizePeriods(input.periods);
  const surface_rows = normalizeSurfaceRows(input.surface_rows);
  const detail_rows = normalizeDetailRows(input.detail_rows);
  const kpi_rows = normalizeKpiRows(input.kpi_rows);
  const computed_definitions = normalizeComputedDefinitions(
    input.computed_definitions,
  );
  const normalization_summary = normalizeNormalizationSummary(
    input.normalization_summary,
  );
  return {
    periods,
    faithful_rows,
    statement_rows,
    surface_rows,
    detail_rows,
    kpi_rows,
    computed_definitions,
    normalization_summary,
  };
}
/** Creates a fresh statement-row map with an empty list for each statement kind. */
function emptyStatementRows(): StatementRowMap {
  const rows: StatementRowMap = {
    income: [],
    balance: [],
    cash_flow: [],
    disclosure: [],
    equity: [],
    comprehensive_income: [],
  };
  return rows;
}
/** Creates a fresh surface-row map with an empty list for each statement kind. */
function emptySurfaceRows(): SurfaceRowMap {
  const rows: SurfaceRowMap = {
    income: [],
    balance: [],
    cash_flow: [],
    disclosure: [],
    equity: [],
    comprehensive_income: [],
  };
  return rows;
}
/** Creates a fresh detail-row map with an empty group object for each statement kind. */
function emptyDetailRows(): DetailRowMap {
  const rows: DetailRowMap = {
    income: {},
    balance: {},
    cash_flow: {},
    disclosure: {},
    equity: {},
    comprehensive_income: {},
  };
  return rows;
}
/**
 * Converts a selected snapshot row into a `FilingTaxonomySnapshotRecord`.
 *
 * The JSON payload columns are passed through
 * `normalizeFilingTaxonomySnapshotPayload`; `derived_metrics`,
 * `validation_result` and `issuer_overlay_revision_id` default to null when
 * nullish; all other columns are copied through.
 */
function toSnapshotRecord(
  row: typeof filingTaxonomySnapshot.$inferSelect,
): FilingTaxonomySnapshotRecord {
  const payload = normalizeFilingTaxonomySnapshotPayload({
    periods: row.periods,
    faithful_rows: row.faithful_rows,
    statement_rows: row.statement_rows,
    surface_rows: row.surface_rows,
    detail_rows: row.detail_rows,
    kpi_rows: row.kpi_rows,
    computed_definitions: row.computed_definitions,
    normalization_summary: row.normalization_summary,
  });
  const {
    id,
    filing_id,
    ticker,
    filing_date,
    filing_type,
    parse_status,
    parse_error,
    source,
    parser_engine,
    parser_version,
    taxonomy_regime,
    fiscal_pack,
    facts_count,
    concepts_count,
    dimensions_count,
    created_at,
    updated_at,
  } = row;
  const record: FilingTaxonomySnapshotRecord = {
    id,
    filing_id,
    ticker,
    filing_date,
    filing_type,
    parse_status,
    parse_error,
    source,
    parser_engine,
    parser_version,
    taxonomy_regime,
    fiscal_pack,
    periods: payload.periods,
    faithful_rows: payload.faithful_rows,
    statement_rows: payload.statement_rows,
    surface_rows: payload.surface_rows,
    detail_rows: payload.detail_rows,
    kpi_rows: payload.kpi_rows,
    computed_definitions: payload.computed_definitions,
    derived_metrics: row.derived_metrics ?? null,
    validation_result: row.validation_result ?? null,
    normalization_summary: payload.normalization_summary,
    issuer_overlay_revision_id: row.issuer_overlay_revision_id ?? null,
    facts_count,
    concepts_count,
    dimensions_count,
    created_at,
    updated_at,
  };
  return record;
}
/**
 * Converts a selected context row into a `FilingTaxonomyContextRecord`.
 * `segment_json` and `scenario_json` default to null when nullish.
 */
function toContextRecord(
  row: typeof filingTaxonomyContext.$inferSelect,
): FilingTaxonomyContextRecord {
  const {
    id,
    snapshot_id,
    context_id,
    entity_identifier,
    entity_scheme,
    period_start,
    period_end,
    period_instant,
    created_at,
  } = row;
  const record: FilingTaxonomyContextRecord = {
    id,
    snapshot_id,
    context_id,
    entity_identifier,
    entity_scheme,
    period_start,
    period_end,
    period_instant,
    segment_json: row.segment_json ?? null,
    scenario_json: row.scenario_json ?? null,
    created_at,
  };
  return record;
}
/**
 * Converts a selected asset row into a `FilingTaxonomyAssetRecord`.
 * `score` is converted with `asNumber`; all other columns are copied through.
 */
function toAssetRecord(
  row: typeof filingTaxonomyAsset.$inferSelect,
): FilingTaxonomyAssetRecord {
  const { id, snapshot_id, asset_type, name, url, size_bytes } = row;
  const { is_selected, created_at } = row;
  const record: FilingTaxonomyAssetRecord = {
    id,
    snapshot_id,
    asset_type,
    name,
    url,
    size_bytes,
    score: asNumber(row.score),
    is_selected,
    created_at,
  };
  return record;
}
/**
 * Converts a selected concept row into a `FilingTaxonomyConceptRecord`.
 * `statement_kind` defaults to null when nullish and `presentation_order` is
 * converted with `asNumber`; all other columns are copied through.
 */
function toConceptRecord(
  row: typeof filingTaxonomyConcept.$inferSelect,
): FilingTaxonomyConceptRecord {
  const {
    id,
    snapshot_id,
    concept_key,
    qname,
    namespace_uri,
    local_name,
    label,
    is_extension,
    balance,
    period_type,
    data_type,
    role_uri,
    authoritative_concept_key,
    mapping_method,
    surface_key,
    detail_parent_surface_key,
    kpi_key,
    residual_flag,
    presentation_depth,
    parent_concept_key,
    is_abstract,
    created_at,
  } = row;
  const record: FilingTaxonomyConceptRecord = {
    id,
    snapshot_id,
    concept_key,
    qname,
    namespace_uri,
    local_name,
    label,
    is_extension,
    balance,
    period_type,
    data_type,
    statement_kind: row.statement_kind ?? null,
    role_uri,
    authoritative_concept_key,
    mapping_method,
    surface_key,
    detail_parent_surface_key,
    kpi_key,
    residual_flag,
    presentation_order: asNumber(row.presentation_order),
    presentation_depth,
    parent_concept_key,
    is_abstract,
    created_at,
  };
  return record;
}
/**
 * Converts a selected fact row into a `FilingTaxonomyFactRecord`.
 *
 * `value_num` is converted with `asNumber` and `statement_kind` defaults to
 * null when nullish; all other columns are copied through.
 *
 * @throws Error when `value_num` cannot be converted to a finite number.
 */
function toFactRecord(
  row: typeof filingTaxonomyFact.$inferSelect,
): FilingTaxonomyFactRecord {
  const numericValue = asNumber(row.value_num);
  if (numericValue === null) {
    throw new Error(`Invalid value_num for taxonomy fact row ${row.id}`);
  }
  const {
    id,
    snapshot_id,
    concept_key,
    qname,
    namespace_uri,
    local_name,
    data_type,
    role_uri,
    authoritative_concept_key,
    mapping_method,
    surface_key,
    detail_parent_surface_key,
    kpi_key,
    residual_flag,
    context_id,
    unit,
    decimals,
    precision,
    nil,
    period_start,
    period_end,
    period_instant,
    dimensions,
    is_dimensionless,
    source_file,
    created_at,
  } = row;
  const record: FilingTaxonomyFactRecord = {
    id,
    snapshot_id,
    concept_key,
    qname,
    namespace_uri,
    local_name,
    data_type,
    statement_kind: row.statement_kind ?? null,
    role_uri,
    authoritative_concept_key,
    mapping_method,
    surface_key,
    detail_parent_surface_key,
    kpi_key,
    residual_flag,
    context_id,
    unit,
    decimals,
    precision,
    nil,
    value_num: numericValue,
    period_start,
    period_end,
    period_instant,
    dimensions,
    is_dimensionless,
    source_file,
    created_at,
  };
  return record;
}
/**
 * Converts a selected metric-validation row into a
 * `FilingTaxonomyMetricValidationRecord`.
 *
 * The value and diff columns are converted with `asNumber`, and
 * `evidence_pages` defaults to an empty array when nullish.
 */
function toMetricValidationRecord(
  row: typeof filingTaxonomyMetricValidation.$inferSelect,
): FilingTaxonomyMetricValidationRecord {
  const { id, snapshot_id, metric_key, status } = row;
  const { pdf_url, provider, model, error, created_at, updated_at } = row;
  const record: FilingTaxonomyMetricValidationRecord = {
    id,
    snapshot_id,
    metric_key,
    taxonomy_value: asNumber(row.taxonomy_value),
    llm_value: asNumber(row.llm_value),
    absolute_diff: asNumber(row.absolute_diff),
    relative_diff: asNumber(row.relative_diff),
    status,
    evidence_pages: row.evidence_pages ?? [],
    pdf_url,
    provider,
    model,
    error,
    created_at,
    updated_at,
  };
  return record;
}
/**
 * Loads the taxonomy snapshot stored for a filing.
 *
 * @param filingId Value matched against the snapshot's `filing_id` column.
 * @returns The snapshot as a normalized record, or null when none exists.
 */
export async function getFilingTaxonomySnapshotByFilingId(filingId: number) {
  const matches = await db
    .select()
    .from(filingTaxonomySnapshot)
    .where(eq(filingTaxonomySnapshot.filing_id, filingId))
    .limit(1);
  const [match] = matches;
  if (!match) {
    return null;
  }
  return toSnapshotRecord(match);
}
/** Lists the asset rows of a snapshot, highest id first, as asset records. */
async function listFilingTaxonomyAssets(snapshotId: number) {
  const belongsToSnapshot = eq(filingTaxonomyAsset.snapshot_id, snapshotId);
  const assetRows = await db
    .select()
    .from(filingTaxonomyAsset)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyAsset.id));
  return assetRows.map((assetRow) => toAssetRecord(assetRow));
}
/** Lists the context rows of a snapshot, highest id first, as context records. */
async function listFilingTaxonomyContexts(snapshotId: number) {
  const belongsToSnapshot = eq(filingTaxonomyContext.snapshot_id, snapshotId);
  const contextRows = await db
    .select()
    .from(filingTaxonomyContext)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyContext.id));
  return contextRows.map((contextRow) => toContextRecord(contextRow));
}
/** Lists the concept rows of a snapshot, highest id first, as concept records. */
async function listFilingTaxonomyConcepts(snapshotId: number) {
  const belongsToSnapshot = eq(filingTaxonomyConcept.snapshot_id, snapshotId);
  const conceptRows = await db
    .select()
    .from(filingTaxonomyConcept)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyConcept.id));
  return conceptRows.map((conceptRow) => toConceptRecord(conceptRow));
}
/**
 * Lists the fact rows of a snapshot, highest id first, as fact records.
 * Rejects if any row fails conversion in `toFactRecord`.
 */
async function listFilingTaxonomyFacts(snapshotId: number) {
  const belongsToSnapshot = eq(filingTaxonomyFact.snapshot_id, snapshotId);
  const factRows = await db
    .select()
    .from(filingTaxonomyFact)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyFact.id));
  return factRows.map((factRow) => toFactRecord(factRow));
}
/** Lists the metric-validation rows of a snapshot, highest id first, as records. */
async function listFilingTaxonomyMetricValidations(snapshotId: number) {
  const belongsToSnapshot = eq(
    filingTaxonomyMetricValidation.snapshot_id,
    snapshotId,
  );
  const validationRows = await db
    .select()
    .from(filingTaxonomyMetricValidation)
    .where(belongsToSnapshot)
    .orderBy(desc(filingTaxonomyMetricValidation.id));
  return validationRows.map((validationRow) =>
    toMetricValidationRecord(validationRow),
  );
}
export async function upsertFilingTaxonomySnapshot(
input: UpsertFilingTaxonomySnapshotInput,
) {
const now = new Date().toISOString();
const normalized = normalizeFilingTaxonomySnapshotPayload(input);
return db.transaction(async (tx) => {
const [saved] = await tx
.insert(filingTaxonomySnapshot)
.values({
filing_id: input.filing_id,
ticker: input.ticker,
filing_date: input.filing_date,
filing_type: input.filing_type,
parse_status: input.parse_status,
parse_error: input.parse_error,
source: input.source,
parser_engine: input.parser_engine,
parser_version: input.parser_version,
taxonomy_regime: input.taxonomy_regime,
fiscal_pack: input.fiscal_pack,
periods: normalized.periods,
faithful_rows: normalized.faithful_rows,
statement_rows: normalized.statement_rows,
surface_rows: normalized.surface_rows,
detail_rows: normalized.detail_rows,
kpi_rows: normalized.kpi_rows,
computed_definitions: normalized.computed_definitions,
derived_metrics: input.derived_metrics,
validation_result: input.validation_result,
normalization_summary: normalized.normalization_summary,
issuer_overlay_revision_id: input.issuer_overlay_revision_id ?? null,
facts_count: input.facts_count,
concepts_count: input.concepts_count,
dimensions_count: input.dimensions_count,
created_at: now,
updated_at: now,
})
.onConflictDoUpdate({
target: filingTaxonomySnapshot.filing_id,
set: {
ticker: input.ticker,
filing_date: input.filing_date,
filing_type: input.filing_type,
parse_status: input.parse_status,
parse_error: input.parse_error,
source: input.source,
parser_engine: input.parser_engine,
parser_version: input.parser_version,
taxonomy_regime: input.taxonomy_regime,
fiscal_pack: input.fiscal_pack,
periods: normalized.periods,
faithful_rows: normalized.faithful_rows,
statement_rows: normalized.statement_rows,
surface_rows: normalized.surface_rows,
detail_rows: normalized.detail_rows,
kpi_rows: normalized.kpi_rows,
computed_definitions: normalized.computed_definitions,
derived_metrics: input.derived_metrics,
validation_result: input.validation_result,
normalization_summary: normalized.normalization_summary,
issuer_overlay_revision_id: input.issuer_overlay_revision_id ?? null,
facts_count: input.facts_count,
concepts_count: input.concepts_count,
dimensions_count: input.dimensions_count,
updated_at: now,
},
})
.returning();
const snapshotId = saved.id;
try {
await tx
.delete(filingTaxonomyAsset)
.where(eq(filingTaxonomyAsset.snapshot_id, snapshotId));
await tx
.delete(filingTaxonomyContext)
.where(eq(filingTaxonomyContext.snapshot_id, snapshotId));
await tx
.delete(filingTaxonomyConcept)
.where(eq(filingTaxonomyConcept.snapshot_id, snapshotId));
await tx
.delete(filingTaxonomyFact)
.where(eq(filingTaxonomyFact.snapshot_id, snapshotId));
await tx
.delete(filingTaxonomyMetricValidation)
.where(eq(filingTaxonomyMetricValidation.snapshot_id, snapshotId));
} catch (error) {
throw new Error(
`Failed to delete child records for snapshot ${snapshotId}: ${error}`,
);
}
if (input.contexts.length > 0) {
try {
await tx.insert(filingTaxonomyContext).values(
input.contexts.map((context) => ({
snapshot_id: snapshotId,
context_id: context.context_id,
entity_identifier: context.entity_identifier,
entity_scheme: context.entity_scheme,
period_start: context.period_start,
period_end: context.period_end,
period_instant: context.period_instant,
segment_json: context.segment_json,
scenario_json: context.scenario_json,
created_at: now,
})),
);
} catch (error) {
throw new Error(
`Failed to insert ${input.contexts.length} contexts for snapshot ${snapshotId}: ${error}`,
);
}
}
if (input.assets.length > 0) {
try {
await tx.insert(filingTaxonomyAsset).values(
input.assets.map((asset) => ({
snapshot_id: snapshotId,
asset_type: asset.asset_type,
name: asset.name,
url: asset.url,
size_bytes: asset.size_bytes,
score: asNumericText(asset.score),
is_selected: asset.is_selected,
created_at: now,
})),
);
} catch (error) {
throw new Error(
`Failed to insert ${input.assets.length} assets for snapshot ${snapshotId}: ${error}`,
);
}
}
if (input.concepts.length > 0) {
try {
await tx.insert(filingTaxonomyConcept).values(
input.concepts.map((concept) => ({
snapshot_id: snapshotId,
concept_key: concept.concept_key,
qname: concept.qname,
namespace_uri: concept.namespace_uri,
local_name: concept.local_name,
label: concept.label,
is_extension: concept.is_extension,
balance: concept.balance,
period_type: concept.period_type,
data_type: concept.data_type,
statement_kind: concept.statement_kind,
role_uri: concept.role_uri,
authoritative_concept_key: concept.authoritative_concept_key,
mapping_method: concept.mapping_method,
surface_key: concept.surface_key,
detail_parent_surface_key: concept.detail_parent_surface_key,
kpi_key: concept.kpi_key,
residual_flag: concept.residual_flag,
presentation_order: asNumericText(concept.presentation_order),
presentation_depth: concept.presentation_depth,
parent_concept_key: concept.parent_concept_key,
is_abstract: concept.is_abstract,
created_at: now,
})),
);
} catch (error) {
throw new Error(
`Failed to insert ${input.concepts.length} concepts for snapshot ${snapshotId}: ${error}`,
);
}
}
if (input.facts.length > 0) {
try {
await tx.insert(filingTaxonomyFact).values(
input.facts.map((fact) => ({
snapshot_id: snapshotId,
concept_key: fact.concept_key,
qname: fact.qname,
namespace_uri: fact.namespace_uri,
local_name: fact.local_name,
data_type: fact.data_type,
statement_kind: fact.statement_kind,
role_uri: fact.role_uri,
authoritative_concept_key: fact.authoritative_concept_key,
mapping_method: fact.mapping_method,
surface_key: fact.surface_key,
detail_parent_surface_key: fact.detail_parent_surface_key,
kpi_key: fact.kpi_key,
residual_flag: fact.residual_flag,
context_id: fact.context_id,
unit: fact.unit,
decimals: fact.decimals,
precision: fact.precision,
nil: fact.nil,
value_num: String(fact.value_num),
period_start: fact.period_start,
period_end: fact.period_end,
period_instant: fact.period_instant,
dimensions: fact.dimensions,
is_dimensionless: fact.is_dimensionless,
source_file: fact.source_file,
created_at: now,
})),
);
} catch (error) {
throw new Error(
`Failed to insert ${input.facts.length} facts for snapshot ${snapshotId}: ${error}`,
);
}
}
if (input.metric_validations.length > 0) {
try {
await tx.insert(filingTaxonomyMetricValidation).values(
input.metric_validations.map((check) => ({
snapshot_id: snapshotId,
metric_key: check.metric_key,
taxonomy_value: asNumericText(check.taxonomy_value),
llm_value: asNumericText(check.llm_value),
absolute_diff: asNumericText(check.absolute_diff),
relative_diff: asNumericText(check.relative_diff),
status: check.status,
evidence_pages: check.evidence_pages,
pdf_url: check.pdf_url,
provider: check.provider,
model: check.model,
error: check.error,
created_at: now,
updated_at: now,
})),
);
} catch (error) {
throw new Error(
`Failed to insert ${input.metric_validations.length} metric validations for snapshot ${snapshotId}: ${error}`,
);
}
}
return toSnapshotRecord(saved);
});
}
/**
 * Lists taxonomy snapshots for a ticker one page at a time, ordered by
 * `filing_date` descending with `id` descending as the tie-breaker.
 *
 * @param input.ticker Ticker symbol; trimmed and upper-cased before matching.
 * @param input.window `"10y"` keeps only snapshots whose `filing_date` is on or
 *   after `tenYearsAgoIso()`; `"all"` applies no date filter.
 * @param input.filingTypes When non-empty, restricts results to these filing types.
 * @param input.limit Requested page size; defaults to 40, is truncated to an
 *   integer and clamped to the range 1..120. A NaN value falls back to 40.
 * @param input.cursor The `nextCursor` returned with the previous page (the id
 *   of that page's last snapshot), or null/undefined for the first page.
 * @returns The page of snapshot records plus `nextCursor`, which is `null`
 *   when no further rows exist.
 */
export async function listFilingTaxonomySnapshotsByTicker(input: {
  ticker: string;
  window: "10y" | "all";
  filingTypes?: Array<"10-K" | "10-Q">;
  limit?: number;
  cursor?: string | null;
}) {
  const requestedLimit = Math.trunc(input.limit ?? 40);
  // Math.min/Math.max propagate NaN, so guard before clamping.
  const safeLimit = Number.isNaN(requestedLimit)
    ? 40
    : Math.min(Math.max(requestedLimit, 1), 120);
  const cursorId = input.cursor ? Number.parseInt(input.cursor, 10) : null;
  const constraints = [
    eq(filingTaxonomySnapshot.ticker, input.ticker.trim().toUpperCase()),
  ];
  if (input.window === "10y") {
    constraints.push(gte(filingTaxonomySnapshot.filing_date, tenYearsAgoIso()));
  }
  if (cursorId !== null && Number.isFinite(cursorId) && cursorId > 0) {
    // The result set is sorted by (filing_date, id), so "rows after the
    // cursor" must be expressed on that same pair. Filtering on id alone skips
    // or repeats rows whenever ids are not monotonic with filing dates.
    const [cursorRow] = await db
      .select({ filing_date: filingTaxonomySnapshot.filing_date })
      .from(filingTaxonomySnapshot)
      .where(eq(filingTaxonomySnapshot.id, cursorId))
      .limit(1);
    if (cursorRow) {
      constraints.push(
        sql`(${filingTaxonomySnapshot.filing_date} < ${cursorRow.filing_date} or (${filingTaxonomySnapshot.filing_date} = ${cursorRow.filing_date} and ${filingTaxonomySnapshot.id} < ${cursorId}))`,
      );
    } else {
      // The cursor row no longer exists; fall back to the id-only bound.
      constraints.push(lt(filingTaxonomySnapshot.id, cursorId));
    }
  }
  if (input.filingTypes && input.filingTypes.length > 0) {
    constraints.push(
      inArray(filingTaxonomySnapshot.filing_type, input.filingTypes),
    );
  }
  // Fetch one extra row to learn whether another page exists.
  const rows = await db
    .select()
    .from(filingTaxonomySnapshot)
    .where(and(...constraints))
    .orderBy(
      desc(filingTaxonomySnapshot.filing_date),
      desc(filingTaxonomySnapshot.id),
    )
    .limit(safeLimit + 1);
  const hasMore = rows.length > safeLimit;
  const usedRows = hasMore ? rows.slice(0, safeLimit) : rows;
  const nextCursor = hasMore
    ? String(usedRows[usedRows.length - 1]?.id ?? "")
    : null;
  return {
    snapshots: usedRows.map(toSnapshotRecord),
    nextCursor,
  };
}
/**
 * Counts a ticker's taxonomy snapshots grouped by parse status.
 *
 * @param ticker Ticker symbol; trimmed and upper-cased before matching.
 * @returns A record with a count for each of `ready`, `partial` and `failed`;
 *   statuses with no matching snapshots stay at 0.
 */
export async function countFilingTaxonomySnapshotStatuses(ticker: string) {
  const grouped = await db
    .select({
      status: filingTaxonomySnapshot.parse_status,
      count: sql<string>`count(*)`,
    })
    .from(filingTaxonomySnapshot)
    .where(eq(filingTaxonomySnapshot.ticker, ticker.trim().toUpperCase()))
    .groupBy(filingTaxonomySnapshot.parse_status);

  // Start every status at zero so absent groups are still reported.
  const totals: Record<FilingTaxonomyParseStatus, number> = {
    ready: 0,
    partial: 0,
    failed: 0,
  };
  for (const entry of grouped) {
    // count(*) is typed as a string here, so convert it to a number.
    totals[entry.status] = Number(entry.count);
  }
  return totals;
}
/**
 * Lists taxonomy facts for a ticker one page at a time, ordered by fact id
 * descending, joined to their owning snapshot.
 *
 * @param input.ticker Ticker symbol; trimmed and upper-cased before matching.
 * @param input.window `"10y"` keeps only facts whose snapshot `filing_date` is
 *   on or after `tenYearsAgoIso()`; `"all"` applies no date filter.
 * @param input.statement When set, restricts facts to this statement kind.
 * @param input.filingTypes When non-empty, restricts to snapshots of these filing types.
 * @param input.cursor The `nextCursor` returned with the previous page (a fact
 *   id); only facts with a smaller id are returned.
 * @param input.limit Requested page size; defaults to 500, truncated to an
 *   integer and clamped to the range 1..10000.
 * @returns The page of facts plus `nextCursor`, which is `null` when no
 *   further rows exist.
 * @throws Error when a stored `value_num` cannot be converted by `asNumber`.
 */
export async function listTaxonomyFactsByTicker(input: {
  ticker: string;
  window: "10y" | "all";
  statement?: FinancialStatementKind;
  filingTypes?: Array<"10-K" | "10-Q">;
  cursor?: string | null;
  limit?: number;
}) {
  const pageSize = Math.min(
    Math.max(Math.trunc(input.limit ?? 500), 1),
    10000,
  );
  const beforeFactId = input.cursor ? Number.parseInt(input.cursor, 10) : null;

  const filters = [
    eq(filingTaxonomySnapshot.ticker, input.ticker.trim().toUpperCase()),
  ];
  if (input.window === "10y") {
    filters.push(gte(filingTaxonomySnapshot.filing_date, tenYearsAgoIso()));
  }
  if (input.statement) {
    filters.push(eq(filingTaxonomyFact.statement_kind, input.statement));
  }
  if (input.filingTypes && input.filingTypes.length > 0) {
    filters.push(
      inArray(filingTaxonomySnapshot.filing_type, input.filingTypes),
    );
  }
  if (beforeFactId && Number.isFinite(beforeFactId) && beforeFactId > 0) {
    filters.push(lt(filingTaxonomyFact.id, beforeFactId));
  }

  // Columns come from the fact table, apart from filing_id and filing_date,
  // which are read from the joined snapshot.
  const selection = {
    id: filingTaxonomyFact.id,
    snapshot_id: filingTaxonomyFact.snapshot_id,
    filing_id: filingTaxonomySnapshot.filing_id,
    filing_date: filingTaxonomySnapshot.filing_date,
    statement_kind: filingTaxonomyFact.statement_kind,
    role_uri: filingTaxonomyFact.role_uri,
    concept_key: filingTaxonomyFact.concept_key,
    qname: filingTaxonomyFact.qname,
    namespace_uri: filingTaxonomyFact.namespace_uri,
    local_name: filingTaxonomyFact.local_name,
    value_num: filingTaxonomyFact.value_num,
    context_id: filingTaxonomyFact.context_id,
    unit: filingTaxonomyFact.unit,
    decimals: filingTaxonomyFact.decimals,
    period_start: filingTaxonomyFact.period_start,
    period_end: filingTaxonomyFact.period_end,
    period_instant: filingTaxonomyFact.period_instant,
    dimensions: filingTaxonomyFact.dimensions,
    is_dimensionless: filingTaxonomyFact.is_dimensionless,
    source_file: filingTaxonomyFact.source_file,
  };

  // Fetch one extra row to learn whether another page exists.
  const fetched = await db
    .select(selection)
    .from(filingTaxonomyFact)
    .innerJoin(
      filingTaxonomySnapshot,
      eq(filingTaxonomyFact.snapshot_id, filingTaxonomySnapshot.id),
    )
    .where(and(...filters))
    .orderBy(desc(filingTaxonomyFact.id))
    .limit(pageSize + 1);

  const hasMore = fetched.length > pageSize;
  const pageRows = hasMore ? fetched.slice(0, pageSize) : fetched;
  const nextCursor = hasMore
    ? String(pageRows[pageRows.length - 1]?.id ?? "")
    : null;

  const facts: TaxonomyFactRow[] = [];
  for (const record of pageRows) {
    const numericValue = asNumber(record.value_num);
    if (numericValue === null) {
      throw new Error(`Invalid value_num in taxonomy fact ${record.id}`);
    }
    // Translate snake_case column names into the camelCase row shape.
    facts.push({
      id: record.id,
      snapshotId: record.snapshot_id,
      filingId: record.filing_id,
      filingDate: record.filing_date,
      statement: record.statement_kind,
      roleUri: record.role_uri,
      conceptKey: record.concept_key,
      qname: record.qname,
      namespaceUri: record.namespace_uri,
      localName: record.local_name,
      value: numericValue,
      contextId: record.context_id,
      unit: record.unit,
      decimals: record.decimals,
      periodStart: record.period_start,
      periodEnd: record.period_end,
      periodInstant: record.period_instant,
      dimensions: record.dimensions,
      isDimensionless: record.is_dimensionless,
      sourceFile: record.source_file,
    });
  }

  return { facts, nextCursor };
}
/**
 * Loads the taxonomy assets belonging to the given snapshots, ordered by
 * asset id descending, and converts each row with `toAssetRecord`.
 *
 * @param snapshotIds Snapshot ids to look up; an empty list returns `[]`
 *   without running a query.
 */
async function listTaxonomyAssetsBySnapshotIds(snapshotIds: number[]) {
  if (snapshotIds.length > 0) {
    const assetRows = await db
      .select()
      .from(filingTaxonomyAsset)
      .where(inArray(filingTaxonomyAsset.snapshot_id, snapshotIds))
      .orderBy(desc(filingTaxonomyAsset.id));
    return assetRows.map(toAssetRecord);
  }
  return [];
}
/**
 * Loads the taxonomy concepts belonging to the given snapshots, ordered by
 * concept id descending, and converts each row with `toConceptRecord`.
 *
 * @param snapshotIds Snapshot ids to look up; an empty list returns `[]`
 *   without running a query.
 */
export async function listFilingTaxonomyConceptsBySnapshotIds(
  snapshotIds: number[],
) {
  if (snapshotIds.length > 0) {
    const conceptRows = await db
      .select()
      .from(filingTaxonomyConcept)
      .where(inArray(filingTaxonomyConcept.snapshot_id, snapshotIds))
      .orderBy(desc(filingTaxonomyConcept.id));
    return conceptRows.map(toConceptRecord);
  }
  return [];
}
/**
 * Exposes two module-private helpers under a single exported object.
 * NOTE(review): presumably exported so tests can reach them — confirm against callers.
 */
export const __filingTaxonomyInternals = {
  normalizeFilingTaxonomySnapshotPayload:
    normalizeFilingTaxonomySnapshotPayload,
  toSnapshotRecord: toSnapshotRecord,
};