Fix P0 issues in financial ingestion architecture
Some checks failed
PR Checks / typecheck-and-build (pull_request) Has been cancelled
Taxonomy Sidecar / taxonomy-sidecar (pull_request) Has been cancelled

- Wrap snapshot updates in transactions with error context for each child table
- Add sidecar retry with exponential backoff (3 attempts, 2s base, 10s max, 30% jitter)
- Add HTTP timeout (30s per request) and SEC rate limiting (10 req/s) in Rust
- Add XBRL validation with status reporting (checks root element, tag balance)
This commit is contained in:
2026-03-15 16:51:32 -04:00
parent edf1cfb421
commit 4313058d65
9 changed files with 468 additions and 142 deletions

View File

@@ -962,10 +962,8 @@ export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySn
const now = new Date().toISOString();
const normalized = normalizeFilingTaxonomySnapshotPayload(input);
const [saved] = await withFinancialIngestionSchemaRetry({
client: getSqliteClient(),
context: 'filing-taxonomy-snapshot-upsert',
operation: async () => await db
return db.transaction(async (tx) => {
const [saved] = await tx
.insert(filingTaxonomySnapshot)
.values({
filing_id: input.filing_id,
@@ -1022,125 +1020,149 @@ export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySn
updated_at: now
}
})
.returning()
.returning();
const snapshotId = saved.id;
try {
await tx.delete(filingTaxonomyAsset).where(eq(filingTaxonomyAsset.snapshot_id, snapshotId));
await tx.delete(filingTaxonomyContext).where(eq(filingTaxonomyContext.snapshot_id, snapshotId));
await tx.delete(filingTaxonomyConcept).where(eq(filingTaxonomyConcept.snapshot_id, snapshotId));
await tx.delete(filingTaxonomyFact).where(eq(filingTaxonomyFact.snapshot_id, snapshotId));
await tx.delete(filingTaxonomyMetricValidation).where(eq(filingTaxonomyMetricValidation.snapshot_id, snapshotId));
} catch (error) {
throw new Error(`Failed to delete child records for snapshot ${snapshotId}: ${error}`);
}
if (input.contexts.length > 0) {
try {
await tx.insert(filingTaxonomyContext).values(input.contexts.map((context) => ({
snapshot_id: snapshotId,
context_id: context.context_id,
entity_identifier: context.entity_identifier,
entity_scheme: context.entity_scheme,
period_start: context.period_start,
period_end: context.period_end,
period_instant: context.period_instant,
segment_json: context.segment_json,
scenario_json: context.scenario_json,
created_at: now
})));
} catch (error) {
throw new Error(`Failed to insert ${input.contexts.length} contexts for snapshot ${snapshotId}: ${error}`);
}
}
if (input.assets.length > 0) {
try {
await tx.insert(filingTaxonomyAsset).values(input.assets.map((asset) => ({
snapshot_id: snapshotId,
asset_type: asset.asset_type,
name: asset.name,
url: asset.url,
size_bytes: asset.size_bytes,
score: asNumericText(asset.score),
is_selected: asset.is_selected,
created_at: now
})));
} catch (error) {
throw new Error(`Failed to insert ${input.assets.length} assets for snapshot ${snapshotId}: ${error}`);
}
}
if (input.concepts.length > 0) {
try {
await tx.insert(filingTaxonomyConcept).values(input.concepts.map((concept) => ({
snapshot_id: snapshotId,
concept_key: concept.concept_key,
qname: concept.qname,
namespace_uri: concept.namespace_uri,
local_name: concept.local_name,
label: concept.label,
is_extension: concept.is_extension,
balance: concept.balance,
period_type: concept.period_type,
data_type: concept.data_type,
statement_kind: concept.statement_kind,
role_uri: concept.role_uri,
authoritative_concept_key: concept.authoritative_concept_key,
mapping_method: concept.mapping_method,
surface_key: concept.surface_key,
detail_parent_surface_key: concept.detail_parent_surface_key,
kpi_key: concept.kpi_key,
residual_flag: concept.residual_flag,
presentation_order: asNumericText(concept.presentation_order),
presentation_depth: concept.presentation_depth,
parent_concept_key: concept.parent_concept_key,
is_abstract: concept.is_abstract,
created_at: now
})));
} catch (error) {
throw new Error(`Failed to insert ${input.concepts.length} concepts for snapshot ${snapshotId}: ${error}`);
}
}
if (input.facts.length > 0) {
try {
await tx.insert(filingTaxonomyFact).values(input.facts.map((fact) => ({
snapshot_id: snapshotId,
concept_key: fact.concept_key,
qname: fact.qname,
namespace_uri: fact.namespace_uri,
local_name: fact.local_name,
data_type: fact.data_type,
statement_kind: fact.statement_kind,
role_uri: fact.role_uri,
authoritative_concept_key: fact.authoritative_concept_key,
mapping_method: fact.mapping_method,
surface_key: fact.surface_key,
detail_parent_surface_key: fact.detail_parent_surface_key,
kpi_key: fact.kpi_key,
residual_flag: fact.residual_flag,
context_id: fact.context_id,
unit: fact.unit,
decimals: fact.decimals,
precision: fact.precision,
nil: fact.nil,
value_num: String(fact.value_num),
period_start: fact.period_start,
period_end: fact.period_end,
period_instant: fact.period_instant,
dimensions: fact.dimensions,
is_dimensionless: fact.is_dimensionless,
source_file: fact.source_file,
created_at: now
})));
} catch (error) {
throw new Error(`Failed to insert ${input.facts.length} facts for snapshot ${snapshotId}: ${error}`);
}
}
if (input.metric_validations.length > 0) {
try {
await tx.insert(filingTaxonomyMetricValidation).values(input.metric_validations.map((check) => ({
snapshot_id: snapshotId,
metric_key: check.metric_key,
taxonomy_value: asNumericText(check.taxonomy_value),
llm_value: asNumericText(check.llm_value),
absolute_diff: asNumericText(check.absolute_diff),
relative_diff: asNumericText(check.relative_diff),
status: check.status,
evidence_pages: check.evidence_pages,
pdf_url: check.pdf_url,
provider: check.provider,
model: check.model,
error: check.error,
created_at: now,
updated_at: now
})));
} catch (error) {
throw new Error(`Failed to insert ${input.metric_validations.length} metric validations for snapshot ${snapshotId}: ${error}`);
}
}
return toSnapshotRecord(saved);
});
const snapshotId = saved.id;
await db.delete(filingTaxonomyAsset).where(eq(filingTaxonomyAsset.snapshot_id, snapshotId));
await db.delete(filingTaxonomyContext).where(eq(filingTaxonomyContext.snapshot_id, snapshotId));
await db.delete(filingTaxonomyConcept).where(eq(filingTaxonomyConcept.snapshot_id, snapshotId));
await db.delete(filingTaxonomyFact).where(eq(filingTaxonomyFact.snapshot_id, snapshotId));
await db.delete(filingTaxonomyMetricValidation).where(eq(filingTaxonomyMetricValidation.snapshot_id, snapshotId));
if (input.contexts.length > 0) {
await db.insert(filingTaxonomyContext).values(input.contexts.map((context) => ({
snapshot_id: snapshotId,
context_id: context.context_id,
entity_identifier: context.entity_identifier,
entity_scheme: context.entity_scheme,
period_start: context.period_start,
period_end: context.period_end,
period_instant: context.period_instant,
segment_json: context.segment_json,
scenario_json: context.scenario_json,
created_at: now
})));
}
if (input.assets.length > 0) {
await db.insert(filingTaxonomyAsset).values(input.assets.map((asset) => ({
snapshot_id: snapshotId,
asset_type: asset.asset_type,
name: asset.name,
url: asset.url,
size_bytes: asset.size_bytes,
score: asNumericText(asset.score),
is_selected: asset.is_selected,
created_at: now
})));
}
if (input.concepts.length > 0) {
await db.insert(filingTaxonomyConcept).values(input.concepts.map((concept) => ({
snapshot_id: snapshotId,
concept_key: concept.concept_key,
qname: concept.qname,
namespace_uri: concept.namespace_uri,
local_name: concept.local_name,
label: concept.label,
is_extension: concept.is_extension,
balance: concept.balance,
period_type: concept.period_type,
data_type: concept.data_type,
statement_kind: concept.statement_kind,
role_uri: concept.role_uri,
authoritative_concept_key: concept.authoritative_concept_key,
mapping_method: concept.mapping_method,
surface_key: concept.surface_key,
detail_parent_surface_key: concept.detail_parent_surface_key,
kpi_key: concept.kpi_key,
residual_flag: concept.residual_flag,
presentation_order: asNumericText(concept.presentation_order),
presentation_depth: concept.presentation_depth,
parent_concept_key: concept.parent_concept_key,
is_abstract: concept.is_abstract,
created_at: now
})));
}
if (input.facts.length > 0) {
await db.insert(filingTaxonomyFact).values(input.facts.map((fact) => ({
snapshot_id: snapshotId,
concept_key: fact.concept_key,
qname: fact.qname,
namespace_uri: fact.namespace_uri,
local_name: fact.local_name,
data_type: fact.data_type,
statement_kind: fact.statement_kind,
role_uri: fact.role_uri,
authoritative_concept_key: fact.authoritative_concept_key,
mapping_method: fact.mapping_method,
surface_key: fact.surface_key,
detail_parent_surface_key: fact.detail_parent_surface_key,
kpi_key: fact.kpi_key,
residual_flag: fact.residual_flag,
context_id: fact.context_id,
unit: fact.unit,
decimals: fact.decimals,
precision: fact.precision,
nil: fact.nil,
value_num: String(fact.value_num),
period_start: fact.period_start,
period_end: fact.period_end,
period_instant: fact.period_instant,
dimensions: fact.dimensions,
is_dimensionless: fact.is_dimensionless,
source_file: fact.source_file,
created_at: now
})));
}
if (input.metric_validations.length > 0) {
await db.insert(filingTaxonomyMetricValidation).values(input.metric_validations.map((check) => ({
snapshot_id: snapshotId,
metric_key: check.metric_key,
taxonomy_value: asNumericText(check.taxonomy_value),
llm_value: asNumericText(check.llm_value),
absolute_diff: asNumericText(check.absolute_diff),
relative_diff: asNumericText(check.relative_diff),
status: check.status,
evidence_pages: check.evidence_pages,
pdf_url: check.pdf_url,
provider: check.provider,
model: check.model,
error: check.error,
created_at: now,
updated_at: now
})));
}
return toSnapshotRecord(saved);
}
export async function listFilingTaxonomySnapshotsByTicker(input: {

View File

@@ -49,6 +49,9 @@ function createHydrationResult(): TaxonomyHydrationResult {
unmapped_row_count: 0,
material_unmapped_row_count: 0,
warnings: ['rust_warning']
},
xbrl_validation: {
status: 'passed'
}
};
}

View File

@@ -1,6 +1,7 @@
import { existsSync } from 'node:fs';
import { join } from 'node:path';
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
import { withRetry } from '@/lib/server/utils/retry';
function candidateBinaryPaths() {
return [
@@ -23,6 +24,10 @@ export function resolveFiscalXbrlBinary() {
/**
 * Hydrates a filing taxonomy snapshot by invoking the XBRL sidecar,
 * wrapping the underlying call in `withRetry` so transient failures
 * (per the retry module's default retryable-error patterns) are retried
 * with exponential backoff before surfacing to the caller.
 *
 * @param input - Filing identifiers and hydration parameters for the sidecar.
 * @returns The hydration result produced by the sidecar invocation.
 */
export async function hydrateFilingTaxonomySnapshotFromSidecar(
  input: TaxonomyHydrationInput
): Promise<TaxonomyHydrationResult> {
  return withRetry(() => hydrateFromSidecarImpl(input));
}
async function hydrateFromSidecarImpl(input: TaxonomyHydrationInput): Promise<TaxonomyHydrationResult> {
const binary = resolveFiscalXbrlBinary();
const timeoutMs = Math.max(Number(process.env.XBRL_ENGINE_TIMEOUT_MS ?? 45_000), 1_000);
const command = [binary, 'hydrate-filing'];

View File

@@ -204,6 +204,11 @@ export type TaxonomyHydrationNormalizationSummary = {
warnings: string[];
};
/**
 * Outcome of XBRL document validation reported by the sidecar.
 * `status` is the overall verdict; `message` carries human-readable
 * detail when the status is not a clean pass.
 */
export type XbrlValidationResult = {
  status: 'passed' | 'warning' | 'error';
  message?: string;
};
export type TaxonomyHydrationInput = {
filingId: number;
ticker: string;
@@ -279,4 +284,5 @@ export type TaxonomyHydrationResult = {
}>;
metric_validations: TaxonomyMetricValidationCheck[];
normalization_summary: TaxonomyHydrationNormalizationSummary;
xbrl_validation: XbrlValidationResult;
};

22
lib/server/utils/index.ts Normal file
View File

@@ -0,0 +1,22 @@
// Barrel module for server-side utility helpers.
// Re-exports the normalize, validation, and retry helpers so callers can
// import from a single path instead of the individual files.
export {
  normalizeTicker,
  normalizeTickerOrNull,
  normalizeTags,
  normalizeTagsOrNull,
  normalizeOptionalString,
  normalizeRecord,
  normalizePositiveInteger,
  nowIso,
  todayIso
} from './normalize';
export {
  asRecord,
  asOptionalRecord,
  asPositiveNumber,
  asBoolean,
  asStringArray,
  asEnum
} from './validation';
export { withRetry, type RetryOptions } from './retry';

View File

@@ -0,0 +1,51 @@
/** Canonicalizes a ticker symbol: strips surrounding whitespace, uppercases. */
export function normalizeTicker(ticker: string): string {
  const trimmed = ticker.trim();
  return trimmed.toUpperCase();
}
/**
 * Canonicalizes an untrusted ticker value.
 * Non-strings and strings that are empty after trimming yield null.
 */
export function normalizeTickerOrNull(ticker: unknown): string | null {
  if (typeof ticker !== 'string') {
    return null;
  }
  const canonical = ticker.trim().toUpperCase();
  return canonical.length > 0 ? canonical : null;
}
/**
 * Normalizes an untrusted tag list: keeps only non-empty trimmed strings,
 * deduplicated in first-occurrence order. Non-arrays yield [].
 */
export function normalizeTags(tags?: unknown): string[] {
  if (!Array.isArray(tags)) {
    return [];
  }
  const cleaned = tags
    .filter((entry): entry is string => typeof entry === 'string')
    .map((entry) => entry.trim())
    .filter((tag) => tag.length > 0);
  // Set iteration preserves insertion order, so duplicates keep first position.
  return [...new Set(cleaned)];
}
export function normalizeTagsOrNull(tags?: unknown): string[] | null {
const result = normalizeTags(tags);
return result.length > 0 ? result : null;
}
/**
 * Trims an untrusted string value; non-strings and whitespace-only
 * strings yield null.
 */
export function normalizeOptionalString(value?: unknown): string | null {
  if (typeof value === 'string') {
    const trimmed = value.trim();
    if (trimmed.length > 0) {
      return trimmed;
    }
  }
  return null;
}
/**
 * Accepts only plain (non-array, non-null) objects; everything else
 * yields null.
 */
export function normalizeRecord(value?: unknown): Record<string, unknown> | null {
  const isPlainObject =
    value !== null && value !== undefined && typeof value === 'object' && !Array.isArray(value);
  return isPlainObject ? (value as Record<string, unknown>) : null;
}
/**
 * Truncates a finite number toward zero and returns it only if the
 * result is strictly positive. Non-numbers (Number.isFinite does not
 * coerce) and non-positive results yield null.
 */
export function normalizePositiveInteger(value?: unknown): number | null {
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    return null;
  }
  const truncated = Math.trunc(value);
  return truncated > 0 ? truncated : null;
}
/** Current timestamp as an ISO-8601 UTC string. */
export function nowIso(): string {
  const timestamp = new Date();
  return timestamp.toISOString();
}
/** Current UTC calendar date as YYYY-MM-DD. */
export function todayIso(): string {
  // ISO timestamps always begin with the 10-character date component.
  return new Date().toISOString().substring(0, 10);
}

59
lib/server/utils/retry.ts Normal file
View File

@@ -0,0 +1,59 @@
/**
 * Configuration for {@link withRetry}. Callers supply a Partial; missing
 * fields fall back to DEFAULT_RETRY_OPTIONS.
 */
export interface RetryOptions {
  /** Total number of attempts (the first call counts as attempt 1). */
  maxRetries: number;
  /** Delay before the first retry; doubles on each subsequent attempt. */
  baseDelayMs: number;
  /** Upper bound applied to the computed (base + jitter) delay. */
  maxDelayMs: number;
  /** Fraction of the base delay added as random jitter (0.3 = up to +30%). */
  jitterFactor: number;
  /** An error is retried only when its message matches one of these patterns. */
  retryableErrors: RegExp[];
}
// Defaults: 3 attempts, 2s base doubling under a 10s cap, 30% jitter.
// Patterns cover transient network failures and sidecar process death.
const DEFAULT_RETRY_OPTIONS: RetryOptions = {
  maxRetries: 3,
  baseDelayMs: 2000,
  maxDelayMs: 10000,
  jitterFactor: 0.3,
  retryableErrors: [
    /timeout/i,
    /ECONNRESET/,
    /ETIMEDOUT/,
    /ENOTFOUND/,
    /exit code 1/,
    /signal/,
    /killed/
  ]
};
/**
 * Runs `fn`, retrying with exponential backoff + jitter when the thrown
 * error's message matches a retryable pattern. Non-retryable errors and
 * the final failed attempt are rethrown to the caller.
 *
 * @param fn - The async operation to execute.
 * @param options - Partial overrides of {@link DEFAULT_RETRY_OPTIONS}.
 * @returns The first successful result of `fn`.
 * @throws The last error thrown by `fn`, or an Error when maxRetries < 1
 *         (previously this path threw `null`).
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  options?: Partial<RetryOptions>
): Promise<T> {
  const opts = { ...DEFAULT_RETRY_OPTIONS, ...options };
  // Guard: the original fell through to `throw lastError` (i.e. `throw null`)
  // when the loop never ran. Surface a diagnosable error instead.
  if (opts.maxRetries < 1) {
    throw new Error(`withRetry: maxRetries must be >= 1, got ${opts.maxRetries}`);
  }
  let lastError: Error | null = null;
  for (let attempt = 0; attempt < opts.maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      // Narrow to Error once; avoids the non-null assertion the original
      // needed inside the `some` callback.
      const err = error instanceof Error ? error : new Error(String(error));
      lastError = err;
      const isRetryable = opts.retryableErrors.some((pattern) => pattern.test(err.message));
      if (!isRetryable || attempt === opts.maxRetries - 1) {
        throw err;
      }
      const baseDelay = opts.baseDelayMs * Math.pow(2, attempt);
      const jitter = Math.random() * opts.jitterFactor * baseDelay;
      const delay = Math.min(baseDelay + jitter, opts.maxDelayMs);
      console.warn(
        `[retry] Attempt ${attempt + 1}/${opts.maxRetries} failed, retrying in ${Math.round(delay)}ms: ${err.message}`
      );
      // setTimeout is available in both Bun and Node (unlike Bun.sleep),
      // keeping this helper portable and unit-testable.
      await new Promise<void>((resolve) => setTimeout(resolve, delay));
    }
  }
  // Unreachable: every loop iteration returns or throws; kept as a typed guard.
  throw lastError ?? new Error('withRetry: exhausted attempts without capturing an error');
}

View File

@@ -0,0 +1,56 @@
/**
 * Coerces an untrusted value to a plain object record; non-objects,
 * arrays, and null/undefined yield an empty record.
 */
export function asRecord(value: unknown): Record<string, unknown> {
  const isPlainObject =
    value !== null && value !== undefined && typeof value === 'object' && !Array.isArray(value);
  return isPlainObject ? (value as Record<string, unknown>) : {};
}
/**
 * Like asRecord, but yields null instead of an empty record for values
 * that are not plain objects.
 */
export function asOptionalRecord(value: unknown): Record<string, unknown> | null {
  const isPlainObject =
    value !== null && value !== undefined && typeof value === 'object' && !Array.isArray(value);
  return isPlainObject ? (value as Record<string, unknown>) : null;
}
/**
 * Coerces an untrusted value via Number() (numbers pass through) and
 * returns it only when finite and strictly positive; otherwise null.
 */
export function asPositiveNumber(value: unknown): number | null {
  const numeric = typeof value === 'number' ? value : Number(value);
  if (!Number.isFinite(numeric)) {
    return null;
  }
  return numeric > 0 ? numeric : null;
}
/**
 * Coerces an untrusted value to a boolean. Real booleans pass through;
 * the strings true/1/yes and false/0/no (trimmed, case-insensitive) map
 * to true/false; everything else yields `fallback`.
 */
export function asBoolean(value: unknown, fallback = false): boolean {
  if (typeof value === 'boolean') {
    return value;
  }
  if (typeof value === 'string') {
    switch (value.trim().toLowerCase()) {
      case 'true':
      case '1':
      case 'yes':
        return true;
      case 'false':
      case '0':
      case 'no':
        return false;
    }
  }
  return fallback;
}
/**
 * Coerces an untrusted value to a deduplicated string array. Arrays keep
 * only string entries; a bare string is split on commas; anything else
 * yields []. Entries are trimmed, empties dropped, and duplicates removed
 * in first-occurrence order.
 */
export function asStringArray(value: unknown): string[] {
  let source: unknown[];
  if (Array.isArray(value)) {
    source = value;
  } else if (typeof value === 'string') {
    source = value.split(',');
  } else {
    source = [];
  }
  const cleaned = source
    .filter((entry): entry is string => typeof entry === 'string')
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);
  return [...new Set(cleaned)];
}
/**
 * Returns `value` typed as T when it is one of the allowed literals;
 * otherwise undefined.
 */
export function asEnum<T extends string>(value: unknown, allowed: readonly T[]): T | undefined {
  for (const candidate of allowed) {
    if (candidate === value) {
      return candidate;
    }
  }
  return undefined;
}