Integrate crabrl parser into taxonomy hydration

This commit is contained in:
2026-03-16 15:18:01 -04:00
parent cf084793ed
commit a58b07456e
23 changed files with 4696 additions and 2466 deletions

View File

@@ -1,36 +1,89 @@
import { existsSync } from 'node:fs';
import { join } from 'node:path';
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
import { withRetry } from '@/lib/server/utils/retry';
import { existsSync } from "node:fs";
import { join } from "node:path";
import type {
TaxonomyHydrationInput,
TaxonomyHydrationResult,
} from "@/lib/server/taxonomy/types";
import { withRetry } from "@/lib/server/utils/retry";
/**
 * The subset of a spawned child-process handle that this module relies on.
 *
 * The value returned by `deps.spawn(...)` is cast to this shape so that the
 * hydration code only touches the members listed here.
 */
type SpawnedSidecar = {
  /** Writable side of the child; receives the encoded request body, then is closed. */
  stdin: {
    write: (chunk: Uint8Array) => void;
    end: () => void;
  };
  /** Child standard output, consumed as text by the caller. */
  stdout: ReadableStream<Uint8Array>;
  /** Child standard error, consumed as text by the caller. */
  stderr: ReadableStream<Uint8Array>;
  /** Settles with the numeric exit code of the child. */
  exited: Promise<number>;
  /** Terminates the child; invoked from the timeout callback. */
  kill: () => void;
};
/**
 * Injectable dependencies of the sidecar client.
 *
 * Each member mirrors the type of the real implementation it stands in for,
 * so callers can pass either the real function or a substitute with the same
 * signature.
 */
type SidecarDeps = {
  /** Process launcher; typed after `Bun.spawn`. */
  spawn: typeof Bun.spawn;
  /** File-existence probe used while resolving the binary path. */
  existsSync: typeof existsSync;
  /** Timer used to schedule the kill of a sidecar that runs too long. */
  setTimeout: typeof globalThis.setTimeout;
  /** Counterpart that cancels the scheduled kill. */
  clearTimeout: typeof globalThis.clearTimeout;
};
/**
 * Lists the locations where the `fiscal-xbrl` sidecar binary may live, in
 * lookup order.
 *
 * The trimmed `FISCAL_XBRL_BIN` environment variable comes first, followed by
 * `bin/fiscal-xbrl`, `rust/target/release/fiscal-xbrl` and
 * `rust/target/debug/fiscal-xbrl` under the current working directory.
 * Entries that are not strings or are empty (for example an unset or blank
 * `FISCAL_XBRL_BIN`) are filtered out.
 *
 * @returns The non-empty candidate paths; no existence check is done here.
 */
function candidateBinaryPaths() {
return [
process.env.FISCAL_XBRL_BIN?.trim(),
join(process.cwd(), 'bin', 'fiscal-xbrl'),
join(process.cwd(), 'rust', 'target', 'release', 'fiscal-xbrl'),
join(process.cwd(), 'rust', 'target', 'debug', 'fiscal-xbrl')
].filter((value): value is string => typeof value === 'string' && value.length > 0);
join(process.cwd(), "bin", "fiscal-xbrl"),
join(process.cwd(), "rust", "target", "release", "fiscal-xbrl"),
join(process.cwd(), "rust", "target", "debug", "fiscal-xbrl"),
].filter(
(value): value is string => typeof value === "string" && value.length > 0,
);
}
/**
 * Resolves the path of the `fiscal-xbrl` sidecar binary on the real file
 * system.
 *
 * Delegates to `resolveFiscalXbrlBinaryWithDeps`, passing the `existsSync`
 * imported from `node:fs`.
 *
 * @returns The first candidate path that exists.
 * @throws Error when none of the candidate paths exists.
 */
export function resolveFiscalXbrlBinary() {
const resolved = candidateBinaryPaths().find((path) => existsSync(path));
return resolveFiscalXbrlBinaryWithDeps({
existsSync,
});
}
/**
 * Picks the first candidate binary path for which `deps.existsSync` returns
 * true.
 *
 * @param deps - Supplies the existence check, so the lookup can run against a
 *   substitute instead of the real file system.
 * @returns The first existing path from `candidateBinaryPaths()`.
 * @throws Error when no candidate exists; the message names `FISCAL_XBRL_BIN`
 *   and the `rust/target` build as the two ways to provide the binary.
 */
function resolveFiscalXbrlBinaryWithDeps(
deps: Pick<SidecarDeps, "existsSync">,
) {
const resolved = candidateBinaryPaths().find((path) => deps.existsSync(path));
if (!resolved) {
throw new Error('Rust XBRL sidecar binary is required but was not found. Set FISCAL_XBRL_BIN or build `fiscal-xbrl` under rust/target.');
throw new Error(
"Rust XBRL sidecar binary is required but was not found. Set FISCAL_XBRL_BIN or build `fiscal-xbrl` under rust/target.",
);
}
return resolved;
}
/**
 * Public entry point for hydrating a filing's taxonomy snapshot through the
 * Rust sidecar.
 *
 * Forwards to `hydrateFilingTaxonomySnapshotFromSidecarWithDeps` with the
 * production dependencies: `existsSync` from `node:fs`, `Bun.spawn`, and the
 * global `setTimeout` / `clearTimeout`.
 *
 * @param input - Filing description passed through unchanged.
 * @returns The hydration result produced by the sidecar.
 */
export async function hydrateFilingTaxonomySnapshotFromSidecar(
input: TaxonomyHydrationInput
input: TaxonomyHydrationInput,
): Promise<TaxonomyHydrationResult> {
return withRetry(() => hydrateFromSidecarImpl(input));
return hydrateFilingTaxonomySnapshotFromSidecarWithDeps(input, {
existsSync,
spawn: Bun.spawn,
setTimeout: globalThis.setTimeout,
clearTimeout: globalThis.clearTimeout,
});
}
async function hydrateFromSidecarImpl(input: TaxonomyHydrationInput): Promise<TaxonomyHydrationResult> {
const binary = resolveFiscalXbrlBinary();
const timeoutMs = Math.max(Number(process.env.XBRL_ENGINE_TIMEOUT_MS ?? 45_000), 1_000);
const command = [binary, 'hydrate-filing'];
/**
 * Runs one sidecar hydration under `withRetry`, using the supplied
 * dependencies for every attempt.
 *
 * @param input - Filing description handed to each attempt.
 * @param deps - File-system, spawn and timer implementations to use.
 * @returns Whatever `withRetry` yields for the wrapped attempt.
 */
async function hydrateFilingTaxonomySnapshotFromSidecarWithDeps(
  input: TaxonomyHydrationInput,
  deps: SidecarDeps,
): Promise<TaxonomyHydrationResult> {
  // A fresh call per attempt: each attempt spawns its own sidecar process.
  const runOnce = () => hydrateFromSidecarImpl(input, deps);
  return withRetry(runOnce);
}
async function hydrateFromSidecarImpl(
input: TaxonomyHydrationInput,
deps: SidecarDeps = {
existsSync,
spawn: Bun.spawn,
setTimeout: globalThis.setTimeout,
clearTimeout: globalThis.clearTimeout,
},
): Promise<TaxonomyHydrationResult> {
const binary = resolveFiscalXbrlBinaryWithDeps(deps);
const timeoutMs = Math.max(
Number(process.env.XBRL_ENGINE_TIMEOUT_MS ?? 45_000),
1_000,
);
const command = [binary, "hydrate-filing"];
const requestBody = JSON.stringify({
filingId: input.filingId,
ticker: input.ticker,
@@ -40,22 +93,24 @@ async function hydrateFromSidecarImpl(input: TaxonomyHydrationInput): Promise<Ta
filingType: input.filingType,
filingUrl: input.filingUrl,
primaryDocument: input.primaryDocument,
cacheDir: process.env.FISCAL_XBRL_CACHE_DIR ?? join(process.cwd(), '.cache', 'xbrl')
cacheDir:
process.env.FISCAL_XBRL_CACHE_DIR ??
join(process.cwd(), ".cache", "xbrl"),
});
const child = Bun.spawn(command, {
stdin: 'pipe',
stdout: 'pipe',
stderr: 'pipe',
const child = deps.spawn(command, {
stdin: "pipe",
stdout: "pipe",
stderr: "pipe",
env: {
...process.env
}
});
...process.env,
},
}) as SpawnedSidecar;
child.stdin.write(new TextEncoder().encode(requestBody));
child.stdin.end();
const timeout = setTimeout(() => {
const timeout = deps.setTimeout(() => {
child.kill();
}, timeoutMs);
@@ -63,7 +118,7 @@ async function hydrateFromSidecarImpl(input: TaxonomyHydrationInput): Promise<Ta
const [stdout, stderr, exitCode] = await Promise.all([
new Response(child.stdout).text(),
new Response(child.stderr).text(),
child.exited
child.exited,
]);
if (stderr.trim().length > 0) {
@@ -71,11 +126,20 @@ async function hydrateFromSidecarImpl(input: TaxonomyHydrationInput): Promise<Ta
}
if (exitCode !== 0) {
throw new Error(`Rust XBRL sidecar failed with exit code ${exitCode}: ${stderr.trim() || stdout.trim() || 'no error output'}`);
throw new Error(
`Rust XBRL sidecar failed with exit code ${exitCode}: ${stderr.trim() || stdout.trim() || "no error output"}`,
);
}
return JSON.parse(stdout) as TaxonomyHydrationResult;
} finally {
clearTimeout(timeout);
deps.clearTimeout(timeout);
}
}
/**
 * Module-private functions exposed under a single export.
 * NOTE(review): presumably consumed by tests — confirm against importers.
 *
 * The `resolveFiscalXbrlBinary` key points at the dependency-injected
 * variant, not at the zero-argument exported function of the same name.
 */
export const __parserClientInternals = {
  candidateBinaryPaths: candidateBinaryPaths,
  hydrateFilingTaxonomySnapshotFromSidecarWithDeps:
    hydrateFilingTaxonomySnapshotFromSidecarWithDeps,
  hydrateFromSidecarImpl: hydrateFromSidecarImpl,
  resolveFiscalXbrlBinary: resolveFiscalXbrlBinaryWithDeps,
};