import { execFile } from 'node:child_process';
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import type { Filing, MetricValidationResult } from '@/lib/types';
import { runAiAnalysis } from '@/lib/server/ai';
import type { TaxonomyAsset, TaxonomyMetricValidationCheck } from '@/lib/server/taxonomy/types';

const execFileAsync = promisify(execFile);

/** Key of the (non-null) metrics object carried by a filing. */
type MetricKey = keyof NonNullable<Filing['metrics']>;

/**
 * Shape the LLM is asked to return: one entry per metric key.
 * The entry shape is asserted, not validated, by `parseValidationPayload`;
 * consumers must re-check each entry before reading its fields.
 */
type ValidationPayload = Record<string, { value?: unknown; pages?: unknown[] }>;

/** Return shape of `validateMetricsWithPdfLlm`. */
type PdfValidationOutcome = {
  validation_result: MetricValidationResult | null;
  metric_validations: TaxonomyMetricValidationCheck[];
};

/** Metrics that are cross-checked against the PDF, in reporting order. */
const METRIC_KEYS: Array<MetricKey> = [
  'revenue',
  'netIncome',
  'totalAssets',
  'cash',
  'debt'
];

/** Upper bound on the number of PDF text characters embedded in the prompt. */
const MAX_PROMPT_TEXT_CHARS = 80_000;

/** True for non-null, non-array objects. */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Collects substrings of `raw` that may hold the JSON payload, most specific
 * first: the body of the first Markdown code fence (optionally tagged `json`),
 * then the slice from the first `{` to the last `}`.
 */
function extractJsonCandidates(raw: string): string[] {
  const candidates: string[] = [];

  const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1];
  if (fenced !== undefined && fenced.trim().length > 0) {
    candidates.push(fenced);
  }

  const start = raw.indexOf('{');
  const end = raw.lastIndexOf('}');
  if (start >= 0 && end > start) {
    const braced = raw.slice(start, end + 1);
    if (!candidates.includes(braced)) {
      candidates.push(braced);
    }
  }

  return candidates;
}

/**
 * Parses the model response into a payload object.
 *
 * Each candidate from `extractJsonCandidates` is tried in order; the first one
 * that parses to a plain object wins. Arrays, primitives and `null` are
 * rejected because they cannot carry per-metric entries.
 *
 * @param raw Raw text returned by the model.
 * @returns The parsed object, or `null` when no candidate is a JSON object.
 */
function parseValidationPayload(raw: string): ValidationPayload | null {
  for (const candidate of extractJsonCandidates(raw)) {
    try {
      const parsed: unknown = JSON.parse(candidate);
      if (isPlainObject(parsed)) {
        // Entry shapes are still unverified here; see ValidationPayload.
        return parsed as ValidationPayload;
      }
    } catch {
      // Not valid JSON; fall through to the next candidate.
    }
  }

  return null;
}

/**
 * Coerces a model-supplied value to a finite number.
 *
 * Strings have commas and whitespace removed before conversion. An empty
 * result is treated as "no value" rather than being converted, because
 * `Number('')` evaluates to `0`.
 *
 * @returns A finite number, or `null` when the value is not numeric.
 */
function asNumber(value: unknown): number | null {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }

  if (typeof value === 'string') {
    const normalized = value.replace(/[,\s]/g, '');
    if (normalized.length === 0) {
      return null;
    }

    const parsed = Number(normalized);
    return Number.isFinite(parsed) ? parsed : null;
  }

  return null;
}

/**
 * Normalizes a model-supplied page list to positive integers.
 * Numeric strings are accepted, fractional values are truncated, and anything
 * non-finite or not greater than zero is dropped.
 */
function asPageNumbers(raw: unknown): number[] {
  if (!Array.isArray(raw)) {
    return [];
  }

  return raw
    .map((entry: unknown) => {
      if (typeof entry === 'number' && Number.isFinite(entry)) {
        return Math.trunc(entry);
      }

      if (typeof entry === 'string') {
        const parsed = Number(entry);
        return Number.isFinite(parsed) ? Math.trunc(parsed) : Number.NaN;
      }

      return Number.NaN;
    })
    .filter((entry) => Number.isFinite(entry) && entry > 0);
}

/**
 * Compares the taxonomy value with the value the model read from the PDF.
 *
 * - both missing: `not_run`
 * - exactly one missing: `mismatch`
 * - otherwise `matched` when the absolute difference is within
 *   `max(1, 0.5% of |taxonomyValue|)`, else `mismatch`
 *
 * `relativeDiff` divides by `max(|taxonomyValue|, 1)` so a zero baseline does
 * not divide by zero.
 */
function diffStatus(taxonomyValue: number | null, llmValue: number | null) {
  if (taxonomyValue === null && llmValue === null) {
    return {
      status: 'not_run' as const,
      absoluteDiff: null,
      relativeDiff: null
    };
  }

  if (taxonomyValue === null || llmValue === null) {
    return {
      status: 'mismatch' as const,
      absoluteDiff: null,
      relativeDiff: null
    };
  }

  const absoluteDiff = Math.abs(taxonomyValue - llmValue);
  const denominator = Math.max(Math.abs(taxonomyValue), 1);
  const relativeDiff = absoluteDiff / denominator;
  const tolerance = Math.max(1, Math.abs(taxonomyValue) * 0.005);

  return {
    status: absoluteDiff <= tolerance ? 'matched' as const : 'mismatch' as const,
    absoluteDiff,
    relativeDiff
  };
}

/**
 * Downloads a PDF and converts it to text with the `pdftotext` executable.
 *
 * The response body is written to a fresh temporary directory, which is
 * removed again whether or not the conversion succeeds.
 *
 * @param url Location of the PDF.
 * @param fetchImpl Fetch implementation used for the download.
 * @returns The trimmed text, or `null` when the converter produced no text.
 * @throws Error when the response is not OK, or when neither the content-type
 *   nor the URL indicates a PDF. Failures of `pdftotext` propagate unchanged.
 */
async function extractPdfText(url: string, fetchImpl: typeof fetch): Promise<string | null> {
  const response = await fetchImpl(url, {
    headers: {
      Accept: 'application/pdf, */*;q=0.8'
    },
    cache: 'no-store'
  });

  if (!response.ok) {
    throw new Error(`PDF request failed (${response.status})`);
  }

  const contentType = response.headers.get('content-type') ?? '';
  // Accept ".pdf" followed by a query string or fragment as well as at the end.
  const urlLooksLikePdf = /\.pdf(?:[?#]|$)/i.test(url);
  if (!/pdf/i.test(contentType) && !urlLooksLikePdf) {
    throw new Error(`Asset is not a PDF (${contentType || 'unknown content-type'})`);
  }

  const bytes = new Uint8Array(await response.arrayBuffer());
  const tempRoot = await mkdtemp(join(tmpdir(), 'fiscal-pdf-'));
  const pdfPath = join(tempRoot, 'source.pdf');

  try {
    await writeFile(pdfPath, bytes);
    const { stdout } = await execFileAsync(
      'pdftotext',
      ['-layout', '-enc', 'UTF-8', pdfPath, '-'],
      { maxBuffer: 16 * 1024 * 1024 }
    );

    const text = stdout.trim();
    return text.length > 0 ? text : null;
  } finally {
    await rm(tempRoot, { recursive: true, force: true });
  }
}

/**
 * Builds the extraction prompt: instructions, the taxonomy baseline as JSON,
 * and at most `MAX_PROMPT_TEXT_CHARS` characters of PDF text.
 */
function validationPrompt(metrics: Filing['metrics'], pdfText: string): string {
  const textSlice = pdfText.slice(0, MAX_PROMPT_TEXT_CHARS);

  return [
    'Extract numeric financial metrics from the provided financial statement PDF text.',
    `Taxonomy baseline metrics: ${JSON.stringify(metrics ?? {})}`,
    'Return ONLY JSON with keys revenue, netIncome, totalAssets, cash, debt.',
    'Each key must map to: {"value": number|null, "pages": [number]}.',
    'Use null when a metric is not found.',
    'PDF text follows:',
    textSlice
  ].join('\n\n');
}

/** Trims a provider/model label and maps blank or missing values to `null`. */
function providerModelOrNull(value: string | undefined | null): string | null {
  const normalized = value?.trim();
  return normalized && normalized.length > 0 ? normalized : null;
}

/** Converts snake_case persistence checks to the camelCase result shape. */
function toResultChecks(checks: TaxonomyMetricValidationCheck[]): MetricValidationResult['checks'] {
  return checks.map((check) => ({
    metricKey: check.metric_key,
    taxonomyValue: check.taxonomy_value,
    llmValue: check.llm_value,
    absoluteDiff: check.absolute_diff,
    relativeDiff: check.relative_diff,
    status: check.status,
    evidencePages: check.evidence_pages,
    pdfUrl: check.pdf_url,
    provider: check.provider,
    model: check.model,
    error: check.error
  }));
}

/** Builds one `error` check per metric for a run that produced no LLM values. */
function errorChecks(
  metrics: NonNullable<Filing['metrics']>,
  details: { pdfUrl: string; provider: string | null; model: string | null; message: string }
): TaxonomyMetricValidationCheck[] {
  return METRIC_KEYS.map((metricKey) => ({
    metric_key: metricKey,
    taxonomy_value: metrics[metricKey] ?? null,
    llm_value: null,
    absolute_diff: null,
    relative_diff: null,
    status: 'error',
    evidence_pages: [],
    pdf_url: details.pdfUrl,
    provider: details.provider,
    model: details.model,
    error: details.message
  }));
}

/**
 * Rolls the per-metric statuses up into one status: `error` beats `mismatch`,
 * which beats `matched`. When no check actually matched (every metric was
 * absent on both sides) the roll-up is `not_run` rather than `matched`.
 */
function overallStatus(checks: TaxonomyMetricValidationCheck[]): MetricValidationResult['status'] {
  if (checks.some((check) => check.status === 'error')) {
    return 'error';
  }

  if (checks.some((check) => check.status === 'mismatch')) {
    return 'mismatch';
  }

  return checks.some((check) => check.status === 'matched') ? 'matched' : 'not_run';
}

/** Wraps finished checks in the outcome shape, stamped with the current time. */
function completedOutcome(checks: TaxonomyMetricValidationCheck[]): PdfValidationOutcome {
  return {
    validation_result: {
      status: overallStatus(checks),
      checks: toResultChecks(checks),
      validatedAt: new Date().toISOString()
    },
    metric_validations: checks
  };
}

/**
 * Cross-checks taxonomy-derived metrics against values an LLM reads from the
 * filing's selected PDF asset.
 *
 * Outcomes:
 * - no selected PDF asset: `not_run`, no checks, `validatedAt` is `null`
 * - PDF download/conversion fails: `error`, one error check per metric
 * - PDF yields no text: `not_run`, no checks
 * - model call fails or returns no JSON object: `error`, one error check per metric
 * - otherwise: one check per metric, rolled up by `overallStatus`
 *
 * This function does not throw for PDF or model failures; they are reported
 * through the returned checks.
 *
 * @param input.metrics Taxonomy metrics; a missing object is treated as all-null.
 * @param input.assets Filing assets; the first selected asset of type `pdf` is used.
 * @param input.fetchImpl Optional fetch override; defaults to the global `fetch`.
 */
export async function validateMetricsWithPdfLlm(input: {
  metrics: Filing['metrics'];
  assets: TaxonomyAsset[];
  fetchImpl?: typeof fetch;
}): Promise<PdfValidationOutcome> {
  const taxonomyMetrics = input.metrics ?? {
    revenue: null,
    netIncome: null,
    totalAssets: null,
    cash: null,
    debt: null
  };

  const selectedPdf = input.assets.find((asset) => asset.asset_type === 'pdf' && asset.is_selected);
  if (!selectedPdf) {
    return {
      validation_result: {
        status: 'not_run',
        checks: [],
        validatedAt: null
      },
      metric_validations: []
    };
  }

  const fetchImpl = input.fetchImpl ?? fetch;

  let pdfText: string | null = null;
  try {
    pdfText = await extractPdfText(selectedPdf.url, fetchImpl);
  } catch (error) {
    // Fall back to the generic label when the thrown value has no usable message.
    const message = error instanceof Error && error.message ? error.message : 'PDF extraction failed';
    return completedOutcome(
      errorChecks(taxonomyMetrics, {
        pdfUrl: selectedPdf.url,
        provider: null,
        model: null,
        message
      })
    );
  }

  if (!pdfText) {
    return {
      validation_result: {
        status: 'not_run',
        checks: [],
        validatedAt: new Date().toISOString()
      },
      metric_validations: []
    };
  }

  let parsed: ValidationPayload | null = null;
  let provider: string | null = null;
  let model: string | null = null;
  let modelError: string | null = null;

  try {
    const aiResult = await runAiAnalysis(validationPrompt(taxonomyMetrics, pdfText), undefined, {
      workload: 'extraction'
    });
    provider = providerModelOrNull(aiResult.provider);
    model = providerModelOrNull(aiResult.model);
    parsed = parseValidationPayload(aiResult.text);
    if (!parsed) {
      modelError = 'LLM response did not contain valid JSON payload';
    }
  } catch (error) {
    modelError = error instanceof Error && error.message ? error.message : 'LLM validation failed';
  }

  if (!parsed) {
    // A missing payload always has a cause recorded above; the fallback only
    // guards against an empty message.
    return completedOutcome(
      errorChecks(taxonomyMetrics, {
        pdfUrl: selectedPdf.url,
        provider,
        model,
        message: modelError ?? 'LLM validation failed'
      })
    );
  }

  // Bind to a const so the non-null narrowing holds inside the callback below.
  const payload = parsed;

  const validations: TaxonomyMetricValidationCheck[] = METRIC_KEYS.map((metricKey) => {
    const taxonomyValue = taxonomyMetrics[metricKey] ?? null;

    // The payload's entry shape is unverified, so anything that is not an
    // object is read as an empty entry (value and pages both absent).
    const entry: unknown = payload[metricKey as string];
    const fields: Record<string, unknown> = isPlainObject(entry) ? entry : {};

    const llmValue = asNumber(fields.value);
    const pages = asPageNumbers(fields.pages);
    const diff = diffStatus(taxonomyValue, llmValue);

    return {
      metric_key: metricKey,
      taxonomy_value: taxonomyValue,
      llm_value: llmValue,
      absolute_diff: diff.absoluteDiff,
      relative_diff: diff.relativeDiff,
      status: diff.status,
      evidence_pages: pages,
      pdf_url: selectedPdf.url,
      provider,
      model,
      error: null
    };
  });

  return completedOutcome(validations);
}

/** Internal helpers exposed for unit tests. */
export const __pdfValidationInternals = {
  parseValidationPayload,
  diffStatus
};