Files
Neon-Desk/lib/server/taxonomy/pdf-validation.ts
2026-03-06 14:40:43 -05:00

337 lines
9.2 KiB
TypeScript

import { execFile } from 'node:child_process';
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import type { Filing, MetricValidationResult } from '@/lib/types';
import { runAiAnalysis } from '@/lib/server/ai';
import type { TaxonomyAsset, TaxonomyMetricValidationCheck } from '@/lib/server/taxonomy/types';
// Promise-returning form of `execFile`, so the `pdftotext` child process can be awaited.
const execFileAsync = promisify(execFile);
// Metric keys that are validated against the PDF. Validation checks are produced
// by mapping over this list, so they come out in this order.
const METRIC_KEYS: Array<keyof NonNullable<Filing['metrics']>> = [
'revenue',
'netIncome',
'totalAssets',
'cash',
'debt'
];
/**
 * Pulls the most likely JSON object text out of a raw LLM reply.
 *
 * Preference order:
 * 1. The trimmed body of the first Markdown code fence (plain or tagged `json`).
 * 2. The span from the first `{` to the last `}` anywhere in the reply.
 *
 * An empty code fence is skipped instead of being returned as `''`, so a JSON
 * object that appears outside the fence is still found.
 *
 * @param raw - Raw reply text.
 * @returns The candidate JSON text, or `null` when nothing JSON-like is present.
 */
function extractJsonCandidate(raw: string): string | null {
  const fencedBody = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1]?.trim();
  if (fencedBody) {
    return fencedBody;
  }

  // No usable fence: fall back to the widest brace-delimited span.
  const start = raw.indexOf('{');
  const end = raw.lastIndexOf('}');
  return start >= 0 && end > start ? raw.slice(start, end + 1) : null;
}
/**
 * Parses the LLM reply into a map of metric key to `{ value, pages }` entries.
 *
 * The JSON text is located with `extractJsonCandidate` and decoded with
 * `JSON.parse`. Anything that is not a plain JSON object at the top level
 * (arrays, numbers, strings, booleans, `null`) is rejected, because the caller
 * looks entries up by metric key and treats a non-null result as a usable payload.
 * Individual entries are not validated here; callers must still coerce
 * `value` and `pages` defensively.
 *
 * @param raw - Raw reply text.
 * @returns The decoded object, or `null` when no JSON object could be parsed.
 */
function parseValidationPayload(raw: string) {
  const candidate = extractJsonCandidate(raw);
  if (!candidate) {
    return null;
  }

  let decoded: unknown;
  try {
    decoded = JSON.parse(candidate);
  } catch {
    return null;
  }

  // JSON.parse happily returns arrays and primitives; only an object is a valid payload.
  if (typeof decoded !== 'object' || decoded === null || Array.isArray(decoded)) {
    return null;
  }

  return decoded as Record<string, {
    value?: number | string | null;
    pages?: Array<number | string>;
  }>;
}
/**
 * Coerces a loosely typed metric value into a finite number.
 *
 * Numbers are accepted when finite. Strings are parsed after removing commas
 * and whitespace (so `"1,234.5"` becomes `1234.5`). Every other input, and any
 * string that does not parse to a finite number, yields `null`.
 *
 * @param value - Value taken from untrusted JSON.
 * @returns The finite number, or `null` when the value is missing or not numeric.
 */
function asNumber(value: unknown): number | null {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value !== 'string') {
    return null;
  }

  const cleaned = value.replace(/[,\s]/g, '');
  // Number('') is 0, which would turn a blank answer into a real value of zero.
  if (cleaned === '') {
    return null;
  }

  const parsed = Number(cleaned);
  return Number.isFinite(parsed) ? parsed : null;
}
/**
 * Normalizes a loosely typed list of page references into positive integers.
 *
 * Numeric entries and numeric strings are truncated toward zero; entries that
 * are not finite numbers, or that truncate to zero or below, are dropped.
 *
 * @param raw - Value taken from untrusted JSON; anything but an array yields `[]`.
 * @returns The surviving page numbers in their original order.
 */
function asPageNumbers(raw: unknown): number[] {
  if (!Array.isArray(raw)) {
    return [];
  }

  const pages: number[] = [];
  for (const item of raw) {
    let numeric: number;
    if (typeof item === 'number') {
      numeric = item;
    } else if (typeof item === 'string') {
      numeric = Number(item);
    } else {
      continue;
    }

    if (!Number.isFinite(numeric)) {
      continue;
    }

    const whole = Math.trunc(numeric);
    if (whole > 0) {
      pages.push(whole);
    }
  }
  return pages;
}
/**
 * Compares the taxonomy value of a metric with the value the LLM extracted.
 *
 * - Both values missing: `not_run` (nothing to compare).
 * - Exactly one value missing: `mismatch`.
 * - Otherwise `matched` when the absolute difference is within the larger of
 *   1 and 0.5% of the taxonomy value's magnitude, else `mismatch`.
 *
 * The relative difference is measured against the taxonomy value's magnitude,
 * floored at 1 to avoid dividing by zero.
 *
 * @param taxonomyValue - Value from the taxonomy, or `null` when absent.
 * @param llmValue - Value extracted by the LLM, or `null` when absent.
 * @returns The comparison status plus absolute and relative differences
 *          (both `null` when either side is missing).
 */
function diffStatus(taxonomyValue: number | null, llmValue: number | null) {
  if (taxonomyValue === null || llmValue === null) {
    const nothingToCompare = taxonomyValue === null && llmValue === null;
    return {
      status: nothingToCompare ? ('not_run' as const) : ('mismatch' as const),
      absoluteDiff: null,
      relativeDiff: null
    };
  }

  const magnitude = Math.abs(taxonomyValue);
  const absoluteDiff = Math.abs(taxonomyValue - llmValue);
  const withinTolerance = absoluteDiff <= Math.max(1, magnitude * 0.005);

  return {
    status: withinTolerance ? ('matched' as const) : ('mismatch' as const),
    absoluteDiff,
    relativeDiff: absoluteDiff / Math.max(magnitude, 1)
  };
}
/**
 * Downloads a PDF and returns its text as produced by the `pdftotext` CLI.
 *
 * The response is accepted as a PDF when either its content-type mentions
 * "pdf" or the URL path ends in `.pdf`. The URL check ignores any query string
 * or fragment, so links such as `report.pdf?token=abc` still qualify when the
 * server sends a generic content-type.
 *
 * The bytes are written to a fresh temporary directory, converted with
 * `pdftotext -layout -enc UTF-8`, and the directory is removed afterwards
 * whether or not the conversion succeeded.
 *
 * @param url - Location of the PDF asset.
 * @param fetchImpl - Fetch implementation used for the download.
 * @returns The trimmed text, or `null` when the PDF produced no text.
 * @throws Error when the request is not OK or the asset does not look like a
 *         PDF; errors from the file system and from `pdftotext` propagate.
 */
async function extractPdfText(url: string, fetchImpl: typeof fetch) {
  const response = await fetchImpl(url, {
    headers: {
      Accept: 'application/pdf, */*;q=0.8'
    },
    cache: 'no-store'
  });
  if (!response.ok) {
    throw new Error(`PDF request failed (${response.status})`);
  }

  const contentType = response.headers.get('content-type') ?? '';
  // Strip the query string and fragment so only the path is tested for a .pdf suffix.
  const urlPath = url.split(/[?#]/, 1)[0] ?? url;
  if (!/pdf/i.test(contentType) && !/\.pdf$/i.test(urlPath)) {
    throw new Error(`Asset is not a PDF (${contentType || 'unknown content-type'})`);
  }

  const bytes = new Uint8Array(await response.arrayBuffer());
  const tempRoot = await mkdtemp(join(tmpdir(), 'fiscal-pdf-'));
  const pdfPath = join(tempRoot, 'source.pdf');
  try {
    await writeFile(pdfPath, bytes);
    // A trailing '-' makes pdftotext write the text to stdout.
    const { stdout } = await execFileAsync('pdftotext', ['-layout', '-enc', 'UTF-8', pdfPath, '-'], {
      maxBuffer: 16 * 1024 * 1024
    });
    const text = stdout.trim();
    return text ? text : null;
  } finally {
    await rm(tempRoot, { recursive: true, force: true });
  }
}
/**
 * Builds the extraction prompt sent to the LLM.
 *
 * The prompt lists the taxonomy baseline metrics as JSON (an empty object when
 * none are supplied), states the required JSON reply shape, and appends at
 * most the first 80,000 characters of the PDF text. Sections are separated by
 * blank lines.
 *
 * @param metrics - Taxonomy metrics used as the baseline.
 * @param pdfText - Text extracted from the PDF.
 * @returns The complete prompt string.
 */
function validationPrompt(metrics: Filing['metrics'], pdfText: string) {
  const maxPdfChars = 80_000;
  const baselineJson = JSON.stringify(metrics ?? {});
  const excerpt = pdfText.slice(0, maxPdfChars);

  const sections: string[] = [];
  sections.push('Extract numeric financial metrics from the provided financial statement PDF text.');
  sections.push(`Taxonomy baseline metrics: ${baselineJson}`);
  sections.push('Return ONLY JSON with keys revenue, netIncome, totalAssets, cash, debt.');
  sections.push('Each key must map to: {"value": number|null, "pages": [number]}.');
  sections.push('Use null when a metric is not found.');
  sections.push('PDF text follows:');
  sections.push(excerpt);

  return sections.join('\n\n');
}
/**
 * Normalizes a provider or model label.
 *
 * @param value - Label to normalize; may be missing.
 * @returns The trimmed label, or `null` when it is missing or blank.
 */
function providerModelOrNull(value: string | undefined | null) {
  const trimmed = (value ?? '').trim();
  return trimmed === '' ? null : trimmed;
}
/** Returns the message of a thrown `Error`, or `fallback` when it is absent or empty. */
function describeValidationError(error: unknown, fallback: string): string {
  return error instanceof Error && error.message ? error.message : fallback;
}

/** Converts a snake_case validation check into the camelCase shape used in `validation_result.checks`. */
function toResultCheck(check: TaxonomyMetricValidationCheck) {
  return {
    metricKey: check.metric_key,
    taxonomyValue: check.taxonomy_value,
    llmValue: check.llm_value,
    absoluteDiff: check.absolute_diff,
    relativeDiff: check.relative_diff,
    status: check.status,
    evidencePages: check.evidence_pages,
    pdfUrl: check.pdf_url,
    provider: check.provider,
    model: check.model,
    error: check.error
  };
}

/** Folds per-metric statuses into one overall status: error > mismatch > matched > not_run. */
function summarizeValidationStatus(checks: TaxonomyMetricValidationCheck[]) {
  if (checks.some((check) => check.status === 'error')) {
    return 'error' as const;
  }
  if (checks.some((check) => check.status === 'mismatch')) {
    return 'mismatch' as const;
  }
  // Only claim a match when at least one metric was actually compared.
  if (checks.some((check) => check.status === 'matched')) {
    return 'matched' as const;
  }
  return 'not_run' as const;
}

/**
 * Cross-checks taxonomy metrics against values an LLM extracts from the
 * filing's selected PDF asset.
 *
 * Flow:
 * 1. Pick the first asset with `asset_type === 'pdf'` that is selected; without
 *    one the result is `not_run` with no checks and `validatedAt: null`.
 * 2. Download the PDF and extract its text. A failure yields an `error` check
 *    for every metric; an empty text yields `not_run` with no checks.
 * 3. Ask the LLM for the metrics and parse its JSON reply. A failed call or an
 *    unparseable reply yields an `error` check for every metric.
 * 4. Compare each metric with `diffStatus` and fold the per-metric statuses
 *    into an overall status (error > mismatch > matched > not_run).
 *
 * PDF extraction and LLM failures are caught and reported through the returned
 * checks rather than thrown.
 *
 * @param input.metrics - Taxonomy metrics to validate; missing metrics count as `null`.
 * @param input.assets - Filing assets from which the selected PDF is taken.
 * @param input.fetchImpl - Optional fetch implementation; defaults to the global `fetch`.
 * @returns The overall result (`validation_result`) and the per-metric checks
 *          (`metric_validations`), which describe the same checks in camelCase
 *          and snake_case respectively.
 */
export async function validateMetricsWithPdfLlm(input: {
  metrics: Filing['metrics'];
  assets: TaxonomyAsset[];
  fetchImpl?: typeof fetch;
}): Promise<{
  validation_result: MetricValidationResult | null;
  metric_validations: TaxonomyMetricValidationCheck[];
}> {
  const taxonomyMetrics = input.metrics ?? {
    revenue: null,
    netIncome: null,
    totalAssets: null,
    cash: null,
    debt: null
  };

  const selectedPdf = input.assets.find((asset) => asset.asset_type === 'pdf' && asset.is_selected);
  if (!selectedPdf) {
    return {
      validation_result: {
        status: 'not_run',
        checks: [],
        validatedAt: null
      },
      metric_validations: []
    };
  }
  const pdfUrl = selectedPdf.url;

  // One 'error' check per metric, shared by the PDF-failure and LLM-failure paths.
  const failedChecks = (
    message: string,
    provider: string | null,
    model: string | null
  ): TaxonomyMetricValidationCheck[] =>
    METRIC_KEYS.map((metricKey) => ({
      metric_key: metricKey,
      taxonomy_value: taxonomyMetrics[metricKey] ?? null,
      llm_value: null,
      absolute_diff: null,
      relative_diff: null,
      status: 'error',
      evidence_pages: [],
      pdf_url: pdfUrl,
      provider,
      model,
      error: message
    }));

  const fetchImpl = input.fetchImpl ?? fetch;
  let pdfText: string | null = null;
  try {
    pdfText = await extractPdfText(pdfUrl, fetchImpl);
  } catch (error) {
    const checks = failedChecks(describeValidationError(error, 'PDF extraction failed'), null, null);
    return {
      validation_result: {
        status: 'error',
        checks: checks.map((check) => toResultCheck(check)),
        validatedAt: new Date().toISOString()
      },
      metric_validations: checks
    };
  }

  if (!pdfText) {
    return {
      validation_result: {
        status: 'not_run',
        checks: [],
        validatedAt: new Date().toISOString()
      },
      metric_validations: []
    };
  }

  let parsed: Record<string, { value?: number | string | null; pages?: Array<number | string> }> | null = null;
  let provider: string | null = null;
  let model: string | null = null;
  let modelError: string | null = null;
  try {
    const aiResult = await runAiAnalysis(validationPrompt(taxonomyMetrics, pdfText), undefined, {
      workload: 'extraction'
    });
    provider = providerModelOrNull(aiResult.provider);
    model = providerModelOrNull(aiResult.model);
    parsed = parseValidationPayload(aiResult.text);
    if (!parsed) {
      modelError = 'LLM response did not contain valid JSON payload';
    }
  } catch (error) {
    modelError = describeValidationError(error, 'LLM validation failed');
  }

  if (parsed === null) {
    // No payload always means failure, even if the thrown error carried no message.
    const checks = failedChecks(modelError ?? 'LLM validation failed', provider, model);
    return {
      validation_result: {
        status: 'error',
        checks: checks.map((check) => toResultCheck(check)),
        validatedAt: new Date().toISOString()
      },
      metric_validations: checks
    };
  }

  const payload = parsed;
  const validations: TaxonomyMetricValidationCheck[] = METRIC_KEYS.map((metricKey) => {
    const taxonomyValue = taxonomyMetrics[metricKey] ?? null;
    const entry = payload[metricKey as string] ?? {};
    const llmValue = asNumber(entry.value);
    const diff = diffStatus(taxonomyValue, llmValue);
    return {
      metric_key: metricKey,
      taxonomy_value: taxonomyValue,
      llm_value: llmValue,
      absolute_diff: diff.absoluteDiff,
      relative_diff: diff.relativeDiff,
      status: diff.status,
      evidence_pages: asPageNumbers(entry.pages),
      pdf_url: pdfUrl,
      provider,
      model,
      error: null
    };
  });

  return {
    validation_result: {
      status: summarizeValidationStatus(validations),
      checks: validations.map((check) => toResultCheck(check)),
      validatedAt: new Date().toISOString()
    },
    metric_validations: validations
  };
}
// Re-exports two otherwise module-private helpers under a single object.
// NOTE(review): presumably intended for unit tests only — confirm before using elsewhere.
export const __pdfValidationInternals = {
parseValidationPayload,
diffStatus
};