// Metric validation: cross-checks taxonomy-derived financial metrics against
// values an LLM extracts from the filing's selected PDF asset.
import { execFile } from 'node:child_process';
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';

import type { Filing, MetricValidationResult } from '@/lib/types';
import { runAiAnalysis } from '@/lib/server/ai';
import type { TaxonomyAsset, TaxonomyMetricValidationCheck } from '@/lib/server/taxonomy/types';
// Promisified child_process.execFile so the pdftotext CLI can be awaited.
const execFileAsync = promisify(execFile);

// Metric keys compared between the taxonomy baseline and the LLM extraction.
const METRIC_KEYS: Array<keyof NonNullable<Filing['metrics']>> = [
  'revenue',
  'netIncome',
  'totalAssets',
  'cash',
  'debt'
];
function extractJsonCandidate(raw: string) {
|
|
const fencedJson = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1];
|
|
const candidate = fencedJson ?? (() => {
|
|
const start = raw.indexOf('{');
|
|
const end = raw.lastIndexOf('}');
|
|
return start >= 0 && end > start ? raw.slice(start, end + 1) : null;
|
|
})();
|
|
|
|
return candidate;
|
|
}
|
|
|
|
function parseValidationPayload(raw: string) {
|
|
const candidate = extractJsonCandidate(raw);
|
|
if (!candidate) {
|
|
return null;
|
|
}
|
|
|
|
try {
|
|
return JSON.parse(candidate) as Record<string, {
|
|
value?: number | string | null;
|
|
pages?: Array<number | string>;
|
|
}>;
|
|
} catch {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
function asNumber(value: unknown) {
|
|
if (typeof value === 'number') {
|
|
return Number.isFinite(value) ? value : null;
|
|
}
|
|
|
|
if (typeof value === 'string') {
|
|
const parsed = Number(value.replace(/[,\s]/g, ''));
|
|
return Number.isFinite(parsed) ? parsed : null;
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
function asPageNumbers(raw: unknown): number[] {
|
|
if (!Array.isArray(raw)) {
|
|
return [];
|
|
}
|
|
|
|
return raw
|
|
.map((entry) => {
|
|
if (typeof entry === 'number' && Number.isFinite(entry)) {
|
|
return Math.trunc(entry);
|
|
}
|
|
|
|
if (typeof entry === 'string') {
|
|
const parsed = Number(entry);
|
|
return Number.isFinite(parsed) ? Math.trunc(parsed) : Number.NaN;
|
|
}
|
|
|
|
return Number.NaN;
|
|
})
|
|
.filter((entry) => Number.isFinite(entry) && entry > 0);
|
|
}
|
|
|
|
function diffStatus(taxonomyValue: number | null, llmValue: number | null) {
|
|
if (taxonomyValue === null && llmValue === null) {
|
|
return {
|
|
status: 'not_run' as const,
|
|
absoluteDiff: null,
|
|
relativeDiff: null
|
|
};
|
|
}
|
|
|
|
if (taxonomyValue === null || llmValue === null) {
|
|
return {
|
|
status: 'mismatch' as const,
|
|
absoluteDiff: null,
|
|
relativeDiff: null
|
|
};
|
|
}
|
|
|
|
const absoluteDiff = Math.abs(taxonomyValue - llmValue);
|
|
const denominator = Math.max(Math.abs(taxonomyValue), 1);
|
|
const relativeDiff = absoluteDiff / denominator;
|
|
const tolerance = Math.max(1, Math.abs(taxonomyValue) * 0.005);
|
|
|
|
return {
|
|
status: absoluteDiff <= tolerance ? 'matched' as const : 'mismatch' as const,
|
|
absoluteDiff,
|
|
relativeDiff
|
|
};
|
|
}
|
|
|
|
async function extractPdfText(url: string, fetchImpl: typeof fetch) {
|
|
const response = await fetchImpl(url, {
|
|
headers: {
|
|
Accept: 'application/pdf, */*;q=0.8'
|
|
},
|
|
cache: 'no-store'
|
|
});
|
|
|
|
if (!response.ok) {
|
|
throw new Error(`PDF request failed (${response.status})`);
|
|
}
|
|
|
|
const contentType = response.headers.get('content-type') ?? '';
|
|
if (!/pdf/i.test(contentType) && !/\.pdf$/i.test(url)) {
|
|
throw new Error(`Asset is not a PDF (${contentType || 'unknown content-type'})`);
|
|
}
|
|
|
|
const bytes = new Uint8Array(await response.arrayBuffer());
|
|
const tempRoot = await mkdtemp(join(tmpdir(), 'fiscal-pdf-'));
|
|
const pdfPath = join(tempRoot, 'source.pdf');
|
|
|
|
try {
|
|
await writeFile(pdfPath, bytes);
|
|
const { stdout } = await execFileAsync('pdftotext', ['-layout', '-enc', 'UTF-8', pdfPath, '-'], {
|
|
maxBuffer: 16 * 1024 * 1024
|
|
});
|
|
|
|
const text = stdout.trim();
|
|
if (!text) {
|
|
return null;
|
|
}
|
|
|
|
return text;
|
|
} finally {
|
|
await rm(tempRoot, { recursive: true, force: true });
|
|
}
|
|
}
|
|
|
|
function validationPrompt(metrics: Filing['metrics'], pdfText: string) {
|
|
const textSlice = pdfText.slice(0, 80_000);
|
|
|
|
return [
|
|
'Extract numeric financial metrics from the provided financial statement PDF text.',
|
|
`Taxonomy baseline metrics: ${JSON.stringify(metrics ?? {})}`,
|
|
'Return ONLY JSON with keys revenue, netIncome, totalAssets, cash, debt.',
|
|
'Each key must map to: {"value": number|null, "pages": [number]}.',
|
|
'Use null when a metric is not found.',
|
|
'PDF text follows:',
|
|
textSlice
|
|
].join('\n\n');
|
|
}
|
|
|
|
function providerModelOrNull(value: string | undefined | null) {
|
|
const normalized = value?.trim();
|
|
return normalized && normalized.length > 0 ? normalized : null;
|
|
}
|
|
|
|
export async function validateMetricsWithPdfLlm(input: {
|
|
metrics: Filing['metrics'];
|
|
assets: TaxonomyAsset[];
|
|
fetchImpl?: typeof fetch;
|
|
}): Promise<{
|
|
validation_result: MetricValidationResult | null;
|
|
metric_validations: TaxonomyMetricValidationCheck[];
|
|
}> {
|
|
const taxonomyMetrics = input.metrics ?? {
|
|
revenue: null,
|
|
netIncome: null,
|
|
totalAssets: null,
|
|
cash: null,
|
|
debt: null
|
|
};
|
|
|
|
const selectedPdf = input.assets.find((asset) => asset.asset_type === 'pdf' && asset.is_selected);
|
|
if (!selectedPdf) {
|
|
return {
|
|
validation_result: {
|
|
status: 'not_run',
|
|
checks: [],
|
|
validatedAt: null
|
|
},
|
|
metric_validations: []
|
|
};
|
|
}
|
|
|
|
const fetchImpl = input.fetchImpl ?? fetch;
|
|
let pdfText: string | null = null;
|
|
try {
|
|
pdfText = await extractPdfText(selectedPdf.url, fetchImpl);
|
|
} catch (error) {
|
|
const message = error instanceof Error ? error.message : 'PDF extraction failed';
|
|
|
|
const checks: TaxonomyMetricValidationCheck[] = METRIC_KEYS.map((metricKey) => ({
|
|
metric_key: metricKey,
|
|
taxonomy_value: taxonomyMetrics[metricKey],
|
|
llm_value: null,
|
|
absolute_diff: null,
|
|
relative_diff: null,
|
|
status: 'error',
|
|
evidence_pages: [],
|
|
pdf_url: selectedPdf.url,
|
|
provider: null,
|
|
model: null,
|
|
error: message
|
|
}));
|
|
|
|
return {
|
|
validation_result: {
|
|
status: 'error',
|
|
checks: checks.map((check) => ({
|
|
metricKey: check.metric_key,
|
|
taxonomyValue: check.taxonomy_value,
|
|
llmValue: check.llm_value,
|
|
absoluteDiff: check.absolute_diff,
|
|
relativeDiff: check.relative_diff,
|
|
status: check.status,
|
|
evidencePages: check.evidence_pages,
|
|
pdfUrl: check.pdf_url,
|
|
provider: check.provider,
|
|
model: check.model,
|
|
error: check.error
|
|
})),
|
|
validatedAt: new Date().toISOString()
|
|
},
|
|
metric_validations: checks
|
|
};
|
|
}
|
|
|
|
if (!pdfText) {
|
|
return {
|
|
validation_result: {
|
|
status: 'not_run',
|
|
checks: [],
|
|
validatedAt: new Date().toISOString()
|
|
},
|
|
metric_validations: []
|
|
};
|
|
}
|
|
|
|
let parsed: Record<string, { value?: number | string | null; pages?: Array<number | string> }> | null = null;
|
|
let provider: string | null = null;
|
|
let model: string | null = null;
|
|
let modelError: string | null = null;
|
|
|
|
try {
|
|
const aiResult = await runAiAnalysis(validationPrompt(taxonomyMetrics, pdfText), undefined, {
|
|
workload: 'extraction'
|
|
});
|
|
|
|
provider = providerModelOrNull(aiResult.provider);
|
|
model = providerModelOrNull(aiResult.model);
|
|
parsed = parseValidationPayload(aiResult.text);
|
|
if (!parsed) {
|
|
modelError = 'LLM response did not contain valid JSON payload';
|
|
}
|
|
} catch (error) {
|
|
modelError = error instanceof Error ? error.message : 'LLM validation failed';
|
|
}
|
|
|
|
const validations: TaxonomyMetricValidationCheck[] = METRIC_KEYS.map((metricKey) => {
|
|
const taxonomyValue = taxonomyMetrics[metricKey] ?? null;
|
|
|
|
if (!parsed) {
|
|
return {
|
|
metric_key: metricKey,
|
|
taxonomy_value: taxonomyValue,
|
|
llm_value: null,
|
|
absolute_diff: null,
|
|
relative_diff: null,
|
|
status: modelError ? 'error' : 'not_run',
|
|
evidence_pages: [],
|
|
pdf_url: selectedPdf.url,
|
|
provider,
|
|
model,
|
|
error: modelError
|
|
};
|
|
}
|
|
|
|
const entry = parsed[metricKey as string] ?? {};
|
|
const llmValue = asNumber(entry.value);
|
|
const pages = asPageNumbers(entry.pages);
|
|
const diff = diffStatus(taxonomyValue, llmValue);
|
|
|
|
return {
|
|
metric_key: metricKey,
|
|
taxonomy_value: taxonomyValue,
|
|
llm_value: llmValue,
|
|
absolute_diff: diff.absoluteDiff,
|
|
relative_diff: diff.relativeDiff,
|
|
status: diff.status,
|
|
evidence_pages: pages,
|
|
pdf_url: selectedPdf.url,
|
|
provider,
|
|
model,
|
|
error: null
|
|
};
|
|
});
|
|
|
|
const hasError = validations.some((entry) => entry.status === 'error');
|
|
const hasMismatch = validations.some((entry) => entry.status === 'mismatch');
|
|
|
|
return {
|
|
validation_result: {
|
|
status: hasError ? 'error' : hasMismatch ? 'mismatch' : 'matched',
|
|
checks: validations.map((check) => ({
|
|
metricKey: check.metric_key,
|
|
taxonomyValue: check.taxonomy_value,
|
|
llmValue: check.llm_value,
|
|
absoluteDiff: check.absolute_diff,
|
|
relativeDiff: check.relative_diff,
|
|
status: check.status,
|
|
evidencePages: check.evidence_pages,
|
|
pdfUrl: check.pdf_url,
|
|
provider: check.provider,
|
|
model: check.model,
|
|
error: check.error
|
|
})),
|
|
validatedAt: new Date().toISOString()
|
|
},
|
|
metric_validations: validations
|
|
};
|
|
}
|
|
|
|
// Internal helpers re-exported for unit tests only — not part of the public API.
export const __pdfValidationInternals = {
  parseValidationPayload,
  diffStatus
};