Implement dual-model filing pipeline with Ollama extraction
This commit is contained in:
@@ -1,4 +1,10 @@
|
||||
import type { Filing, Holding, Task } from '@/lib/types';
|
||||
import type {
|
||||
Filing,
|
||||
FilingExtraction,
|
||||
FilingExtractionMeta,
|
||||
Holding,
|
||||
Task
|
||||
} from '@/lib/types';
|
||||
import { runAiAnalysis } from '@/lib/server/ai';
|
||||
import { buildPortfolioSummary } from '@/lib/server/portfolio';
|
||||
import { getQuote } from '@/lib/server/prices';
|
||||
@@ -13,7 +19,23 @@ import {
|
||||
listUserHoldings
|
||||
} from '@/lib/server/repos/holdings';
|
||||
import { createPortfolioInsight } from '@/lib/server/repos/insights';
|
||||
import { fetchFilingMetrics, fetchRecentFilings } from '@/lib/server/sec';
|
||||
import {
|
||||
fetchFilingMetrics,
|
||||
fetchPrimaryFilingText,
|
||||
fetchRecentFilings
|
||||
} from '@/lib/server/sec';
|
||||
|
||||
/** Top-level keys an extraction payload must contain — exactly these, no more and no fewer. */
const EXTRACTION_REQUIRED_KEYS = [
  'summary',
  'keyPoints',
  'redFlags',
  'followUpQuestions',
  'portfolioSignals',
  'confidence'
] as const;

/** Upper bound on the number of entries kept in each extraction list. */
const EXTRACTION_MAX_ITEMS = 6;

/** Upper bound, in UTF-16 code units, on a single extraction list entry. */
const EXTRACTION_ITEM_MAX_LENGTH = 280;

/** Upper bound, in UTF-16 code units, on the extraction summary. */
const EXTRACTION_SUMMARY_MAX_LENGTH = 900;
|
||||
|
||||
function toTaskResult(value: unknown): Record<string, unknown> {
|
||||
if (!value || typeof value !== 'object' || Array.isArray(value)) {
|
||||
@@ -42,6 +64,173 @@ function parseLimit(raw: unknown, fallback: number, min: number, max: number) {
|
||||
return Math.min(Math.max(intValue, min), max);
|
||||
}
|
||||
|
||||
/**
 * Normalizes an untrusted value into a single-line, length-bounded string.
 *
 * Every run of whitespace is collapsed to one space and the result is trimmed.
 * When the text is longer than `maxLength` it is truncated, and the truncated
 * text is cleaned up so that it never ends in a dangling UTF-16 high surrogate
 * (half of an astral character such as an emoji) or in trailing whitespace.
 *
 * @param value - Candidate value; anything other than a string is rejected.
 * @param maxLength - Maximum length of the result in UTF-16 code units.
 * @returns The cleaned text, or `null` when the value is not a string or
 *   nothing is left after cleaning and truncation.
 */
function sanitizeExtractionText(value: unknown, maxLength: number): string | null {
  if (typeof value !== 'string') {
    return null;
  }

  const collapsed = value.replace(/\s+/g, ' ').trim();
  if (!collapsed) {
    return null;
  }

  if (collapsed.length <= maxLength) {
    return collapsed;
  }

  let truncated = collapsed.slice(0, maxLength);

  // slice() counts UTF-16 code units, so the cut may land between the two
  // halves of a surrogate pair; drop the orphaned high surrogate.
  const lastUnit = truncated.charCodeAt(truncated.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    truncated = truncated.slice(0, -1);
  }

  // The cut (or the surrogate removal) can expose a separator space at the end.
  truncated = truncated.trimEnd();

  return truncated.length > 0 ? truncated : null;
}
|
||||
|
||||
/**
 * Cleans a list of extraction entries.
 *
 * Each entry is passed through `sanitizeExtractionText` with the per-item
 * length limit; entries that come back empty are skipped. Collection stops as
 * soon as `EXTRACTION_MAX_ITEMS` entries have been kept.
 *
 * @param value - Candidate list; anything other than an array is rejected.
 * @returns The kept entries (possibly an empty array), or `null` when `value`
 *   is not an array.
 */
function sanitizeExtractionList(value: unknown) {
  if (!Array.isArray(value)) {
    return null;
  }

  const kept: string[] = [];

  for (let index = 0; index < value.length; index += 1) {
    const text = sanitizeExtractionText(value[index], EXTRACTION_ITEM_MAX_LENGTH);

    if (text) {
      kept.push(text);

      // Cap reached: ignore whatever remains in the input.
      if (kept.length >= EXTRACTION_MAX_ITEMS) {
        break;
      }
    }
  }

  return kept;
}
|
||||
|
||||
/**
 * Parses and validates the raw text returned by the extraction model.
 *
 * The JSON candidate is taken from the first Markdown code fence when one is
 * present; otherwise from the span between the first `{` and the last `}`.
 * The decoded value must be a plain object whose own keys are exactly
 * `EXTRACTION_REQUIRED_KEYS`. Text fields are cleaned with
 * `sanitizeExtractionText` / `sanitizeExtractionList`, and `confidence` is
 * clamped to the range 0..1.
 *
 * @param raw - Raw model output.
 * @returns The validated extraction, or `null` when no candidate is found, the
 *   candidate is not valid JSON, the key set does not match, or any field
 *   fails validation.
 */
function parseExtractionPayload(raw: string): FilingExtraction | null {
  const fencedJson = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1];
  const candidate = fencedJson ?? (() => {
    const start = raw.indexOf('{');
    const end = raw.lastIndexOf('}');
    return start >= 0 && end > start ? raw.slice(start, end + 1) : null;
  })();

  if (!candidate) {
    return null;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(candidate);
  } catch {
    return null;
  }

  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return null;
  }

  const payload = parsed as Record<string, unknown>;

  // Same number of own keys plus every required key present as an own key
  // means the key sets are identical, so no separate "extra key" scan is needed.
  if (Object.keys(payload).length !== EXTRACTION_REQUIRED_KEYS.length) {
    return null;
  }

  for (const key of EXTRACTION_REQUIRED_KEYS) {
    if (!Object.prototype.hasOwnProperty.call(payload, key)) {
      return null;
    }
  }

  const summary = sanitizeExtractionText(payload.summary, EXTRACTION_SUMMARY_MAX_LENGTH);
  const keyPoints = sanitizeExtractionList(payload.keyPoints);
  const redFlags = sanitizeExtractionList(payload.redFlags);
  const followUpQuestions = sanitizeExtractionList(payload.followUpQuestions);
  const portfolioSignals = sanitizeExtractionList(payload.portfolioSignals);

  // Accept only real numbers or non-blank numeric strings. A bare Number(...)
  // would turn null, '', false and [] into 0 and true into 1, letting a
  // malformed confidence pass as a valid score.
  const rawConfidence = payload.confidence;
  let confidence = Number.NaN;
  if (typeof rawConfidence === 'number') {
    confidence = rawConfidence;
  } else if (typeof rawConfidence === 'string' && rawConfidence.trim() !== '') {
    confidence = Number(rawConfidence);
  }

  if (
    !summary ||
    !keyPoints ||
    !redFlags ||
    !followUpQuestions ||
    !portfolioSignals ||
    !Number.isFinite(confidence)
  ) {
    return null;
  }

  return {
    summary,
    keyPoints,
    redFlags,
    followUpQuestions,
    portfolioSignals,
    confidence: Math.min(Math.max(confidence, 0), 1)
  };
}
|
||||
|
||||
/**
 * Formats one metric as a `"<label>: <value>"` line.
 *
 * @param label - Human-readable metric name.
 * @param value - Metric value; `null`, `undefined`, `NaN` and infinities are
 *   treated as missing.
 * @returns `"<label>: not reported"` for a missing value, otherwise the value
 *   rounded to the nearest integer and formatted with en-US digit grouping.
 */
function metricSnapshotLine(label: string, value: number | null | undefined): string {
  if (value === null || value === undefined || !Number.isFinite(value)) {
    return `${label}: not reported`;
  }

  // Math.round yields -0 for inputs in (-0.5, 0), and toLocaleString prints
  // that as "-0"; adding 0 normalizes negative zero to plain 0.
  const rounded = Math.round(value) + 0;

  return `${label}: ${rounded.toLocaleString('en-US')}`;
}
|
||||
|
||||
/**
 * Builds an extraction from filing metadata alone, without calling a model.
 *
 * Key points and red flags are derived from the filing's metrics through
 * `metricSnapshotLine`; the follow-up questions and portfolio signals are
 * fixed text. The result carries a fixed confidence of 0.2.
 *
 * @param filing - Filing whose metadata and metrics seed the extraction.
 * @returns A fully populated extraction built only from `filing`.
 */
function deterministicExtractionFallback(filing: Filing): FilingExtraction {
  const { metrics } = filing;

  const summary = `${filing.company_name} ${filing.filing_type} filed on ${filing.filing_date}. Deterministic extraction fallback used due unavailable or invalid local parsing output.`;

  const primaryDocumentNote = filing.primary_document
    ? 'Primary document is indexed and available for review.'
    : 'Primary document reference is unavailable in current filing metadata.';

  const keyPoints = [
    `${filing.filing_type} filing recorded for ${filing.ticker}.`,
    metricSnapshotLine('Revenue', metrics?.revenue),
    metricSnapshotLine('Net income', metrics?.netIncome),
    metricSnapshotLine('Total assets', metrics?.totalAssets)
  ];

  const redFlags = [
    metricSnapshotLine('Cash', metrics?.cash),
    metricSnapshotLine('Debt', metrics?.debt),
    primaryDocumentNote
  ];

  const followUpQuestions = [
    'What changed versus the prior filing in guidance, margins, or liquidity?',
    'Are any material risks under-emphasized relative to historical filings?',
    'Should portfolio exposure be adjusted before the next reporting cycle?'
  ];

  const portfolioSignals = [
    'Validate trend direction using at least two prior filings.',
    'Cross-check leverage and liquidity metrics against position sizing rules.',
    'Track language shifts around guidance or demand assumptions.'
  ];

  return {
    summary,
    keyPoints,
    redFlags,
    followUpQuestions,
    portfolioSignals,
    confidence: 0.2
  };
}
|
||||
|
||||
/**
 * Assembles the prompt that asks the model for a structured JSON extraction.
 *
 * The prompt has an instruction line, the filing's identifying details, the
 * required JSON shape, the size rules taken from the `EXTRACTION_*` limits,
 * and finally the filing text. Sections are separated by a blank line.
 *
 * @param filing - Filing whose company, ticker, form type and date are quoted.
 * @param filingText - Filing text appended verbatim as the last section.
 * @returns The complete prompt string.
 */
function extractionPrompt(filing: Filing, filingText: string) {
  const jsonShape =
    '{"summary":"string","keyPoints":["string"],"redFlags":["string"],"followUpQuestions":["string"],"portfolioSignals":["string"],"confidence":0}';
  const rules = `Rules: keyPoints/redFlags/followUpQuestions/portfolioSignals arrays max ${EXTRACTION_MAX_ITEMS} items; each item <= ${EXTRACTION_ITEM_MAX_LENGTH} chars; summary <= ${EXTRACTION_SUMMARY_MAX_LENGTH} chars; confidence between 0 and 1.`;

  const sections: string[] = [];
  sections.push('Extract structured signals from the SEC filing text.');
  sections.push(`Company: ${filing.company_name} (${filing.ticker})`);
  sections.push(`Form: ${filing.filing_type}`);
  sections.push(`Filed: ${filing.filing_date}`);
  sections.push('Return ONLY valid JSON with exactly these keys and no extra keys:');
  sections.push(jsonShape);
  sections.push(rules);
  sections.push('Filing text follows:');
  sections.push(filingText);

  return sections.join('\n\n');
}
|
||||
|
||||
/**
 * Assembles the prompt for the narrative report on a filing.
 *
 * The prompt quotes the filing's identifying details, its metrics serialized
 * as JSON (an empty object when there are none), and the structured extraction
 * serialized as JSON and labelled with the extraction's source. Lines are
 * separated by a single newline.
 *
 * @param filing - Filing being analyzed.
 * @param extraction - Structured extraction embedded as context.
 * @param extractionMeta - Metadata for the extraction; only `source` is used.
 * @returns The complete prompt string.
 */
function reportPrompt(
  filing: Filing,
  extraction: FilingExtraction,
  extractionMeta: FilingExtractionMeta
) {
  const metricsJson = JSON.stringify(filing.metrics ?? {});
  const extractionJson = JSON.stringify(extraction);

  return (
    'You are a fiscal research assistant focused on regulatory signals.' +
    '\n' +
    `Analyze this SEC filing from ${filing.company_name} (${filing.ticker}).` +
    '\n' +
    `Form: ${filing.filing_type}` +
    '\n' +
    `Filed: ${filing.filing_date}` +
    '\n' +
    `Metrics: ${metricsJson}` +
    '\n' +
    `Structured extraction context (${extractionMeta.source}): ${extractionJson}` +
    '\n' +
    'Return concise sections: Thesis, Red Flags, Follow-up Questions, Portfolio Impact.'
  );
}
|
||||
|
||||
function filingLinks(filing: {
|
||||
filingUrl: string | null;
|
||||
submissionUrl: string | null;
|
||||
@@ -134,27 +323,65 @@ async function processAnalyzeFiling(task: Task) {
|
||||
throw new Error(`Filing ${accessionNumber} not found`);
|
||||
}
|
||||
|
||||
const prompt = [
|
||||
'You are a fiscal research assistant focused on regulatory signals.',
|
||||
`Analyze this SEC filing from ${filing.company_name} (${filing.ticker}).`,
|
||||
`Form: ${filing.filing_type}`,
|
||||
`Filed: ${filing.filing_date}`,
|
||||
`Metrics: ${JSON.stringify(filing.metrics ?? {})}`,
|
||||
'Return concise sections: Thesis, Red Flags, Follow-up Questions, Portfolio Impact.'
|
||||
].join('\n');
|
||||
const defaultExtraction = deterministicExtractionFallback(filing);
|
||||
let extraction = defaultExtraction;
|
||||
let extractionMeta: FilingExtractionMeta = {
|
||||
provider: 'deterministic-fallback',
|
||||
model: 'metadata-fallback',
|
||||
source: 'metadata_fallback',
|
||||
generatedAt: new Date().toISOString()
|
||||
};
|
||||
|
||||
const analysis = await runAiAnalysis(prompt, 'Use concise institutional analyst language.');
|
||||
try {
|
||||
const filingDocument = await fetchPrimaryFilingText({
|
||||
filingUrl: filing.filing_url,
|
||||
cik: filing.cik,
|
||||
accessionNumber: filing.accession_number,
|
||||
primaryDocument: filing.primary_document ?? null
|
||||
});
|
||||
|
||||
if (filingDocument?.text) {
|
||||
const extractionResult = await runAiAnalysis(
|
||||
extractionPrompt(filing, filingDocument.text),
|
||||
'Return strict JSON only.',
|
||||
{ workload: 'extraction' }
|
||||
);
|
||||
|
||||
const parsed = parseExtractionPayload(extractionResult.text);
|
||||
if (parsed) {
|
||||
extraction = parsed;
|
||||
extractionMeta = {
|
||||
provider: extractionResult.provider === 'local-fallback' ? 'deterministic-fallback' : 'ollama',
|
||||
model: extractionResult.model,
|
||||
source: filingDocument.source,
|
||||
generatedAt: new Date().toISOString()
|
||||
};
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
extraction = defaultExtraction;
|
||||
}
|
||||
|
||||
const analysis = await runAiAnalysis(
|
||||
reportPrompt(filing, extraction, extractionMeta),
|
||||
'Use concise institutional analyst language.',
|
||||
{ workload: 'report' }
|
||||
);
|
||||
|
||||
await saveFilingAnalysis(accessionNumber, {
|
||||
provider: analysis.provider,
|
||||
model: analysis.model,
|
||||
text: analysis.text
|
||||
text: analysis.text,
|
||||
extraction,
|
||||
extractionMeta
|
||||
});
|
||||
|
||||
return {
|
||||
accessionNumber,
|
||||
provider: analysis.provider,
|
||||
model: analysis.model
|
||||
model: analysis.model,
|
||||
extractionProvider: extractionMeta.provider,
|
||||
extractionModel: extractionMeta.model
|
||||
};
|
||||
}
|
||||
|
||||
@@ -186,7 +413,11 @@ async function processPortfolioInsights(task: Task) {
|
||||
'Respond with: 1) health score (0-100), 2) top 3 risks, 3) top 3 opportunities, 4) next actions in 7 days.'
|
||||
].join('\n');
|
||||
|
||||
const analysis = await runAiAnalysis(prompt, 'Act as a risk-aware buy-side analyst.');
|
||||
const analysis = await runAiAnalysis(
|
||||
prompt,
|
||||
'Act as a risk-aware buy-side analyst.',
|
||||
{ workload: 'report' }
|
||||
);
|
||||
|
||||
await createPortfolioInsight({
|
||||
userId,
|
||||
@@ -202,6 +433,11 @@ async function processPortfolioInsights(task: Task) {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Module-private helpers re-exported under one object.
 * NOTE(review): presumably exposed for unit tests — confirm against callers.
 */
export const __taskProcessorInternals = {
  parseExtractionPayload: parseExtractionPayload,
  deterministicExtractionFallback: deterministicExtractionFallback
};
|
||||
|
||||
export async function runTaskProcessor(task: Task) {
|
||||
switch (task.task_type) {
|
||||
case 'sync_filings':
|
||||
|
||||
Reference in New Issue
Block a user