Implement dual-model filing pipeline with Ollama extraction

This commit is contained in:
2026-02-28 16:31:25 -05:00
parent 0615534f4b
commit a09001501e
16 changed files with 872 additions and 51 deletions

View File

@@ -2,6 +2,7 @@ import { beforeEach, describe, expect, it, mock } from 'bun:test';
import {
__resetAiWarningsForTests,
getAiConfig,
getExtractionAiConfig,
runAiAnalysis
} from './ai';
@@ -154,4 +155,65 @@ describe('ai config and runtime', () => {
})
).rejects.toThrow('AI SDK returned an empty response');
});
it('uses ollama defaults for extraction workload config', () => {
const config = getExtractionAiConfig({
env: {},
warn: () => {}
});
expect(config.provider).toBe('ollama');
expect(config.baseUrl).toBe('http://127.0.0.1:11434');
expect(config.model).toBe('qwen3:8b');
expect(config.apiKey).toBe('ollama');
expect(config.temperature).toBe(0);
});
it('uses extraction workload and returns ollama provider on success', async () => {
const createModel = mock((config: {
provider: string;
apiKey?: string;
model: string;
baseUrl: string;
temperature: number;
}) => {
expect(config.provider).toBe('ollama');
expect(config.baseUrl).toBe('http://127.0.0.1:11434');
expect(config.model).toBe('qwen3:8b');
expect(config.temperature).toBe(0);
return { modelId: config.model };
});
const generate = mock(async () => ({ text: '{"summary":"ok","keyPoints":[],"redFlags":[],"followUpQuestions":[],"portfolioSignals":[],"confidence":0.6}' }));
const result = await runAiAnalysis('Extract this filing', 'Return JSON', {
env: {
OLLAMA_MODEL: 'qwen3:8b'
},
warn: () => {},
workload: 'extraction',
createModel,
generate
});
expect(createModel).toHaveBeenCalledTimes(1);
expect(generate).toHaveBeenCalledTimes(1);
expect(result.provider).toBe('ollama');
expect(result.model).toBe('qwen3:8b');
});
it('falls back to local text when extraction workload generation fails', async () => {
const result = await runAiAnalysis('Extract this filing', 'Return JSON', {
env: {},
warn: () => {},
workload: 'extraction',
createModel: () => ({}),
generate: async () => {
throw new Error('ollama unavailable');
}
});
expect(result.provider).toBe('local-fallback');
expect(result.model).toBe('qwen3:8b');
expect(result.text).toContain('AI SDK fallback mode is active');
});
});

View File

@@ -1,7 +1,12 @@
import { createOpenAI } from '@ai-sdk/openai';
import { generateText } from 'ai';
import { createZhipu } from 'zhipu-ai-provider';
type AiWorkload = 'report' | 'extraction';
type AiProvider = 'zhipu' | 'ollama';
type AiConfig = {
provider: AiProvider;
apiKey?: string;
baseUrl: string;
model: string;
@@ -27,11 +32,15 @@ type AiGenerateOutput = {
};
type RunAiAnalysisOptions = GetAiConfigOptions & {
workload?: AiWorkload;
createModel?: (config: AiConfig) => unknown;
generate?: (input: AiGenerateInput) => Promise<AiGenerateOutput>;
};
const CODING_API_BASE_URL = 'https://api.z.ai/api/coding/paas/v4';
const OLLAMA_BASE_URL = 'http://127.0.0.1:11434';
const OLLAMA_MODEL = 'qwen3:8b';
const OLLAMA_API_KEY = 'ollama';
let warnedIgnoredZhipuBaseUrl = false;
@@ -74,20 +83,47 @@ function fallbackResponse(prompt: string) {
const clipped = prompt.split('\n').slice(0, 6).join(' ').slice(0, 260);
return [
'AI SDK fallback mode is active (Zhipu configuration is missing).',
'AI SDK fallback mode is active (live model configuration is missing or unavailable).',
'Thesis: Portfolio remains analyzable with local heuristics until live model access is configured.',
'Risk scan: Concentration and filing sentiment should be monitored after each sync cycle.',
`Context digest: ${clipped}`
].join('\n\n');
}
/**
 * Normalizes a configured base URL into the `/v1` root handed to the OpenAI-compatible client.
 *
 * Surrounding whitespace and every trailing slash are removed before the suffix check, so
 * `http://host`, `http://host/`, `http://host//` and `http://host/v1/` all resolve to
 * `http://host/v1`.
 *
 * @param baseUrl - Base URL as configured, with or without a trailing `/v1` segment.
 * @returns The URL ending in exactly one `/v1` segment and no trailing slash.
 */
function toOpenAiCompatibleBaseUrl(baseUrl: string) {
  // Strip all trailing slashes, not just one; otherwise "http://host//" would become "http://host//v1".
  const normalized = baseUrl.trim().replace(/\/+$/, '');

  return normalized.endsWith('/v1') ? normalized : `${normalized}/v1`;
}
/**
 * Converts an arbitrary thrown value into a human-readable message for logging.
 *
 * Resolution order:
 * 1. `Error` instances with a non-empty `message` return that message.
 * 2. Strings are returned unchanged.
 * 3. Other objects return their string `message` property if present, otherwise their JSON
 *    form (so a thrown `{ code: 1 }` is not reported as `[object Object]`).
 * 4. Anything else falls back to `String(error)`.
 *
 * @param error - The value caught from a `try`/`catch` or promise rejection.
 * @returns A message string; never throws.
 */
function asErrorMessage(error: unknown) {
  if (error instanceof Error && error.message) {
    return error.message;
  }

  if (typeof error === 'string') {
    return error;
  }

  if (error !== null && typeof error === 'object' && !(error instanceof Error)) {
    const message = (error as { message?: unknown }).message;
    if (typeof message === 'string' && message) {
      return message;
    }

    try {
      const serialized = JSON.stringify(error);
      // `{}` carries no more information than the default stringification, so skip it.
      if (serialized && serialized !== '{}') {
        return serialized;
      }
    } catch {
      // Circular structures and throwing `toJSON` hooks fall through to String().
    }
  }

  return String(error);
}
function defaultCreateModel(config: AiConfig) {
const zhipu = createZhipu({
apiKey: config.apiKey,
baseURL: config.baseUrl
if (config.provider === 'zhipu') {
const zhipu = createZhipu({
apiKey: config.apiKey,
baseURL: config.baseUrl
});
return zhipu(config.model);
}
const openai = createOpenAI({
apiKey: config.apiKey ?? OLLAMA_API_KEY,
baseURL: toOpenAiCompatibleBaseUrl(config.baseUrl)
});
return zhipu(config.model);
return openai.chat(config.model);
}
async function defaultGenerate(input: AiGenerateInput): Promise<AiGenerateOutput> {
@@ -102,10 +138,15 @@ async function defaultGenerate(input: AiGenerateInput): Promise<AiGenerateOutput
}
/**
 * Returns the AI configuration for the report workload.
 *
 * Delegates to `getReportAiConfig`, passing `options` through unchanged.
 */
export function getAiConfig(options?: GetAiConfigOptions) {
return getReportAiConfig(options);
}
export function getReportAiConfig(options?: GetAiConfigOptions) {
const env = options?.env ?? process.env;
warnIgnoredZhipuBaseUrl(env, options?.warn ?? console.warn);
return {
provider: 'zhipu',
apiKey: envValue('ZHIPU_API_KEY', env),
baseUrl: CODING_API_BASE_URL,
model: envValue('ZHIPU_MODEL', env) ?? 'glm-4.7-flashx',
@@ -113,15 +154,30 @@ export function getAiConfig(options?: GetAiConfigOptions) {
} satisfies AiConfig;
}
/**
 * Builds the model configuration for the extraction workload.
 *
 * `OLLAMA_API_KEY`, `OLLAMA_BASE_URL` and `OLLAMA_MODEL` are each looked up through `envValue`;
 * when a lookup yields null/undefined the matching module-level default is used instead.
 * Temperature is fixed at 0.
 *
 * @param options - Optional overrides; `options.env` replaces `process.env` as the lookup source.
 * @returns An `AiConfig` whose provider is `'ollama'`.
 */
export function getExtractionAiConfig(options?: GetAiConfigOptions) {
  const source = options?.env ?? process.env;

  const apiKey = envValue('OLLAMA_API_KEY', source) ?? OLLAMA_API_KEY;
  const baseUrl = envValue('OLLAMA_BASE_URL', source) ?? OLLAMA_BASE_URL;
  const model = envValue('OLLAMA_MODEL', source) ?? OLLAMA_MODEL;

  return { provider: 'ollama', apiKey, baseUrl, model, temperature: 0 } satisfies AiConfig;
}
export function isAiConfigured(options?: GetAiConfigOptions) {
const config = getAiConfig(options);
const config = getReportAiConfig(options);
return Boolean(config.apiKey);
}
export async function runAiAnalysis(prompt: string, systemPrompt?: string, options?: RunAiAnalysisOptions) {
const config = getAiConfig(options);
const workload = options?.workload ?? 'report';
const config = workload === 'extraction'
? getExtractionAiConfig(options)
: getReportAiConfig(options);
if (!config.apiKey) {
if (workload === 'report' && !config.apiKey) {
return {
provider: 'local-fallback',
model: config.model,
@@ -131,25 +187,49 @@ export async function runAiAnalysis(prompt: string, systemPrompt?: string, optio
const createModel = options?.createModel ?? defaultCreateModel;
const generate = options?.generate ?? defaultGenerate;
const model = createModel(config);
const warn = options?.warn ?? console.warn;
const result = await generate({
model,
system: systemPrompt,
prompt,
temperature: config.temperature
});
try {
const model = createModel(config);
const text = result.text.trim();
if (!text) {
throw new Error('AI SDK returned an empty response');
const result = await generate({
model,
system: systemPrompt,
prompt,
temperature: config.temperature
});
const text = result.text.trim();
if (!text) {
if (workload === 'extraction') {
return {
provider: 'local-fallback',
model: config.model,
text: fallbackResponse(prompt)
};
}
throw new Error('AI SDK returned an empty response');
}
return {
provider: config.provider,
model: config.model,
text
};
} catch (error) {
if (workload === 'extraction') {
warn(`[AI SDK] Extraction fallback activated: ${asErrorMessage(error)}`);
return {
provider: 'local-fallback',
model: config.model,
text: fallbackResponse(prompt)
};
}
throw error;
}
return {
provider: 'zhipu',
model: config.model,
text
};
}
export function __resetAiWarningsForTests() {

View File

@@ -4,6 +4,7 @@ import { auth } from '@/lib/auth';
import { requireAuthenticatedSession } from '@/lib/server/auth-session';
import { asErrorMessage, jsonError } from '@/lib/server/http';
import { buildPortfolioSummary } from '@/lib/server/portfolio';
import { redactInternalFilingAnalysisFields } from '@/lib/server/api/filing-redaction';
import { getFilingByAccession, listFilingsRecords } from '@/lib/server/repos/filings';
import {
deleteHoldingByIdRecord,
@@ -332,8 +333,9 @@ export const app = new Elysia({ prefix: '/api' })
getQuote(ticker),
getPriceHistory(ticker)
]);
const redactedFilings = filings.map(redactInternalFilingAnalysisFields);
const latestFiling = filings[0] ?? null;
const latestFiling = redactedFilings[0] ?? null;
const holding = holdings.find((entry) => entry.ticker === ticker) ?? null;
const watchlistItem = watchlist.find((entry) => entry.ticker === ticker) ?? null;
@@ -341,7 +343,7 @@ export const app = new Elysia({ prefix: '/api' })
?? watchlistItem?.company_name
?? ticker;
const financials = filings
const financials = redactedFilings
.filter((entry) => entry.metrics)
.map((entry) => ({
filingDate: entry.filing_date,
@@ -353,7 +355,7 @@ export const app = new Elysia({ prefix: '/api' })
debt: entry.metrics?.debt ?? null
}));
const aiReports = filings
const aiReports = redactedFilings
.filter((entry) => entry.analysis?.text || entry.analysis?.legacyInsights)
.slice(0, 8)
.map((entry) => ({
@@ -377,7 +379,7 @@ export const app = new Elysia({ prefix: '/api' })
position: holding,
priceHistory,
financials,
filings: filings.slice(0, 20),
filings: redactedFilings.slice(0, 20),
aiReports
}
});
@@ -446,7 +448,7 @@ export const app = new Elysia({ prefix: '/api' })
limit: Number.isFinite(limit) ? limit : 50
});
return Response.json({ filings });
return Response.json({ filings: filings.map(redactInternalFilingAnalysisFields) });
}, {
query: t.Object({
ticker: t.Optional(t.String()),

View File

@@ -0,0 +1,52 @@
import { describe, expect, it } from 'bun:test';
import type { Filing } from '@/lib/types';
import { redactInternalFilingAnalysisFields } from './filing-redaction';
/**
 * Test fixture: a 10-K filing whose analysis carries both the public report fields
 * (`provider`, `model`, `text`) and the `extraction` / `extractionMeta` entries.
 */
function filingWithExtraction(): Filing {
  const timestamp = '2026-02-01T00:00:00.000Z';

  // Annotated so literal-union members such as `source` keep their narrow types.
  const analysis: NonNullable<Filing['analysis']> = {
    provider: 'zhipu',
    model: 'glm-4.7-flashx',
    text: 'Report text',
    extraction: {
      summary: 'Internal extraction summary',
      keyPoints: ['a'],
      redFlags: ['b'],
      followUpQuestions: ['c'],
      portfolioSignals: ['d'],
      confidence: 0.4
    },
    extractionMeta: {
      provider: 'ollama',
      model: 'qwen3:8b',
      source: 'primary_document',
      generatedAt: timestamp
    }
  };

  return {
    id: 7,
    ticker: 'MSFT',
    filing_type: '10-K',
    filing_date: '2026-02-01',
    accession_number: '0000789019-26-000001',
    cik: '0000789019',
    company_name: 'Microsoft Corporation',
    filing_url: 'https://www.sec.gov/Archives/edgar/data/789019/000078901926000001/a10k.htm',
    submission_url: null,
    primary_document: 'a10k.htm',
    metrics: null,
    analysis,
    created_at: timestamp,
    updated_at: timestamp
  };
}
// Suite for redactInternalFilingAnalysisFields: the fixture's `extraction` and `extractionMeta`
// entries must be absent from the result while `provider`, `model` and `text` are unchanged.
describe('filing response redaction', () => {
it('removes internal extraction fields while preserving public analysis fields', () => {
const redacted = redactInternalFilingAnalysisFields(filingWithExtraction());
// Public report fields pass through untouched.
expect(redacted.analysis?.provider).toBe('zhipu');
expect(redacted.analysis?.model).toBe('glm-4.7-flashx');
expect(redacted.analysis?.text).toBe('Report text');
// Internal extraction fields are removed.
expect(redacted.analysis?.extraction).toBeUndefined();
expect(redacted.analysis?.extractionMeta).toBeUndefined();
});
});

View File

@@ -0,0 +1,15 @@
import type { Filing } from '@/lib/types';
/**
 * Returns a filing whose analysis no longer carries the `extraction` and `extractionMeta` entries.
 *
 * A filing without analysis is returned as-is (the same object). Otherwise a shallow copy is
 * produced; when no other analysis entries remain after removal, `analysis` is set to `null`.
 *
 * @param filing - Filing record to redact; it is not mutated.
 * @returns The original filing or a redacted shallow copy.
 */
export function redactInternalFilingAnalysisFields(filing: Filing): Filing {
  const { analysis } = filing;
  if (!analysis) {
    return filing;
  }

  const {
    extraction: _droppedExtraction,
    extractionMeta: _droppedExtractionMeta,
    ...publicAnalysis
  } = analysis;
  const remainingFieldCount = Object.keys(publicAnalysis).length;

  return {
    ...filing,
    analysis: remainingFieldCount > 0 ? publicAnalysis : null
  };
}

View File

@@ -20,6 +20,20 @@ type FilingAnalysis = {
model?: string;
text?: string;
legacyInsights?: string;
extraction?: {
summary: string;
keyPoints: string[];
redFlags: string[];
followUpQuestions: string[];
portfolioSignals: string[];
confidence: number;
};
extractionMeta?: {
provider: string;
model: string;
source: 'primary_document' | 'metadata_fallback';
generatedAt: string;
};
};
const authDateColumn = {

84
lib/server/sec.test.ts Normal file
View File

@@ -0,0 +1,84 @@
import { describe, expect, it, mock } from 'bun:test';
import {
fetchPrimaryFilingText,
normalizeSecDocumentText,
resolvePrimaryFilingUrl,
trimSecDocumentTextForPrompt
} from './sec';
// Suite covering the filing-text helpers exported from ./sec: HTML normalization,
// prompt-budget trimming, primary document URL resolution and the fetch pipeline.
describe('sec filing text helpers', () => {
// Script/style content must disappear and the entities used below must be decoded.
it('normalizes html filing content into plain text', () => {
const html = `
<html>
<head>
<style>.x { color: red; }</style>
<script>console.log("ignore")</script>
</head>
<body>
<h1>Quarterly&nbsp;Report</h1>
<p>Revenue &amp; margin improved.</p>
<div>See &#39;Risk Factors&#39; section.</div>
</body>
</html>
`;
const normalized = normalizeSecDocumentText(html);
expect(normalized).toContain('Quarterly Report');
expect(normalized).toContain('Revenue & margin improved.');
expect(normalized).toContain('See \'Risk Factors\' section.');
expect(normalized).not.toContain('<script>');
expect(normalized).not.toContain('console.log');
});
// Input has no whitespace at all, so the cut falls back to the hard character limit.
it('trims filing text to prompt budget boundaries', () => {
const text = `A`.repeat(4_500);
const result = trimSecDocumentTextForPrompt(text, 2_000);
expect(result.truncated).toBe(true);
expect(result.text.length).toBeLessThanOrEqual(2_000);
});
// A non-blank filingUrl wins over any URL that could be rebuilt from the other fields.
it('prefers explicit filing url when available', () => {
const url = resolvePrimaryFilingUrl({
filingUrl: 'https://www.sec.gov/Archives/edgar/data/123/x.htm',
cik: '123',
accessionNumber: '0000-00-00',
primaryDocument: 'x.htm'
});
expect(url).toBe('https://www.sec.gov/Archives/edgar/data/123/x.htm');
});
// Without filingUrl the path is rebuilt: CIK loses its zero padding, accession loses its dashes.
it('reconstructs primary filing url when filing url is absent', () => {
const url = resolvePrimaryFilingUrl({
filingUrl: null,
cik: '0000320193',
accessionNumber: '0000320193-24-000001',
primaryDocument: 'a10q.htm'
});
expect(url).toBe('https://www.sec.gov/Archives/edgar/data/320193/000032019324000001/a10q.htm');
});
// End-to-end with an injected fetch stub: one request, normalized output, clipped to maxChars.
it('fetches, normalizes, and clips primary filing text', async () => {
const longHtml = `<html><body><p>${'Alpha '.repeat(600)}</p></body></html>`;
const fetchImpl = mock(async () => new Response(longHtml, { status: 200 })) as unknown as typeof fetch;
const result = await fetchPrimaryFilingText({
filingUrl: null,
cik: '0000320193',
accessionNumber: '0000320193-24-000001',
primaryDocument: 'a10q.htm'
}, {
fetchImpl,
maxChars: 1_000
});
expect(fetchImpl).toHaveBeenCalledTimes(1);
expect(result).not.toBeNull();
expect(result?.source).toBe('primary_document');
expect(result?.truncated).toBe(true);
expect(result?.text.length).toBeLessThanOrEqual(1_000);
});
});

View File

@@ -39,8 +39,28 @@ type SecFiling = {
primaryDocument: string | null;
};
/** Fields used to locate a filing's primary document. */
type FilingDocumentInput = {
/** Explicit document URL; when non-blank it is used as-is instead of a reconstructed URL. */
filingUrl: string | null;
/** Company CIK; non-digit characters and zero padding are dropped when building the URL path. */
cik: string;
/** Accession number; dashes are removed when building the URL path. */
accessionNumber: string;
/** Primary document file name; required only when `filingUrl` is absent. */
primaryDocument: string | null;
};
/** Optional knobs for `fetchPrimaryFilingText`. */
type FetchPrimaryFilingTextOptions = {
/** Replacement for the global `fetch` (the tests inject a stub here). */
fetchImpl?: typeof fetch;
/** Character budget passed to `trimSecDocumentTextForPrompt`; defaults to `FILING_TEXT_MAX_CHARS`. */
maxChars?: number;
};
/** Result of fetching and normalizing a filing's primary document. */
export type FilingDocumentText = {
/** Origin marker; always `'primary_document'` for this type. */
source: 'primary_document';
/** URL the document was requested from. */
url: string;
/** Normalized plain text, clipped to the character budget. */
text: string;
/** True when the normalized text exceeded the budget and was clipped. */
truncated: boolean;
};
const SUPPORTED_FORMS: FilingType[] = ['10-K', '10-Q', '8-K'];
const TICKER_CACHE_TTL_MS = 1000 * 60 * 60 * 12;
const FILING_TEXT_MAX_CHARS = 24_000;
let tickerCache = new Map<string, TickerDirectoryRecord>();
let tickerCacheLoadedAt = 0;
@@ -53,6 +73,147 @@ function todayIso() {
return new Date().toISOString().slice(0, 10);
}
/**
 * Decodes a small set of HTML character references.
 *
 * Supported: the named entities `&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;` (case-insensitive),
 * decimal references such as `&#39;` and hexadecimal references such as `&#x27;`. Any other
 * entity is left untouched.
 *
 * All references are resolved in ONE pass, so text produced by a replacement is never scanned
 * again: `&amp;lt;` decodes to the literal text `&lt;`, not to `<`.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The decoded text. Code point 160 (non-breaking space) becomes a plain space, matching
 *   how `&nbsp;` is decoded; out-of-range and surrogate code points also become a plain space.
 */
function decodeHtmlEntities(value: string) {
  const namedEntities: Record<string, string> = {
    nbsp: ' ',
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"'
  };

  const decodeCodePoint = (code: number) => {
    const outOfRange = !Number.isFinite(code) || code < 0 || code > 0x10ffff;
    // A lone surrogate is not a valid character on its own.
    const isSurrogate = code >= 0xd800 && code <= 0xdfff;
    if (outOfRange || isSurrogate || code === 0xa0) {
      return ' ';
    }

    try {
      return String.fromCodePoint(code);
    } catch {
      return ' ';
    }
  };

  return value.replace(
    /&(?:(nbsp|amp|lt|gt|quot)|#x([0-9a-f]+)|#([0-9]+));/gi,
    (match: string, named?: string, hex?: string, decimal?: string) => {
      if (named !== undefined) {
        return namedEntities[named.toLowerCase()] ?? match;
      }
      if (hex !== undefined) {
        return decodeCodePoint(Number.parseInt(hex, 16));
      }
      if (decimal !== undefined) {
        return decodeCodePoint(Number.parseInt(decimal, 10));
      }
      return match;
    }
  );
}
/**
 * Reduces raw filing markup to plain text.
 *
 * Steps, in order:
 * 1. Line endings are unified: `\r\n` and lone `\r` each become a single `\n` (a CRLF pair must
 *    not turn into two newlines, which would fabricate paragraph breaks).
 * 2. `<script>`, `<style>`, `<noscript>` elements and HTML comments are removed with their content.
 * 3. Block-level tags (p, div, section, article, li, tr, td, th, h1-h6, br, hr) become newlines;
 *    every remaining tag becomes a space.
 * 4. HTML entities are decoded via `decodeHtmlEntities`.
 * 5. Whitespace is tidied: spaces/tabs around newlines are dropped, runs of spaces/tabs collapse
 *    to one space, three or more newlines collapse to two, and the result is trimmed.
 *
 * @param raw - Raw document body (HTML or plain text).
 * @returns Normalized plain text; an empty string when nothing remains.
 */
export function normalizeSecDocumentText(raw: string) {
  const withoutMarkup = raw
    .replace(/\r\n?/g, '\n')
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
    .replace(/<!--[\s\S]*?-->/g, ' ')
    .replace(/<\/?(p|div|section|article|li|tr|td|th|h[1-6]|br|hr)[^>]*>/gi, '\n')
    .replace(/<[^>]+>/g, ' ');

  // Entities are decoded only after tags are gone, so encoded markup such as `&lt;b&gt;` is
  // kept as visible text rather than stripped as a tag.
  return decodeHtmlEntities(withoutMarkup)
    .replace(/[ \t]+\n/g, '\n')
    .replace(/\n[ \t]+/g, '\n')
    .replace(/[ \t]{2,}/g, ' ')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
/**
 * Clips text to a character budget, preferring to cut at a newline or space.
 *
 * The effective budget is `maxChars` truncated to an integer with a floor of 1,000 characters.
 * A NaN budget is treated like any other unusable value and clamps to that floor (previously NaN
 * propagated through `Math.max` and produced an empty result flagged as truncated).
 * `Infinity` still means "no limit".
 *
 * When clipping is needed, the cut moves back to the last newline/space only if that boundary
 * lies beyond 70% of the budget; otherwise the hard limit is used. Trailing whitespace is removed.
 *
 * @param text - Text to clip.
 * @param maxChars - Requested character budget; defaults to `FILING_TEXT_MAX_CHARS`.
 * @returns The (possibly clipped) text and whether clipping happened.
 */
export function trimSecDocumentTextForPrompt(text: string, maxChars = FILING_TEXT_MAX_CHARS) {
  const requested = Number.isNaN(maxChars) ? 0 : Math.trunc(maxChars);
  const safeMax = Math.max(requested, 1_000);

  if (text.length <= safeMax) {
    return { text, truncated: false };
  }

  const slice = text.slice(0, safeMax);
  const boundary = Math.max(slice.lastIndexOf('\n'), slice.lastIndexOf(' '));
  // Only honour the boundary when it keeps most of the budget; a very early boundary would
  // throw away too much text.
  const clipped = (boundary > safeMax * 0.7 ? slice.slice(0, boundary) : slice).trimEnd();

  return { text: clipped, truncated: true };
}
/** Removes every dash from an accession number (e.g. `0000320193-24-000001` → `000032019324000001`). */
function compactAccessionNumber(value: string) {
  const segments = value.split('-');
  return segments.join('');
}
/**
 * Converts a CIK into the unpadded digit string used as a URL path segment.
 *
 * Non-digit characters are discarded and leading zeros are stripped, keeping a single `0`
 * when the value is all zeros.
 *
 * @param value - CIK in any textual form, e.g. `0000320193`.
 * @returns The unpadded digits, or `null` when the input contains no digits.
 */
function normalizeCikForPath(value: string) {
  const digits = value.replace(/\D/g, '');
  if (!digits) {
    return null;
  }

  // Strip the padding textually: a round-trip through Number loses precision beyond 2^53 and
  // switches to exponent notation for very long digit runs, which would corrupt the path.
  return digits.replace(/^0+(?=\d)/, '');
}
/**
 * Determines the URL of a filing's primary document.
 *
 * A non-blank `filingUrl` is returned trimmed and otherwise unchanged. Failing that, the URL is
 * rebuilt as `https://www.sec.gov/Archives/edgar/data/{cik}/{accession}/{primaryDocument}` from
 * the unpadded CIK and the dash-free accession number.
 *
 * @param input - Filing identifiers.
 * @returns The document URL, or `null` when there is neither an explicit URL nor enough data
 *   (primary document, CIK digits, accession number) to rebuild one.
 */
export function resolvePrimaryFilingUrl(input: FilingDocumentInput) {
  const explicitUrl = input.filingUrl?.trim();
  if (explicitUrl) {
    return explicitUrl;
  }

  const { primaryDocument } = input;
  if (!primaryDocument) {
    return null;
  }

  const cikSegment = normalizeCikForPath(input.cik);
  const accessionSegment = compactAccessionNumber(input.accessionNumber);
  if (cikSegment && accessionSegment) {
    return `https://www.sec.gov/Archives/edgar/data/${cikSegment}/${accessionSegment}/${primaryDocument}`;
  }

  return null;
}
/**
 * Downloads a filing's primary document and returns it as clipped plain text.
 *
 * The URL comes from `resolvePrimaryFilingUrl`; the body is passed through
 * `normalizeSecDocumentText` and then `trimSecDocumentTextForPrompt`.
 *
 * @param input - Filing identifiers used to resolve the document URL.
 * @param options - Optional `fetchImpl` (replaces global `fetch`) and `maxChars` budget
 *   (defaults to `FILING_TEXT_MAX_CHARS`).
 * @returns The document text with its URL and truncation flag, or `null` when no URL can be
 *   resolved or no text remains after normalization/clipping.
 * @throws Error when the response status is not OK; the message includes the status code.
 */
export async function fetchPrimaryFilingText(
  input: FilingDocumentInput,
  options?: FetchPrimaryFilingTextOptions
): Promise<FilingDocumentText | null> {
  const url = resolvePrimaryFilingUrl(input);
  if (!url) {
    return null;
  }

  const request = options?.fetchImpl ?? fetch;
  const response = await request(url, {
    headers: {
      'User-Agent': envUserAgent(),
      Accept: 'text/html, text/plain;q=0.9, */*;q=0.8'
    },
    cache: 'no-store'
  });
  if (!response.ok) {
    throw new Error(`SEC filing request failed (${response.status})`);
  }

  const normalized = normalizeSecDocumentText(await response.text());
  if (!normalized) {
    return null;
  }

  const budget = options?.maxChars ?? FILING_TEXT_MAX_CHARS;
  const { text, truncated } = trimSecDocumentTextForPrompt(normalized, budget);

  return text
    ? { source: 'primary_document', url, text, truncated }
    : null;
}
function pseudoMetric(seed: string, min: number, max: number) {
let hash = 0;
for (const char of seed) {

View File

@@ -0,0 +1,71 @@
import { describe, expect, it } from 'bun:test';
import type { Filing } from '@/lib/types';
import { __taskProcessorInternals } from './task-processors';
/** Test fixture: a 10-Q filing with all five metrics populated and no analysis yet. */
function sampleFiling(): Filing {
  const timestamp = '2026-01-30T00:00:00.000Z';

  const metrics: NonNullable<Filing['metrics']> = {
    revenue: 120_000_000_000,
    netIncome: 25_000_000_000,
    totalAssets: 410_000_000_000,
    cash: 70_000_000_000,
    debt: 98_000_000_000
  };

  return {
    id: 1,
    ticker: 'AAPL',
    filing_type: '10-Q',
    filing_date: '2026-01-30',
    accession_number: '0000320193-26-000001',
    cik: '0000320193',
    company_name: 'Apple Inc.',
    filing_url: 'https://www.sec.gov/Archives/edgar/data/320193/000032019326000001/a10q.htm',
    submission_url: 'https://data.sec.gov/submissions/CIK0000320193.json',
    primary_document: 'a10q.htm',
    metrics,
    analysis: null,
    created_at: timestamp,
    updated_at: timestamp
  };
}
// Suite for the helpers exposed through __taskProcessorInternals: strict parsing of the
// extraction JSON payload and the metadata-only fallback extraction.
describe('task processor extraction helpers', () => {
// A payload with exactly the six expected keys is accepted and its values are preserved.
it('parses strict extraction payloads', () => {
const raw = JSON.stringify({
summary: 'Revenue growth remained resilient despite FX pressure.',
keyPoints: ['Revenue up year-over-year'],
redFlags: ['Debt service burden is rising'],
followUpQuestions: ['Is margin guidance sustainable?'],
portfolioSignals: ['Monitor leverage trend'],
confidence: 0.72
});
const parsed = __taskProcessorInternals.parseExtractionPayload(raw);
expect(parsed).not.toBeNull();
expect(parsed?.summary).toContain('Revenue growth');
expect(parsed?.confidence).toBe(0.72);
});
// Any key beyond the expected six makes the whole payload invalid.
it('rejects extraction payloads with extra keys', () => {
const raw = JSON.stringify({
summary: 'ok',
keyPoints: [],
redFlags: [],
followUpQuestions: [],
portfolioSignals: [],
confidence: 0.2,
extra: 'not-allowed'
});
const parsed = __taskProcessorInternals.parseExtractionPayload(raw);
expect(parsed).toBeNull();
});
// The fallback is built from filing metadata only and reports a fixed confidence of 0.2.
it('builds deterministic extraction fallback from filing metadata', () => {
const fallback = __taskProcessorInternals.deterministicExtractionFallback(sampleFiling());
expect(fallback.summary).toContain('Deterministic extraction fallback');
expect(fallback.keyPoints.length).toBeGreaterThan(0);
expect(fallback.redFlags.length).toBeGreaterThan(0);
expect(fallback.confidence).toBe(0.2);
});
});

View File

@@ -1,4 +1,10 @@
import type { Filing, Holding, Task } from '@/lib/types';
import type {
Filing,
FilingExtraction,
FilingExtractionMeta,
Holding,
Task
} from '@/lib/types';
import { runAiAnalysis } from '@/lib/server/ai';
import { buildPortfolioSummary } from '@/lib/server/portfolio';
import { getQuote } from '@/lib/server/prices';
@@ -13,7 +19,23 @@ import {
listUserHoldings
} from '@/lib/server/repos/holdings';
import { createPortfolioInsight } from '@/lib/server/repos/insights';
import { fetchFilingMetrics, fetchRecentFilings } from '@/lib/server/sec';
import {
fetchFilingMetrics,
fetchPrimaryFilingText,
fetchRecentFilings
} from '@/lib/server/sec';
// The exact set of keys an extraction payload must contain; payloads with missing or
// additional keys are rejected by parseExtractionPayload.
const EXTRACTION_REQUIRED_KEYS = [
'summary',
'keyPoints',
'redFlags',
'followUpQuestions',
'portfolioSignals',
'confidence'
] as const;
// Maximum number of entries kept per extraction list.
const EXTRACTION_MAX_ITEMS = 6;
// Maximum length, in characters, of a single list entry.
const EXTRACTION_ITEM_MAX_LENGTH = 280;
// Maximum length, in characters, of the extraction summary.
const EXTRACTION_SUMMARY_MAX_LENGTH = 900;
function toTaskResult(value: unknown): Record<string, unknown> {
if (!value || typeof value !== 'object' || Array.isArray(value)) {
@@ -42,6 +64,173 @@ function parseLimit(raw: unknown, fallback: number, min: number, max: number) {
return Math.min(Math.max(intValue, min), max);
}
/**
 * Cleans one free-text value taken from a model-produced extraction payload.
 *
 * Whitespace runs (including newlines) collapse to a single space and the ends are trimmed.
 * Text longer than `maxLength` is cut to that many UTF-16 code units; the cut never leaves half
 * of a surrogate pair behind, and whitespace exposed at the cut is trimmed.
 *
 * @param value - Candidate value; anything that is not a string is rejected.
 * @param maxLength - Maximum length of the returned text in UTF-16 code units.
 * @returns The cleaned text, or `null` when the value is not a string or nothing usable remains.
 */
function sanitizeExtractionText(value: unknown, maxLength: number) {
  if (typeof value !== 'string') {
    return null;
  }

  const collapsed = value.replace(/\s+/g, ' ').trim();
  if (!collapsed) {
    return null;
  }

  if (collapsed.length <= maxLength) {
    return collapsed;
  }

  let clipped = collapsed.slice(0, maxLength);

  // A high surrogate at the very end means the cut landed inside a surrogate pair; drop it so
  // the stored text never ends in an unpaired surrogate.
  const lastUnit = clipped.charCodeAt(clipped.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    clipped = clipped.slice(0, -1);
  }

  // The cut may land right after a word separator.
  return clipped.trimEnd() || null;
}
/**
 * Cleans a list field from an extraction payload.
 *
 * Each entry goes through `sanitizeExtractionText` with the per-item length limit; entries that
 * come back empty are skipped, and collection stops once `EXTRACTION_MAX_ITEMS` entries are kept.
 *
 * @param value - Candidate list; anything that is not an array is rejected.
 * @returns The cleaned entries in their original order (possibly empty), or `null` for non-arrays.
 */
function sanitizeExtractionList(value: unknown) {
  if (!Array.isArray(value)) {
    return null;
  }

  const kept: string[] = [];
  for (let index = 0; index < value.length && kept.length < EXTRACTION_MAX_ITEMS; index += 1) {
    const text = sanitizeExtractionText(value[index], EXTRACTION_ITEM_MAX_LENGTH);
    if (text) {
      kept.push(text);
    }
  }

  return kept;
}
/**
 * Parses and validates the JSON extraction payload returned by the extraction model.
 *
 * The JSON text is taken from the first Markdown code fence if one exists, otherwise from the
 * span between the first `{` and the last `}`. The parsed value must be a plain object holding
 * exactly the keys in `EXTRACTION_REQUIRED_KEYS`. Text fields are cleaned with
 * `sanitizeExtractionText` / `sanitizeExtractionList`.
 *
 * `confidence` must be a finite number or a non-blank numeric string; it is clamped to [0, 1].
 * Values such as `null`, `true`, `''` or `[]` are rejected rather than coerced to 0 or 1.
 *
 * @param raw - Raw model output.
 * @returns The validated extraction, or `null` when the output is missing, malformed, has
 *   missing/extra keys, or contains an invalid field.
 */
function parseExtractionPayload(raw: string): FilingExtraction | null {
  const fencedJson = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1];
  const candidate = fencedJson ?? (() => {
    const start = raw.indexOf('{');
    const end = raw.lastIndexOf('}');
    return start >= 0 && end > start ? raw.slice(start, end + 1) : null;
  })();

  if (!candidate) {
    return null;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(candidate);
  } catch {
    return null;
  }

  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return null;
  }

  const payload = parsed as Record<string, unknown>;

  // Same key count plus every required key present as an own property means the key set matches
  // exactly, so no separate scan for extra keys is needed.
  const hasExactKeys =
    Object.keys(payload).length === EXTRACTION_REQUIRED_KEYS.length &&
    EXTRACTION_REQUIRED_KEYS.every((key) => Object.prototype.hasOwnProperty.call(payload, key));
  if (!hasExactKeys) {
    return null;
  }

  const summary = sanitizeExtractionText(payload.summary, EXTRACTION_SUMMARY_MAX_LENGTH);
  const keyPoints = sanitizeExtractionList(payload.keyPoints);
  const redFlags = sanitizeExtractionList(payload.redFlags);
  const followUpQuestions = sanitizeExtractionList(payload.followUpQuestions);
  const portfolioSignals = sanitizeExtractionList(payload.portfolioSignals);

  // Number() maps null, '', [] and false to 0 and true to 1; accept only values that are
  // genuinely numeric so an absent confidence is not silently reported as a real score.
  const rawConfidence = payload.confidence;
  let confidence = Number.NaN;
  if (typeof rawConfidence === 'number') {
    confidence = rawConfidence;
  } else if (typeof rawConfidence === 'string' && rawConfidence.trim() !== '') {
    confidence = Number(rawConfidence);
  }

  if (
    !summary ||
    !keyPoints ||
    !redFlags ||
    !followUpQuestions ||
    !portfolioSignals ||
    !Number.isFinite(confidence)
  ) {
    return null;
  }

  return {
    summary,
    keyPoints,
    redFlags,
    followUpQuestions,
    portfolioSignals,
    confidence: Math.min(Math.max(confidence, 0), 1)
  };
}
/**
 * Formats one metric as a `label: value` line.
 *
 * @param label - Metric name placed before the colon.
 * @param value - Metric value; `null`, `undefined`, NaN and ±Infinity count as missing.
 * @returns `"<label>: not reported"` for missing values, otherwise the value rounded to the
 *   nearest integer and formatted with en-US digit grouping.
 */
function metricSnapshotLine(label: string, value: number | null | undefined) {
  const isReported = typeof value === 'number' && Number.isFinite(value);

  return isReported
    ? `${label}: ${Math.round(value).toLocaleString('en-US')}`
    : `${label}: not reported`;
}
/**
 * Builds an extraction from filing metadata alone, with no model involved.
 *
 * The summary and lists are assembled from the filing's identifying fields and its metrics
 * (each metric rendered by `metricSnapshotLine`); the follow-up questions and portfolio signals
 * are fixed text. Confidence is always 0.2.
 *
 * @param filing - Filing record to describe.
 * @returns A fully populated `FilingExtraction`.
 */
function deterministicExtractionFallback(filing: Filing): FilingExtraction {
  const metrics = filing.metrics;

  const summary = `${filing.company_name} ${filing.filing_type} filed on ${filing.filing_date}. Deterministic extraction fallback used due unavailable or invalid local parsing output.`;

  const keyPoints = [
    `${filing.filing_type} filing recorded for ${filing.ticker}.`,
    metricSnapshotLine('Revenue', metrics?.revenue),
    metricSnapshotLine('Net income', metrics?.netIncome),
    metricSnapshotLine('Total assets', metrics?.totalAssets)
  ];

  const primaryDocumentNote = filing.primary_document
    ? 'Primary document is indexed and available for review.'
    : 'Primary document reference is unavailable in current filing metadata.';
  const redFlags = [
    metricSnapshotLine('Cash', metrics?.cash),
    metricSnapshotLine('Debt', metrics?.debt),
    primaryDocumentNote
  ];

  const followUpQuestions = [
    'What changed versus the prior filing in guidance, margins, or liquidity?',
    'Are any material risks under-emphasized relative to historical filings?',
    'Should portfolio exposure be adjusted before the next reporting cycle?'
  ];

  const portfolioSignals = [
    'Validate trend direction using at least two prior filings.',
    'Cross-check leverage and liquidity metrics against position sizing rules.',
    'Track language shifts around guidance or demand assumptions.'
  ];

  return { summary, keyPoints, redFlags, followUpQuestions, portfolioSignals, confidence: 0.2 };
}
/**
 * Assembles the prompt for the extraction workload.
 *
 * Sections are separated by blank lines: instruction, company/form/date header, the required
 * JSON shape, the size rules (taken from the `EXTRACTION_*` limits) and finally the filing text.
 *
 * @param filing - Filing whose identifying fields head the prompt.
 * @param filingText - Plain-text filing content appended as the last section.
 * @returns The complete prompt string.
 */
function extractionPrompt(filing: Filing, filingText: string) {
  const header = [
    'Extract structured signals from the SEC filing text.',
    `Company: ${filing.company_name} (${filing.ticker})`,
    `Form: ${filing.filing_type}`,
    `Filed: ${filing.filing_date}`
  ];

  const outputContract = [
    'Return ONLY valid JSON with exactly these keys and no extra keys:',
    '{"summary":"string","keyPoints":["string"],"redFlags":["string"],"followUpQuestions":["string"],"portfolioSignals":["string"],"confidence":0}',
    `Rules: keyPoints/redFlags/followUpQuestions/portfolioSignals arrays max ${EXTRACTION_MAX_ITEMS} items; each item <= ${EXTRACTION_ITEM_MAX_LENGTH} chars; summary <= ${EXTRACTION_SUMMARY_MAX_LENGTH} chars; confidence between 0 and 1.`
  ];

  const sections = [...header, ...outputContract, 'Filing text follows:', filingText];
  return sections.join('\n\n');
}
/**
 * Assembles the prompt for the report workload.
 *
 * Lines are newline-separated: role, filing header, metrics as JSON (`{}` when the filing has
 * none), the structured extraction as JSON labelled with its source, and the requested sections.
 *
 * @param filing - Filing being analysed.
 * @param extraction - Structured extraction embedded as JSON context.
 * @param extractionMeta - Extraction metadata; only `source` is used, as the context label.
 * @returns The complete prompt string.
 */
function reportPrompt(
  filing: Filing,
  extraction: FilingExtraction,
  extractionMeta: FilingExtractionMeta
) {
  const metricsJson = JSON.stringify(filing.metrics ?? {});
  const extractionJson = JSON.stringify(extraction);

  const lines = [
    'You are a fiscal research assistant focused on regulatory signals.',
    `Analyze this SEC filing from ${filing.company_name} (${filing.ticker}).`,
    `Form: ${filing.filing_type}`,
    `Filed: ${filing.filing_date}`,
    `Metrics: ${metricsJson}`,
    `Structured extraction context (${extractionMeta.source}): ${extractionJson}`,
    'Return concise sections: Thesis, Red Flags, Follow-up Questions, Portfolio Impact.'
  ];

  return lines.join('\n');
}
function filingLinks(filing: {
filingUrl: string | null;
submissionUrl: string | null;
@@ -134,27 +323,65 @@ async function processAnalyzeFiling(task: Task) {
throw new Error(`Filing ${accessionNumber} not found`);
}
const prompt = [
'You are a fiscal research assistant focused on regulatory signals.',
`Analyze this SEC filing from ${filing.company_name} (${filing.ticker}).`,
`Form: ${filing.filing_type}`,
`Filed: ${filing.filing_date}`,
`Metrics: ${JSON.stringify(filing.metrics ?? {})}`,
'Return concise sections: Thesis, Red Flags, Follow-up Questions, Portfolio Impact.'
].join('\n');
const defaultExtraction = deterministicExtractionFallback(filing);
let extraction = defaultExtraction;
let extractionMeta: FilingExtractionMeta = {
provider: 'deterministic-fallback',
model: 'metadata-fallback',
source: 'metadata_fallback',
generatedAt: new Date().toISOString()
};
const analysis = await runAiAnalysis(prompt, 'Use concise institutional analyst language.');
try {
const filingDocument = await fetchPrimaryFilingText({
filingUrl: filing.filing_url,
cik: filing.cik,
accessionNumber: filing.accession_number,
primaryDocument: filing.primary_document ?? null
});
if (filingDocument?.text) {
const extractionResult = await runAiAnalysis(
extractionPrompt(filing, filingDocument.text),
'Return strict JSON only.',
{ workload: 'extraction' }
);
const parsed = parseExtractionPayload(extractionResult.text);
if (parsed) {
extraction = parsed;
extractionMeta = {
provider: extractionResult.provider === 'local-fallback' ? 'deterministic-fallback' : 'ollama',
model: extractionResult.model,
source: filingDocument.source,
generatedAt: new Date().toISOString()
};
}
}
} catch {
extraction = defaultExtraction;
}
const analysis = await runAiAnalysis(
reportPrompt(filing, extraction, extractionMeta),
'Use concise institutional analyst language.',
{ workload: 'report' }
);
await saveFilingAnalysis(accessionNumber, {
provider: analysis.provider,
model: analysis.model,
text: analysis.text
text: analysis.text,
extraction,
extractionMeta
});
return {
accessionNumber,
provider: analysis.provider,
model: analysis.model
model: analysis.model,
extractionProvider: extractionMeta.provider,
extractionModel: extractionMeta.model
};
}
@@ -186,7 +413,11 @@ async function processPortfolioInsights(task: Task) {
'Respond with: 1) health score (0-100), 2) top 3 risks, 3) top 3 opportunities, 4) next actions in 7 days.'
].join('\n');
const analysis = await runAiAnalysis(prompt, 'Act as a risk-aware buy-side analyst.');
const analysis = await runAiAnalysis(
prompt,
'Act as a risk-aware buy-side analyst.',
{ workload: 'report' }
);
await createPortfolioInsight({
userId,
@@ -202,6 +433,11 @@ async function processPortfolioInsights(task: Task) {
};
}
/**
 * Exposes otherwise module-private helpers so the unit tests can call them directly.
 */
export const __taskProcessorInternals = {
parseExtractionPayload,
deterministicExtractionFallback
};
export async function runTaskProcessor(task: Task) {
switch (task.task_type) {
case 'sync_filings':