Implement dual-model filing pipeline with Ollama extraction
This commit is contained in:
@@ -2,6 +2,7 @@ import { beforeEach, describe, expect, it, mock } from 'bun:test';
|
||||
import {
|
||||
__resetAiWarningsForTests,
|
||||
getAiConfig,
|
||||
getExtractionAiConfig,
|
||||
runAiAnalysis
|
||||
} from './ai';
|
||||
|
||||
@@ -154,4 +155,65 @@ describe('ai config and runtime', () => {
|
||||
})
|
||||
).rejects.toThrow('AI SDK returned an empty response');
|
||||
});
|
||||
|
||||
it('uses ollama defaults for extraction workload config', () => {
|
||||
const config = getExtractionAiConfig({
|
||||
env: {},
|
||||
warn: () => {}
|
||||
});
|
||||
|
||||
expect(config.provider).toBe('ollama');
|
||||
expect(config.baseUrl).toBe('http://127.0.0.1:11434');
|
||||
expect(config.model).toBe('qwen3:8b');
|
||||
expect(config.apiKey).toBe('ollama');
|
||||
expect(config.temperature).toBe(0);
|
||||
});
|
||||
|
||||
it('uses extraction workload and returns ollama provider on success', async () => {
|
||||
const createModel = mock((config: {
|
||||
provider: string;
|
||||
apiKey?: string;
|
||||
model: string;
|
||||
baseUrl: string;
|
||||
temperature: number;
|
||||
}) => {
|
||||
expect(config.provider).toBe('ollama');
|
||||
expect(config.baseUrl).toBe('http://127.0.0.1:11434');
|
||||
expect(config.model).toBe('qwen3:8b');
|
||||
expect(config.temperature).toBe(0);
|
||||
return { modelId: config.model };
|
||||
});
|
||||
const generate = mock(async () => ({ text: '{"summary":"ok","keyPoints":[],"redFlags":[],"followUpQuestions":[],"portfolioSignals":[],"confidence":0.6}' }));
|
||||
|
||||
const result = await runAiAnalysis('Extract this filing', 'Return JSON', {
|
||||
env: {
|
||||
OLLAMA_MODEL: 'qwen3:8b'
|
||||
},
|
||||
warn: () => {},
|
||||
workload: 'extraction',
|
||||
createModel,
|
||||
generate
|
||||
});
|
||||
|
||||
expect(createModel).toHaveBeenCalledTimes(1);
|
||||
expect(generate).toHaveBeenCalledTimes(1);
|
||||
expect(result.provider).toBe('ollama');
|
||||
expect(result.model).toBe('qwen3:8b');
|
||||
});
|
||||
|
||||
it('falls back to local text when extraction workload generation fails', async () => {
|
||||
const result = await runAiAnalysis('Extract this filing', 'Return JSON', {
|
||||
env: {},
|
||||
warn: () => {},
|
||||
workload: 'extraction',
|
||||
createModel: () => ({}),
|
||||
generate: async () => {
|
||||
throw new Error('ollama unavailable');
|
||||
}
|
||||
});
|
||||
|
||||
expect(result.provider).toBe('local-fallback');
|
||||
expect(result.model).toBe('qwen3:8b');
|
||||
expect(result.text).toContain('AI SDK fallback mode is active');
|
||||
});
|
||||
});
|
||||
|
||||
128
lib/server/ai.ts
128
lib/server/ai.ts
@@ -1,7 +1,12 @@
|
||||
import { createOpenAI } from '@ai-sdk/openai';
|
||||
import { generateText } from 'ai';
|
||||
import { createZhipu } from 'zhipu-ai-provider';
|
||||
|
||||
type AiWorkload = 'report' | 'extraction';
|
||||
type AiProvider = 'zhipu' | 'ollama';
|
||||
|
||||
type AiConfig = {
|
||||
provider: AiProvider;
|
||||
apiKey?: string;
|
||||
baseUrl: string;
|
||||
model: string;
|
||||
@@ -27,11 +32,15 @@ type AiGenerateOutput = {
|
||||
};
|
||||
|
||||
type RunAiAnalysisOptions = GetAiConfigOptions & {
|
||||
workload?: AiWorkload;
|
||||
createModel?: (config: AiConfig) => unknown;
|
||||
generate?: (input: AiGenerateInput) => Promise<AiGenerateOutput>;
|
||||
};
|
||||
|
||||
const CODING_API_BASE_URL = 'https://api.z.ai/api/coding/paas/v4';
|
||||
const OLLAMA_BASE_URL = 'http://127.0.0.1:11434';
|
||||
const OLLAMA_MODEL = 'qwen3:8b';
|
||||
const OLLAMA_API_KEY = 'ollama';
|
||||
|
||||
let warnedIgnoredZhipuBaseUrl = false;
|
||||
|
||||
@@ -74,20 +83,47 @@ function fallbackResponse(prompt: string) {
|
||||
const clipped = prompt.split('\n').slice(0, 6).join(' ').slice(0, 260);
|
||||
|
||||
return [
|
||||
'AI SDK fallback mode is active (Zhipu configuration is missing).',
|
||||
'AI SDK fallback mode is active (live model configuration is missing or unavailable).',
|
||||
'Thesis: Portfolio remains analyzable with local heuristics until live model access is configured.',
|
||||
'Risk scan: Concentration and filing sentiment should be monitored after each sync cycle.',
|
||||
`Context digest: ${clipped}`
|
||||
].join('\n\n');
|
||||
}
|
||||
|
||||
function toOpenAiCompatibleBaseUrl(baseUrl: string) {
|
||||
const normalized = baseUrl.endsWith('/')
|
||||
? baseUrl.slice(0, -1)
|
||||
: baseUrl;
|
||||
|
||||
return normalized.endsWith('/v1')
|
||||
? normalized
|
||||
: `${normalized}/v1`;
|
||||
}
|
||||
|
||||
function asErrorMessage(error: unknown) {
|
||||
if (error instanceof Error && error.message) {
|
||||
return error.message;
|
||||
}
|
||||
|
||||
return String(error);
|
||||
}
|
||||
|
||||
function defaultCreateModel(config: AiConfig) {
|
||||
const zhipu = createZhipu({
|
||||
apiKey: config.apiKey,
|
||||
baseURL: config.baseUrl
|
||||
if (config.provider === 'zhipu') {
|
||||
const zhipu = createZhipu({
|
||||
apiKey: config.apiKey,
|
||||
baseURL: config.baseUrl
|
||||
});
|
||||
|
||||
return zhipu(config.model);
|
||||
}
|
||||
|
||||
const openai = createOpenAI({
|
||||
apiKey: config.apiKey ?? OLLAMA_API_KEY,
|
||||
baseURL: toOpenAiCompatibleBaseUrl(config.baseUrl)
|
||||
});
|
||||
|
||||
return zhipu(config.model);
|
||||
return openai.chat(config.model);
|
||||
}
|
||||
|
||||
async function defaultGenerate(input: AiGenerateInput): Promise<AiGenerateOutput> {
|
||||
@@ -102,10 +138,15 @@ async function defaultGenerate(input: AiGenerateInput): Promise<AiGenerateOutput
|
||||
}
|
||||
|
||||
export function getAiConfig(options?: GetAiConfigOptions) {
|
||||
return getReportAiConfig(options);
|
||||
}
|
||||
|
||||
export function getReportAiConfig(options?: GetAiConfigOptions) {
|
||||
const env = options?.env ?? process.env;
|
||||
warnIgnoredZhipuBaseUrl(env, options?.warn ?? console.warn);
|
||||
|
||||
return {
|
||||
provider: 'zhipu',
|
||||
apiKey: envValue('ZHIPU_API_KEY', env),
|
||||
baseUrl: CODING_API_BASE_URL,
|
||||
model: envValue('ZHIPU_MODEL', env) ?? 'glm-4.7-flashx',
|
||||
@@ -113,15 +154,30 @@ export function getAiConfig(options?: GetAiConfigOptions) {
|
||||
} satisfies AiConfig;
|
||||
}
|
||||
|
||||
export function getExtractionAiConfig(options?: GetAiConfigOptions) {
|
||||
const env = options?.env ?? process.env;
|
||||
|
||||
return {
|
||||
provider: 'ollama',
|
||||
apiKey: envValue('OLLAMA_API_KEY', env) ?? OLLAMA_API_KEY,
|
||||
baseUrl: envValue('OLLAMA_BASE_URL', env) ?? OLLAMA_BASE_URL,
|
||||
model: envValue('OLLAMA_MODEL', env) ?? OLLAMA_MODEL,
|
||||
temperature: 0
|
||||
} satisfies AiConfig;
|
||||
}
|
||||
|
||||
export function isAiConfigured(options?: GetAiConfigOptions) {
|
||||
const config = getAiConfig(options);
|
||||
const config = getReportAiConfig(options);
|
||||
return Boolean(config.apiKey);
|
||||
}
|
||||
|
||||
export async function runAiAnalysis(prompt: string, systemPrompt?: string, options?: RunAiAnalysisOptions) {
|
||||
const config = getAiConfig(options);
|
||||
const workload = options?.workload ?? 'report';
|
||||
const config = workload === 'extraction'
|
||||
? getExtractionAiConfig(options)
|
||||
: getReportAiConfig(options);
|
||||
|
||||
if (!config.apiKey) {
|
||||
if (workload === 'report' && !config.apiKey) {
|
||||
return {
|
||||
provider: 'local-fallback',
|
||||
model: config.model,
|
||||
@@ -131,25 +187,49 @@ export async function runAiAnalysis(prompt: string, systemPrompt?: string, optio
|
||||
|
||||
const createModel = options?.createModel ?? defaultCreateModel;
|
||||
const generate = options?.generate ?? defaultGenerate;
|
||||
const model = createModel(config);
|
||||
const warn = options?.warn ?? console.warn;
|
||||
|
||||
const result = await generate({
|
||||
model,
|
||||
system: systemPrompt,
|
||||
prompt,
|
||||
temperature: config.temperature
|
||||
});
|
||||
try {
|
||||
const model = createModel(config);
|
||||
|
||||
const text = result.text.trim();
|
||||
if (!text) {
|
||||
throw new Error('AI SDK returned an empty response');
|
||||
const result = await generate({
|
||||
model,
|
||||
system: systemPrompt,
|
||||
prompt,
|
||||
temperature: config.temperature
|
||||
});
|
||||
|
||||
const text = result.text.trim();
|
||||
if (!text) {
|
||||
if (workload === 'extraction') {
|
||||
return {
|
||||
provider: 'local-fallback',
|
||||
model: config.model,
|
||||
text: fallbackResponse(prompt)
|
||||
};
|
||||
}
|
||||
|
||||
throw new Error('AI SDK returned an empty response');
|
||||
}
|
||||
|
||||
return {
|
||||
provider: config.provider,
|
||||
model: config.model,
|
||||
text
|
||||
};
|
||||
} catch (error) {
|
||||
if (workload === 'extraction') {
|
||||
warn(`[AI SDK] Extraction fallback activated: ${asErrorMessage(error)}`);
|
||||
|
||||
return {
|
||||
provider: 'local-fallback',
|
||||
model: config.model,
|
||||
text: fallbackResponse(prompt)
|
||||
};
|
||||
}
|
||||
|
||||
throw error;
|
||||
}
|
||||
|
||||
return {
|
||||
provider: 'zhipu',
|
||||
model: config.model,
|
||||
text
|
||||
};
|
||||
}
|
||||
|
||||
export function __resetAiWarningsForTests() {
|
||||
|
||||
@@ -4,6 +4,7 @@ import { auth } from '@/lib/auth';
|
||||
import { requireAuthenticatedSession } from '@/lib/server/auth-session';
|
||||
import { asErrorMessage, jsonError } from '@/lib/server/http';
|
||||
import { buildPortfolioSummary } from '@/lib/server/portfolio';
|
||||
import { redactInternalFilingAnalysisFields } from '@/lib/server/api/filing-redaction';
|
||||
import { getFilingByAccession, listFilingsRecords } from '@/lib/server/repos/filings';
|
||||
import {
|
||||
deleteHoldingByIdRecord,
|
||||
@@ -332,8 +333,9 @@ export const app = new Elysia({ prefix: '/api' })
|
||||
getQuote(ticker),
|
||||
getPriceHistory(ticker)
|
||||
]);
|
||||
const redactedFilings = filings.map(redactInternalFilingAnalysisFields);
|
||||
|
||||
const latestFiling = filings[0] ?? null;
|
||||
const latestFiling = redactedFilings[0] ?? null;
|
||||
const holding = holdings.find((entry) => entry.ticker === ticker) ?? null;
|
||||
const watchlistItem = watchlist.find((entry) => entry.ticker === ticker) ?? null;
|
||||
|
||||
@@ -341,7 +343,7 @@ export const app = new Elysia({ prefix: '/api' })
|
||||
?? watchlistItem?.company_name
|
||||
?? ticker;
|
||||
|
||||
const financials = filings
|
||||
const financials = redactedFilings
|
||||
.filter((entry) => entry.metrics)
|
||||
.map((entry) => ({
|
||||
filingDate: entry.filing_date,
|
||||
@@ -353,7 +355,7 @@ export const app = new Elysia({ prefix: '/api' })
|
||||
debt: entry.metrics?.debt ?? null
|
||||
}));
|
||||
|
||||
const aiReports = filings
|
||||
const aiReports = redactedFilings
|
||||
.filter((entry) => entry.analysis?.text || entry.analysis?.legacyInsights)
|
||||
.slice(0, 8)
|
||||
.map((entry) => ({
|
||||
@@ -377,7 +379,7 @@ export const app = new Elysia({ prefix: '/api' })
|
||||
position: holding,
|
||||
priceHistory,
|
||||
financials,
|
||||
filings: filings.slice(0, 20),
|
||||
filings: redactedFilings.slice(0, 20),
|
||||
aiReports
|
||||
}
|
||||
});
|
||||
@@ -446,7 +448,7 @@ export const app = new Elysia({ prefix: '/api' })
|
||||
limit: Number.isFinite(limit) ? limit : 50
|
||||
});
|
||||
|
||||
return Response.json({ filings });
|
||||
return Response.json({ filings: filings.map(redactInternalFilingAnalysisFields) });
|
||||
}, {
|
||||
query: t.Object({
|
||||
ticker: t.Optional(t.String()),
|
||||
|
||||
52
lib/server/api/filing-redaction.test.ts
Normal file
52
lib/server/api/filing-redaction.test.ts
Normal file
@@ -0,0 +1,52 @@
|
||||
import { describe, expect, it } from 'bun:test';
|
||||
import type { Filing } from '@/lib/types';
|
||||
import { redactInternalFilingAnalysisFields } from './filing-redaction';
|
||||
|
||||
function filingWithExtraction(): Filing {
|
||||
return {
|
||||
id: 7,
|
||||
ticker: 'MSFT',
|
||||
filing_type: '10-K',
|
||||
filing_date: '2026-02-01',
|
||||
accession_number: '0000789019-26-000001',
|
||||
cik: '0000789019',
|
||||
company_name: 'Microsoft Corporation',
|
||||
filing_url: 'https://www.sec.gov/Archives/edgar/data/789019/000078901926000001/a10k.htm',
|
||||
submission_url: null,
|
||||
primary_document: 'a10k.htm',
|
||||
metrics: null,
|
||||
analysis: {
|
||||
provider: 'zhipu',
|
||||
model: 'glm-4.7-flashx',
|
||||
text: 'Report text',
|
||||
extraction: {
|
||||
summary: 'Internal extraction summary',
|
||||
keyPoints: ['a'],
|
||||
redFlags: ['b'],
|
||||
followUpQuestions: ['c'],
|
||||
portfolioSignals: ['d'],
|
||||
confidence: 0.4
|
||||
},
|
||||
extractionMeta: {
|
||||
provider: 'ollama',
|
||||
model: 'qwen3:8b',
|
||||
source: 'primary_document',
|
||||
generatedAt: '2026-02-01T00:00:00.000Z'
|
||||
}
|
||||
},
|
||||
created_at: '2026-02-01T00:00:00.000Z',
|
||||
updated_at: '2026-02-01T00:00:00.000Z'
|
||||
};
|
||||
}
|
||||
|
||||
describe('filing response redaction', () => {
|
||||
it('removes internal extraction fields while preserving public analysis fields', () => {
|
||||
const redacted = redactInternalFilingAnalysisFields(filingWithExtraction());
|
||||
|
||||
expect(redacted.analysis?.provider).toBe('zhipu');
|
||||
expect(redacted.analysis?.model).toBe('glm-4.7-flashx');
|
||||
expect(redacted.analysis?.text).toBe('Report text');
|
||||
expect(redacted.analysis?.extraction).toBeUndefined();
|
||||
expect(redacted.analysis?.extractionMeta).toBeUndefined();
|
||||
});
|
||||
});
|
||||
15
lib/server/api/filing-redaction.ts
Normal file
15
lib/server/api/filing-redaction.ts
Normal file
@@ -0,0 +1,15 @@
|
||||
import type { Filing } from '@/lib/types';
|
||||
|
||||
export function redactInternalFilingAnalysisFields(filing: Filing): Filing {
|
||||
if (!filing.analysis) {
|
||||
return filing;
|
||||
}
|
||||
|
||||
const { extraction: _extraction, extractionMeta: _extractionMeta, ...analysis } = filing.analysis;
|
||||
const hasPublicFields = Object.keys(analysis).length > 0;
|
||||
|
||||
return {
|
||||
...filing,
|
||||
analysis: hasPublicFields ? analysis : null
|
||||
};
|
||||
}
|
||||
@@ -20,6 +20,20 @@ type FilingAnalysis = {
|
||||
model?: string;
|
||||
text?: string;
|
||||
legacyInsights?: string;
|
||||
extraction?: {
|
||||
summary: string;
|
||||
keyPoints: string[];
|
||||
redFlags: string[];
|
||||
followUpQuestions: string[];
|
||||
portfolioSignals: string[];
|
||||
confidence: number;
|
||||
};
|
||||
extractionMeta?: {
|
||||
provider: string;
|
||||
model: string;
|
||||
source: 'primary_document' | 'metadata_fallback';
|
||||
generatedAt: string;
|
||||
};
|
||||
};
|
||||
|
||||
const authDateColumn = {
|
||||
|
||||
84
lib/server/sec.test.ts
Normal file
84
lib/server/sec.test.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
import { describe, expect, it, mock } from 'bun:test';
|
||||
import {
|
||||
fetchPrimaryFilingText,
|
||||
normalizeSecDocumentText,
|
||||
resolvePrimaryFilingUrl,
|
||||
trimSecDocumentTextForPrompt
|
||||
} from './sec';
|
||||
|
||||
describe('sec filing text helpers', () => {
|
||||
it('normalizes html filing content into plain text', () => {
|
||||
const html = `
|
||||
<html>
|
||||
<head>
|
||||
<style>.x { color: red; }</style>
|
||||
<script>console.log("ignore")</script>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Quarterly Report</h1>
|
||||
<p>Revenue & margin improved.</p>
|
||||
<div>See 'Risk Factors' section.</div>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
const normalized = normalizeSecDocumentText(html);
|
||||
|
||||
expect(normalized).toContain('Quarterly Report');
|
||||
expect(normalized).toContain('Revenue & margin improved.');
|
||||
expect(normalized).toContain('See \'Risk Factors\' section.');
|
||||
expect(normalized).not.toContain('<script>');
|
||||
expect(normalized).not.toContain('console.log');
|
||||
});
|
||||
|
||||
it('trims filing text to prompt budget boundaries', () => {
|
||||
const text = `A`.repeat(4_500);
|
||||
const result = trimSecDocumentTextForPrompt(text, 2_000);
|
||||
|
||||
expect(result.truncated).toBe(true);
|
||||
expect(result.text.length).toBeLessThanOrEqual(2_000);
|
||||
});
|
||||
|
||||
it('prefers explicit filing url when available', () => {
|
||||
const url = resolvePrimaryFilingUrl({
|
||||
filingUrl: 'https://www.sec.gov/Archives/edgar/data/123/x.htm',
|
||||
cik: '123',
|
||||
accessionNumber: '0000-00-00',
|
||||
primaryDocument: 'x.htm'
|
||||
});
|
||||
|
||||
expect(url).toBe('https://www.sec.gov/Archives/edgar/data/123/x.htm');
|
||||
});
|
||||
|
||||
it('reconstructs primary filing url when filing url is absent', () => {
|
||||
const url = resolvePrimaryFilingUrl({
|
||||
filingUrl: null,
|
||||
cik: '0000320193',
|
||||
accessionNumber: '0000320193-24-000001',
|
||||
primaryDocument: 'a10q.htm'
|
||||
});
|
||||
|
||||
expect(url).toBe('https://www.sec.gov/Archives/edgar/data/320193/000032019324000001/a10q.htm');
|
||||
});
|
||||
|
||||
it('fetches, normalizes, and clips primary filing text', async () => {
|
||||
const longHtml = `<html><body><p>${'Alpha '.repeat(600)}</p></body></html>`;
|
||||
const fetchImpl = mock(async () => new Response(longHtml, { status: 200 })) as unknown as typeof fetch;
|
||||
|
||||
const result = await fetchPrimaryFilingText({
|
||||
filingUrl: null,
|
||||
cik: '0000320193',
|
||||
accessionNumber: '0000320193-24-000001',
|
||||
primaryDocument: 'a10q.htm'
|
||||
}, {
|
||||
fetchImpl,
|
||||
maxChars: 1_000
|
||||
});
|
||||
|
||||
expect(fetchImpl).toHaveBeenCalledTimes(1);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.source).toBe('primary_document');
|
||||
expect(result?.truncated).toBe(true);
|
||||
expect(result?.text.length).toBeLessThanOrEqual(1_000);
|
||||
});
|
||||
});
|
||||
@@ -39,8 +39,28 @@ type SecFiling = {
|
||||
primaryDocument: string | null;
|
||||
};
|
||||
|
||||
type FilingDocumentInput = {
|
||||
filingUrl: string | null;
|
||||
cik: string;
|
||||
accessionNumber: string;
|
||||
primaryDocument: string | null;
|
||||
};
|
||||
|
||||
type FetchPrimaryFilingTextOptions = {
|
||||
fetchImpl?: typeof fetch;
|
||||
maxChars?: number;
|
||||
};
|
||||
|
||||
export type FilingDocumentText = {
|
||||
source: 'primary_document';
|
||||
url: string;
|
||||
text: string;
|
||||
truncated: boolean;
|
||||
};
|
||||
|
||||
const SUPPORTED_FORMS: FilingType[] = ['10-K', '10-Q', '8-K'];
|
||||
const TICKER_CACHE_TTL_MS = 1000 * 60 * 60 * 12;
|
||||
const FILING_TEXT_MAX_CHARS = 24_000;
|
||||
|
||||
let tickerCache = new Map<string, TickerDirectoryRecord>();
|
||||
let tickerCacheLoadedAt = 0;
|
||||
@@ -53,6 +73,147 @@ function todayIso() {
|
||||
return new Date().toISOString().slice(0, 10);
|
||||
}
|
||||
|
||||
function decodeHtmlEntities(value: string) {
|
||||
const decodeCodePoint = (code: number) => {
|
||||
if (!Number.isFinite(code) || code < 0 || code > 0x10ffff) {
|
||||
return ' ';
|
||||
}
|
||||
|
||||
try {
|
||||
return String.fromCodePoint(code);
|
||||
} catch {
|
||||
return ' ';
|
||||
}
|
||||
};
|
||||
|
||||
return value
|
||||
.replace(/ | /gi, ' ')
|
||||
.replace(/&/gi, '&')
|
||||
.replace(/</gi, '<')
|
||||
.replace(/>/gi, '>')
|
||||
.replace(/"/gi, '"')
|
||||
.replace(/'/gi, '\'')
|
||||
.replace(/&#x([0-9a-f]+);/gi, (_match, rawCode: string) => {
|
||||
const code = Number.parseInt(rawCode, 16);
|
||||
return decodeCodePoint(code);
|
||||
})
|
||||
.replace(/&#([0-9]+);/g, (_match, rawCode: string) => {
|
||||
const code = Number.parseInt(rawCode, 10);
|
||||
return decodeCodePoint(code);
|
||||
});
|
||||
}
|
||||
|
||||
export function normalizeSecDocumentText(raw: string) {
|
||||
return decodeHtmlEntities(
|
||||
raw
|
||||
.replace(/\r/g, '\n')
|
||||
.replace(/<script[\s\S]*?<\/script>/gi, ' ')
|
||||
.replace(/<style[\s\S]*?<\/style>/gi, ' ')
|
||||
.replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
|
||||
.replace(/<!--[\s\S]*?-->/g, ' ')
|
||||
.replace(/<\/?(p|div|section|article|li|tr|td|th|h[1-6]|br|hr)[^>]*>/gi, '\n')
|
||||
.replace(/<[^>]+>/g, ' ')
|
||||
)
|
||||
.replace(/[ \t]+\n/g, '\n')
|
||||
.replace(/\n[ \t]+/g, '\n')
|
||||
.replace(/[ \t]{2,}/g, ' ')
|
||||
.replace(/\n{3,}/g, '\n\n')
|
||||
.trim();
|
||||
}
|
||||
|
||||
export function trimSecDocumentTextForPrompt(text: string, maxChars = FILING_TEXT_MAX_CHARS) {
|
||||
const safeMax = Math.max(Math.trunc(maxChars), 1_000);
|
||||
if (text.length <= safeMax) {
|
||||
return { text, truncated: false };
|
||||
}
|
||||
|
||||
const slice = text.slice(0, safeMax);
|
||||
const newlineBoundary = slice.lastIndexOf('\n');
|
||||
const wordBoundary = slice.lastIndexOf(' ');
|
||||
const boundary = Math.max(newlineBoundary, wordBoundary);
|
||||
const clipped = (boundary > safeMax * 0.7 ? slice.slice(0, boundary) : slice).trimEnd();
|
||||
|
||||
return { text: clipped, truncated: true };
|
||||
}
|
||||
|
||||
function compactAccessionNumber(value: string) {
|
||||
return value.replace(/-/g, '');
|
||||
}
|
||||
|
||||
function normalizeCikForPath(value: string) {
|
||||
const digits = value.replace(/\D/g, '');
|
||||
if (!digits) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const numeric = Number(digits);
|
||||
if (!Number.isFinite(numeric)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return String(numeric);
|
||||
}
|
||||
|
||||
export function resolvePrimaryFilingUrl(input: FilingDocumentInput) {
|
||||
const directUrl = input.filingUrl?.trim();
|
||||
if (directUrl) {
|
||||
return directUrl;
|
||||
}
|
||||
|
||||
if (!input.primaryDocument) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const cikPath = normalizeCikForPath(input.cik);
|
||||
const accessionPath = compactAccessionNumber(input.accessionNumber);
|
||||
if (!cikPath || !accessionPath) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return `https://www.sec.gov/Archives/edgar/data/${cikPath}/${accessionPath}/${input.primaryDocument}`;
|
||||
}
|
||||
|
||||
export async function fetchPrimaryFilingText(
|
||||
input: FilingDocumentInput,
|
||||
options?: FetchPrimaryFilingTextOptions
|
||||
): Promise<FilingDocumentText | null> {
|
||||
const url = resolvePrimaryFilingUrl(input);
|
||||
if (!url) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const doFetch = options?.fetchImpl ?? fetch;
|
||||
const response = await doFetch(url, {
|
||||
headers: {
|
||||
'User-Agent': envUserAgent(),
|
||||
Accept: 'text/html, text/plain;q=0.9, */*;q=0.8'
|
||||
},
|
||||
cache: 'no-store'
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`SEC filing request failed (${response.status})`);
|
||||
}
|
||||
|
||||
const raw = await response.text();
|
||||
const normalized = normalizeSecDocumentText(raw);
|
||||
if (!normalized) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const clipped = trimSecDocumentTextForPrompt(normalized, options?.maxChars ?? FILING_TEXT_MAX_CHARS);
|
||||
if (!clipped.text) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return {
|
||||
source: 'primary_document',
|
||||
url,
|
||||
text: clipped.text,
|
||||
truncated: clipped.truncated
|
||||
};
|
||||
}
|
||||
|
||||
function pseudoMetric(seed: string, min: number, max: number) {
|
||||
let hash = 0;
|
||||
for (const char of seed) {
|
||||
|
||||
71
lib/server/task-processors.test.ts
Normal file
71
lib/server/task-processors.test.ts
Normal file
@@ -0,0 +1,71 @@
|
||||
import { describe, expect, it } from 'bun:test';
|
||||
import type { Filing } from '@/lib/types';
|
||||
import { __taskProcessorInternals } from './task-processors';
|
||||
|
||||
function sampleFiling(): Filing {
|
||||
return {
|
||||
id: 1,
|
||||
ticker: 'AAPL',
|
||||
filing_type: '10-Q',
|
||||
filing_date: '2026-01-30',
|
||||
accession_number: '0000320193-26-000001',
|
||||
cik: '0000320193',
|
||||
company_name: 'Apple Inc.',
|
||||
filing_url: 'https://www.sec.gov/Archives/edgar/data/320193/000032019326000001/a10q.htm',
|
||||
submission_url: 'https://data.sec.gov/submissions/CIK0000320193.json',
|
||||
primary_document: 'a10q.htm',
|
||||
metrics: {
|
||||
revenue: 120_000_000_000,
|
||||
netIncome: 25_000_000_000,
|
||||
totalAssets: 410_000_000_000,
|
||||
cash: 70_000_000_000,
|
||||
debt: 98_000_000_000
|
||||
},
|
||||
analysis: null,
|
||||
created_at: '2026-01-30T00:00:00.000Z',
|
||||
updated_at: '2026-01-30T00:00:00.000Z'
|
||||
};
|
||||
}
|
||||
|
||||
describe('task processor extraction helpers', () => {
|
||||
it('parses strict extraction payloads', () => {
|
||||
const raw = JSON.stringify({
|
||||
summary: 'Revenue growth remained resilient despite FX pressure.',
|
||||
keyPoints: ['Revenue up year-over-year'],
|
||||
redFlags: ['Debt service burden is rising'],
|
||||
followUpQuestions: ['Is margin guidance sustainable?'],
|
||||
portfolioSignals: ['Monitor leverage trend'],
|
||||
confidence: 0.72
|
||||
});
|
||||
|
||||
const parsed = __taskProcessorInternals.parseExtractionPayload(raw);
|
||||
|
||||
expect(parsed).not.toBeNull();
|
||||
expect(parsed?.summary).toContain('Revenue growth');
|
||||
expect(parsed?.confidence).toBe(0.72);
|
||||
});
|
||||
|
||||
it('rejects extraction payloads with extra keys', () => {
|
||||
const raw = JSON.stringify({
|
||||
summary: 'ok',
|
||||
keyPoints: [],
|
||||
redFlags: [],
|
||||
followUpQuestions: [],
|
||||
portfolioSignals: [],
|
||||
confidence: 0.2,
|
||||
extra: 'not-allowed'
|
||||
});
|
||||
|
||||
const parsed = __taskProcessorInternals.parseExtractionPayload(raw);
|
||||
expect(parsed).toBeNull();
|
||||
});
|
||||
|
||||
it('builds deterministic extraction fallback from filing metadata', () => {
|
||||
const fallback = __taskProcessorInternals.deterministicExtractionFallback(sampleFiling());
|
||||
|
||||
expect(fallback.summary).toContain('Deterministic extraction fallback');
|
||||
expect(fallback.keyPoints.length).toBeGreaterThan(0);
|
||||
expect(fallback.redFlags.length).toBeGreaterThan(0);
|
||||
expect(fallback.confidence).toBe(0.2);
|
||||
});
|
||||
});
|
||||
@@ -1,4 +1,10 @@
|
||||
import type { Filing, Holding, Task } from '@/lib/types';
|
||||
import type {
|
||||
Filing,
|
||||
FilingExtraction,
|
||||
FilingExtractionMeta,
|
||||
Holding,
|
||||
Task
|
||||
} from '@/lib/types';
|
||||
import { runAiAnalysis } from '@/lib/server/ai';
|
||||
import { buildPortfolioSummary } from '@/lib/server/portfolio';
|
||||
import { getQuote } from '@/lib/server/prices';
|
||||
@@ -13,7 +19,23 @@ import {
|
||||
listUserHoldings
|
||||
} from '@/lib/server/repos/holdings';
|
||||
import { createPortfolioInsight } from '@/lib/server/repos/insights';
|
||||
import { fetchFilingMetrics, fetchRecentFilings } from '@/lib/server/sec';
|
||||
import {
|
||||
fetchFilingMetrics,
|
||||
fetchPrimaryFilingText,
|
||||
fetchRecentFilings
|
||||
} from '@/lib/server/sec';
|
||||
|
||||
const EXTRACTION_REQUIRED_KEYS = [
|
||||
'summary',
|
||||
'keyPoints',
|
||||
'redFlags',
|
||||
'followUpQuestions',
|
||||
'portfolioSignals',
|
||||
'confidence'
|
||||
] as const;
|
||||
const EXTRACTION_MAX_ITEMS = 6;
|
||||
const EXTRACTION_ITEM_MAX_LENGTH = 280;
|
||||
const EXTRACTION_SUMMARY_MAX_LENGTH = 900;
|
||||
|
||||
function toTaskResult(value: unknown): Record<string, unknown> {
|
||||
if (!value || typeof value !== 'object' || Array.isArray(value)) {
|
||||
@@ -42,6 +64,173 @@ function parseLimit(raw: unknown, fallback: number, min: number, max: number) {
|
||||
return Math.min(Math.max(intValue, min), max);
|
||||
}
|
||||
|
||||
function sanitizeExtractionText(value: unknown, maxLength: number) {
|
||||
if (typeof value !== 'string') {
|
||||
return null;
|
||||
}
|
||||
|
||||
const collapsed = value.replace(/\s+/g, ' ').trim();
|
||||
if (!collapsed) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return collapsed.slice(0, maxLength);
|
||||
}
|
||||
|
||||
function sanitizeExtractionList(value: unknown) {
|
||||
if (!Array.isArray(value)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const cleaned: string[] = [];
|
||||
|
||||
for (const entry of value) {
|
||||
const normalized = sanitizeExtractionText(entry, EXTRACTION_ITEM_MAX_LENGTH);
|
||||
if (!normalized) {
|
||||
continue;
|
||||
}
|
||||
|
||||
cleaned.push(normalized);
|
||||
if (cleaned.length >= EXTRACTION_MAX_ITEMS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
function parseExtractionPayload(raw: string): FilingExtraction | null {
|
||||
const fencedJson = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1];
|
||||
const candidate = fencedJson ?? (() => {
|
||||
const start = raw.indexOf('{');
|
||||
const end = raw.lastIndexOf('}');
|
||||
return start >= 0 && end > start ? raw.slice(start, end + 1) : null;
|
||||
})();
|
||||
|
||||
if (!candidate) {
|
||||
return null;
|
||||
}
|
||||
|
||||
let parsed: unknown;
|
||||
try {
|
||||
parsed = JSON.parse(candidate);
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const payload = parsed as Record<string, unknown>;
|
||||
const keys = Object.keys(payload);
|
||||
if (keys.length !== EXTRACTION_REQUIRED_KEYS.length) {
|
||||
return null;
|
||||
}
|
||||
|
||||
for (const key of EXTRACTION_REQUIRED_KEYS) {
|
||||
if (!(key in payload)) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
for (const key of keys) {
|
||||
if (!EXTRACTION_REQUIRED_KEYS.includes(key as (typeof EXTRACTION_REQUIRED_KEYS)[number])) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
const summary = sanitizeExtractionText(payload.summary, EXTRACTION_SUMMARY_MAX_LENGTH);
|
||||
const keyPoints = sanitizeExtractionList(payload.keyPoints);
|
||||
const redFlags = sanitizeExtractionList(payload.redFlags);
|
||||
const followUpQuestions = sanitizeExtractionList(payload.followUpQuestions);
|
||||
const portfolioSignals = sanitizeExtractionList(payload.portfolioSignals);
|
||||
const confidenceRaw = typeof payload.confidence === 'number'
|
||||
? payload.confidence
|
||||
: Number(payload.confidence);
|
||||
|
||||
if (!summary || !keyPoints || !redFlags || !followUpQuestions || !portfolioSignals || !Number.isFinite(confidenceRaw)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return {
|
||||
summary,
|
||||
keyPoints,
|
||||
redFlags,
|
||||
followUpQuestions,
|
||||
portfolioSignals,
|
||||
confidence: Math.min(Math.max(confidenceRaw, 0), 1)
|
||||
};
|
||||
}
|
||||
|
||||
function metricSnapshotLine(label: string, value: number | null | undefined) {
|
||||
if (value === null || value === undefined || !Number.isFinite(value)) {
|
||||
return `${label}: not reported`;
|
||||
}
|
||||
|
||||
return `${label}: ${Math.round(value).toLocaleString('en-US')}`;
|
||||
}
|
||||
|
||||
function deterministicExtractionFallback(filing: Filing): FilingExtraction {
|
||||
const metrics = filing.metrics;
|
||||
|
||||
return {
|
||||
summary: `${filing.company_name} ${filing.filing_type} filed on ${filing.filing_date}. Deterministic extraction fallback used due unavailable or invalid local parsing output.`,
|
||||
keyPoints: [
|
||||
`${filing.filing_type} filing recorded for ${filing.ticker}.`,
|
||||
metricSnapshotLine('Revenue', metrics?.revenue),
|
||||
metricSnapshotLine('Net income', metrics?.netIncome),
|
||||
metricSnapshotLine('Total assets', metrics?.totalAssets)
|
||||
],
|
||||
redFlags: [
|
||||
metricSnapshotLine('Cash', metrics?.cash),
|
||||
metricSnapshotLine('Debt', metrics?.debt),
|
||||
filing.primary_document ? 'Primary document is indexed and available for review.' : 'Primary document reference is unavailable in current filing metadata.'
|
||||
],
|
||||
followUpQuestions: [
|
||||
'What changed versus the prior filing in guidance, margins, or liquidity?',
|
||||
'Are any material risks under-emphasized relative to historical filings?',
|
||||
'Should portfolio exposure be adjusted before the next reporting cycle?'
|
||||
],
|
||||
portfolioSignals: [
|
||||
'Validate trend direction using at least two prior filings.',
|
||||
'Cross-check leverage and liquidity metrics against position sizing rules.',
|
||||
'Track language shifts around guidance or demand assumptions.'
|
||||
],
|
||||
confidence: 0.2
|
||||
};
|
||||
}
|
||||
|
||||
function extractionPrompt(filing: Filing, filingText: string) {
|
||||
return [
|
||||
'Extract structured signals from the SEC filing text.',
|
||||
`Company: ${filing.company_name} (${filing.ticker})`,
|
||||
`Form: ${filing.filing_type}`,
|
||||
`Filed: ${filing.filing_date}`,
|
||||
'Return ONLY valid JSON with exactly these keys and no extra keys:',
|
||||
'{"summary":"string","keyPoints":["string"],"redFlags":["string"],"followUpQuestions":["string"],"portfolioSignals":["string"],"confidence":0}',
|
||||
`Rules: keyPoints/redFlags/followUpQuestions/portfolioSignals arrays max ${EXTRACTION_MAX_ITEMS} items; each item <= ${EXTRACTION_ITEM_MAX_LENGTH} chars; summary <= ${EXTRACTION_SUMMARY_MAX_LENGTH} chars; confidence between 0 and 1.`,
|
||||
'Filing text follows:',
|
||||
filingText
|
||||
].join('\n\n');
|
||||
}
|
||||
|
||||
function reportPrompt(
|
||||
filing: Filing,
|
||||
extraction: FilingExtraction,
|
||||
extractionMeta: FilingExtractionMeta
|
||||
) {
|
||||
return [
|
||||
'You are a fiscal research assistant focused on regulatory signals.',
|
||||
`Analyze this SEC filing from ${filing.company_name} (${filing.ticker}).`,
|
||||
`Form: ${filing.filing_type}`,
|
||||
`Filed: ${filing.filing_date}`,
|
||||
`Metrics: ${JSON.stringify(filing.metrics ?? {})}`,
|
||||
`Structured extraction context (${extractionMeta.source}): ${JSON.stringify(extraction)}`,
|
||||
'Return concise sections: Thesis, Red Flags, Follow-up Questions, Portfolio Impact.'
|
||||
].join('\n');
|
||||
}
|
||||
|
||||
function filingLinks(filing: {
|
||||
filingUrl: string | null;
|
||||
submissionUrl: string | null;
|
||||
@@ -134,27 +323,65 @@ async function processAnalyzeFiling(task: Task) {
|
||||
throw new Error(`Filing ${accessionNumber} not found`);
|
||||
}
|
||||
|
||||
const prompt = [
|
||||
'You are a fiscal research assistant focused on regulatory signals.',
|
||||
`Analyze this SEC filing from ${filing.company_name} (${filing.ticker}).`,
|
||||
`Form: ${filing.filing_type}`,
|
||||
`Filed: ${filing.filing_date}`,
|
||||
`Metrics: ${JSON.stringify(filing.metrics ?? {})}`,
|
||||
'Return concise sections: Thesis, Red Flags, Follow-up Questions, Portfolio Impact.'
|
||||
].join('\n');
|
||||
const defaultExtraction = deterministicExtractionFallback(filing);
|
||||
let extraction = defaultExtraction;
|
||||
let extractionMeta: FilingExtractionMeta = {
|
||||
provider: 'deterministic-fallback',
|
||||
model: 'metadata-fallback',
|
||||
source: 'metadata_fallback',
|
||||
generatedAt: new Date().toISOString()
|
||||
};
|
||||
|
||||
const analysis = await runAiAnalysis(prompt, 'Use concise institutional analyst language.');
|
||||
try {
|
||||
const filingDocument = await fetchPrimaryFilingText({
|
||||
filingUrl: filing.filing_url,
|
||||
cik: filing.cik,
|
||||
accessionNumber: filing.accession_number,
|
||||
primaryDocument: filing.primary_document ?? null
|
||||
});
|
||||
|
||||
if (filingDocument?.text) {
|
||||
const extractionResult = await runAiAnalysis(
|
||||
extractionPrompt(filing, filingDocument.text),
|
||||
'Return strict JSON only.',
|
||||
{ workload: 'extraction' }
|
||||
);
|
||||
|
||||
const parsed = parseExtractionPayload(extractionResult.text);
|
||||
if (parsed) {
|
||||
extraction = parsed;
|
||||
extractionMeta = {
|
||||
provider: extractionResult.provider === 'local-fallback' ? 'deterministic-fallback' : 'ollama',
|
||||
model: extractionResult.model,
|
||||
source: filingDocument.source,
|
||||
generatedAt: new Date().toISOString()
|
||||
};
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
extraction = defaultExtraction;
|
||||
}
|
||||
|
||||
const analysis = await runAiAnalysis(
|
||||
reportPrompt(filing, extraction, extractionMeta),
|
||||
'Use concise institutional analyst language.',
|
||||
{ workload: 'report' }
|
||||
);
|
||||
|
||||
await saveFilingAnalysis(accessionNumber, {
|
||||
provider: analysis.provider,
|
||||
model: analysis.model,
|
||||
text: analysis.text
|
||||
text: analysis.text,
|
||||
extraction,
|
||||
extractionMeta
|
||||
});
|
||||
|
||||
return {
|
||||
accessionNumber,
|
||||
provider: analysis.provider,
|
||||
model: analysis.model
|
||||
model: analysis.model,
|
||||
extractionProvider: extractionMeta.provider,
|
||||
extractionModel: extractionMeta.model
|
||||
};
|
||||
}
|
||||
|
||||
@@ -186,7 +413,11 @@ async function processPortfolioInsights(task: Task) {
|
||||
'Respond with: 1) health score (0-100), 2) top 3 risks, 3) top 3 opportunities, 4) next actions in 7 days.'
|
||||
].join('\n');
|
||||
|
||||
const analysis = await runAiAnalysis(prompt, 'Act as a risk-aware buy-side analyst.');
|
||||
const analysis = await runAiAnalysis(
|
||||
prompt,
|
||||
'Act as a risk-aware buy-side analyst.',
|
||||
{ workload: 'report' }
|
||||
);
|
||||
|
||||
await createPortfolioInsight({
|
||||
userId,
|
||||
@@ -202,6 +433,11 @@ async function processPortfolioInsights(task: Task) {
|
||||
};
|
||||
}
|
||||
|
||||
// Internal helpers re-exported under a dunder name — presumably for white-box
// unit tests (mirrors the test-only exports in ./ai); not part of the public
// task-processor API. TODO confirm no production caller imports this.
export const __taskProcessorInternals = {
  parseExtractionPayload,
  deterministicExtractionFallback
};
|
||||
|
||||
export async function runTaskProcessor(task: Task) {
|
||||
switch (task.task_type) {
|
||||
case 'sync_filings':
|
||||
|
||||
Reference in New Issue
Block a user