Implement dual-model filing pipeline with Ollama extraction
This commit is contained in:
84
lib/server/sec.test.ts
Normal file
84
lib/server/sec.test.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
import { describe, expect, it, mock } from 'bun:test';
|
||||
import {
|
||||
fetchPrimaryFilingText,
|
||||
normalizeSecDocumentText,
|
||||
resolvePrimaryFilingUrl,
|
||||
trimSecDocumentTextForPrompt
|
||||
} from './sec';
|
||||
|
||||
describe('sec filing text helpers', () => {
|
||||
it('normalizes html filing content into plain text', () => {
  // Fixture pairs non-content elements (<style>, <script>) in the head with
  // visible text in the body.
  // NOTE(review): nothing below asserts on the <style> contents — confirm
  // whether stripping CSS is part of the contract and add a check if so.
  const markup = `
    <html>
      <head>
        <style>.x { color: red; }</style>
        <script>console.log("ignore")</script>
      </head>
      <body>
        <h1>Quarterly Report</h1>
        <p>Revenue & margin improved.</p>
        <div>See 'Risk Factors' section.</div>
      </body>
    </html>
  `;

  const plainText = normalizeSecDocumentText(markup);

  // Every piece of visible body text must survive normalization.
  const visibleFragments = [
    'Quarterly Report',
    'Revenue & margin improved.',
    'See \'Risk Factors\' section.'
  ];
  for (const fragment of visibleFragments) {
    expect(plainText).toContain(fragment);
  }

  // Neither the script tag nor its source may leak into the output.
  const scriptResidue = ['<script>', 'console.log'];
  for (const residue of scriptResidue) {
    expect(plainText).not.toContain(residue);
  }
});
|
||||
|
||||
it('trims filing text to prompt budget boundaries', () => {
  // Input is 4,500 characters against a 2,000-character budget, so trimming
  // is required.
  const budget = 2_000;
  const oversized = 'A'.repeat(4_500);

  const { truncated, text } = trimSecDocumentTextForPrompt(oversized, budget);

  expect(truncated).toBe(true);
  expect(text.length).toBeLessThanOrEqual(budget);
});
|
||||
|
||||
it('prefers explicit filing url when available', () => {
  // The same URL is supplied as input and expected back unchanged, even
  // though cik/accessionNumber/primaryDocument are also provided.
  const explicitUrl = 'https://www.sec.gov/Archives/edgar/data/123/x.htm';

  const resolved = resolvePrimaryFilingUrl({
    filingUrl: explicitUrl,
    cik: '123',
    accessionNumber: '0000-00-00',
    primaryDocument: 'x.htm'
  });

  expect(resolved).toBe(explicitUrl);
});
|
||||
|
||||
it('reconstructs primary filing url when filing url is absent', () => {
  // In the expected URL the CIK appears without its leading zeros and the
  // accession number appears without dashes, followed by the document name.
  const expectedUrl =
    'https://www.sec.gov/Archives/edgar/data/320193/000032019324000001/a10q.htm';

  const resolved = resolvePrimaryFilingUrl({
    filingUrl: null,
    cik: '0000320193',
    accessionNumber: '0000320193-24-000001',
    primaryDocument: 'a10q.htm'
  });

  expect(resolved).toBe(expectedUrl);
});
|
||||
|
||||
it('fetches, normalizes, and clips primary filing text', async () => {
  const charBudget = 1_000;

  // 600 repetitions of "Alpha " give 3,600 characters of paragraph text,
  // comfortably above the budget.
  const paragraph = 'Alpha '.repeat(600);
  const page = `<html><body><p>${paragraph}</p></body></html>`;

  // Stub fetch that answers every request with the oversized page.
  const stubbedFetch = mock(async () => new Response(page, { status: 200 })) as unknown as typeof fetch;

  const outcome = await fetchPrimaryFilingText(
    {
      filingUrl: null,
      cik: '0000320193',
      accessionNumber: '0000320193-24-000001',
      primaryDocument: 'a10q.htm'
    },
    {
      fetchImpl: stubbedFetch,
      maxChars: charBudget
    }
  );

  // Exactly one request should have gone through the injected fetch.
  expect(stubbedFetch).toHaveBeenCalledTimes(1);

  expect(outcome).not.toBeNull();
  expect(outcome?.source).toBe('primary_document');
  expect(outcome?.truncated).toBe(true);
  expect(outcome?.text.length).toBeLessThanOrEqual(charBudget);
});
|
||||
});
|
||||
Reference in New Issue
Block a user