Implement dual-model filing pipeline with Ollama extraction
This commit is contained in:
@@ -39,8 +39,28 @@ type SecFiling = {
|
||||
primaryDocument: string | null;
|
||||
};
|
||||
|
||||
/**
 * Identifies a filing's primary document, either through a ready-made URL or
 * through the pieces needed to build an EDGAR archive URL.
 */
type FilingDocumentInput = {
  /** Direct document URL; used as-is when it is non-blank. */
  filingUrl: string | null;

  /** Filer CIK; only its digits are used when building the archive path. */
  cik: string;

  /** Filing accession number, used as a path segment of the archive URL. */
  accessionNumber: string;

  /** File name of the primary document inside the filing folder, if known. */
  primaryDocument: string | null;
};
|
||||
|
||||
/** Optional knobs for downloading and trimming a filing's primary document. */
type FetchPrimaryFilingTextOptions = {
  /** Replacement for the global `fetch` (for example a stub in tests). */
  fetchImpl?: typeof fetch;

  /** Character budget for the returned text; the module default applies when omitted. */
  maxChars?: number;
};
|
||||
|
||||
/** Plain-text rendering of a filing's primary document, sized for a prompt. */
export type FilingDocumentText = {
  /** Origin of the text; currently always the filing's primary document. */
  source: 'primary_document';

  /** URL the document was downloaded from. */
  url: string;

  /** Normalized (markup-free) document text, possibly clipped. */
  text: string;

  /** True when `text` was shortened to fit the character budget. */
  truncated: boolean;
};
|
||||
|
||||
/** Form types listed as supported. */
const SUPPORTED_FORMS: FilingType[] = ['10-K', '10-Q', '8-K'];

/** Ticker cache time-to-live: 12 hours, expressed in milliseconds. */
const TICKER_CACHE_TTL_MS = 12 * 60 * 60 * 1000;

/** Default character budget applied to filing text. */
const FILING_TEXT_MAX_CHARS = 24_000;

// Ticker directory cache state.
// NOTE(review): presumably keyed by ticker symbol, with the timestamp in epoch
// milliseconds (0 = never loaded) — confirm against the loader.
let tickerCache = new Map<string, TickerDirectoryRecord>();
let tickerCacheLoadedAt = 0;
|
||||
@@ -53,6 +73,147 @@ function todayIso() {
|
||||
return new Date().toISOString().slice(0, 10);
|
||||
}
|
||||
|
||||
/** Named entities understood by `decodeHtmlEntities`, keyed by lower-case name. */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"]
]);

/**
 * Decodes the HTML character references commonly found in filing documents.
 *
 * Handles the named entities `&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;` and
 * `&apos;` (case-insensitively) plus decimal (`&#65;`) and hexadecimal
 * (`&#x41;`) numeric references. Unknown named entities are left untouched.
 *
 * Decoding happens in a single pass, so already-escaped text is unescaped
 * exactly once (`&amp;lt;` becomes `&lt;`, not `<`). A chain of sequential
 * replacements would decode such input twice.
 *
 * @param value Text that may contain HTML character references.
 * @returns The text with recognised references replaced by their characters.
 */
function decodeHtmlEntities(value: string) {
  // Maps a numeric reference to text. Values that are not Unicode scalar
  // values (out of range, or lone surrogates that would yield ill-formed
  // strings) become a plain space; so does the non-breaking space U+00A0,
  // matching the treatment of `&nbsp;`.
  const decodeCodePoint = (code: number) => {
    const isSurrogate = code >= 0xd800 && code <= 0xdfff;
    if (!Number.isInteger(code) || code < 0 || code > 0x10ffff || isSurrogate) {
      return ' ';
    }

    return code === 0xa0 ? ' ' : String.fromCodePoint(code);
  };

  return value.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (match, body: string) => {
    if (!body.startsWith('#')) {
      // A Map lookup (rather than a plain object) keeps names such as
      // "constructor" from resolving to inherited properties.
      return NAMED_HTML_ENTITIES.get(body.toLowerCase()) ?? match;
    }

    const isHex = body.charAt(1).toLowerCase() === 'x';
    const code = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    return decodeCodePoint(code);
  });
}
|
||||
|
||||
/**
 * Converts a raw SEC filing document (HTML or plain text) into compact plain
 * text.
 *
 * Steps, in order:
 * 1. Normalize line endings. CRLF and lone CR each become a single `\n`, so
 *    Windows-style documents do not gain a blank line after every line.
 * 2. Drop `<script>`, `<style>`, `<noscript>` elements and HTML comments.
 * 3. Turn block-level tags into line breaks and remove every remaining tag.
 * 4. Decode HTML entities (after tag removal, so escaped markup such as
 *    `&lt;b&gt;` survives as literal text).
 * 5. Collapse horizontal whitespace and limit consecutive blank lines to one.
 *
 * @param raw Document body as downloaded.
 * @returns Trimmed plain text; an empty string when nothing readable remains.
 */
export function normalizeSecDocumentText(raw: string) {
  const withoutMarkup = raw
    .replace(/\r\n?/g, '\n')
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
    .replace(/<!--[\s\S]*?-->/g, ' ')
    .replace(/<\/?(p|div|section|article|li|tr|td|th|h[1-6]|br|hr)[^>]*>/gi, '\n')
    .replace(/<[^>]+>/g, ' ');

  return decodeHtmlEntities(withoutMarkup)
    .replace(/[ \t]+\n/g, '\n')
    .replace(/\n[ \t]+/g, '\n')
    .replace(/[ \t]{2,}/g, ' ')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
||||
|
||||
/**
 * Clips document text to a character budget, preferring to cut at a line or
 * word boundary.
 *
 * The budget is truncated to an integer and never drops below 1,000
 * characters. A `NaN` budget falls back to the module default instead of
 * producing an empty result. When the last newline or space inside the budget
 * lies beyond 70% of it, the text is cut there; otherwise it is cut at the
 * budget itself. A cut never leaves half of a surrogate pair at the end.
 *
 * @param text Normalized document text.
 * @param maxChars Maximum number of UTF-16 code units to keep.
 * @returns The (possibly shortened) text and whether it was shortened.
 */
export function trimSecDocumentTextForPrompt(text: string, maxChars = FILING_TEXT_MAX_CHARS) {
  // Math.max(NaN, 1_000) is NaN, which would make every comparison below fail
  // and yield an empty slice, so substitute the default first.
  const requested = Number.isNaN(maxChars) ? FILING_TEXT_MAX_CHARS : maxChars;
  const safeMax = Math.max(Math.trunc(requested), 1_000);
  if (text.length <= safeMax) {
    return { text, truncated: false };
  }

  let head = text.slice(0, safeMax);

  // A cut that lands between the two halves of an astral character would
  // leave a dangling high surrogate; drop it so the result stays well-formed.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }

  const boundary = Math.max(head.lastIndexOf('\n'), head.lastIndexOf(' '));

  // Only honour the boundary when it keeps most of the budget; a very early
  // boundary would throw away too much text.
  const clipped = (boundary > safeMax * 0.7 ? head.slice(0, boundary) : head).trimEnd();

  return { text: clipped, truncated: true };
}
|
||||
|
||||
/**
 * Reduces an accession number to its digits for use as an archive path
 * segment (for example `0000320193-24-000123` becomes `000032019324000123`).
 *
 * Every non-digit is removed, not just hyphens, so stray whitespace or other
 * characters cannot leak into the URL path. This mirrors how the CIK is
 * sanitized.
 *
 * @param value Accession number, with or without separators.
 * @returns The digits only; an empty string when the input has none.
 */
function compactAccessionNumber(value: string) {
  return value.replace(/\D/g, '');
}
|
||||
|
||||
/**
 * Produces the CIK form used in archive paths: digits only, without leading
 * zeros (`0000320193` becomes `320193`).
 *
 * Leading zeros are stripped on the string itself. Converting through
 * `Number` would round digit strings beyond 2^53 and render very large values
 * in exponent notation, which can never be a valid path segment.
 *
 * @param value CIK in any common spelling (zero-padded, prefixed, ...).
 * @returns The unpadded digits, `'0'` for an all-zero input, or `null` when
 *   the input contains no digits at all.
 */
function normalizeCikForPath(value: string): string | null {
  const digits = value.replace(/\D/g, '');
  if (!digits) {
    return null;
  }

  // The lookahead keeps the final digit, so "0000" normalizes to "0".
  return digits.replace(/^0+(?=\d)/, '');
}
|
||||
|
||||
/**
 * Determines the URL of a filing's primary document.
 *
 * A non-blank `filingUrl` wins and is returned trimmed. Otherwise the URL is
 * assembled from the CIK, the accession number and the primary document name
 * under `https://www.sec.gov/Archives/edgar/data/`.
 *
 * @param input Filing identifiers.
 * @returns The document URL, or `null` when there is neither a direct URL nor
 *   enough information (document name, CIK digits, accession number) to build
 *   one.
 */
export function resolvePrimaryFilingUrl(input: FilingDocumentInput) {
  const directUrl = input.filingUrl?.trim();
  if (directUrl) {
    return directUrl;
  }

  // Trim the document name as well: a blank or padded name would otherwise
  // yield a URL with embedded whitespace or a dangling trailing slash.
  const documentName = input.primaryDocument?.trim();
  if (!documentName) {
    return null;
  }

  const cikPath = normalizeCikForPath(input.cik);
  const accessionPath = compactAccessionNumber(input.accessionNumber);
  if (!cikPath || !accessionPath) {
    return null;
  }

  return `https://www.sec.gov/Archives/edgar/data/${cikPath}/${accessionPath}/${documentName}`;
}
|
||||
|
||||
/**
 * Downloads a filing's primary document and returns it as prompt-sized plain
 * text.
 *
 * @param input Filing identifiers used to resolve the document URL.
 * @param options Optional `fetch` replacement and character budget.
 * @returns The normalized, clipped text with its source URL, or `null` when
 *   no URL can be resolved or the document contains no readable text.
 * @throws Error when the HTTP response status is not successful.
 */
export async function fetchPrimaryFilingText(
  input: FilingDocumentInput,
  options?: FetchPrimaryFilingTextOptions
): Promise<FilingDocumentText | null> {
  const documentUrl = resolvePrimaryFilingUrl(input);
  if (!documentUrl) {
    return null;
  }

  const fetcher = options?.fetchImpl ?? fetch;
  const requestHeaders = {
    'User-Agent': envUserAgent(),
    Accept: 'text/html, text/plain;q=0.9, */*;q=0.8'
  };
  const response = await fetcher(documentUrl, { headers: requestHeaders, cache: 'no-store' });

  if (!response.ok) {
    throw new Error(`SEC filing request failed (${response.status})`);
  }

  const normalized = normalizeSecDocumentText(await response.text());
  if (!normalized) {
    return null;
  }

  const budget = options?.maxChars ?? FILING_TEXT_MAX_CHARS;
  const { text, truncated } = trimSecDocumentTextForPrompt(normalized, budget);
  if (!text) {
    return null;
  }

  return {
    source: 'primary_document',
    url: documentUrl,
    text,
    truncated
  };
}
|
||||
|
||||
function pseudoMetric(seed: string, min: number, max: number) {
|
||||
let hash = 0;
|
||||
for (const char of seed) {
|
||||
|
||||
Reference in New Issue
Block a user