Files
Neon-Desk/lib/server/sec-description.ts

135 lines
3.5 KiB
TypeScript

import type { Filing } from '@/lib/types';
import { fetchPrimaryFilingText } from '@/lib/server/sec';
/** A cached value together with the `Date.now()`-style timestamp after which it is stale. */
interface CacheEntry<T> {
  value: T;
  expiresAt: number;
}

/** Lifetime of a cache entry in milliseconds (six hours). */
const DESCRIPTION_CACHE_TTL_MS = 6 * 60 * 60 * 1000;

/** Default upper bound, in characters, applied when clipping a description. */
const DESCRIPTION_MAX_CHARS = 1600;

/**
 * Module-level, in-memory cache of extracted descriptions.
 * A stored `null` means "looked it up and found nothing usable".
 */
const descriptionCache: Map<string, CacheEntry<string | null>> = new Map();
/**
 * Tidies whitespace in a block of text.
 *
 * Runs of spaces and tabs become a single space, three or more consecutive
 * newlines are reduced to two, and leading/trailing whitespace is removed.
 * Single and double newlines are left in place.
 */
function normalizeWhitespace(value: string) {
  const singleSpaced = value.replace(/[ \t]+/g, ' ');
  const blankLinesCapped = singleSpaced.replace(/\n{3,}/g, '\n\n');
  return blankLinesCapped.trim();
}
/**
 * Shortens `value` to at most `maxChars` characters, preferring a clean cut.
 *
 * Text that already fits is returned untouched. Otherwise the first `maxChars`
 * characters are examined and the cut point is chosen in this order:
 *  1. just after the last ". ", "! " or "? " terminator, if that terminator
 *     sits beyond 60% of `maxChars`;
 *  2. at the last space, if it sits beyond 70% of `maxChars`;
 *  3. the hard character limit.
 * The clipped text is trimmed before being returned.
 *
 * @param value Text to clip.
 * @param maxChars Maximum length of the result; defaults to `DESCRIPTION_MAX_CHARS`.
 * @returns The original text or a trimmed prefix of it.
 */
function clipAtSentenceBoundary(value: string, maxChars = DESCRIPTION_MAX_CHARS) {
  if (value.length <= maxChars) {
    return value;
  }

  const head = value.slice(0, maxChars);

  // Position of the right-most sentence terminator that is followed by a space.
  let lastSentenceEnd = -1;
  for (const terminator of ['. ', '! ', '? ']) {
    lastSentenceEnd = Math.max(lastSentenceEnd, head.lastIndexOf(terminator));
  }

  if (lastSentenceEnd > maxChars * 0.6) {
    // +1 keeps the punctuation mark and drops the space after it.
    return head.slice(0, lastSentenceEnd + 1).trim();
  }

  const lastSpace = head.lastIndexOf(' ');
  const clipped = lastSpace > maxChars * 0.7 ? head.slice(0, lastSpace) : head;
  return clipped.trim();
}
/**
 * Prepares a raw extracted section for display.
 *
 * Replaces every "table of contents", "Item 1. Business" and "Part I" phrase
 * (case-insensitive, whole words) with a space, then normalizes whitespace and
 * clips the result to the default description length.
 */
function cleanupExtractedSection(value: string) {
  const boilerplatePatterns = [
    /\btable of contents\b/gi,
    /\bitem\s+1\.?\s+business\b/gi,
    /\bpart\s+i\b/gi
  ];

  let stripped = value;
  for (const pattern of boilerplatePatterns) {
    stripped = stripped.replace(pattern, ' ');
  }

  const tidy = normalizeWhitespace(stripped);
  return clipAtSentenceBoundary(tidy);
}
/**
 * Builds a description from the opening paragraphs of a document.
 *
 * The text is split on blank lines; each paragraph is whitespace-normalized and
 * kept only if it is at least 80 characters long and does not start with an
 * "Item <number>" heading. The first three kept paragraphs are joined with a
 * space and clipped to the default description length.
 *
 * @returns The clipped text, or `null` when no paragraph qualifies.
 */
function fallbackDescription(text: string) {
  const kept: string[] = [];

  for (const rawParagraph of text.split(/\n{2,}/)) {
    const paragraph = normalizeWhitespace(rawParagraph);
    const isTooShort = paragraph.length < 80;
    if (isTooShort || /^item\s+\d+[a-z]?\.?/i.test(paragraph)) {
      continue;
    }

    kept.push(paragraph);
    if (kept.length === 3) {
      break;
    }
  }

  return kept.length > 0 ? clipAtSentenceBoundary(kept.join(' ')) : null;
}
/**
 * Extracts the "Item 1. Business" narrative from the plain text of a filing.
 *
 * Every occurrence of the heading is tried in document order. For each one, the
 * text up to the next "Item 1A. Risk Factors" / "Item 2" heading (or the end of
 * the text) is cleaned up, and the first candidate that is at least 120
 * characters long is returned. Checking only the first occurrence is not
 * enough: filings typically repeat the heading in a table of contents ahead of
 * the real section, and that early hit yields only a few characters, which
 * would force the leading-paragraph fallback even though the real section is
 * present further down.
 *
 * @param text Plain text of the filing document.
 * @returns The cleaned and clipped section; otherwise the result of the
 *   leading-paragraph fallback; `null` when the input is blank or the fallback
 *   finds no usable paragraph.
 */
export function extractBusinessDescription(text: string): string | null {
  const normalized = normalizeWhitespace(text);
  if (!normalized) {
    return null;
  }

  // Created per call on purpose: a global regex carries `lastIndex` state.
  const headingPattern = /\bitem\s+1\.?\s+business\b/gi;
  const nextItemPattern =
    /\bitem\s+1a\.?\s+risk factors\b|\bitem\s+2\.?\s+properties\b|\bitem\s+2\.?\b/i;

  for (
    let heading = headingPattern.exec(normalized);
    heading !== null;
    heading = headingPattern.exec(normalized)
  ) {
    const afterHeading = normalized.slice(heading.index + heading[0].length);
    const nextItem = nextItemPattern.exec(afterHeading);
    const section = nextItem ? afterHeading.slice(0, nextItem.index) : afterHeading;

    const extracted = cleanupExtractedSection(section);
    if (extracted.length >= 120) {
      return extracted;
    }
    // Too short to be the real section (e.g. a contents entry); try the next heading.
  }

  return fallbackDescription(normalized);
}
/**
 * Returns a business description for a filing, using the in-memory cache.
 *
 * A cache entry keyed by the filing's accession number is served while it has
 * not expired. Otherwise the primary filing text is fetched (limited to 40,000
 * characters), the description is extracted from it, and the outcome is cached
 * for `DESCRIPTION_CACHE_TTL_MS`.
 *
 * Any error thrown while fetching or extracting is swallowed and treated as
 * "no description".
 * NOTE(review): that `null` is cached for the full TTL as well, so a transient
 * fetch failure hides the description until the entry expires — confirm this
 * is intended.
 *
 * @param filing Identifying fields of the filing, or `null`.
 * @returns The description, or `null` when `filing` is missing, no document is
 *   returned, nothing could be extracted, or an error occurred.
 */
export async function getCompanyDescription(
  filing: Pick<Filing, 'accession_number' | 'cik' | 'filing_url' | 'primary_document'> | null
) {
  if (!filing) {
    return null;
  }

  const hit = descriptionCache.get(filing.accession_number);
  if (hit && hit.expiresAt > Date.now()) {
    return hit.value;
  }

  let description: string | null;
  try {
    const document = await fetchPrimaryFilingText(
      {
        filingUrl: filing.filing_url,
        cik: filing.cik,
        accessionNumber: filing.accession_number,
        primaryDocument: filing.primary_document ?? null
      },
      { maxChars: 40_000 }
    );
    description = document ? extractBusinessDescription(document.text) : null;
  } catch {
    description = null;
  }

  // Success and failure share one cache write; both get the same TTL.
  descriptionCache.set(filing.accession_number, {
    value: description,
    expiresAt: Date.now() + DESCRIPTION_CACHE_TTL_MS
  });
  return description;
}
/**
 * Exposes this module's otherwise-private helpers under a single export so
 * they can be reached from outside the module — presumably by unit tests
 * (TODO confirm usage). Not part of the module's regular API surface.
 */
export const __secDescriptionInternals = {
cleanupExtractedSection,
clipAtSentenceBoundary,
fallbackDescription
};