135 lines
3.5 KiB
TypeScript
135 lines
3.5 KiB
TypeScript
import type { Filing } from '@/lib/types';
|
|
import { fetchPrimaryFilingText } from '@/lib/server/sec';
|
|
|
|
/**
 * A cached value paired with the moment it stops being fresh.
 */
type CacheEntry<T> = {
  /** The payload stored in the cache. */
  value: T;
  /** Timestamp (as returned by `Date.now()`) after which the entry is considered stale. */
  expiresAt: number;
};
|
|
|
|
/** Lifetime of a cached description: six hours, expressed in milliseconds. */
const DESCRIPTION_CACHE_TTL_MS = 6 * 60 * 60 * 1000;

/** Default upper bound, in characters, used when clipping a description. */
const DESCRIPTION_MAX_CHARS = 1600;
|
|
|
|
/**
 * Module-level, in-memory cache of description lookups keyed by string.
 * A `null` value records that a lookup produced no description.
 */
const descriptionCache: Map<string, CacheEntry<string | null>> = new Map();
|
|
|
|
/**
 * Collapses whitespace in extracted text while keeping paragraph breaks intact.
 *
 * - `\r\n` and bare `\r` line endings become `\n`.
 * - Runs of spaces, tabs and non-breaking spaces become a single space.
 * - Spaces adjacent to a line break are removed, so whitespace-only lines count as blank.
 * - Three or more consecutive newlines collapse to one blank line (`\n\n`).
 *
 * @param value Raw text to normalize.
 * @returns The normalized text with leading and trailing whitespace removed.
 */
function normalizeWhitespace(value: string): string {
  return value
    // Unify line endings first; otherwise "\r\n\r\n" never looks like a paragraph break.
    .replace(/\r\n?/g, '\n')
    .replace(/[ \t\u00a0]+/g, ' ')
    // After the collapse above at most one space can hug a newline; drop it so that
    // "\n \n" is seen as two consecutive newlines by the next step and by paragraph splitting.
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
|
|
|
/**
 * Shortens text to at most `maxChars` characters, preferring to stop at the end of a
 * sentence and otherwise at a word break.
 *
 * Text that already fits is returned unchanged. Otherwise:
 * 1. If the last sentence terminator (`.`, `!` or `?` followed by any whitespace) lies
 *    beyond 60% of the limit, the text is cut right after that terminator.
 * 2. Else, if the last whitespace character lies beyond 70% of the limit, the text is
 *    cut at that whitespace.
 * 3. Else the text is hard-cut at the limit.
 *
 * @param value Text to shorten.
 * @param maxChars Maximum length of the result; defaults to `DESCRIPTION_MAX_CHARS`.
 *   A non-positive limit yields an empty string for text that does not fit.
 * @returns The clipped text with surrounding whitespace trimmed (the input itself when it fits).
 */
function clipAtSentenceBoundary(value: string, maxChars = DESCRIPTION_MAX_CHARS) {
  if (value.length <= maxChars) {
    return value;
  }

  // Negative limits would make `slice` count from the end of the string; NaN also lands here.
  if (!(maxChars > 0)) {
    return '';
  }

  const slice = value.slice(0, maxChars);
  // One extra character of context so that a sentence or word ending exactly at the
  // limit is recognised (its trailing whitespace sits just past the slice).
  const lookahead = value.slice(0, maxChars + 1);

  // Index of the last terminator that is followed by whitespace (space, newline, ...).
  let sentenceBoundary = -1;
  for (const match of lookahead.matchAll(/[.!?](?=\s)/g)) {
    sentenceBoundary = match.index ?? sentenceBoundary;
  }

  if (sentenceBoundary > maxChars * 0.6) {
    return slice.slice(0, sentenceBoundary + 1).trim();
  }

  // Index of the last whitespace character at or before the limit.
  let wordBoundary = -1;
  for (let index = lookahead.length - 1; index >= 0; index -= 1) {
    if (/\s/.test(lookahead.charAt(index))) {
      wordBoundary = index;
      break;
    }
  }

  return (wordBoundary > maxChars * 0.7 ? slice.slice(0, wordBoundary) : slice).trim();
}
|
|
|
|
/**
 * Tidies a raw section of filing text for display.
 *
 * Every case-insensitive occurrence of the phrases "table of contents",
 * "item 1. business" (period optional) and "part i" is replaced by a space; the result is
 * then whitespace-normalized and clipped to the default maximum length.
 *
 * @param value Raw section text.
 * @returns The cleaned and clipped section text.
 */
function cleanupExtractedSection(value: string) {
  const headingNoise = [
    /\btable of contents\b/gi,
    /\bitem\s+1\.?\s+business\b/gi,
    /\bpart\s+i\b/gi
  ];

  let stripped = value;
  for (const pattern of headingNoise) {
    stripped = stripped.replace(pattern, ' ');
  }

  const tidy = normalizeWhitespace(stripped);
  return clipAtSentenceBoundary(tidy);
}
|
|
|
|
/**
 * Builds a description from the opening paragraphs of the text.
 *
 * The text is split on blank lines; each paragraph is whitespace-normalized, and the first
 * three paragraphs that are at least 80 characters long and do not start with an
 * "Item <number>" heading are joined with single spaces and clipped to the default
 * maximum length.
 *
 * @param text Text to draw paragraphs from.
 * @returns The clipped description, or `null` when no paragraph qualifies.
 */
function fallbackDescription(text: string) {
  const itemHeading = /^item\s+\d+[a-z]?\.?/i;
  const kept: string[] = [];

  for (const rawParagraph of text.split(/\n{2,}/)) {
    const paragraph = normalizeWhitespace(rawParagraph);
    if (paragraph.length < 80 || itemHeading.test(paragraph)) {
      continue;
    }

    kept.push(paragraph);
    if (kept.length === 3) {
      break;
    }
  }

  return kept.length > 0 ? clipAtSentenceBoundary(kept.join(' ')) : null;
}
|
|
|
|
/**
 * Extracts a business description from filing text.
 *
 * Looks for an "Item 1. Business" heading and takes the text up to the next
 * "Item 1A. Risk Factors" / "Item 2" heading (or the end of the text). The heading can
 * occur more than once (for instance in a table of contents ahead of the real section),
 * so every occurrence is tried in order and the first one whose cleaned-up section is at
 * least 120 characters long wins. When no occurrence qualifies, the description is built
 * from the leading paragraphs of the text instead.
 *
 * @param text Plain text of the filing.
 * @returns The extracted description, or `null` when the text is empty or nothing usable is found.
 */
export function extractBusinessDescription(text: string) {
  const normalized = normalizeWhitespace(text);
  if (!normalized) {
    return null;
  }

  const startPattern = /\bitem\s+1\.?\s+business\b/gi;
  const endPattern = /\bitem\s+1a\.?\s+risk factors\b|\bitem\s+2\.?\s+properties\b|\bitem\s+2\.?\b/i;

  for (const startMatch of normalized.matchAll(startPattern)) {
    const afterStart = normalized.slice((startMatch.index ?? 0) + startMatch[0].length);
    const endMatch = endPattern.exec(afterStart);
    const section = endMatch ? afterStart.slice(0, endMatch.index) : afterStart;
    const extracted = cleanupExtractedSection(section);

    // Too-short candidates are skipped so that a later occurrence of the heading
    // still gets a chance before falling back.
    if (extracted.length >= 120) {
      return extracted;
    }
  }

  return fallbackDescription(normalized);
}
|
|
|
|
/**
 * Returns the business description for a filing, using the in-memory cache when possible.
 *
 * On a cache miss the primary filing document is fetched (limited to 40,000 characters)
 * and the description is extracted from its text. Successful lookups, including those
 * that yield no description, are cached for `DESCRIPTION_CACHE_TTL_MS`. Lookups that
 * throw are cached as `null` for at most five minutes, so a transient failure does not
 * hide the description for the full TTL while repeated calls still avoid refetching
 * immediately.
 *
 * @param filing Identifying fields of the filing, or `null`.
 * @returns The description, or `null` when there is no filing, no description could be
 *   extracted, or the lookup failed. This function does not reject on lookup errors.
 */
export async function getCompanyDescription(
  filing: Pick<Filing, 'accession_number' | 'cik' | 'filing_url' | 'primary_document'> | null
): Promise<string | null> {
  if (!filing) {
    return null;
  }

  // Failures are remembered only briefly (never longer than the regular TTL).
  const failureTtlMs = Math.min(DESCRIPTION_CACHE_TTL_MS, 1000 * 60 * 5);
  const cacheKey = filing.accession_number;

  const cached = descriptionCache.get(cacheKey);
  if (cached) {
    if (cached.expiresAt > Date.now()) {
      return cached.value;
    }
    // Drop the stale entry so it does not linger if the refresh below never completes.
    descriptionCache.delete(cacheKey);
  }

  try {
    const document = await fetchPrimaryFilingText(
      {
        filingUrl: filing.filing_url,
        cik: filing.cik,
        accessionNumber: filing.accession_number,
        primaryDocument: filing.primary_document ?? null
      },
      {
        maxChars: 40_000
      }
    );
    const description = document ? extractBusinessDescription(document.text) : null;

    descriptionCache.set(cacheKey, {
      value: description,
      expiresAt: Date.now() + DESCRIPTION_CACHE_TTL_MS
    });

    return description;
  } catch {
    // Best effort: a missing description must not break the caller.
    descriptionCache.set(cacheKey, {
      value: null,
      expiresAt: Date.now() + failureTtlMs
    });
    return null;
  }
}
|
|
|
|
/**
 * Module-private helpers re-exported under a single name
 * (presumably so tests can reach them directly — confirm against callers).
 */
export const __secDescriptionInternals = {
  cleanupExtractedSection: cleanupExtractedSection,
  clipAtSentenceBoundary: clipAtSentenceBoundary,
  fallbackDescription: fallbackDescription
};
|