Prioritize SEC financials for 10-K/10-Q and keep other filings qualitative

This commit is contained in:
2026-03-01 00:37:47 -05:00
parent 953d7c0099
commit 2a5b548d89
11 changed files with 773 additions and 149 deletions

View File

@@ -20,7 +20,7 @@ import {
} from '@/lib/server/repos/holdings';
import { createPortfolioInsight } from '@/lib/server/repos/insights';
import {
fetchFilingMetrics,
fetchFilingMetricsForFilings,
fetchPrimaryFilingText,
fetchRecentFilings
} from '@/lib/server/sec';
@@ -31,11 +31,88 @@ const EXTRACTION_REQUIRED_KEYS = [
'redFlags',
'followUpQuestions',
'portfolioSignals',
'segmentSpecificData',
'geographicRevenueBreakdown',
'companySpecificData',
'secApiCrossChecks',
'confidence'
] as const;
/**
 * Cap on the number of entries kept in an extraction list. uniqueExtractionList
 * stops once it has this many items, collectTextSignals uses twice this value
 * as its scan budget, and the extraction prompt quotes it as the per-array limit.
 */
const EXTRACTION_MAX_ITEMS = 6;
/**
 * Per-item character limit. Passed to sanitizeExtractionText by
 * uniqueExtractionList and quoted in the extraction prompt rules.
 */
const EXTRACTION_ITEM_MAX_LENGTH = 280;
/** Character limit for the summary field, as quoted in the extraction prompt rules. */
const EXTRACTION_SUMMARY_MAX_LENGTH = 900;
/**
 * Case-insensitive, word-bounded phrases used to spot segment-level
 * disclosures. buildRuleBasedExtraction passes this list to collectTextSignals
 * to populate segmentSpecificData.
 */
const SEGMENT_PATTERNS = [
/\boperating segment\b/i,
/\bsegment revenue\b/i,
/\bsegment margin\b/i,
/\bsegment profit\b/i,
/\bbusiness segment\b/i,
/\breportable segment\b/i
];
/**
 * Case-insensitive, word-bounded region keywords used to spot geographic
 * disclosures. buildRuleBasedExtraction passes this list to collectTextSignals
 * to populate geographicRevenueBreakdown.
 *
 * NOTE(review): each entry is a bare keyword (for example "international",
 * "china"), so any sufficiently long line containing the word matches,
 * whether or not the line is about revenue.
 */
const GEOGRAPHIC_PATTERNS = [
/\bgeographic\b/i,
/\bamericas\b/i,
/\bemea\b/i,
/\bapac\b/i,
/\basia pacific\b/i,
/\bnorth america\b/i,
/\beurope\b/i,
/\bchina\b/i,
/\binternational\b/i
];
/**
 * Case-insensitive, word-bounded operating-KPI keywords (same-store sales,
 * bookings, ARPU, churn, ...). buildRuleBasedExtraction passes this list to
 * collectTextSignals to populate companySpecificData.
 *
 * The `[- ]` class accepts either a hyphen or a single space, so both
 * "same-store" and "same store" match.
 */
const COMPANY_SPECIFIC_PATTERNS = [
/\bsame[- ]store\b/i,
/\bcomparable[- ]store\b/i,
/\bcomp sales\b/i,
/\borganic sales\b/i,
/\bbookings\b/i,
/\bbacklog\b/i,
/\barpu\b/i,
/\bmau\b/i,
/\bdau\b/i,
/\bsubscriber\b/i,
/\boccupancy\b/i,
/\brevpar\b/i,
/\bretention\b/i,
/\bchurn\b/i
];
/** Property names available on a filing's (non-null) metrics object. */
type FilingMetricKey = keyof NonNullable<Filing['metrics']>;
/**
 * Descriptors driving buildSecApiCrossChecks. For every entry:
 * - `key` selects the value read from `filing.metrics`;
 * - `label` is the prefix of the generated check line;
 * - `patterns` are keywords tested against the lower-cased filing text; one
 *   hit is enough for the metric to be reported as referenced.
 *
 * The patterns only detect that the topic is mentioned; nothing here compares
 * the numeric value itself with the text.
 */
const METRIC_CHECK_PATTERNS: Array<{
key: FilingMetricKey;
label: string;
patterns: RegExp[];
}> = [
{
key: 'revenue',
label: 'Revenue',
patterns: [/\brevenue\b/i, /\bsales\b/i]
},
{
key: 'netIncome',
label: 'Net income',
patterns: [/\bnet income\b/i, /\bprofit\b/i]
},
{
key: 'totalAssets',
label: 'Total assets',
// NOTE(review): /\bassets\b/ already matches every string that
// /\btotal assets\b/ matches, so the narrower pattern never changes the result.
patterns: [/\btotal assets\b/i, /\bassets\b/i]
},
{
key: 'cash',
label: 'Cash',
// NOTE(review): /\bcash\b/ already matches every string that
// /\bcash equivalents\b/ matches, so the second pattern is redundant.
patterns: [/\bcash\b/i, /\bcash equivalents\b/i]
},
{
key: 'debt',
label: 'Debt',
// Matches "liability" and "liabilities" in addition to debt/borrowings.
patterns: [/\bdebt\b/i, /\bborrowings\b/i, /\bliabilit(?:y|ies)\b/i]
}
];
/**
 * Tells whether a filing form type is one for which SEC financial metrics are
 * fetched. Only the exact strings '10-K' and '10-Q' qualify.
 *
 * @param form - Form type of the filing.
 * @returns `true` for '10-K' or '10-Q', `false` for anything else.
 */
function isFinancialMetricsForm(form: Filing['filing_type']) {
  switch (form) {
    case '10-K':
    case '10-Q':
      return true;
    default:
      return false;
  }
}
function toTaskResult(value: unknown): Record<string, unknown> {
if (!value || typeof value !== 'object' || Array.isArray(value)) {
@@ -99,6 +176,55 @@ function sanitizeExtractionList(value: unknown) {
return cleaned;
}
/**
 * Builds a de-duplicated, size-capped list of extraction items.
 *
 * Each input is passed through sanitizeExtractionText with
 * EXTRACTION_ITEM_MAX_LENGTH; inputs for which that returns a falsy value are
 * dropped. Duplicates are detected case-insensitively and the first spelling
 * encountered is the one kept. Collection stops as soon as
 * EXTRACTION_MAX_ITEMS entries have been gathered.
 *
 * @param items - Candidate strings; `null` and `undefined` entries are allowed.
 * @returns The surviving items in their original relative order.
 */
function uniqueExtractionList(items: Array<string | null | undefined>) {
  const seenKeys = new Set<string>();
  const kept: string[] = [];

  for (const candidate of items) {
    const cleaned = sanitizeExtractionText(candidate, EXTRACTION_ITEM_MAX_LENGTH);
    if (cleaned) {
      const key = cleaned.toLowerCase();
      if (!seenKeys.has(key)) {
        seenKeys.add(key);
        kept.push(cleaned);

        // Cap reached: later candidates are not even sanitized.
        if (kept.length >= EXTRACTION_MAX_ITEMS) {
          return kept;
        }
      }
    }
  }

  return kept;
}
/**
 * Scans filing text line by line and returns the lines that match at least one
 * of the supplied patterns, de-duplicated and capped by uniqueExtractionList.
 *
 * Lines are produced by splitting on CR/LF runs, collapsing internal
 * whitespace and trimming; lines shorter than 24 characters are ignored.
 *
 * Scanning stops after `EXTRACTION_MAX_ITEMS * 2` distinct matching lines.
 * Exact repeats (compared case-insensitively) are skipped before they count
 * against that budget, so text that repeats the same matching line many times
 * cannot crowd out later, different matches.
 *
 * @param filingText - Raw filing text.
 * @param patterns - Regular expressions; a line is kept if any of them matches.
 * @returns Matching lines in document order, as returned by uniqueExtractionList.
 */
function collectTextSignals(filingText: string, patterns: RegExp[]) {
  const candidateLines = filingText
    .replace(/\r/g, '\n')
    .split(/\n+/)
    .map((line) => line.replace(/\s+/g, ' ').trim())
    .filter((line) => line.length >= 24);

  const budget = EXTRACTION_MAX_ITEMS * 2;
  const seenLines = new Set<string>();
  const matches: string[] = [];

  for (const line of candidateLines) {
    const signature = line.toLowerCase();
    if (seenLines.has(signature)) {
      continue;
    }

    const isMatch = patterns.some((pattern) => {
      // A global or sticky regex resumes from lastIndex on each .test() call;
      // reset it so every line is evaluated from its start.
      if (pattern.global || pattern.sticky) {
        pattern.lastIndex = 0;
      }
      return pattern.test(line);
    });
    if (!isMatch) {
      continue;
    }

    seenLines.add(signature);
    matches.push(line);

    if (matches.length >= budget) {
      break;
    }
  }

  return uniqueExtractionList(matches);
}
function parseExtractionPayload(raw: string): FilingExtraction | null {
const fencedJson = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1];
const candidate = fencedJson ?? (() => {
@@ -145,11 +271,26 @@ function parseExtractionPayload(raw: string): FilingExtraction | null {
const redFlags = sanitizeExtractionList(payload.redFlags);
const followUpQuestions = sanitizeExtractionList(payload.followUpQuestions);
const portfolioSignals = sanitizeExtractionList(payload.portfolioSignals);
const segmentSpecificData = sanitizeExtractionList(payload.segmentSpecificData);
const geographicRevenueBreakdown = sanitizeExtractionList(payload.geographicRevenueBreakdown);
const companySpecificData = sanitizeExtractionList(payload.companySpecificData);
const secApiCrossChecks = sanitizeExtractionList(payload.secApiCrossChecks);
const confidenceRaw = typeof payload.confidence === 'number'
? payload.confidence
: Number(payload.confidence);
if (!summary || !keyPoints || !redFlags || !followUpQuestions || !portfolioSignals || !Number.isFinite(confidenceRaw)) {
if (
!summary
|| !keyPoints
|| !redFlags
|| !followUpQuestions
|| !portfolioSignals
|| !segmentSpecificData
|| !geographicRevenueBreakdown
|| !companySpecificData
|| !secApiCrossChecks
|| !Number.isFinite(confidenceRaw)
) {
return null;
}
@@ -159,6 +300,10 @@ function parseExtractionPayload(raw: string): FilingExtraction | null {
redFlags,
followUpQuestions,
portfolioSignals,
segmentSpecificData,
geographicRevenueBreakdown,
companySpecificData,
secApiCrossChecks,
confidence: Math.min(Math.max(confidenceRaw, 0), 1)
};
}
@@ -171,11 +316,37 @@ function metricSnapshotLine(label: string, value: number | null | undefined) {
return `${label}: ${Math.round(value).toLocaleString('en-US')}`;
}
/**
 * Produces one human-readable check line per entry of METRIC_CHECK_PATTERNS,
 * describing how the filing's SEC API metric relates to the filing text.
 *
 * For each descriptor:
 * - if the metric is null, undefined or not a finite number, the line says the
 *   metric is unavailable;
 * - otherwise the rounded, en-US formatted value is reported together with
 *   whether any of the descriptor's keyword patterns occurs in the text.
 *
 * The text test is keyword-based only; the numeric value is not searched for.
 *
 * @param filing - Filing whose `metrics` supply the values.
 * @param filingText - Filing text to search (lower-cased before matching).
 * @returns The check lines after passing through uniqueExtractionList.
 */
function buildSecApiCrossChecks(filing: Filing, filingText: string) {
  const haystack = filingText.toLowerCase();

  const checks = METRIC_CHECK_PATTERNS.map(({ key, label, patterns }) => {
    const value = filing.metrics?.[key];
    if (value === null || value === undefined || !Number.isFinite(value)) {
      return `${label}: SEC API metric unavailable for this filing.`;
    }

    const mentioned = patterns.some((pattern) => pattern.test(haystack));
    const formatted = Math.round(value).toLocaleString('en-US');

    return mentioned
      ? `${label}: SEC API value ${formatted} appears referenced in filing narrative.`
      : `${label}: SEC API value ${formatted} was not confidently located in sampled filing text.`;
  });

  return uniqueExtractionList(checks);
}
function deterministicExtractionFallback(filing: Filing): FilingExtraction {
const metrics = filing.metrics;
return {
summary: `${filing.company_name} ${filing.filing_type} filed on ${filing.filing_date}. Deterministic extraction fallback used due unavailable or invalid local parsing output.`,
summary: `${filing.company_name} ${filing.filing_type} filed on ${filing.filing_date}. Deterministic extraction fallback was used because filing text parsing was unavailable or invalid.`,
keyPoints: [
`${filing.filing_type} filing recorded for ${filing.ticker}.`,
metricSnapshotLine('Revenue', metrics?.revenue),
@@ -197,19 +368,101 @@ function deterministicExtractionFallback(filing: Filing): FilingExtraction {
'Cross-check leverage and liquidity metrics against position sizing rules.',
'Track language shifts around guidance or demand assumptions.'
],
segmentSpecificData: [
'Segment-level disclosures were not parsed in deterministic fallback mode.'
],
geographicRevenueBreakdown: [
'Geographic revenue disclosures were not parsed in deterministic fallback mode.'
],
companySpecificData: [
'Company-specific operating KPIs (for example same-store sales) were not parsed in deterministic fallback mode.'
],
secApiCrossChecks: [
`${metricSnapshotLine('Revenue', metrics?.revenue)} (SEC API baseline; text verification unavailable).`,
`${metricSnapshotLine('Net income', metrics?.netIncome)} (SEC API baseline; text verification unavailable).`
],
confidence: 0.2
};
}
/**
 * Builds an extraction from keyword scanning of the filing text, layered on
 * top of deterministicExtractionFallback.
 *
 * The text is scanned with the segment, geographic and company-specific
 * pattern tables, and the SEC API cross-check lines are generated. The first
 * hit of each scan is appended to the baseline key points; the first
 * "not confidently located" cross-check (if any) is appended to the baseline
 * red flags; one follow-up question and one portfolio signal are added,
 * worded according to whether segment / company-specific hits were found.
 * Any of the four detail lists that came back empty falls back to the
 * baseline's list. Confidence is 0.4 when at least one text signal was found
 * and 0.3 otherwise.
 *
 * @param filing - Filing metadata and metrics.
 * @param filingText - Text of the filing document.
 * @returns The assembled rule-based extraction.
 */
function buildRuleBasedExtraction(filing: Filing, filingText: string): FilingExtraction {
  const baseline = deterministicExtractionFallback(filing);
  const segmentSignals = collectTextSignals(filingText, SEGMENT_PATTERNS);
  const geographicSignals = collectTextSignals(filingText, GEOGRAPHIC_PATTERNS);
  const kpiSignals = collectTextSignals(filingText, COMPANY_SPECIFIC_PATTERNS);
  const crossChecks = buildSecApiCrossChecks(filing, filingText);

  const [firstSegment] = segmentSignals;
  const [firstGeographic] = geographicSignals;
  const [firstKpi] = kpiSignals;

  const hasSegmentSignals = segmentSignals.length > 0;
  const hasGeographicSignals = geographicSignals.length > 0;
  const hasKpiSignals = kpiSignals.length > 0;

  // Use the scanned list when it has entries, otherwise the baseline placeholder list.
  const orBaseline = (found: string[], placeholder: string[]) => (found.length > 0 ? found : placeholder);

  const keyPoints = uniqueExtractionList([
    ...baseline.keyPoints,
    firstSegment ? `Segment detail: ${firstSegment}` : null,
    firstGeographic ? `Geographic detail: ${firstGeographic}` : null,
    firstKpi ? `Company-specific KPI: ${firstKpi}` : null
  ]);

  const redFlags = uniqueExtractionList([
    ...baseline.redFlags,
    crossChecks.find((line) => /not confidently located/i.test(line))
  ]);

  const segmentQuestion = hasSegmentSignals
    ? 'How do segment trends change the consolidated margin outlook?'
    : 'Does management provide segment-level KPIs in supplemental exhibits?';
  const followUpQuestions = uniqueExtractionList([...baseline.followUpQuestions, segmentQuestion]);

  const kpiSignal = hasKpiSignals
    ? 'Incorporate company-specific KPI direction into near-term position sizing.'
    : 'Track future filings for explicit operating KPI disclosures.';
  const portfolioSignals = uniqueExtractionList([...baseline.portfolioSignals, kpiSignal]);

  const foundAnyTextSignal = hasSegmentSignals || hasGeographicSignals || hasKpiSignals;

  return {
    summary: `${filing.company_name} ${filing.filing_type} filed on ${filing.filing_date}. SEC API metrics were retained as the baseline and filing text was scanned for segment and company-specific disclosures.`,
    keyPoints,
    redFlags,
    followUpQuestions,
    portfolioSignals,
    segmentSpecificData: orBaseline(segmentSignals, baseline.segmentSpecificData),
    geographicRevenueBreakdown: orBaseline(geographicSignals, baseline.geographicRevenueBreakdown),
    companySpecificData: orBaseline(kpiSignals, baseline.companySpecificData),
    secApiCrossChecks: orBaseline(crossChecks, baseline.secApiCrossChecks),
    confidence: foundAnyTextSignal ? 0.4 : 0.3
  };
}
/**
 * Chooses between two lists: the primary one when it has at least one entry,
 * otherwise the fallback. The chosen array is returned as-is (not copied).
 *
 * @param primary - Preferred list.
 * @param fallback - List used when `primary` is empty.
 * @returns `primary` if non-empty, else `fallback`.
 */
function preferExtractionList(primary: string[], fallback: string[]) {
  if (primary.length === 0) {
    return fallback;
  }
  return primary;
}
/**
 * Combines a primary extraction with a fallback one, field by field.
 *
 * The summary comes from `primary` unless it is falsy (for example an empty
 * string). Each list field comes from `primary` unless that list is empty, in
 * which case the fallback's list is used. Confidence always comes from
 * `primary`, clamped to the range 0..1.
 *
 * @param primary - Preferred extraction.
 * @param fallback - Extraction used to fill gaps in `primary`.
 * @returns A new extraction object; the inputs are not modified.
 */
function mergeExtractionWithFallback(primary: FilingExtraction, fallback: FilingExtraction): FilingExtraction {
  type ListField =
    | 'keyPoints'
    | 'redFlags'
    | 'followUpQuestions'
    | 'portfolioSignals'
    | 'segmentSpecificData'
    | 'geographicRevenueBreakdown'
    | 'companySpecificData'
    | 'secApiCrossChecks';

  const pick = (field: ListField) => preferExtractionList(primary[field], fallback[field]);

  const summary = primary.summary || fallback.summary;
  const lowerBounded = Math.max(primary.confidence, 0);
  const confidence = Math.min(lowerBounded, 1);

  return {
    summary,
    keyPoints: pick('keyPoints'),
    redFlags: pick('redFlags'),
    followUpQuestions: pick('followUpQuestions'),
    portfolioSignals: pick('portfolioSignals'),
    segmentSpecificData: pick('segmentSpecificData'),
    geographicRevenueBreakdown: pick('geographicRevenueBreakdown'),
    companySpecificData: pick('companySpecificData'),
    secApiCrossChecks: pick('secApiCrossChecks'),
    confidence
  };
}
function extractionPrompt(filing: Filing, filingText: string) {
return [
'Extract structured signals from the SEC filing text.',
`Company: ${filing.company_name} (${filing.ticker})`,
`Form: ${filing.filing_type}`,
`Filed: ${filing.filing_date}`,
`SEC API baseline metrics: ${JSON.stringify(filing.metrics ?? {})}`,
'Use SEC API metrics as canonical numeric values and validate whether each appears consistent with filing text context.',
'Prioritize company-specific and segment-specific disclosures not covered by SEC endpoint fields (for example same-store sales, geographic mix, segment margin).',
'Return ONLY valid JSON with exactly these keys and no extra keys:',
'{"summary":"string","keyPoints":["string"],"redFlags":["string"],"followUpQuestions":["string"],"portfolioSignals":["string"],"confidence":0}',
`Rules: keyPoints/redFlags/followUpQuestions/portfolioSignals arrays max ${EXTRACTION_MAX_ITEMS} items; each item <= ${EXTRACTION_ITEM_MAX_LENGTH} chars; summary <= ${EXTRACTION_SUMMARY_MAX_LENGTH} chars; confidence between 0 and 1.`,
'{"summary":"string","keyPoints":["string"],"redFlags":["string"],"followUpQuestions":["string"],"portfolioSignals":["string"],"segmentSpecificData":["string"],"geographicRevenueBreakdown":["string"],"companySpecificData":["string"],"secApiCrossChecks":["string"],"confidence":0}',
`Rules: every array max ${EXTRACTION_MAX_ITEMS} items; each item <= ${EXTRACTION_ITEM_MAX_LENGTH} chars; summary <= ${EXTRACTION_SUMMARY_MAX_LENGTH} chars; confidence between 0 and 1.`,
'Filing text follows:',
filingText
].join('\n\n');
@@ -225,8 +478,9 @@ function reportPrompt(
`Analyze this SEC filing from ${filing.company_name} (${filing.ticker}).`,
`Form: ${filing.filing_type}`,
`Filed: ${filing.filing_date}`,
`Metrics: ${JSON.stringify(filing.metrics ?? {})}`,
`SEC API baseline metrics: ${JSON.stringify(filing.metrics ?? {})}`,
`Structured extraction context (${extractionMeta.source}): ${JSON.stringify(extraction)}`,
'Use SEC API values as the baseline financials and explicitly reference segment/company-specific details from extraction.',
'Return concise sections: Thesis, Red Flags, Follow-up Questions, Portfolio Impact.'
].join('\n');
}
@@ -252,12 +506,37 @@ async function processSyncFilings(task: Task) {
const ticker = parseTicker(task.payload.ticker);
const limit = parseLimit(task.payload.limit, 20, 1, 50);
const filings = await fetchRecentFilings(ticker, limit);
const metricsByCik = new Map<string, Filing['metrics']>();
const metricsByAccession = new Map<string, Filing['metrics']>();
const filingsByCik = new Map<string, typeof filings>();
for (const filing of filings) {
if (!metricsByCik.has(filing.cik)) {
const metrics = await fetchFilingMetrics(filing.cik, filing.ticker);
metricsByCik.set(filing.cik, metrics);
const group = filingsByCik.get(filing.cik);
if (group) {
group.push(filing);
continue;
}
filingsByCik.set(filing.cik, [filing]);
}
for (const [cik, filingsForCik] of filingsByCik) {
const filingsForFinancialMetrics = filingsForCik.filter((filing) => isFinancialMetricsForm(filing.filingType));
if (filingsForFinancialMetrics.length === 0) {
continue;
}
const metricsMap = await fetchFilingMetricsForFilings(
cik,
filingsForCik[0]?.ticker ?? ticker,
filingsForFinancialMetrics.map((filing) => ({
accessionNumber: filing.accessionNumber,
filingDate: filing.filingDate,
filingType: filing.filingType
}))
);
for (const [accessionNumber, metrics] of metricsMap.entries()) {
metricsByAccession.set(accessionNumber, metrics);
}
}
@@ -272,7 +551,7 @@ async function processSyncFilings(task: Task) {
filing_url: filing.filingUrl,
submission_url: filing.submissionUrl,
primary_document: filing.primaryDocument,
metrics: metricsByCik.get(filing.cik) ?? null,
metrics: metricsByAccession.get(filing.accessionNumber) ?? null,
links: filingLinks(filing)
}))
);
@@ -341,6 +620,15 @@ async function processAnalyzeFiling(task: Task) {
});
if (filingDocument?.text) {
const ruleBasedExtraction = buildRuleBasedExtraction(filing, filingDocument.text);
extraction = ruleBasedExtraction;
extractionMeta = {
provider: 'deterministic-fallback',
model: 'filing-rule-based',
source: filingDocument.source,
generatedAt: new Date().toISOString()
};
const extractionResult = await runAiAnalysis(
extractionPrompt(filing, filingDocument.text),
'Return strict JSON only.',
@@ -349,7 +637,7 @@ async function processAnalyzeFiling(task: Task) {
const parsed = parseExtractionPayload(extractionResult.text);
if (parsed) {
extraction = parsed;
extraction = mergeExtractionWithFallback(parsed, ruleBasedExtraction);
extractionMeta = {
provider: extractionResult.provider === 'local-fallback' ? 'deterministic-fallback' : 'ollama',
model: extractionResult.model,
@@ -360,6 +648,12 @@ async function processAnalyzeFiling(task: Task) {
}
} catch {
extraction = defaultExtraction;
extractionMeta = {
provider: 'deterministic-fallback',
model: 'metadata-fallback',
source: 'metadata_fallback',
generatedAt: new Date().toISOString()
};
}
const analysis = await runAiAnalysis(
@@ -435,7 +729,8 @@ async function processPortfolioInsights(task: Task) {
export const __taskProcessorInternals = {
parseExtractionPayload,
deterministicExtractionFallback
deterministicExtractionFallback,
isFinancialMetricsForm
};
export async function runTaskProcessor(task: Task) {