// Neon-Desk/lib/server/sec.ts
// (TypeScript source, 1623 lines, 41 KiB)

import type { Filing, FinancialStatementKind } from "@/lib/types";
import type {
DimensionStatementBundle,
DimensionStatementSnapshotRow,
FilingFaithfulStatementSnapshotRow,
FilingStatementBundle,
FilingStatementSnapshotPeriod,
StandardizedStatementBundle,
StandardizedStatementSnapshotRow,
} from "@/lib/server/repos/filing-statements";
// Filing form identifier, taken from the shared `Filing` model.
type FilingType = Filing["filing_type"];
// Metrics object stored on a filing, with the null/undefined case removed.
type FilingMetrics = NonNullable<Filing["metrics"]>;
// One entry of the ticker directory JSON loaded by `ensureTickerCache`.
type TickerDirectoryRecord = {
cik_str: number;
ticker: string;
title: string;
};
// Subset of the company submissions JSON read by `fetchRecentFilings`.
// The arrays under `filings.recent` are read in parallel by index, so entry `i`
// of each array is treated as describing the same filing.
type RecentFilingsPayload = {
filings?: {
recent?: {
accessionNumber?: string[];
filingDate?: string[];
form?: string[];
primaryDocument?: string[];
};
};
cik?: string;
name?: string;
};
// Subset of the company facts JSON: tag -> unit name -> reported data points.
// Only the "us-gaap" taxonomy is read by this module.
type CompanyFactsPayload = {
facts?: {
"us-gaap"?: Record<string, { units?: Record<string, CompanyFactPoint[]> }>;
};
};
// A single reported value. Every field is optional, so consumers validate
// `val` and the date fields before using them.
type CompanyFactPoint = {
val?: number;
end?: string;
filed?: string;
accn?: string;
form?: string;
fy?: number;
fp?: string;
frame?: string;
};
// Normalized filing record returned by `fetchRecentFilings`.
type SecFiling = {
ticker: string;
cik: string;
companyName: string;
filingType: FilingType;
filingDate: string;
accessionNumber: string;
filingUrl: string | null;
submissionUrl: string | null;
primaryDocument: string | null;
};
// Everything needed to locate a filing's primary document: either a direct
// URL, or the CIK + accession number + document name to build one from.
type FilingDocumentInput = {
filingUrl: string | null;
cik: string;
accessionNumber: string;
primaryDocument: string | null;
};
// Optional overrides for `fetchPrimaryFilingText`: an injectable fetch
// implementation and the character budget for the returned text.
type FetchPrimaryFilingTextOptions = {
fetchImpl?: typeof fetch;
maxChars?: number;
};
// Plain-text rendering of a primary filing document, plus the URL it came
// from and whether the text was clipped to the character budget.
type FilingDocumentText = {
source: "primary_document";
url: string;
text: string;
truncated: boolean;
};
// Identifies the filing a metric lookup should be matched against
// (by accession number first, then by form type and filing date).
type FilingMetricsLookupInput = {
accessionNumber: string;
filingDate: string;
filingType: FilingType;
};
// Forms kept by `fetchRecentFilings`; amendments ("/A") are folded into their
// base form by `normalizeForm` before this list is consulted.
const SUPPORTED_FORMS: FilingType[] = ["10-K", "10-Q", "8-K"];
// How long the in-memory ticker directory is considered fresh (12 hours, in ms).
const TICKER_CACHE_TTL_MS = 1000 * 60 * 60 * 12;
// Default character budget for filing text returned by `fetchPrimaryFilingText`.
const FILING_TEXT_MAX_CHARS = 24_000;
// Candidate us-gaap tags per metric. `pickFactByTags` tries them in array
// order and returns the first tag that yields a value, so earlier entries
// take precedence.
const METRIC_TAGS = {
revenue: [
"Revenues",
"SalesRevenueNet",
"RevenueFromContractWithCustomerExcludingAssessedTax",
"TotalRevenuesAndOtherIncome",
],
netIncome: ["NetIncomeLoss", "ProfitLoss"],
totalAssets: ["Assets"],
cash: [
"CashAndCashEquivalentsAtCarryingValue",
"CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents",
],
debt: [
"LongTermDebtAndCapitalLeaseObligations",
"LongTermDebtNoncurrent",
"LongTermDebt",
"DebtAndFinanceLeaseLiabilities",
],
} as const;
// Module-level ticker directory cache, keyed by upper-cased ticker symbol.
// Replaced wholesale by `ensureTickerCache` on every reload.
let tickerCache = new Map<string, TickerDirectoryRecord>();
// `Date.now()` timestamp of the last successful reload; 0 means never loaded.
let tickerCacheLoadedAt = 0;
/**
 * Resolves the User-Agent header value sent with SEC requests.
 *
 * @returns The `SEC_USER_AGENT` environment variable when it is set to a
 *   non-empty value, otherwise the built-in placeholder identity.
 */
function envUserAgent() {
  const configured = process.env.SEC_USER_AGENT;
  if (configured) {
    return configured;
  }
  return "Fiscal Clone <support@fiscal.local>";
}
/**
 * Returns the first ten characters of the current instant's ISO-8601
 * timestamp, i.e. the UTC calendar date as `YYYY-MM-DD`.
 */
function todayIso() {
  const isoTimestamp = new Date().toISOString();
  return isoTimestamp.substring(0, 10);
}
/**
 * Decodes the HTML entities this module cares about: `&nbsp;`, `&amp;`,
 * `&lt;`, `&gt;`, `&quot;` and decimal / hexadecimal numeric references.
 *
 * All entities are decoded in a single pass so that text produced by one
 * replacement is never re-interpreted as another entity. (Decoding `&amp;`
 * first and `&lt;` afterwards would turn the escaped literal `&amp;lt;`
 * into `<` instead of `&lt;`.)
 *
 * @param value Text that may contain HTML entities.
 * @returns The text with recognised entities replaced. `&nbsp;` and `&#160;`
 *   become a regular space; numeric references outside the Unicode range
 *   become a single space. Unrecognised entities are left untouched.
 */
function decodeHtmlEntities(value: string) {
  const decodeCodePoint = (code: number) => {
    if (!Number.isFinite(code) || code < 0 || code > 0x10ffff) {
      return " ";
    }
    try {
      return String.fromCodePoint(code);
    } catch {
      return " ";
    }
  };

  return value.replace(
    /&(nbsp|amp|lt|gt|quot|#x[0-9a-f]+|#[0-9]+);/gi,
    (_match, rawEntity: string) => {
      const entity = rawEntity.toLowerCase();
      switch (entity) {
        case "nbsp":
        case "#160":
          // Non-breaking spaces are flattened to plain spaces for text output.
          return " ";
        case "amp":
          return "&";
        case "lt":
          return "<";
        case "gt":
          return ">";
        case "quot":
          return '"';
        default:
          break;
      }
      if (entity.startsWith("#x")) {
        return decodeCodePoint(Number.parseInt(entity.slice(2), 16));
      }
      return decodeCodePoint(Number.parseInt(entity.slice(1), 10));
    },
  );
}
/**
 * Converts a raw SEC document (HTML or plain text) into compact plain text.
 *
 * Steps: normalise line endings, drop script/style/noscript blocks and HTML
 * comments, turn block-level tags into newlines, strip the remaining tags,
 * decode HTML entities, then collapse redundant whitespace.
 *
 * @param raw Document body as downloaded.
 * @returns Trimmed text with at most one blank line between paragraphs.
 */
export function normalizeSecDocumentText(raw: string) {
  const withoutMarkup = raw
    // Treat CRLF as ONE line break; mapping a bare "\r" to "\n" would turn
    // every Windows line ending into a blank line.
    .replace(/\r\n?/g, "\n")
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(
      /<\/?(p|div|section|article|li|tr|td|th|h[1-6]|br|hr)[^>]*>/gi,
      "\n",
    )
    .replace(/<[^>]+>/g, " ");

  return decodeHtmlEntities(withoutMarkup)
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n[ \t]+/g, "\n")
    .replace(/[ \t]{2,}/g, " ")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
/**
 * Clips document text to a character budget, preferring to cut at a newline
 * or space near the end of the budget.
 *
 * @param text Normalised document text.
 * @param maxChars Requested budget. Values below 1,000 (and NaN) are raised
 *   to 1,000; fractional values are truncated.
 * @returns The (possibly clipped) text and whether clipping happened.
 */
export function trimSecDocumentTextForPrompt(
  text: string,
  maxChars = FILING_TEXT_MAX_CHARS,
) {
  const MIN_CHARS = 1_000;
  const requested = Math.trunc(maxChars);
  // Math.max propagates NaN, which would make every comparison below false
  // and clip the whole document to an empty string.
  const safeMax = Number.isNaN(requested)
    ? MIN_CHARS
    : Math.max(requested, MIN_CHARS);

  if (text.length <= safeMax) {
    return { text, truncated: false };
  }

  const slice = text.slice(0, safeMax);
  const boundary = Math.max(slice.lastIndexOf("\n"), slice.lastIndexOf(" "));
  // Only back up to the boundary when it keeps more than 70% of the budget;
  // otherwise a hard cut loses less content.
  const clipped = (
    boundary > safeMax * 0.7 ? slice.slice(0, boundary) : slice
  ).trimEnd();
  return { text: clipped, truncated: true };
}
/** Removes every hyphen from an accession number (e.g. for archive paths). */
function compactAccessionNumber(value: string) {
  return value.split("-").join("");
}
/**
 * Reduces an accession number to its digits so differently formatted values
 * can be compared. Missing input yields an empty string.
 */
function normalizeAccessionKey(value: string | undefined | null) {
  if (value == null) {
    return "";
  }
  return value.replace(/[^0-9]/g, "");
}
/**
 * Canonicalises a form name: trims, upper-cases, and drops a trailing "/A"
 * amendment suffix. Missing or blank input yields an empty string.
 */
function normalizeForm(value: string | undefined | null) {
  const upper = (value ?? "").trim().toUpperCase();
  const amendmentSuffix = "/A";
  if (upper.endsWith(amendmentSuffix)) {
    return upper.slice(0, upper.length - amendmentSuffix.length);
  }
  return upper;
}
/**
 * Parses a date string with `Date.parse`.
 *
 * @returns Milliseconds since the epoch, or NaN when the input is missing,
 *   empty, or unparseable.
 */
function parseDate(value: string | undefined | null) {
  return value ? Date.parse(value) : Number.NaN;
}
/**
 * Produces the CIK form used in archive URL paths: digits only, without
 * leading zeros.
 *
 * The zeros are stripped textually rather than via `Number()`, so digit
 * strings beyond 2^53 are not rounded or rendered in exponent notation.
 *
 * @param value CIK in any formatting (padded, prefixed, etc.).
 * @returns The unpadded digit string ("0" for an all-zero input), or `null`
 *   when the input contains no digits.
 */
function normalizeCikForPath(value: string) {
  const digits = value.replace(/\D/g, "");
  if (!digits) {
    return null;
  }
  const unpadded = digits.replace(/^0+/, "");
  return unpadded || "0";
}
/**
 * Determines the URL of a filing's primary document.
 *
 * @param input Filing locator. A non-blank `filingUrl` wins; otherwise the
 *   URL is assembled from CIK, accession number and primary document name.
 * @returns The document URL, or `null` when there is no direct URL and the
 *   parts needed to build one are missing.
 */
export function resolvePrimaryFilingUrl(input: FilingDocumentInput) {
  const explicitUrl = input.filingUrl?.trim();
  if (explicitUrl) {
    return explicitUrl;
  }

  const { primaryDocument } = input;
  if (!primaryDocument) {
    return null;
  }

  const cikSegment = normalizeCikForPath(input.cik);
  const accessionSegment = compactAccessionNumber(input.accessionNumber);
  if (cikSegment && accessionSegment) {
    return `https://www.sec.gov/Archives/edgar/data/${cikSegment}/${accessionSegment}/${primaryDocument}`;
  }
  return null;
}
/**
 * Downloads a filing's primary document and returns it as clipped plain text.
 *
 * @param input Filing locator passed to `resolvePrimaryFilingUrl`.
 * @param options Optional fetch implementation and character budget.
 * @returns The document text with its source URL, or `null` when no URL can
 *   be resolved or the document contains no text after normalisation.
 * @throws Error when the HTTP response status is not OK.
 */
export async function fetchPrimaryFilingText(
  input: FilingDocumentInput,
  options?: FetchPrimaryFilingTextOptions,
): Promise<FilingDocumentText | null> {
  const documentUrl = resolvePrimaryFilingUrl(input);
  if (!documentUrl) {
    return null;
  }

  const response = await (options?.fetchImpl ?? fetch)(documentUrl, {
    headers: {
      "User-Agent": envUserAgent(),
      Accept: "text/html, text/plain;q=0.9, */*;q=0.8",
    },
    cache: "no-store",
  });
  if (!response.ok) {
    throw new Error(`SEC filing request failed (${response.status})`);
  }

  const cleaned = normalizeSecDocumentText(await response.text());
  if (!cleaned) {
    return null;
  }

  const { text, truncated } = trimSecDocumentTextForPrompt(
    cleaned,
    options?.maxChars ?? FILING_TEXT_MAX_CHARS,
  );
  return text
    ? { source: "primary_document", url: documentUrl, text, truncated }
    : null;
}
/**
 * GETs a JSON resource using the configured User-Agent and no HTTP caching.
 *
 * The response body is returned as `T` without runtime validation.
 *
 * @throws Error when the HTTP response status is not OK.
 */
async function fetchJson<T>(url: string): Promise<T> {
  const headers = {
    "User-Agent": envUserAgent(),
    Accept: "application/json",
  };
  const response = await fetch(url, { headers, cache: "no-store" });
  if (response.ok) {
    return (await response.json()) as T;
  }
  throw new Error(`SEC request failed (${response.status})`);
}
/**
 * Loads the ticker directory into the module-level cache when the cache is
 * empty or older than `TICKER_CACHE_TTL_MS`; otherwise does nothing.
 *
 * The cache is only replaced after the download and indexing succeed.
 */
async function ensureTickerCache() {
  const cacheAgeMs = Date.now() - tickerCacheLoadedAt;
  const stillUsable = cacheAgeMs < TICKER_CACHE_TTL_MS && tickerCache.size > 0;
  if (stillUsable) {
    return;
  }

  const directory = await fetchJson<Record<string, TickerDirectoryRecord>>(
    "https://www.sec.gov/files/company_tickers.json",
  );
  // Index by upper-cased symbol; a later duplicate overwrites an earlier one.
  const bySymbol = new Map<string, TickerDirectoryRecord>(
    Object.values(directory).map(
      (entry): [string, TickerDirectoryRecord] => [
        entry.ticker.toUpperCase(),
        entry,
      ],
    ),
  );

  tickerCache = bySymbol;
  tickerCacheLoadedAt = Date.now();
}
/**
 * Looks a ticker symbol up in the cached directory.
 *
 * @param ticker Symbol in any casing, optionally padded with whitespace.
 * @returns The normalised symbol, the CIK as a string, and the company title.
 * @throws Error when the symbol is not present in the directory.
 */
async function resolveTicker(ticker: string) {
  await ensureTickerCache();

  const symbol = ticker.trim().toUpperCase();
  const entry = tickerCache.get(symbol);
  if (entry) {
    return {
      ticker: symbol,
      cik: String(entry.cik_str),
      companyName: entry.title,
    };
  }
  throw new Error(`Ticker ${symbol} not found in SEC directory`);
}
/**
 * Returns the most recently filed value reported for a tag, regardless of
 * which form reported it.
 *
 * Routing this through `pickFactForFiling` with a hard-coded "10-Q" filing
 * type would restrict the search to quarterly reports whenever any exist,
 * hiding newer values reported on other forms such as a 10-K.
 *
 * @param payload Company facts payload.
 * @param tag us-gaap tag name.
 * @returns The latest finite value, or `null` when the tag has no usable data.
 */
function pickLatestFact(
  payload: CompanyFactsPayload,
  tag: string,
): number | null {
  const latest = pickMostRecentFact(collectFactSeries(payload, tag));
  return typeof latest?.val === "number" && Number.isFinite(latest.val)
    ? latest.val
    : null;
}
/**
 * Gathers the data points reported for a us-gaap tag.
 *
 * Points from USD-denominated units are preferred (unit names starting with
 * "USD", except "USD/shares"); points from all other units are used only
 * when no USD points exist. Points without a finite numeric `val` are dropped.
 *
 * @param payload Company facts payload.
 * @param tag us-gaap tag name.
 * @returns The selected points, or an empty array when the tag is absent.
 */
function collectFactSeries(
  payload: CompanyFactsPayload,
  tag: string,
): CompanyFactPoint[] {
  const unitsByName = payload.facts?.["us-gaap"]?.[tag]?.units;
  if (!unitsByName) {
    return [];
  }

  const isUsdUnit = (unit: string) =>
    unit === "USD" || /^USD(?!\/shares)/i.test(unit);

  const usdPoints: CompanyFactPoint[] = [];
  const otherPoints: CompanyFactPoint[] = [];
  Object.entries(unitsByName).forEach(([unit, series]) => {
    if (!Array.isArray(series) || series.length === 0) {
      return;
    }
    const bucket = isUsdUnit(unit) ? usdPoints : otherPoints;
    for (const point of series) {
      bucket.push(point);
    }
  });

  const chosen = usdPoints.length > 0 ? usdPoints : otherPoints;
  return chosen.filter(
    (point) => typeof point.val === "number" && Number.isFinite(point.val),
  );
}
/**
 * Picks the point with the latest date, using `filed` and falling back to
 * `end`. Points without a parseable date sort after dated ones.
 *
 * The input array is not mutated.
 *
 * @returns The newest point, or `null` for an empty input.
 */
function pickMostRecentFact(points: CompanyFactPoint[]) {
  const newestFirst = points.slice().sort((left, right) => {
    const leftTime = parseDate(left.filed ?? left.end);
    const rightTime = parseDate(right.filed ?? right.end);
    const leftDated = Number.isFinite(leftTime);
    const rightDated = Number.isFinite(rightTime);

    if (leftDated && rightDated) {
      return rightTime - leftTime;
    }
    if (rightDated) {
      return 1;
    }
    return leftDated ? -1 : 0;
  });
  return newestFirst[0] ?? null;
}
/**
 * Chooses the point whose date (`filed`, else `end`) best matches a target.
 *
 * Preference order:
 * 1. the latest point dated on or before the target;
 * 2. otherwise the dated point nearest to the target;
 * 3. the most recent point overall when the target is not a finite timestamp
 *    or no point has a parseable date.
 *
 * @param points Candidate points.
 * @param targetDate Target timestamp in milliseconds (may be NaN).
 * @returns The chosen point, or `null` for an empty input.
 */
function pickClosestByDate(points: CompanyFactPoint[], targetDate: number) {
  if (points.length === 0) {
    return null;
  }
  if (!Number.isFinite(targetDate)) {
    return pickMostRecentFact(points);
  }

  const dated: Array<{ point: CompanyFactPoint; date: number }> = [];
  for (const point of points) {
    const date = parseDate(point.filed ?? point.end);
    if (Number.isFinite(date)) {
      dated.push({ point, date });
    }
  }
  if (dated.length === 0) {
    return pickMostRecentFact(points);
  }

  const onOrBefore = dated.filter((entry) => entry.date <= targetDate);
  if (onOrBefore.length > 0) {
    onOrBefore.sort((left, right) => right.date - left.date);
    return onOrBefore[0]?.point ?? null;
  }

  dated.sort((left, right) => {
    const gap =
      Math.abs(left.date - targetDate) - Math.abs(right.date - targetDate);
    return gap !== 0 ? gap : right.date - left.date;
  });
  return dated[0]?.point ?? null;
}
/**
 * Finds the value of a tag that best corresponds to a specific filing.
 *
 * Resolution order:
 * 1. points whose accession number matches the filing (most recent wins);
 * 2. points reported on the same form type, closest to the filing date;
 * 3. any point, closest to the filing date.
 *
 * @returns The matched value, or `null` when the tag has no usable data.
 */
function pickFactForFiling(
  payload: CompanyFactsPayload,
  tag: string,
  filing: FilingMetricsLookupInput,
): number | null {
  const series = collectFactSeries(payload, tag);
  if (series.length === 0) {
    return null;
  }

  const finiteValueOf = (point: CompanyFactPoint | null): number | null =>
    typeof point?.val === "number" && Number.isFinite(point.val)
      ? point.val
      : null;

  const wantedAccession = normalizeAccessionKey(filing.accessionNumber);
  if (wantedAccession) {
    const sameAccession = series.filter(
      (point) => normalizeAccessionKey(point.accn) === wantedAccession,
    );
    if (sameAccession.length > 0) {
      const exact = finiteValueOf(pickMostRecentFact(sameAccession));
      if (exact !== null) {
        return exact;
      }
    }
  }

  const wantedForm = normalizeForm(filing.filingType);
  const sameForm = wantedForm
    ? series.filter((point) => normalizeForm(point.form) === wantedForm)
    : series;
  const filedAt = parseDate(filing.filingDate);

  const formValue = finiteValueOf(pickClosestByDate(sameForm, filedAt));
  if (formValue !== null) {
    return formValue;
  }
  return finiteValueOf(pickClosestByDate(series, filedAt));
}
/**
 * Tries each candidate tag in order and returns the first value found for
 * the filing, or `null` when none of the tags yields a value.
 */
function pickFactByTags(
  payload: CompanyFactsPayload,
  tags: readonly string[],
  filing: FilingMetricsLookupInput,
) {
  for (const candidateTag of tags) {
    const resolved = pickFactForFiling(payload, candidateTag, filing);
    if (resolved === null) {
      continue;
    }
    return resolved;
  }
  return null;
}
/** Builds a fresh metrics object in which every metric is `null`. */
function emptyMetrics(): FilingMetrics {
  const blank: FilingMetrics = {
    revenue: null,
    netIncome: null,
    totalAssets: null,
    cash: null,
    debt: null,
  };
  return blank;
}
/**
 * Fetches a company's most recent filings of the supported form types.
 *
 * @param ticker Ticker symbol, resolved through the cached ticker directory.
 * @param limit Maximum number of filings to return. Truncated to an integer
 *   and clamped to 1..50; a NaN limit falls back to the default of 20.
 * @returns Filings in the order they appear in the submissions payload,
 *   skipping unsupported forms and entries without an accession number.
 *   Entries with no filing date are stamped with today's UTC date.
 * @throws Error when the ticker is unknown or an SEC request fails.
 */
export async function fetchRecentFilings(
  ticker: string,
  limit = 20,
): Promise<SecFiling[]> {
  const DEFAULT_LIMIT = 20;
  const requested = Math.trunc(limit);
  // Math.min/Math.max propagate NaN, and `length >= NaN` is never true, so an
  // unguarded NaN limit would silently disable the cap.
  const safeLimit = Number.isNaN(requested)
    ? DEFAULT_LIMIT
    : Math.min(Math.max(requested, 1), 50);

  const company = await resolveTicker(ticker);
  const cikPadded = company.cik.padStart(10, "0");
  const submissionUrl = `https://data.sec.gov/submissions/CIK${cikPadded}.json`;
  const payload = await fetchJson<RecentFilingsPayload>(submissionUrl);

  const recent = payload.filings?.recent;
  if (!recent) {
    return [];
  }

  const forms = recent.form ?? [];
  const accessionNumbers = recent.accessionNumber ?? [];
  const filingDates = recent.filingDate ?? [];
  const primaryDocuments = recent.primaryDocument ?? [];
  const archiveBase = `https://www.sec.gov/Archives/edgar/data/${Number(company.cik)}`;

  const filings: SecFiling[] = [];
  for (let i = 0; i < forms.length; i += 1) {
    // Look the form up in the supported list so the value is a checked
    // FilingType rather than an asserted one.
    const normalizedForm = normalizeForm(forms[i]);
    const filingType = SUPPORTED_FORMS.find((form) => form === normalizedForm);
    if (!filingType) {
      continue;
    }

    const accessionNumber = accessionNumbers[i];
    if (!accessionNumber) {
      continue;
    }

    const documentName = primaryDocuments[i];
    const filingUrl = documentName
      ? `${archiveBase}/${compactAccessionNumber(accessionNumber)}/${documentName}`
      : null;

    filings.push({
      ticker: company.ticker,
      cik: company.cik,
      companyName: payload.name ?? company.companyName,
      filingType,
      filingDate: filingDates[i] ?? todayIso(),
      accessionNumber,
      filingUrl,
      submissionUrl,
      primaryDocument: documentName ?? null,
    });

    if (filings.length >= safeLimit) {
      break;
    }
  }
  return filings;
}
/**
 * Fetches a company's facts and returns the latest value of each headline
 * metric.
 *
 * Each metric is resolved through the candidate lists in `METRIC_TAGS`, in
 * order, so this agrees with `fetchFilingMetricsForFilings` for companies
 * that report under an alternative tag (for example a revenue tag other than
 * `Revenues`).
 *
 * @param cik Company CIK; left-padded to ten digits for the request.
 * @throws Error when the SEC request fails.
 */
async function fetchLatestFilingMetrics(cik: string) {
  const normalized = cik.padStart(10, "0");
  const payload = await fetchJson<CompanyFactsPayload>(
    `https://data.sec.gov/api/xbrl/companyfacts/CIK${normalized}.json`,
  );

  const latestByTags = (tags: readonly string[]): number | null => {
    for (const tag of tags) {
      const value = pickLatestFact(payload, tag);
      if (value !== null) {
        return value;
      }
    }
    return null;
  };

  return {
    revenue: latestByTags(METRIC_TAGS.revenue),
    netIncome: latestByTags(METRIC_TAGS.netIncome),
    totalAssets: latestByTags(METRIC_TAGS.totalAssets),
    cash: latestByTags(METRIC_TAGS.cash),
    debt: latestByTags(METRIC_TAGS.debt),
  } satisfies FilingMetrics;
}
/**
 * Resolves headline metrics for a batch of filings from one company facts
 * download.
 *
 * This is best-effort: if the download or the extraction throws, every
 * filing is mapped to all-null metrics instead of the error propagating.
 *
 * @param cik Company CIK; left-padded to ten digits for the request.
 * @param _ticker Unused.
 * @param filings Filings to resolve.
 * @returns Metrics keyed by each filing's accession number (an empty map for
 *   an empty input, in which case no request is made).
 */
export async function fetchFilingMetricsForFilings(
  cik: string,
  _ticker: string,
  filings: FilingMetricsLookupInput[],
) {
  const metricsByAccession = new Map<string, FilingMetrics>();
  if (filings.length === 0) {
    return metricsByAccession;
  }

  try {
    const paddedCik = cik.padStart(10, "0");
    const facts = await fetchJson<CompanyFactsPayload>(
      `https://data.sec.gov/api/xbrl/companyfacts/CIK${paddedCik}.json`,
    );
    filings.forEach((filing) => {
      metricsByAccession.set(filing.accessionNumber, {
        revenue: pickFactByTags(facts, METRIC_TAGS.revenue, filing),
        netIncome: pickFactByTags(facts, METRIC_TAGS.netIncome, filing),
        totalAssets: pickFactByTags(facts, METRIC_TAGS.totalAssets, filing),
        cash: pickFactByTags(facts, METRIC_TAGS.cash, filing),
        debt: pickFactByTags(facts, METRIC_TAGS.debt, filing),
      });
    });
  } catch {
    // Overwrite any partially filled entries so the result is uniform.
    filings.forEach((filing) => {
      metricsByAccession.set(filing.accessionNumber, emptyMetrics());
    });
  }
  return metricsByAccession;
}
// Input for `hydrateFilingStatementSnapshot`: the filing to parse, how to
// locate its documents, and the metrics already stored for it.
type FilingStatementHydrationInput = {
filingId: number;
ticker: string;
cik: string;
accessionNumber: string;
filingDate: string;
filingType: "10-K" | "10-Q";
filingUrl: string | null;
primaryDocument: string | null;
metrics: Filing["metrics"];
};
// Result of hydrating one filing: the three statement bundles plus the parse
// status, error message, and which data source produced the bundles.
type FilingStatementHydrationResult = {
filing_id: number;
ticker: string;
filing_date: string;
filing_type: "10-K" | "10-Q";
period_end: string | null;
statement_bundle: FilingStatementBundle | null;
standardized_bundle: StandardizedStatementBundle | null;
dimension_bundle: DimensionStatementBundle | null;
parse_status: "ready" | "partial" | "failed";
parse_error: string | null;
source: "sec_filing_summary" | "xbrl_instance" | "companyfacts_fallback";
};
// One <Report> entry extracted from FilingSummary.xml by
// `parseFilingSummaryReports`; file names are null when the element is
// missing or empty.
type StatementReportDescriptor = {
shortName: string;
longName: string;
htmlFileName: string | null;
xmlFileName: string | null;
};
// One table row parsed out of a rendered statement report by
// `parseStatementRowsFromReport`. `order` is the 1-based position among the
// rows that were kept; `value` is the first numeric cell found in the row.
type StatementParseRow = {
key: string;
label: string;
concept: string | null;
order: number;
depth: number;
isSubtotal: boolean;
value: number | null;
};
// An XBRL context that carries at least one explicit dimension member
// (see `parseContextsWithDimensions`).
type DimensionContext = {
endDate: string | null;
dimensions: Array<{ axis: string; member: string }>;
};
// A canonical (standardized) statement row and the patterns used to recognise
// it; a parsed row matches when any concept OR any label pattern matches.
type CanonicalRowDefinition = {
key: string;
label: string;
category: string;
conceptPatterns: RegExp[];
labelPatterns: RegExp[];
};
// Statement kinds that are iterated when building per-kind records.
// "disclosure" is intentionally not listed here even though it is a key of the
// per-kind records below; `createStatementRecord` adds that key separately.
const FINANCIAL_STATEMENT_KINDS: FinancialStatementKind[] = [
"income",
"balance",
"cash_flow",
"equity",
"comprehensive_income",
];
/**
 * Title patterns used by `scoreReport` to recognise which filing-summary
 * report holds each statement kind. Every matching pattern adds to a
 * report's score, so a kind may list both a specific and a looser pattern.
 *
 * The equity pattern accepts a straight (') or typographic (\u2019)
 * apostrophe after "stockholders"/"shareholders", and the balance pattern
 * accepts both "statement" and "statements of financial position".
 * `disclosure` has no patterns, so no report is ever selected for it.
 */
const STATEMENT_REPORT_PATTERNS: Record<FinancialStatementKind, RegExp[]> = {
  income: [
    /\bstatements?\s+of\s+operations?\b/i,
    /\bstatements?\s+of\s+income\b/i,
    /\bincome\s+statement/i,
  ],
  balance: [
    /\bbalance\s+sheets?\b/i,
    /\bstatements?\s+of\s+financial\s+position\b/i,
  ],
  cash_flow: [/\bstatements?\s+of\s+cash\s+flows?\b/i, /\bcash\s+flows?\b/i],
  equity: [
    /\bstatements?\s+of\s+(stockholders|shareholders)['\u2019]?\s+equity\b/i,
    /\bchanges\s+in\s+equity\b/i,
  ],
  comprehensive_income: [
    /\bstatements?\s+of\s+comprehensive\s+income\b/i,
    /\bcomprehensive\s+income\b/i,
  ],
  disclosure: [],
};
// Canonical rows per statement kind, used by `toStandardizedRows`.
//
// Definitions are evaluated in array order, and each one claims the first
// not-yet-claimed parsed row whose concept matches any `conceptPatterns` entry
// or whose label matches any `labelPatterns` entry. Order therefore matters:
// an earlier definition can claim a row that a later, more specific definition
// would also match.
//
// NOTE(review): several patterns are unanchored substrings (for example
// /revenue/i, /liabilities/i, /equity/i, /debt/i, /\bcash\b/i), so they can
// match broader concepts or labels than the canonical row name suggests —
// confirm the matching is acceptable against real filings.
const STANDARDIZED_ROW_DEFINITIONS: Record<
FinancialStatementKind,
CanonicalRowDefinition[]
> = {
// Income statement rows.
income: [
{
key: "revenue",
label: "Revenue",
category: "core",
conceptPatterns: [/revenue/i, /salesrevenuenet/i],
labelPatterns: [/\brevenue\b/i, /\bsales\b/i],
},
{
key: "cost-of-revenue",
label: "Cost of Revenue",
category: "core",
conceptPatterns: [/costofrevenue/i, /costofgoods/i],
labelPatterns: [/\bcost of revenue\b/i, /\bcost of sales\b/i],
},
{
key: "gross-profit",
label: "Gross Profit",
category: "core",
conceptPatterns: [/grossprofit/i],
labelPatterns: [/\bgross profit\b/i],
},
{
key: "operating-income",
label: "Operating Income",
category: "core",
conceptPatterns: [/operatingincome/i, /incomefromoperations/i],
labelPatterns: [/\boperating income\b/i, /\bincome from operations\b/i],
},
{
key: "net-income",
label: "Net Income",
category: "core",
conceptPatterns: [/netincomeloss/i, /profitloss/i],
labelPatterns: [/\bnet income\b/i, /\bnet earnings\b/i],
},
],
// Balance sheet rows.
balance: [
{
key: "total-assets",
label: "Total Assets",
category: "core",
// Anchored: only a concept that is exactly "assets" (any case) matches.
conceptPatterns: [/^assets$/i],
labelPatterns: [/\btotal assets\b/i],
},
{
key: "total-liabilities",
label: "Total Liabilities",
category: "core",
conceptPatterns: [/liabilities/i],
labelPatterns: [/\btotal liabilities\b/i],
},
{
key: "stockholders-equity",
label: "Stockholders Equity",
category: "core",
conceptPatterns: [
/stockholdersequity/i,
/shareholdersequity/i,
/equity/i,
],
labelPatterns: [/\bequity\b/i],
},
{
key: "cash-and-equivalents",
label: "Cash and Equivalents",
category: "liquidity",
conceptPatterns: [/cashandcashequivalents/i, /cashandequivalents/i],
labelPatterns: [/\bcash\b/i, /\bcash equivalents\b/i],
},
{
key: "total-debt",
label: "Total Debt",
category: "leverage",
conceptPatterns: [/longtermdebt/i, /debt/i, /borrowings/i],
labelPatterns: [/\btotal debt\b/i, /\blong-term debt\b/i, /\bdebt\b/i],
},
],
// Cash flow statement rows.
cash_flow: [
{
key: "net-cash-operating",
label: "Net Cash from Operating Activities",
category: "core",
conceptPatterns: [
/netcashprovidedbyusedinoperatingactivities/i,
/netcashfromoperatingactivities/i,
],
labelPatterns: [/\boperating activities\b/i],
},
{
key: "net-cash-investing",
label: "Net Cash from Investing Activities",
category: "core",
conceptPatterns: [/netcashprovidedbyusedininvestingactivities/i],
labelPatterns: [/\binvesting activities\b/i],
},
{
key: "net-cash-financing",
label: "Net Cash from Financing Activities",
category: "core",
conceptPatterns: [/netcashprovidedbyusedinfinancingactivities/i],
labelPatterns: [/\bfinancing activities\b/i],
},
{
key: "net-change-cash",
label: "Net Change in Cash",
category: "core",
conceptPatterns: [
/cashandcashequivalentsperiodincrease/i,
/increase.*cash/i,
],
labelPatterns: [/\bnet change\b/i, /\bincrease.*cash\b/i],
},
],
// Statement of equity rows.
equity: [
{
key: "equity-balance",
label: "Total Equity",
category: "core",
conceptPatterns: [
/stockholdersequity/i,
/shareholdersequity/i,
/equity/i,
],
labelPatterns: [/\btotal equity\b/i, /\bequity\b/i],
},
{
key: "retained-earnings",
label: "Retained Earnings",
category: "core",
conceptPatterns: [/retainedearnings/i],
labelPatterns: [/\bretained earnings\b/i],
},
],
// Comprehensive income rows. The first definition's patterns also match
// "other comprehensive income" text, so it is evaluated before the more
// specific second definition.
comprehensive_income: [
{
key: "comprehensive-income",
label: "Comprehensive Income",
category: "core",
conceptPatterns: [/comprehensiveincome/i],
labelPatterns: [/\bcomprehensive income\b/i],
},
{
key: "other-comprehensive-income",
label: "Other Comprehensive Income",
category: "core",
conceptPatterns: [/othercomprehensiveincome/i],
labelPatterns: [/\bother comprehensive income\b/i],
},
],
// No canonical rows are defined for disclosures.
disclosure: [],
};
/**
 * Builds a record with one freshly created value per statement kind: every
 * entry of `FINANCIAL_STATEMENT_KINDS`, followed by "disclosure".
 *
 * @param factory Called once per key, so values are never shared between keys.
 */
function createStatementRecord<T>(
  factory: () => T,
): Record<FinancialStatementKind, T> {
  const record = {} as Record<FinancialStatementKind, T>;
  const allKinds: FinancialStatementKind[] = [
    ...FINANCIAL_STATEMENT_KINDS,
    "disclosure",
  ];
  allKinds.forEach((kind) => {
    record[kind] = factory();
  });
  return record;
}
/**
 * Maps a statement kind to its display label. An unrecognised kind is
 * returned unchanged.
 */
function statementKindLabel(kind: FinancialStatementKind) {
  if (kind === "income") {
    return "Income Statement";
  }
  if (kind === "balance") {
    return "Balance Sheet";
  }
  if (kind === "cash_flow") {
    return "Cash Flow Statement";
  }
  if (kind === "equity") {
    return "Statement of Equity";
  }
  if (kind === "comprehensive_income") {
    return "Comprehensive Income";
  }
  if (kind === "disclosure") {
    return "Disclosures";
  }
  return kind;
}
/**
 * Determines the archive directory URL (with trailing slash) that holds a
 * filing's documents.
 *
 * A non-blank `filingUrl` is cut after its last "/", provided that slash
 * lies beyond the scheme prefix; otherwise the directory is built from the
 * CIK and accession number.
 *
 * @returns The directory URL, or `null` when it cannot be determined.
 */
function resolveFilingDirectoryUrl(input: {
  filingUrl: string | null;
  cik: string;
  accessionNumber: string;
}) {
  const documentUrl = input.filingUrl?.trim();
  if (documentUrl) {
    const slashAt = documentUrl.lastIndexOf("/");
    if (slashAt > "https://".length) {
      return documentUrl.substring(0, slashAt + 1);
    }
  }

  const cikSegment = normalizeCikForPath(input.cik);
  const accessionSegment = compactAccessionNumber(input.accessionNumber);
  return cikSegment && accessionSegment
    ? `https://www.sec.gov/Archives/edgar/data/${cikSegment}/${accessionSegment}/`
    : null;
}
/**
 * Resolves a report file reference against a filing directory URL.
 *
 * @param baseUrl Directory URL; expected to end with "/".
 * @param relativePath File name or path; an http(s) URL is returned as-is.
 * @returns The absolute URL, or `null` when the path is missing or blank.
 */
function toAbsoluteArchiveUrl(baseUrl: string, relativePath: string | null) {
  const path = relativePath?.trim() ?? "";
  if (path === "") {
    return null;
  }
  const alreadyAbsolute = /^https?:\/\//i.test(path);
  if (alreadyAbsolute) {
    return path;
  }
  const withoutLeadingSlashes = path.replace(/^\/+/, "");
  return `${baseUrl}${withoutLeadingSlashes}`;
}
/**
 * GETs a text resource (XML/HTML/plain) with the configured User-Agent and
 * no HTTP caching, using the supplied fetch implementation.
 *
 * @throws Error when the HTTP response status is not OK.
 */
async function fetchText(url: string, fetchImpl: typeof fetch) {
  const headers = {
    "User-Agent": envUserAgent(),
    Accept: "text/xml, text/html, text/plain;q=0.9, */*;q=0.8",
  };
  const response = await fetchImpl(url, { headers, cache: "no-store" });
  if (response.ok) {
    return await response.text();
  }
  throw new Error(`SEC request failed (${response.status})`);
}
/**
 * Extracts the text of the first `<tagName>…</tagName>` element in an XML
 * fragment (case-insensitive, opening tag without attributes).
 *
 * @returns The entity-decoded, trimmed text, or "" when no such element exists.
 */
function xmlTextValue(block: string, tagName: string) {
  // Escape regex metacharacters so the tag name is matched literally.
  const escaped = tagName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const found = new RegExp(
    `<${escaped}>([\\s\\S]*?)<\\/${escaped}>`,
    "i",
  ).exec(block);
  return found ? decodeHtmlEntities(found[1] ?? "").trim() : "";
}
/**
 * Extracts the report descriptors listed in a FilingSummary.xml document.
 *
 * `<Report>` elements are matched with or without attributes, because
 * filing summaries commonly emit them as `<Report instance="…">`; matching
 * only the bare `<Report>` tag would silently yield no reports for those.
 *
 * @param xml Raw FilingSummary.xml content.
 * @returns One descriptor per `<Report>` element, in document order. Missing
 *   names become "", missing file names become `null`.
 */
function parseFilingSummaryReports(xml: string) {
  const reports: StatementReportDescriptor[] = [];
  // `\b` keeps container tags such as <Reports…> from matching.
  const reportPattern = /<Report\b[^>]*>([\s\S]*?)<\/Report>/gi;
  for (const match of xml.matchAll(reportPattern)) {
    const block = match[1] ?? "";
    reports.push({
      shortName: xmlTextValue(block, "ShortName"),
      longName: xmlTextValue(block, "LongName"),
      htmlFileName: xmlTextValue(block, "HtmlFileName") || null,
      xmlFileName: xmlTextValue(block, "XmlFileName") || null,
    });
  }
  return reports;
}
/**
 * Scores how well a filing-summary report matches a statement kind.
 *
 * Each matching title pattern for the kind adds 2 points. Reports whose name
 * marks them as a parenthetical or detail view lose 1 point, so the primary
 * statement wins over its companions. Both "Detail" and "Details" count,
 * since detail reports are typically suffixed "(Details)".
 *
 * @returns The score; 0 when the report has no name. May be negative.
 */
function scoreReport(
  kind: FinancialStatementKind,
  report: StatementReportDescriptor,
) {
  const haystack = `${report.shortName} ${report.longName}`.trim();
  if (!haystack) {
    return 0;
  }

  let score = 0;
  for (const pattern of STATEMENT_REPORT_PATTERNS[kind]) {
    if (pattern.test(haystack)) {
      score += 2;
    }
  }
  if (/\b(?:parenthetical|details?)\b/i.test(haystack)) {
    score -= 1;
  }
  return score;
}
/**
 * Picks the report with the highest positive score for a statement kind.
 * On ties the earliest report wins.
 *
 * @returns The winning report, or `null` when no report scores above zero.
 */
function chooseStatementReport(
  kind: FinancialStatementKind,
  reports: StatementReportDescriptor[],
) {
  const winner = reports.reduce<{
    report: StatementReportDescriptor | null;
    score: number;
  }>(
    (leader, report) => {
      const score = scoreReport(kind, report);
      return score > leader.score ? { report, score } : leader;
    },
    { report: null, score: 0 },
  );
  return winner.score > 0 ? winner.report : null;
}
/**
 * Reduces a table cell's markup to a single line of text: tags are removed,
 * entities decoded, and all whitespace runs (including line breaks from
 * `<br>`) collapsed to single spaces.
 */
function sanitizeCellText(raw: string) {
  const withLineBreaks = raw.replace(/<br\s*\/?>/gi, "\n");
  const withoutTags = withLineBreaks.replace(/<[^>]+>/g, " ");
  const decoded = decodeHtmlEntities(withoutTags);
  const singleSpaced = decoded.replace(/[ \t]+/g, " ");
  return singleSpaced.replace(/\n+/g, " ").trim();
}
/**
 * Finds the concept name referenced by a table row's markup.
 *
 * A `defref_…` reference is preferred, with its underscores converted to
 * colons; otherwise the value of a `name="…"` attribute is used as written.
 *
 * @returns The concept name, or `null` when neither form is present.
 */
function extractConceptFromMarkup(markup: string) {
  const fromDefref = /defref[_:-]([a-z0-9_:.:-]+)/i.exec(markup)?.[1];
  if (fromDefref) {
    return fromDefref.split("_").join(":");
  }
  const fromNameAttr = /\bname=[\"']([a-z0-9_:.:-]+)[\"']/i.exec(markup)?.[1];
  return fromNameAttr ? fromNameAttr : null;
}
/**
 * Derives a row's nesting depth from the inline style of its label cell.
 *
 * `padding-left` is consulted first, then `margin-left`; the first positive
 * pixel value found is divided by 12 and rounded to give the depth.
 *
 * @param attrs Attribute text of the cell's opening tag.
 * @returns The depth, or 0 when no positive pixel indent is present.
 */
function parseIndentDepth(attrs: string) {
  const style = attrs.match(/\bstyle=[\"']([^\"']+)[\"']/i)?.[1] ?? "";
  const indentPatterns = [
    /padding-left:\s*([0-9.]+)px/i,
    /margin-left:\s*([0-9.]+)px/i,
  ];
  for (const pattern of indentPatterns) {
    const rawPixels = style.match(pattern)?.[1];
    if (!rawPixels) {
      continue;
    }
    const pixels = Number.parseFloat(rawPixels);
    if (Number.isFinite(pixels) && pixels > 0) {
      return Math.max(0, Math.round(pixels / 12));
    }
  }
  return 0;
}
/**
 * Parses a number as displayed in a financial statement cell.
 *
 * Handles currency symbols, thousands separators, the Unicode minus sign and
 * accounting-style negatives in parentheses. The parentheses are detected
 * after currency symbols and whitespace are removed, so "$ (1,234)" and
 * "(1,234) [1]" are negative just like "(1,234)".
 *
 * @param raw Cell text.
 * @returns The numeric value, or `null` for blanks, "n/a", dash placeholders,
 *   percentages, and text that does not start with a number.
 */
function parseStatementNumber(raw: string): number | null {
  const trimmed = raw.trim();
  if (!trimmed || /^n\/a$/i.test(trimmed) || /^--+$/.test(trimmed)) {
    return null;
  }
  if (/%$/.test(trimmed)) {
    return null;
  }

  const compact = trimmed.replace(/[$,\s]/g, "").replace(/\u2212/g, "-");
  // A leading "(digits)" group marks a negative amount. The lookahead keeps
  // inputs like "(1) 234" from being read as a negative 1234.
  const negative = /^\([\d.]+\)(?!\d)/.test(compact);

  const value = Number.parseFloat(compact.replace(/[()]/g, ""));
  if (!Number.isFinite(value)) {
    return null;
  }
  return negative ? -Math.abs(value) : value;
}
/**
 * Converts text to a lower-case identifier in which runs of characters
 * other than a-z and 0-9 become single hyphens, with no leading or trailing
 * hyphen.
 */
function slug(value: string) {
  return value
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((piece) => piece !== "")
    .join("-");
}
/**
 * Extracts statement rows from a rendered report page.
 *
 * Every `<table>` is parsed independently and the table yielding the most
 * rows is returned (the earliest one on ties). A table row is kept when it
 * has at least two cells, a non-empty label that is not one of the skipped
 * section headings, and at least one numeric cell after the label; the first
 * such numeric cell becomes the row's value.
 *
 * @param content Report markup.
 * @returns Parsed rows with 1-based `order`, or an empty array when the
 *   markup has no tables or no usable rows.
 */
function parseStatementRowsFromReport(content: string): StatementParseRow[] {
  /** Returns the first parseable number among the cells after the label. */
  const firstNumericValue = (cells: RegExpMatchArray[]): number | null => {
    for (const cell of cells.slice(1)) {
      const parsed = parseStatementNumber(sanitizeCellText(cell[2] ?? ""));
      if (parsed !== null) {
        return parsed;
      }
    }
    return null;
  };

  /** Parses all usable rows of a single table. */
  const rowsOfTable = (tableMarkup: string): StatementParseRow[] => {
    const parsedRows: StatementParseRow[] = [];
    for (const rowMatch of tableMarkup.matchAll(/<tr[^>]*>([\s\S]*?)<\/tr>/gi)) {
      const rowMarkup = rowMatch[0] ?? "";
      const cells = Array.from(
        rowMarkup.matchAll(/<t[dh]([^>]*)>([\s\S]*?)<\/t[dh]>/gi),
      );
      if (cells.length < 2) {
        continue;
      }

      const labelCell = cells[0];
      const label = sanitizeCellText(labelCell?.[2] ?? "");
      const isHeading = /^(years ended|assets|liabilities|equity)$/i.test(label);
      if (!label || isHeading) {
        continue;
      }

      const value = firstNumericValue(cells);
      if (value === null) {
        continue;
      }

      // Position among the rows kept so far, starting at 1.
      const order = parsedRows.length + 1;
      const concept = extractConceptFromMarkup(rowMarkup);
      parsedRows.push({
        key: concept ? slug(concept) : `${slug(label)}-${order}`,
        label,
        concept,
        order,
        depth: parseIndentDepth(labelCell?.[1] ?? ""),
        isSubtotal: /^total\b/i.test(label) || /\bsubtotal\b/i.test(label),
        value,
      });
    }
    return parsedRows;
  };

  const tables = Array.from(
    content.matchAll(/<table[^>]*>([\s\S]*?)<\/table>/gi),
  );
  return tables
    .map((tableMatch) => rowsOfTable(tableMatch[0] ?? ""))
    .reduce<StatementParseRow[]>(
      (best, rows) => (rows.length > best.length ? rows : best),
      [],
    );
}
/**
 * Converts parsed rows into snapshot rows, storing each row's single value
 * under the given period id.
 */
function toSnapshotRows(
  periodId: string,
  rows: StatementParseRow[],
): FilingFaithfulStatementSnapshotRow[] {
  const snapshotRows: FilingFaithfulStatementSnapshotRow[] = [];
  for (const { key, label, concept, order, depth, isSubtotal, value } of rows) {
    snapshotRows.push({
      key,
      label,
      concept,
      order,
      depth,
      isSubtotal,
      values: { [periodId]: value },
    });
  }
  return snapshotRows;
}
/**
 * Tells whether a snapshot row satisfies a canonical row definition: any
 * concept pattern matching the row's concept (treated as "" when absent), or
 * any label pattern matching the row's label.
 */
function matchStandardizedDefinition(
  row: FilingFaithfulStatementSnapshotRow,
  definition: CanonicalRowDefinition,
) {
  const conceptText = row.concept ?? "";
  for (const pattern of definition.conceptPatterns) {
    if (pattern.test(conceptText)) {
      return true;
    }
  }
  for (const pattern of definition.labelPatterns) {
    if (pattern.test(row.label)) {
      return true;
    }
  }
  return false;
}
/**
 * Supplies a value from the filing's stored metrics for the canonical rows
 * that have a metric counterpart (revenue and net income on the income
 * statement; total assets, cash and debt on the balance sheet).
 *
 * @returns The metric value, or `null` when the filing has no metrics, the
 *   metric is unset, or the row has no metric counterpart.
 */
function fallbackMetricValue(
  kind: FinancialStatementKind,
  rowKey: string,
  metrics: Filing["metrics"],
) {
  if (!metrics) {
    return null;
  }

  if (kind === "income") {
    switch (rowKey) {
      case "revenue":
        return metrics.revenue ?? null;
      case "net-income":
        return metrics.netIncome ?? null;
      default:
        return null;
    }
  }

  if (kind === "balance") {
    switch (rowKey) {
      case "total-assets":
        return metrics.totalAssets ?? null;
      case "cash-and-equivalents":
        return metrics.cash ?? null;
      case "total-debt":
        return metrics.debt ?? null;
      default:
        return null;
    }
  }

  return null;
}
/**
 * Maps a statement's filing-faithful rows onto the canonical row set for its
 * kind.
 *
 * For each canonical definition, in order, the first unclaimed matching row
 * supplies the concept and value; when that yields no value, the filing's
 * stored metrics are consulted. Rows are claimed by `key`, so rows sharing a
 * claimed key are treated as claimed too. Every unclaimed row is appended
 * afterwards with category "other" and an `other-` key prefix.
 *
 * @returns Canonical rows (always one per definition) followed by the
 *   remaining rows, each holding a single value under `periodId`.
 */
function toStandardizedRows(
  kind: FinancialStatementKind,
  periodId: string,
  rows: FilingFaithfulStatementSnapshotRow[],
  metrics: Filing["metrics"],
): StandardizedStatementSnapshotRow[] {
  const claimedKeys = new Set<string>();

  const canonicalRows = STANDARDIZED_ROW_DEFINITIONS[kind].map(
    (definition): StandardizedStatementSnapshotRow => {
      const source = rows.find(
        (row) =>
          !claimedKeys.has(row.key) &&
          matchStandardizedDefinition(row, definition),
      );
      if (source) {
        claimedKeys.add(source.key);
      }

      const reported = source?.values[periodId] ?? null;
      return {
        key: definition.key,
        label: definition.label,
        concept: source?.concept ?? definition.key,
        category: definition.category,
        sourceConcepts: source?.concept ? [source.concept] : [],
        values: {
          // The metrics fallback is only evaluated when nothing was reported.
          [periodId]:
            reported ?? fallbackMetricValue(kind, definition.key, metrics),
        },
      };
    },
  );

  const remainingRows = rows
    .filter((row) => !claimedKeys.has(row.key))
    .map(
      (row): StandardizedStatementSnapshotRow => ({
        key: `other-${row.key}`,
        label: row.label,
        concept: row.concept ?? row.key,
        category: "other",
        sourceConcepts: row.concept ? [row.concept] : [],
        values: {
          [periodId]: row.values[periodId] ?? null,
        },
      }),
    );

  return [...canonicalRows, ...remainingRows];
}
/**
 * Collects the XBRL contexts that carry explicit dimension members.
 *
 * Element names are matched with or without a namespace prefix. Contexts
 * without an id or without any non-empty axis/member pair are omitted.
 *
 * @param raw Document markup containing `<context>` elements.
 * @returns Context id → end date (or `null` when absent) and dimension list.
 */
function parseContextsWithDimensions(raw: string) {
  const contextPattern =
    /<(?:[a-z0-9]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9]+:)?context>/gi;
  const endDatePattern =
    /<(?:[a-z0-9]+:)?endDate>([^<]+)<\/(?:[a-z0-9]+:)?endDate>/i;
  const memberPattern =
    /<(?:[a-z0-9]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>([^<]+)<\/(?:[a-z0-9]+:)?explicitMember>/gi;

  const contextsById = new Map<string, DimensionContext>();
  for (const contextMatch of raw.matchAll(contextPattern)) {
    const contextId = contextMatch[1] ?? "";
    const block = contextMatch[2] ?? "";
    if (!contextId) {
      continue;
    }

    const dimensions = Array.from(block.matchAll(memberPattern), (found) => ({
      axis: (found[1] ?? "").trim(),
      member: (found[2] ?? "").trim(),
    })).filter((pair) => pair.axis !== "" && pair.member !== "");
    if (dimensions.length === 0) {
      continue;
    }

    const endDate = endDatePattern.exec(block)?.[1]?.trim() ?? null;
    contextsById.set(contextId, { endDate, dimensions });
  }
  return contextsById;
}
/**
 * Guesses which statement a concept belongs to from keywords in its name.
 *
 * Rules are checked in a fixed order and the first hit wins, so a concept
 * containing "cash" is classified as cash flow even if it also contains a
 * keyword of a later rule.
 *
 * @returns The statement kind, or `null` when no keyword matches.
 */
function statementKindFromConcept(
  concept: string,
): FinancialStatementKind | null {
  const normalized = concept.toLowerCase();
  const rules: Array<[RegExp, FinancialStatementKind]> = [
    [
      /cash|operatingactivities|investingactivities|financingactivities/,
      "cash_flow",
    ],
    [/equity|retainedearnings|additionalpaidincapital/, "equity"],
    [/comprehensiveincome/, "comprehensive_income"],
    [/asset|liabilit|debt/, "balance"],
    [/revenue|income|profit|expense|costof/, "income"],
  ];
  for (const [keywords, kind] of rules) {
    if (keywords.test(normalized)) {
      return kind;
    }
  }
  return null;
}
/**
 * Extracts dimensional facts from inline XBRL markup, grouped by statement
 * kind.
 *
 * Only `ix:nonFraction` facts whose context has explicit dimension members
 * are considered, and one row is emitted per (fact, dimension) pair. The
 * fact's `sign="-"` attribute is honoured: inline XBRL carries the magnitude
 * in the element content and the negation in that attribute, so ignoring it
 * would record negative facts as positive.
 *
 * NOTE(review): the `scale` attribute is not applied, so values stay in the
 * units shown in the document — confirm this matches how the rows parsed from
 * the rendered reports are expressed.
 *
 * @param raw Inline XBRL document markup.
 * @param fallbackPeriodId Period id used when a context has no end date.
 * @returns Rows per statement kind; every kind maps to an empty array when
 *   the document has no dimensional contexts. At most 8,000 facts are
 *   examined.
 */
function parseDimensionFacts(raw: string, fallbackPeriodId: string) {
  const contexts = parseContextsWithDimensions(raw);
  if (contexts.size === 0) {
    return createStatementRecord<DimensionStatementSnapshotRow[]>(() => []);
  }

  const rows = createStatementRecord<DimensionStatementSnapshotRow[]>(() => []);
  const ixPattern = /<ix:nonfraction\b([^>]*)>([\s\S]*?)<\/ix:nonfraction>/gi;
  let guard = 0;
  for (const match of raw.matchAll(ixPattern)) {
    // Hard cap on the number of facts examined per document.
    guard += 1;
    if (guard > 8_000) {
      break;
    }

    const attrs = match[1] ?? "";
    const body = sanitizeCellText(match[2] ?? "");
    const contextRef = attrs.match(/\bcontextref=["']([^"']+)["']/i)?.[1] ?? "";
    const concept = attrs.match(/\bname=["']([^"']+)["']/i)?.[1] ?? "";
    const unit = attrs.match(/\bunitref=["']([^"']+)["']/i)?.[1] ?? null;
    const sign = attrs.match(/\bsign=["']([^"']*)["']/i)?.[1]?.trim() ?? "";
    if (!contextRef || !concept) {
      continue;
    }

    const context = contexts.get(contextRef);
    if (!context || context.dimensions.length === 0) {
      continue;
    }

    const kind = statementKindFromConcept(concept);
    if (!kind) {
      continue;
    }

    const magnitude = parseStatementNumber(body);
    if (magnitude === null) {
      continue;
    }
    // Apply the inline XBRL sign; zero stays +0 rather than becoming -0.
    const value =
      sign === "-" && magnitude !== 0 ? -Math.abs(magnitude) : magnitude;

    const periodId = context.endDate ?? fallbackPeriodId;
    const rowKey = slug(concept);
    for (const dimension of context.dimensions) {
      rows[kind].push({
        rowKey,
        concept,
        periodId,
        axis: dimension.axis,
        member: dimension.member,
        value,
        unit,
      });
    }
  }
  return rows;
}
/**
 * Returns copies of the rows with `hasDimensions` set to whether any
 * dimensional fact refers to the row, either by row key or by concept
 * (compared case-insensitively).
 */
function markHasDimensions<
  T extends { key: string; concept: string | null; hasDimensions?: boolean },
>(rows: T[], dimensions: DimensionStatementSnapshotRow[]) {
  const keysWithDimensions = new Set<string>();
  const conceptsWithDimensions = new Set<string>();
  for (const fact of dimensions) {
    keysWithDimensions.add(fact.rowKey);
    const loweredConcept = fact.concept?.toLowerCase() ?? "";
    if (loweredConcept) {
      conceptsWithDimensions.add(loweredConcept);
    }
  }

  return rows.map((row) => {
    const loweredConcept = row.concept?.toLowerCase() ?? "";
    const matchedByConcept =
      loweredConcept !== "" && conceptsWithDimensions.has(loweredConcept);
    return {
      ...row,
      hasDimensions: keysWithDimensions.has(row.key) || matchedByConcept,
    };
  });
}
/**
 * Creates a filing-faithful statement bundle for a single period.
 *
 * @param period - The only period listed in the bundle.
 * @returns A bundle whose `statements` come from `createStatementRecord`
 *   called with an empty-array factory.
 */
function emptyStatementBundle(
  period: FilingStatementSnapshotPeriod,
): FilingStatementBundle {
  const bundle: FilingStatementBundle = {
    periods: [period],
    statements: createStatementRecord(() => []),
  };
  return bundle;
}
/**
 * Creates a standardized statement bundle for a single period.
 *
 * @param period - The only period listed in the bundle.
 * @returns A bundle whose `statements` come from `createStatementRecord`
 *   called with an empty-array factory.
 */
function emptyStandardizedBundle(
  period: FilingStatementSnapshotPeriod,
): StandardizedStatementBundle {
  const bundle: StandardizedStatementBundle = {
    periods: [period],
    statements: createStatementRecord(() => []),
  };
  return bundle;
}
/**
 * Creates a dimension bundle whose `statements` come from
 * `createStatementRecord` called with an empty-array factory.
 */
function emptyDimensionBundle(): DimensionStatementBundle {
  const bundle: DimensionStatementBundle = {
    statements: createStatementRecord(() => []),
  };
  return bundle;
}
/**
 * Builds the statement snapshot for a single filing.
 *
 * The work happens in three steps:
 * 1. Fetch `FilingSummary.xml` from the filing directory and, for each
 *    statement kind, fetch and parse the chosen report into filing-faithful
 *    rows. A failure on one report does not stop the remaining reports.
 * 2. Fetch the primary document and extract dimensional facts from it.
 * 3. Derive standardized rows from the faithful rows plus `input.metrics`,
 *    then flag rows in both bundles that have dimensional breakdowns.
 *
 * Errors thrown in steps 1 and 2 are caught and reported through
 * `parse_error`; the first recorded message is kept unless a later failure
 * of the filing-summary step itself replaces it.
 *
 * `parse_status` is `"ready"` when every statement kind has faithful rows,
 * `"partial"` when at least one kind has faithful or standardized rows, and
 * `"failed"` otherwise.
 *
 * `source` starts as `"companyfacts_fallback"`, becomes
 * `"sec_filing_summary"` once any report yields rows, and becomes
 * `"xbrl_instance"` when only dimensional facts were found.
 *
 * @param input - Filing identity, document location and metrics to hydrate from.
 * @param options - Optional overrides; `fetchImpl` replaces the global `fetch`.
 * @returns The hydrated bundles together with parse status, error and source.
 */
export async function hydrateFilingStatementSnapshot(
  input: FilingStatementHydrationInput,
  options?: {
    fetchImpl?: typeof fetch;
  },
): Promise<FilingStatementHydrationResult> {
  const periodId = `${input.filingDate}-${compactAccessionNumber(input.accessionNumber)}`;
  const period: FilingStatementSnapshotPeriod = {
    id: periodId,
    filingId: input.filingId,
    accessionNumber: input.accessionNumber,
    filingDate: input.filingDate,
    periodStart: null,
    periodEnd: input.filingDate,
    filingType: input.filingType,
    periodLabel:
      input.filingType === "10-Q" ? "Quarter End" : "Fiscal Year End",
  };
  const fetchImpl = options?.fetchImpl ?? fetch;
  const statementBundle = emptyStatementBundle(period);
  const standardizedBundle = emptyStandardizedBundle(period);
  const dimensionBundle = emptyDimensionBundle();
  let source: FilingStatementHydrationResult["source"] =
    "companyfacts_fallback";
  let parseError: string | null = null;

  // Step 1: filing-faithful rows from the FilingSummary reports.
  try {
    const filingDirectory = resolveFilingDirectoryUrl({
      filingUrl: input.filingUrl,
      cik: input.cik,
      accessionNumber: input.accessionNumber,
    });
    if (filingDirectory) {
      const summaryXml = await fetchText(
        `${filingDirectory}FilingSummary.xml`,
        fetchImpl,
      );
      const reports = parseFilingSummaryReports(summaryXml);
      for (const kind of FINANCIAL_STATEMENT_KINDS) {
        const report = chooseStatementReport(kind, reports);
        if (!report) {
          continue;
        }
        const reportUrl = toAbsoluteArchiveUrl(
          filingDirectory,
          report.htmlFileName ?? report.xmlFileName,
        );
        if (!reportUrl) {
          continue;
        }
        try {
          const reportText = await fetchText(reportUrl, fetchImpl);
          const parsedRows = parseStatementRowsFromReport(reportText);
          if (parsedRows.length === 0) {
            continue;
          }
          source = "sec_filing_summary";
          statementBundle.statements[kind] = toSnapshotRows(
            periodId,
            parsedRows,
          );
        } catch (error) {
          // Continue to other statements when one report fails, but keep the
          // first failure reason so it is not lost from parse_error.
          if (!parseError) {
            parseError =
              error instanceof Error
                ? error.message
                : "Failed to parse statement report";
          }
        }
      }
    }
  } catch (error) {
    parseError =
      error instanceof Error ? error.message : "Failed to parse filing summary";
  }

  // Step 2: dimensional facts from the primary document.
  try {
    const primaryUrl = resolvePrimaryFilingUrl({
      filingUrl: input.filingUrl,
      cik: input.cik,
      accessionNumber: input.accessionNumber,
      primaryDocument: input.primaryDocument,
    });
    if (primaryUrl) {
      const rawDocument = await fetchText(primaryUrl, fetchImpl);
      const dimensions = parseDimensionFacts(rawDocument, periodId);
      for (const kind of FINANCIAL_STATEMENT_KINDS) {
        dimensionBundle.statements[kind] = dimensions[kind];
      }
      const hasAnyDimensions = FINANCIAL_STATEMENT_KINDS.some(
        (kind) => dimensionBundle.statements[kind].length > 0,
      );
      // Only claim the XBRL instance as the source when step 1 found nothing.
      if (hasAnyDimensions && source === "companyfacts_fallback") {
        source = "xbrl_instance";
      }
    }
  } catch (error) {
    // An error recorded earlier takes precedence over this one.
    if (!parseError) {
      parseError =
        error instanceof Error
          ? error.message
          : "Failed to parse inline XBRL dimensions";
    }
  }

  // Step 3: standardized rows and dimension flags for both bundles.
  for (const kind of FINANCIAL_STATEMENT_KINDS) {
    const faithfulRows = statementBundle.statements[kind];
    standardizedBundle.statements[kind] = toStandardizedRows(
      kind,
      periodId,
      faithfulRows,
      input.metrics,
    );
    statementBundle.statements[kind] = markHasDimensions(
      faithfulRows,
      dimensionBundle.statements[kind],
    );
    standardizedBundle.statements[kind] = markHasDimensions(
      standardizedBundle.statements[kind],
      dimensionBundle.statements[kind],
    );
  }

  const statementCount = FINANCIAL_STATEMENT_KINDS.filter(
    (kind) => statementBundle.statements[kind].length > 0,
  ).length;
  const standardizedCount = FINANCIAL_STATEMENT_KINDS.filter(
    (kind) => standardizedBundle.statements[kind].length > 0,
  ).length;
  const parseStatus: FilingStatementHydrationResult["parse_status"] =
    statementCount === FINANCIAL_STATEMENT_KINDS.length
      ? "ready"
      : statementCount > 0 || standardizedCount > 0
        ? "partial"
        : "failed";

  return {
    filing_id: input.filingId,
    ticker: input.ticker.trim().toUpperCase(),
    filing_date: input.filingDate,
    filing_type: input.filingType,
    period_end: input.filingDate,
    statement_bundle: statementBundle,
    standardized_bundle: standardizedBundle,
    dimension_bundle: dimensionBundle,
    parse_status: parseStatus,
    // A failed parse always carries a message, even if nothing threw.
    parse_error:
      parseStatus === "failed"
        ? (parseError ?? "No financial statement tables found")
        : parseError,
    source,
  };
}
/**
 * Groups module-private statement helpers under a single export.
 * NOTE(review): the double-underscore name suggests this is meant for tests
 * only — confirm against callers.
 */
export const __statementInternals = {
  parseFilingSummaryReports: parseFilingSummaryReports,
  parseStatementRowsFromReport: parseStatementRowsFromReport,
  parseDimensionFacts: parseDimensionFacts,
  statementKindLabel: statementKindLabel,
};