import type { TaxonomyAsset } from '@/lib/server/taxonomy/types';

/** Everything needed to locate a filing's directory listing and rank its files. */
type FilingAssetDiscoveryInput = {
  cik: string;
  accessionNumber: string;
  filingUrl: string | null;
  primaryDocument: string | null;
  /** Optional replacement for the global `fetch` (e.g. a stub in tests). */
  fetchImpl?: typeof fetch;
};

/** The subset of the directory `index.json` payload that this module reads. */
type FilingDirectoryJson = {
  directory?: {
    item?: Array<{
      name?: string;
      type?: string;
      size?: string | number;
    }>;
  };
};

/** Asset types that are always marked as selected in the final result. */
const ALWAYS_SELECTED_TYPES: ReadonlySet<string> = new Set([
  'presentation',
  'label',
  'calculation',
  'definition',
  'schema'
]);

/** Maximum number of PDF assets marked as selected. */
const MAX_SELECTED_PDFS = 3;

/**
 * Returns the User-Agent header value: `SEC_USER_AGENT` from the environment
 * when it is set and non-empty, otherwise a built-in default.
 */
function envUserAgent() {
  // `||` (not `??`) so an empty env var also falls back to the default.
  // NOTE(review): the default ends with a trailing space, which looks like a
  // contact address was lost at some point — confirm the intended value.
  return process.env.SEC_USER_AGENT || 'Fiscal Clone ';
}

/** Removes every dash from an accession number. */
function compactAccessionNumber(value: string) {
  return value.replace(/-/g, '');
}

/**
 * Reduces a CIK to its digits and drops leading zeros.
 *
 * @returns the normalized digits, or `null` when the input has no digits or
 * the digits do not convert to a finite number.
 */
function normalizeCikForPath(value: string) {
  const digits = value.replace(/\D/g, '');
  if (!digits) {
    return null;
  }

  const numeric = Number(digits);
  if (!Number.isFinite(numeric)) {
    return null;
  }

  return String(numeric);
}

/**
 * Determines the directory URL (with trailing slash) that holds a filing.
 *
 * A non-blank `filingUrl` is preferred: everything after its last `/` is cut
 * off. Otherwise the URL is built from the CIK and accession number.
 *
 * @returns the directory URL, or `null` when neither source yields one.
 */
function resolveFilingDirectoryUrl(input: {
  filingUrl: string | null;
  cik: string;
  accessionNumber: string;
}) {
  const direct = input.filingUrl?.trim();
  if (direct) {
    const lastSlash = direct.lastIndexOf('/');
    // The slash must sit beyond the length of an "https://" prefix, so the
    // slashes of the scheme separator are never taken as the directory end.
    if (lastSlash > 'https://'.length) {
      return direct.slice(0, lastSlash + 1);
    }
  }

  const cikPath = normalizeCikForPath(input.cik);
  const accessionPath = compactAccessionNumber(input.accessionNumber);
  if (!cikPath || !accessionPath) {
    return null;
  }

  return `https://www.sec.gov/Archives/edgar/data/${cikPath}/${accessionPath}/`;
}

/**
 * Classifies a file by name (case-insensitive).
 *
 * `.pdf` and `.xsd` map to `pdf` and `schema`. An `.xml` file is a linkbase
 * (`presentation`, `label`, `calculation`, `definition`) when its name ends in
 * the matching `_xxx.xml` / `-xxx.xml` suffix or contains the full word;
 * any other `.xml` is an `instance`. Everything else is `other`.
 */
function classifyAssetType(name: string): TaxonomyAsset['asset_type'] {
  const lower = name.toLowerCase();

  if (lower.endsWith('.pdf')) {
    return 'pdf';
  }

  if (lower.endsWith('.xsd')) {
    return 'schema';
  }

  if (lower.endsWith('.xml')) {
    // Order matters: the first matching linkbase pattern wins.
    if (/(_|-)pre\.xml$/.test(lower) || /presentation/.test(lower)) {
      return 'presentation';
    }

    if (/(_|-)lab\.xml$/.test(lower) || /label/.test(lower)) {
      return 'label';
    }

    if (/(_|-)cal\.xml$/.test(lower) || /calculation/.test(lower)) {
      return 'calculation';
    }

    if (/(_|-)def\.xml$/.test(lower) || /definition/.test(lower)) {
      return 'definition';
    }

    return 'instance';
  }

  return 'other';
}

/**
 * Heuristic relevance score for a PDF: +8 for financial-report keywords in the
 * name, -2 for exhibit-style names, +1 when the size exceeds 100,000 bytes.
 */
function scorePdf(name: string, sizeBytes: number | null) {
  const lower = name.toLowerCase();
  let score = 0;

  if (/financial|statement|annual|quarter|10k|10q/.test(lower)) {
    score += 8;
  }

  if (/exhibit|ex-\d+/.test(lower)) {
    score -= 2;
  }

  if (sizeBytes && sizeBytes > 100_000) {
    score += 1;
  }

  return score;
}

/**
 * Heuristic relevance score for an instance-document candidate.
 *
 * Starts at 1; +4 for a `_htm.xml` suffix, +4 for a `_ins.xml` suffix, +5 when
 * the name contains the primary document's base name (extension removed), and
 * -3 when the name contains `cal`, `def`, `lab` or `pre` anywhere.
 */
function scoreInstance(name: string, primaryDocument: string | null) {
  const lower = name.toLowerCase();
  let score = 1;

  if (/_htm\.xml$/.test(lower)) {
    score += 4;
  }

  if (/_ins\.xml$/.test(lower)) {
    score += 4;
  }

  const basePrimary = (primaryDocument ?? '').replace(/\.[a-z0-9]+$/i, '').toLowerCase();
  if (basePrimary && lower.includes(basePrimary)) {
    score += 5;
  }

  // Substring match, so it also hits unrelated names containing these letters.
  if (/cal|def|lab|pre/.test(lower)) {
    score -= 3;
  }

  return score;
}

/**
 * Converts a raw directory-listing size into a byte count.
 *
 * @returns a finite number, or `null` when the value is missing, blank,
 * non-numeric, or not a string/number at all.
 */
function parseSize(raw: unknown): number | null {
  if (typeof raw === 'number') {
    return Number.isFinite(raw) ? raw : null;
  }

  if (typeof raw === 'string') {
    const trimmed = raw.trim();
    // Number('') is 0, which would report an unknown size as zero bytes.
    if (!trimmed) {
      return null;
    }

    const parsed = Number(trimmed);
    return Number.isFinite(parsed) ? parsed : null;
  }

  return null;
}

/**
 * Fetches `url` as JSON, sending the configured User-Agent and bypassing
 * the HTTP cache.
 *
 * The payload is cast to `T` without runtime validation, so callers must
 * treat every field as possibly missing.
 *
 * @throws Error when the response status is not OK; rejections from
 * `fetchImpl` or from JSON parsing propagate unchanged.
 */
async function fetchJson<T>(url: string, fetchImpl: typeof fetch): Promise<T> {
  const response = await fetchImpl(url, {
    headers: {
      'User-Agent': envUserAgent(),
      Accept: 'application/json'
    },
    cache: 'no-store'
  });

  if (!response.ok) {
    throw new Error(`SEC request failed (${response.status})`);
  }

  return (await response.json()) as T;
}

/** Scores an asset by its type; only instance and PDF assets carry a score. */
function scoreAsset(asset: TaxonomyAsset, primaryDocument: string | null): number | null {
  if (asset.asset_type === 'instance') {
    return scoreInstance(asset.name, primaryDocument);
  }

  if (asset.asset_type === 'pdf') {
    return scorePdf(asset.name, asset.size_bytes);
  }

  return null;
}

/**
 * Lists the files of a filing directory, classifies and scores them, and marks
 * which ones are selected.
 *
 * Selection rules:
 * - the highest-scoring instance document (first one wins on ties),
 * - the three highest-scoring PDFs,
 * - every schema and linkbase (presentation/label/calculation/definition).
 *
 * Fetching the listing is best-effort: any failure is treated as an empty
 * listing. When nothing was discovered and a `filingUrl` is known, the filing
 * URL itself is returned as the single asset.
 *
 * @returns the resolved directory URL (or `null`, with no assets, when it
 * cannot be determined) and the annotated assets in listing order.
 */
export async function discoverFilingAssets(input: FilingAssetDiscoveryInput): Promise<{
  directoryUrl: string | null;
  assets: TaxonomyAsset[];
}> {
  const fetchImpl = input.fetchImpl ?? fetch;
  const directoryUrl = resolveFilingDirectoryUrl({
    filingUrl: input.filingUrl,
    cik: input.cik,
    accessionNumber: input.accessionNumber
  });

  if (!directoryUrl) {
    return { directoryUrl: null, assets: [] };
  }

  let payload: FilingDirectoryJson | null = null;
  try {
    payload = await fetchJson<FilingDirectoryJson>(`${directoryUrl}index.json`, fetchImpl);
  } catch {
    // Deliberately swallowed: a failed listing falls through to the fallback below.
    payload = null;
  }

  const discovered: TaxonomyAsset[] = [];
  for (const item of payload?.directory?.item ?? []) {
    const name = (item.name ?? '').trim();
    if (!name) {
      continue;
    }

    discovered.push({
      asset_type: classifyAssetType(name),
      name,
      // Strip leading slashes so the name stays relative to the directory.
      url: `${directoryUrl}${name.replace(/^\/+/, '')}`,
      size_bytes: parseSize(item.size),
      score: null,
      is_selected: false
    });
  }

  if (discovered.length === 0 && input.filingUrl) {
    const fallbackName = input.primaryDocument
      ?? input.filingUrl.split('/').pop()
      ?? 'primary_document';

    // NOTE(review): `is_selected` is recomputed for every asset below, so a
    // fallback classified as 'other' ends up unselected despite `true` here —
    // confirm whether that is intended.
    discovered.push({
      asset_type: fallbackName.toLowerCase().endsWith('.xml') ? 'instance' : 'other',
      name: fallbackName,
      url: input.filingUrl,
      size_bytes: null,
      score: null,
      is_selected: true
    });
  }

  // Score each asset exactly once; ranking and the final result reuse it.
  const scored = discovered.map((asset) => ({
    ...asset,
    score: scoreAsset(asset, input.primaryDocument)
  }));

  // `filter` returns a fresh array, so sorting never reorders `scored`.
  const byScoreDescending = (a: { score: number | null }, b: { score: number | null }) =>
    (b.score ?? 0) - (a.score ?? 0);

  const selectedInstanceUrl = scored
    .filter((asset) => asset.asset_type === 'instance')
    .sort(byScoreDescending)[0]?.url ?? null;

  const selectedPdfUrls = new Set(
    scored
      .filter((asset) => asset.asset_type === 'pdf')
      .sort(byScoreDescending)
      .slice(0, MAX_SELECTED_PDFS)
      .map((asset) => asset.url)
  );

  const assets: TaxonomyAsset[] = scored.map((asset) => {
    let isSelected: boolean;
    if (asset.asset_type === 'instance') {
      isSelected = asset.url === selectedInstanceUrl;
    } else if (asset.asset_type === 'pdf') {
      isSelected = selectedPdfUrls.has(asset.url);
    } else {
      isSelected = ALWAYS_SELECTED_TYPES.has(asset.asset_type);
    }

    return { ...asset, is_selected: isSelected };
  });

  return { directoryUrl, assets };
}