Files
Neon-Desk/lib/server/taxonomy/asset-discovery.ts
2026-03-06 14:40:43 -05:00

284 lines
6.4 KiB
TypeScript

import type { TaxonomyAsset } from '@/lib/server/taxonomy/types';
/**
 * Arguments accepted by `discoverFilingAssets`.
 */
type FilingAssetDiscoveryInput = {
/** Filer CIK; only its digits are used, and only when the directory URL cannot be derived from `filingUrl`. */
cik: string;
/** Accession number; dashes are removed before it is used as a path segment. */
accessionNumber: string;
/** URL of a document inside the filing directory; its parent directory is preferred as the directory URL, and it doubles as the fallback asset URL. */
filingUrl: string | null;
/** Primary document file name; boosts instance files whose name contains its stem and names the fallback asset. */
primaryDocument: string | null;
/** Optional `fetch` replacement; the global `fetch` is used when omitted. */
fetchImpl?: typeof fetch;
};
/**
 * Expected shape of the `index.json` document fetched from a filing directory.
 * Every level is optional because the payload is cast from `response.json()`
 * without runtime validation.
 */
type FilingDirectoryJson = {
directory?: {
/** One entry per file listed in the directory. */
item?: Array<{
/** File name; appended to the directory URL to build the asset URL. */
name?: string;
/** Declared for completeness; not read by the code in this file. */
type?: string;
/** File size as a number or a numeric string; normalized by `parseSize`. */
size?: string | number;
}>;
};
};
/**
 * Returns the User-Agent header value for SEC requests.
 *
 * Uses the `SEC_USER_AGENT` environment variable when it is set to a
 * non-empty string; otherwise falls back to a built-in default.
 */
function envUserAgent() {
  const configured = process.env.SEC_USER_AGENT;
  if (configured) {
    return configured;
  }
  return 'Fiscal Clone <support@fiscal.local>';
}
/**
 * Removes every dash from an accession number
 * (e.g. `0000320193-23-000106` becomes `000032019323000106`).
 * All other characters are left untouched.
 */
function compactAccessionNumber(value: string) {
  const segments = value.split('-');
  return segments.join('');
}
/**
 * Normalizes a CIK for use as a path segment in an EDGAR archive URL.
 *
 * All non-digit characters are discarded and leading zeros are removed, so
 * `"0000320193"` and `"CIK-0000320193"` both become `"320193"`. A value made
 * only of zeros becomes `"0"`.
 *
 * @param value Raw CIK text, possibly zero-padded or decorated.
 * @returns The digits without leading zeros, or `null` when `value` contains
 *   no digits at all.
 */
function normalizeCikForPath(value: string) {
  const digits = value.replace(/\D/g, '');
  if (!digits) {
    return null;
  }
  // Strip leading zeros textually rather than round-tripping through Number:
  // the numeric conversion silently loses precision above 2^53 and renders
  // values from 1e21 upwards in exponent notation (e.g. "1e+21"), either of
  // which would produce a wrong path segment.
  return digits.replace(/^0+(?=\d)/, '');
}
/**
 * Works out the URL (with trailing slash) of the directory holding a filing.
 *
 * Preference order:
 * 1. The parent directory of `filingUrl`, when the URL has a slash beyond the
 *    `https://` scheme prefix.
 * 2. The canonical EDGAR archive path built from the normalized CIK and the
 *    dash-free accession number.
 *
 * @returns The directory URL, or `null` when neither source is usable.
 */
function resolveFilingDirectoryUrl(input: {
  filingUrl: string | null;
  cik: string;
  accessionNumber: string;
}) {
  const trimmedUrl = input.filingUrl?.trim();
  if (trimmedUrl) {
    const cutAt = trimmedUrl.lastIndexOf('/');
    // A slash at or before the end of "https://" belongs to the scheme, not to a path.
    const hasPathSlash = cutAt > 'https://'.length;
    if (hasPathSlash) {
      return trimmedUrl.slice(0, cutAt + 1);
    }
  }

  const cikSegment = normalizeCikForPath(input.cik);
  const accessionSegment = compactAccessionNumber(input.accessionNumber);
  return cikSegment && accessionSegment
    ? `https://www.sec.gov/Archives/edgar/data/${cikSegment}/${accessionSegment}/`
    : null;
}
/**
 * Classifies a filing asset purely from its (case-insensitive) file name.
 *
 * - `.pdf` is `pdf`, `.xsd` is `schema`.
 * - `.xml` files are checked against the linkbase rules in order
 *   (presentation, label, calculation, definition); a rule matches on either
 *   its `_xxx.xml` / `-xxx.xml` suffix or its keyword anywhere in the name.
 *   An `.xml` file matching no rule is treated as `instance`.
 * - Everything else is `other`.
 */
function classifyAssetType(name: string): TaxonomyAsset['asset_type'] {
  const fileName = name.toLowerCase();

  if (fileName.endsWith('.pdf')) {
    return 'pdf';
  }
  if (fileName.endsWith('.xsd')) {
    return 'schema';
  }
  if (!fileName.endsWith('.xml')) {
    return 'other';
  }

  // Order matters: the first matching rule wins.
  const linkbaseRules = [
    { kind: 'presentation', suffix: /(_|-)pre\.xml$/, keyword: /presentation/ },
    { kind: 'label', suffix: /(_|-)lab\.xml$/, keyword: /label/ },
    { kind: 'calculation', suffix: /(_|-)cal\.xml$/, keyword: /calculation/ },
    { kind: 'definition', suffix: /(_|-)def\.xml$/, keyword: /definition/ }
  ] as const;

  for (const rule of linkbaseRules) {
    if (rule.suffix.test(fileName) || rule.keyword.test(fileName)) {
      return rule.kind;
    }
  }
  return 'instance';
}
/**
 * Heuristic relevance score for a PDF asset; higher means more likely to be
 * the main financial document.
 *
 * - +8 when the name mentions financial/statement/annual/quarter/10k/10q.
 * - -2 when the name looks like an exhibit (`exhibit` or `ex-<digits>`).
 * - +1 when the size is known and larger than 100,000 bytes.
 */
function scorePdf(name: string, sizeBytes: number | null) {
  const fileName = name.toLowerCase();

  const topicBonus = /financial|statement|annual|quarter|10k|10q/.test(fileName) ? 8 : 0;
  const exhibitPenalty = /exhibit|ex-\d+/.test(fileName) ? 2 : 0;
  const sizeBonus = sizeBytes !== null && sizeBytes > 100_000 ? 1 : 0;

  return topicBonus - exhibitPenalty + sizeBonus;
}
/**
 * Heuristic relevance score for an XML file that was classified as an
 * instance document; higher means more likely to be the real XBRL instance.
 *
 * Starting from a base of 1:
 * - +4 for an `_htm.xml` suffix, +4 for an `_ins.xml` suffix.
 * - +5 when the name contains the primary document's name with its extension
 *   removed (skipped when there is no primary document).
 * - -3 when the name contains `cal`, `def`, `lab` or `pre`. This is a plain
 *   substring test, so it also fires on names that merely contain those
 *   letters.
 */
function scoreInstance(name: string, primaryDocument: string | null) {
  const fileName = name.toLowerCase();
  const primaryStem = (primaryDocument ?? '').replace(/\.[a-z0-9]+$/i, '').toLowerCase();

  const adjustments = [
    /_htm\.xml$/.test(fileName) ? 4 : 0,
    /_ins\.xml$/.test(fileName) ? 4 : 0,
    primaryStem !== '' && fileName.includes(primaryStem) ? 5 : 0,
    /cal|def|lab|pre/.test(fileName) ? -3 : 0
  ];

  return adjustments.reduce((total, delta) => total + delta, 1);
}
/**
 * Normalizes a directory entry's `size` field to a byte count.
 *
 * @param raw The untyped `size` value taken from the directory JSON.
 * @returns The finite numeric value, or `null` when the value is missing,
 *   blank, non-numeric, or not finite.
 */
function parseSize(raw: unknown) {
  if (typeof raw === 'number') {
    return Number.isFinite(raw) ? raw : null;
  }
  if (typeof raw === 'string') {
    const text = raw.trim();
    // A blank string carries no size information. Number('') is 0, so without
    // this guard an unreported size would be recorded as a zero-byte file.
    if (text === '') {
      return null;
    }
    const parsed = Number(text);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}
/**
 * GETs `url` with the SEC User-Agent and a JSON `Accept` header, bypassing
 * the fetch cache, and returns the parsed body.
 *
 * The body is cast to `T` without runtime validation, so callers must treat
 * the result defensively.
 *
 * @throws Error with the HTTP status when the response is not OK; network and
 *   JSON-parse failures from `fetchImpl` / `response.json()` propagate as-is.
 */
async function fetchJson<T>(url: string, fetchImpl: typeof fetch): Promise<T> {
  const headers = {
    'User-Agent': envUserAgent(),
    Accept: 'application/json'
  };
  const response = await fetchImpl(url, { headers, cache: 'no-store' });

  if (response.ok) {
    const body = await response.json();
    return body as T;
  }
  throw new Error(`SEC request failed (${response.status})`);
}
/**
 * Converts a filing directory `index.json` payload into unscored, unselected
 * assets.
 *
 * The payload is an unchecked cast of `response.json()`, so every level is
 * validated here: a payload whose `directory.item` is not an array yields no
 * assets, and entries that are not objects or that lack a non-blank string
 * `name` are skipped instead of throwing.
 */
function readDirectoryAssets(payload: FilingDirectoryJson | null, directoryUrl: string): TaxonomyAsset[] {
  const rawItems: unknown = payload?.directory?.item;
  if (!Array.isArray(rawItems)) {
    return [];
  }

  const assets: TaxonomyAsset[] = [];
  for (const rawItem of rawItems as unknown[]) {
    if (typeof rawItem !== 'object' || rawItem === null) {
      continue;
    }
    const entry = rawItem as { name?: unknown; size?: unknown };
    const name = typeof entry.name === 'string' ? entry.name.trim() : '';
    if (!name) {
      continue;
    }
    assets.push({
      asset_type: classifyAssetType(name),
      name,
      // Leading slashes are dropped so the name is always joined relative to the directory URL.
      url: `${directoryUrl}${name.replace(/^\/+/, '')}`,
      size_bytes: parseSize(entry.size),
      score: null,
      is_selected: false
    });
  }
  return assets;
}

/**
 * Scores an asset with the heuristic for its type: instances via
 * `scoreInstance`, PDFs via `scorePdf`. Other asset types are not scored.
 */
function scoreDiscoveredAsset(asset: TaxonomyAsset, primaryDocument: string | null): number | null {
  if (asset.asset_type === 'instance') {
    return scoreInstance(asset.name, primaryDocument);
  }
  if (asset.asset_type === 'pdf') {
    return scorePdf(asset.name, asset.size_bytes);
  }
  return null;
}

/**
 * Lists the assets of a filing directory and marks which ones to use.
 *
 * Steps:
 * 1. Resolve the directory URL from `filingUrl`, or from the CIK and accession
 *    number; with no usable URL the result is `{ directoryUrl: null, assets: [] }`.
 * 2. Fetch `<directoryUrl>index.json`. Any failure is tolerated and treated as
 *    an empty listing.
 * 3. If nothing was listed and `filingUrl` is set, fall back to a single asset
 *    pointing at `filingUrl`.
 * 4. Select the single highest-scoring instance, the three highest-scoring
 *    PDFs, and every schema/linkbase asset. Ties keep directory order because
 *    `Array.prototype.sort` is stable.
 *
 * @param input Filing identifiers plus an optional `fetch` override.
 * @returns The resolved directory URL and the scored, selection-flagged assets.
 */
export async function discoverFilingAssets(input: FilingAssetDiscoveryInput): Promise<{
  directoryUrl: string | null;
  assets: TaxonomyAsset[];
}> {
  const fetchImpl = input.fetchImpl ?? fetch;
  const directoryUrl = resolveFilingDirectoryUrl({
    filingUrl: input.filingUrl,
    cik: input.cik,
    accessionNumber: input.accessionNumber
  });

  if (!directoryUrl) {
    return {
      directoryUrl: null,
      assets: []
    };
  }

  let payload: FilingDirectoryJson | null = null;
  try {
    payload = await fetchJson<FilingDirectoryJson>(`${directoryUrl}index.json`, fetchImpl);
  } catch {
    // Deliberately best-effort: a missing or unreadable listing is not fatal
    // because the primary-document fallback below can still produce an asset.
    payload = null;
  }

  const discovered = readDirectoryAssets(payload, directoryUrl);

  if (discovered.length === 0 && input.filingUrl) {
    // `||` rather than `??`: an empty primaryDocument or a filingUrl ending in
    // "/" yields '' (never null/undefined), which must still fall through to
    // the next candidate instead of producing an asset with an empty name.
    const fallbackName = input.primaryDocument?.trim()
      || input.filingUrl.split('/').pop()
      || 'primary_document';
    discovered.push({
      asset_type: fallbackName.toLowerCase().endsWith('.xml') ? 'instance' : 'other',
      name: fallbackName,
      url: input.filingUrl,
      size_bytes: null,
      score: null,
      // NOTE(review): this flag is recomputed by the selection pass below. An
      // 'instance' fallback ends up selected, but an 'other' fallback ends up
      // unselected. Confirm whether non-XML fallbacks are meant to stay selected.
      is_selected: true
    });
  }

  // Score every asset exactly once; the same value drives ranking and output.
  const scored = discovered.map((asset) => ({
    asset,
    score: scoreDiscoveredAsset(asset, input.primaryDocument)
  }));

  const topUrls = (assetType: TaxonomyAsset['asset_type'], limit: number) => scored
    .filter((entry) => entry.asset.asset_type === assetType)
    .sort((a, b) => (b.score ?? 0) - (a.score ?? 0))
    .slice(0, limit)
    .map((entry) => entry.asset.url);

  const selectedInstanceUrl = topUrls('instance', 1)[0] ?? null;
  const selectedPdfUrls = new Set(topUrls('pdf', 3));

  const assets = scored.map(({ asset, score }) => {
    if (asset.asset_type === 'instance') {
      return {
        ...asset,
        score,
        is_selected: asset.url === selectedInstanceUrl
      };
    }
    if (asset.asset_type === 'pdf') {
      return {
        ...asset,
        score,
        is_selected: selectedPdfUrls.has(asset.url)
      };
    }
    return {
      ...asset,
      score: null,
      // Schema and linkbase files are always selected; anything else is not.
      is_selected: asset.asset_type === 'presentation'
        || asset.asset_type === 'label'
        || asset.asset_type === 'calculation'
        || asset.asset_type === 'definition'
        || asset.asset_type === 'schema'
    };
  });

  return {
    directoryUrl,
    assets
  };
}