284 lines
6.4 KiB
TypeScript
284 lines
6.4 KiB
TypeScript
import type { TaxonomyAsset } from '@/lib/server/taxonomy/types';
|
|
|
|
/**
 * Arguments accepted by `discoverFilingAssets`.
 */
type FilingAssetDiscoveryInput = {
  /** Filer CIK; only its digits are used when a directory URL has to be built. */
  cik: string;
  /** Accession number; dashes are removed when a directory URL has to be built. */
  accessionNumber: string;
  /** URL of a document in the filing, or `null`; its parent directory is preferred when usable. */
  filingUrl: string | null;
  /** File name of the primary document, or `null`; used for instance scoring and as the fallback asset name. */
  primaryDocument: string | null;
  /** Optional `fetch` replacement; the global `fetch` is used when omitted. */
  fetchImpl?: typeof fetch;
};
|
|
|
|
/**
 * One entry of the directory listing. Every field is optional because the
 * payload is external JSON that is not validated before use.
 */
type FilingDirectoryItem = {
  /** File name of the entry, relative to the filing directory. */
  name?: string;
  /** Entry type as reported by the listing (not read by the code in this file). */
  type?: string;
  /** Reported size; may arrive as a number or as a string. */
  size?: string | number;
};

/**
 * The subset of the `index.json` document (fetched from the filing directory)
 * that this module reads.
 */
type FilingDirectoryJson = {
  directory?: {
    item?: Array<FilingDirectoryItem>;
  };
};
|
|
|
|
/**
 * Resolves the `User-Agent` header value sent with SEC requests.
 *
 * @returns The `SEC_USER_AGENT` environment variable when it holds a
 * non-empty value, otherwise the built-in default identifier.
 */
function envUserAgent(): string {
  const configured = process.env.SEC_USER_AGENT;
  if (configured) {
    return configured;
  }

  // Unset and empty are treated alike: both fall back to the default.
  return 'Fiscal Clone <support@fiscal.local>';
}
|
|
|
|
/**
 * Removes every dash from an accession number so it can be used as a
 * directory name in a URL path.
 *
 * @param value Accession number, with or without dashes.
 * @returns The same text with all `-` characters removed.
 */
function compactAccessionNumber(value: string) {
  return value.split('-').join('');
}
|
|
|
|
/**
 * Reduces a CIK to the digits-only, no-leading-zeros form used as a URL
 * path segment.
 *
 * Leading zeros are stripped textually rather than by converting through
 * `Number`, so long digit strings are never rounded or rendered in exponent
 * notation (e.g. `1.2e+22`), which would produce an invalid path.
 *
 * @param value Raw CIK; any non-digit characters are ignored.
 * @returns The normalized digits, `'0'` when the input is all zeros, or
 * `null` when the input contains no digits at all.
 */
function normalizeCikForPath(value: string): string | null {
  const digits = value.replace(/\D/g, '');
  if (!digits) {
    return null;
  }

  // The lookahead keeps the final digit so an all-zero input becomes '0'.
  return digits.replace(/^0+(?=\d)/, '');
}
|
|
|
|
/**
 * Works out the URL of the directory that holds a filing's files.
 *
 * The parent directory of `filingUrl` is used when that URL has a slash
 * beyond the length of an `https://` prefix; otherwise the directory is
 * assembled under `https://www.sec.gov/Archives/edgar/data/` from the
 * normalized CIK and the dash-free accession number.
 *
 * @returns A directory URL ending in `/`, or `null` when neither source
 * yields one.
 */
function resolveFilingDirectoryUrl(input: {
  filingUrl: string | null;
  cik: string;
  accessionNumber: string;
}) {
  const trimmedUrl = (input.filingUrl ?? '').trim();
  const cut = trimmedUrl.lastIndexOf('/');

  // A slash inside the scheme prefix means there is no path to truncate.
  if (trimmedUrl !== '' && cut > 'https://'.length) {
    return trimmedUrl.substring(0, cut + 1);
  }

  const cikSegment = normalizeCikForPath(input.cik);
  const accessionSegment = compactAccessionNumber(input.accessionNumber);

  return cikSegment && accessionSegment
    ? `https://www.sec.gov/Archives/edgar/data/${cikSegment}/${accessionSegment}/`
    : null;
}
|
|
|
|
/**
 * Classifies a filing file by its name alone.
 *
 * `.pdf` and `.xsd` map to `'pdf'` and `'schema'`. An `.xml` file is matched
 * against the linkbase rules in order (presentation, label, calculation,
 * definition) and is an `'instance'` when none applies. Anything else is
 * `'other'`. Matching is case-insensitive.
 *
 * @param name File name to classify.
 * @returns The asset type inferred from the name.
 */
function classifyAssetType(name: string): TaxonomyAsset['asset_type'] {
  const fileName = name.toLowerCase();

  if (!fileName.endsWith('.xml')) {
    if (fileName.endsWith('.pdf')) {
      return 'pdf';
    }
    return fileName.endsWith('.xsd') ? 'schema' : 'other';
  }

  // Order matters: the first rule whose suffix or keyword matches wins.
  const linkbaseRules: Array<[RegExp, RegExp, TaxonomyAsset['asset_type']]> = [
    [/(_|-)pre\.xml$/, /presentation/, 'presentation'],
    [/(_|-)lab\.xml$/, /label/, 'label'],
    [/(_|-)cal\.xml$/, /calculation/, 'calculation'],
    [/(_|-)def\.xml$/, /definition/, 'definition']
  ];

  for (const [suffixPattern, keywordPattern, kind] of linkbaseRules) {
    if (suffixPattern.test(fileName) || keywordPattern.test(fileName)) {
      return kind;
    }
  }

  return 'instance';
}
|
|
|
|
/**
 * Heuristic ranking score for a PDF attachment; higher means more likely to
 * be worth selecting.
 *
 * @param name File name of the PDF (matched case-insensitively).
 * @param sizeBytes Known size in bytes, or `null` when unknown.
 * @returns +8 for a financial-report keyword, -2 for an exhibit marker and
 * +1 for a size above 100,000 bytes, summed.
 */
function scorePdf(name: string, sizeBytes: number | null) {
  const fileName = name.toLowerCase();

  const keywordBonus = /financial|statement|annual|quarter|10k|10q/.test(fileName) ? 8 : 0;
  const exhibitPenalty = /exhibit|ex-\d+/.test(fileName) ? 2 : 0;
  const sizeBonus = sizeBytes !== null && sizeBytes > 100_000 ? 1 : 0;

  return keywordBonus - exhibitPenalty + sizeBonus;
}
|
|
|
|
/**
 * Heuristic ranking score for an XML file that may be the filing's XBRL
 * instance document; higher means more likely.
 *
 * Starts at 1, adds 4 for an `_htm.xml` or `_ins.xml` suffix, adds 5 when the
 * name contains the primary document's base name, and subtracts 3 when the
 * name carries a linkbase marker (`cal`, `def`, `lab`, `pre`).
 *
 * The linkbase marker must be a whole token delimited by non-alphanumeric
 * characters (or the ends of the name). A plain substring test would also
 * penalise unrelated names that merely contain those letters, such as
 * `calm-20230603_htm.xml` or `prefix.xml`.
 *
 * @param name File name of the candidate (matched case-insensitively).
 * @param primaryDocument File name of the primary document, or `null`.
 * @returns The candidate's score.
 */
function scoreInstance(name: string, primaryDocument: string | null): number {
  const lower = name.toLowerCase();
  let score = 1;

  if (/_htm\.xml$/.test(lower)) {
    score += 4;
  }

  if (/_ins\.xml$/.test(lower)) {
    score += 4;
  }

  // Compare against the primary document without its file extension.
  const basePrimary = (primaryDocument ?? '').replace(/\.[a-z0-9]+$/i, '').toLowerCase();
  if (basePrimary && lower.includes(basePrimary)) {
    score += 5;
  }

  // `\b` is not usable here because it treats `_` as a word character.
  if (/(^|[^a-z0-9])(cal|def|lab|pre)([^a-z0-9]|$)/.test(lower)) {
    score -= 3;
  }

  return score;
}
|
|
|
|
/**
 * Normalizes a size value taken from untyped JSON.
 *
 * @param raw Value as found in the payload; numbers and numeric strings are
 * accepted.
 * @returns The finite numeric value, or `null` when `raw` is not a number,
 * is non-finite, or is a blank or non-numeric string.
 */
function parseSize(raw: unknown): number | null {
  if (typeof raw === 'number') {
    return Number.isFinite(raw) ? raw : null;
  }

  if (typeof raw === 'string') {
    const text = raw.trim();

    // Number('') is 0, which would report a missing size as zero bytes.
    if (text === '') {
      return null;
    }

    const parsed = Number(text);
    return Number.isFinite(parsed) ? parsed : null;
  }

  return null;
}
|
|
|
|
/**
 * Performs a GET for a JSON document with the module's User-Agent, asking for
 * JSON and bypassing the HTTP cache.
 *
 * The parsed body is returned as `T` without any runtime validation.
 *
 * @param url Absolute URL to request.
 * @param fetchImpl `fetch` implementation used for the request.
 * @returns The parsed response body.
 * @throws Error with the HTTP status when the response is not OK; errors
 * raised by `fetchImpl` or by JSON parsing propagate unchanged.
 */
async function fetchJson<T>(url: string, fetchImpl: typeof fetch): Promise<T> {
  const headers = {
    'User-Agent': envUserAgent(),
    Accept: 'application/json'
  };

  const response = await fetchImpl(url, { headers, cache: 'no-store' });

  if (response.ok) {
    return (await response.json()) as T;
  }

  throw new Error(`SEC request failed (${response.status})`);
}
|
|
|
|
/**
 * Lists the files of a filing and marks which of them should be used.
 *
 * Steps:
 * 1. Resolve the filing directory URL; when that fails, return no assets.
 * 2. Fetch `index.json` from that directory. This is best effort: any
 *    failure is treated as an empty listing rather than an error.
 * 3. Turn every named entry into an asset, classified by file name.
 * 4. When nothing was listed but `filingUrl` is known, fall back to a single
 *    asset for that URL, marked as selected.
 * 5. Score instance and PDF assets, then select the best-scoring instance,
 *    the three best-scoring PDFs, and every schema/linkbase asset.
 *
 * @param input Filing identifiers and an optional `fetch` replacement.
 * @returns The resolved directory URL (or `null`) and the discovered assets.
 */
export async function discoverFilingAssets(input: FilingAssetDiscoveryInput): Promise<{
  directoryUrl: string | null;
  assets: TaxonomyAsset[];
}> {
  const fetchImpl = input.fetchImpl ?? fetch;
  const directoryUrl = resolveFilingDirectoryUrl({
    filingUrl: input.filingUrl,
    cik: input.cik,
    accessionNumber: input.accessionNumber
  });

  if (!directoryUrl) {
    return {
      directoryUrl: null,
      assets: []
    };
  }

  let payload: FilingDirectoryJson | null = null;
  try {
    payload = await fetchJson<FilingDirectoryJson>(`${directoryUrl}index.json`, fetchImpl);
  } catch {
    // Deliberate best effort: a missing or unreadable listing falls through
    // to the primary-document fallback below instead of failing the call.
    payload = null;
  }

  // The payload is unvalidated JSON, so do not trust its declared shape.
  const listedItems = payload?.directory?.item;
  const items = Array.isArray(listedItems) ? listedItems : [];

  const discovered: TaxonomyAsset[] = [];
  for (const item of items) {
    const name = typeof item?.name === 'string' ? item.name.trim() : '';
    if (!name) {
      continue;
    }

    discovered.push({
      asset_type: classifyAssetType(name),
      name,
      url: `${directoryUrl}${name.replace(/^\/+/, '')}`,
      size_bytes: parseSize(item.size),
      score: null,
      is_selected: false
    });
  }

  if (discovered.length === 0 && input.filingUrl) {
    // `||` rather than `??`: an empty primary document name or a URL ending
    // in `/` must not produce an asset with an empty name.
    const fallbackName = input.primaryDocument || input.filingUrl.split('/').pop() || 'primary_document';
    discovered.push({
      asset_type: fallbackName.toLowerCase().endsWith('.xml') ? 'instance' : 'other',
      name: fallbackName,
      url: input.filingUrl,
      size_bytes: null,
      score: null,
      is_selected: true
    });
  }

  // Score each asset once; only instance and PDF assets are ranked.
  const scored = discovered.map((asset) => {
    let score: number | null = null;
    if (asset.asset_type === 'instance') {
      score = scoreInstance(asset.name, input.primaryDocument);
    } else if (asset.asset_type === 'pdf') {
      score = scorePdf(asset.name, asset.size_bytes);
    }
    return { asset, score };
  });

  // URLs of one asset type, best score first; ties keep listing order.
  const rankedUrls = (type: TaxonomyAsset['asset_type']) =>
    scored
      .filter((entry) => entry.asset.asset_type === type)
      .sort((a, b) => (b.score ?? 0) - (a.score ?? 0))
      .map((entry) => entry.asset.url);

  const selectedInstanceUrl = rankedUrls('instance')[0] ?? null;
  const selectedPdfUrls = new Set(rankedUrls('pdf').slice(0, 3));

  const alwaysSelectedTypes = new Set<TaxonomyAsset['asset_type']>([
    'presentation',
    'label',
    'calculation',
    'definition',
    'schema'
  ]);

  const assets = scored.map(({ asset, score }) => {
    let is_selected: boolean;
    if (asset.asset_type === 'instance') {
      is_selected = asset.url === selectedInstanceUrl;
    } else if (asset.asset_type === 'pdf') {
      is_selected = selectedPdfUrls.has(asset.url);
    } else {
      // Keep an existing selection so the fallback primary document stays
      // selected even though its type is 'other'.
      is_selected = asset.is_selected || alwaysSelectedTypes.has(asset.asset_type);
    }

    return {
      ...asset,
      score,
      is_selected
    };
  });

  return {
    directoryUrl,
    assets
  };
}
|