Run playwright UI tests
This commit is contained in:
283
lib/server/taxonomy/asset-discovery.ts
Normal file
283
lib/server/taxonomy/asset-discovery.ts
Normal file
@@ -0,0 +1,283 @@
|
||||
import type { TaxonomyAsset } from '@/lib/server/taxonomy/types';
|
||||
|
||||
/**
 * Arguments accepted by `discoverFilingAssets`.
 */
type FilingAssetDiscoveryInput = {
  /** Filer CIK; only its digits are used when a directory URL has to be built. */
  cik: string;

  /** Accession number; dashes are removed before it is placed in a URL path. */
  accessionNumber: string;

  /**
   * URL of the filing document, if known. When present, the listing directory
   * is derived from it (everything up to and including the last `/`), and it
   * is also used as the URL of the fallback asset when no listing is available.
   */
  filingUrl: string | null;

  /**
   * File name of the filing's primary document, if known. Used to boost
   * instance candidates whose name contains it and to name the fallback asset.
   */
  primaryDocument: string | null;

  /** Optional `fetch` replacement; the global `fetch` is used when omitted. */
  fetchImpl?: typeof fetch;
};
|
||||
|
||||
/**
 * The parts of a filing directory's `index.json` that this module reads.
 * Every level is optional because the payload is consumed without validation.
 */
type FilingDirectoryJson = {
  directory?: {
    /** Entries of the directory listing. */
    item?: {
      /** File name of the entry. */
      name?: string;
      /** Entry type as reported by the listing (not read by this module). */
      type?: string;
      /** Reported size; may arrive as a number or as a string. */
      size?: string | number;
    }[];
  };
};
|
||||
|
||||
/**
 * Returns the User-Agent value to send with SEC requests: the
 * `SEC_USER_AGENT` environment variable when it is set to a non-empty
 * string, otherwise a built-in default.
 */
function envUserAgent() {
  const configured = process.env.SEC_USER_AGENT;
  if (configured) {
    return configured;
  }

  return 'Fiscal Clone <support@fiscal.local>';
}
|
||||
|
||||
/**
 * Removes every `-` from an accession number
 * (e.g. `0000320193-23-000106` becomes `000032019323000106`).
 */
function compactAccessionNumber(value: string) {
  const segments = value.split('-');
  return segments.join('');
}
|
||||
|
||||
/**
 * Converts a CIK into the form used as a URL path segment: digits only, with
 * leading zeros removed.
 *
 * Leading zeros are stripped textually rather than by round-tripping through
 * `Number`, so long digit strings are never rounded or rendered in exponent
 * notation (e.g. `1.2345678901234568e+21`).
 *
 * @param value Raw CIK; any non-digit characters are ignored.
 * @returns The digit string without leading zeros (`"0"` when every digit is
 *   zero), or `null` when `value` contains no digits.
 */
function normalizeCikForPath(value: string) {
  const digits = value.replace(/\D/g, '');
  if (!digits) {
    return null;
  }

  // The lookahead keeps the final digit so an all-zero input yields "0".
  return digits.replace(/^0+(?=\d)/, '');
}
|
||||
|
||||
/**
 * Works out the URL of the directory that holds a filing's files.
 *
 * The trimmed `filingUrl` wins when it has a `/` beyond the length of the
 * `https://` prefix: the result is that URL cut just after its last `/`.
 * Otherwise the directory is built under
 * `https://www.sec.gov/Archives/edgar/data/` from the normalized CIK and the
 * dash-less accession number.
 *
 * @returns The directory URL (always ending in `/`), or `null` when neither
 *   source yields one.
 */
function resolveFilingDirectoryUrl(input: {
  filingUrl: string | null;
  cik: string;
  accessionNumber: string;
}) {
  const explicitUrl = input.filingUrl?.trim();
  if (explicitUrl) {
    const cut = explicitUrl.lastIndexOf('/');
    // A slash at or before this index belongs to the scheme, not to a path.
    const schemeLength = 'https://'.length;
    if (cut > schemeLength) {
      return explicitUrl.slice(0, cut + 1);
    }
  }

  const cikSegment = normalizeCikForPath(input.cik);
  const accessionSegment = compactAccessionNumber(input.accessionNumber);
  if (cikSegment && accessionSegment) {
    return `https://www.sec.gov/Archives/edgar/data/${cikSegment}/${accessionSegment}/`;
  }

  return null;
}
|
||||
|
||||
/**
 * Maps a file name to an asset type, case-insensitively.
 *
 * - `.pdf` gives `pdf`, `.xsd` gives `schema`.
 * - `.xml` is checked against the linkbase rules in order (presentation,
 *   label, calculation, definition); each rule matches either a
 *   `_xxx.xml` / `-xxx.xml` suffix or the full keyword anywhere in the name.
 *   An `.xml` file matching no rule is an `instance`.
 * - Everything else is `other`.
 */
function classifyAssetType(name: string): TaxonomyAsset['asset_type'] {
  const fileName = name.toLowerCase();

  if (fileName.endsWith('.pdf')) {
    return 'pdf';
  }
  if (fileName.endsWith('.xsd')) {
    return 'schema';
  }
  if (!fileName.endsWith('.xml')) {
    return 'other';
  }

  // Order matters: the first matching rule wins.
  const linkbaseRules: Array<{
    kind: TaxonomyAsset['asset_type'];
    suffix: RegExp;
    keyword: RegExp;
  }> = [
    { kind: 'presentation', suffix: /(_|-)pre\.xml$/, keyword: /presentation/ },
    { kind: 'label', suffix: /(_|-)lab\.xml$/, keyword: /label/ },
    { kind: 'calculation', suffix: /(_|-)cal\.xml$/, keyword: /calculation/ },
    { kind: 'definition', suffix: /(_|-)def\.xml$/, keyword: /definition/ }
  ];

  for (const rule of linkbaseRules) {
    if (rule.suffix.test(fileName) || rule.keyword.test(fileName)) {
      return rule.kind;
    }
  }

  return 'instance';
}
|
||||
|
||||
/**
 * Heuristic relevance score for a PDF, based on its lower-cased file name and
 * size: +8 when the name mentions a financial-report keyword, -2 when it looks
 * like an exhibit, +1 when the size is known and above 100,000 bytes.
 *
 * @param name File name of the PDF.
 * @param sizeBytes Size in bytes, or `null` when unknown.
 * @returns The summed score (may be negative).
 */
function scorePdf(name: string, sizeBytes: number | null) {
  const fileName = name.toLowerCase();

  const keywordBonus = /financial|statement|annual|quarter|10k|10q/.test(fileName) ? 8 : 0;
  const exhibitPenalty = /exhibit|ex-\d+/.test(fileName) ? 2 : 0;
  const sizeBonus = sizeBytes && sizeBytes > 100_000 ? 1 : 0;

  return keywordBonus - exhibitPenalty + sizeBonus;
}
|
||||
|
||||
/**
 * Heuristic score for how likely an XML file is the filing's main XBRL
 * instance document. Higher is better.
 *
 * Starting from 1:
 * - +4 when the name ends in `_htm.xml`;
 * - +4 when the name ends in `_ins.xml`;
 * - +5 when the name contains the primary document's name without its
 *   extension;
 * - -3 when the name looks like a linkbase file.
 *
 * The linkbase penalty uses the same patterns as `classifyAssetType`
 * (`_cal`/`_def`/`_lab`/`_pre` suffix, or the full keyword). It is
 * deliberately not a bare substring test for `cal|def|lab|pre`, which would
 * also penalize genuine instances whose name merely contains those letters
 * (for example `calm-20230603_htm.xml`).
 *
 * @param name File name of the candidate.
 * @param primaryDocument File name of the filing's primary document, or `null`.
 * @returns The summed score (may be negative).
 */
function scoreInstance(name: string, primaryDocument: string | null) {
  const lower = name.toLowerCase();
  let score = 1;

  if (/_htm\.xml$/.test(lower)) {
    score += 4;
  }

  if (/_ins\.xml$/.test(lower)) {
    score += 4;
  }

  // Compare against the primary document's name with its extension removed.
  const basePrimary = (primaryDocument ?? '').replace(/\.[a-z0-9]+$/i, '').toLowerCase();
  if (basePrimary && lower.includes(basePrimary)) {
    score += 5;
  }

  const looksLikeLinkbase =
    /(_|-)(cal|def|lab|pre)\.xml$/.test(lower)
    || /calculation|definition|label|presentation/.test(lower);
  if (looksLikeLinkbase) {
    score -= 3;
  }

  return score;
}
|
||||
|
||||
/**
 * Converts a size value of unknown shape into a finite number.
 *
 * @param raw A number, a numeric string, or anything else.
 * @returns The finite numeric value, or `null` when `raw` is not a number or
 *   string, is not finite, cannot be parsed, or is an empty or whitespace-only
 *   string.
 */
function parseSize(raw: unknown) {
  if (typeof raw === 'number') {
    return Number.isFinite(raw) ? raw : null;
  }

  if (typeof raw === 'string') {
    const text = raw.trim();
    // Number('') is 0, which would report a missing size as zero bytes.
    if (!text) {
      return null;
    }

    const parsed = Number(text);
    return Number.isFinite(parsed) ? parsed : null;
  }

  return null;
}
|
||||
|
||||
/**
 * GETs `url` with the configured User-Agent and an `application/json` Accept
 * header, bypassing the fetch cache, and returns the parsed JSON body.
 *
 * The body is cast to `T` without validation.
 *
 * @param url Address to request.
 * @param fetchImpl `fetch` implementation used to perform the request.
 * @throws Error `SEC request failed (<status>)` when the response is not OK.
 */
async function fetchJson<T>(url: string, fetchImpl: typeof fetch): Promise<T> {
  const headers = {
    'User-Agent': envUserAgent(),
    Accept: 'application/json'
  };

  const response = await fetchImpl(url, { headers, cache: 'no-store' });
  if (response.ok) {
    return (await response.json()) as T;
  }

  throw new Error(`SEC request failed (${response.status})`);
}
|
||||
|
||||
/**
 * Lists the files of a filing and marks which ones should be used.
 *
 * Steps:
 * 1. Resolve the filing's directory URL; when that fails, return
 *    `{ directoryUrl: null, assets: [] }`.
 * 2. Fetch `<directoryUrl>index.json`. This is best effort: any failure is
 *    treated as an empty listing rather than an error.
 * 3. Turn every named listing entry into an asset (type from the file name,
 *    URL relative to the directory, parsed size).
 * 4. When nothing was listed but `filingUrl` is known, fall back to a single
 *    pre-selected asset pointing at `filingUrl`.
 * 5. Score and select: the highest-scoring instance, the three
 *    highest-scoring PDFs, and every presentation / label / calculation /
 *    definition / schema file are selected. Any other asset keeps the
 *    selection it already had, so the fallback asset stays selected even when
 *    it is not an XML instance.
 *
 * @param input Filing identifiers plus an optional `fetch` override.
 * @returns The resolved directory URL and the discovered assets in listing
 *   order. Only instance and PDF assets carry a numeric `score`.
 */
export async function discoverFilingAssets(input: FilingAssetDiscoveryInput): Promise<{
  directoryUrl: string | null;
  assets: TaxonomyAsset[];
}> {
  const fetchImpl = input.fetchImpl ?? fetch;
  const directoryUrl = resolveFilingDirectoryUrl({
    filingUrl: input.filingUrl,
    cik: input.cik,
    accessionNumber: input.accessionNumber
  });

  if (!directoryUrl) {
    return {
      directoryUrl: null,
      assets: []
    };
  }

  let payload: FilingDirectoryJson | null = null;
  try {
    payload = await fetchJson<FilingDirectoryJson>(`${directoryUrl}index.json`, fetchImpl);
  } catch {
    // Deliberate best effort: a missing or unreadable listing falls through
    // to the filingUrl fallback below.
    payload = null;
  }

  // The payload is unvalidated JSON, so do not trust its shape.
  const listedItems = payload?.directory?.item;
  const items = Array.isArray(listedItems) ? listedItems : [];

  const discovered: TaxonomyAsset[] = [];
  for (const item of items) {
    const name = typeof item?.name === 'string' ? item.name.trim() : '';
    if (!name) {
      continue;
    }

    discovered.push({
      asset_type: classifyAssetType(name),
      name,
      url: `${directoryUrl}${name.replace(/^\/+/, '')}`,
      size_bytes: parseSize(item.size),
      score: null,
      is_selected: false
    });
  }

  if (discovered.length === 0 && input.filingUrl) {
    // `||` rather than `??` so an empty string never becomes the asset name.
    const fallbackName = input.primaryDocument
      || input.filingUrl.split('/').pop()
      || 'primary_document';

    discovered.push({
      asset_type: fallbackName.toLowerCase().endsWith('.xml') ? 'instance' : 'other',
      name: fallbackName,
      url: input.filingUrl,
      size_bytes: null,
      score: null,
      is_selected: true
    });
  }

  // Score each asset once; only instances and PDFs are scored.
  const scored = discovered.map((asset) => {
    let score: number | null = null;
    if (asset.asset_type === 'instance') {
      score = scoreInstance(asset.name, input.primaryDocument);
    } else if (asset.asset_type === 'pdf') {
      score = scorePdf(asset.name, asset.size_bytes);
    }

    return { asset, score };
  });

  // Candidates of one type, best score first (ties keep listing order).
  const rankedUrls = (assetType: TaxonomyAsset['asset_type']) => scored
    .filter((entry) => entry.asset.asset_type === assetType)
    .sort((a, b) => (b.score ?? 0) - (a.score ?? 0))
    .map((entry) => entry.asset.url);

  const selectedInstanceUrl = rankedUrls('instance')[0] ?? null;
  const selectedPdfUrls = new Set(rankedUrls('pdf').slice(0, 3));

  const assets = scored.map(({ asset, score }): TaxonomyAsset => {
    if (asset.asset_type === 'instance') {
      return {
        ...asset,
        score,
        is_selected: asset.url === selectedInstanceUrl
      };
    }

    if (asset.asset_type === 'pdf') {
      return {
        ...asset,
        score,
        is_selected: selectedPdfUrls.has(asset.url)
      };
    }

    const isTaxonomyFile = asset.asset_type === 'presentation'
      || asset.asset_type === 'label'
      || asset.asset_type === 'calculation'
      || asset.asset_type === 'definition'
      || asset.asset_type === 'schema';

    return {
      ...asset,
      score: null,
      // Listed assets start unselected, so this only preserves the fallback
      // asset's pre-selection.
      is_selected: asset.is_selected || isTaxonomyFile
    };
  });

  return {
    directoryUrl,
    assets
  };
}
|
||||
Reference in New Issue
Block a user