Automate issuer overlay creation from ticker searches
This commit is contained in:
@@ -19,7 +19,7 @@ type LocalDevOverrideSummary = {
|
||||
workflowChanged: boolean;
|
||||
};
|
||||
|
||||
export type LocalDevConfig = {
|
||||
type LocalDevConfig = {
|
||||
bindHost: string;
|
||||
env: EnvMap;
|
||||
overrides: LocalDevOverrideSummary;
|
||||
|
||||
@@ -1,16 +1,20 @@
|
||||
import { and, desc, eq, gte, lte } from 'drizzle-orm';
|
||||
import { readFileSync, readdirSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { and, desc, eq, gte, inArray, lte } from 'drizzle-orm';
|
||||
|
||||
import { db } from '@/lib/server/db';
|
||||
import { filingTaxonomySnapshot } from '@/lib/server/db/schema';
|
||||
import { filingTaxonomyConcept, filingTaxonomySnapshot } from '@/lib/server/db/schema';
|
||||
|
||||
/** Command-line options understood by the taxonomy health report script. */
type ScriptOptions = {
  /** Value of `--ticker=SYMBOL`, trimmed and upper-cased; `null` when absent or empty. */
  ticker: string | null;
  /** Value of `--from=YYYY-MM-DD` as advertised by the usage text; `null` by default. */
  from: string | null;
  /** Value of `--to=YYYY-MM-DD` as advertised by the usage text; `null` by default. */
  to: string | null;
  /** Value of `--sample-limit=N`; defaults to 5. */
  sampleLimit: number;
  /** Set by `--fail-on-residuals`; when true the script throws if any residual rows were loaded. */
  failOnResiduals: boolean;
};
|
||||
|
||||
type SnapshotRow = {
|
||||
id: number;
|
||||
filing_id: number;
|
||||
ticker: string;
|
||||
filing_date: string;
|
||||
@@ -21,17 +25,38 @@ type SnapshotRow = {
|
||||
parser_version: string;
|
||||
fiscal_pack: string | null;
|
||||
normalization_summary: {
|
||||
issuerOverlayMatchCount?: number;
|
||||
residualDisclosureCount?: number;
|
||||
residualPrimaryCount?: number;
|
||||
unsupportedConceptCount?: number;
|
||||
warnings?: string[];
|
||||
} | null;
|
||||
surface_rows: Record<string, Array<{ key: string }>> | null;
|
||||
updated_at: string;
|
||||
};
|
||||
|
||||
/** Projection of a `filingTaxonomyConcept` row that is flagged as residual. */
type ResidualConceptRow = {
  /** Id of the taxonomy snapshot the concept belongs to. */
  snapshot_id: number;
  /** Qualified concept name; used as the counting key in the report. */
  qname: string;
  /** Statement the concept was assigned to, compared against the taxonomy index; may be null. */
  statement_kind: string | null;
  /** Role URI of the concept; null or blank values are reported as missing. */
  role_uri: string | null;
};
|
||||
|
||||
/** Shape read from a `*.surface.json` taxonomy pack file (only the fields this script uses). */
type SurfacePackFile = {
  /** Surfaces declared by the pack. */
  surfaces: Array<{
    /** Statement the surface belongs to (the report treats `disclosure` as non-primary). */
    statement: string;
    /** Concepts accepted as sources for this surface, if any. */
    allowed_source_concepts?: string[];
    /** Concepts accepted as authoritative for this surface, if any. */
    allowed_authoritative_concepts?: string[];
  }>;
};
|
||||
|
||||
function parseOptions(argv: string[]): ScriptOptions {
|
||||
const options: ScriptOptions = {
|
||||
ticker: null,
|
||||
from: null,
|
||||
to: null,
|
||||
sampleLimit: 5
|
||||
sampleLimit: 5,
|
||||
failOnResiduals: false
|
||||
};
|
||||
|
||||
for (const arg of argv) {
|
||||
@@ -39,10 +64,15 @@ function parseOptions(argv: string[]): ScriptOptions {
|
||||
console.log('Report taxonomy snapshot health from the local database.');
|
||||
console.log('');
|
||||
console.log('Usage:');
|
||||
console.log(' bun run scripts/report-taxonomy-health.ts [--ticker=SYMBOL] [--from=YYYY-MM-DD] [--to=YYYY-MM-DD] [--sample-limit=N]');
|
||||
console.log(' bun run scripts/report-taxonomy-health.ts [--ticker=SYMBOL] [--from=YYYY-MM-DD] [--to=YYYY-MM-DD] [--sample-limit=N] [--fail-on-residuals]');
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
if (arg === '--fail-on-residuals') {
|
||||
options.failOnResiduals = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg.startsWith('--ticker=')) {
|
||||
const value = arg.slice('--ticker='.length).trim().toUpperCase();
|
||||
options.ticker = value.length > 0 ? value : null;
|
||||
@@ -121,6 +151,7 @@ async function loadRows(options: ScriptOptions): Promise<SnapshotRow[]> {
|
||||
const whereClause = conditions.length > 0 ? and(...conditions) : undefined;
|
||||
|
||||
const baseQuery = db.select({
|
||||
id: filingTaxonomySnapshot.id,
|
||||
filing_id: filingTaxonomySnapshot.filing_id,
|
||||
ticker: filingTaxonomySnapshot.ticker,
|
||||
filing_date: filingTaxonomySnapshot.filing_date,
|
||||
@@ -131,6 +162,7 @@ async function loadRows(options: ScriptOptions): Promise<SnapshotRow[]> {
|
||||
parser_version: filingTaxonomySnapshot.parser_version,
|
||||
fiscal_pack: filingTaxonomySnapshot.fiscal_pack,
|
||||
normalization_summary: filingTaxonomySnapshot.normalization_summary,
|
||||
surface_rows: filingTaxonomySnapshot.surface_rows,
|
||||
updated_at: filingTaxonomySnapshot.updated_at
|
||||
}).from(filingTaxonomySnapshot).orderBy(desc(filingTaxonomySnapshot.updated_at));
|
||||
|
||||
@@ -141,15 +173,77 @@ async function loadRows(options: ScriptOptions): Promise<SnapshotRow[]> {
|
||||
return await baseQuery;
|
||||
}
|
||||
|
||||
/**
 * Loads every concept row flagged as residual for the given snapshots.
 *
 * The ids are queried in fixed-size batches rather than one `IN (...)` list, so an
 * unfiltered report does not bind one parameter per snapshot in a single statement.
 * NOTE(review): assumes the backing database caps bound parameters per statement
 * (SQLite does) — confirm against the configured driver.
 *
 * @param snapshotIds - Ids of the snapshots whose residual concepts should be fetched.
 * @returns The matching rows, concatenated batch by batch; empty when no ids are given.
 */
async function loadResidualConceptRows(snapshotIds: number[]): Promise<ResidualConceptRow[]> {
  if (snapshotIds.length === 0) {
    return [];
  }

  // Kept below SQLite's historical default limit of 999 bound parameters.
  const batchSize = 500;
  const residualRows: ResidualConceptRow[] = [];

  for (let offset = 0; offset < snapshotIds.length; offset += batchSize) {
    const batchIds = snapshotIds.slice(offset, offset + batchSize);
    const batchRows: ResidualConceptRow[] = await db.select({
      snapshot_id: filingTaxonomyConcept.snapshot_id,
      qname: filingTaxonomyConcept.qname,
      statement_kind: filingTaxonomyConcept.statement_kind,
      role_uri: filingTaxonomyConcept.role_uri
    }).from(filingTaxonomyConcept).where(and(
      inArray(filingTaxonomyConcept.snapshot_id, batchIds),
      eq(filingTaxonomyConcept.residual_flag, true)
    ));

    // Append one by one: spreading a very large batch into push() can exceed argument limits.
    for (const row of batchRows) {
      residualRows.push(row);
    }
  }

  return residualRows;
}
|
||||
|
||||
/**
 * Canonicalizes a concept name for case- and whitespace-insensitive lookups.
 *
 * @param value - Raw concept name.
 * @returns The name with surrounding whitespace removed and lower-cased.
 */
function normalizeConcept(value: string): string {
  return value.trim().toLowerCase();
}
|
||||
|
||||
/**
 * Builds a lookup from normalized concept name to the set of statements whose
 * surfaces list that concept.
 *
 * Reads every `*.surface.json` file in `<cwd>/rust/taxonomy/fiscal/v1`, except
 * `universal_income.surface.json`, and indexes both the source and the
 * authoritative concept lists of each surface.
 *
 * @returns Map keyed by normalized concept name; values are the statements that reference it.
 * @throws If the directory cannot be read or a pack file is not valid JSON.
 */
function loadTaxonomyStatementIndex(): Map<string, Set<string>> {
  const index = new Map<string, Set<string>>();
  const taxonomyDir = join(process.cwd(), 'rust', 'taxonomy', 'fiscal', 'v1');

  // NOTE(review): the universal_income pack is deliberately left out; the reason is not visible here.
  const fileNames = readdirSync(taxonomyDir)
    .filter((fileName) => fileName.endsWith('.surface.json') && fileName !== 'universal_income.surface.json')
    .sort((left, right) => left.localeCompare(right));

  for (const fileName of fileNames) {
    const file = JSON.parse(readFileSync(join(taxonomyDir, fileName), 'utf8')) as SurfacePackFile;

    // Tolerate pack files that carry no `surfaces` array at all.
    for (const surface of file.surfaces ?? []) {
      const concepts = [
        ...(surface.allowed_source_concepts ?? []),
        ...(surface.allowed_authoritative_concepts ?? [])
      ];

      for (const concept of concepts) {
        const normalized = normalizeConcept(concept);
        const statements = index.get(normalized) ?? new Set<string>();
        statements.add(surface.statement);
        index.set(normalized, statements);
      }
    }
  }

  return index;
}
|
||||
|
||||
/**
 * Looks up the statements that reference a concept.
 *
 * @param index - Map from normalized concept name to statements.
 * @param qname - Concept name to look up; normalized before the lookup.
 * @returns The matching statement names, sorted; empty when the concept is not indexed.
 */
function statementsForConcept(index: Map<string, Set<string>>, qname: string): string[] {
  const statements = index.get(normalizeConcept(qname)) ?? new Set<string>();

  // Copy into an array so sorting never touches the set stored in the index.
  return [...statements].sort((left, right) => left.localeCompare(right));
}
|
||||
|
||||
async function main() {
|
||||
const options = parseOptions(process.argv.slice(2));
|
||||
const rows = await loadRows(options);
|
||||
const residualRows = await loadResidualConceptRows(rows.map((row) => row.id));
|
||||
const taxonomyStatementIndex = loadTaxonomyStatementIndex();
|
||||
|
||||
const statusCounts = new Map<string, number>();
|
||||
const parserCounts = new Map<string, number>();
|
||||
const packCounts = new Map<string, number>();
|
||||
const warningCounts = new Map<string, number>();
|
||||
const parserVersionCounts = new Map<string, number>();
|
||||
const residualCounts = new Map<string, number>();
|
||||
const residualNoRoleCounts = new Map<string, number>();
|
||||
const residualDisclosureOnlyCounts = new Map<string, number>();
|
||||
const residualEquityMatchCounts = new Map<string, number>();
|
||||
const residualDifferentStatementCounts = new Map<string, number>();
|
||||
const residualAbsentCounts = new Map<string, number>();
|
||||
const disclosureSurfaceCounts = new Map<string, number>();
|
||||
const issuerOverlayMatchCounts = new Map<string, number>();
|
||||
const unsupportedSurfacedConcepts = new Map<string, number>();
|
||||
|
||||
for (const row of rows) {
|
||||
incrementCount(statusCounts, row.parse_status);
|
||||
@@ -160,6 +254,59 @@ async function main() {
|
||||
for (const warning of row.normalization_summary?.warnings ?? []) {
|
||||
incrementCount(warningCounts, warning);
|
||||
}
|
||||
|
||||
for (const disclosureRow of row.surface_rows?.disclosure ?? []) {
|
||||
incrementCount(disclosureSurfaceCounts, disclosureRow.key);
|
||||
}
|
||||
|
||||
const issuerOverlayMatches = row.normalization_summary?.issuerOverlayMatchCount ?? 0;
|
||||
if (issuerOverlayMatches > 0) {
|
||||
issuerOverlayMatchCounts.set(
|
||||
row.ticker,
|
||||
(issuerOverlayMatchCounts.get(row.ticker) ?? 0) + issuerOverlayMatches
|
||||
);
|
||||
}
|
||||
|
||||
const unsupportedConcepts = row.normalization_summary?.unsupportedConceptCount ?? 0;
|
||||
if (unsupportedConcepts > 0) {
|
||||
unsupportedSurfacedConcepts.set(
|
||||
row.ticker,
|
||||
(unsupportedSurfacedConcepts.get(row.ticker) ?? 0) + unsupportedConcepts
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
for (const row of residualRows) {
|
||||
incrementCount(residualCounts, row.qname);
|
||||
if (!row.role_uri || row.role_uri.trim().length === 0) {
|
||||
incrementCount(residualNoRoleCounts, row.qname);
|
||||
}
|
||||
|
||||
const matchedStatements = statementsForConcept(taxonomyStatementIndex, row.qname);
|
||||
const primaryStatements = matchedStatements.filter((statement) => statement !== 'disclosure');
|
||||
|
||||
if (matchedStatements.length > 0 && primaryStatements.length === 0) {
|
||||
incrementCount(residualDisclosureOnlyCounts, row.qname);
|
||||
}
|
||||
|
||||
if (matchedStatements.includes('equity')) {
|
||||
incrementCount(residualEquityMatchCounts, row.qname);
|
||||
}
|
||||
|
||||
if (
|
||||
row.statement_kind
|
||||
&& primaryStatements.length > 0
|
||||
&& !primaryStatements.includes(row.statement_kind)
|
||||
) {
|
||||
incrementCount(
|
||||
residualDifferentStatementCounts,
|
||||
`${row.qname}::${row.statement_kind}->${primaryStatements.join('|')}`
|
||||
);
|
||||
}
|
||||
|
||||
if (matchedStatements.length === 0) {
|
||||
incrementCount(residualAbsentCounts, row.qname);
|
||||
}
|
||||
}
|
||||
|
||||
const failedRows = rows
|
||||
@@ -183,14 +330,28 @@ async function main() {
|
||||
console.log(`[report-taxonomy-health] legacy_ts=${legacyCount}`);
|
||||
console.log(`[report-taxonomy-health] deferred_to_typescript=${deferredCount}`);
|
||||
console.log(`[report-taxonomy-health] ts_compact_surface_fallback=${fallbackCount}`);
|
||||
console.log(`[report-taxonomy-health] residual_rows=${residualRows.length}`);
|
||||
|
||||
printCountMap('parse_status', statusCounts);
|
||||
printCountMap('parser_engine', parserCounts);
|
||||
printCountMap('parser_version', parserVersionCounts);
|
||||
printCountMap('fiscal_pack', packCounts);
|
||||
printCountMap('warnings', warningCounts);
|
||||
printCountMap('residual_top_concepts', residualCounts);
|
||||
printCountMap('residual_missing_role_uri', residualNoRoleCounts);
|
||||
printCountMap('residual_disclosure_only', residualDisclosureOnlyCounts);
|
||||
printCountMap('residual_matching_equity_taxonomy', residualEquityMatchCounts);
|
||||
printCountMap('residual_different_primary_statement', residualDifferentStatementCounts);
|
||||
printCountMap('residual_absent_from_taxonomy', residualAbsentCounts);
|
||||
printCountMap('disclosure_surface_counts', disclosureSurfaceCounts);
|
||||
printCountMap('issuer_overlay_match_counts', issuerOverlayMatchCounts);
|
||||
printCountMap('unsupported_surfaced_concepts', unsupportedSurfacedConcepts);
|
||||
printSamples('failed_samples', failedRows);
|
||||
printSamples('warning_samples', warningRows);
|
||||
|
||||
if (options.failOnResiduals && residualRows.length > 0) {
|
||||
throw new Error(`strict mode failed: surfaced residual_rows=${residualRows.length}`);
|
||||
}
|
||||
}
|
||||
|
||||
void main().catch((error) => {
|
||||
|
||||
@@ -6,7 +6,7 @@ const HOMEBREW_SQLITE_LIBRARY_PATHS = [
|
||||
"/usr/local/opt/sqlite/lib/libsqlite3.dylib",
|
||||
] as const;
|
||||
|
||||
export type LocalSqliteVectorConfig =
|
||||
type LocalSqliteVectorConfig =
|
||||
| {
|
||||
mode: "native";
|
||||
source: "explicit-env" | "autodetect-homebrew";
|
||||
|
||||
Reference in New Issue
Block a user