Integrate crabrl parser into taxonomy hydration

This commit is contained in:
2026-03-16 15:18:01 -04:00
parent cf084793ed
commit a58b07456e
23 changed files with 4696 additions and 2466 deletions

View File

@@ -1,96 +1,239 @@
import { describe, expect, it } from 'bun:test';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { Database } from 'bun:sqlite';
import { __dbInternals } from './index';
import { describe, expect, it } from "bun:test";
import { readFileSync } from "node:fs";
import { join } from "node:path";
import { Database } from "bun:sqlite";
import { __dbInternals } from "./index";
/**
 * Applies one migration script to the given SQLite handle.
 *
 * Reads `<process.cwd()>/drizzle/<fileName>` as UTF-8 text and passes the
 * whole file contents to `client.exec` in a single call.
 *
 * @param client - Database the migration SQL is executed against.
 * @param fileName - Migration file name, resolved inside the `drizzle`
 *   directory under the current working directory.
 */
function applyMigration(client: Database, fileName: string) {
// NOTE(review): the two `const sql` lines differ only in quote style; this
// looks like the removed/added pair of a rendered diff — confirm which one
// the real file keeps.
const sql = readFileSync(join(process.cwd(), 'drizzle', fileName), 'utf8');
const sql = readFileSync(join(process.cwd(), "drizzle", fileName), "utf8");
client.exec(sql);
}
describe('sqlite schema compatibility bootstrap', () => {
it('adds missing watchlist columns and taxonomy tables for older local databases', () => {
const client = new Database(':memory:');
client.exec('PRAGMA foreign_keys = ON;');
describe("sqlite schema compatibility bootstrap", () => {
// Bootstrap test on an in-memory database: applies migrations 0000-0003 and
// 0009 only, asserts that selected columns/tables are absent, runs
// ensureLocalSqliteSchema, then asserts they are present.
// NOTE(review): single-quoted and double-quoted copies of the same statements
// appear side by side below; presumably the before/after lines of a diff.
it("adds missing watchlist columns and taxonomy tables for older local databases", () => {
const client = new Database(":memory:");
client.exec("PRAGMA foreign_keys = ON;");
applyMigration(client, '0000_cold_silver_centurion.sql');
applyMigration(client, '0001_glossy_statement_snapshots.sql');
applyMigration(client, '0002_workflow_task_projection_metadata.sql');
applyMigration(client, '0003_task_stage_event_timeline.sql');
applyMigration(client, '0009_task_notification_context.sql');
applyMigration(client, "0000_cold_silver_centurion.sql");
applyMigration(client, "0001_glossy_statement_snapshots.sql");
applyMigration(client, "0002_workflow_task_projection_metadata.sql");
applyMigration(client, "0003_task_stage_event_timeline.sql");
applyMigration(client, "0009_task_notification_context.sql");
expect(__dbInternals.hasColumn(client, 'watchlist_item', 'category')).toBe(false);
expect(__dbInternals.hasColumn(client, 'watchlist_item', 'status')).toBe(false);
expect(__dbInternals.hasColumn(client, 'holding', 'company_name')).toBe(false);
expect(__dbInternals.hasTable(client, 'filing_taxonomy_snapshot')).toBe(false);
expect(__dbInternals.hasTable(client, 'research_journal_entry')).toBe(false);
expect(__dbInternals.hasTable(client, 'research_artifact')).toBe(false);
expect(__dbInternals.hasTable(client, 'research_memo')).toBe(false);
// Preconditions: these columns and tables must not exist before the bootstrap.
expect(__dbInternals.hasColumn(client, "watchlist_item", "category")).toBe(
false,
);
expect(__dbInternals.hasColumn(client, "watchlist_item", "status")).toBe(
false,
);
expect(__dbInternals.hasColumn(client, "holding", "company_name")).toBe(
false,
);
expect(__dbInternals.hasTable(client, "filing_taxonomy_snapshot")).toBe(
false,
);
expect(__dbInternals.hasTable(client, "research_journal_entry")).toBe(
false,
);
expect(__dbInternals.hasTable(client, "research_artifact")).toBe(false);
expect(__dbInternals.hasTable(client, "research_memo")).toBe(false);
// Step under test: every assertion after this call expects `true`.
__dbInternals.ensureLocalSqliteSchema(client);
expect(__dbInternals.hasColumn(client, 'watchlist_item', 'category')).toBe(true);
expect(__dbInternals.hasColumn(client, 'watchlist_item', 'tags')).toBe(true);
expect(__dbInternals.hasColumn(client, 'watchlist_item', 'status')).toBe(true);
expect(__dbInternals.hasColumn(client, 'watchlist_item', 'priority')).toBe(true);
expect(__dbInternals.hasColumn(client, 'watchlist_item', 'updated_at')).toBe(true);
expect(__dbInternals.hasColumn(client, 'watchlist_item', 'last_reviewed_at')).toBe(true);
expect(__dbInternals.hasColumn(client, 'holding', 'company_name')).toBe(true);
expect(__dbInternals.hasTable(client, 'filing_taxonomy_snapshot')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'parser_engine')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'parser_version')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'taxonomy_regime')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'faithful_rows')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'surface_rows')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'detail_rows')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'kpi_rows')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'normalization_summary')).toBe(true);
expect(__dbInternals.hasTable(client, 'filing_taxonomy_context')).toBe(true);
expect(__dbInternals.hasTable(client, 'filing_taxonomy_fact')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_concept', 'balance')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_concept', 'period_type')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_concept', 'data_type')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_concept', 'authoritative_concept_key')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_concept', 'mapping_method')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_concept', 'surface_key')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_concept', 'detail_parent_surface_key')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_concept', 'kpi_key')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_concept', 'residual_flag')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_fact', 'data_type')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_fact', 'authoritative_concept_key')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_fact', 'mapping_method')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_fact', 'surface_key')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_fact', 'detail_parent_surface_key')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_fact', 'kpi_key')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_fact', 'residual_flag')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_fact', 'precision')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_fact', 'nil')).toBe(true);
expect(__dbInternals.hasColumn(client, 'task_run', 'stage_context')).toBe(true);
expect(__dbInternals.hasColumn(client, 'task_stage_event', 'stage_context')).toBe(true);
expect(__dbInternals.hasTable(client, 'research_journal_entry')).toBe(true);
expect(__dbInternals.hasTable(client, 'search_document')).toBe(true);
expect(__dbInternals.hasTable(client, 'search_chunk')).toBe(true);
expect(__dbInternals.hasTable(client, 'research_artifact')).toBe(true);
expect(__dbInternals.hasTable(client, 'research_memo')).toBe(true);
expect(__dbInternals.hasTable(client, 'research_memo_evidence')).toBe(true);
expect(__dbInternals.hasTable(client, 'company_overview_cache')).toBe(true);
expect(__dbInternals.hasColumn(client, "watchlist_item", "category")).toBe(
true,
);
expect(__dbInternals.hasColumn(client, "watchlist_item", "tags")).toBe(
true,
);
expect(__dbInternals.hasColumn(client, "watchlist_item", "status")).toBe(
true,
);
expect(__dbInternals.hasColumn(client, "watchlist_item", "priority")).toBe(
true,
);
expect(
__dbInternals.hasColumn(client, "watchlist_item", "updated_at"),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "watchlist_item", "last_reviewed_at"),
).toBe(true);
expect(__dbInternals.hasColumn(client, "holding", "company_name")).toBe(
true,
);
expect(__dbInternals.hasTable(client, "filing_taxonomy_snapshot")).toBe(
true,
);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"parser_engine",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"parser_version",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"taxonomy_regime",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"faithful_rows",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"surface_rows",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"detail_rows",
),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_snapshot", "kpi_rows"),
).toBe(true);
// `computed_definitions` is checked only in the double-quoted assertions.
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"computed_definitions",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"normalization_summary",
),
).toBe(true);
expect(__dbInternals.hasTable(client, "filing_taxonomy_context")).toBe(
true,
);
expect(__dbInternals.hasTable(client, "filing_taxonomy_fact")).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_concept", "balance"),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_concept", "period_type"),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_concept", "data_type"),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_concept",
"authoritative_concept_key",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_concept",
"mapping_method",
),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_concept", "surface_key"),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_concept",
"detail_parent_surface_key",
),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_concept", "kpi_key"),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_concept",
"residual_flag",
),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_fact", "data_type"),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_fact",
"authoritative_concept_key",
),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_fact", "mapping_method"),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_fact", "surface_key"),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_fact",
"detail_parent_surface_key",
),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_fact", "kpi_key"),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_fact", "residual_flag"),
).toBe(true);
expect(
__dbInternals.hasColumn(client, "filing_taxonomy_fact", "precision"),
).toBe(true);
expect(__dbInternals.hasColumn(client, "filing_taxonomy_fact", "nil")).toBe(
true,
);
expect(__dbInternals.hasColumn(client, "task_run", "stage_context")).toBe(
true,
);
expect(
__dbInternals.hasColumn(client, "task_stage_event", "stage_context"),
).toBe(true);
expect(__dbInternals.hasTable(client, "research_journal_entry")).toBe(true);
expect(__dbInternals.hasTable(client, "search_document")).toBe(true);
expect(__dbInternals.hasTable(client, "search_chunk")).toBe(true);
expect(__dbInternals.hasTable(client, "research_artifact")).toBe(true);
expect(__dbInternals.hasTable(client, "research_memo")).toBe(true);
expect(__dbInternals.hasTable(client, "research_memo_evidence")).toBe(true);
expect(__dbInternals.hasTable(client, "company_overview_cache")).toBe(true);
// The FTS and vector search tables are asserted only after these two calls.
__dbInternals.loadSqliteExtensions(client);
__dbInternals.ensureSearchVirtualTables(client);
expect(__dbInternals.hasTable(client, 'search_chunk_fts')).toBe(true);
expect(__dbInternals.hasTable(client, 'search_chunk_vec')).toBe(true);
expect(__dbInternals.hasTable(client, "search_chunk_fts")).toBe(true);
expect(__dbInternals.hasTable(client, "search_chunk_vec")).toBe(true);
client.close();
});
it('backfills legacy taxonomy snapshot sidecar columns and remains idempotent', () => {
const client = new Database(':memory:');
client.exec('PRAGMA foreign_keys = ON;');
it("backfills legacy taxonomy snapshot sidecar columns and remains idempotent", () => {
const client = new Database(":memory:");
client.exec("PRAGMA foreign_keys = ON;");
applyMigration(client, '0000_cold_silver_centurion.sql');
applyMigration(client, '0005_financial_taxonomy_v3.sql');
applyMigration(client, "0000_cold_silver_centurion.sql");
applyMigration(client, "0005_financial_taxonomy_v3.sql");
client.exec(`
INSERT INTO \`filing\` (
@@ -114,7 +257,8 @@ describe('sqlite schema compatibility bootstrap', () => {
);
`);
const statementRows = '{"income":[{"label":"Revenue","value":1}],"balance":[],"cash_flow":[],"equity":[],"comprehensive_income":[]}';
const statementRows =
'{"income":[{"label":"Revenue","value":1}],"balance":[],"cash_flow":[],"equity":[],"comprehensive_income":[]}';
client.exec(`
INSERT INTO \`filing_taxonomy_snapshot\` (
@@ -143,7 +287,9 @@ describe('sqlite schema compatibility bootstrap', () => {
__dbInternals.ensureLocalSqliteSchema(client);
__dbInternals.ensureLocalSqliteSchema(client);
const row = client.query(`
const row = client
.query(
`
SELECT
\`parser_engine\`,
\`parser_version\`,
@@ -152,10 +298,13 @@ describe('sqlite schema compatibility bootstrap', () => {
\`surface_rows\`,
\`detail_rows\`,
\`kpi_rows\`,
\`computed_definitions\`,
\`normalization_summary\`
FROM \`filing_taxonomy_snapshot\`
WHERE \`filing_id\` = 1
`).get() as {
`,
)
.get() as {
parser_engine: string;
parser_version: string;
taxonomy_regime: string;
@@ -163,66 +312,116 @@ describe('sqlite schema compatibility bootstrap', () => {
surface_rows: string | null;
detail_rows: string | null;
kpi_rows: string | null;
computed_definitions: string | null;
normalization_summary: string | null;
};
expect(row.parser_engine).toBe('fiscal-xbrl');
expect(row.parser_version).toBe('unknown');
expect(row.taxonomy_regime).toBe('unknown');
expect(row.parser_engine).toBe("fiscal-xbrl");
expect(row.parser_version).toBe("unknown");
expect(row.taxonomy_regime).toBe("unknown");
expect(row.faithful_rows).toBe(statementRows);
expect(row.surface_rows).toBe('{"income":[],"balance":[],"cash_flow":[],"equity":[],"comprehensive_income":[]}');
expect(row.detail_rows).toBe('{"income":{},"balance":{},"cash_flow":{},"equity":{},"comprehensive_income":{}}');
expect(row.kpi_rows).toBe('[]');
expect(row.surface_rows).toBe(
'{"income":[],"balance":[],"cash_flow":[],"equity":[],"comprehensive_income":[]}',
);
expect(row.detail_rows).toBe(
'{"income":{},"balance":{},"cash_flow":{},"equity":{},"comprehensive_income":{}}',
);
expect(row.kpi_rows).toBe("[]");
expect(row.computed_definitions).toBe("[]");
expect(row.normalization_summary).toBeNull();
client.close();
});
it('repairs partial taxonomy sidecar drift without requiring a table rebuild', () => {
const client = new Database(':memory:');
client.exec('PRAGMA foreign_keys = ON;');
it("repairs partial taxonomy sidecar drift without requiring a table rebuild", () => {
const client = new Database(":memory:");
client.exec("PRAGMA foreign_keys = ON;");
applyMigration(client, '0000_cold_silver_centurion.sql');
applyMigration(client, '0005_financial_taxonomy_v3.sql');
client.exec("ALTER TABLE `filing_taxonomy_snapshot` ADD `parser_engine` text NOT NULL DEFAULT 'legacy-ts';");
applyMigration(client, "0000_cold_silver_centurion.sql");
applyMigration(client, "0005_financial_taxonomy_v3.sql");
client.exec(
"ALTER TABLE `filing_taxonomy_snapshot` ADD `parser_engine` text NOT NULL DEFAULT 'legacy-ts';",
);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'parser_engine')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'normalization_summary')).toBe(false);
expect(__dbInternals.hasTable(client, 'filing_taxonomy_context')).toBe(false);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"parser_engine",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"normalization_summary",
),
).toBe(false);
expect(__dbInternals.hasTable(client, "filing_taxonomy_context")).toBe(
false,
);
__dbInternals.ensureLocalSqliteSchema(client);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'parser_version')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'taxonomy_regime')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'normalization_summary')).toBe(true);
expect(__dbInternals.hasTable(client, 'filing_taxonomy_context')).toBe(true);
client.close();
});
it('throws on missing parser_engine column when verifyCriticalSchema is called', () => {
const client = new Database(':memory:');
client.exec('PRAGMA foreign_keys = ON;');
applyMigration(client, '0000_cold_silver_centurion.sql');
applyMigration(client, '0005_financial_taxonomy_v3.sql');
expect(__dbInternals.hasTable(client, 'filing_taxonomy_snapshot')).toBe(true);
expect(__dbInternals.hasColumn(client, 'filing_taxonomy_snapshot', 'parser_engine')).toBe(false);
expect(() => __dbInternals.verifyCriticalSchema(client)).toThrow(
/filing_taxonomy_snapshot is missing columns: parser_engine/
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"parser_version",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"taxonomy_regime",
),
).toBe(true);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"normalization_summary",
),
).toBe(true);
expect(__dbInternals.hasTable(client, "filing_taxonomy_context")).toBe(
true,
);
client.close();
});
it('verifyCriticalSchema passes when all required columns exist', () => {
const client = new Database(':memory:');
client.exec('PRAGMA foreign_keys = ON;');
// Negative case: ensureLocalSqliteSchema is not called here, so after
// migrations 0000 and 0005 the snapshot table exists without `parser_engine`
// and verifyCriticalSchema is expected to throw naming that column.
it("throws on missing parser_engine column when verifyCriticalSchema is called", () => {
const client = new Database(":memory:");
client.exec("PRAGMA foreign_keys = ON;");
applyMigration(client, '0000_cold_silver_centurion.sql');
applyMigration(client, '0005_financial_taxonomy_v3.sql');
applyMigration(client, "0000_cold_silver_centurion.sql");
applyMigration(client, "0005_financial_taxonomy_v3.sql");
// Precondition: the table is present but the column is not.
expect(__dbInternals.hasTable(client, "filing_taxonomy_snapshot")).toBe(
true,
);
expect(
__dbInternals.hasColumn(
client,
"filing_taxonomy_snapshot",
"parser_engine",
),
).toBe(false);
// The error message must match this pattern.
expect(() => __dbInternals.verifyCriticalSchema(client)).toThrow(
/filing_taxonomy_snapshot is missing columns: parser_engine/,
);
client.close();
});
it("verifyCriticalSchema passes when all required columns exist", () => {
const client = new Database(":memory:");
client.exec("PRAGMA foreign_keys = ON;");
applyMigration(client, "0000_cold_silver_centurion.sql");
applyMigration(client, "0005_financial_taxonomy_v3.sql");
__dbInternals.ensureLocalSqliteSchema(client);