From 512eebf7f472b15d1788f8baafa2a3b3036a78c4 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 5 Apr 2026 18:19:05 +0200 Subject: [PATCH 1/6] feat: tablas unit_tests y e2e_tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migración 008 en registry.db para unit_tests con FTS5 (tests individuales extraídos de archivos de test). Migración 004 en operations.db para e2e_tests con FTS5 (tests de integración entre funciones dentro de apps). --- fn_operations/migrations/004_e2e_tests.sql | 42 ++++++++++++++++++++++ registry/migrations/008_unit_tests.sql | 37 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 fn_operations/migrations/004_e2e_tests.sql create mode 100644 registry/migrations/008_unit_tests.sql diff --git a/fn_operations/migrations/004_e2e_tests.sql b/fn_operations/migrations/004_e2e_tests.sql new file mode 100644 index 00000000..ae3398cd --- /dev/null +++ b/fn_operations/migrations/004_e2e_tests.sql @@ -0,0 +1,42 @@ +-- e2e_tests: integration tests that verify function composition within an app +CREATE TABLE e2e_tests ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + description TEXT NOT NULL DEFAULT '', + relation_id TEXT DEFAULT '' REFERENCES relations(id), + steps TEXT NOT NULL DEFAULT '[]', + input_fixture TEXT NOT NULL DEFAULT '{}', + expected TEXT NOT NULL DEFAULT '{}', + last_status TEXT NOT NULL DEFAULT '', + last_run_at TEXT NOT NULL DEFAULT '', + execution_id TEXT NOT NULL DEFAULT '', + duration_ms INTEGER NOT NULL DEFAULT 0, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); + +CREATE INDEX idx_e2e_tests_relation ON e2e_tests(relation_id); +CREATE INDEX idx_e2e_tests_status ON e2e_tests(last_status); + +CREATE VIRTUAL TABLE e2e_tests_fts USING fts5( + id, name, description, steps, + content='e2e_tests', + content_rowid='rowid' +); + +CREATE TRIGGER e2e_tests_ai AFTER INSERT ON e2e_tests BEGIN + INSERT INTO e2e_tests_fts(rowid, id, name, description, steps) + VALUES 
(new.rowid, new.id, new.name, new.description, new.steps); +END; + +CREATE TRIGGER e2e_tests_ad AFTER DELETE ON e2e_tests BEGIN + INSERT INTO e2e_tests_fts(e2e_tests_fts, rowid, id, name, description, steps) + VALUES ('delete', old.rowid, old.id, old.name, old.description, old.steps); +END; + +CREATE TRIGGER e2e_tests_au AFTER UPDATE ON e2e_tests BEGIN + INSERT INTO e2e_tests_fts(e2e_tests_fts, rowid, id, name, description, steps) + VALUES ('delete', old.rowid, old.id, old.name, old.description, old.steps); + INSERT INTO e2e_tests_fts(rowid, id, name, description, steps) + VALUES (new.rowid, new.id, new.name, new.description, new.steps); +END; diff --git a/registry/migrations/008_unit_tests.sql b/registry/migrations/008_unit_tests.sql new file mode 100644 index 00000000..6ac1f20a --- /dev/null +++ b/registry/migrations/008_unit_tests.sql @@ -0,0 +1,37 @@ +-- unit_tests: individual test cases extracted from test files +CREATE TABLE unit_tests ( + id TEXT PRIMARY KEY, + function_id TEXT NOT NULL REFERENCES functions(id) ON DELETE CASCADE, + name TEXT NOT NULL, + code TEXT NOT NULL DEFAULT '', + file_path TEXT NOT NULL DEFAULT '', + lang TEXT NOT NULL, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); + +CREATE INDEX idx_unit_tests_function ON unit_tests(function_id); +CREATE INDEX idx_unit_tests_lang ON unit_tests(lang); + +CREATE VIRTUAL TABLE unit_tests_fts USING fts5( + id, name, code, function_id, lang, + content='unit_tests', + content_rowid='rowid' +); + +CREATE TRIGGER unit_tests_ai AFTER INSERT ON unit_tests BEGIN + INSERT INTO unit_tests_fts(rowid, id, name, code, function_id, lang) + VALUES (new.rowid, new.id, new.name, new.code, new.function_id, new.lang); +END; + +CREATE TRIGGER unit_tests_ad AFTER DELETE ON unit_tests BEGIN + INSERT INTO unit_tests_fts(unit_tests_fts, rowid, id, name, code, function_id, lang) + VALUES ('delete', old.rowid, old.id, old.name, old.code, old.function_id, old.lang); +END; + +CREATE TRIGGER unit_tests_au AFTER UPDATE 
ON unit_tests BEGIN + INSERT INTO unit_tests_fts(unit_tests_fts, rowid, id, name, code, function_id, lang) + VALUES ('delete', old.rowid, old.id, old.name, old.code, old.function_id, old.lang); + INSERT INTO unit_tests_fts(rowid, id, name, code, function_id, lang) + VALUES (new.rowid, new.id, new.name, new.code, new.function_id, new.lang); +END; From fea8fed75d750671a7921c645d17ad68979405db Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 5 Apr 2026 18:19:10 +0200 Subject: [PATCH 2/6] feat: modelos y CRUD para unit_tests y e2e_tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UnitTest en registry con Insert, GetByFunction, Search FTS5, Purge. E2ETest en fn_operations con Insert, Get, List, UpdateResult, Delete. Ambos con scan helpers y serialización JSON. --- fn_operations/models.go | 27 +++++++++ fn_operations/store.go | 119 ++++++++++++++++++++++++++++++++++++++++ registry/models.go | 12 ++++ registry/store.go | 85 ++++++++++++++++++++++++++++ 4 files changed, 243 insertions(+) diff --git a/fn_operations/models.go b/fn_operations/models.go index a951505d..430ebc48 100644 --- a/fn_operations/models.go +++ b/fn_operations/models.go @@ -167,6 +167,33 @@ type Log struct { CreatedAt time.Time `json:"created_at"` } +// E2ETestStatus represents the result of an e2e test run. +type E2ETestStatus string + +const ( + E2EPass E2ETestStatus = "pass" + E2EFail E2ETestStatus = "fail" + E2ESkip E2ETestStatus = "skip" + E2EPending E2ETestStatus = "" +) + +// E2ETest is an integration test that verifies function composition within an app. 
+type E2ETest struct { + ID string `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + RelationID string `json:"relation_id"` + Steps []string `json:"steps"` + InputFixture map[string]any `json:"input_fixture"` + Expected map[string]any `json:"expected"` + LastStatus E2ETestStatus `json:"last_status"` + LastRunAt string `json:"last_run_at"` + ExecutionID string `json:"execution_id"` + DurationMs int64 `json:"duration_ms"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + // TypeSnapshot is an immutable copy of a registry type at point of use. type TypeSnapshot struct { ID string `json:"id"` diff --git a/fn_operations/store.go b/fn_operations/store.go index a2942c9f..a6a0c49c 100644 --- a/fn_operations/store.go +++ b/fn_operations/store.go @@ -903,3 +903,122 @@ func (db *DB) ListLogs(level LogLevel, source, entityID, executionID string, lim } return result, nil } + +// --- E2E Tests CRUD --- + +// InsertE2ETest inserts an e2e test, or updates every column in place when the ID already exists (upsert: the rowid is kept and the FTS update trigger fires). +func (db *DB) InsertE2ETest(t *E2ETest) error { + now := time.Now().UTC() + if t.CreatedAt.IsZero() { + t.CreatedAt = now + } + t.UpdatedAt = now + + _, err := db.conn.Exec(` + INSERT INTO e2e_tests ( + id, name, description, relation_id, steps, input_fixture, + expected, last_status, last_run_at, execution_id, duration_ms, + created_at, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(id) DO UPDATE SET name=excluded.name, description=excluded.description, relation_id=excluded.relation_id, steps=excluded.steps, input_fixture=excluded.input_fixture, expected=excluded.expected, last_status=excluded.last_status, last_run_at=excluded.last_run_at, execution_id=excluded.execution_id, duration_ms=excluded.duration_ms, created_at=excluded.created_at, updated_at=excluded.updated_at`, + t.ID, t.Name, t.Description, t.RelationID, + marshalStrings(t.Steps), marshalJSON(t.InputFixture), marshalJSON(t.Expected), + string(t.LastStatus), t.LastRunAt, t.ExecutionID, t.DurationMs, + t.CreatedAt.Format(time.RFC3339), t.UpdatedAt.Format(time.RFC3339), + ) + return err +} + +// GetE2ETest returns an e2e test by ID. 
+func (db *DB) GetE2ETest(id string) (*E2ETest, error) { + row := db.conn.QueryRow(` + SELECT id, name, description, relation_id, steps, input_fixture, + expected, last_status, last_run_at, execution_id, duration_ms, + created_at, updated_at + FROM e2e_tests WHERE id = ?`, id) + + t, err := scanE2ETest(row) + if err != nil { + return nil, fmt.Errorf("e2e test %q not found: %w", id, err) + } + return t, nil +} + +// ListE2ETests returns e2e tests ordered by name; an empty status disables the filter. Iteration errors are reported via the returned error. +func (db *DB) ListE2ETests(status E2ETestStatus) ([]E2ETest, error) { + where := []string{} + args := []any{} + if status != "" { + where = append(where, "last_status = ?") + args = append(args, string(status)) + } + + q := `SELECT id, name, description, relation_id, steps, input_fixture, + expected, last_status, last_run_at, execution_id, duration_ms, + created_at, updated_at + FROM e2e_tests` + if len(where) > 0 { + q += " WHERE " + strings.Join(where, " AND ") + } + q += " ORDER BY name" + + rows, err := db.conn.Query(q, args...) + if err != nil { + return nil, err + } + defer rows.Close() + + var result []E2ETest + for rows.Next() { + var t E2ETest + var stepsJSON, fixtureJSON, expectedJSON, createdAt, updatedAt string + if err := rows.Scan(&t.ID, &t.Name, &t.Description, &t.RelationID, + &stepsJSON, &fixtureJSON, &expectedJSON, + &t.LastStatus, &t.LastRunAt, &t.ExecutionID, &t.DurationMs, + &createdAt, &updatedAt); err != nil { + return nil, fmt.Errorf("scanning e2e test: %w", err) + } + t.Steps = unmarshalStrings(stepsJSON) + t.InputFixture = unmarshalJSON(fixtureJSON) + t.Expected = unmarshalJSON(expectedJSON) + t.CreatedAt, _ = time.Parse(time.RFC3339, createdAt) + t.UpdatedAt, _ = time.Parse(time.RFC3339, updatedAt) + result = append(result, t) + } + return result, rows.Err() +} + +// UpdateE2ETestResult updates the result fields after running an e2e test. 
+func (db *DB) UpdateE2ETestResult(id string, status E2ETestStatus, executionID string, durationMs int64) error { + now := time.Now().UTC() + _, err := db.conn.Exec(` + UPDATE e2e_tests SET last_status=?, last_run_at=?, execution_id=?, duration_ms=?, updated_at=? + WHERE id=?`, + string(status), now.Format(time.RFC3339), executionID, durationMs, + now.Format(time.RFC3339), id, + ) + return err +} + +// DeleteE2ETest removes an e2e test by ID. +func (db *DB) DeleteE2ETest(id string) error { + _, err := db.conn.Exec("DELETE FROM e2e_tests WHERE id = ?", id) + return err +} + +func scanE2ETest(row *sql.Row) (*E2ETest, error) { + var t E2ETest + var stepsJSON, fixtureJSON, expectedJSON, createdAt, updatedAt string + err := row.Scan(&t.ID, &t.Name, &t.Description, &t.RelationID, + &stepsJSON, &fixtureJSON, &expectedJSON, + &t.LastStatus, &t.LastRunAt, &t.ExecutionID, &t.DurationMs, + &createdAt, &updatedAt) + if err != nil { + return nil, err + } + t.Steps = unmarshalStrings(stepsJSON) + t.InputFixture = unmarshalJSON(fixtureJSON) + t.Expected = unmarshalJSON(expectedJSON) + t.CreatedAt, _ = time.Parse(time.RFC3339, createdAt) + t.UpdatedAt, _ = time.Parse(time.RFC3339, updatedAt) + return &t, nil +} diff --git a/registry/models.go b/registry/models.go index 031d3cb2..0f1eae68 100644 --- a/registry/models.go +++ b/registry/models.go @@ -180,6 +180,18 @@ type Proposal struct { UpdatedAt time.Time `json:"updated_at"` } +// UnitTest represents an individual test case extracted from a test file. 
+type UnitTest struct { + ID string `json:"id"` + FunctionID string `json:"function_id"` + Name string `json:"name"` + Code string `json:"code"` + FilePath string `json:"file_path"` + Lang string `json:"lang"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + // GenerateID builds the canonical ID: {name}_{lang}_{domain} func GenerateID(name, lang, domain string) string { return name + "_" + lang + "_" + domain diff --git a/registry/store.go b/registry/store.go index d874c17a..b2d8d102 100644 --- a/registry/store.go +++ b/registry/store.go @@ -614,6 +614,91 @@ func scanTypes(rows interface{ Next() bool; Scan(...any) error }) ([]Type, error return result, nil } +// --- Unit Tests CRUD --- + +// InsertUnitTest inserts a unit test, or updates every column in place when the ID already exists (upsert: the rowid is kept and the FTS update trigger fires). +func (db *DB) InsertUnitTest(ut *UnitTest) error { + now := time.Now().UTC() + if ut.CreatedAt.IsZero() { + ut.CreatedAt = now + } + if ut.UpdatedAt.IsZero() { + ut.UpdatedAt = now + } + + _, err := db.conn.Exec(` + INSERT INTO unit_tests ( + id, function_id, name, code, file_path, lang, created_at, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(id) DO UPDATE SET function_id=excluded.function_id, name=excluded.name, code=excluded.code, file_path=excluded.file_path, lang=excluded.lang, created_at=excluded.created_at, updated_at=excluded.updated_at`, + ut.ID, ut.FunctionID, ut.Name, ut.Code, ut.FilePath, ut.Lang, + ut.CreatedAt.Format(time.RFC3339), ut.UpdatedAt.Format(time.RFC3339), + ) + return err +} + +// GetUnitTestsByFunction returns all unit tests for a given function ID. +func (db *DB) GetUnitTestsByFunction(functionID string) ([]UnitTest, error) { + rows, err := db.conn.Query( + "SELECT id, function_id, name, code, file_path, lang, created_at, updated_at FROM unit_tests WHERE function_id = ? ORDER BY name", + functionID, + ) + if err != nil { + return nil, err + } + defer rows.Close() + return scanUnitTests(rows) +} + +// SearchUnitTests performs FTS search on unit tests. 
+func (db *DB) SearchUnitTests(query string, lang string) ([]UnitTest, error) { + where := []string{} + args := []any{} + + if query != "" { + where = append(where, "ut.id IN (SELECT id FROM unit_tests_fts WHERE unit_tests_fts MATCH ?)") + args = append(args, query) + } + if lang != "" { + where = append(where, "ut.lang = ?") + args = append(args, lang) + } + + q := "SELECT id, function_id, name, code, file_path, lang, created_at, updated_at FROM unit_tests ut" + if len(where) > 0 { + q += " WHERE " + strings.Join(where, " AND ") + } + q += " ORDER BY ut.function_id, ut.name" + + rows, err := db.conn.Query(q, args...) + if err != nil { + return nil, fmt.Errorf("search unit tests: %w", err) + } + defer rows.Close() + return scanUnitTests(rows) +} + +func scanUnitTests(rows interface{ Next() bool; Scan(...any) error; Err() error }) ([]UnitTest, error) { + var result []UnitTest + for rows.Next() { + var ut UnitTest + var createdAt, updatedAt string + err := rows.Scan(&ut.ID, &ut.FunctionID, &ut.Name, &ut.Code, &ut.FilePath, &ut.Lang, &createdAt, &updatedAt) + if err != nil { + return nil, fmt.Errorf("scanning unit test: %w", err) + } + ut.CreatedAt, _ = time.Parse(time.RFC3339, createdAt) + ut.UpdatedAt, _ = time.Parse(time.RFC3339, updatedAt) + result = append(result, ut) + } + return result, rows.Err() +} + +// PurgeUnitTests deletes all unit test entries. Used before re-indexing. +func (db *DB) PurgeUnitTests() error { + _, err := db.conn.Exec("DELETE FROM unit_tests") + return err +} + // --- Proposal CRUD --- // InsertProposal inserts or replaces a proposal. From 384a87f8a7d56c308641cf33fab9c2ad28e37842 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 5 Apr 2026 18:19:17 +0200 Subject: [PATCH 3/6] =?UTF-8?q?feat:=20parser=20autom=C3=A1tico=20de=20tes?= =?UTF-8?q?t=20files=20Go/Python/Bash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extrae test cases individuales con su código desde archivos _test. 
Go detecta func TestXxx, Python detecta def test_xxx, Bash soporta tres convenciones: test_xxx(){}, secciones === nombre ===, y comentarios # Test:. --- registry/test_parser.go | 133 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 registry/test_parser.go diff --git a/registry/test_parser.go b/registry/test_parser.go new file mode 100644 index 00000000..3abf75d0 --- /dev/null +++ b/registry/test_parser.go @@ -0,0 +1,133 @@ +package registry + +import ( + "fmt" + "os" + "regexp" + "strings" +) + +// testCase represents a single test extracted from a test file. +type testCase struct { + Name string + Code string +} + +// testPos marks the start of a test within a file. +type testPos struct { + name string + startLine int +} + +// parseTestFile reads a test file and extracts individual test cases. +// Supports Go, Python, and Bash test formats. +func parseTestFile(path, lang string) ([]testCase, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("reading test file %s: %w", path, err) + } + content := string(data) + + switch lang { + case "go": + return parseGoTests(content), nil + case "py": + return parsePythonTests(content), nil + case "bash": + return parseBashTests(content), nil + default: + return nil, nil + } +} + +// parseGoTests extracts Go test functions (func TestXxx). +var goTestFuncRe = regexp.MustCompile(`(?m)^func\s+(Test\w+)\s*\(`) + +func parseGoTests(content string) []testCase { + lines := strings.Split(content, "\n") + var positions []testPos + + for i, line := range lines { + if m := goTestFuncRe.FindStringSubmatch(line); m != nil { + positions = append(positions, testPos{name: m[1], startLine: i}) + } + } + + return extractBlocks(lines, positions) +} + +// parsePythonTests extracts Python test functions (def test_xxx). 
+var pyTestFuncRe = regexp.MustCompile(`(?m)^def\s+(test_\w+)\s*\(`) + +func parsePythonTests(content string) []testCase { + lines := strings.Split(content, "\n") + var positions []testPos + + for i, line := range lines { + if m := pyTestFuncRe.FindStringSubmatch(line); m != nil { + positions = append(positions, testPos{name: m[1], startLine: i}) + } + } + + return extractBlocks(lines, positions) +} + +// parseBashTests extracts Bash test blocks. +// Tries three conventions in order: +// 1. test_xxx() { ... } — function-based tests +// 2. === section === — section headers (echo "=== name ===") +// 3. # Test: ... — comment-based test blocks +var bashTestFuncRe = regexp.MustCompile(`(?m)^(test_\w+)\s*\(\)\s*\{`) +var bashTestCommentRe = regexp.MustCompile(`(?m)^#\s*[Tt]est:\s*(.+)`) +var bashSectionRe = regexp.MustCompile(`(?i)^(?:echo\s+["'])?===\s*(\w[\w\s]*\w)\s*===["']?\s*$`) + +func parseBashTests(content string) []testCase { + lines := strings.Split(content, "\n") + + // Strategy 1: test_xxx() { ... } + var positions []testPos + for i, line := range lines { + if m := bashTestFuncRe.FindStringSubmatch(line); m != nil { + positions = append(positions, testPos{name: m[1], startLine: i}) + } + } + if len(positions) > 0 { + return extractBlocks(lines, positions) + } + + // Strategy 2: === section === headers + for i, line := range lines { + trimmed := strings.TrimSpace(line) + if m := bashSectionRe.FindStringSubmatch(trimmed); m != nil { + positions = append(positions, testPos{name: m[1], startLine: i}) + } + } + if len(positions) > 0 { + return extractBlocks(lines, positions) + } + + // Strategy 3: # Test: ... comments + for i, line := range lines { + if m := bashTestCommentRe.FindStringSubmatch(line); m != nil { + positions = append(positions, testPos{name: strings.TrimSpace(m[1]), startLine: i}) + } + } + return extractBlocks(lines, positions) +} + +// extractBlocks splits lines into code blocks based on test positions. 
+func extractBlocks(lines []string, positions []testPos) []testCase { + var tests []testCase + for i, pos := range positions { + endLine := len(lines) + if i+1 < len(positions) { + endLine = positions[i+1].startLine + } + for endLine > pos.startLine && strings.TrimSpace(lines[endLine-1]) == "" { + endLine-- + } + code := strings.Join(lines[pos.startLine:endLine], "\n") + tests = append(tests, testCase{Name: pos.name, Code: code}) + } + return tests +} From e6228ea8c03ac1d604400cb7d388e38d082d5be1 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 5 Apr 2026 18:19:21 +0200 Subject: [PATCH 4/6] =?UTF-8?q?feat:=20fn=20index=20extrae=20unit=5Ftests?= =?UTF-8?q?=20autom=C3=A1ticamente?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit El indexer lee test_file_path de funciones testeadas, parsea los test cases y los inserta en unit_tests. El output de fn index ahora muestra el conteo de unit_tests extraídos. --- cmd/fn/main.go | 2 +- registry/indexer.go | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/cmd/fn/main.go b/cmd/fn/main.go index 63dc6346..79094ea5 100644 --- a/cmd/fn/main.go +++ b/cmd/fn/main.go @@ -123,7 +123,7 @@ func cmdIndex() { } } - fmt.Printf("Indexed %d functions, %d types, %d apps, %d analysis\n", result.Functions, result.Types, result.Apps, result.Analysis) + fmt.Printf("Indexed %d functions, %d types, %d apps, %d analysis, %d unit_tests\n", result.Functions, result.Types, result.Apps, result.Analysis, result.UnitTests) for _, e := range result.ValidationErrors { fmt.Fprintf(os.Stderr, " INVALID: %s\n", e) } diff --git a/registry/indexer.go b/registry/indexer.go index 12bddafc..a38422a4 100644 --- a/registry/indexer.go +++ b/registry/indexer.go @@ -14,6 +14,7 @@ type IndexResult struct { Types int Apps int Analysis int + UnitTests int ValidationErrors []string Warnings []string Errors []string @@ -203,6 +204,39 @@ func Index(db *DB, root string) 
(*IndexResult, error) { result.Analysis++ } + // Extract unit tests from test files of tested functions + if err := db.PurgeUnitTests(); err != nil { + result.Warnings = append(result.Warnings, fmt.Sprintf("purging unit_tests: %v", err)) + } + for _, f := range functions { + if !f.Tested || f.TestFilePath == "" { + continue + } + absTestPath := filepath.Join(root, f.TestFilePath) + cases, err := parseTestFile(absTestPath, f.Lang) + if err != nil { + result.Warnings = append(result.Warnings, fmt.Sprintf("%s: parsing tests: %v", f.ID, err)) + continue + } + for i, tc := range cases { + ut := &UnitTest{ + ID: fmt.Sprintf("%s_t%d", f.ID, i), + FunctionID: f.ID, + Name: tc.Name, + Code: tc.Code, + FilePath: f.TestFilePath, + Lang: f.Lang, + CreatedAt: now, + UpdatedAt: now, + } + if err := db.InsertUnitTest(ut); err != nil { + result.Warnings = append(result.Warnings, fmt.Sprintf("insert unit_test %s: %v", ut.ID, err)) + continue + } + result.UnitTests++ + } + } + // Post-insert: warn about file_path entries that don't exist on disk for _, f := range functions { if f.FilePath != "" { From 806c819cf74a2012048b4dab0645891975220c13 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 5 Apr 2026 18:19:26 +0200 Subject: [PATCH 5/6] docs: convenciones de testing y schema unit_tests/e2e_tests Nuevo docs/testing.md con convenciones de test por lenguaje (Go, Python, Bash con 3 opciones), tablas unit_tests y e2e_tests, consultas FTS5 de ejemplo. Actualiza functions.md y CLAUDE.md con referencia a unit_tests. 
--- .claude/CLAUDE.md | 5 + docs/functions.md | 2 +- docs/testing.md | 247 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 docs/testing.md diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 34105fa0..a5500ea8 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -58,9 +58,14 @@ sqlite3 registry.db ".schema" **types** — columnas: `id, name, lang, domain, version, algebraic, definition, description, tags, uses_types, file_path, created_at, updated_at, examples, notes, documentation, code, content_hash, source_repo, source_license, source_file` - Enums: `algebraic`(product|sum) +**unit_tests** — columnas: `id, function_id, name, code, file_path, lang, created_at, updated_at` +- Extraidos automaticamente por `fn index` desde los archivos de test +- FK: `function_id` → `functions.id` + **FTS5 (columnas buscables):** - `functions_fts`: id, name, description, tags, signature, domain, example, notes, documentation, code - `types_fts`: id, name, description, tags, domain, examples, notes, documentation, code +- `unit_tests_fts`: id, name, code, function_id, lang --- diff --git a/docs/functions.md b/docs/functions.md index afdee4f5..fed7ffc3 100644 --- a/docs/functions.md +++ b/docs/functions.md @@ -73,7 +73,7 @@ DataTable(props: { data: T[]; columns: ColumnDef[]; onRowClick?: (row: T) | `kind: pipeline` | `purity` siempre `impure`. `uses_functions` no puede estar vacío. | | `purity: pure` | `returns_optional` siempre `false`. `error_type` vacío. Una pura que devuelve opcional debe modelarse como tipo suma, no como `returns_optional: true`. | | `purity: impure` | `error_type` obligatorio. Toda impura declara explícitamente qué puede salir mal. | -| `tested: true` | `test_file_path` obligatorio. `tests` no puede estar vacío. | +| `tested: true` | `test_file_path` obligatorio. `tests` no puede estar vacío. `fn index` extrae los test cases a la tabla `unit_tests` (ver [testing.md](testing.md)). 
| | `tested: false` | `tests` vacío. `test_file_path` vacío. | | `uses_functions[]` | Todos los IDs deben existir en la tabla `functions`. Sin referencias huérfanas. | | `uses_types[]` | Todos los IDs deben existir en la tabla `types`. Sin referencias huérfanas. | diff --git a/docs/testing.md b/docs/testing.md new file mode 100644 index 00000000..7de02859 --- /dev/null +++ b/docs/testing.md @@ -0,0 +1,247 @@ +# Testing + +El registry tiene dos niveles de tests: + +- **Unit tests** (`unit_tests` en `registry.db`) — tests individuales extraidos automaticamente de los archivos de test de cada funcion. +- **E2E tests** (`e2e_tests` en `operations.db`) — tests de integracion que verifican como las funciones se componen dentro de una app. + +--- + +## Unit tests + +`fn index` lee cada archivo de test referenciado por `test_file_path` en las funciones testeadas, extrae los test cases individuales con su codigo, y los inserta en la tabla `unit_tests`. + +### Tabla `unit_tests` + +| Campo | Tipo | Descripcion | +|---|---|---| +| `id` | string | `{function_id}_t{n}` (ej: `filter_slice_go_core_t0`) | +| `function_id` | string | FK a `functions.id` | +| `name` | string | Nombre del test case | +| `code` | string | Codigo fuente completo del test | +| `file_path` | string | Ruta relativa al archivo de test | +| `lang` | string | Lenguaje (go, py, bash) | +| `created_at` | datetime | Fecha de indexacion | +| `updated_at` | datetime | Fecha de ultima indexacion | + +FTS5 disponible sobre `id`, `name`, `code`, `function_id`, `lang`. 
+ +### Consultas utiles + +```bash +# Todos los tests de una funcion +sqlite3 registry.db "SELECT id, name FROM unit_tests WHERE function_id = 'filter_slice_go_core';" + +# Buscar tests por contenido (FTS5) +sqlite3 registry.db "SELECT id, function_id, name FROM unit_tests WHERE id IN (SELECT id FROM unit_tests_fts WHERE unit_tests_fts MATCH 'retry') LIMIT 10;" + +# Tests por lenguaje +sqlite3 registry.db "SELECT lang, COUNT(*) FROM unit_tests GROUP BY lang;" + +# Ver codigo de un test +sqlite3 registry.db "SELECT code FROM unit_tests WHERE id = 'cache_decorator_py_core_t0';" +``` + +--- + +## Convenciones de test por lenguaje + +El parser automatico de `fn index` detecta test cases segun el lenguaje. Para que los tests se extraigan correctamente, seguir estas convenciones. + +### Go + +Convencion estandar de Go. El parser detecta funciones `func TestXxx(t *testing.T)`: + +```go +func TestFilterSlice(t *testing.T) { + t.Run("filtra pares", func(t *testing.T) { + got := FilterSlice([]int{1, 2, 3, 4, 5}, func(n int) bool { return n%2 == 0 }) + if len(got) != 2 || got[0] != 2 || got[1] != 4 { + t.Errorf("got %v, want [2 4]", got) + } + }) + + t.Run("slice vacio retorna vacio", func(t *testing.T) { + got := FilterSlice([]int{}, func(n int) bool { return true }) + if len(got) != 0 { + t.Errorf("got %v, want []", got) + } + }) +} +``` + +**Deteccion:** `^func (Test\w+)\s*\(` — cada `func Test...` es un test case. Los subtests (`t.Run`) se incluyen dentro del codigo del test padre. + +**Archivo:** `{domain}/{name}_test.go` (convencion Go estandar). + +### Python + +Convencion estandar de pytest. El parser detecta funciones `def test_xxx(`: + +```python +def test_funcion_llamada_una_vez(store): + calls = [] + + @cache_decorator(store, ttl=60) + def compute(x: int) -> int: + calls.append(x) + return x * 10 + + assert compute(5) == 50 + assert compute(5) == 50 + assert len(calls) == 1 + + +def test_ttl_expirado(store): + # ... 
+``` + +**Deteccion:** `^def (test_\w+)\s*\(` — cada funcion top-level `def test_...` es un test case. El codigo incluye todo hasta la siguiente `def test_` o fin de archivo. + +**Archivo:** `{domain}/{name}_test.py`. + +### Bash + +Bash no tiene framework estandar de testing. El parser soporta tres convenciones, en orden de prioridad: + +#### Opcion 1: funciones `test_xxx()` (preferida) + +La mas explicita y la que mejor se parsea: + +```bash +#!/usr/bin/env bash +source "$(dirname "$0")/mi_funcion.sh" + +PASS=0; FAIL=0 + +assert_eq() { + local name="$1" got="$2" want="$3" + if [ "$got" = "$want" ]; then echo " PASS: $name"; ((PASS++)) + else echo " FAIL: $name (got='$got', want='$want')"; ((FAIL++)); fi +} + +test_caso_basico() { + local got + got=$(mi_funcion "input") + assert_eq "caso basico" "$got" "expected" +} + +test_caso_vacio() { + local got + got=$(mi_funcion "") + assert_eq "input vacio" "$got" "" +} + +# Ejecutar todos los tests +test_caso_basico +test_caso_vacio + +echo "Resultados: $PASS passed, $FAIL failed" +[ "$FAIL" -eq 0 ] || exit 1 +``` + +**Deteccion:** `^(test_\w+)\s*\(\)\s*\{` — cada funcion `test_xxx() { ... }` es un test case. + +#### Opcion 2: secciones `=== nombre ===` + +Para tests que agrupan multiples asserts bajo secciones nombradas: + +```bash +#!/usr/bin/env bash +source "$(dirname "$0")/mi_funcion.sh" + +echo "=== caso basico ===" + +got=$(mi_funcion "input") +assert_eq "retorna expected" "$got" "expected" + +echo "=== caso edge ===" + +got=$(mi_funcion "") +assert_eq "input vacio" "$got" "" + +echo "=== errores ===" + +assert_fail "input invalido" mi_funcion "--bad" +``` + +**Deteccion:** `^(echo\s+["'])?===\s*(\w[\w\s]*\w)\s*===(["'])?\s*$` — cada linea con `=== nombre ===` (con o sin `echo`) abre una seccion. El nombre debe contener al menos dos caracteres alfanumericos (las lineas de separacion puras como `======` se ignoran). 
+ +#### Opcion 3: comentarios `# Test:` + +Para scripts simples donde cada test se marca con un comentario: + +```bash +#!/usr/bin/env bash +source "$(dirname "$0")/mi_funcion.sh" + +# Test: caso basico +got=$(mi_funcion "input") +[ "$got" = "expected" ] || { echo "FAIL"; exit 1; } + +# Test: input vacio +got=$(mi_funcion "") +[ "$got" = "" ] || { echo "FAIL"; exit 1; } +``` + +**Deteccion:** `^#\s*[Tt]est:\s*(.+)` — cada comentario `# Test: nombre` abre un bloque. + +#### Recomendacion + +Usar **opcion 1** (funciones `test_xxx()`) para tests nuevos. Es la mas explicita, cada test esta aislado en su propia funcion, y se parsea sin ambiguedad. + +La **opcion 2** (secciones `===`) es aceptable cuando ya existe el patron en el archivo (como `pass_test.sh`). + +**Archivo:** `{domain}/{name}_test.sh`. + +--- + +## E2E tests + +Los e2e tests viven en `operations.db` de cada app. No se extraen automaticamente — se crean manualmente o por el bucle reactivo cuando se necesita verificar que un flujo end-to-end funciona. + +### Tabla `e2e_tests` + +| Campo | Tipo | Descripcion | +|---|---|---| +| `id` | string | Identificador unico | +| `name` | string | Nombre descriptivo del test | +| `description` | string | Que verifica este test | +| `relation_id` | string | FK a `relations.id` — que pipeline/relacion prueba | +| `steps` | []string | Funciones involucradas en orden | +| `input_fixture` | JSON | Datos de entrada para el test | +| `expected` | JSON | Resultado esperado | +| `last_status` | string | pass, fail, skip, o vacio | +| `last_run_at` | datetime | Ultima ejecucion | +| `execution_id` | string | Referencia a la ejecucion que lo corrio | +| `duration_ms` | int | Duracion en milisegundos | +| `created_at` | datetime | Fecha de creacion | +| `updated_at` | datetime | Ultima actualizacion | + +FTS5 disponible sobre `id`, `name`, `description`, `steps`. 
+ +### Diferencia con assertions + +| | Assertions | E2E tests | +|---|---|---| +| **Que son** | Reglas declarativas sobre datos | Ejecuciones concretas de flujos | +| **Cuando corren** | En cada ejecucion del bucle reactivo | Bajo demanda o en CI | +| **Sobre que** | Una entity (`precio > 0`) | Un flujo completo (input → pipeline → output) | +| **Resultado** | pass/fail sobre el valor actual | pass/fail comparando output vs expected | + +### Ejemplo de uso + +```bash +# Crear un e2e test para un pipeline +fn ops e2e add --name "metabase_setup_completo" \ + --relation-id "rel_setup_metabase" \ + --steps '["docker_pull_image_go_infra","init_metabase_go_pipelines"]' \ + --input '{"project":"test"}' \ + --expected '{"status":"running"}' + +# Listar e2e tests +fn ops e2e list + +# Ver resultado +fn ops e2e show <id> +``` From a9cd28b0109823b846bbe5520bd59fdacd935664 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 5 Apr 2026 18:19:36 +0200 Subject: [PATCH 6/6] =?UTF-8?q?chore:=20a=C3=B1ade=20directorio=20dev/=20c?= =?UTF-8?q?on=20issues=20y=20funciones=20implementadas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tracking de issues completados (jupyter tools) y funciones implementadas (specs de diseño ya resueltas). 
--- dev/functions_to_implement/00_README.md | 67 ++++++++ .../completed/01_parse_markdown.md | 80 ++++++++++ .../completed/02_parse_pdf_to_markdown.md | 74 +++++++++ .../completed/03_parse_html_to_markdown.md | 66 ++++++++ .../completed/04_parse_docx_to_markdown.md | 30 ++++ .../completed/05_parse_excel_to_markdown.md | 32 ++++ .../completed/06_parse_epub_to_markdown.md | 36 +++++ .../completed/07_directory_scanner.md | 61 ++++++++ .../completed/08_circuit_breaker.md | 38 +++++ .../completed/09_retry_with_classify.md | 57 +++++++ .../completed/10_hotness_score.md | 41 +++++ .../completed/11_time_utils.md | 48 ++++++ .../completed/12_safe_extract_zip.md | 43 ++++++ .../completed/13_envelope_encryption.md | 60 ++++++++ .../completed/14_parse_code_ast.md | 51 +++++++ .../completed/15_git_url_parser.md | 48 ++++++ .../completed/16_media_strategy.md | 26 ++++ .../completed/17_parser_registry.md | 53 +++++++ .../completed/18_read_file_with_encoding.md | 22 +++ .../completed/19_type_parse_result.md | 77 ++++++++++ .../completed/20_type_classified_file.md | 41 +++++ .../completed/21_type_code_entity.md | 40 +++++ .../completed/22_type_message.md | 96 ++++++++++++ .../completed/23_type_retrieval.md | 144 ++++++++++++++++++ .../completed/24_type_memory.md | 115 ++++++++++++++ .../completed/25_type_context.md | 69 +++++++++ .../completed/26_validation_schemas.md | 86 +++++++++++ .../completed/27_tabular_transforms.md | 122 +++++++++++++++ .../completed/28_serialization_format.md | 118 ++++++++++++++ .../completed/29_http_client.md | 96 ++++++++++++ .../completed/30_scheduling.md | 104 +++++++++++++ .../completed/31_cache_persistent.md | 124 +++++++++++++++ .../completed/32_diff_merge.md | 122 +++++++++++++++ .../completed/fg_01_extract_entities_llm.md | 73 +++++++++ .../completed/fg_02_extract_relations_llm.md | 64 ++++++++ .../completed/fg_03_deduplicate_entities.md | 50 ++++++ .../completed/fg_04_deduplicate_relations.md | 36 +++++ .../completed/fg_05_build_schema_prompt.md 
| 51 +++++++ .../completed/fg_06_normalize_entity_name.md | 41 +++++ .../fg_07_merge_entity_attributes.md | 34 +++++ .../completed/fg_08_extraction_pipeline.md | 71 +++++++++ .../completed/fg_09_type_extraction.md | 102 +++++++++++++ .../completed/mf_01_text_chunker.md | 34 +++++ .../completed/mf_02_preprocess_text.md | 36 +++++ .../completed/mf_03_to_pascal_case.md | 28 ++++ .../completed/mf_04_llm_client.md | 48 ++++++ .../completed/mf_05_setup_logger.md | 33 ++++ .../completed/mf_06_retry_decorator.md | 52 +++++++ .../completed/mf_07_cursor_paginator.md | 47 ++++++ .../completed/mf_08_locale.md | 42 +++++ .../completed/mf_09_extract_text.md | 29 ++++ .../completed/mf_10_react_agent.md | 60 ++++++++ .../completed/mf_11_batch_retry.md | 43 ++++++ .../completed/mf_12_type_task.md | 66 ++++++++ .../completed/mf_13_type_agent_action.md | 73 +++++++++ .../completed/mf_14_type_entity_node.md | 46 ++++++ dev/functions_to_implement/fg_00_README.md | 49 ++++++ dev/functions_to_implement/mf_00_README.md | 32 ++++ .../completed/001_jupyter_create_notebook.md | 27 ++++ .../002_jupyter_discover_root_dir.md | 20 +++ .../003_jupyter_tools_documentation.md | 29 ++++ ...004_jupyter_discover_multiple_instances.md | 20 +++ .../completed/005_jupyter_write_batch.md | 29 ++++ .../006_jupyter_exec_outputs_keyerror.md | 28 ++++ 64 files changed, 3680 insertions(+) create mode 100644 dev/functions_to_implement/00_README.md create mode 100644 dev/functions_to_implement/completed/01_parse_markdown.md create mode 100644 dev/functions_to_implement/completed/02_parse_pdf_to_markdown.md create mode 100644 dev/functions_to_implement/completed/03_parse_html_to_markdown.md create mode 100644 dev/functions_to_implement/completed/04_parse_docx_to_markdown.md create mode 100644 dev/functions_to_implement/completed/05_parse_excel_to_markdown.md create mode 100644 dev/functions_to_implement/completed/06_parse_epub_to_markdown.md create mode 100644 
dev/functions_to_implement/completed/07_directory_scanner.md create mode 100644 dev/functions_to_implement/completed/08_circuit_breaker.md create mode 100644 dev/functions_to_implement/completed/09_retry_with_classify.md create mode 100644 dev/functions_to_implement/completed/10_hotness_score.md create mode 100644 dev/functions_to_implement/completed/11_time_utils.md create mode 100644 dev/functions_to_implement/completed/12_safe_extract_zip.md create mode 100644 dev/functions_to_implement/completed/13_envelope_encryption.md create mode 100644 dev/functions_to_implement/completed/14_parse_code_ast.md create mode 100644 dev/functions_to_implement/completed/15_git_url_parser.md create mode 100644 dev/functions_to_implement/completed/16_media_strategy.md create mode 100644 dev/functions_to_implement/completed/17_parser_registry.md create mode 100644 dev/functions_to_implement/completed/18_read_file_with_encoding.md create mode 100644 dev/functions_to_implement/completed/19_type_parse_result.md create mode 100644 dev/functions_to_implement/completed/20_type_classified_file.md create mode 100644 dev/functions_to_implement/completed/21_type_code_entity.md create mode 100644 dev/functions_to_implement/completed/22_type_message.md create mode 100644 dev/functions_to_implement/completed/23_type_retrieval.md create mode 100644 dev/functions_to_implement/completed/24_type_memory.md create mode 100644 dev/functions_to_implement/completed/25_type_context.md create mode 100644 dev/functions_to_implement/completed/26_validation_schemas.md create mode 100644 dev/functions_to_implement/completed/27_tabular_transforms.md create mode 100644 dev/functions_to_implement/completed/28_serialization_format.md create mode 100644 dev/functions_to_implement/completed/29_http_client.md create mode 100644 dev/functions_to_implement/completed/30_scheduling.md create mode 100644 dev/functions_to_implement/completed/31_cache_persistent.md create mode 100644 
dev/functions_to_implement/completed/32_diff_merge.md create mode 100644 dev/functions_to_implement/completed/fg_01_extract_entities_llm.md create mode 100644 dev/functions_to_implement/completed/fg_02_extract_relations_llm.md create mode 100644 dev/functions_to_implement/completed/fg_03_deduplicate_entities.md create mode 100644 dev/functions_to_implement/completed/fg_04_deduplicate_relations.md create mode 100644 dev/functions_to_implement/completed/fg_05_build_schema_prompt.md create mode 100644 dev/functions_to_implement/completed/fg_06_normalize_entity_name.md create mode 100644 dev/functions_to_implement/completed/fg_07_merge_entity_attributes.md create mode 100644 dev/functions_to_implement/completed/fg_08_extraction_pipeline.md create mode 100644 dev/functions_to_implement/completed/fg_09_type_extraction.md create mode 100644 dev/functions_to_implement/completed/mf_01_text_chunker.md create mode 100644 dev/functions_to_implement/completed/mf_02_preprocess_text.md create mode 100644 dev/functions_to_implement/completed/mf_03_to_pascal_case.md create mode 100644 dev/functions_to_implement/completed/mf_04_llm_client.md create mode 100644 dev/functions_to_implement/completed/mf_05_setup_logger.md create mode 100644 dev/functions_to_implement/completed/mf_06_retry_decorator.md create mode 100644 dev/functions_to_implement/completed/mf_07_cursor_paginator.md create mode 100644 dev/functions_to_implement/completed/mf_08_locale.md create mode 100644 dev/functions_to_implement/completed/mf_09_extract_text.md create mode 100644 dev/functions_to_implement/completed/mf_10_react_agent.md create mode 100644 dev/functions_to_implement/completed/mf_11_batch_retry.md create mode 100644 dev/functions_to_implement/completed/mf_12_type_task.md create mode 100644 dev/functions_to_implement/completed/mf_13_type_agent_action.md create mode 100644 dev/functions_to_implement/completed/mf_14_type_entity_node.md create mode 100644 dev/functions_to_implement/fg_00_README.md create 
mode 100644 dev/functions_to_implement/mf_00_README.md create mode 100644 dev/issues/completed/001_jupyter_create_notebook.md create mode 100644 dev/issues/completed/002_jupyter_discover_root_dir.md create mode 100644 dev/issues/completed/003_jupyter_tools_documentation.md create mode 100644 dev/issues/completed/004_jupyter_discover_multiple_instances.md create mode 100644 dev/issues/completed/005_jupyter_write_batch.md create mode 100644 dev/issues/completed/006_jupyter_exec_outputs_keyerror.md diff --git a/dev/functions_to_implement/00_README.md b/dev/functions_to_implement/00_README.md new file mode 100644 index 00000000..049d1f14 --- /dev/null +++ b/dev/functions_to_implement/00_README.md @@ -0,0 +1,67 @@ +# Funciones a implementar desde OpenViking + +Fuente: `sources/OpenViking` (ByteDance) +Licencia principal: **AGPL-3.0** (NO permisiva) +Licencia bot/: **MIT** | Licencia examples/: **Apache 2.0** + +**Estrategia:** No copiar codigo. Reimplementar desde cero las funcionalidades +genericas y utiles, documentadas aqui como specs independientes. Las funciones +resultantes seran originales, sin dependencia alguna del codigo AGPL. 
+ +## Funciones + +| # | Archivo | Dominio | Funciones | +|---|---------|---------|-----------| +| 01 | parse_markdown.md | core | extract_frontmatter, find_headings, smart_split_content, estimate_token_count, sanitize_for_path | +| 02 | parse_pdf_to_markdown.md | core | pdf_to_markdown, extract_pdf_bookmarks, detect_headings_by_font, format_table_to_markdown | +| 03 | parse_html_to_markdown.md | core | html_to_markdown, detect_url_type, fetch_and_parse_url, convert_github_to_raw_url | +| 04 | parse_docx_to_markdown.md | core | docx_to_markdown | +| 05 | parse_excel_to_markdown.md | core | excel_to_markdown | +| 06 | parse_epub_to_markdown.md | core | epub_to_markdown | +| 07 | directory_scanner.md | infra | scan_directory | +| 08 | circuit_breaker.md | core | CircuitBreaker (class) | +| 09 | retry_with_classify.md | core | classify_api_error, compute_backoff_delay, retry_sync, retry_async | +| 10 | hotness_score.md | datascience | hotness_score | +| 11 | time_utils.md | core | parse_iso_datetime, format_iso8601, format_simplified | +| 12 | safe_extract_zip.md | infra | safe_extract_zip, normalize_zip_filenames | +| 13 | envelope_encryption.md | cybersecurity | envelope_encrypt, envelope_decrypt | +| 14 | parse_code_ast.md | core | parse_code_ast (tree-sitter multi-language) | +| 15 | git_url_parser.md | core | parse_git_url, is_git_repo_url, validate_git_ssh_uri | +| 16 | media_strategy.md | core | calculate_media_strategy | +| 17 | parser_registry.md | core | ParserRegistry (patron extensible) | +| 18 | read_file_with_encoding.md | infra | read_file_with_encoding | + +## Tipos + +| # | Archivo | Dominio | Tipos | +|---|---------|---------|-------| +| 19 | type_parse_result.md | core | NodeType (sum), ResourceNode (product), ParseResult (product) | +| 20 | type_classified_file.md | infra | ClassifiedFile (product), DirectoryScanResult (product) | +| 21 | type_code_entity.md | core | CodeEntity (product) | +| 22 | type_message.md | core | TextPart, ContextPart, 
ToolPart, Part (sum), Message (product) | +| 23 | type_retrieval.md | core | ContextType (sum), TypedQuery, QueryPlan, MatchedContext, ScoreDistribution, QueryResult, FindResult | +| 24 | type_memory.md | core | FieldType (sum), MergeOp (sum), MemoryField, MemoryTypeSchema, MemoryData | +| 25 | type_context.md | core | ResourceContentType (sum), ContextLevel (sum), Context (product) | + +## Gaps identificados (specs propias) + +Funciones que faltan en el registry para completar capacidades de ingesta, +operaciones y automatizacion. Diseño original, sin fuente externa. + +### Funciones + +| # | Archivo | Dominio | Funciones | +|---|---------|---------|-----------| +| 26 | validation_schemas.md | core | validate_json_schema, validate_struct_fields, coerce_types | +| 27 | tabular_transforms.md | datascience/core | pivot, melt, join_by_key, aggregate_by_group | +| 28 | serialization_format.md | core | to_csv, from_csv, to_jsonl, from_jsonl, render_template, generate_html_report | +| 29 | http_client.md | infra | http_get_json, http_post_json, http_download_file | +| 30 | scheduling.md | core/infra | parse_cron_expr, next_cron_time, cron_ticker | +| 31 | cache_persistent.md | infra/core | cache_to_sqlite, cache_to_file, cache_decorator | +| 32 | diff_merge.md | datascience | diff_entities, diff_relations, detect_drift, merge_graphs | + +### Tipos + +| # | Archivo | Dominio | Tipos | +|---|---------|---------|-------| +| 30 | scheduling.md | core | CronSchedule (product) | diff --git a/dev/functions_to_implement/completed/01_parse_markdown.md b/dev/functions_to_implement/completed/01_parse_markdown.md new file mode 100644 index 00000000..805776d8 --- /dev/null +++ b/dev/functions_to_implement/completed/01_parse_markdown.md @@ -0,0 +1,80 @@ +# Parse Markdown — Suite de funciones + +Fuente conceptual: OpenViking `openviking/parse/parsers/markdown.py` (AGPL-3.0) +Reimplementar desde cero. No copiar codigo. + +## Funciones a implementar + +### 1. 
extract_frontmatter + +- **Dominio:** core +- **Lang:** Python +- **Purity:** pure +- **Signature:** `extract_frontmatter(content: str) -> tuple[str, dict | None]` +- **Descripcion:** Extrae YAML frontmatter (delimitado por `---`) del inicio de un string markdown. Retorna el contenido sin frontmatter y el dict parseado (o None si no hay). +- **Algoritmo:** + 1. Regex `^---\n(.*?)\n---\n` con DOTALL + 2. Si match, parsear cada linea `key: value` (o usar `yaml.safe_load`) + 3. Retornar (contenido_restante, dict_frontmatter) +- **Deps:** `re`, opcionalmente `yaml` +- **Tests:** contenido con frontmatter, sin frontmatter, frontmatter vacio, frontmatter con listas + +### 2. find_headings + +- **Dominio:** core +- **Lang:** Python +- **Purity:** pure +- **Signature:** `find_headings(content: str) -> list[tuple[int, int, str, int]]` +- **Descripcion:** Encuentra todos los headings markdown (# a ######), excluyendo los que estan dentro de code blocks (``` ... ```), HTML comments (`<!-- ... -->`) y bloques indentados. +- **Retorno:** Lista de `(start_pos, end_pos, title, level)` +- **Algoritmo:** + 1. Recopilar rangos excluidos: code blocks (triple backtick), HTML comments, bloques indentados (4 espacios/tab) + 2. Regex `^(#{1,6})\s+(.+)$` con MULTILINE + 3. Filtrar matches cuya posicion cae en un rango excluido + 4. Filtrar headings escapados (`\#`) +- **Deps:** `re` +- **Tests:** headings normales, headings dentro de code blocks (no deben detectarse), headings escapados, headings en HTML comments + +### 3. smart_split_content + +- **Dominio:** core +- **Lang:** Python +- **Purity:** pure +- **Signature:** `smart_split_content(content: str, max_tokens: int = 1024, max_chars: int = 8000) -> list[str]` +- **Descripcion:** Divide contenido grande en partes respetando limites de tokens y caracteres. Divide por parrafos (doble newline). Si un parrafo individual excede el limite, lo corta por caracteres. +- **Algoritmo:** + 1. Split por `\n\n` (parrafos) + 2. 
Acumular parrafos mientras no excedan max_tokens/max_chars + 3. Si un parrafo individual excede, cortarlo en chunks de max_chars + 4. Retornar lista de partes +- **Deps:** `re` +- **Tests:** contenido corto (1 parte), contenido largo, parrafo gigante que requiere forzar corte + +### 4. estimate_token_count + +- **Dominio:** core +- **Lang:** Python +- **Purity:** pure +- **Signature:** `estimate_token_count(content: str) -> int` +- **Descripcion:** Estimacion rapida de tokens sin tokenizer. CJK chars ~0.7 token/char, otros ~0.3 token/char. +- **Algoritmo:** + 1. Contar chars CJK con regex `[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]` + 2. Contar otros non-whitespace chars + 3. Retornar `int(cjk * 0.7 + otros * 0.3)` +- **Deps:** `re` +- **Tests:** texto solo latin, texto solo CJK, texto mixto, texto vacio + +### 5. sanitize_for_path + +- **Dominio:** core +- **Lang:** Python +- **Purity:** pure +- **Signature:** `sanitize_for_path(text: str, max_length: int = 50) -> str` +- **Descripcion:** Convierte texto a nombre seguro para uso en paths. Remueve caracteres especiales, reemplaza espacios con `_`, trunca con hash suffix si excede max_length. +- **Algoritmo:** + 1. Regex: mantener solo `\w`, CJK ranges, espacios y guiones + 2. Reemplazar espacios por `_`, strip underscores + 3. Si vacio, retornar "section" + 4. Si excede max_length: truncar y anadir `_` + sha256[:8] +- **Deps:** `re`, `hashlib` +- **Tests:** texto normal, texto con caracteres especiales, texto muy largo, texto vacio, texto CJK diff --git a/dev/functions_to_implement/completed/02_parse_pdf_to_markdown.md b/dev/functions_to_implement/completed/02_parse_pdf_to_markdown.md new file mode 100644 index 00000000..5ae625a5 --- /dev/null +++ b/dev/functions_to_implement/completed/02_parse_pdf_to_markdown.md @@ -0,0 +1,74 @@ +# Parse PDF to Markdown + +Fuente conceptual: OpenViking `openviking/parse/parsers/pdf.py` (AGPL-3.0) +Reimplementar desde cero. No copiar codigo. + +## Funciones a implementar + +### 1. 
pdf_to_markdown + +- **Dominio:** core +- **Lang:** Python +- **Purity:** impure (I/O: lee archivo PDF) +- **Signature:** `pdf_to_markdown(pdf_path: str, heading_detection: str = "auto") -> tuple[str, dict]` +- **Descripcion:** Convierte un PDF a markdown. Extrae texto, tablas e inyecta headings detectados desde bookmarks o analisis de fuentes. Retorna (markdown_content, metadata_dict). +- **Algoritmo:** + 1. Abrir PDF con pdfplumber + 2. Detectar headings: + - `"bookmarks"`: extraer outlines/bookmarks del PDF + - `"font"`: analizar distribucion de font sizes para detectar headings + - `"auto"`: intentar bookmarks primero, fallback a font + 3. Agrupar headings por pagina + 4. Por cada pagina: inyectar headings como `# titulo`, extraer texto, extraer tablas como markdown + 5. Unir todo con `\n\n` +- **Deps:** `pdfplumber` +- **Error type:** Exception (archivo no encontrado, PDF corrupto) +- **Tests:** PDF con bookmarks, PDF sin bookmarks, PDF con tablas, PDF vacio + +### 2. extract_pdf_bookmarks + +- **Dominio:** core +- **Lang:** Python +- **Purity:** impure (requiere objeto PDF abierto) +- **Signature:** `extract_pdf_bookmarks(pdf) -> list[dict]` +- **Descripcion:** Extrae la estructura de bookmarks/outlines de un PDF abierto con pdfplumber. Retorna lista de `{"level": int, "title": str, "page_num": int | None}`. +- **Algoritmo:** + 1. Acceder a `pdf.doc.get_outlines()` + 2. Construir mapping `objid -> page_number` desde pdf.pages + 3. Resolver cada destino de outline a su numero de pagina + 4. Limitar nivel a [1, 6] +- **Deps:** `pdfplumber` +- **Tests:** PDF con outlines, PDF sin outlines + +### 3. detect_headings_by_font + +- **Dominio:** core +- **Lang:** Python +- **Purity:** impure (requiere objeto PDF abierto) +- **Signature:** `detect_headings_by_font(pdf, min_delta: float = 2.0, max_levels: int = 4) -> list[dict]` +- **Descripcion:** Detecta headings analizando la distribucion de font sizes. 
El font size mas comun es el body; sizes significativamente mayores son headings. +- **Algoritmo:** + 1. Samplear font sizes (cada 5ta pagina) → Counter + 2. Determinar body_size (most_common) + 3. Heading sizes: font sizes >= body_size + min_delta Y con frecuencia < 50% del body + 4. Ordenar heading sizes desc → asignar niveles 1,2,3... + 5. Recorrer todas las paginas, extraer texto de chars con heading size + 6. Deduplicar: filtrar titulos que aparecen en >30% de paginas (son headers/footers) +- **Deps:** `pdfplumber`, `collections.Counter` +- **Tests:** PDF con headings claros por font, PDF con fuente uniforme (sin headings detectados) + +### 4. format_table_to_markdown (reutilizable) + +- **Dominio:** core +- **Lang:** Python +- **Purity:** pure +- **Signature:** `format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str` +- **Descripcion:** Convierte una lista 2D de celdas a tabla markdown con alineacion de columnas. +- **Algoritmo:** + 1. Calcular max width por columna + 2. Formatear header row con `|` + 3. Anadir separador `| --- | --- |` + 4. Formatear data rows + 5. Escapar pipes en celdas +- **Deps:** ninguna +- **Tests:** tabla normal, tabla con celdas vacias, tabla con 1 fila, tabla vacia, celdas con pipes diff --git a/dev/functions_to_implement/completed/03_parse_html_to_markdown.md b/dev/functions_to_implement/completed/03_parse_html_to_markdown.md new file mode 100644 index 00000000..79e9a562 --- /dev/null +++ b/dev/functions_to_implement/completed/03_parse_html_to_markdown.md @@ -0,0 +1,66 @@ +# Parse HTML to Markdown + +Fuente conceptual: OpenViking `openviking/parse/parsers/html.py` (AGPL-3.0) +Reimplementar desde cero. No copiar codigo. + +## Funciones a implementar + +### 1. html_to_markdown + +- **Dominio:** core +- **Lang:** Python +- **Purity:** pure +- **Signature:** `html_to_markdown(html: str) -> str` +- **Descripcion:** Convierte HTML a markdown. 
Usa readabilipy para extraer contenido principal (filtra nav, ads, boilerplate), luego markdownify para convertir a markdown. +- **Algoritmo:** + 1. Preprocesar HTML: manejar contenido oculto (ej: WeChat js_content), lazy loading images (data-src → src) + 2. Extraer contenido principal con readabilipy (basado en Mozilla Readability) + 3. Convertir a markdown con markdownify (headings ATX, strip script/style) +- **Deps:** `readabilipy`, `markdownify`, `beautifulsoup4` +- **Tests:** HTML con nav/footer (debe filtrarse), HTML limpio, HTML con imagenes lazy-loaded + +### 2. detect_url_type + +- **Dominio:** core +- **Lang:** Python +- **Purity:** impure (hace HTTP HEAD request) +- **Signature:** `detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]` +- **Descripcion:** Detecta el tipo de contenido de una URL. Retorna tipo ("webpage", "pdf", "markdown", "text", "code_repository") y metadata. +- **Algoritmo:** + 1. Verificar patrones de repos de codigo (github.com/org/repo, git@...) + 2. Verificar extension en URL (.pdf, .md, .txt, .html, .git) + 3. Si no se determino: HTTP HEAD request → leer Content-Type header + 4. Default: "webpage" +- **Deps:** `httpx`, `urllib.parse` +- **Error type:** Exception (timeout, network error) +- **Tests:** URL .pdf, URL github repo, URL webpage, URL con Content-Type custom + +### 3. fetch_and_parse_url + +- **Dominio:** core +- **Lang:** Python +- **Purity:** impure (I/O: HTTP request) +- **Signature:** `fetch_and_parse_url(url: str, timeout: float = 30.0) -> str` +- **Descripcion:** Descarga una pagina web y la convierte a markdown. Combina fetch HTML + html_to_markdown. +- **Algoritmo:** + 1. Detectar tipo de URL con detect_url_type + 2. Si webpage: fetch HTML con httpx, convertir a markdown + 3. Si download link: descargar a archivo temporal, delegar al parser apropiado + 4. Si code repository: delegar a parser de codigo +- **Deps:** `httpx` +- **Error type:** Exception + +### 4. 
convert_github_to_raw_url + +- **Dominio:** core +- **Lang:** Python +- **Purity:** pure +- **Signature:** `convert_github_to_raw_url(url: str) -> str` +- **Descripcion:** Convierte una URL de blob de GitHub/GitLab a su URL raw. Ej: `github.com/org/repo/blob/main/file.py` → `raw.githubusercontent.com/org/repo/main/file.py` +- **Algoritmo:** + 1. Parsear URL + 2. Si GitHub y path contiene `/blob/`: remover `/blob/` y cambiar dominio a raw.githubusercontent.com + 3. Si GitLab y path contiene `/blob/`: reemplazar por `/raw/` + 4. Si no aplica, retornar URL sin cambiar +- **Deps:** `urllib.parse` +- **Tests:** URL GitHub blob, URL GitLab blob, URL que no es blob, URL no-GitHub diff --git a/dev/functions_to_implement/completed/04_parse_docx_to_markdown.md b/dev/functions_to_implement/completed/04_parse_docx_to_markdown.md new file mode 100644 index 00000000..e460f0e0 --- /dev/null +++ b/dev/functions_to_implement/completed/04_parse_docx_to_markdown.md @@ -0,0 +1,30 @@ +# Parse DOCX to Markdown + +Fuente conceptual: OpenViking `openviking/parse/parsers/word.py` (AGPL-3.0) +Reimplementar desde cero. No copiar codigo. + +## Funcion a implementar + +### docx_to_markdown + +- **Dominio:** core +- **Lang:** Python +- **Purity:** impure (I/O: lee archivo .docx) +- **Signature:** `docx_to_markdown(docx_path: str) -> str` +- **Descripcion:** Convierte un documento Word (.docx) a markdown preservando estructura, formato y tablas en su posicion original. +- **Algoritmo:** + 1. Abrir con python-docx + 2. Construir mapa de tablas: `{element_xml: Table}` para lookup O(1) + 3. Recorrer `doc.element.body` en orden (preserva posicion de tablas): + - Si es parrafo (`w:p`): + - Si estilo es Heading N: `{'#' * level} {text}` + - Si no: convertir runs con formato (bold→`**`, italic→`*`, underline→`<u>`) + - Si es tabla (`w:tbl`): convertir filas a markdown table con `format_table_to_markdown` + 4. 
Unir partes con `\n\n` +- **Deps:** `python-docx` +- **Error type:** Exception (archivo no encontrado, formato invalido) +- **Notas:** + - Las tablas deben aparecer en su posicion original, no al final + - Los heading levels se extraen del nombre del estilo ("Heading 1" → nivel 1) + - El formato inline (bold/italic/underline) se preserva +- **Tests:** docx con headings y parrafos, docx con tablas intercaladas, docx con formato bold/italic, docx vacio diff --git a/dev/functions_to_implement/completed/05_parse_excel_to_markdown.md b/dev/functions_to_implement/completed/05_parse_excel_to_markdown.md new file mode 100644 index 00000000..da0cdcd6 --- /dev/null +++ b/dev/functions_to_implement/completed/05_parse_excel_to_markdown.md @@ -0,0 +1,32 @@ +# Parse Excel to Markdown + +Fuente conceptual: OpenViking `openviking/parse/parsers/excel.py` (AGPL-3.0) +Reimplementar desde cero. No copiar codigo. + +## Funcion a implementar + +### excel_to_markdown + +- **Dominio:** core +- **Lang:** Python +- **Purity:** impure (I/O: lee archivo) +- **Signature:** `excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str` +- **Descripcion:** Convierte un archivo Excel (.xlsx, .xls, .xlsm) a markdown con cada sheet como seccion H2. +- **Algoritmo:** + 1. Si extension `.xls`: usar xlrd (legacy) + 2. Si `.xlsx`/`.xlsm`: usar openpyxl + 3. Para cada sheet: + - Header: `## Sheet: {name}` + - Metadata: `**Dimensions:** {rows} x {cols}` + - Truncar a max_rows_per_sheet + - Convertir filas a tabla markdown + - Si truncado: anadir nota de filas omitidas + 4. Manejo de tipos de celda para xlrd: + - EMPTY/BLANK → "" + - DATE → formato ISO (con hora si no es 00:00:00) + - BOOLEAN → "TRUE"/"FALSE" + - ERROR → codigo de error Excel (#NULL!, #DIV/0!, etc.) 
+ - NUMBER → entero si es entero, float si tiene decimales +- **Deps:** `openpyxl` (xlsx), `xlrd` (xls legacy) +- **Error type:** Exception +- **Tests:** xlsx con multiples sheets, xls legacy con fechas, sheet vacio, sheet con formulas (data_only), sheet truncado diff --git a/dev/functions_to_implement/completed/06_parse_epub_to_markdown.md b/dev/functions_to_implement/completed/06_parse_epub_to_markdown.md new file mode 100644 index 00000000..0b1e913b --- /dev/null +++ b/dev/functions_to_implement/completed/06_parse_epub_to_markdown.md @@ -0,0 +1,36 @@ +# Parse EPUB to Markdown + +Fuente conceptual: OpenViking `openviking/parse/parsers/epub.py` (AGPL-3.0) +Reimplementar desde cero. No copiar codigo. + +## Funcion a implementar + +### epub_to_markdown + +- **Dominio:** core +- **Lang:** Python +- **Purity:** impure (I/O: lee archivo) +- **Signature:** `epub_to_markdown(epub_path: str) -> str` +- **Descripcion:** Convierte un ebook EPUB a markdown. Intenta ebooklib primero, fallback a extraccion manual con zipfile. +- **Algoritmo con ebooklib:** + 1. Leer con `epub.read_epub(path)` + 2. Extraer metadata: titulo, autor + 3. Generar header: `# {titulo}` + `**Author:** {autor}` + 4. Para cada ITEM_DOCUMENT: decodificar contenido HTML → convertir a markdown + 5. Unir con `\n\n` +- **Algoritmo fallback manual (sin ebooklib):** + 1. Abrir como ZIP + 2. Listar archivos .html/.xhtml/.htm + 3. Para cada uno: decodificar HTML → convertir a markdown basico +- **Conversion HTML basica a markdown:** + 1. Remover `