From 25a392df481ea6ffdf69d8d9a26acc71c6bf440f Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 5 Apr 2026 17:11:21 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20funciones=20Python=20core=20=E2=80=94?= =?UTF-8?q?=20parsers,=20formatters,=20retry,=20serializaci=C3=B3n,=20LLM?= =?UTF-8?q?=20utils=20y=20m=C3=A1s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 178 archivos: módulo core.py actualizado + ~80 funciones nuevas con tests. Incluye: parse_llm_json, extract_text_from_file, retry_with_backoff, circuit_breaker, from_csv/to_csv, from_jsonl/to_jsonl, html_to_markdown, pdf_to_markdown, docx/epub/excel converters, cache_decorator, react_loop, task_manager, template rendering, entre otros. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../functions/core/build_tree_from_headers.md | 48 ++ python/functions/core/cache_decorator.md | 57 ++ python/functions/core/cache_decorator.py | 67 ++ python/functions/core/cache_decorator_test.py | 96 +++ .../core/calculate_media_strategy.md | 48 ++ .../core/calculate_media_strategy.py | 24 + .../core/calculate_media_strategy_test.py | 23 + .../functions/core/calculate_page_offset.md | 40 + .../functions/core/call_batch_with_retry.md | 55 ++ .../functions/core/call_batch_with_retry.py | 81 +++ .../core/call_batch_with_retry_test.py | 102 +++ python/functions/core/circuit_breaker.md | 66 ++ python/functions/core/circuit_breaker.py | 141 ++++ python/functions/core/circuit_breaker_test.py | 156 ++++ python/functions/core/classify_api_error.md | 41 ++ python/functions/core/classify_api_error.py | 38 + .../functions/core/classify_api_error_test.py | 50 ++ python/functions/core/coerce_types.md | 49 ++ python/functions/core/coerce_types.py | 135 ++++ python/functions/core/coerce_types_test.py | 84 +++ .../functions/core/compute_backoff_delay.md | 41 ++ .../functions/core/compute_backoff_delay.py | 26 + .../core/compute_backoff_delay_test.py | 42 ++ .../core/convert_github_to_raw_url.md | 59 ++ 
.../core/convert_github_to_raw_url.py | 69 ++ .../core/convert_github_to_raw_url_test.py | 77 ++ python/functions/core/core.py | 681 +++++++++++++++++- python/functions/core/create_node_mapping.md | 36 + python/functions/core/cursor_paginate.md | 66 ++ python/functions/core/cursor_paginate.py | 105 +++ python/functions/core/cursor_paginate_test.py | 148 ++++ .../functions/core/detect_headings_by_font.md | 37 + .../functions/core/detect_headings_by_font.py | 135 ++++ python/functions/core/detect_url_type.md | 59 ++ python/functions/core/detect_url_type.py | 144 ++++ python/functions/core/detect_url_type_test.py | 89 +++ python/functions/core/docx_to_markdown.md | 40 + python/functions/core/docx_to_markdown.py | 153 ++++ .../functions/core/docx_to_markdown_test.py | 129 ++++ python/functions/core/epub_to_markdown.md | 52 ++ python/functions/core/epub_to_markdown.py | 128 ++++ .../functions/core/epub_to_markdown_test.py | 163 +++++ python/functions/core/estimate_token_count.md | 37 + python/functions/core/excel_to_markdown.md | 58 ++ python/functions/core/excel_to_markdown.py | 211 ++++++ .../functions/core/excel_to_markdown_test.py | 142 ++++ python/functions/core/extract_frontmatter.md | 43 ++ .../functions/core/extract_json_from_llm.md | 36 + .../core/extract_markdown_headers.md | 36 + .../functions/core/extract_pdf_bookmarks.md | 37 + .../functions/core/extract_pdf_bookmarks.py | 63 ++ python/functions/core/extract_pdf_text.md | 35 + python/functions/core/extract_pdf_text.py | 19 + .../functions/core/extract_text_from_file.md | 51 ++ .../functions/core/extract_text_from_file.py | 92 +++ .../core/extract_text_from_file_test.py | 83 +++ python/functions/core/fetch_and_parse_url.md | 50 ++ python/functions/core/fetch_and_parse_url.py | 64 ++ python/functions/core/find_headings.md | 38 + python/functions/core/flatten_tree.md | 36 + python/functions/core/format_iso8601.md | 49 ++ python/functions/core/format_iso8601.py | 24 + 
python/functions/core/format_iso8601_test.py | 28 + python/functions/core/format_simplified.md | 54 ++ python/functions/core/format_simplified.py | 25 + .../functions/core/format_simplified_test.py | 30 + .../core/format_table_to_markdown.md | 36 + .../core/format_table_to_markdown.py | 52 ++ .../core/format_table_to_markdown_test.py | 63 ++ .../functions/core/format_tree_structure.md | 36 + python/functions/core/from_csv.md | 49 ++ python/functions/core/from_csv.py | 83 +++ python/functions/core/from_csv_test.py | 40 + python/functions/core/from_jsonl.md | 49 ++ python/functions/core/from_jsonl.py | 35 + python/functions/core/from_jsonl_test.py | 25 + python/functions/core/generate_html_report.md | 70 ++ python/functions/core/generate_html_report.py | 164 +++++ .../core/generate_html_report_test.py | 71 ++ python/functions/core/get_leaf_nodes.md | 36 + python/functions/core/get_pdf_page_tokens.md | 40 + python/functions/core/get_pdf_page_tokens.py | 47 ++ python/functions/core/get_text_stats.md | 32 + python/functions/core/get_text_stats_test.py | 21 + python/functions/core/html_to_markdown.md | 66 ++ python/functions/core/html_to_markdown.py | 272 +++++++ .../functions/core/html_to_markdown_test.py | 90 +++ python/functions/core/is_git_repo_url.md | 48 ++ python/functions/core/join_by_key.md | 47 ++ python/functions/core/join_by_key.py | 95 +++ python/functions/core/join_by_key_test.py | 72 ++ python/functions/core/list_to_tree.md | 41 ++ .../functions/core/llm_acompletion_retry.md | 40 + .../functions/core/llm_acompletion_retry.py | 43 ++ python/functions/core/llm_completion_retry.md | 43 ++ python/functions/core/llm_completion_retry.py | 52 ++ python/functions/core/load_translations.md | 43 ++ python/functions/core/load_translations.py | 46 ++ .../functions/core/load_translations_test.py | 80 ++ .../functions/core/merge_entity_attributes.md | 67 ++ .../functions/core/merge_entity_attributes.py | 78 ++ .../core/merge_entity_attributes_test.py | 102 +++ 
python/functions/core/next_cron_time.md | 49 ++ python/functions/core/next_cron_time.py | 105 +++ python/functions/core/next_cron_time_test.py | 41 ++ .../functions/core/normalize_entity_name.md | 77 ++ .../functions/core/normalize_entity_name.py | 81 +++ .../core/normalize_entity_name_test.py | 70 ++ python/functions/core/page_list_to_groups.md | 37 + python/functions/core/parse_code_ast.md | 115 +++ python/functions/core/parse_code_ast.py | 384 ++++++++++ python/functions/core/parse_code_ast_test.py | 220 ++++++ python/functions/core/parse_cron_expr.md | 48 ++ python/functions/core/parse_cron_expr.py | 112 +++ python/functions/core/parse_cron_expr_test.py | 45 ++ python/functions/core/parse_git_url.md | 46 ++ python/functions/core/parse_git_url_test.py | 104 +++ python/functions/core/parse_iso_datetime.md | 49 ++ python/functions/core/parse_iso_datetime.py | 25 + .../functions/core/parse_iso_datetime_test.py | 41 ++ python/functions/core/parse_llm_json.md | 42 ++ python/functions/core/parse_llm_json.py | 33 + python/functions/core/parse_llm_json_test.py | 25 + python/functions/core/parse_markdown_test.py | 184 +++++ python/functions/core/parse_page_range.md | 38 + python/functions/core/parser_registry.md | 65 ++ python/functions/core/parser_registry.py | 225 ++++++ python/functions/core/parser_registry_test.py | 162 +++++ python/functions/core/pdf_to_markdown.md | 46 ++ python/functions/core/pdf_to_markdown.py | 121 ++++ python/functions/core/preprocess_text.md | 33 + python/functions/core/preprocess_text_test.py | 24 + python/functions/core/react_loop.md | 70 ++ python/functions/core/react_loop.py | 133 ++++ python/functions/core/react_loop_test.py | 127 ++++ python/functions/core/remove_tree_fields.md | 36 + python/functions/core/render_template.md | 67 ++ python/functions/core/render_template.py | 142 ++++ python/functions/core/render_template_test.py | 57 ++ python/functions/core/retry_async.md | 45 ++ python/functions/core/retry_async.py | 52 ++ 
python/functions/core/retry_sync.md | 44 ++ python/functions/core/retry_sync.py | 52 ++ python/functions/core/retry_with_backoff.md | 50 ++ python/functions/core/retry_with_backoff.py | 100 +++ .../core/retry_with_backoff_async.md | 51 ++ .../functions/core/retry_with_backoff_test.py | 195 +++++ python/functions/core/sanitize_for_path.md | 40 + python/functions/core/smart_split_content.md | 41 ++ .../functions/core/split_text_into_chunks.md | 46 ++ .../functions/core/split_text_into_chunks.py | 66 ++ .../core/split_text_into_chunks_test.py | 64 ++ .../core/strip_markdown_codeblock.md | 37 + .../core/strip_markdown_codeblock.py | 24 + .../core/strip_markdown_codeblock_test.py | 28 + python/functions/core/strip_think_tags.md | 37 + python/functions/core/strip_think_tags.py | 20 + .../functions/core/strip_think_tags_test.py | 30 + python/functions/core/t.md | 58 ++ python/functions/core/t.py | 91 +++ python/functions/core/t_test.py | 83 +++ python/functions/core/task_manager.md | 58 ++ python/functions/core/task_manager.py | 176 +++++ python/functions/core/to_csv.md | 48 ++ python/functions/core/to_csv.py | 53 ++ python/functions/core/to_csv_test.py | 48 ++ python/functions/core/to_jsonl.md | 43 ++ python/functions/core/to_jsonl.py | 23 + python/functions/core/to_jsonl_test.py | 34 + python/functions/core/to_pascal_case.md | 48 ++ python/functions/core/to_pascal_case.py | 28 + python/functions/core/to_pascal_case_test.py | 35 + python/functions/core/tree_to_flat_list.md | 36 + python/functions/core/validate_git_ssh_uri.md | 45 ++ python/functions/core/validate_json_schema.md | 58 ++ python/functions/core/validate_json_schema.py | 134 ++++ .../core/validate_json_schema_test.py | 129 ++++ python/functions/core/write_node_ids.md | 37 + 178 files changed, 13060 insertions(+), 1 deletion(-) create mode 100644 python/functions/core/build_tree_from_headers.md create mode 100644 python/functions/core/cache_decorator.md create mode 100644 
python/functions/core/cache_decorator.py create mode 100644 python/functions/core/cache_decorator_test.py create mode 100644 python/functions/core/calculate_media_strategy.md create mode 100644 python/functions/core/calculate_media_strategy.py create mode 100644 python/functions/core/calculate_media_strategy_test.py create mode 100644 python/functions/core/calculate_page_offset.md create mode 100644 python/functions/core/call_batch_with_retry.md create mode 100644 python/functions/core/call_batch_with_retry.py create mode 100644 python/functions/core/call_batch_with_retry_test.py create mode 100644 python/functions/core/circuit_breaker.md create mode 100644 python/functions/core/circuit_breaker.py create mode 100644 python/functions/core/circuit_breaker_test.py create mode 100644 python/functions/core/classify_api_error.md create mode 100644 python/functions/core/classify_api_error.py create mode 100644 python/functions/core/classify_api_error_test.py create mode 100644 python/functions/core/coerce_types.md create mode 100644 python/functions/core/coerce_types.py create mode 100644 python/functions/core/coerce_types_test.py create mode 100644 python/functions/core/compute_backoff_delay.md create mode 100644 python/functions/core/compute_backoff_delay.py create mode 100644 python/functions/core/compute_backoff_delay_test.py create mode 100644 python/functions/core/convert_github_to_raw_url.md create mode 100644 python/functions/core/convert_github_to_raw_url.py create mode 100644 python/functions/core/convert_github_to_raw_url_test.py create mode 100644 python/functions/core/create_node_mapping.md create mode 100644 python/functions/core/cursor_paginate.md create mode 100644 python/functions/core/cursor_paginate.py create mode 100644 python/functions/core/cursor_paginate_test.py create mode 100644 python/functions/core/detect_headings_by_font.md create mode 100644 python/functions/core/detect_headings_by_font.py create mode 100644 
python/functions/core/detect_url_type.md create mode 100644 python/functions/core/detect_url_type.py create mode 100644 python/functions/core/detect_url_type_test.py create mode 100644 python/functions/core/docx_to_markdown.md create mode 100644 python/functions/core/docx_to_markdown.py create mode 100644 python/functions/core/docx_to_markdown_test.py create mode 100644 python/functions/core/epub_to_markdown.md create mode 100644 python/functions/core/epub_to_markdown.py create mode 100644 python/functions/core/epub_to_markdown_test.py create mode 100644 python/functions/core/estimate_token_count.md create mode 100644 python/functions/core/excel_to_markdown.md create mode 100644 python/functions/core/excel_to_markdown.py create mode 100644 python/functions/core/excel_to_markdown_test.py create mode 100644 python/functions/core/extract_frontmatter.md create mode 100644 python/functions/core/extract_json_from_llm.md create mode 100644 python/functions/core/extract_markdown_headers.md create mode 100644 python/functions/core/extract_pdf_bookmarks.md create mode 100644 python/functions/core/extract_pdf_bookmarks.py create mode 100644 python/functions/core/extract_pdf_text.md create mode 100644 python/functions/core/extract_pdf_text.py create mode 100644 python/functions/core/extract_text_from_file.md create mode 100644 python/functions/core/extract_text_from_file.py create mode 100644 python/functions/core/extract_text_from_file_test.py create mode 100644 python/functions/core/fetch_and_parse_url.md create mode 100644 python/functions/core/fetch_and_parse_url.py create mode 100644 python/functions/core/find_headings.md create mode 100644 python/functions/core/flatten_tree.md create mode 100644 python/functions/core/format_iso8601.md create mode 100644 python/functions/core/format_iso8601.py create mode 100644 python/functions/core/format_iso8601_test.py create mode 100644 python/functions/core/format_simplified.md create mode 100644 
python/functions/core/format_simplified.py create mode 100644 python/functions/core/format_simplified_test.py create mode 100644 python/functions/core/format_table_to_markdown.md create mode 100644 python/functions/core/format_table_to_markdown.py create mode 100644 python/functions/core/format_table_to_markdown_test.py create mode 100644 python/functions/core/format_tree_structure.md create mode 100644 python/functions/core/from_csv.md create mode 100644 python/functions/core/from_csv.py create mode 100644 python/functions/core/from_csv_test.py create mode 100644 python/functions/core/from_jsonl.md create mode 100644 python/functions/core/from_jsonl.py create mode 100644 python/functions/core/from_jsonl_test.py create mode 100644 python/functions/core/generate_html_report.md create mode 100644 python/functions/core/generate_html_report.py create mode 100644 python/functions/core/generate_html_report_test.py create mode 100644 python/functions/core/get_leaf_nodes.md create mode 100644 python/functions/core/get_pdf_page_tokens.md create mode 100644 python/functions/core/get_pdf_page_tokens.py create mode 100644 python/functions/core/get_text_stats.md create mode 100644 python/functions/core/get_text_stats_test.py create mode 100644 python/functions/core/html_to_markdown.md create mode 100644 python/functions/core/html_to_markdown.py create mode 100644 python/functions/core/html_to_markdown_test.py create mode 100644 python/functions/core/is_git_repo_url.md create mode 100644 python/functions/core/join_by_key.md create mode 100644 python/functions/core/join_by_key.py create mode 100644 python/functions/core/join_by_key_test.py create mode 100644 python/functions/core/list_to_tree.md create mode 100644 python/functions/core/llm_acompletion_retry.md create mode 100644 python/functions/core/llm_acompletion_retry.py create mode 100644 python/functions/core/llm_completion_retry.md create mode 100644 python/functions/core/llm_completion_retry.py create mode 100644 
python/functions/core/load_translations.md create mode 100644 python/functions/core/load_translations.py create mode 100644 python/functions/core/load_translations_test.py create mode 100644 python/functions/core/merge_entity_attributes.md create mode 100644 python/functions/core/merge_entity_attributes.py create mode 100644 python/functions/core/merge_entity_attributes_test.py create mode 100644 python/functions/core/next_cron_time.md create mode 100644 python/functions/core/next_cron_time.py create mode 100644 python/functions/core/next_cron_time_test.py create mode 100644 python/functions/core/normalize_entity_name.md create mode 100644 python/functions/core/normalize_entity_name.py create mode 100644 python/functions/core/normalize_entity_name_test.py create mode 100644 python/functions/core/page_list_to_groups.md create mode 100644 python/functions/core/parse_code_ast.md create mode 100644 python/functions/core/parse_code_ast.py create mode 100644 python/functions/core/parse_code_ast_test.py create mode 100644 python/functions/core/parse_cron_expr.md create mode 100644 python/functions/core/parse_cron_expr.py create mode 100644 python/functions/core/parse_cron_expr_test.py create mode 100644 python/functions/core/parse_git_url.md create mode 100644 python/functions/core/parse_git_url_test.py create mode 100644 python/functions/core/parse_iso_datetime.md create mode 100644 python/functions/core/parse_iso_datetime.py create mode 100644 python/functions/core/parse_iso_datetime_test.py create mode 100644 python/functions/core/parse_llm_json.md create mode 100644 python/functions/core/parse_llm_json.py create mode 100644 python/functions/core/parse_llm_json_test.py create mode 100644 python/functions/core/parse_markdown_test.py create mode 100644 python/functions/core/parse_page_range.md create mode 100644 python/functions/core/parser_registry.md create mode 100644 python/functions/core/parser_registry.py create mode 100644 
python/functions/core/parser_registry_test.py create mode 100644 python/functions/core/pdf_to_markdown.md create mode 100644 python/functions/core/pdf_to_markdown.py create mode 100644 python/functions/core/preprocess_text.md create mode 100644 python/functions/core/preprocess_text_test.py create mode 100644 python/functions/core/react_loop.md create mode 100644 python/functions/core/react_loop.py create mode 100644 python/functions/core/react_loop_test.py create mode 100644 python/functions/core/remove_tree_fields.md create mode 100644 python/functions/core/render_template.md create mode 100644 python/functions/core/render_template.py create mode 100644 python/functions/core/render_template_test.py create mode 100644 python/functions/core/retry_async.md create mode 100644 python/functions/core/retry_async.py create mode 100644 python/functions/core/retry_sync.md create mode 100644 python/functions/core/retry_sync.py create mode 100644 python/functions/core/retry_with_backoff.md create mode 100644 python/functions/core/retry_with_backoff.py create mode 100644 python/functions/core/retry_with_backoff_async.md create mode 100644 python/functions/core/retry_with_backoff_test.py create mode 100644 python/functions/core/sanitize_for_path.md create mode 100644 python/functions/core/smart_split_content.md create mode 100644 python/functions/core/split_text_into_chunks.md create mode 100644 python/functions/core/split_text_into_chunks.py create mode 100644 python/functions/core/split_text_into_chunks_test.py create mode 100644 python/functions/core/strip_markdown_codeblock.md create mode 100644 python/functions/core/strip_markdown_codeblock.py create mode 100644 python/functions/core/strip_markdown_codeblock_test.py create mode 100644 python/functions/core/strip_think_tags.md create mode 100644 python/functions/core/strip_think_tags.py create mode 100644 python/functions/core/strip_think_tags_test.py create mode 100644 python/functions/core/t.md create mode 100644 
python/functions/core/t.py create mode 100644 python/functions/core/t_test.py create mode 100644 python/functions/core/task_manager.md create mode 100644 python/functions/core/task_manager.py create mode 100644 python/functions/core/to_csv.md create mode 100644 python/functions/core/to_csv.py create mode 100644 python/functions/core/to_csv_test.py create mode 100644 python/functions/core/to_jsonl.md create mode 100644 python/functions/core/to_jsonl.py create mode 100644 python/functions/core/to_jsonl_test.py create mode 100644 python/functions/core/to_pascal_case.md create mode 100644 python/functions/core/to_pascal_case.py create mode 100644 python/functions/core/to_pascal_case_test.py create mode 100644 python/functions/core/tree_to_flat_list.md create mode 100644 python/functions/core/validate_git_ssh_uri.md create mode 100644 python/functions/core/validate_json_schema.md create mode 100644 python/functions/core/validate_json_schema.py create mode 100644 python/functions/core/validate_json_schema_test.py create mode 100644 python/functions/core/write_node_ids.md diff --git a/python/functions/core/build_tree_from_headers.md b/python/functions/core/build_tree_from_headers.md new file mode 100644 index 00000000..b69f4bd5 --- /dev/null +++ b/python/functions/core/build_tree_from_headers.md @@ -0,0 +1,48 @@ +--- +name: build_tree_from_headers +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "def build_tree_from_headers(node_list: list[dict]) -> list[dict]" +description: "Construye arbol jerarquico anidado desde lista plana de headers markdown con niveles (h1>h2>h3)." 
+tags: [tree, markdown, headers, hierarchy] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/core/core.py" +source_repo: "https://github.com/VectifyAI/PageIndex" +source_license: "MIT" +source_file: "pageindex/page_index_md.py" +--- + +## Ejemplo + +```python +headers = [ + {"title": "Intro", "level": 1, "line_num": 1}, + {"title": "Background", "level": 2, "line_num": 5}, + {"title": "Details", "level": 3, "line_num": 10}, + {"title": "Methods", "level": 1, "line_num": 20}, +] +tree = build_tree_from_headers(headers) +# [ +# {"title": "Intro", "node_id": "0001", "nodes": [ +# {"title": "Background", "node_id": "0002", "nodes": [ +# {"title": "Details", "node_id": "0003"} +# ]} +# ]}, +# {"title": "Methods", "node_id": "0004"} +# ] +``` + +## Notas + +Funcion pura. Asigna node_id secuencial (0001...) automaticamente. Usa stack para resolver jerarquia por nivel de header. diff --git a/python/functions/core/cache_decorator.md b/python/functions/core/cache_decorator.md new file mode 100644 index 00000000..1b1dc37d --- /dev/null +++ b/python/functions/core/cache_decorator.md @@ -0,0 +1,57 @@ +--- +name: cache_decorator +kind: function +lang: py +domain: core +version: "1.0.0" +purity: impure +signature: "def cache_decorator(store: Any, ttl: float = 0, key_fn: callable | None = None)" +description: "Decorator que cachea el resultado de una funcion en cualquier store persistente compatible (CacheStore o FileCache). La key se genera hasheando (func.__name__, args, sorted(kwargs)) con SHA-256. Soporta funciones sincronas y asincronas." 
+tags: [cache, decorator, memoize, persistence, async, functional] +uses_functions: ["cache_to_sqlite_py_infra", "cache_to_file_py_infra"] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: ["asyncio", "functools", "hashlib", "json"] +tested: true +tests: + - "Funcion llamada una vez, segunda vez desde cache" + - "TTL expirado → llama de nuevo" + - "key_fn custom" + - "Argumentos distintos → keys distintas" + - "Funciona con async" +test_file_path: "python/functions/core/cache_decorator_test.py" +file_path: "python/functions/core/cache_decorator.py" +--- + +## Ejemplo + +```python +from infra.cache_to_sqlite import cache_to_sqlite +from core.cache_decorator import cache_decorator + +store = cache_to_sqlite("cache.db", namespace="llm") + +@cache_decorator(store, ttl=3600) +def call_llm(prompt: str) -> str: + # llamada costosa a LLM + return client.complete(prompt) + +result = call_llm("explain X") # primera vez: llama LLM +result = call_llm("explain X") # segunda vez: desde cache + +# Con key_fn custom +@cache_decorator(store, ttl=600, key_fn=lambda fn, args, kw: args[0]) +def fetch_user(user_id: str) -> dict: + return api.get_user(user_id) + +# Con async +@cache_decorator(store, ttl=3600) +async def async_call(prompt: str) -> str: + return await async_client.complete(prompt) +``` + +## Notas + +El store debe implementar `get(key: str) -> Any | None` y `set(key: str, value: Any, ttl: float) -> None`. Detecta automaticamente funciones asincronas con `asyncio.iscoroutinefunction`. La key por defecto usa `json.dumps(..., default=str)` para serializar argumentos no serializables. Si `store.get()` retorna `None`, siempre se ejecuta la funcion (no distingue entre "no en cache" y "valor None almacenado"); para valores que pueden ser None usar `get_or_set` directamente. 
diff --git a/python/functions/core/cache_decorator.py b/python/functions/core/cache_decorator.py new file mode 100644 index 00000000..619979e4 --- /dev/null +++ b/python/functions/core/cache_decorator.py @@ -0,0 +1,67 @@ +"""Decorator que cachea el resultado de una funcion en un store persistente.""" + +import asyncio +import functools +import hashlib +import json +from typing import Any, Callable + + +def _default_key(func: Callable, args: tuple, kwargs: dict) -> str: + """Genera una cache key a partir del nombre de funcion y sus argumentos.""" + payload = json.dumps((func.__name__, args, sorted(kwargs.items())), default=str) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def cache_decorator(store: Any, ttl: float = 0, key_fn: Callable | None = None): + """Retorna un decorator que cachea resultados en un store persistente. + + Args: + store: Cualquier objeto con metodos get(key) y set(key, value, ttl). + Compatible con CacheStore (cache_to_sqlite) y FileCache (cache_to_file). + ttl: Tiempo de vida en segundos. 0 = sin expiracion. + key_fn: Funcion opcional para generar la key. Recibe (func, args, kwargs). + Si es None, se usa SHA-256 de (func.__name__, args, sorted(kwargs)). + + Returns: + Decorator aplicable a funciones sincronas o asincronas. + + Example:: + + store = cache_to_sqlite("cache.db") + + @cache_decorator(store, ttl=3600) + def call_llm(prompt: str) -> str: + ... 
# llamada costosa + + result = call_llm("explain X") # primera vez: ejecuta la funcion + result = call_llm("explain X") # segunda vez: desde cache + """ + + def decorator(func: Callable) -> Callable: + if asyncio.iscoroutinefunction(func): + @functools.wraps(func) + async def async_wrapper(*args, **kwargs): + make_key = key_fn or _default_key + key = make_key(func, args, kwargs) + cached = store.get(key) + if cached is not None: + return cached + result = await func(*args, **kwargs) + store.set(key, result, ttl) + return result + return async_wrapper + else: + @functools.wraps(func) + def sync_wrapper(*args, **kwargs): + make_key = key_fn or _default_key + key = make_key(func, args, kwargs) + cached = store.get(key) + if cached is not None: + return cached + result = func(*args, **kwargs) + store.set(key, result, ttl) + return result + return sync_wrapper + + return decorator diff --git a/python/functions/core/cache_decorator_test.py b/python/functions/core/cache_decorator_test.py new file mode 100644 index 00000000..4f8a3b57 --- /dev/null +++ b/python/functions/core/cache_decorator_test.py @@ -0,0 +1,96 @@ +"""Tests para cache_decorator.""" + +import asyncio +import sys +import os +import tempfile +import time + +import pytest + +sys.path.insert(0, os.path.dirname(__file__)) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "infra")) + +from cache_decorator import cache_decorator +from cache_to_sqlite import cache_to_sqlite + + +@pytest.fixture +def store(tmp_path): + return cache_to_sqlite(str(tmp_path / "test.db")) + + +def test_funcion_llamada_una_vez_segunda_vez_desde_cache(store): + calls = [] + + @cache_decorator(store, ttl=60) + def compute(x: int) -> int: + calls.append(x) + return x * 10 + + assert compute(5) == 50 + assert compute(5) == 50 + assert len(calls) == 1 + + +def test_ttl_expirado_llama_de_nuevo(store): + calls = [] + + @cache_decorator(store, ttl=0.05) + def work(n: int) -> int: + calls.append(n) + return n + 1 + + work(3) + 
time.sleep(0.1) + work(3) + assert len(calls) == 2 + + +def test_key_fn_custom(store): + calls = [] + + def my_key_fn(func, args, kwargs): + return f"custom:{args[0]}" + + @cache_decorator(store, ttl=60, key_fn=my_key_fn) + def fn(x: int) -> str: + calls.append(x) + return f"result_{x}" + + fn(7) + fn(7) + assert len(calls) == 1 + + +def test_argumentos_distintos_keys_distintas(store): + calls = [] + + @cache_decorator(store, ttl=60) + def fn(x: int) -> int: + calls.append(x) + return x * 2 + + fn(1) + fn(2) + fn(1) + assert len(calls) == 2 + + +def test_funciona_con_async(store): + calls = [] + + @cache_decorator(store, ttl=60) + async def async_fn(x: int) -> int: + calls.append(x) + return x + 100 + + async def run(): + r1 = await async_fn(5) + r2 = await async_fn(5) + return r1, r2 + + r1, r2 = asyncio.run(run()) + assert r1 == 105 + assert r2 == 105 + assert len(calls) == 1 diff --git a/python/functions/core/calculate_media_strategy.md b/python/functions/core/calculate_media_strategy.md new file mode 100644 index 00000000..d67cfab2 --- /dev/null +++ b/python/functions/core/calculate_media_strategy.md @@ -0,0 +1,48 @@ +--- +name: calculate_media_strategy +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "calculate_media_strategy(image_count: int, line_count: int) -> str" +description: "Determina la estrategia optima de procesamiento de medios para un documento basado en la proporcion de imagenes vs texto. Retorna full_page_vlm, extract o text_only." 
+tags: [media, strategy, document, vision, vlm, images, classification] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: + - "0 imagenes text_only" + - "2 imagenes 100 lineas extract" + - "10 imagenes 20 lineas full_page_vlm" + - "5 imagenes 100 lineas full_page_vlm" + - "0 lineas division por cero evitada" +test_file_path: "python/functions/core/calculate_media_strategy_test.py" +file_path: "python/functions/core/calculate_media_strategy.py" +--- + +## Ejemplo + +```python +calculate_media_strategy(0, 50) # "text_only" +calculate_media_strategy(2, 100) # "extract" (ratio 0.02, pocas imagenes) +calculate_media_strategy(10, 20) # "full_page_vlm" (ratio 0.5 > 0.3) +calculate_media_strategy(5, 100) # "full_page_vlm" (>= 5 imagenes) +calculate_media_strategy(3, 0) # "text_only" (sin texto, sin contexto) +``` + +## Notas + +Logica de clasificacion en tres niveles: + +1. `full_page_vlm` — documento dominado por imagenes: ratio imagen/linea > 0.3 o al menos 5 imagenes. Se usa un vision-language model sobre la pagina completa. +2. `extract` — pocas imagenes en documento con texto: extraer y procesar imagenes individualmente. +3. `text_only` — sin imagenes o sin lineas de texto: procesar solo el texto. + +El guard `line_count > 0` evita la division por cero y trata documentos sin lineas como `text_only` independientemente del conteo de imagenes, ya que sin texto no hay contexto suficiente para clasificar como `extract`. + +Funcion pura, sin dependencias externas. Reimplementada conceptualmente a partir de la logica de clasificacion de medios de OpenViking (AGPL-3.0). 
diff --git a/python/functions/core/calculate_media_strategy.py b/python/functions/core/calculate_media_strategy.py new file mode 100644 index 00000000..0ed0d642 --- /dev/null +++ b/python/functions/core/calculate_media_strategy.py @@ -0,0 +1,24 @@ +"""Determina la estrategia optima de procesamiento de medios para un documento.""" + + +def calculate_media_strategy(image_count: int, line_count: int) -> str: + """Determina la estrategia optima de procesamiento de medios. + + Clasifica un documento en una de tres estrategias basandose en la + proporcion de imagenes respecto al texto: + - full_page_vlm: documento dominado por imagenes, usar vision-language model + - extract: pocas imagenes, extraer y procesar individualmente + - text_only: sin imagenes, solo texto + + Args: + image_count: numero de imagenes en el documento. + line_count: numero de lineas de texto en el documento. + + Returns: + "full_page_vlm", "extract" o "text_only". + """ + if line_count > 0 and (image_count / line_count > 0.3 or image_count >= 5): + return "full_page_vlm" + if line_count > 0 and image_count > 0: + return "extract" + return "text_only" diff --git a/python/functions/core/calculate_media_strategy_test.py b/python/functions/core/calculate_media_strategy_test.py new file mode 100644 index 00000000..73ff7d18 --- /dev/null +++ b/python/functions/core/calculate_media_strategy_test.py @@ -0,0 +1,23 @@ +"""Tests para calculate_media_strategy.""" + +from calculate_media_strategy import calculate_media_strategy + + +def test_0_imagenes_text_only(): + assert calculate_media_strategy(0, 50) == "text_only" + + +def test_2_imagenes_100_lineas_extract(): + assert calculate_media_strategy(2, 100) == "extract" + + +def test_10_imagenes_20_lineas_full_page_vlm(): + assert calculate_media_strategy(10, 20) == "full_page_vlm" + + +def test_5_imagenes_100_lineas_full_page_vlm(): + assert calculate_media_strategy(5, 100) == "full_page_vlm" + + +def test_0_lineas_division_por_cero_evitada(): + assert 
calculate_media_strategy(3, 0) == "text_only" diff --git a/python/functions/core/calculate_page_offset.md b/python/functions/core/calculate_page_offset.md new file mode 100644 index 00000000..072a03e2 --- /dev/null +++ b/python/functions/core/calculate_page_offset.md @@ -0,0 +1,40 @@ +--- +name: calculate_page_offset +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "def calculate_page_offset(pairs: list[dict]) -> int" +description: "Calcula offset entre numeros de pagina logicos y fisicos usando pares de referencia (moda de diferencias)." +tags: [pagination, offset, calculation] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/core/core.py" +source_repo: "https://github.com/VectifyAI/PageIndex" +source_license: "MIT" +source_file: "pageindex/page_index.py" +--- + +## Ejemplo + +```python +pairs = [ + {"page": 1, "physical_index": 5}, + {"page": 2, "physical_index": 6}, + {"page": 10, "physical_index": 14}, +] +calculate_page_offset(pairs) +# 4 (la moda de las diferencias physical_index - page) +``` + +## Notas + +Funcion pura. Cada par necesita campos 'page' (numero logico) y 'physical_index' (indice fisico). Retorna la diferencia mas frecuente (moda). Retorna 0 si no hay pares validos. diff --git a/python/functions/core/call_batch_with_retry.md b/python/functions/core/call_batch_with_retry.md new file mode 100644 index 00000000..effa58d3 --- /dev/null +++ b/python/functions/core/call_batch_with_retry.md @@ -0,0 +1,55 @@ +--- +name: call_batch_with_retry +kind: function +lang: py +domain: core +version: "1.0.0" +purity: impure +signature: "def call_batch_with_retry(items: list[T], process_func: Callable[[T], R], max_retries: int = 3, initial_delay: float = 1.0, max_delay: float = 30.0, backoff_factor: float = 2.0, exceptions: tuple[type[Exception], ...] 
= (Exception,), continue_on_failure: bool = True) -> tuple[list[R], list[dict]]" +description: "Procesa una lista de items con retry individual por item y exponential backoff. Los fallos individuales no bloquean el resto del batch. Retorna (results, failures) donde failures contiene index, item y error de cada item que agoto sus reintentos." +tags: [retry, batch, backoff, resilience, error-handling, core] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: ["time", "random", "typing.Callable", "typing.TypeVar"] +tested: true +tests: + - "todos los items exito" + - "item falla permanentemente, continue True" + - "item falla, abort continue False" + - "item falla luego exito retry funciona" + - "failures contiene index correcto" +test_file_path: "python/functions/core/call_batch_with_retry_test.py" +file_path: "python/functions/core/call_batch_with_retry.py" +--- + +## Ejemplo + +```python +results, failures = call_batch_with_retry( + items=["url1", "url2", "url3"], + process_func=fetch_url, + max_retries=3, + initial_delay=1.0, + max_delay=30.0, + backoff_factor=2.0, + exceptions=(ConnectionError, TimeoutError), + continue_on_failure=True, +) + +for r in results: + print("OK:", r) + +for f in failures: + print(f"FAIL index={f['index']} item={f['item']} error={f['error']}") +``` + +## Notas + +Diferencia con `retry_sync_py_core`: ese reintenta una sola llamada. Este maneja listas completas donde cada item se reintenta independientemente — los fallos individuales quedan registrados en `failures` sin interrumpir el procesamiento del batch (cuando `continue_on_failure=True`). + +El backoff usa la formula `min(initial_delay * backoff_factor^attempt, max_delay)` con jitter de hasta el 10% del delay calculado para evitar thundering herd. El primer intento es siempre inmediato — el delay se aplica antes del primer retry (attempt=0). 
def call_batch_with_retry(
    items: list,
    process_func: Callable,
    max_retries: int = 3,
    initial_delay: float = 1.0,
    max_delay: float = 30.0,
    backoff_factor: float = 2.0,
    exceptions: tuple = (Exception,),
    continue_on_failure: bool = True,
) -> tuple:
    """Process a list of items with independent per-item retry and backoff.

    Each item is passed to ``process_func``. If the call raises one of
    ``exceptions`` it is retried up to ``max_retries`` more times, sleeping
    ``min(initial_delay * backoff_factor ** attempt, max_delay)`` seconds
    plus up to 10% random jitter before each retry. The jitter is added
    after the cap, so a single sleep may exceed ``max_delay`` by up to 10%.

    Args:
        items: Items to process, in order.
        process_func: Callable taking one item and returning its result.
        max_retries: Retries allowed per item after the first failure.
            Must be >= 0; 0 means a single attempt.
        initial_delay: Delay in seconds before the first retry.
        max_delay: Cap in seconds applied to the pre-jitter delay.
        backoff_factor: Multiplier applied to the delay on each retry.
        exceptions: Exception types that trigger a retry. Any other
            exception propagates immediately.
        continue_on_failure: If True, an item that exhausts its retries is
            recorded in ``failures`` and processing continues. If False,
            its last exception is re-raised and the batch is aborted.

    Returns:
        A tuple ``(results, failures)``: ``results`` holds the return values
        of the successful calls in input order; ``failures`` holds one dict
        per failed item with keys "index", "item" and "error" (the string
        form of the last exception).

    Raises:
        ValueError: If ``max_retries`` is negative.
        Exception: The last exception of a failed item when
            ``continue_on_failure`` is False.
    """
    # A negative value would skip every attempt and report each item as a
    # failure caused by "None", so reject it before doing any work.
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    results = []
    failures = []

    for index, item in enumerate(items):
        for attempt in range(max_retries + 1):
            try:
                result = process_func(item)
            except exceptions as exc:
                if attempt < max_retries:
                    delay = min(
                        initial_delay * (backoff_factor ** attempt),
                        max_delay,
                    )
                    # Small jitter (up to 10% of delay) to avoid thundering herd.
                    delay += random.uniform(0, delay * 0.1)
                    time.sleep(delay)
                    continue

                # Retries exhausted for this item.
                if not continue_on_failure:
                    raise
                failures.append({
                    "index": index,
                    "item": item,
                    "error": str(exc),
                })
            else:
                results.append(result)
                break

    return results, failures
{"n": 0} + + def process(x): + call_count["n"] += 1 + if x == 2: + raise RuntimeError("error fatal") + return x + + try: + call_batch_with_retry( + items=[1, 2, 3], + process_func=process, + max_retries=1, + initial_delay=0.0, + continue_on_failure=False, + ) + assert False, "Deberia haber lanzado excepcion" + except RuntimeError as e: + assert "error fatal" in str(e) + # item 3 nunca fue procesado + assert call_count["n"] < 6 # 1 ok + 2 intentos para item 2 + 0 para item 3 + + +def test_item_falla_luego_exito_retry_funciona(): + attempt_counts = {} + + def process(x): + attempt_counts[x] = attempt_counts.get(x, 0) + 1 + # item 5 falla las primeras 2 veces, exito en la tercera + if x == 5 and attempt_counts[x] < 3: + raise ValueError("fallo temporal") + return x * 2 + + results, failures = call_batch_with_retry( + items=[1, 5, 9], + process_func=process, + max_retries=3, + initial_delay=0.0, + continue_on_failure=True, + ) + assert results == [2, 10, 18] + assert failures == [] + assert attempt_counts[5] == 3 + + +def test_failures_contiene_index_correcto(): + def process(x): + if x in (0, 2, 4): + raise ValueError(f"fallo en {x}") + return x + + results, failures = call_batch_with_retry( + items=[0, 1, 2, 3, 4], + process_func=process, + max_retries=0, + initial_delay=0.0, + continue_on_failure=True, + ) + assert results == [1, 3] + assert [f["index"] for f in failures] == [0, 2, 4] + assert [f["item"] for f in failures] == [0, 2, 4] diff --git a/python/functions/core/circuit_breaker.md b/python/functions/core/circuit_breaker.md new file mode 100644 index 00000000..6d36d5a7 --- /dev/null +++ b/python/functions/core/circuit_breaker.md @@ -0,0 +1,66 @@ +--- +name: circuit_breaker +kind: function +lang: py +domain: core +version: "1.0.0" +purity: impure +signature: "class CircuitBreaker:\n def __init__(self, failure_threshold: int = 5, reset_timeout: float = 300.0): ...\n def check(self) -> None: ...\n def record_success(self) -> None: ...\n def record_failure(self, 
error: Exception) -> None: ...\n @property\n def retry_after(self) -> float: ..." +description: "Patron circuit breaker thread-safe para proteger llamadas a APIs externas. Tres estados: CLOSED (normal), OPEN (bloqueando), HALF_OPEN (permitiendo 1 request de prueba). Integra con classify_api_error para distinguir errores permanentes de transitorios." +tags: [circuit-breaker, resilience, api, retry, error-handling, thread-safe] +uses_functions: [classify_api_error_py_core] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [threading, time, enum] +tested: true +tests: + - "Transicion CLOSED → OPEN despues de N fallos" + - "Transicion OPEN → HALF_OPEN despues de timeout" + - "Transicion HALF_OPEN → CLOSED en exito" + - "Transicion HALF_OPEN → OPEN en fallo" + - "Error permanente abre inmediatamente" + - "Thread safety (concurrencia)" + - "retry_after retorna 0 cuando no esta OPEN" +test_file_path: "python/functions/core/circuit_breaker_test.py" +file_path: "python/functions/core/circuit_breaker.py" +--- + +## Ejemplo + +```python +from circuit_breaker import CircuitBreaker, CircuitBreakerOpen + +cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0) + +def call_api() -> dict: + cb.check() # raises CircuitBreakerOpen if circuit is open + try: + result = requests.get("https://api.example.com/data").json() + cb.record_success() + return result + except Exception as exc: + cb.record_failure(exc) + raise + +# After 3 consecutive failures the circuit opens: +# CircuitBreakerOpen: Circuit breaker is open. Retry after 30.0s +try: + cb.check() +except CircuitBreakerOpen as e: + print(f"Circuit open, retry in {e.retry_after}s") + +# retry_after property (capped at 30s): +print(cb.retry_after) # e.g. 28.4 +``` + +## Notas + +- **CLOSED**: Requests pasan normalmente. Tras `failure_threshold` fallos consecutivos transiciona a OPEN. +- **OPEN**: Requests bloqueados con `CircuitBreakerOpen`. 
class CircuitBreakerState(Enum):
    """States a CircuitBreaker can be in."""

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"


class CircuitBreakerOpen(Exception):
    """Raised when the circuit breaker is blocking requests."""

    def __init__(self, retry_after: float) -> None:
        # Seconds the caller is advised to wait before trying again.
        self.retry_after = retry_after
        super().__init__(f"Circuit breaker is open. Retry after {retry_after:.1f}s")


def _is_permanent_error(error: Exception) -> bool:
    """Return True if the error is permanent (should open circuit immediately).

    Delegates to ``classify_api_error`` when that module can be imported;
    otherwise falls back to searching the error text (and its ``__cause__``)
    for the same permanent markers.
    """
    try:
        from classify_api_error import classify_api_error

        return classify_api_error(error) == "permanent"
    except ImportError:
        # Fallback: inspect error text directly
        text = str(error)
        if error.__cause__ is not None:
            text += " " + str(error.__cause__)
        permanent_patterns = ["400", "401", "403", "Forbidden", "Unauthorized"]
        return any(p in text for p in permanent_patterns)


class CircuitBreaker:
    """Thread-safe circuit breaker for protecting external API calls.

    Implements three states:
    - CLOSED: requests pass through normally.
    - OPEN: requests are blocked with CircuitBreakerOpen.
    - HALF_OPEN: one probe request is allowed through; further callers are
      blocked until that probe reports back via record_success() or
      record_failure(), or until the probe slot expires.

    Args:
        failure_threshold: Consecutive failures before opening. Default 5.
        reset_timeout: Seconds to wait in OPEN before trying HALF_OPEN, and
            also the lifetime of a HALF_OPEN probe slot. Default 300.0.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        reset_timeout: float = 300.0,
    ) -> None:
        self._failure_threshold = failure_threshold
        self._reset_timeout = reset_timeout
        self._lock = threading.Lock()

        self._state = CircuitBreakerState.CLOSED
        self._failure_count = 0
        self._opened_at: float | None = None
        # time.monotonic() at which the current HALF_OPEN probe was admitted;
        # None when no probe is outstanding.
        self._probe_started_at: float | None = None

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def check(self) -> None:
        """Check whether a request is allowed through.

        Returns normally when the request may proceed. The caller is then
        expected to report the outcome with record_success() or
        record_failure().

        Raises:
            CircuitBreakerOpen: If the circuit is open and reset_timeout
                has not elapsed yet, or if it is half-open and another
                probe is still outstanding. The carried retry_after is
                capped at 30 seconds.
        """
        with self._lock:
            if self._state is CircuitBreakerState.CLOSED:
                return

            now = time.monotonic()

            if self._state is CircuitBreakerState.OPEN:
                elapsed = now - self._opened_at  # type: ignore[operator]
                if elapsed < self._reset_timeout:
                    remaining = self._reset_timeout - elapsed
                    raise CircuitBreakerOpen(min(remaining, 30.0))
                # Timeout elapsed: this caller becomes the single probe.
                self._state = CircuitBreakerState.HALF_OPEN
                self._probe_started_at = now
                return

            # HALF_OPEN: only one probe at a time. A probe that never reports
            # back stops blocking after reset_timeout, so a crashed caller
            # cannot wedge the breaker in this state.
            if self._probe_started_at is not None:
                waited = now - self._probe_started_at
                if waited < self._reset_timeout:
                    remaining = self._reset_timeout - waited
                    raise CircuitBreakerOpen(min(remaining, 30.0))
            self._probe_started_at = now

    def record_success(self) -> None:
        """Record a successful request. Resets the breaker to CLOSED."""
        with self._lock:
            self._state = CircuitBreakerState.CLOSED
            self._failure_count = 0
            self._opened_at = None
            self._probe_started_at = None

    def record_failure(self, error: Exception) -> None:
        """Record a failed request.

        If the error is permanent (as decided by _is_permanent_error) or
        the breaker is HALF_OPEN, the circuit opens immediately. Otherwise
        the failure counter is incremented and the circuit opens once it
        reaches failure_threshold.

        Args:
            error: The exception that was raised.
        """
        # Classify before taking the lock: classification may import a
        # module and only inspects the error, not breaker state.
        permanent = _is_permanent_error(error)

        with self._lock:
            if permanent or self._state is CircuitBreakerState.HALF_OPEN:
                self._trip()
                return

            self._failure_count += 1
            if self._failure_count >= self._failure_threshold:
                self._trip()

    @property
    def retry_after(self) -> float:
        """Seconds until the circuit transitions to HALF_OPEN.

        Returns 0.0 when not in OPEN state, capped at 30 seconds.
        """
        with self._lock:
            if self._state is not CircuitBreakerState.OPEN:
                return 0.0
            elapsed = time.monotonic() - self._opened_at  # type: ignore[operator]
            remaining = self._reset_timeout - elapsed
            return min(max(remaining, 0.0), 30.0)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _trip(self) -> None:
        """Open the circuit (must be called with _lock held)."""
        self._state = CircuitBreakerState.OPEN
        self._failure_count = 0
        self._opened_at = time.monotonic()
        self._probe_started_at = None
--------------------------------------------------------------------------- + + +def test_closed_to_open_after_n_failures() -> None: + """Transicion CLOSED → OPEN despues de N fallos""" + cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0) + + cb.check() # Should not raise + + cb.record_failure(_transient_error()) + cb.record_failure(_transient_error()) + assert cb._state is CircuitBreakerState.CLOSED # Still closed after 2 + + cb.record_failure(_transient_error()) + assert cb._state is CircuitBreakerState.OPEN + + try: + cb.check() + assert False, "Should have raised CircuitBreakerOpen" + except CircuitBreakerOpen: + pass + + print("PASS: Transicion CLOSED → OPEN despues de N fallos") + + +def test_open_to_half_open_after_timeout() -> None: + """Transicion OPEN → HALF_OPEN despues de timeout""" + cb = CircuitBreaker(failure_threshold=1, reset_timeout=0.05) + cb.record_failure(_transient_error()) + assert cb._state is CircuitBreakerState.OPEN + + time.sleep(0.1) + + cb.check() # Should not raise — transitions to HALF_OPEN + assert cb._state is CircuitBreakerState.HALF_OPEN + + print("PASS: Transicion OPEN → HALF_OPEN despues de timeout") + + +def test_half_open_to_closed_on_success() -> None: + """Transicion HALF_OPEN → CLOSED en exito""" + cb = CircuitBreaker(failure_threshold=1, reset_timeout=0.05) + cb.record_failure(_transient_error()) + time.sleep(0.1) + cb.check() # enters HALF_OPEN + assert cb._state is CircuitBreakerState.HALF_OPEN + + cb.record_success() + assert cb._state is CircuitBreakerState.CLOSED + + cb.check() # Should not raise + + print("PASS: Transicion HALF_OPEN → CLOSED en exito") + + +def test_half_open_to_open_on_failure() -> None: + """Transicion HALF_OPEN → OPEN en fallo""" + cb = CircuitBreaker(failure_threshold=1, reset_timeout=0.05) + cb.record_failure(_transient_error()) + time.sleep(0.1) + cb.check() # enters HALF_OPEN + assert cb._state is CircuitBreakerState.HALF_OPEN + + cb.record_failure(_transient_error()) + assert 
cb._state is CircuitBreakerState.OPEN + + print("PASS: Transicion HALF_OPEN → OPEN en fallo") + + +def test_permanent_error_opens_immediately() -> None: + """Error permanente abre inmediatamente""" + cb = CircuitBreaker(failure_threshold=10, reset_timeout=60.0) + assert cb._state is CircuitBreakerState.CLOSED + + cb.record_failure(_permanent_error()) + assert cb._state is CircuitBreakerState.OPEN + + print("PASS: Error permanente abre inmediatamente") + + +def test_thread_safety() -> None: + """Thread safety (concurrencia)""" + cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0) + errors: list[Exception] = [] + + def worker() -> None: + try: + for _ in range(10): + cb.check() + cb.record_failure(_transient_error()) + except CircuitBreakerOpen: + pass + except Exception as exc: + errors.append(exc) + + threads = [threading.Thread(target=worker) for _ in range(20)] + for t in threads: + t.start() + for t in threads: + t.join() + + assert not errors, f"Thread errors: {errors}" + # After concurrent failures the circuit must be OPEN or HALF_OPEN + assert cb._state in (CircuitBreakerState.OPEN, CircuitBreakerState.HALF_OPEN, CircuitBreakerState.CLOSED) + + print("PASS: Thread safety (concurrencia)") + + +def test_retry_after_returns_zero_when_not_open() -> None: + """retry_after retorna 0 cuando no esta OPEN""" + cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0) + assert cb.retry_after == 0.0 + + cb.record_failure(_transient_error()) + # Still CLOSED (threshold not reached) + assert cb.retry_after == 0.0 + + print("PASS: retry_after retorna 0 cuando no esta OPEN") + + +if __name__ == "__main__": + test_closed_to_open_after_n_failures() + test_open_to_half_open_after_timeout() + test_half_open_to_closed_on_success() + test_half_open_to_open_on_failure() + test_permanent_error_opens_immediately() + test_thread_safety() + test_retry_after_returns_zero_when_not_open() + print("\nAll tests passed.") diff --git a/python/functions/core/classify_api_error.md 
def classify_api_error(error: Exception) -> str:
    """Classify an API error as permanent, transient, or unknown.

    Permanent errors should not be retried (e.g. auth failures, bad requests).
    Transient errors are safe to retry (e.g. rate limits, timeouts, server errors).
    Permanent classification takes priority over transient.

    The decision is made purely from the text of ``error`` and of its direct
    ``__cause__``. HTTP status codes only count when they are not part of a
    longer run of digits, so "timeout after 4000 ms" is not mistaken for a
    400 and "15000 rows" is not mistaken for a 500.

    Args:
        error: The exception to classify.

    Returns:
        "permanent" | "transient" | "unknown"
    """
    # Local import: re is not otherwise needed by this module.
    import re

    parts = [str(error)]
    if error.__cause__ is not None:
        parts.append(str(error.__cause__))
    text = " ".join(parts)

    permanent_codes = ("400", "401", "403")
    permanent_words = ("Forbidden", "Unauthorized")
    transient_codes = ("429", "500", "502", "503", "504")
    transient_words = (
        "TooManyRequests", "RateLimit",
        "timeout", "Timeout",
        "ConnectionError", "Connection refused", "Connection reset",
    )

    def mentions(codes: tuple, words: tuple) -> bool:
        # Codes must stand alone as a digit group; words match as substrings.
        code_pattern = r"(?<!\d)(?:" + "|".join(codes) + r")(?!\d)"
        if re.search(code_pattern, text) is not None:
            return True
        return any(word in text for word in words)

    if mentions(permanent_codes, permanent_words):
        return "permanent"
    if mentions(transient_codes, transient_words):
        return "transient"
    return "unknown"
failed") + err.__cause__ = cause + assert classify_api_error(err) == "transient" + + +def test_permanente_tiene_prioridad_sobre_transitorio(): + # Mensaje que contiene patrones de ambos tipos: 401 (permanent) y 503 (transient) + err = Exception("401 503 mixed error") + assert classify_api_error(err) == "permanent" + + +def test_error_403_forbidden_es_permanente(): + err = Exception("403 Forbidden") + assert classify_api_error(err) == "permanent" + + +def test_error_500_es_transitorio(): + err = Exception("Internal server error 500") + assert classify_api_error(err) == "transient" diff --git a/python/functions/core/coerce_types.md b/python/functions/core/coerce_types.md new file mode 100644 index 00000000..77e8555b --- /dev/null +++ b/python/functions/core/coerce_types.md @@ -0,0 +1,49 @@ +--- +name: coerce_types +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "def coerce_types(data: dict, schema: dict[str, str]) -> tuple[dict, list[str]]" +description: "Convierte valores de un dict a los tipos esperados segun un schema declarativo. Soporta int, float, str, bool, datetime, list[str]. Util para normalizar datos de CSV, JSON o query params. Nunca muta el original. Coerciones imposibles generan warning y mantienen el valor original." 
+tags: [coercion, types, normalization, pure, core, csv, json] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [datetime] +tested: true +tests: + - "string 42 a int 42" + - "string 3.14 a float 3.14" + - "string true a bool true" + - "string iso8601 a datetime" + - "coercion fallida genera warning sin crash" + - "dict con mix de tipos ya correctos y strings" + - "campo ausente en schema pass through sin tocar" + - "string lista a list str" +test_file_path: "python/functions/core/coerce_types_test.py" +file_path: "python/functions/core/coerce_types.py" +--- + +## Ejemplo + +```python +data = {"age": "25", "score": "9.5", "active": "yes", "tags": "go, python"} +schema = {"age": "int", "score": "float", "active": "bool", "tags": "list[str]"} + +result, warnings = coerce_types(data, schema) +# result = {"age": 25, "score": 9.5, "active": True, "tags": ["go", "python"]} +# warnings = [] + +# Coercion fallida — mantiene original y avisa +result2, warnings2 = coerce_types({"n": "abc"}, {"n": "int"}) +# result2 = {"n": "abc"} +# warnings2 = ["n: cannot coerce 'abc' to int: cannot parse 'abc' as int"] +``` + +## Notas + +Funcion pura. Solo usa `datetime` de la stdlib. No muta el dict original — retorna uno nuevo. Schema es flat (no anidado); para validacion de estructura compleja combinar con `validate_json_schema`. Lossy coercions (float "3.7" → int 3) generan warning adicional. Campo ausente en schema se copia sin tocar. 
def coerce_types(
    data: dict, schema: dict[str, str]
) -> tuple[dict, list[str]]:
    """Convert the values of a dict to the types declared in a schema.

    ``schema`` maps field names to one of: "int", "float", "str", "bool",
    "datetime", "list[str]".

    Supported coercions:
    - to int: exact integer parse of a string, falling back to a float
      parse that is truncated (with a warning if a fractional part is lost);
      floats are truncated the same way.
    - to float: ``float(v)`` for strings, widening for ints.
    - to bool: "true/1/yes" -> True, "false/0/no" -> False
      (case-insensitive); ints via ``bool(v)``.
    - to datetime: ISO 8601 parse, accepting a trailing "Z".
    - to list[str]: split on "," and strip each element.
    - A value already of the target type passes through.
    - A field of ``data`` that is not in ``schema`` is copied untouched.
    - An impossible coercion keeps the original value and adds a warning.

    Args:
        data: Dict with the values to coerce. It is not mutated.
        schema: Dict of {field: expected_type}.

    Returns:
        ``(coerced_data, warnings)``: a new dict with the coerced values and
        the list of warnings for lossy or failed coercions.
    """
    result = dict(data)
    warnings: list[str] = []

    for field, target_type in schema.items():
        if field not in data:
            continue

        value = data[field]
        try:
            result[field] = _coerce_value(value, target_type, field, warnings)
        except Exception as exc:
            # Best effort by design: report the problem and keep the original.
            warnings.append(
                f"{field}: cannot coerce {value!r} to {target_type}: {exc}"
            )
            result[field] = value

    return result, warnings


_BOOL_TRUE = {"true", "1", "yes"}
_BOOL_FALSE = {"false", "0", "no"}


def _coerce_value(
    value: object, target: str, field: str, warnings: list[str]
) -> object:
    """Coerce a single value to ``target``.

    Appends to ``warnings`` when a coercion loses information and raises
    ValueError or TypeError when the coercion is impossible or ``target``
    is not a known type name.
    """
    # --- int ---
    if target == "int":
        # bool is a subclass of int but is not accepted as an int here.
        if isinstance(value, int) and not isinstance(value, bool):
            return value
        if isinstance(value, float):
            if value != int(value):
                warnings.append(
                    f"{field}: lossy coercion float→int: {value} → {int(value)}"
                )
            return int(value)
        if isinstance(value, str):
            stripped = value.strip()
            # Parse as an integer first: going through float would silently
            # round integers above 2**53.
            try:
                return int(stripped)
            except ValueError:
                pass
            # Fall back to a float parse for inputs such as "3.7" or "1e3".
            try:
                as_float = float(stripped)
                truncated = int(as_float)
            except (ValueError, OverflowError):
                # Not numeric at all, or inf/nan, which have no int value.
                raise ValueError(f"cannot parse {value!r} as int") from None
            if as_float != truncated:
                warnings.append(
                    f"{field}: lossy coercion str→int: {value!r} → {truncated}"
                )
            return truncated
        raise TypeError(f"cannot coerce {type(value).__name__} to int")

    # --- float ---
    if target == "float":
        if isinstance(value, float):
            return value
        if isinstance(value, int) and not isinstance(value, bool):
            return float(value)
        if isinstance(value, str):
            return float(value.strip())
        raise TypeError(f"cannot coerce {type(value).__name__} to float")

    # --- str ---
    if target == "str":
        if isinstance(value, str):
            return value
        return str(value)

    # --- bool ---
    if target == "bool":
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            low = value.strip().lower()
            if low in _BOOL_TRUE:
                return True
            if low in _BOOL_FALSE:
                return False
            raise ValueError(
                f"cannot parse {value!r} as bool; expected true/false/1/0/yes/no"
            )
        if isinstance(value, int):
            return bool(value)
        raise TypeError(f"cannot coerce {type(value).__name__} to bool")

    # --- datetime ---
    if target == "datetime":
        if isinstance(value, datetime):
            return value
        if isinstance(value, str):
            s = value.strip()
            # Rewrite a trailing "Z" as an explicit UTC offset before parsing.
            if s.endswith("Z"):
                s = s[:-1] + "+00:00"
            return datetime.fromisoformat(s)
        raise TypeError(f"cannot coerce {type(value).__name__} to datetime")

    # --- list[str] ---
    if target == "list[str]":
        if isinstance(value, list):
            return [str(item) for item in value]
        if isinstance(value, str):
            return [item.strip() for item in value.split(",")]
        raise TypeError(f"cannot coerce {type(value).__name__} to list[str]")

    raise ValueError(f"unknown target type: {target!r}")
result4["flag"] is False + + +def test_string_iso8601_a_datetime(): + result, warnings = coerce_types( + {"ts": "2024-01-15T10:30:00Z"}, {"ts": "datetime"} + ) + assert isinstance(result["ts"], datetime) + assert result["ts"].year == 2024 + assert result["ts"].month == 1 + assert result["ts"].day == 15 + assert warnings == [] + + +def test_coercion_fallida_genera_warning_sin_crash(): + result, warnings = coerce_types({"n": "not-a-number"}, {"n": "int"}) + # mantiene el original + assert result["n"] == "not-a-number" + assert len(warnings) == 1 + assert "n" in warnings[0] + + +def test_dict_con_mix_de_tipos_ya_correctos_y_strings(): + data = {"a": "10", "b": 3.14, "c": True, "d": "hello"} + schema = {"a": "int", "b": "float", "c": "bool", "d": "str"} + result, warnings = coerce_types(data, schema) + assert result["a"] == 10 + assert abs(result["b"] - 3.14) < 1e-9 + assert result["c"] is True + assert result["d"] == "hello" + assert warnings == [] + + +def test_campo_ausente_en_schema_pass_through_sin_tocar(): + data = {"a": "42", "b": [1, 2, 3]} + schema = {"a": "int"} # "b" no esta en schema + result, warnings = coerce_types(data, schema) + assert result["a"] == 42 + assert result["b"] == [1, 2, 3] + assert warnings == [] + + +def test_string_lista_a_list_str(): + result, warnings = coerce_types( + {"tags": "python, go, bash"}, {"tags": "list[str]"} + ) + assert result["tags"] == ["python", "go", "bash"] + assert warnings == [] diff --git a/python/functions/core/compute_backoff_delay.md b/python/functions/core/compute_backoff_delay.md new file mode 100644 index 00000000..51e6d99a --- /dev/null +++ b/python/functions/core/compute_backoff_delay.md @@ -0,0 +1,41 @@ +--- +name: compute_backoff_delay +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "def compute_backoff_delay(attempt: int, base_delay: float = 0.5, max_delay: float = 8.0, jitter: bool = True) -> float" +description: "Calcula el delay para exponential backoff con jitter 
def compute_backoff_delay(
    attempt: int,
    base_delay: float = 0.5,
    max_delay: float = 8.0,
    jitter: bool = True,
) -> float:
    """Compute the exponential backoff delay for a given attempt number.

    The base delay is ``min(base_delay * 2 ** attempt, max_delay)``. With
    jitter enabled, a random amount in ``[0, min(base_delay, delay)]`` is
    added on top, so the result can exceed ``max_delay`` by up to
    ``base_delay``.

    Args:
        attempt: Zero-based attempt index (0 = first retry).
        base_delay: Base delay in seconds before exponential scaling.
        max_delay: Cap in seconds applied before jitter is added.
        jitter: If True, adds random jitter to avoid thundering herd.

    Returns:
        Delay in seconds to wait before the next attempt.
    """
    try:
        delay = min(base_delay * (2 ** attempt), max_delay)
    except OverflowError:
        # 2 ** attempt no longer fits in a float. For a positive base the
        # uncapped delay is far above the cap, so the cap is the answer.
        delay = max_delay if base_delay > 0 else min(base_delay, max_delay)
    if jitter:
        delay += random.uniform(0, min(base_delay, delay))
    return delay
/dev/null +++ b/python/functions/core/convert_github_to_raw_url.md @@ -0,0 +1,59 @@ +--- +name: convert_github_to_raw_url +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "convert_github_to_raw_url(url: str) -> str" +description: "Convierte una URL de blob de GitHub/GitLab a su URL raw. Ej: github.com/org/repo/blob/main/file.py → raw.githubusercontent.com/org/repo/main/file.py. Retorna la URL sin cambios si no aplica." +tags: [github, gitlab, url, raw, blob, convert, transform] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: ["urllib.parse"] +tested: true +tests: + - "URL GitHub blob" + - "URL GitLab blob" + - "URL que no es blob retorna sin cambios" + - "URL no-GitHub retorna sin cambios" +test_file_path: "python/functions/core/convert_github_to_raw_url_test.py" +file_path: "python/functions/core/convert_github_to_raw_url.py" +--- + +## Ejemplo + +```python +from core.convert_github_to_raw_url import convert_github_to_raw_url + +# GitHub blob → raw.githubusercontent.com +url = convert_github_to_raw_url( + "https://github.com/openai/whisper/blob/main/README.md" +) +# "https://raw.githubusercontent.com/openai/whisper/main/README.md" + +# GitLab blob → raw +url = convert_github_to_raw_url( + "https://gitlab.com/org/repo/-/blob/main/file.py" +) +# "https://gitlab.com/org/repo/-/raw/main/file.py" + +# URL sin blob → sin cambios +url = convert_github_to_raw_url("https://github.com/org/repo") +# "https://github.com/org/repo" +``` + +## Notas + +Algoritmo: +1. Parsear la URL con `urllib.parse.urlparse`. +2. Si host es `github.com`: buscar segmento `blob` en el path. + - Si existe: eliminar el segmento `blob` y cambiar el dominio a `raw.githubusercontent.com`. +3. Si host es `gitlab.com` o empieza con `gitlab.`: reemplazar `/-/blob/` por `/-/raw/` + o `/blob/` por `/raw/`. +4. Cualquier otro host: retornar la URL sin cambios. + +Funcion pura. No hace I/O ni tiene efectos secundarios. 
def convert_github_to_raw_url(url: str) -> str:
    """Convert a GitHub or GitLab blob URL to its raw-content URL.

    GitHub blob:
        https://github.com/org/repo/blob/main/path/file.py
        -> https://raw.githubusercontent.com/org/repo/main/path/file.py

    GitLab blob:
        https://gitlab.com/org/repo/-/blob/main/path/file.py
        -> https://gitlab.com/org/repo/-/raw/main/path/file.py

    Only the blob marker that follows the repository is rewritten; a
    directory or file named ``blob`` further down the path is left alone.
    URLs that are not blob URLs are returned unchanged (after stripping
    surrounding whitespace).

    Args:
        url: GitHub or GitLab URL, possibly pointing at a blob.

    Returns:
        The raw URL when the conversion applies, otherwise the original URL.
    """
    url = url.strip()
    if not url:
        return url

    parsed = urlparse(url)
    host = parsed.hostname or ""

    # --- GitHub ---
    if host in ("github.com", "www.github.com"):
        # Expected path: /org/repo/blob/ref/path/to/file, which splits into
        # ['', org, repo, 'blob', ref, ...]. The marker must sit at index 3,
        # otherwise an org/repo/dir called "blob" would be removed by mistake.
        segments = parsed.path.split("/")
        if len(segments) > 4 and segments[3] == "blob" and segments[4]:
            new_path = "/".join(segments[:3] + segments[4:])
            return urlunparse((
                "https",
                "raw.githubusercontent.com",
                new_path,
                parsed.params,
                parsed.query,
                parsed.fragment,
            ))
        return url

    # --- GitLab (gitlab.com or self-hosted gitlab.* hosts) ---
    if host in ("gitlab.com", "www.gitlab.com") or host.startswith("gitlab."):
        # Expected path: /group/.../repo/-/blob/ref/path or the legacy
        # /group/repo/blob/ref/path. Groups can nest, so the marker position
        # is not fixed; only its first occurrence is rewritten.
        for marker, replacement in (("/-/blob/", "/-/raw/"), ("/blob/", "/raw/")):
            if marker in parsed.path:
                new_path = parsed.path.replace(marker, replacement, 1)
                return urlunparse((
                    parsed.scheme,
                    parsed.netloc,
                    new_path,
                    parsed.params,
                    parsed.query,
                    parsed.fragment,
                ))
        return url

    # Any other host: no transformation applies.
    return url
"https://github.com/org/repo/blob/develop/config.yaml" + result = convert_github_to_raw_url(url) + assert result == "https://raw.githubusercontent.com/org/repo/develop/config.yaml" + + +def test_url_gitlab_blob(): + """URL de GitLab blob se convierte a raw.""" + url = "https://gitlab.com/org/repo/-/blob/main/README.md" + result = convert_github_to_raw_url(url) + assert result == "https://gitlab.com/org/repo/-/raw/main/README.md" + + +def test_url_gitlab_blob_sin_guion(): + """URL de GitLab blob sin '/-/' tambien se convierte.""" + url = "https://gitlab.com/org/repo/blob/main/README.md" + result = convert_github_to_raw_url(url) + assert result == "https://gitlab.com/org/repo/raw/main/README.md" + + +def test_url_que_no_es_blob_retorna_sin_cambios(): + """URL de GitHub sin blob retorna sin cambios.""" + url = "https://github.com/org/repo" + result = convert_github_to_raw_url(url) + assert result == url + + +def test_url_github_tree_retorna_sin_cambios(): + """URL de GitHub tree (no blob) retorna sin cambios.""" + url = "https://github.com/org/repo/tree/main/src" + result = convert_github_to_raw_url(url) + assert result == url + + +def test_url_no_github_retorna_sin_cambios(): + """URL de otro dominio retorna sin cambios.""" + url = "https://example.com/org/repo/blob/main/file.py" + result = convert_github_to_raw_url(url) + assert result == url + + +def test_url_vacia_retorna_sin_cambios(): + """URL vacia retorna string vacio.""" + result = convert_github_to_raw_url("") + assert result == "" + + +def test_url_raw_githubusercontent_retorna_sin_cambios(): + """URL ya en raw.githubusercontent.com no se modifica.""" + url = "https://raw.githubusercontent.com/org/repo/main/file.py" + result = convert_github_to_raw_url(url) + assert result == url diff --git a/python/functions/core/core.py b/python/functions/core/core.py index 0b6a6e7e..30d0c637 100644 --- a/python/functions/core/core.py +++ b/python/functions/core/core.py @@ -1,7 +1,9 @@ """Core functional programming 
def flatten_tree(structure: Any) -> List[Dict]:
    """Flatten a hierarchical tree into a list of nodes without children.

    Each dict contributes a deep copy of itself minus its ``'nodes'`` key,
    followed by the flattened contents of every key whose name contains
    ``'nodes'``. The input is not modified.

    Args:
        structure: A node dict, a list of node dicts, or any other value.

    Returns:
        Nodes in depth-first order; an empty list for anything that is
        neither a dict nor a list.
    """
    import copy

    if isinstance(structure, dict):
        # Copy only the fields that are kept. Deep-copying the whole node and
        # then dropping 'nodes' would re-copy every descendant at every level.
        # One shared memo keeps objects referenced by several fields shared.
        memo: dict = {}
        node = {
            key: copy.deepcopy(value, memo)
            for key, value in structure.items()
            if key != 'nodes'
        }
        nodes = [node]
        for key, value in structure.items():
            if 'nodes' in key:
                nodes.extend(flatten_tree(value))
        return nodes
    elif isinstance(structure, list):
        nodes = []
        for item in structure:
            nodes.extend(flatten_tree(item))
        return nodes
    return []
def list_to_tree(data: List[Dict]) -> List[Dict]:
    """Convert a flat list with structure codes ('1.2.3') into a nested tree.

    Each item may carry ``structure``, ``title``, ``start_index`` and
    ``end_index``. An item is attached to the already-seen node whose code is
    its own code minus the last dotted component; otherwise it becomes a
    root. Codes are compared as strings, so ``1`` and ``'1'`` name the same
    node.

    Args:
        data: Flat list of items, parents listed before their children.

    Returns:
        List of root nodes. Nodes with children have a ``'nodes'`` list;
        leaves have no ``'nodes'`` key.
    """
    def normalize(structure):
        # Index and look up by the same string form; a numeric code must
        # still match the string parent derived from its children.
        return None if structure is None else str(structure)

    def parent_of(code):
        if not code:
            return None
        head, dot, _ = code.rpartition('.')
        return head if dot else None

    nodes = {}
    root_nodes = []

    for item in data:
        code = normalize(item.get('structure'))
        node = {
            'title': item.get('title'),
            'start_index': item.get('start_index'),
            'end_index': item.get('end_index'),
            'nodes': []
        }
        nodes[code] = node
        parent_code = parent_of(code)

        if parent_code and parent_code in nodes:
            nodes[parent_code]['nodes'].append(node)
        else:
            root_nodes.append(node)

    def clean_node(node):
        # Drop the empty 'nodes' list on leaves, recurse into the rest.
        if not node['nodes']:
            del node['nodes']
        else:
            for child in node['nodes']:
                clean_node(child)
        return node

    return [clean_node(node) for node in root_nodes]
def extract_json_from_llm(content: str) -> Dict:
    """Extract and parse JSON from an LLM response.

    The payload is taken from a ```` ```json ```` fenced block when present
    (an unterminated fence runs to the end of the text), otherwise the whole
    response is used. Parsing is attempted on progressively repaired text:

    1. the payload as-is, so valid JSON is never altered;
    2. with Python ``None`` rewritten to ``null`` and whitespace collapsed;
    3. additionally with trailing commas before ``]`` / ``}`` removed.

    Args:
        content: Raw LLM response text.

    Returns:
        The parsed JSON value, or ``{}`` if nothing could be parsed.
    """
    import json
    import re

    if not isinstance(content, str):
        return {}

    fence = "```json"
    start_idx = content.find(fence)
    if start_idx != -1:
        start_idx += len(fence)
        end_idx = content.rfind("```")
        # No closing fence: rfind lands on the opening one. Use the rest of
        # the text instead of an empty slice.
        if end_idx < start_idx:
            end_idx = len(content)
        raw = content[start_idx:end_idx].strip()
    else:
        raw = content.strip()

    # The repairs are textual and can touch string values, so they are only
    # used when the untouched payload does not parse.
    normalized = " ".join(re.sub(r"\bNone\b", "null", raw).split())
    without_trailing_commas = re.sub(r",\s*([\]}])", r"\1", normalized)

    for candidate in (raw, normalized, without_trailing_commas):
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            continue
    return {}
def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
    """Collect every ATX header (h1-h6) of a markdown document.

    Lines are examined with surrounding whitespace removed. A line starting
    with a triple backtick toggles "inside code block" state; headers found
    while that state is on are ignored.

    Args:
        markdown_content: Markdown source text.

    Returns:
        A pair ``(headers, lines)``: ``headers`` is a list of dicts with
        ``title``, ``level`` (1-6) and 1-based ``line_num``; ``lines`` is the
        input split on newline characters.
    """
    import re

    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    fence_re = re.compile(r'^```')

    all_lines = markdown_content.split('\n')
    found = []
    fenced = False

    for number, raw in enumerate(all_lines, 1):
        text = raw.strip()
        if fence_re.match(text):
            # Opening or closing fence: flip state; the fence is never a header.
            fenced = not fenced
        elif text and not fenced:
            hit = heading_re.match(text)
            if hit is not None:
                hashes, heading = hit.groups()
                found.append({
                    'title': heading.strip(),
                    'level': len(hashes),
                    'line_num': number,
                })

    return found, all_lines
def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
    """Group pages into text chunks under a token budget, with page overlap.

    If the whole document fits in ``max_tokens`` a single chunk is returned.
    Otherwise pages are accumulated until adding the next one would exceed a
    target size (the mean of ``max_tokens`` and the even-split size); the
    chunk is then closed and the next one starts with the last
    ``overlap_pages`` pages repeated.

    Args:
        page_contents: Text of each page, in order.
        token_lengths: Token count of each page, parallel to ``page_contents``.
        max_tokens: Token budget per chunk.
        overlap_pages: Number of trailing pages repeated at the start of the
            following chunk.

    Returns:
        List of chunk strings (concatenated page texts). A single page larger
        than the target size still forms its own chunk.
    """
    import math
    num_tokens = sum(token_lengths)

    if num_tokens <= max_tokens:
        return ["".join(page_contents)]

    subsets = []
    current_subset = []
    current_token_count = 0

    expected_parts = math.ceil(num_tokens / max_tokens)
    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)

    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
        # Only close a chunk that has content; otherwise an oversized first
        # page would emit an empty leading chunk.
        if current_subset and current_token_count + page_tokens > avg_tokens:
            subsets.append(''.join(current_subset))
            overlap_start = max(i - overlap_pages, 0)
            current_subset = list(page_contents[overlap_start:i])
            current_token_count = sum(token_lengths[overlap_start:i])

        current_subset.append(page_content)
        current_token_count += page_tokens

    if current_subset:
        subsets.append(''.join(current_subset))

    return subsets
+ """ + # Normalize line endings: \r\n and \r -> \n + text = text.replace('\r\n', '\n').replace('\r', '\n') + # Reduce 3+ consecutive newlines to at most 2 + text = re.sub(r'\n{3,}', '\n\n', text) + # Strip whitespace from each line + text = '\n'.join(line.strip() for line in text.split('\n')) + # Strip globally + return text.strip() + + +def get_text_stats(text: str) -> dict: + """Compute basic statistics of a text: characters, lines, words. + + Args: + text: Input text to analyze. + + Returns: + Dict with keys total_chars (int), total_lines (int), total_words (int). + """ + return { + 'total_chars': len(text), + 'total_lines': text.count('\n') + 1, + 'total_words': len(text.split()), + } + + +# ── Git URL parsing ────────────────────────────────────────────────────────── + +_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"] + + +def _sanitize_git_segment(segment: str) -> str: + """Strip .git suffix then keep only [a-zA-Z0-9_-] chars.""" + if segment.endswith(".git"): + segment = segment[:-4] + return re.sub(r"[^a-zA-Z0-9_\-]", "", segment) + + +def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]: + """Parse a code-hosting URL and return the 'org/repo' path component. + + Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path). + Returns None if the URL does not match any known host or is malformed. + + Args: + url: Repository URL in any supported format. + known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com. + + Returns: + 'org/repo' string or None. 
def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
    """Tell whether ``url`` addresses a repository root on a known host.

    Rules, by URL form:

    * ``git@host:path`` - True when a ``:`` is present and the host is known.
    * ``ssh://`` / ``git://`` - True when the host is known.
    * ``http://`` / ``https://`` - the host must be known and the path must
      be exactly ``org/repo`` or ``org/repo/tree/<ref>``; any other path
      (issues, blobs, pull requests, ...) is rejected.
    * Anything else - False.

    Args:
        url: URL to check; surrounding whitespace is ignored.
        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.

    Returns:
        True if the URL is a clonable repository URL.
    """
    from urllib.parse import urlparse

    accepted = _DEFAULT_GIT_HOSTS if known_hosts is None else known_hosts
    candidate = url.strip()

    # SSH shorthand: the host is everything between "git@" and the first ":".
    if candidate.startswith("git@"):
        host, colon, _path = candidate[len("git@"):].partition(":")
        return bool(colon) and host in accepted

    # Explicit ssh:// and git:// URLs are repo-level whenever the host matches.
    if candidate.startswith(("ssh://", "git://")):
        return (urlparse(candidate).hostname or "") in accepted

    if not candidate.startswith(("http://", "https://")):
        return False

    pieces = urlparse(candidate)
    if (pieces.hostname or "") not in accepted:
        return False

    path_parts = [part for part in pieces.path.split("/") if part]
    is_repo_root = len(path_parts) == 2
    is_tree_view = len(path_parts) == 4 and path_parts[2] == "tree"
    return is_repo_root or is_tree_view
def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
    """Find all markdown headings (# to ######) outside excluded regions.

    Headings inside fenced code blocks (an unterminated fence extends to the
    end of the text), HTML comments, and indented blocks are skipped. The
    heading marker must start at the beginning of a line and be followed by
    at least one space or tab and a title on the same line, so an escaped
    ``\\#`` or a bare ``#`` is never reported.

    Args:
        content: Markdown text to search.

    Returns:
        List of ``(start_pos, end_pos, title, level)`` tuples, in document
        order; positions are offsets into ``content``.
    """
    excluded: List[Tuple[int, int]] = []

    # Fenced code blocks; "\Z" closes a fence that is never terminated.
    for m in re.finditer(r'```.*?(?:```|\Z)', content, re.DOTALL):
        excluded.append((m.start(), m.end()))

    # HTML comments
    for m in re.finditer(r'<!--.*?-->', content, re.DOTALL):
        excluded.append((m.start(), m.end()))

    # Indented blocks (lines starting with 4 spaces or a tab)
    for m in re.finditer(r'^(?: {4}|\t).+$', content, re.MULTILINE):
        excluded.append((m.start(), m.end()))

    def is_excluded(pos: int) -> bool:
        return any(start <= pos < end for start, end in excluded)

    results: List[Tuple[int, int, str, int]] = []
    # "[ \t]+" rather than "\s+": the separator must not span a newline,
    # otherwise a bare "#" line would take the next text line as its title.
    for m in re.finditer(r'^(#{1,6})[ \t]+(.+)$', content, re.MULTILINE):
        if is_excluded(m.start()):
            continue
        level = len(m.group(1))
        title = m.group(2).strip()
        results.append((m.start(), m.end(), title, level))

    return results
def sanitize_for_path(text: str, max_length: int = 50) -> str:
    """Convert text to a safe string for use in file paths.

    Keeps word characters, CJK characters, spaces and hyphens; spaces become
    underscores and leading/trailing underscores are trimmed. If the result
    is longer than ``max_length`` it is truncated and ends with ``_`` plus
    the first 8 hex digits of the SHA-256 of the original text, so different
    long inputs stay distinguishable.

    Args:
        text: Input text to sanitize.
        max_length: Maximum length of the returned string.

    Returns:
        Path-friendly string. ``'section'`` is returned when nothing
        survives sanitizing (regardless of ``max_length``).
    """
    cleaned = re.sub(
        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
        '',
        text,
    )
    cleaned = cleaned.replace(' ', '_').strip('_')

    if not cleaned:
        return 'section'

    if len(cleaned) <= max_length:
        return cleaned

    digest = hashlib.sha256(text.encode()).hexdigest()[:8]
    suffix = '_' + digest
    if max_length < len(suffix):
        # No room for prefix + suffix: a negative slice bound would keep
        # almost the whole text and overshoot max_length. Return as much of
        # the hash as fits.
        return digest[:max(max_length, 0)]
    return cleaned[:max_length - len(suffix)] + suffix
diff --git a/python/functions/core/cursor_paginate.md b/python/functions/core/cursor_paginate.md new file mode 100644 index 00000000..45bb8efe --- /dev/null +++ b/python/functions/core/cursor_paginate.md @@ -0,0 +1,66 @@ +--- +name: cursor_paginate +kind: function +lang: py +domain: core +version: "1.0.0" +purity: impure +signature: "def cursor_paginate(fetch_page: Callable[..., list[T]], get_cursor: Callable[[T], str | None], page_size: int = 100, max_items: int = 2000, max_retries: int = 3, retry_delay: float = 2.0, retryable_exceptions: tuple[type[Exception], ...] = (ConnectionError, TimeoutError, OSError)) -> list[T]" +description: "Paginador generico basado en cursor que funciona con cualquier API que use cursor-based pagination. Cada pagina se obtiene con retry automatico con exponential backoff. Se detiene cuando la pagina esta vacia, el batch es menor que page_size, se alcanza max_items, o el cursor del ultimo item es None." +tags: [pagination, cursor, retry, generic, api, backoff] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: ["time", "typing.Callable", "typing.TypeVar"] +tested: true +tests: + - "API que retorna 3 paginas de 10 items" + - "API que falla 1 vez por pagina (retry funciona)" + - "max_items limita correctamente" + - "API que retorna pagina parcial (ultima pagina)" + - "Cursor None en ultimo item (se detiene)" +test_file_path: "python/functions/core/cursor_paginate_test.py" +file_path: "python/functions/core/cursor_paginate.py" +--- + +## Ejemplo + +```python +from cursor_paginate import cursor_paginate + +def fetch_users(limit: int, cursor: str | None) -> list[dict]: + params = {"limit": limit} + if cursor: + params["cursor"] = cursor + return requests.get("https://api.example.com/users", params=params).json()["items"] + +def get_cursor(user: dict) -> str | None: + return user.get("next_cursor") + +users = cursor_paginate( + fetch_page=fetch_users, + get_cursor=get_cursor, + 
page_size=100, + max_items=5000, + max_retries=3, + retry_delay=2.0, +) +``` + +## Notas + +El caller solo necesita proveer dos callables: +- `fetch_page(limit, cursor)`: recibe `limit` y `cursor` como kwargs, retorna lista de items. +- `get_cursor(item)`: extrae el cursor del ultimo item de la pagina; retornar None indica fin de datos. + +El exponential backoff interno aplica `retry_delay * 2^attempt` sin jitter. Solo se reintentan las excepciones en `retryable_exceptions`; cualquier otra excepcion propaga inmediatamente. + +Condiciones de parada (cualquiera de ellas): +1. La pagina retornada esta vacia. +2. La pagina retornada tiene menos items que `page_size` (pagina parcial = ultima pagina). +3. El total acumulado alcanza o supera `max_items` (se trunca y se para). +4. `get_cursor(batch[-1])` retorna `None`. + +Funcion impura: llama a `fetch_page` que tipicamente hace I/O de red y usa `time.sleep` en los reintentos. diff --git a/python/functions/core/cursor_paginate.py b/python/functions/core/cursor_paginate.py new file mode 100644 index 00000000..69d78ae1 --- /dev/null +++ b/python/functions/core/cursor_paginate.py @@ -0,0 +1,105 @@ +"""Generic cursor-based paginator for any API that uses cursor pagination.""" + +import time +from typing import Callable, TypeVar + +T = TypeVar("T") + + +def cursor_paginate( + fetch_page: Callable[..., list[T]], + get_cursor: Callable[[T], str | None], + page_size: int = 100, + max_items: int = 2000, + max_retries: int = 3, + retry_delay: float = 2.0, + retryable_exceptions: tuple[type[Exception], ...] = ( + ConnectionError, + TimeoutError, + OSError, + ), +) -> list[T]: + """Paginate through a cursor-based API, collecting all items. + + Fetches pages one at a time by calling fetch_page with limit and cursor + kwargs. Retries each page on transient errors using exponential backoff. + Stops when a page is empty, a partial page is returned, max_items is + reached, or the cursor from the last item is None. 
+ + Args: + fetch_page: Callable that accepts ``limit`` and ``cursor`` as keyword + arguments and returns a list of items for that page. + get_cursor: Callable that receives the last item of a page and returns + the cursor string to use for the next page, or None if there are + no more pages. + page_size: Number of items to request per page. + max_items: Hard cap on total items collected. Collection stops and the + list is truncated once this limit is reached. + max_retries: Maximum number of retry attempts per page after the first + failure. + retry_delay: Base delay in seconds between retries (doubled each + attempt — exponential backoff without jitter). + retryable_exceptions: Tuple of exception types that trigger a retry. + Any other exception propagates immediately. + + Returns: + List of all collected items, in the order they were returned by the + API, truncated to max_items. + + Raises: + Exception: Re-raises the last exception if all retries for a page are + exhausted. + """ + all_items: list[T] = [] + cursor: str | None = None + + while True: + batch = _fetch_with_retry( + fetch_page=fetch_page, + page_size=page_size, + cursor=cursor, + max_retries=max_retries, + retry_delay=retry_delay, + retryable_exceptions=retryable_exceptions, + ) + + if not batch: + break + + all_items.extend(batch) + + if len(all_items) >= max_items: + del all_items[max_items:] + break + + if len(batch) < page_size: + break + + cursor = get_cursor(batch[-1]) + if cursor is None: + break + + return all_items + + +def _fetch_with_retry( + fetch_page: Callable[..., list[T]], + page_size: int, + cursor: str | None, + max_retries: int, + retry_delay: float, + retryable_exceptions: tuple[type[Exception], ...], +) -> list[T]: + """Call fetch_page once, retrying on retryable_exceptions with exponential backoff.""" + last_exc: Exception | None = None + for attempt in range(max_retries + 1): + try: + return fetch_page(limit=page_size, cursor=cursor) + except retryable_exceptions as exc: + 
last_exc = exc + if attempt >= max_retries: + raise + delay = retry_delay * (2 ** attempt) + time.sleep(delay) + + raise last_exc # unreachable; satisfies type checkers diff --git a/python/functions/core/cursor_paginate_test.py b/python/functions/core/cursor_paginate_test.py new file mode 100644 index 00000000..38636530 --- /dev/null +++ b/python/functions/core/cursor_paginate_test.py @@ -0,0 +1,148 @@ +"""Tests para cursor_paginate.""" + +import sys +import os + +sys.path.insert(0, os.path.dirname(__file__)) + +import pytest +from cursor_paginate import cursor_paginate + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def make_api(pages: list[list[dict]]) -> callable: + """Return a fetch_page callable that serves pages from a pre-built list.""" + call_count = [0] + + def fetch_page(limit: int, cursor: str | None) -> list[dict]: + idx = call_count[0] + call_count[0] += 1 + if idx >= len(pages): + return [] + return pages[idx][:limit] + + return fetch_page + + +def get_cursor(item: dict) -> str | None: + return item.get("cursor") + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_api_retorna_3_paginas_de_10_items(): + pages = [ + [{"id": i, "cursor": str(i)} for i in range(0, 10)], + [{"id": i, "cursor": str(i)} for i in range(10, 20)], + [{"id": i, "cursor": str(i)} for i in range(20, 30)], + [], # sentinel: empty page ends pagination + ] + api = make_api(pages) + result = cursor_paginate( + fetch_page=api, + get_cursor=get_cursor, + page_size=10, + max_items=2000, + max_retries=0, + ) + assert len(result) == 30 + assert result[0]["id"] == 0 + assert result[-1]["id"] == 29 + + +def test_api_falla_1_vez_por_pagina_retry_funciona(): + """fetch_page falla en el primer intento de cada llamada, pero el retry 
recupera.""" + call_counter = [0] + # Cada pagina tiene 5 items. 2 paginas en total, luego vacio. + items_by_page = [ + [{"id": i, "cursor": str(i)} for i in range(0, 5)], + [{"id": i, "cursor": str(i)} for i in range(5, 10)], + ] + page_idx = [0] + fail_flags = [True, True] # falla una vez por pagina + + def fetch_page(limit: int, cursor: str | None) -> list[dict]: + idx = page_idx[0] + if idx < len(fail_flags) and fail_flags[idx]: + fail_flags[idx] = False + raise ConnectionError("transient failure") + page_idx[0] += 1 + if idx >= len(items_by_page): + return [] + return items_by_page[idx] + + result = cursor_paginate( + fetch_page=fetch_page, + get_cursor=get_cursor, + page_size=5, + max_items=2000, + max_retries=3, + retry_delay=0.0, + retryable_exceptions=(ConnectionError, TimeoutError, OSError), + ) + assert len(result) == 10 + + +def test_max_items_limita_correctamente(): + # 50 items disponibles en 5 paginas de 10, pero max_items=25 + pages = [ + [{"id": i, "cursor": str(i)} for i in range(j * 10, j * 10 + 10)] + for j in range(5) + ] + api = make_api(pages) + result = cursor_paginate( + fetch_page=api, + get_cursor=get_cursor, + page_size=10, + max_items=25, + max_retries=0, + ) + assert len(result) == 25 + assert result[-1]["id"] == 24 + + +def test_api_retorna_pagina_parcial_ultima_pagina(): + pages = [ + [{"id": i, "cursor": str(i)} for i in range(10)], # full page + [{"id": i, "cursor": str(i)} for i in range(10, 17)], # partial — 7 items + ] + api = make_api(pages) + result = cursor_paginate( + fetch_page=api, + get_cursor=get_cursor, + page_size=10, + max_items=2000, + max_retries=0, + ) + assert len(result) == 17 + assert result[-1]["id"] == 16 + + +def test_cursor_none_en_ultimo_item_se_detiene(): + """Cuando el ultimo item no tiene cursor, la paginacion debe detenerse.""" + pages = [ + [{"id": i, "cursor": str(i)} for i in range(10)], + # last item has no cursor — signals end of data + [{"id": i, "cursor": (str(i) if i < 19 else None)} for i in 
range(10, 20)], + ] + api = make_api(pages) + + def get_cursor_nullable(item: dict) -> str | None: + return item.get("cursor") + + result = cursor_paginate( + fetch_page=api, + get_cursor=get_cursor_nullable, + page_size=10, + max_items=2000, + max_retries=0, + ) + assert len(result) == 20 + assert result[-1]["id"] == 19 diff --git a/python/functions/core/detect_headings_by_font.md b/python/functions/core/detect_headings_by_font.md new file mode 100644 index 00000000..03d5040d --- /dev/null +++ b/python/functions/core/detect_headings_by_font.md @@ -0,0 +1,37 @@ +--- +name: detect_headings_by_font +kind: function +lang: py +domain: core +version: "1.0.0" +purity: impure +signature: "def detect_headings_by_font(pdf, min_delta: float = 2.0, max_levels: int = 4) -> list[dict]" +description: "Detecta headings en un PDF analizando la distribucion de font sizes. El font size mas comun es el body; sizes significativamente mayores se clasifican como heading levels. Filtra headers/footers repetitivos." +tags: [pdf, headings, font, detection, parsing, pdfplumber] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [pdfplumber, collections] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/core/detect_headings_by_font.py" +--- + +## Ejemplo + +```python +import pdfplumber +from detect_headings_by_font import detect_headings_by_font + +with pdfplumber.open("document.pdf") as pdf: + headings = detect_headings_by_font(pdf, min_delta=2.0, max_levels=4) + for h in headings: + print(f"Page {h['page_num']}: {'#' * h['level']} {h['title']}") +``` + +## Notas + +Samplea cada 5ta pagina para construir el Counter de font sizes (optimizacion de rendimiento). El body_size es el font size mas frecuente. Los heading sizes deben ser >= body_size + min_delta Y tener frecuencia < 50% del body. Se limita a max_levels heading sizes ordenados desc (el mas grande = nivel 1). 
def _rounded_font_size(ch: dict) -> "float | None":
    """Return the char's font size rounded to 0.1, or None if missing/invalid."""
    size = ch.get("size")
    if size is None:
        return None
    try:
        return round(float(size), 1)
    except (TypeError, ValueError):
        return None


def detect_headings_by_font(
    pdf: "pdfplumber.PDF",
    min_delta: float = 2.0,
    max_levels: int = 4,
) -> list[dict]:
    """Detect headings by analyzing font size distribution across pages.

    The most common font size (sampled from every 5th page) is treated as body
    text. Font sizes at least ``min_delta`` larger than body, and whose sampled
    character count is below 50% of the body's count, are classified as heading
    levels (largest size = level 1). Consecutive characters of the same heading
    size on a page are joined into one heading title. Titles that repeat on
    more than 30% of the pages (and on at least two pages) are treated as
    running headers/footers and dropped.

    Args:
        pdf: An open pdfplumber.PDF object; only ``pdf.pages`` and each page's
            ``chars`` (dicts with ``"size"`` and ``"text"``) are read.
        min_delta: Minimum size difference above body size to qualify as heading.
        max_levels: Maximum number of heading levels to detect.

    Returns:
        list[dict]: List of {"level": int, "title": str, "page_num": int}
        in page order (page numbers are 1-based). Returns an empty list if no
        headings are detected.
    """
    pages = pdf.pages
    if not pages:
        return []

    # Step 1: Sample font sizes from every 5th page to determine body size
    size_counter: Counter = Counter()
    for page_idx in range(0, len(pages), 5):
        try:
            sampled_chars = pages[page_idx].chars
        except Exception:
            # Best effort: a page that cannot be parsed is skipped.
            continue
        for ch in sampled_chars:
            rounded = _rounded_font_size(ch)
            if rounded is not None:
                size_counter[rounded] += 1

    if not size_counter:
        return []

    # Step 2: Determine body size (most common font size)
    body_size, body_count = size_counter.most_common(1)[0]

    # Step 3: Identify heading sizes
    # Must be >= body_size + min_delta and frequency < 50% of body count
    heading_sizes = sorted(
        (
            size
            for size, count in size_counter.items()
            if size >= body_size + min_delta and count < body_count * 0.5
        ),
        reverse=True,
    )[:max_levels]

    if not heading_sizes:
        return []

    size_to_level = {size: level for level, size in enumerate(heading_sizes, start=1)}

    # Step 4: Collect heading text per page
    raw_headings: list[dict] = []

    for page_num, page in enumerate(pages, start=1):
        try:
            chars = page.chars
        except Exception:
            continue

        # Group consecutive chars of the same heading size into text blocks
        current_size = None
        current_text: list[str] = []

        def emit() -> None:
            """Store the pending run as a heading if it has visible text."""
            if current_size is None:
                return
            title = "".join(current_text).strip()
            if title:
                raw_headings.append({
                    "level": size_to_level[current_size],
                    "title": title,
                    "page_num": page_num,
                })

        for ch in chars:
            rounded = _rounded_font_size(ch)
            if rounded is None:
                # Chars without a usable size neither extend nor end a run.
                continue
            if rounded in size_to_level:
                if rounded == current_size:
                    current_text.append(ch.get("text", ""))
                else:
                    emit()
                    current_size = rounded
                    current_text = [ch.get("text", "")]
            else:
                emit()
                current_size = None
                current_text = []

        # Flush the run still open at the end of the page
        emit()

    if not raw_headings:
        return []

    # Step 5: Remove titles appearing on > 30% of pages (headers/footers).
    # Count distinct pages per title, and never let the threshold drop below
    # one page: otherwise every heading of a document with <= 3 pages would
    # be discarded.
    pages_per_title: Counter = Counter(
        title for title, _ in {(h["title"], h["page_num"]) for h in raw_headings}
    )
    threshold = max(1, len(pages) * 0.3)

    return [h for h in raw_headings if pages_per_title[h["title"]] <= threshold]
+tags: [url, content-type, http, detect, classification, head-request] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: ["urllib.parse", "httpx"] +tested: true +tests: + - "URL .pdf por extension" + - "URL github repo" + - "URL markdown por extension" + - "URL SSH git" + - "URL .html por extension" +test_file_path: "python/functions/core/detect_url_type_test.py" +file_path: "python/functions/core/detect_url_type.py" +--- + +## Ejemplo + +```python +from core.detect_url_type import detect_url_type + +# Por patron URL (sin HTTP request) +url_type, meta = detect_url_type("https://github.com/openai/whisper") +# url_type = "code_repository", meta = {"detection": "url_pattern", ...} + +# Por extension (sin HTTP request) +url_type, meta = detect_url_type("https://example.com/doc.pdf") +# url_type = "pdf", meta = {"detection": "extension", ...} + +# Por HTTP HEAD request (cuando no se puede determinar sin red) +url_type, meta = detect_url_type("https://example.com/page") +# url_type = "webpage", meta = {"detection": "content_type_header", "content_type": "text/html", ...} +``` + +## Notas + +Algoritmo en orden de prioridad: +1. SSH git shorthand (`git@host:path`) → `code_repository` inmediatamente. +2. Patron URL de repos conocidos (github.com/org/repo, gitlab.com/org/repo) → `code_repository`. +3. Extension del path de la URL (.pdf, .md, .txt, .html, .git) → tipo correspondiente. +4. HTTP HEAD request → leer `Content-Type` header. +5. Default: `"webpage"`. + +Hosts reconocidos como repos de codigo: github.com, gitlab.com, bitbucket.org, codeberg.org. + +Sub-recursos (issues, pulls, blob, tree, etc.) NO se clasifican como `code_repository`. + +Lanza `Exception` con mensaje descriptivo si el HEAD request falla (timeout, DNS, red). 
diff --git a/python/functions/core/detect_url_type.py b/python/functions/core/detect_url_type.py new file mode 100644 index 00000000..9ef654da --- /dev/null +++ b/python/functions/core/detect_url_type.py @@ -0,0 +1,144 @@ +"""Detecta el tipo de contenido de una URL (webpage, pdf, markdown, text, code_repository).""" + +import re +from urllib.parse import urlparse + + +# Patrones de repos de codigo por hostname +_CODE_REPO_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"} + +# Extensiones reconocidas → tipo +_EXT_TYPE_MAP = { + ".pdf": "pdf", + ".md": "markdown", + ".markdown": "markdown", + ".rst": "text", + ".txt": "text", + ".html": "webpage", + ".htm": "webpage", + ".xml": "text", + ".json": "text", + ".csv": "text", + ".py": "text", + ".js": "text", + ".ts": "text", + ".go": "text", + ".rs": "text", + ".cpp": "text", + ".c": "text", + ".java": "text", + ".rb": "text", + ".git": "code_repository", +} + +# Content-Type header prefixes → tipo +_CONTENT_TYPE_MAP = { + "application/pdf": "pdf", + "text/markdown": "markdown", + "text/x-markdown": "markdown", + "text/plain": "text", + "text/html": "webpage", + "text/xml": "text", + "application/xml": "text", + "application/json": "text", +} + + +def _is_code_repo_url(parsed, path_segments: list[str]) -> bool: + """Return True si la URL apunta a la raiz de un repositorio de codigo.""" + host = parsed.hostname or "" + if host not in _CODE_REPO_HOSTS: + return False + # Acepta org/repo o org/repo/ o org/repo.git (2 segmentos minimos) + if len(path_segments) < 2: + return False + # Rechaza sub-recursos conocidos: issues, pulls, blob, tree, releases, etc. 
+ _SUB_RESOURCES = {"issues", "pulls", "blob", "tree", "releases", "tags", + "commits", "compare", "wiki", "discussions", "actions", + "security", "pulse", "graphs", "-", "settings"} + if len(path_segments) >= 3 and path_segments[2].rstrip(".git") in _SUB_RESOURCES: + return False + return True + + +def _is_ssh_git_url(url: str) -> bool: + """Return True si la URL es un SSH git shorthand (git@host:path).""" + return url.strip().startswith("git@") + + +def _type_from_extension(path: str) -> str | None: + """Detecta tipo segun la extension del path de la URL. Retorna None si no aplica.""" + # Ignorar query string / fragment + clean_path = path.split("?")[0].split("#")[0] + for ext, url_type in _EXT_TYPE_MAP.items(): + if clean_path.lower().endswith(ext): + return url_type + return None + + +def _type_from_content_type(content_type_header: str) -> str: + """Mapea un Content-Type header al tipo de URL.""" + ct = content_type_header.lower().split(";")[0].strip() + for prefix, url_type in _CONTENT_TYPE_MAP.items(): + if ct.startswith(prefix): + return url_type + return "webpage" + + +def detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]: + """Detecta el tipo de contenido de una URL. + + Algoritmo: + 1. Verificar si la URL es un patron de repo de codigo (git@, github.com/org/repo). + 2. Verificar extension en el path de la URL (.pdf, .md, .txt, .html, .git). + 3. Si no se determino: HTTP HEAD request para leer Content-Type header. + 4. Default: "webpage". + + Args: + url: URL a analizar. + timeout: Timeout en segundos para el HTTP HEAD request (si es necesario). + + Returns: + Tuple de (tipo, metadata) donde tipo es uno de: + "webpage", "pdf", "markdown", "text", "code_repository". + metadata incluye la informacion disponible (extension, content_type, etc.). + + Raises: + Exception: Si falla la conexion HTTP cuando es necesaria. + """ + import httpx + + url = url.strip() + metadata: dict = {"url": url} + + # 1. 
SSH git shorthand + if _is_ssh_git_url(url): + metadata["detection"] = "ssh_pattern" + return "code_repository", metadata + + parsed = urlparse(url) + path_segments = [s for s in parsed.path.split("/") if s] + + # 2. Code repo by URL pattern + if _is_code_repo_url(parsed, path_segments): + metadata["detection"] = "url_pattern" + metadata["host"] = parsed.hostname + return "code_repository", metadata + + # 3. Extension-based detection + ext_type = _type_from_extension(parsed.path) + if ext_type is not None: + metadata["detection"] = "extension" + metadata["path"] = parsed.path + return ext_type, metadata + + # 4. HTTP HEAD request + try: + response = httpx.head(url, timeout=timeout, follow_redirects=True) + content_type = response.headers.get("content-type", "") + metadata["detection"] = "content_type_header" + metadata["content_type"] = content_type + metadata["status_code"] = response.status_code + return _type_from_content_type(content_type), metadata + except Exception as exc: + raise Exception(f"detect_url_type: HEAD request failed for {url!r}: {exc}") from exc diff --git a/python/functions/core/detect_url_type_test.py b/python/functions/core/detect_url_type_test.py new file mode 100644 index 00000000..14fe4d67 --- /dev/null +++ b/python/functions/core/detect_url_type_test.py @@ -0,0 +1,89 @@ +"""Tests para detect_url_type (tests que no requieren red).""" + +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from core.detect_url_type import detect_url_type, _type_from_extension, _type_from_content_type, _is_ssh_git_url + + +def test_url_pdf_por_extension(): + """URL .pdf se detecta por extension sin hacer request HTTP.""" + url_type, metadata = detect_url_type("https://example.com/report.pdf") + assert url_type == "pdf" + assert metadata["detection"] == "extension" + + +def test_url_github_repo(): + """URL de GitHub org/repo se detecta como code_repository por patron URL.""" + url_type, metadata = 
detect_url_type("https://github.com/openai/whisper") + assert url_type == "code_repository" + assert metadata["detection"] == "url_pattern" + + +def test_url_github_con_git_suffix(): + """URL github terminada en .git se detecta como code_repository.""" + url_type, metadata = detect_url_type("https://github.com/openai/whisper.git") + assert url_type == "code_repository" + + +def test_url_markdown_por_extension(): + """URL .md se detecta como markdown por extension.""" + url_type, metadata = detect_url_type("https://example.com/README.md") + assert url_type == "markdown" + assert metadata["detection"] == "extension" + + +def test_url_ssh_git(): + """URL SSH git@ se detecta como code_repository.""" + url_type, metadata = detect_url_type("git@github.com:openai/whisper.git") + assert url_type == "code_repository" + assert metadata["detection"] == "ssh_pattern" + + +def test_url_html_por_extension(): + """URL .html se detecta como webpage por extension.""" + url_type, metadata = detect_url_type("https://example.com/page.html") + assert url_type == "webpage" + assert metadata["detection"] == "extension" + + +def test_url_txt_por_extension(): + """URL .txt se detecta como text por extension.""" + url_type, metadata = detect_url_type("https://example.com/data.txt") + assert url_type == "text" + + +def test_github_subrepo_no_es_repo(): + """URL de GitHub apuntando a un issue/blob no se trata como code_repository.""" + # Debe intentar HEAD request (que fallara sin red) — verificamos que no clasifica como repo + # Solo comprobamos que no devuelve code_repository por patron URL + url = "https://github.com/openai/whisper/blob/main/README.md" + # Extension .md deberia detectarse primero + url_type, metadata = detect_url_type(url) + assert url_type == "markdown" + + +def test_helper_type_from_extension(): + """_type_from_extension funciona para extensiones conocidas.""" + assert _type_from_extension("/doc.pdf") == "pdf" + assert _type_from_extension("/README.md") == "markdown" + 
assert _type_from_extension("/notes.txt") == "text" + assert _type_from_extension("/unknown.xyz") is None + + +def test_helper_type_from_content_type(): + """_type_from_content_type mapea headers correctamente.""" + assert _type_from_content_type("application/pdf; charset=utf-8") == "pdf" + assert _type_from_content_type("text/html; charset=utf-8") == "webpage" + assert _type_from_content_type("text/plain") == "text" + assert _type_from_content_type("text/markdown") == "markdown" + assert _type_from_content_type("application/octet-stream") == "webpage" + + +def test_helper_is_ssh_git_url(): + """_is_ssh_git_url detecta formato git@.""" + assert _is_ssh_git_url("git@github.com:org/repo.git") is True + assert _is_ssh_git_url("https://github.com/org/repo") is False + assert _is_ssh_git_url("ssh://git@github.com/org/repo") is False diff --git a/python/functions/core/docx_to_markdown.md b/python/functions/core/docx_to_markdown.md new file mode 100644 index 00000000..91675658 --- /dev/null +++ b/python/functions/core/docx_to_markdown.md @@ -0,0 +1,40 @@ +--- +name: docx_to_markdown +kind: function +lang: py +domain: core +version: "1.0.0" +purity: impure +signature: "docx_to_markdown(docx_path: str) -> str" +description: "Convierte un documento Word (.docx) a markdown preservando estructura (headings), formato inline (bold, italic, underline) y tablas en su posicion original." 
+tags: [docx, markdown, word, conversion, document, parsing, text] +uses_functions: [format_table_to_markdown_py_core] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [python-docx, lxml] +tested: true +tests: ["docx con headings y parrafos", "docx con tablas intercaladas", "docx con formato bold/italic", "docx vacio", "archivo no encontrado lanza FileNotFoundError"] +test_file_path: "python/functions/core/docx_to_markdown_test.py" +file_path: "python/functions/core/docx_to_markdown.py" +--- + +## Ejemplo + +```python +md = docx_to_markdown("informe.docx") +# # Titulo +# +# Primer parrafo. +# +# | Col1 | Col2 | +# | ---- | ---- | +# | a | b | +# +# Parrafo despues de la tabla. +``` + +## Notas + +Recorre `doc.element.body` en orden (no `doc.paragraphs` + `doc.tables` por separado) para preservar la posicion original de las tablas. Construye un mapa `{id(tbl_element): Table}` para lookup O(1). El formato inline aplica underline (`<u>`), italic (`*`) y bold (`**`) en ese orden de mas interno a mas externo. Los headings se detectan por el estilo del parrafo (`Heading 1`, `Heading 2`, etc.). Requiere `python-docx` instalado en el entorno. 
# XML namespace used by python-docx element tags
_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_TAG_P = f"{{{_W}}}p"
_TAG_TBL = f"{{{_W}}}tbl"
_TAG_TR = f"{{{_W}}}tr"
_TAG_TC = f"{{{_W}}}tc"
_TAG_R = f"{{{_W}}}r"
_TAG_T = f"{{{_W}}}t"
_TAG_RPR = f"{{{_W}}}rPr"
_TAG_B = f"{{{_W}}}b"
_TAG_I = f"{{{_W}}}i"
_TAG_U = f"{{{_W}}}u"
_TAG_PSTYLE = f"{{{_W}}}pStyle"
_TAG_PPR = f"{{{_W}}}pPr"
_ATTR_VAL = f"{{{_W}}}val"

# w:val values that explicitly switch a toggle property (w:b, w:i) off
_TOGGLE_OFF_VALUES = frozenset({"0", "false", "off"})

# Markdown only defines heading levels 1-6
_MAX_HEADING_LEVEL = 6


def _heading_level(paragraph) -> int:
    """Return heading level (1-6) if the paragraph is a heading, else 0.

    The level is read from the paragraph style id/name ("Heading 2",
    "Heading2"). Levels above 6 are clamped to 6 because markdown has no
    deeper heading syntax.
    """
    pPr = paragraph._p.find(_TAG_PPR)
    if pPr is None:
        return 0
    pStyle = pPr.find(_TAG_PSTYLE)
    if pStyle is None:
        return 0
    val = pStyle.get(_ATTR_VAL, "")
    if not val.lower().startswith("heading"):
        return 0
    # Accept both "Heading 2" and "Heading2"; anything else is not a heading.
    try:
        level = int(val[len("heading"):])
    except ValueError:
        return 0
    if level <= 0:
        return 0
    return min(level, _MAX_HEADING_LEVEL)


def _toggle_enabled(rPr, tag: str) -> bool:
    """Return True if the toggle property *tag* is present and not switched off.

    A bare element (no w:val) means "on"; w:val of 0/false/off means "off".
    """
    elem = rPr.find(tag)
    if elem is None:
        return False
    return elem.get(_ATTR_VAL, "true").lower() not in _TOGGLE_OFF_VALUES


def _run_to_md(run_elem) -> str:
    """Convert a single <w:r> element to a markdown-formatted string."""
    text = "".join(t.text or "" for t in run_elem.findall(_TAG_T))
    if not text:
        return ""

    # Read formatting from <w:rPr>
    rPr = run_elem.find(_TAG_RPR)
    bold = italic = underline = False
    if rPr is not None:
        bold = _toggle_enabled(rPr, _TAG_B)
        italic = _toggle_enabled(rPr, _TAG_I)
        u_elem = rPr.find(_TAG_U)
        if u_elem is not None:
            underline = u_elem.get(_ATTR_VAL, "") not in ("none", "")

    # Apply markdown formatting (innermost first: underline → italic → bold).
    # Markdown has no underline syntax, so an inline HTML <u> tag is used.
    if underline:
        text = f"<u>{text}</u>"
    if italic:
        text = f"*{text}*"
    if bold:
        text = f"**{text}**"
    return text


def _paragraph_to_md(paragraph) -> str:
    """Convert a python-docx Paragraph to a markdown string.

    All runs below the paragraph element are rendered in document order,
    including runs nested in wrappers such as <w:hyperlink>, so linked text
    is not lost.
    """
    level = _heading_level(paragraph)
    runs_md = "".join(_run_to_md(run) for run in paragraph._p.iter(_TAG_R))
    if level:
        return f"{'#' * level} {runs_md}"
    return runs_md


def _table_to_md(table) -> str:
    """Convert a python-docx Table to a markdown table string.

    The first row is passed to format_table_to_markdown as the header row.
    """
    rows: list[list[str]] = [
        # Join all paragraphs in the cell with a space
        [" ".join(p.text for p in cell.paragraphs).strip() for cell in row.cells]
        for row in table.rows
    ]
    return format_table_to_markdown(rows, has_header=True)


def docx_to_markdown(docx_path: str) -> str:
    """Convert a Word .docx document to Markdown.

    Preserves document structure (headings), inline formatting (bold, italic,
    underline) and tables in their original position. Empty paragraphs are
    skipped; blocks are separated by a blank line.

    Args:
        docx_path: Absolute or relative path to the .docx file.

    Returns:
        Markdown string representing the document.

    Raises:
        FileNotFoundError: If the file does not exist.
        Exception: If the file cannot be parsed as a .docx document.
    """
    import docx  # deferred so the module is importable without python-docx installed
    from docx.text.paragraph import Paragraph

    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"File not found: {docx_path}")

    doc = docx.Document(docx_path)

    # Build a mapping from the XML element id to the Table object for O(1) lookup
    table_map: dict[int, object] = {
        id(table._tbl): table for table in doc.tables
    }

    parts: list[str] = []

    # Walk the body in document order so tables stay between the paragraphs
    # that surround them.
    for child in doc.element.body:
        if child.tag == _TAG_P:
            # Wrap in a temporary paragraph object to reuse _paragraph_to_md
            md = _paragraph_to_md(Paragraph(child, doc))
            if md.strip():
                parts.append(md)
        elif child.tag == _TAG_TBL:
            table = table_map.get(id(child))
            if table is not None:
                md = _table_to_md(table)
                if md:
                    parts.append(md)

    return "\n\n".join(parts)
"""Tests for docx_to_markdown."""

import os
import sys
import tempfile

import pytest

sys.path.insert(0, os.path.dirname(__file__))

import docx as python_docx
from docx_to_markdown import docx_to_markdown


def _make_docx(builder_fn) -> str:
    """Create a temporary .docx via ``builder_fn(doc)`` and return its path.

    The caller is responsible for deleting the file.
    """
    doc = python_docx.Document()
    builder_fn(doc)
    # Reserve the name and close the handle BEFORE saving: writing by name
    # into a still-open NamedTemporaryFile is not portable (fails on Windows).
    fd, path = tempfile.mkstemp(suffix=".docx")
    os.close(fd)
    try:
        doc.save(path)
    except Exception:
        os.unlink(path)
        raise
    return path


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------


def test_docx_con_headings_y_parrafos():
    """Headings and body paragraphs are both rendered."""
    # Original (Spanish) test label: "docx con headings y parrafos"

    def build(doc):
        doc.add_heading("Titulo Principal", level=1)
        doc.add_paragraph("Primer parrafo de contenido.")
        doc.add_heading("Seccion", level=2)
        doc.add_paragraph("Segundo parrafo.")

    path = _make_docx(build)
    try:
        result = docx_to_markdown(path)
        assert "# Titulo Principal" in result
        assert "## Seccion" in result
        assert "Primer parrafo de contenido." in result
        assert "Segundo parrafo." in result
    finally:
        os.unlink(path)


def test_docx_con_tablas_intercaladas():
    """A table keeps its position between the surrounding paragraphs."""
    # Original (Spanish) test label: "docx con tablas intercaladas"

    def build(doc):
        doc.add_paragraph("Texto antes de la tabla.")
        table = doc.add_table(rows=2, cols=3)
        table.cell(0, 0).text = "Col1"
        table.cell(0, 1).text = "Col2"
        table.cell(0, 2).text = "Col3"
        table.cell(1, 0).text = "a"
        table.cell(1, 1).text = "b"
        table.cell(1, 2).text = "c"
        doc.add_paragraph("Texto despues de la tabla.")

    path = _make_docx(build)
    try:
        result = docx_to_markdown(path)
        # The table must appear BETWEEN the two paragraphs.
        before_idx = result.index("Texto antes de la tabla.")
        table_idx = result.index("| Col1")
        after_idx = result.index("Texto despues de la tabla.")
        assert before_idx < table_idx < after_idx
        assert "| Col2" in result
        assert "| a" in result
    finally:
        os.unlink(path)


def test_docx_con_formato_bold_italic():
    """Bold and italic runs are wrapped in Markdown emphasis markers."""
    # Original (Spanish) test label: "docx con formato bold/italic"

    def build(doc):
        para = doc.add_paragraph()
        run_bold = para.add_run("negrita")
        run_bold.bold = True
        para.add_run(" texto normal ")
        run_italic = para.add_run("cursiva")
        run_italic.italic = True

    path = _make_docx(build)
    try:
        result = docx_to_markdown(path)
        assert "**negrita**" in result
        assert "*cursiva*" in result
        assert "texto normal" in result
    finally:
        os.unlink(path)


def test_docx_vacio():
    """An empty document produces empty output."""
    # Original (Spanish) test label: "docx vacio"

    def build(doc):
        # Nothing is added on purpose: any default empty paragraph renders
        # as an empty string and is filtered out by docx_to_markdown.
        pass

    path = _make_docx(build)
    try:
        result = docx_to_markdown(path)
        assert result.strip() == ""
    finally:
        os.unlink(path)


def test_archivo_no_encontrado():
    """A missing file raises FileNotFoundError."""
    # Original (Spanish) test label: "archivo no encontrado lanza FileNotFoundError"
    missing = os.path.join(tempfile.gettempdir(), "nonexistent_file_fn_registry.docx")
    with pytest.raises(FileNotFoundError):
        docx_to_markdown(missing)


if __name__ == "__main__":
    test_docx_con_headings_y_parrafos()
    test_docx_con_tablas_intercaladas()
    test_docx_con_formato_bold_italic()
    test_docx_vacio()
    test_archivo_no_encontrado()
    print("All tests passed.")
+tags: [epub, markdown, ebook, parsing, conversion, html, text-extraction] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [zipfile, html, re, ebooklib] +tested: true +tests: + - "conversion de headings h1-h3" + - "conversion de bold e italic" + - "script y style se eliminan del output" + - "HTML entities se convierten a caracteres" + - "epub sin ebooklib extrae texto de archivos html" + - "epub con ebooklib incluye titulo y autor en el output" + - "epub corrupto lanza excepcion" +test_file_path: "python/functions/core/epub_to_markdown_test.py" +file_path: "python/functions/core/epub_to_markdown.py" +--- + +## Ejemplo + +```python +md = epub_to_markdown("/path/to/book.epub") +print(md[:500]) +# # Mi Libro +# **Author:** Ana Perez +# +# # Introduccion +# Primer parrafo... +``` + +## Notas + +Conversion HTML a markdown cubre: headings h1-h6, bold (``/``), italic (``/``), paragraphs, line breaks. Elimina `

Contenido

' + result = _html_to_markdown(html) + assert 'alert' not in result + assert 'body{}' not in result + assert 'Contenido' in result + + +def test_html_entities_unescaped(): + """HTML entities se convierten a caracteres.""" + html = '

Tom & Jerry <show>

' + result = _html_to_markdown(html) + assert 'Tom & Jerry' in result + assert '' in result + + +# --------------------------------------------------------------------------- +# Tests de epub_via_zipfile (sin ebooklib) +# --------------------------------------------------------------------------- + +def test_epub_via_zipfile_extrae_html(): + """epub sin ebooklib extrae texto de archivos html.""" + path = _build_epub({ + 'chapter.html': '

Capitulo Uno

Hola mundo.

', + }) + try: + result = _epub_via_zipfile(path) + assert 'Capitulo Uno' in result + assert 'Hola mundo' in result + finally: + os.unlink(path) + + +# --------------------------------------------------------------------------- +# Tests de epub_to_markdown (integracion) +# --------------------------------------------------------------------------- + +def test_epub_con_ebooklib_metadata(): + """epub con ebooklib incluye titulo y autor en el output.""" + pytest.importorskip('ebooklib') + path = _build_epub_with_opf( + title='Mi Libro', + author='Ana Perez', + body_html='

Introduccion

Primer parrafo.

', + ) + try: + result = epub_to_markdown(path) + assert '# Mi Libro' in result + assert 'Ana Perez' in result + assert 'Introduccion' in result + finally: + os.unlink(path) + + +def test_epub_corrupto_lanza_excepcion(): + """epub corrupto lanza Exception.""" + import tempfile + tmp = tempfile.NamedTemporaryFile(suffix='.epub', delete=False) + tmp.write(b'esto no es un epub valido') + tmp.close() + try: + with pytest.raises(Exception): + epub_to_markdown(tmp.name) + finally: + os.unlink(tmp.name) diff --git a/python/functions/core/estimate_token_count.md b/python/functions/core/estimate_token_count.md new file mode 100644 index 00000000..e81e6acd --- /dev/null +++ b/python/functions/core/estimate_token_count.md @@ -0,0 +1,37 @@ +--- +name: estimate_token_count +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "def estimate_token_count(content: str) -> int" +description: "Estimacion rapida de tokens sin tokenizer. CJK chars cuentan ~0.7 token/char, otros non-whitespace ~0.3 token/char." +tags: [tokens, estimation, nlp, cjk, text] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [re] +tested: true +tests: + - "texto vacio retorna cero" + - "solo latin" + - "solo CJK" + - "texto mixto" +test_file_path: "python/functions/core/parse_markdown_test.py" +file_path: "python/functions/core/core.py" +--- + +## Ejemplo + +```python +estimate_token_count("hello world") # 3 +estimate_token_count("中文语") # 2 (3 * 0.7 = 2) +estimate_token_count("") # 0 +``` + +## Notas + +Funcion pura. No requiere ninguna dependencia externa. Precision aproximada: util para guardianes de limite de contexto antes de llamar a LLMs, no para conteo exacto de tokens BPE. CJK range: `[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]` (CJK unificado, Hiragana/Katakana, Hangul). 
diff --git a/python/functions/core/excel_to_markdown.md b/python/functions/core/excel_to_markdown.md new file mode 100644 index 00000000..22f5c4a6 --- /dev/null +++ b/python/functions/core/excel_to_markdown.md @@ -0,0 +1,58 @@ +--- +name: excel_to_markdown +kind: function +lang: py +domain: core +version: "1.0.0" +purity: impure +signature: "excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str" +description: "Convierte un archivo Excel (.xlsx, .xls, .xlsm) a markdown con cada sheet como seccion H2. Soporta tipos de celda: fechas ISO, booleanos, errores Excel, numeros enteros y flotantes. Trunca sheets que superen max_rows_per_sheet." +tags: [excel, markdown, xlsx, xls, conversion, parser, io] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: ["openpyxl", "xlrd"] +tested: true +tests: + - "xlsx con multiples sheets produce una seccion H2 por sheet" + - "sheet vacio produce nota de sheet vacio" + - "sheet truncado con nota de filas omitidas" + - "sheet con formulas data_only muestra valores calculados" + - "extension no soportada lanza ValueError" + - "archivo inexistente lanza FileNotFoundError" + - "dimensiones del sheet en metadata" + - "tabla markdown con formato correcto" +test_file_path: "python/functions/core/excel_to_markdown_test.py" +file_path: "python/functions/core/excel_to_markdown.py" +--- + +## Ejemplo + +```python +from excel_to_markdown import excel_to_markdown + +md = excel_to_markdown("report.xlsx") +print(md) +# ## Sheet: Ventas +# +# **Dimensions:** 101 x 4 +# +# | Producto | Precio | Cantidad | Total | +# | --- | --- | --- | --- | +# | Manzana | 1 | 100 | 100 | +# ... + +# Con limite de filas +md = excel_to_markdown("big_file.xlsx", max_rows_per_sheet=50) +``` + +## Notas + +- `.xlsx` y `.xlsm`: usa `openpyxl` con `data_only=True` (lee valores calculados, no formulas). +- `.xls` (legacy): usa `xlrd`. 
"""Convert Excel workbooks to Markdown, rendering each sheet as an H2 section."""

import datetime
import os
from itertools import islice
from pathlib import Path


# Excel error codes as reported by xlrd, mapped to their display text.
_XL_ERROR_CODES = {
    0: "#NULL!",
    7: "#DIV/0!",
    15: "#VALUE!",
    23: "#REF!",
    29: "#NAME?",
    36: "#NUM!",
    42: "#N/A",
}


def _rows_to_markdown_table(rows: list[list[str]]) -> str:
    """Render rows of strings as a Markdown table.

    The first row is the header and fixes the column count: shorter rows
    are padded with empty cells, longer rows are truncated. Pipes are
    escaped and line breaks inside a cell are replaced by a space, since
    either would break the table layout.

    Returns:
        The table text, or "" when ``rows`` is empty.
    """
    if not rows:
        return ""

    col_count = len(rows[0])

    def escape(cell: str) -> str:
        return (
            cell.replace("|", "\\|")
            .replace("\r\n", " ")
            .replace("\r", " ")
            .replace("\n", " ")
        )

    def render(row: list[str]) -> str:
        cells = list(row[:col_count]) + [""] * (col_count - len(row))
        return "| " + " | ".join(escape(c) for c in cells) + " |"

    lines = [render(rows[0])]
    lines.append("| " + " | ".join("---" for _ in range(col_count)) + " |")
    lines.extend(render(row) for row in rows[1:])
    return "\n".join(lines)


def _number_to_str(value) -> str:
    """Render a number, dropping a redundant ``.0`` from whole floats."""
    # is_integer() is False for inf/nan, where int(value) would raise.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _datetime_to_str(value: datetime.datetime) -> str:
    """Render a datetime as ISO 8601, date-only when it is exactly midnight."""
    if value.time() == datetime.time(0):
        return value.date().isoformat()
    return value.isoformat()


def _render_sheet(name: str, nrows: int, ncols: int, rows, max_rows: int) -> str:
    """Build the Markdown section shared by the xlrd and openpyxl paths.

    Args:
        name: Sheet title used in the H2 heading.
        nrows: Total number of rows in the sheet.
        ncols: Total number of columns in the sheet.
        rows: Iterable of already-stringified rows; only the first
            ``max_rows`` are consumed.
        max_rows: Maximum rows to display; negative values are treated as 0.
    """
    lines = [f"## Sheet: {name}", "", f"**Dimensions:** {nrows} x {ncols}", ""]

    if nrows == 0 or ncols == 0:
        lines.append("*(empty sheet)*")
        return "\n".join(lines)

    limit = max(0, max_rows)
    lines.append(_rows_to_markdown_table(list(islice(rows, limit))))

    if nrows > limit:
        lines.append("")
        lines.append(f"*{nrows - limit} rows omitted (max_rows_per_sheet={limit})*")

    return "\n".join(lines)


def _cell_value_xlrd(cell, workbook) -> str:
    """Convert an xlrd cell to a string according to its cell type."""
    import xlrd  # deferred: only needed for legacy .xls input

    ctype = cell.ctype

    if ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK):
        return ""
    if ctype == xlrd.XL_CELL_DATE:
        try:
            moment = xlrd.xldate_as_datetime(cell.value, workbook.datemode)
        except Exception:
            # Best effort: an unconvertible serial is shown as the raw value.
            return str(cell.value)
        return _datetime_to_str(moment)
    if ctype == xlrd.XL_CELL_BOOLEAN:
        return "TRUE" if cell.value else "FALSE"
    if ctype == xlrd.XL_CELL_ERROR:
        return _XL_ERROR_CODES.get(int(cell.value), "#ERROR!")
    if ctype == xlrd.XL_CELL_NUMBER:
        return _number_to_str(cell.value)
    # XL_CELL_TEXT and any other type: plain string conversion.
    return str(cell.value)


def _sheet_xlrd(sheet, workbook, max_rows: int) -> str:
    """Convert an xlrd sheet to a Markdown section."""
    nrows = sheet.nrows
    ncols = sheet.ncols
    rows = (
        [_cell_value_xlrd(sheet.cell(r, c), workbook) for c in range(ncols)]
        for r in range(nrows)
    )
    return _render_sheet(sheet.name, nrows, ncols, rows, max_rows)


def _cell_value_openpyxl(cell) -> str:
    """Convert an openpyxl cell to a string based on its Python value."""
    v = cell.value
    if v is None:
        return ""
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(v, bool):
        return "TRUE" if v else "FALSE"
    if isinstance(v, (int, float)):
        return _number_to_str(v)
    # datetime must be tested before date: datetime is a subclass of date.
    if isinstance(v, datetime.datetime):
        return _datetime_to_str(v)
    if isinstance(v, datetime.date):
        return v.isoformat()
    return str(v)


def _sheet_openpyxl(ws, max_rows: int) -> str:
    """Convert an openpyxl worksheet to a Markdown section."""
    all_rows = list(ws.iter_rows())
    rows = ([_cell_value_openpyxl(cell) for cell in row] for row in all_rows)
    return _render_sheet(ws.title, len(all_rows), ws.max_column or 0, rows, max_rows)


def excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str:
    """Convert an Excel file (.xlsx, .xls, .xlsm) to Markdown.

    Every sheet becomes an H2 section containing a Markdown table. When a
    sheet has more rows than ``max_rows_per_sheet`` it is truncated and a
    note with the number of omitted rows is appended.

    Args:
        path: Path to the Excel file (.xlsx, .xls, .xlsm).
        max_rows_per_sheet: Maximum rows to include per sheet (default 1000).
            Negative values are treated as 0.

    Returns:
        Markdown string with all sheets of the file.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the extension is not supported.
        Exception: If the underlying reader fails to open the file.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"File not found: {path}")

    ext = p.suffix.lower()

    if ext == ".xls":
        import xlrd

        wb = xlrd.open_workbook(path)
        sections = []
        for sheet_name in wb.sheet_names():
            sheet = wb.sheet_by_name(sheet_name)
            sections.append(_sheet_xlrd(sheet, wb, max_rows_per_sheet))
        return "\n\n".join(sections)

    if ext in (".xlsx", ".xlsm"):
        import openpyxl

        # data_only=True reads cached formula results instead of formulas.
        wb = openpyxl.load_workbook(path, data_only=True)
        sections = [_sheet_openpyxl(ws, max_rows_per_sheet) for ws in wb.worksheets]
        return "\n\n".join(sections)

    raise ValueError(f"Unsupported extension '{ext}'. Use .xlsx, .xls, or .xlsm.")
"""Tests for excel_to_markdown."""

import datetime
import os
import sys
import tempfile

import openpyxl
import pytest

sys.path.insert(0, os.path.dirname(__file__))
from excel_to_markdown import excel_to_markdown


def _make_xlsx(sheets: dict, filename: str, directory=None) -> str:
    """Create an .xlsx file with the given sheets and return its path.

    Args:
        sheets: Mapping of sheet name to a list of rows.
        filename: File name of the workbook.
        directory: Target directory. Tests pass pytest's ``tmp_path`` so the
            file is cleaned up automatically; when omitted a fresh
            ``mkdtemp()`` directory is used and left for the caller to remove.
    """
    wb = openpyxl.Workbook()
    for index, (sheet_name, rows) in enumerate(sheets.items()):
        if index == 0:
            # Reuse the sheet openpyxl creates by default.
            ws = wb.active
            ws.title = sheet_name
        else:
            ws = wb.create_sheet(sheet_name)
        for row in rows:
            ws.append(row)
    target_dir = tempfile.mkdtemp() if directory is None else str(directory)
    path = os.path.join(target_dir, filename)
    wb.save(path)
    return path


def test_xlsx_multiples_sheets(tmp_path):
    """A workbook with several sheets yields one H2 section per sheet."""
    path = _make_xlsx(
        {
            "Ventas": [["Producto", "Precio", "Cantidad"], ["Manzana", 1.5, 100], ["Pera", 2.0, 50]],
            "Resumen": [["Total", "Importe"], ["150", "225.0"]],
        },
        "multi.xlsx",
        tmp_path,
    )
    result = excel_to_markdown(path)

    assert "## Sheet: Ventas" in result
    assert "## Sheet: Resumen" in result
    assert "Producto" in result
    assert "Manzana" in result
    assert "Total" in result


def test_sheet_vacio(tmp_path):
    """A sheet without rows produces the empty-sheet note."""
    path = _make_xlsx({"Vacio": []}, "empty.xlsx", tmp_path)
    result = excel_to_markdown(path)

    assert "## Sheet: Vacio" in result
    assert "empty sheet" in result


def test_sheet_truncado(tmp_path):
    """A sheet longer than max_rows_per_sheet is truncated with a note."""
    rows = [["col"]] + [[str(i)] for i in range(20)]
    path = _make_xlsx({"Data": rows}, "big.xlsx", tmp_path)
    result = excel_to_markdown(path, max_rows_per_sheet=5)

    assert "omitted" in result
    # 21 rows in total, 5 shown -> 16 omitted.
    assert "16 rows omitted" in result


def test_sheet_con_formulas_data_only(tmp_path):
    """A workbook containing a formula is read without failing."""
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Formulas"
    ws.append(["A", "B", "Suma"])
    ws.append([1, 2, "=A2+B2"])
    path = str(tmp_path / "formulas.xlsx")
    wb.save(path)

    result = excel_to_markdown(path)
    assert "## Sheet: Formulas" in result
    # openpyxl never computes formulas, so with data_only=True the formula
    # cell may read back as None; only the header is asserted.
    assert "Suma" in result


def test_xls_legacy_con_fechas(tmp_path):
    """Legacy .xls input (read via xlrd) with a date cell is accepted."""
    # xlwt is only needed to build the fixture; skip when it is unavailable.
    pytest.importorskip("xlwt", reason="xlwt no disponible para crear .xls de prueba")
    import xlwt

    wb = xlwt.Workbook()
    ws = wb.add_sheet("Fechas")
    ws.write(0, 0, "Nombre")
    ws.write(0, 1, "Fecha")
    ws.write(1, 0, "Evento A")

    date_format = xlwt.XFStyle()
    date_format.num_format_str = "YYYY-MM-DD"
    # Convert the date to an Excel serial (days since 1899-12-30).
    ws.write(1, 1, datetime.date(2024, 1, 15).toordinal() - 693594, date_format)

    path = str(tmp_path / "legacy.xls")
    wb.save(path)

    result = excel_to_markdown(path)
    assert "## Sheet: Fechas" in result
    assert "Evento A" in result


def test_extension_no_soportada(tmp_path):
    """An unsupported extension raises ValueError."""
    path = str(tmp_path / "data.csv")
    with open(path, "w") as f:
        f.write("a,b\n1,2\n")

    with pytest.raises(ValueError, match="Unsupported extension"):
        excel_to_markdown(path)


def test_archivo_no_existe(tmp_path):
    """A missing file raises FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        excel_to_markdown(str(tmp_path / "no_existe_para_nada.xlsx"))


def test_dimensiones_en_metadata(tmp_path):
    """The Markdown includes the sheet dimensions."""
    path = _make_xlsx({"Hoja1": [["A", "B"], [1, 2], [3, 4]]}, "dims.xlsx", tmp_path)
    result = excel_to_markdown(path)
    assert "**Dimensions:**" in result
    assert "3 x 2" in result


def test_tabla_markdown_formato(tmp_path):
    """The table is well-formed and has a header separator row."""
    path = _make_xlsx({"Datos": [["Col1", "Col2"], ["val1", "val2"]]}, "fmt.xlsx", tmp_path)
    result = excel_to_markdown(path)
    # A separator line made of --- must be present.
    assert "| --- |" in result or "| --- | --- |" in result
    assert "Col1" in result
    assert "val1" in result
+tags: [markdown, frontmatter, yaml, parsing] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [re, yaml] +tested: true +tests: + - "contenido con frontmatter" + - "sin frontmatter retorna None" + - "frontmatter vacio" + - "frontmatter con listas" +test_file_path: "python/functions/core/parse_markdown_test.py" +file_path: "python/functions/core/core.py" +--- + +## Ejemplo + +```python +content = "---\ntitle: Hello\nauthor: Alice\n---\n# Body\n" +remaining, data = extract_frontmatter(content) +# remaining = "# Body\n" +# data = {"title": "Hello", "author": "Alice"} + +no_fm = "# Just markdown\n\nNo frontmatter." +remaining, data = extract_frontmatter(no_fm) +# remaining == no_fm +# data is None +``` + +## Notas + +Funcion pura. Usa `yaml.safe_load` si PyYAML esta disponible; si no, cae back a un parser simple de `key: value`. Solo reconoce frontmatter al inicio estricto del string (posicion 0). El bloque debe estar delimitado por `---\n` de apertura y `\n---\n` de cierre. diff --git a/python/functions/core/extract_json_from_llm.md b/python/functions/core/extract_json_from_llm.md new file mode 100644 index 00000000..0fedff75 --- /dev/null +++ b/python/functions/core/extract_json_from_llm.md @@ -0,0 +1,36 @@ +--- +name: extract_json_from_llm +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "def extract_json_from_llm(content: str) -> dict" +description: "Extrae y parsea JSON de respuestas LLM. Maneja bloques ```json, trailing commas, None->null." 
+tags: [json, llm, parsing, extraction] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [json] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/core/core.py" +source_repo: "https://github.com/VectifyAI/PageIndex" +source_license: "MIT" +source_file: "pageindex/utils.py" +--- + +## Ejemplo + +```python +raw = '```json\n{"key": "value", "items": [1, 2, 3,]}\n```' +result = extract_json_from_llm(raw) +# {"key": "value", "items": [1, 2, 3]} +``` + +## Notas + +Funcion pura. Maneja errores comunes de LLMs: trailing commas, `None` en lugar de `null`, whitespace extra. Retorna dict vacio si el JSON es irrecuperable. diff --git a/python/functions/core/extract_markdown_headers.md b/python/functions/core/extract_markdown_headers.md new file mode 100644 index 00000000..2983e59d --- /dev/null +++ b/python/functions/core/extract_markdown_headers.md @@ -0,0 +1,36 @@ +--- +name: extract_markdown_headers +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "def extract_markdown_headers(markdown_content: str) -> tuple[list[dict], list[str]]" +description: "Extrae todos los headers (h1-h6) de markdown con nivel y numero de linea, ignorando code blocks." +tags: [markdown, parsing, headers, extraction] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [re] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/core/core.py" +source_repo: "https://github.com/VectifyAI/PageIndex" +source_license: "MIT" +source_file: "pageindex/page_index_md.py" +--- + +## Ejemplo + +```python +md = "# Title\n\nSome text\n\n## Section\n\n```\n# not a header\n```" +headers, lines = extract_markdown_headers(md) +# headers = [{"title": "Title", "level": 1, "line_num": 1}, {"title": "Section", "level": 2, "line_num": 5}] +``` + +## Notas + +Funcion pura. Detecta y omite bloques de codigo (triple backtick). 
Retorna tupla: (lista de headers, lista de lineas originales). diff --git a/python/functions/core/extract_pdf_bookmarks.md b/python/functions/core/extract_pdf_bookmarks.md new file mode 100644 index 00000000..0da39039 --- /dev/null +++ b/python/functions/core/extract_pdf_bookmarks.md @@ -0,0 +1,37 @@ +--- +name: extract_pdf_bookmarks +kind: function +lang: py +domain: core +version: "1.0.0" +purity: impure +signature: "def extract_pdf_bookmarks(pdf) -> list[dict]" +description: "Extrae la estructura de bookmarks/outlines de un PDF abierto con pdfplumber. Retorna lista de dicts con level (1-6), title y page_num." +tags: [pdf, bookmarks, outlines, parsing, pdfplumber] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [pdfplumber] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/core/extract_pdf_bookmarks.py" +--- + +## Ejemplo + +```python +import pdfplumber +from extract_pdf_bookmarks import extract_pdf_bookmarks + +with pdfplumber.open("document.pdf") as pdf: + bookmarks = extract_pdf_bookmarks(pdf) + for bm in bookmarks: + print(f"{'#' * bm['level']} {bm['title']} (page {bm['page_num']})") +``` + +## Notas + +Recibe un objeto `pdfplumber.PDF` ya abierto (no un path). Construye un mapping interno `objid -> page_number` desde `pdf.pages` para resolver los destinos de outline. El nivel se limita al rango [1, 6] para compatibilidad markdown. Retorna lista vacia si el PDF no tiene outlines o si `get_outlines()` falla. Impure porque accede al estado interno de un objeto PDF ya abierto. 
"""Extract the bookmark/outline structure from a PDF opened with pdfplumber."""

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only the annotation needs pdfplumber; the function itself works on the
    # object it is handed, so the module stays importable without it.
    import pdfplumber


def _object_id(obj):
    """Return the PDF object id carried by ``obj``, or None.

    Page objects are probed for ``pageid`` first and ``objid`` second;
    indirect references (outline destinations) carry ``objid``.
    """
    for attr in ("pageid", "objid"):
        value = getattr(obj, attr, None)
        if value is not None:
            return value
    return None


def extract_pdf_bookmarks(pdf: "pdfplumber.PDF") -> list[dict]:
    """Extract bookmarks/outlines from an open pdfplumber PDF object.

    Args:
        pdf: An open pdfplumber.PDF object.

    Returns:
        list[dict]: List of {"level": int, "title": str, "page_num": int | None}.
        Level is clamped to [1, 6]. ``page_num`` is 1-indexed, or None when
        the destination cannot be matched to a page. Returns an empty list
        if the document has no outlines or reading them fails.
    """
    try:
        # Materialise inside the try: the outline iterator may be lazy, so
        # errors can surface while iterating rather than at call time.
        outlines = list(pdf.doc.get_outlines())
    except Exception:
        return []

    if not outlines:
        return []

    # Map each page's PDF object id to its 1-indexed page number.
    objid_to_page: dict[int, int] = {}
    for number, page in enumerate(pdf.pages, start=1):
        page_id = _object_id(getattr(page, "page_obj", None))
        if page_id is not None:
            objid_to_page[page_id] = number

    bookmarks = []
    for item in outlines:
        try:
            level, title, dest = item[0], item[1], item[2]
            level = max(1, min(6, level))  # clamp for Markdown compatibility
        except (TypeError, IndexError, KeyError):
            # Malformed outline entry: skip it, keep the rest.
            continue

        # An explicit destination is a list whose first item references the
        # page; otherwise try the destination object itself.
        target = dest[0] if isinstance(dest, list) and dest else dest
        target_id = getattr(target, "objid", None)
        page_num = objid_to_page.get(target_id) if target_id is not None else None

        bookmarks.append({"level": level, "title": str(title), "page_num": page_num})

    return bookmarks
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path to the PDF file, handed to ``PyPDF2.PdfReader``.

    Returns:
        str: The page texts joined with no separator. A page for which
        ``extract_text()`` returns a falsy value contributes nothing.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    return "".join(page.extract_text() or "" for page in reader.pages)
Soporta PDF (PyMuPDF), Markdown y TXT con deteccion automatica de encoding." +tags: [text, pdf, markdown, txt, encoding, extraction, file, io] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: ["os", "fitz (PyMuPDF)", "charset_normalizer", "chardet"] +tested: true +tests: + - "PDF con texto extrae contenido correctamente" + - "archivo MD UTF-8 retorna contenido" + - "archivo TXT latin-1 detecta encoding" + - "archivo inexistente lanza FileNotFoundError" + - "extension no soportada lanza ValueError" +test_file_path: "python/functions/core/extract_text_from_file_test.py" +file_path: "python/functions/core/extract_text_from_file.py" +--- + +## Ejemplo + +```python +# PDF +text = extract_text_from_file("report.pdf") + +# Markdown +text = extract_text_from_file("README.md") + +# TXT con encoding desconocido +text = extract_text_from_file("notes.txt") +``` + +## Notas + +Para PDF usa PyMuPDF (`fitz`) que produce mejor texto que PyPDF2, especialmente en PDFs con columnas o layout complejo. Las paginas se unen con `\n\n`. + +La deteccion de encoding para archivos de texto sigue este orden de prioridad: +1. Intenta UTF-8 directamente +2. `charset_normalizer.from_bytes().best().encoding` +3. `chardet.detect(data)["encoding"]` +4. UTF-8 con `errors='replace'` como ultimo recurso + +Diferencia con `extract_pdf_text_py_core`: esa funcion usa PyPDF2 y solo soporta PDF. Esta funcion usa PyMuPDF y soporta ademas MD y TXT con deteccion de encoding. 
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}

# Extensions that are read as plain text with encoding detection.
_TEXT_EXTENSIONS = SUPPORTED_EXTENSIONS - {".pdf"}


def _detect_encoding(data: bytes) -> str:
    """Pick a codec name for raw bytes, trying progressively weaker strategies.

    Order: byte-order mark, strict UTF-8, ``charset_normalizer`` (if
    installed), ``chardet`` (if installed), and finally ``"utf-8"`` (the
    caller then decodes with ``errors="replace"``).

    Args:
        data: Raw file content.

    Returns:
        str: A codec name suitable for ``bytes.decode``.
    """
    import codecs

    # Strategy 0: an explicit BOM is authoritative. The "-sig"/generic codecs
    # consume the mark, so it does not leak into the text as U+FEFF.
    # UTF-32 is tested before UTF-16 because the UTF-32-LE mark starts with
    # the UTF-16-LE mark.
    bom_codecs = (
        (codecs.BOM_UTF8, "utf-8-sig"),
        (codecs.BOM_UTF32_LE, "utf-32"),
        (codecs.BOM_UTF32_BE, "utf-32"),
        (codecs.BOM_UTF16_LE, "utf-16"),
        (codecs.BOM_UTF16_BE, "utf-16"),
    )
    for bom, codec_name in bom_codecs:
        if data.startswith(bom):
            return codec_name

    # Strategy 1: UTF-8
    try:
        data.decode("utf-8")
        return "utf-8"
    except UnicodeDecodeError:
        pass

    # Strategy 2: charset_normalizer (optional dependency)
    try:
        from charset_normalizer import from_bytes

        result = from_bytes(data).best()
        if result is not None and result.encoding:
            return result.encoding
    except ImportError:
        pass

    # Strategy 3: chardet (optional dependency)
    try:
        import chardet

        detected = chardet.detect(data)
        if detected and detected.get("encoding"):
            return detected["encoding"]
    except ImportError:
        pass

    # Last resort: UTF-8; the caller falls back to errors="replace".
    return "utf-8"


def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a PDF, Markdown or TXT file.

    PDF files are read with PyMuPDF (``fitz``); the text of each page is
    joined with a blank line. Text files (.md, .markdown, .txt) are read as
    bytes and decoded with automatic encoding detection, falling back to
    UTF-8 with replacement characters if the detected codec fails.

    Args:
        file_path: Absolute or relative path to the file. The extension is
            matched case-insensitively.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the path does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e

        # Close the document deterministically instead of leaking the handle.
        with fitz.open(file_path) as doc:
            return "\n\n".join(page.get_text() for page in doc)

    if ext in _TEXT_EXTENSIONS:
        with open(file_path, "rb") as f:
            raw = f.read()

        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or unknown codec name: never fail, degrade instead.
            return raw.decode("utf-8", errors="replace")

    raise ValueError(
        f"Unsupported file extension: '{ext}'. "
        f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    )
assert "# Titulo" in result + assert "cafe" in result + finally: + os.unlink(tmp_path) + + +def test_archivo_txt_latin1_detecta_encoding(): + """archivo TXT latin-1 detecta encoding.""" + content = "Texto en latin-1: cafe, hotel, naive\n" + with tempfile.NamedTemporaryFile( + suffix=".txt", mode="wb", delete=False + ) as f: + f.write(content.encode("latin-1")) + tmp_path = f.name + try: + result = extract_text_from_file(tmp_path) + # The word "cafe" or similar should appear in the decoded result + assert len(result) > 0 + assert "cafe" in result or "caf" in result + finally: + os.unlink(tmp_path) + + +def test_archivo_inexistente_lanza_filenotfounderror(): + """archivo inexistente lanza FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + extract_text_from_file("/tmp/no_existe_este_archivo_12345.txt") + + +def test_extension_no_soportada_lanza_valueerror(): + """extension no soportada lanza ValueError.""" + with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f: + f.write(b"fake docx content") + tmp_path = f.name + try: + with pytest.raises(ValueError, match="Unsupported file extension"): + extract_text_from_file(tmp_path) + finally: + os.unlink(tmp_path) diff --git a/python/functions/core/fetch_and_parse_url.md b/python/functions/core/fetch_and_parse_url.md new file mode 100644 index 00000000..ce6e817f --- /dev/null +++ b/python/functions/core/fetch_and_parse_url.md @@ -0,0 +1,50 @@ +--- +name: fetch_and_parse_url +kind: function +lang: py +domain: core +version: "1.0.0" +purity: impure +signature: "fetch_and_parse_url(url: str, timeout: float = 30.0) -> str" +description: "Descarga una pagina web y la convierte a markdown. Combina detect_url_type + fetch HTML + html_to_markdown en una sola operacion." 
def fetch_and_parse_url(url: str, timeout: float = 30.0) -> str:
    """Download a URL and return its content as markdown or plain text.

    The URL is first classified with ``detect_url_type`` (called with the
    same ``timeout``). Depending on the detected type:

    - ``code_repository`` and ``pdf`` are rejected with an exception;
    - ``webpage``, or any response whose Content-Type contains
      ``text/html``, is converted with ``html_to_markdown``;
    - anything else is returned as the raw response text.

    Args:
        url: URL to download and parse.
        timeout: Timeout in seconds, used for the type detection and the GET.

    Returns:
        The content of the URL, converted to markdown when it is HTML.

    Raises:
        Exception: If the URL type is not supported, if the server answers
            with an HTTP error status, or if the request fails for any other
            reason. The original error is chained as ``__cause__``.
    """
    import httpx

    from detect_url_type import detect_url_type
    from html_to_markdown import html_to_markdown

    # Classify first; unsupported kinds are rejected before the GET below.
    kind, _ = detect_url_type(url, timeout=timeout)

    if kind == "code_repository":
        rejection = f"fetch_and_parse_url: code_repository URLs require git clone, not supported. url={url!r}"
    elif kind == "pdf":
        rejection = f"fetch_and_parse_url: PDF parsing requires external dependency (pdfminer/pypdf). url={url!r}"
    else:
        rejection = None

    if rejection is not None:
        raise Exception(rejection)

    # Download the resource, following redirects.
    try:
        reply = httpx.get(url, timeout=timeout, follow_redirects=True)
        reply.raise_for_status()
    except httpx.HTTPStatusError as exc:
        status = exc.response.status_code
        raise Exception(f"fetch_and_parse_url: HTTP {status} for {url!r}") from exc
    except Exception as exc:
        raise Exception(f"fetch_and_parse_url: request failed for {url!r}: {exc}") from exc

    declared_type = reply.headers.get("content-type", "").lower()
    body = reply.text

    looks_like_html = kind == "webpage" or "text/html" in declared_type
    if not looks_like_html:
        # markdown, text or code files are passed through untouched
        return body
    return html_to_markdown(body)
Excluye tres tipos de contextos: bloques de codigo triple backtick, comentarios HTML (`<!-- ... -->`), y lineas indentadas con 4 espacios o tabulacion. Tambien filtra headings precedidos por backslash (`\#`). Diferencia clave respecto a `extract_markdown_headers`: esta funcion retorna posiciones de caracter, no numeros de linea, lo que facilita la extraccion de contenido entre headings. diff --git a/python/functions/core/flatten_tree.md b/python/functions/core/flatten_tree.md new file mode 100644 index 00000000..2708363c --- /dev/null +++ b/python/functions/core/flatten_tree.md @@ -0,0 +1,36 @@ +--- +name: flatten_tree +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "def flatten_tree(structure: Any) -> list[dict]" +description: "Aplana un arbol jerarquico (dict con 'nodes') a lista plana sin hijos. Deep copy de cada nodo." +tags: [tree, flatten, hierarchy, functional] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [copy] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/core/core.py" +source_repo: "https://github.com/VectifyAI/PageIndex" +source_license: "MIT" +source_file: "pageindex/utils.py" +--- + +## Ejemplo + +```python +tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}]}] +flatten_tree(tree) +# [{"title": "A"}, {"title": "A1"}] +``` + +## Notas + +Funcion pura. Usa deep copy para no mutar el arbol original. Elimina el campo 'nodes' de cada nodo aplanado. diff --git a/python/functions/core/format_iso8601.md b/python/functions/core/format_iso8601.md new file mode 100644 index 00000000..0fa79940 --- /dev/null +++ b/python/functions/core/format_iso8601.md @@ -0,0 +1,49 @@ +--- +name: format_iso8601 +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "format_iso8601(dt: datetime) -> str" +description: "Formatea un datetime a ISO 8601 UTC con milisegundos. Formato: yyyy-MM-ddTHH:mm:ss.SSSZ. 
def format_iso8601(dt: datetime) -> str:
    """Render a datetime as an ISO 8601 UTC timestamp with milliseconds.

    Output shape: ``yyyy-MM-ddTHH:mm:ss.SSSZ``.

    A naive datetime (no tzinfo) is taken to already be in UTC; an aware
    datetime is converted to UTC first. Microseconds are truncated, not
    rounded, to millisecond precision.

    Args:
        dt: The datetime to format, naive or aware.

    Returns:
        The UTC timestamp string, always ending in ``Z``.
    """
    as_utc = (
        dt.replace(tzinfo=timezone.utc)
        if dt.tzinfo is None
        else dt.astimezone(timezone.utc)
    )
    # With the zone dropped, isoformat() emits no offset, so the UTC
    # designator can simply be appended.
    wall_clock = as_utc.replace(tzinfo=None)
    return wall_clock.isoformat(timespec="milliseconds") + "Z"
def format_simplified(dt: datetime, now: datetime) -> str:
    """Format a datetime compactly: time if it is close to ``now``, else date.

    "Close" means strictly less than 24 hours away from ``now`` in either
    direction (a rolling window, not a calendar-day comparison). Within that
    window the result is ``HH:MM:SS``; otherwise it is ``YYYY-MM-DD``.

    Both values are compared by wall-clock reading: any tzinfo is dropped
    before subtracting, so naive and aware datetimes can be mixed, but no
    timezone conversion is performed.

    Args:
        dt: The datetime to format.
        now: The reference datetime (the current moment).

    Returns:
        ``HH:MM:SS`` when ``dt`` is less than 24 hours from ``now``,
        ``YYYY-MM-DD`` otherwise.
    """
    # Use the absolute gap: timedelta.days is negative for any future dt,
    # which would otherwise make dates days ahead render as a bare time.
    gap = abs(now.replace(tzinfo=None) - dt.replace(tzinfo=None))
    if gap.days < 1:
        return dt.strftime("%H:%M:%S")
    return dt.strftime("%Y-%m-%d")
def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str:
    """Convert a 2D collection of cells to an aligned markdown table.

    Rows may be lists or tuples (any sized sequence) and may have different
    lengths; short rows are padded with empty cells. Cells are converted with
    ``str()`` and pipe characters are escaped as ``\\|``. Every column is at
    least 3 characters wide so the separator row is valid markdown.

    Args:
        rows: Sequence of rows, each a sequence of cell values.
        has_header: If True, the first row is the header and a separator row
            of dashes is emitted right after it. If False, no separator row
            is emitted.

    Returns:
        str: The table, one row per line, without a trailing newline.
        Returns an empty string for empty input.
    """
    if not rows:
        return ""

    col_count = max(len(row) for row in rows)

    # Escape pipes and pad every row to the widest row's length. Building a
    # fresh list per row also lets tuple rows through.
    table = [
        [str(cell).replace("|", "\\|") for cell in row] + [""] * (col_count - len(row))
        for row in rows
    ]

    # Widest cell per column, with a floor of 3 for the "---" separator.
    widths = [max(3, *(len(row[col]) for row in table)) for col in range(col_count)]

    def render(cells: list[str]) -> str:
        padded = (cell.ljust(width) for cell, width in zip(cells, widths))
        return "| " + " | ".join(padded) + " |"

    lines = [render(row) for row in table]
    if has_header:
        # The separator goes directly under the header row.
        lines.insert(1, render(["-" * width for width in widths]))

    return "\n".join(lines)
["", "x"], ["y", ""]] + result = format_table_to_markdown(rows) + assert "|" in result + lines = result.split("\n") + assert len(lines) == 4 # header + separator + 2 data rows + + +def test_tabla_con_1_fila(): + rows = [["Solo", "Row"]] + result = format_table_to_markdown(rows) + lines = result.split("\n") + # header + separator (no data rows) + assert len(lines) == 2 + assert "Solo" in lines[0] + assert "---" in lines[1] + + +def test_tabla_vacia(): + result = format_table_to_markdown([]) + assert result == "" + + +def test_celdas_con_pipes(): + rows = [["Header"], ["cell|with|pipes"]] + result = format_table_to_markdown(rows) + assert "\\|" in result + + +def test_sin_header(): + rows = [["A", "B"], ["C", "D"]] + result = format_table_to_markdown(rows, has_header=False) + assert "---" not in result + lines = result.split("\n") + assert len(lines) == 2 + + +if __name__ == "__main__": + test_tabla_normal() + test_tabla_con_celdas_vacias() + test_tabla_con_1_fila() + test_tabla_vacia() + test_celdas_con_pipes() + test_sin_header() + print("All tests passed.") diff --git a/python/functions/core/format_tree_structure.md b/python/functions/core/format_tree_structure.md new file mode 100644 index 00000000..3baf870e --- /dev/null +++ b/python/functions/core/format_tree_structure.md @@ -0,0 +1,36 @@ +--- +name: format_tree_structure +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "def format_tree_structure(structure: Any, order: list[str] = None) -> Any" +description: "Reordena campos de cada nodo de un arbol segun orden de claves especificado." 
+tags: [tree, format, order, structure] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/core/core.py" +source_repo: "https://github.com/VectifyAI/PageIndex" +source_license: "MIT" +source_file: "pageindex/utils.py" +--- + +## Ejemplo + +```python +tree = [{"text": "...", "title": "Intro", "node_id": "0001"}] +format_tree_structure(tree, order=["title", "node_id", "text"]) +# [{"title": "Intro", "node_id": "0001", "text": "..."}] +``` + +## Notas + +Funcion pura. Elimina nodos vacios (nodes=[]) automaticamente. Claves no listadas en order se descartan. diff --git a/python/functions/core/from_csv.md b/python/functions/core/from_csv.md new file mode 100644 index 00000000..cb6cde11 --- /dev/null +++ b/python/functions/core/from_csv.md @@ -0,0 +1,49 @@ +--- +name: from_csv +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "from_csv(text: str, delimiter: str = ',', has_header: bool = True) -> list[dict]" +description: "Parser CSV a datos tabulares. Complemento de to_csv. Soporta campos entre comillas con escaping RFC 4180. Si has_header=False, genera keys col_0, col_1, etc." 
+tags: [csv, parser, import, tabular, format] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: + - "csv simple con header" + - "campos con escaping" + - "sin header keys generadas" + - "lineas vacias ignoradas" + - "un solo campo por fila" +test_file_path: "python/functions/core/from_csv_test.py" +file_path: "python/functions/core/from_csv.py" +--- + +## Ejemplo + +```python +text = "nombre,edad\r\nAna,30\r\nBob,25" +rows = from_csv(text) +# [{"nombre": "Ana", "edad": "30"}, {"nombre": "Bob", "edad": "25"}] + +# Sin header +text = "Ana,30\nBob,25" +rows = from_csv(text, has_header=False) +# [{"col_0": "Ana", "col_1": "30"}, {"col_0": "Bob", "col_1": "25"}] + +# Con escaping +text = 'a,b\r\n"dijo ""hola""","uno,dos"' +rows = from_csv(text) +# [{"a": 'dijo "hola"', "b": "uno,dos"}] +``` + +## Notas + +Parser manual sin el modulo csv de stdlib. Normaliza CRLF y LF antes de procesar. +Ignora lineas vacias. Todos los valores son strings — la conversion de tipos queda a cargo del caller. diff --git a/python/functions/core/from_csv.py b/python/functions/core/from_csv.py new file mode 100644 index 00000000..658d7bb7 --- /dev/null +++ b/python/functions/core/from_csv.py @@ -0,0 +1,83 @@ +"""Parser CSV a datos tabulares (RFC 4180). 
def _split_records(text: str, delimiter: str) -> list[str]:
    """Split CSV text into records, keeping newlines inside quoted fields.

    Uses the same quoting rule as ``_parse_row``: a double quote opens a
    quoted field only at the start of a field, and ``""`` inside a quoted
    field is an escaped quote. ``text`` must already use ``\\n`` line endings.
    """
    records: list[str] = []
    record_start = 0
    field_is_empty = True
    in_quotes = False
    i = 0
    n = len(text)

    while i < n:
        ch = text[i]
        if in_quotes:
            if ch != '"':
                field_is_empty = False
            elif i + 1 < n and text[i + 1] == '"':
                # Escaped quote: stays inside the quoted field.
                field_is_empty = False
                i += 2
                continue
            else:
                in_quotes = False
        elif ch == "\n":
            records.append(text[record_start:i])
            record_start = i + 1
            field_is_empty = True
        elif ch == '"' and field_is_empty:
            in_quotes = True
        elif ch == delimiter:
            field_is_empty = True
        else:
            field_is_empty = False
        i += 1

    records.append(text[record_start:])
    return records


def _parse_row(line: str, delimiter: str) -> list[str]:
    """Split one CSV record into fields, honoring RFC 4180 quoting.

    A double quote at the start of a field opens a quoted section in which
    the delimiter (and any newline) is literal and ``""`` stands for one
    quote character. The result always has at least one field.
    """
    fields: list[str] = []
    current: list[str] = []
    in_quotes = False
    i = 0
    n = len(line)

    while i < n:
        ch = line[i]
        if in_quotes:
            if ch != '"':
                current.append(ch)
            elif i + 1 < n and line[i + 1] == '"':
                # Doubled quote inside a quoted field -> one literal quote.
                current.append('"')
                i += 2
                continue
            else:
                in_quotes = False
        elif ch == '"' and not current:
            in_quotes = True
        elif ch == delimiter:
            fields.append("".join(current))
            current = []
        else:
            current.append(ch)
        i += 1

    fields.append("".join(current))
    return fields


def from_csv(
    text: str,
    delimiter: str = ",",
    has_header: bool = True,
) -> list[dict]:
    """Parse CSV text into a list of dicts (counterpart of ``to_csv``).

    Supports RFC 4180 quoting: quoted fields may contain the delimiter,
    escaped quotes (``""``) and line breaks. Line endings are normalized to
    ``\\n`` first (also inside quoted fields). Blank records are ignored.
    All values are strings; type conversion is left to the caller.

    Rows shorter than the header are padded with ``""``; extra fields beyond
    the header are dropped. An unterminated quoted field extends to the end
    of the text.

    Args:
        text: Complete CSV content.
        delimiter: Field separator. Defaults to a comma.
        has_header: If True, the first record provides the keys. If False,
            keys ``col_0``, ``col_1``, ... are generated from the width of
            the first record.

    Returns:
        One dict per data record. An empty list if the text is empty or
        contains only the header.
    """
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    # Split on record boundaries (not raw newlines) so quoted line breaks
    # stay inside their field, then drop whitespace-only records.
    records = [rec for rec in _split_records(normalized, delimiter) if rec.strip()]

    if not records:
        return []

    if has_header:
        headers = _parse_row(records[0], delimiter)
        data_records = records[1:]
    else:
        # Column count is taken from the first record.
        width = len(_parse_row(records[0], delimiter))
        headers = [f"col_{i}" for i in range(width)]
        data_records = records

    result: list[dict] = []
    for record in data_records:
        fields = _parse_row(record, delimiter)
        result.append(
            {
                header: fields[i] if i < len(fields) else ""
                for i, header in enumerate(headers)
            }
        )

    return result
{"valor": "hola"} + assert result[1] == {"valor": "mundo"} diff --git a/python/functions/core/from_jsonl.md b/python/functions/core/from_jsonl.md new file mode 100644 index 00000000..4e5e0736 --- /dev/null +++ b/python/functions/core/from_jsonl.md @@ -0,0 +1,49 @@ +--- +name: from_jsonl +kind: function +lang: py +domain: core +version: "1.0.0" +purity: pure +signature: "from_jsonl(text: str) -> list[dict]" +description: "Parser JSONL a lista de dicts. Ignora lineas vacias. Lanza ValueError con el numero de linea si una linea contiene JSON invalido. Complemento de to_jsonl." +tags: [jsonl, json, parser, import, streaming, format] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: ["json"] +tested: true +tests: + - "jsonl valido" + - "lineas vacias intercaladas" + - "linea invalida raise con numero" +test_file_path: "python/functions/core/from_jsonl_test.py" +file_path: "python/functions/core/from_jsonl.py" +--- + +## Ejemplo + +```python +text = '{"id": 1}\n{"id": 2}' +rows = from_jsonl(text) +# [{"id": 1}, {"id": 2}] + +# Lineas vacias ignoradas +text = '{"id": 1}\n\n{"id": 2}\n' +rows = from_jsonl(text) +# [{"id": 1}, {"id": 2}] + +# JSON invalido levanta error con numero de linea +try: + from_jsonl('{"ok": 1}\nnot-json') +except ValueError as e: + print(e) # "JSON invalido en linea 2: ..." +``` + +## Notas + +Aunque se declara pure (no hace I/O), puede lanzar ValueError para JSON invalido. +Esto es consistente con la convencion del registry: funciones puras pueden lanzar +excepciones de validacion — solo las funciones impuras retornan error como valor. diff --git a/python/functions/core/from_jsonl.py b/python/functions/core/from_jsonl.py new file mode 100644 index 00000000..66771b77 --- /dev/null +++ b/python/functions/core/from_jsonl.py @@ -0,0 +1,35 @@ +"""Parser JSON Lines (JSONL) a lista de dicts. 
import json


def from_jsonl(text: str) -> list[dict]:
    """Parse JSON Lines text into a list of decoded values (counterpart of to_jsonl).

    Records are separated only by LF, CRLF or CR. ``str.splitlines()`` is
    deliberately avoided: it also breaks on U+2028, U+2029 and U+0085, which
    are legal unescaped inside JSON strings, so a valid record containing
    them would be cut in half. Blank and whitespace-only lines are skipped
    but still counted for line numbering.

    Args:
        text: JSONL content, one JSON document per line.

    Returns:
        The decoded value of every non-blank line, in order. Values are
        returned as decoded by ``json.loads``; they are not checked to be
        dicts.

    Raises:
        ValueError: If a line is not valid JSON. The message includes the
            1-based line number and the decoder error.
    """
    result: list[dict] = []

    # CR cannot appear raw inside a JSON string, so treating it as a line
    # terminator is safe and keeps CRLF / CR-only input working.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            continue
        try:
            parsed = json.loads(stripped)
        except json.JSONDecodeError as exc:
            raise ValueError(
                f"JSON invalido en linea {line_num}: {exc}"
            ) from exc
        result.append(parsed)

    return result
list[dict]) -> str" +description: "Genera un reporte HTML autocontenido con CSS inline. Soporta secciones de tipo table (list[dict]), text (str con markdown basico), kpi (cards con label/value/delta) y list (list[str]). Para exportar resultados de pipelines sin servidor." +tags: [html, report, export, table, kpi, template, format] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: ["re"] +tested: true +tests: + - "reporte con una tabla" + - "reporte con multiples secciones mixtas" + - "kpi con deltas positivos y negativos" + - "caracteres especiales html escapados en data" + - "titulo con caracteres especiales" +test_file_path: "python/functions/core/generate_html_report_test.py" +file_path: "python/functions/core/generate_html_report.py" +--- + +## Ejemplo + +```python +sections = [ + { + "heading": "Resumen ejecutivo", + "type": "kpi", + "data": [ + {"label": "Revenue", "value": "$1.2M", "delta": "+15%"}, + {"label": "Churn", "value": "3.2%", "delta": "-0.5%"}, + ], + }, + { + "heading": "Top usuarios", + "type": "table", + "data": [ + {"usuario": "ana@example.com", "compras": 42}, + {"usuario": "bob@example.com", "compras": 38}, + ], + }, + { + "heading": "Notas", + "type": "text", + "data": "Datos del **trimestre Q1**. Ver [dashboard](https://example.com).", + }, +] + +html = generate_html_report("Reporte Mensual", sections) +# Retorna string HTML completo con DOCTYPE, head con CSS inline, body con secciones +``` + +## Tipos de seccion + +- **table**: `data` es `list[dict]` — renderiza `<table>` con headers extraidos de las keys +- **text**: `data` es `str` — soporta `**bold**` y `[text](url)`, escapa HTML +- **kpi**: `data` es `list[{"label", "value", "delta"}]` — cards con colores para delta positivo/negativo +- **list**: `data` es `list[str]` — renderiza `
  • ...
` + +## Notas + +CSS completamente inline en `\n" + "\n" + "\n" + f"

{_esc(title)}

\n" + f"{sections_html}" + "\n" + "" + ) diff --git a/python/functions/core/generate_html_report_test.py b/python/functions/core/generate_html_report_test.py new file mode 100644 index 00000000..fd314cc9 --- /dev/null +++ b/python/functions/core/generate_html_report_test.py @@ -0,0 +1,71 @@ +"""Tests para generate_html_report.""" + +from generate_html_report import generate_html_report + + +def test_reporte_con_una_tabla(): + sections = [ + { + "heading": "Datos", + "type": "table", + "data": [{"nombre": "Ana", "score": 99}, {"nombre": "Bob", "score": 87}], + } + ] + html = generate_html_report("Reporte", sections) + assert "" in html + assert "Reporte" in html + assert "" in html + assert "" in html + assert "zebra" in html # segunda fila tiene class zebra + + +def test_reporte_con_multiples_secciones_mixtas(): + sections = [ + {"heading": "Texto", "type": "text", "data": "Hola mundo"}, + {"heading": "Lista", "type": "list", "data": ["uno", "dos", "tres"]}, + {"heading": "KPIs", "type": "kpi", "data": [{"label": "Revenue", "value": "1M", "delta": None}]}, + ] + html = generate_html_report("Multi", sections) + assert "

Hola mundo

" in html + assert "
  • uno
  • " in html + assert "Revenue" in html + assert "1M" in html + + +def test_kpi_con_deltas_positivos_y_negativos(): + sections = [ + { + "heading": "Metricas", + "type": "kpi", + "data": [ + {"label": "Ganancia", "value": "5K", "delta": "+12%"}, + {"label": "Perdida", "value": "2K", "delta": "-5%"}, + {"label": "Estable", "value": "1K", "delta": "0%"}, + ], + } + ] + html = generate_html_report("KPIs", sections) + assert 'class="delta-pos"' in html + assert 'class="delta-neg"' in html + assert 'class="delta-neutral"' in html + assert "+12%" in html + assert "-5%" in html + + +def test_caracteres_especiales_html_escapados_en_data(): + sections = [ + { + "heading": "Codigo", + "type": "table", + "data": [{"expr": ""}], + } + ] + html = generate_html_report("Seguro", sections) + assert ""} + result = render_template("{{code}}", ctx) + assert "
    nombreAna