From 25a392df481ea6ffdf69d8d9a26acc71c6bf440f Mon Sep 17 00:00:00 2001
From: Egutierrez <egutierrez@dead.dd>
Date: Sun, 5 Apr 2026 17:11:21 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20funciones=20Python=20core=20=E2=80=94?=
 =?UTF-8?q?=20parsers,=20formatters,=20retry,=20serializaci=C3=B3n,=20LLM?=
 =?UTF-8?q?=20utils=20y=20m=C3=A1s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

178 archivos: módulo core.py actualizado + ~80 funciones nuevas con tests.
Incluye: parse_llm_json, extract_text_from_file, retry_with_backoff, circuit_breaker,
from_csv/to_csv, from_jsonl/to_jsonl, html_to_markdown, pdf_to_markdown, docx/epub/excel
converters, cache_decorator, react_loop, task_manager, template rendering, entre otros.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../functions/core/build_tree_from_headers.md |  48 ++
 python/functions/core/cache_decorator.md      |  57 ++
 python/functions/core/cache_decorator.py      |  67 ++
 python/functions/core/cache_decorator_test.py |  96 +++
 .../core/calculate_media_strategy.md          |  48 ++
 .../core/calculate_media_strategy.py          |  24 +
 .../core/calculate_media_strategy_test.py     |  23 +
 .../functions/core/calculate_page_offset.md   |  40 +
 .../functions/core/call_batch_with_retry.md   |  55 ++
 .../functions/core/call_batch_with_retry.py   |  81 +++
 .../core/call_batch_with_retry_test.py        | 102 +++
 python/functions/core/circuit_breaker.md      |  66 ++
 python/functions/core/circuit_breaker.py      | 141 ++++
 python/functions/core/circuit_breaker_test.py | 156 ++++
 python/functions/core/classify_api_error.md   |  41 ++
 python/functions/core/classify_api_error.py   |  38 +
 .../functions/core/classify_api_error_test.py |  50 ++
 python/functions/core/coerce_types.md         |  49 ++
 python/functions/core/coerce_types.py         | 135 ++++
 python/functions/core/coerce_types_test.py    |  84 +++
 .../functions/core/compute_backoff_delay.md   |  41 ++
 .../functions/core/compute_backoff_delay.py   |  26 +
 .../core/compute_backoff_delay_test.py        |  42 ++
 .../core/convert_github_to_raw_url.md         |  59 ++
 .../core/convert_github_to_raw_url.py         |  69 ++
 .../core/convert_github_to_raw_url_test.py    |  77 ++
 python/functions/core/core.py                 | 681 +++++++++++++++++-
 python/functions/core/create_node_mapping.md  |  36 +
 python/functions/core/cursor_paginate.md      |  66 ++
 python/functions/core/cursor_paginate.py      | 105 +++
 python/functions/core/cursor_paginate_test.py | 148 ++++
 .../functions/core/detect_headings_by_font.md |  37 +
 .../functions/core/detect_headings_by_font.py | 135 ++++
 python/functions/core/detect_url_type.md      |  59 ++
 python/functions/core/detect_url_type.py      | 144 ++++
 python/functions/core/detect_url_type_test.py |  89 +++
 python/functions/core/docx_to_markdown.md     |  40 +
 python/functions/core/docx_to_markdown.py     | 153 ++++
 .../functions/core/docx_to_markdown_test.py   | 129 ++++
 python/functions/core/epub_to_markdown.md     |  52 ++
 python/functions/core/epub_to_markdown.py     | 128 ++++
 .../functions/core/epub_to_markdown_test.py   | 163 +++++
 python/functions/core/estimate_token_count.md |  37 +
 python/functions/core/excel_to_markdown.md    |  58 ++
 python/functions/core/excel_to_markdown.py    | 211 ++++++
 .../functions/core/excel_to_markdown_test.py  | 142 ++++
 python/functions/core/extract_frontmatter.md  |  43 ++
 .../functions/core/extract_json_from_llm.md   |  36 +
 .../core/extract_markdown_headers.md          |  36 +
 .../functions/core/extract_pdf_bookmarks.md   |  37 +
 .../functions/core/extract_pdf_bookmarks.py   |  63 ++
 python/functions/core/extract_pdf_text.md     |  35 +
 python/functions/core/extract_pdf_text.py     |  19 +
 .../functions/core/extract_text_from_file.md  |  51 ++
 .../functions/core/extract_text_from_file.py  |  92 +++
 .../core/extract_text_from_file_test.py       |  83 +++
 python/functions/core/fetch_and_parse_url.md  |  50 ++
 python/functions/core/fetch_and_parse_url.py  |  64 ++
 python/functions/core/find_headings.md        |  38 +
 python/functions/core/flatten_tree.md         |  36 +
 python/functions/core/format_iso8601.md       |  49 ++
 python/functions/core/format_iso8601.py       |  24 +
 python/functions/core/format_iso8601_test.py  |  28 +
 python/functions/core/format_simplified.md    |  54 ++
 python/functions/core/format_simplified.py    |  25 +
 .../functions/core/format_simplified_test.py  |  30 +
 .../core/format_table_to_markdown.md          |  36 +
 .../core/format_table_to_markdown.py          |  52 ++
 .../core/format_table_to_markdown_test.py     |  63 ++
 .../functions/core/format_tree_structure.md   |  36 +
 python/functions/core/from_csv.md             |  49 ++
 python/functions/core/from_csv.py             |  83 +++
 python/functions/core/from_csv_test.py        |  40 +
 python/functions/core/from_jsonl.md           |  49 ++
 python/functions/core/from_jsonl.py           |  35 +
 python/functions/core/from_jsonl_test.py      |  25 +
 python/functions/core/generate_html_report.md |  70 ++
 python/functions/core/generate_html_report.py | 164 +++++
 .../core/generate_html_report_test.py         |  71 ++
 python/functions/core/get_leaf_nodes.md       |  36 +
 python/functions/core/get_pdf_page_tokens.md  |  40 +
 python/functions/core/get_pdf_page_tokens.py  |  47 ++
 python/functions/core/get_text_stats.md       |  32 +
 python/functions/core/get_text_stats_test.py  |  21 +
 python/functions/core/html_to_markdown.md     |  66 ++
 python/functions/core/html_to_markdown.py     | 272 +++++++
 .../functions/core/html_to_markdown_test.py   |  90 +++
 python/functions/core/is_git_repo_url.md      |  48 ++
 python/functions/core/join_by_key.md          |  47 ++
 python/functions/core/join_by_key.py          |  95 +++
 python/functions/core/join_by_key_test.py     |  72 ++
 python/functions/core/list_to_tree.md         |  41 ++
 .../functions/core/llm_acompletion_retry.md   |  40 +
 .../functions/core/llm_acompletion_retry.py   |  43 ++
 python/functions/core/llm_completion_retry.md |  43 ++
 python/functions/core/llm_completion_retry.py |  52 ++
 python/functions/core/load_translations.md    |  43 ++
 python/functions/core/load_translations.py    |  46 ++
 .../functions/core/load_translations_test.py  |  80 ++
 .../functions/core/merge_entity_attributes.md |  67 ++
 .../functions/core/merge_entity_attributes.py |  78 ++
 .../core/merge_entity_attributes_test.py      | 102 +++
 python/functions/core/next_cron_time.md       |  49 ++
 python/functions/core/next_cron_time.py       | 105 +++
 python/functions/core/next_cron_time_test.py  |  41 ++
 .../functions/core/normalize_entity_name.md   |  77 ++
 .../functions/core/normalize_entity_name.py   |  81 +++
 .../core/normalize_entity_name_test.py        |  70 ++
 python/functions/core/page_list_to_groups.md  |  37 +
 python/functions/core/parse_code_ast.md       | 115 +++
 python/functions/core/parse_code_ast.py       | 384 ++++++++++
 python/functions/core/parse_code_ast_test.py  | 220 ++++++
 python/functions/core/parse_cron_expr.md      |  48 ++
 python/functions/core/parse_cron_expr.py      | 112 +++
 python/functions/core/parse_cron_expr_test.py |  45 ++
 python/functions/core/parse_git_url.md        |  46 ++
 python/functions/core/parse_git_url_test.py   | 104 +++
 python/functions/core/parse_iso_datetime.md   |  49 ++
 python/functions/core/parse_iso_datetime.py   |  25 +
 .../functions/core/parse_iso_datetime_test.py |  41 ++
 python/functions/core/parse_llm_json.md       |  42 ++
 python/functions/core/parse_llm_json.py       |  33 +
 python/functions/core/parse_llm_json_test.py  |  25 +
 python/functions/core/parse_markdown_test.py  | 184 +++++
 python/functions/core/parse_page_range.md     |  38 +
 python/functions/core/parser_registry.md      |  65 ++
 python/functions/core/parser_registry.py      | 225 ++++++
 python/functions/core/parser_registry_test.py | 162 +++++
 python/functions/core/pdf_to_markdown.md      |  46 ++
 python/functions/core/pdf_to_markdown.py      | 121 ++++
 python/functions/core/preprocess_text.md      |  33 +
 python/functions/core/preprocess_text_test.py |  24 +
 python/functions/core/react_loop.md           |  70 ++
 python/functions/core/react_loop.py           | 133 ++++
 python/functions/core/react_loop_test.py      | 127 ++++
 python/functions/core/remove_tree_fields.md   |  36 +
 python/functions/core/render_template.md      |  67 ++
 python/functions/core/render_template.py      | 142 ++++
 python/functions/core/render_template_test.py |  57 ++
 python/functions/core/retry_async.md          |  45 ++
 python/functions/core/retry_async.py          |  52 ++
 python/functions/core/retry_sync.md           |  44 ++
 python/functions/core/retry_sync.py           |  52 ++
 python/functions/core/retry_with_backoff.md   |  50 ++
 python/functions/core/retry_with_backoff.py   | 100 +++
 .../core/retry_with_backoff_async.md          |  51 ++
 .../functions/core/retry_with_backoff_test.py | 195 +++++
 python/functions/core/sanitize_for_path.md    |  40 +
 python/functions/core/smart_split_content.md  |  41 ++
 .../functions/core/split_text_into_chunks.md  |  46 ++
 .../functions/core/split_text_into_chunks.py  |  66 ++
 .../core/split_text_into_chunks_test.py       |  64 ++
 .../core/strip_markdown_codeblock.md          |  37 +
 .../core/strip_markdown_codeblock.py          |  24 +
 .../core/strip_markdown_codeblock_test.py     |  28 +
 python/functions/core/strip_think_tags.md     |  37 +
 python/functions/core/strip_think_tags.py     |  20 +
 .../functions/core/strip_think_tags_test.py   |  30 +
 python/functions/core/t.md                    |  58 ++
 python/functions/core/t.py                    |  91 +++
 python/functions/core/t_test.py               |  83 +++
 python/functions/core/task_manager.md         |  58 ++
 python/functions/core/task_manager.py         | 176 +++++
 python/functions/core/to_csv.md               |  48 ++
 python/functions/core/to_csv.py               |  53 ++
 python/functions/core/to_csv_test.py          |  48 ++
 python/functions/core/to_jsonl.md             |  43 ++
 python/functions/core/to_jsonl.py             |  23 +
 python/functions/core/to_jsonl_test.py        |  34 +
 python/functions/core/to_pascal_case.md       |  48 ++
 python/functions/core/to_pascal_case.py       |  28 +
 python/functions/core/to_pascal_case_test.py  |  35 +
 python/functions/core/tree_to_flat_list.md    |  36 +
 python/functions/core/validate_git_ssh_uri.md |  45 ++
 python/functions/core/validate_json_schema.md |  58 ++
 python/functions/core/validate_json_schema.py | 134 ++++
 .../core/validate_json_schema_test.py         | 129 ++++
 python/functions/core/write_node_ids.md       |  37 +
 178 files changed, 13060 insertions(+), 1 deletion(-)
 create mode 100644 python/functions/core/build_tree_from_headers.md
 create mode 100644 python/functions/core/cache_decorator.md
 create mode 100644 python/functions/core/cache_decorator.py
 create mode 100644 python/functions/core/cache_decorator_test.py
 create mode 100644 python/functions/core/calculate_media_strategy.md
 create mode 100644 python/functions/core/calculate_media_strategy.py
 create mode 100644 python/functions/core/calculate_media_strategy_test.py
 create mode 100644 python/functions/core/calculate_page_offset.md
 create mode 100644 python/functions/core/call_batch_with_retry.md
 create mode 100644 python/functions/core/call_batch_with_retry.py
 create mode 100644 python/functions/core/call_batch_with_retry_test.py
 create mode 100644 python/functions/core/circuit_breaker.md
 create mode 100644 python/functions/core/circuit_breaker.py
 create mode 100644 python/functions/core/circuit_breaker_test.py
 create mode 100644 python/functions/core/classify_api_error.md
 create mode 100644 python/functions/core/classify_api_error.py
 create mode 100644 python/functions/core/classify_api_error_test.py
 create mode 100644 python/functions/core/coerce_types.md
 create mode 100644 python/functions/core/coerce_types.py
 create mode 100644 python/functions/core/coerce_types_test.py
 create mode 100644 python/functions/core/compute_backoff_delay.md
 create mode 100644 python/functions/core/compute_backoff_delay.py
 create mode 100644 python/functions/core/compute_backoff_delay_test.py
 create mode 100644 python/functions/core/convert_github_to_raw_url.md
 create mode 100644 python/functions/core/convert_github_to_raw_url.py
 create mode 100644 python/functions/core/convert_github_to_raw_url_test.py
 create mode 100644 python/functions/core/create_node_mapping.md
 create mode 100644 python/functions/core/cursor_paginate.md
 create mode 100644 python/functions/core/cursor_paginate.py
 create mode 100644 python/functions/core/cursor_paginate_test.py
 create mode 100644 python/functions/core/detect_headings_by_font.md
 create mode 100644 python/functions/core/detect_headings_by_font.py
 create mode 100644 python/functions/core/detect_url_type.md
 create mode 100644 python/functions/core/detect_url_type.py
 create mode 100644 python/functions/core/detect_url_type_test.py
 create mode 100644 python/functions/core/docx_to_markdown.md
 create mode 100644 python/functions/core/docx_to_markdown.py
 create mode 100644 python/functions/core/docx_to_markdown_test.py
 create mode 100644 python/functions/core/epub_to_markdown.md
 create mode 100644 python/functions/core/epub_to_markdown.py
 create mode 100644 python/functions/core/epub_to_markdown_test.py
 create mode 100644 python/functions/core/estimate_token_count.md
 create mode 100644 python/functions/core/excel_to_markdown.md
 create mode 100644 python/functions/core/excel_to_markdown.py
 create mode 100644 python/functions/core/excel_to_markdown_test.py
 create mode 100644 python/functions/core/extract_frontmatter.md
 create mode 100644 python/functions/core/extract_json_from_llm.md
 create mode 100644 python/functions/core/extract_markdown_headers.md
 create mode 100644 python/functions/core/extract_pdf_bookmarks.md
 create mode 100644 python/functions/core/extract_pdf_bookmarks.py
 create mode 100644 python/functions/core/extract_pdf_text.md
 create mode 100644 python/functions/core/extract_pdf_text.py
 create mode 100644 python/functions/core/extract_text_from_file.md
 create mode 100644 python/functions/core/extract_text_from_file.py
 create mode 100644 python/functions/core/extract_text_from_file_test.py
 create mode 100644 python/functions/core/fetch_and_parse_url.md
 create mode 100644 python/functions/core/fetch_and_parse_url.py
 create mode 100644 python/functions/core/find_headings.md
 create mode 100644 python/functions/core/flatten_tree.md
 create mode 100644 python/functions/core/format_iso8601.md
 create mode 100644 python/functions/core/format_iso8601.py
 create mode 100644 python/functions/core/format_iso8601_test.py
 create mode 100644 python/functions/core/format_simplified.md
 create mode 100644 python/functions/core/format_simplified.py
 create mode 100644 python/functions/core/format_simplified_test.py
 create mode 100644 python/functions/core/format_table_to_markdown.md
 create mode 100644 python/functions/core/format_table_to_markdown.py
 create mode 100644 python/functions/core/format_table_to_markdown_test.py
 create mode 100644 python/functions/core/format_tree_structure.md
 create mode 100644 python/functions/core/from_csv.md
 create mode 100644 python/functions/core/from_csv.py
 create mode 100644 python/functions/core/from_csv_test.py
 create mode 100644 python/functions/core/from_jsonl.md
 create mode 100644 python/functions/core/from_jsonl.py
 create mode 100644 python/functions/core/from_jsonl_test.py
 create mode 100644 python/functions/core/generate_html_report.md
 create mode 100644 python/functions/core/generate_html_report.py
 create mode 100644 python/functions/core/generate_html_report_test.py
 create mode 100644 python/functions/core/get_leaf_nodes.md
 create mode 100644 python/functions/core/get_pdf_page_tokens.md
 create mode 100644 python/functions/core/get_pdf_page_tokens.py
 create mode 100644 python/functions/core/get_text_stats.md
 create mode 100644 python/functions/core/get_text_stats_test.py
 create mode 100644 python/functions/core/html_to_markdown.md
 create mode 100644 python/functions/core/html_to_markdown.py
 create mode 100644 python/functions/core/html_to_markdown_test.py
 create mode 100644 python/functions/core/is_git_repo_url.md
 create mode 100644 python/functions/core/join_by_key.md
 create mode 100644 python/functions/core/join_by_key.py
 create mode 100644 python/functions/core/join_by_key_test.py
 create mode 100644 python/functions/core/list_to_tree.md
 create mode 100644 python/functions/core/llm_acompletion_retry.md
 create mode 100644 python/functions/core/llm_acompletion_retry.py
 create mode 100644 python/functions/core/llm_completion_retry.md
 create mode 100644 python/functions/core/llm_completion_retry.py
 create mode 100644 python/functions/core/load_translations.md
 create mode 100644 python/functions/core/load_translations.py
 create mode 100644 python/functions/core/load_translations_test.py
 create mode 100644 python/functions/core/merge_entity_attributes.md
 create mode 100644 python/functions/core/merge_entity_attributes.py
 create mode 100644 python/functions/core/merge_entity_attributes_test.py
 create mode 100644 python/functions/core/next_cron_time.md
 create mode 100644 python/functions/core/next_cron_time.py
 create mode 100644 python/functions/core/next_cron_time_test.py
 create mode 100644 python/functions/core/normalize_entity_name.md
 create mode 100644 python/functions/core/normalize_entity_name.py
 create mode 100644 python/functions/core/normalize_entity_name_test.py
 create mode 100644 python/functions/core/page_list_to_groups.md
 create mode 100644 python/functions/core/parse_code_ast.md
 create mode 100644 python/functions/core/parse_code_ast.py
 create mode 100644 python/functions/core/parse_code_ast_test.py
 create mode 100644 python/functions/core/parse_cron_expr.md
 create mode 100644 python/functions/core/parse_cron_expr.py
 create mode 100644 python/functions/core/parse_cron_expr_test.py
 create mode 100644 python/functions/core/parse_git_url.md
 create mode 100644 python/functions/core/parse_git_url_test.py
 create mode 100644 python/functions/core/parse_iso_datetime.md
 create mode 100644 python/functions/core/parse_iso_datetime.py
 create mode 100644 python/functions/core/parse_iso_datetime_test.py
 create mode 100644 python/functions/core/parse_llm_json.md
 create mode 100644 python/functions/core/parse_llm_json.py
 create mode 100644 python/functions/core/parse_llm_json_test.py
 create mode 100644 python/functions/core/parse_markdown_test.py
 create mode 100644 python/functions/core/parse_page_range.md
 create mode 100644 python/functions/core/parser_registry.md
 create mode 100644 python/functions/core/parser_registry.py
 create mode 100644 python/functions/core/parser_registry_test.py
 create mode 100644 python/functions/core/pdf_to_markdown.md
 create mode 100644 python/functions/core/pdf_to_markdown.py
 create mode 100644 python/functions/core/preprocess_text.md
 create mode 100644 python/functions/core/preprocess_text_test.py
 create mode 100644 python/functions/core/react_loop.md
 create mode 100644 python/functions/core/react_loop.py
 create mode 100644 python/functions/core/react_loop_test.py
 create mode 100644 python/functions/core/remove_tree_fields.md
 create mode 100644 python/functions/core/render_template.md
 create mode 100644 python/functions/core/render_template.py
 create mode 100644 python/functions/core/render_template_test.py
 create mode 100644 python/functions/core/retry_async.md
 create mode 100644 python/functions/core/retry_async.py
 create mode 100644 python/functions/core/retry_sync.md
 create mode 100644 python/functions/core/retry_sync.py
 create mode 100644 python/functions/core/retry_with_backoff.md
 create mode 100644 python/functions/core/retry_with_backoff.py
 create mode 100644 python/functions/core/retry_with_backoff_async.md
 create mode 100644 python/functions/core/retry_with_backoff_test.py
 create mode 100644 python/functions/core/sanitize_for_path.md
 create mode 100644 python/functions/core/smart_split_content.md
 create mode 100644 python/functions/core/split_text_into_chunks.md
 create mode 100644 python/functions/core/split_text_into_chunks.py
 create mode 100644 python/functions/core/split_text_into_chunks_test.py
 create mode 100644 python/functions/core/strip_markdown_codeblock.md
 create mode 100644 python/functions/core/strip_markdown_codeblock.py
 create mode 100644 python/functions/core/strip_markdown_codeblock_test.py
 create mode 100644 python/functions/core/strip_think_tags.md
 create mode 100644 python/functions/core/strip_think_tags.py
 create mode 100644 python/functions/core/strip_think_tags_test.py
 create mode 100644 python/functions/core/t.md
 create mode 100644 python/functions/core/t.py
 create mode 100644 python/functions/core/t_test.py
 create mode 100644 python/functions/core/task_manager.md
 create mode 100644 python/functions/core/task_manager.py
 create mode 100644 python/functions/core/to_csv.md
 create mode 100644 python/functions/core/to_csv.py
 create mode 100644 python/functions/core/to_csv_test.py
 create mode 100644 python/functions/core/to_jsonl.md
 create mode 100644 python/functions/core/to_jsonl.py
 create mode 100644 python/functions/core/to_jsonl_test.py
 create mode 100644 python/functions/core/to_pascal_case.md
 create mode 100644 python/functions/core/to_pascal_case.py
 create mode 100644 python/functions/core/to_pascal_case_test.py
 create mode 100644 python/functions/core/tree_to_flat_list.md
 create mode 100644 python/functions/core/validate_git_ssh_uri.md
 create mode 100644 python/functions/core/validate_json_schema.md
 create mode 100644 python/functions/core/validate_json_schema.py
 create mode 100644 python/functions/core/validate_json_schema_test.py
 create mode 100644 python/functions/core/write_node_ids.md

diff --git a/python/functions/core/build_tree_from_headers.md b/python/functions/core/build_tree_from_headers.md
new file mode 100644
index 00000000..b69f4bd5
--- /dev/null
+++ b/python/functions/core/build_tree_from_headers.md
@@ -0,0 +1,48 @@
+---
+name: build_tree_from_headers
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def build_tree_from_headers(node_list: list[dict]) -> list[dict]"
+description: "Construye arbol jerarquico anidado desde lista plana de headers markdown con niveles (h1>h2>h3)."
+tags: [tree, markdown, headers, hierarchy]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/page_index_md.py"
+---
+
+## Ejemplo
+
+```python
+headers = [
+    {"title": "Intro", "level": 1, "line_num": 1},
+    {"title": "Background", "level": 2, "line_num": 5},
+    {"title": "Details", "level": 3, "line_num": 10},
+    {"title": "Methods", "level": 1, "line_num": 20},
+]
+tree = build_tree_from_headers(headers)
+# [
+#   {"title": "Intro", "node_id": "0001", "nodes": [
+#     {"title": "Background", "node_id": "0002", "nodes": [
+#       {"title": "Details", "node_id": "0003"}
+#     ]}
+#   ]},
+#   {"title": "Methods", "node_id": "0004"}
+# ]
+```
+
+## Notas
+
+Funcion pura. Asigna node_id secuencial (0001...) automaticamente. Usa stack para resolver jerarquia por nivel de header.
diff --git a/python/functions/core/cache_decorator.md b/python/functions/core/cache_decorator.md
new file mode 100644
index 00000000..1b1dc37d
--- /dev/null
+++ b/python/functions/core/cache_decorator.md
@@ -0,0 +1,57 @@
+---
+name: cache_decorator
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def cache_decorator(store: Any, ttl: float = 0, key_fn: Callable | None = None)"
+description: "Decorator que cachea el resultado de una funcion en cualquier store persistente compatible (CacheStore o FileCache). La key se genera hasheando (func.__name__, args, sorted(kwargs)) con SHA-256. Soporta funciones sincronas y asincronas."
+tags: [cache, decorator, memoize, persistence, async, functional]
+uses_functions: ["cache_to_sqlite_py_infra", "cache_to_file_py_infra"]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["asyncio", "functools", "hashlib", "json"]
+tested: true
+tests:
+  - "Funcion llamada una vez, segunda vez desde cache"
+  - "TTL expirado → llama de nuevo"
+  - "key_fn custom"
+  - "Argumentos distintos → keys distintas"
+  - "Funciona con async"
+test_file_path: "python/functions/core/cache_decorator_test.py"
+file_path: "python/functions/core/cache_decorator.py"
+---
+
+## Ejemplo
+
+```python
+from infra.cache_to_sqlite import cache_to_sqlite
+from core.cache_decorator import cache_decorator
+
+store = cache_to_sqlite("cache.db", namespace="llm")
+
+@cache_decorator(store, ttl=3600)
+def call_llm(prompt: str) -> str:
+    # llamada costosa a LLM
+    return client.complete(prompt)
+
+result = call_llm("explain X")  # primera vez: llama LLM
+result = call_llm("explain X")  # segunda vez: desde cache
+
+# Con key_fn custom
+@cache_decorator(store, ttl=600, key_fn=lambda fn, args, kw: args[0])
+def fetch_user(user_id: str) -> dict:
+    return api.get_user(user_id)
+
+# Con async
+@cache_decorator(store, ttl=3600)
+async def async_call(prompt: str) -> str:
+    return await async_client.complete(prompt)
+```
+
+## Notas
+
+El store debe implementar `get(key: str) -> Any | None` y `set(key: str, value: Any, ttl: float) -> None`. Detecta automaticamente funciones asincronas con `asyncio.iscoroutinefunction`. La key por defecto usa `json.dumps(..., default=str)` para serializar argumentos no serializables. Si `store.get()` retorna `None`, siempre se ejecuta la funcion (no distingue entre "no en cache" y "valor None almacenado"); para valores que pueden ser None usar `get_or_set` directamente.
diff --git a/python/functions/core/cache_decorator.py b/python/functions/core/cache_decorator.py
new file mode 100644
index 00000000..619979e4
--- /dev/null
+++ b/python/functions/core/cache_decorator.py
@@ -0,0 +1,67 @@
+"""Decorator que cachea el resultado de una funcion en un store persistente."""
+
+import asyncio
+import functools
+import hashlib
+import json
+from typing import Any, Callable
+
+
+def _default_key(func: Callable, args: tuple, kwargs: dict) -> str:
+    """Build a cache key from the function name and its call arguments."""
+    payload = json.dumps((func.__name__, args, sorted(kwargs.items())), default=str)  # default=str stringifies values json cannot encode
+    return hashlib.sha256(payload.encode("utf-8")).hexdigest()  # hex digest of the UTF-8 encoded JSON payload
+
+
+def cache_decorator(store: Any, ttl: float = 0, key_fn: Callable | None = None):
+    """Return a decorator that caches results in a persistent store.
+
+    Args:
+        store: Any object exposing get(key) and set(key, value, ttl).
+               Compatible with CacheStore (cache_to_sqlite) and FileCache (cache_to_file).
+        ttl: Time to live in seconds, forwarded to store.set. 0 = no expiration.
+        key_fn: Optional function that builds the key. Receives (func, args, kwargs).
+                If None, SHA-256 of (func.__name__, args, sorted(kwargs)) is used.
+
+    Returns:
+        Decorator applicable to synchronous or asynchronous functions.
+
+    Example::
+
+        store = cache_to_sqlite("cache.db")
+
+        @cache_decorator(store, ttl=3600)
+        def call_llm(prompt: str) -> str:
+            ...  # expensive call
+
+        result = call_llm("explain X")  # first time: runs the function
+        result = call_llm("explain X")  # second time: served from the cache
+    """
+
+    def decorator(func: Callable) -> Callable:
+        if asyncio.iscoroutinefunction(func):  # coroutine functions get a wrapper that awaits them
+            @functools.wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                make_key = key_fn or _default_key
+                key = make_key(func, args, kwargs)
+                cached = store.get(key)  # the store is called synchronously, even in the async path
+                if cached is not None:  # a None from the store is always treated as a miss
+                    return cached
+                result = await func(*args, **kwargs)
+                store.set(key, result, ttl)
+                return result
+            return async_wrapper
+        else:
+            @functools.wraps(func)
+            def sync_wrapper(*args, **kwargs):
+                make_key = key_fn or _default_key
+                key = make_key(func, args, kwargs)
+                cached = store.get(key)
+                if cached is not None:  # a None from the store is always treated as a miss
+                    return cached
+                result = func(*args, **kwargs)
+                store.set(key, result, ttl)
+                return result
+            return sync_wrapper
+
+    return decorator
diff --git a/python/functions/core/cache_decorator_test.py b/python/functions/core/cache_decorator_test.py
new file mode 100644
index 00000000..4f8a3b57
--- /dev/null
+++ b/python/functions/core/cache_decorator_test.py
@@ -0,0 +1,96 @@
+"""Tests para cache_decorator."""
+
+import asyncio
+import sys
+import os
+import tempfile
+import time
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "infra"))
+
+from cache_decorator import cache_decorator
+from cache_to_sqlite import cache_to_sqlite
+
+
+@pytest.fixture
+def store(tmp_path):
+    return cache_to_sqlite(str(tmp_path / "test.db"))  # store built by cache_to_sqlite inside pytest's tmp_path
+
+
+def test_funcion_llamada_una_vez_segunda_vez_desde_cache(store):
+    calls = []  # records every real execution of compute
+
+    @cache_decorator(store, ttl=60)
+    def compute(x: int) -> int:
+        calls.append(x)
+        return x * 10
+
+    assert compute(5) == 50
+    assert compute(5) == 50
+    assert len(calls) == 1  # the second call must not execute compute again
+
+
+def test_ttl_expirado_llama_de_nuevo(store):
+    calls = []  # records every real execution of work
+
+    @cache_decorator(store, ttl=0.05)
+    def work(n: int) -> int:
+        calls.append(n)
+        return n + 1
+
+    work(3)
+    time.sleep(0.1)  # wait longer than the 0.05 s TTL
+    work(3)
+    assert len(calls) == 2  # the expired entry forces a second execution
+
+
+def test_key_fn_custom(store):
+    calls = []  # records every real execution of fn
+
+    def my_key_fn(func, args, kwargs):
+        return f"custom:{args[0]}"  # key depends only on the first positional argument
+
+    @cache_decorator(store, ttl=60, key_fn=my_key_fn)
+    def fn(x: int) -> str:
+        calls.append(x)
+        return f"result_{x}"
+
+    fn(7)
+    fn(7)
+    assert len(calls) == 1  # same custom key, so the second call is a cache hit
+
+
+def test_argumentos_distintos_keys_distintas(store):
+    calls = []  # records every real execution of fn
+
+    @cache_decorator(store, ttl=60)
+    def fn(x: int) -> int:
+        calls.append(x)
+        return x * 2
+
+    fn(1)
+    fn(2)
+    fn(1)  # repeats the first argument; expected to be a cache hit
+    assert len(calls) == 2  # one execution per distinct argument
+
+
+def test_funciona_con_async(store):
+    calls = []  # records every real execution of async_fn
+
+    @cache_decorator(store, ttl=60)
+    async def async_fn(x: int) -> int:
+        calls.append(x)
+        return x + 100
+
+    async def run():
+        r1 = await async_fn(5)
+        r2 = await async_fn(5)
+        return r1, r2
+
+    r1, r2 = asyncio.run(run())  # drive both awaits on a fresh event loop
+    assert r1 == 105
+    assert r2 == 105
+    assert len(calls) == 1  # the second await was served from the cache
diff --git a/python/functions/core/calculate_media_strategy.md b/python/functions/core/calculate_media_strategy.md
new file mode 100644
index 00000000..d67cfab2
--- /dev/null
+++ b/python/functions/core/calculate_media_strategy.md
@@ -0,0 +1,48 @@
+---
+name: calculate_media_strategy
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "calculate_media_strategy(image_count: int, line_count: int) -> str"
+description: "Determina la estrategia optima de procesamiento de medios para un documento basado en la proporcion de imagenes vs texto. Retorna full_page_vlm, extract o text_only."
+tags: [media, strategy, document, vision, vlm, images, classification]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "0 imagenes text_only"
+  - "2 imagenes 100 lineas extract"
+  - "10 imagenes 20 lineas full_page_vlm"
+  - "5 imagenes 100 lineas full_page_vlm"
+  - "0 lineas division por cero evitada"
+test_file_path: "python/functions/core/calculate_media_strategy_test.py"
+file_path: "python/functions/core/calculate_media_strategy.py"
+---
+
+## Ejemplo
+
+```python
+calculate_media_strategy(0, 50)    # "text_only"
+calculate_media_strategy(2, 100)   # "extract"  (ratio 0.02, pocas imagenes)
+calculate_media_strategy(10, 20)   # "full_page_vlm"  (ratio 0.5 > 0.3)
+calculate_media_strategy(5, 100)   # "full_page_vlm"  (>= 5 imagenes)
+calculate_media_strategy(3, 0)     # "text_only"  (sin texto, sin contexto)
+```
+
+## Notas
+
+Logica de clasificacion en tres niveles:
+
+1. `full_page_vlm` — documento dominado por imagenes: ratio imagen/linea > 0.3 o al menos 5 imagenes. Se usa un vision-language model sobre la pagina completa.
+2. `extract` — pocas imagenes en documento con texto: extraer y procesar imagenes individualmente.
+3. `text_only` — sin imagenes o sin lineas de texto: procesar solo el texto.
+
+El guard `line_count > 0` evita la division por cero y trata documentos sin lineas como `text_only` independientemente del conteo de imagenes, ya que sin texto no hay contexto suficiente para clasificar como `extract`.
+
+Funcion pura, sin dependencias externas. Reimplementada conceptualmente a partir de la logica de clasificacion de medios de OpenViking (AGPL-3.0).
diff --git a/python/functions/core/calculate_media_strategy.py b/python/functions/core/calculate_media_strategy.py
new file mode 100644
index 00000000..0ed0d642
--- /dev/null
+++ b/python/functions/core/calculate_media_strategy.py
@@ -0,0 +1,24 @@
+"""Determina la estrategia optima de procesamiento de medios para un documento."""
+
+
+def calculate_media_strategy(image_count: int, line_count: int) -> str:
+    """Pick the media-processing strategy for a document.
+
+    The choice depends on how many images the document holds relative to
+    its lines of text:
+    - full_page_vlm: image-dominated document, use a vision-language model
+    - extract: few images among text, extract and process them one by one
+    - text_only: no images, or no text lines to relate the images to
+
+    Args:
+        image_count: number of images in the document.
+        line_count: number of text lines in the document.
+
+    Returns:
+        "full_page_vlm", "extract" or "text_only".
+    """
+    if line_count <= 0 or image_count <= 0:
+        return "text_only"
+    if image_count >= 5 or image_count / line_count > 0.3:
+        return "full_page_vlm"
+    return "extract"
diff --git a/python/functions/core/calculate_media_strategy_test.py b/python/functions/core/calculate_media_strategy_test.py
new file mode 100644
index 00000000..73ff7d18
--- /dev/null
+++ b/python/functions/core/calculate_media_strategy_test.py
@@ -0,0 +1,23 @@
+"""Tests para calculate_media_strategy."""
+
+from calculate_media_strategy import calculate_media_strategy
+
+
+def test_0_imagenes_text_only():
+    assert calculate_media_strategy(0, 50) == "text_only"
+
+
+def test_2_imagenes_100_lineas_extract():
+    assert calculate_media_strategy(2, 100) == "extract"
+
+
+def test_10_imagenes_20_lineas_full_page_vlm():
+    assert calculate_media_strategy(10, 20) == "full_page_vlm"
+
+
+def test_5_imagenes_100_lineas_full_page_vlm():
+    assert calculate_media_strategy(5, 100) == "full_page_vlm"
+
+
+def test_0_lineas_division_por_cero_evitada():
+    assert calculate_media_strategy(3, 0) == "text_only"
diff --git a/python/functions/core/calculate_page_offset.md b/python/functions/core/calculate_page_offset.md
new file mode 100644
index 00000000..072a03e2
--- /dev/null
+++ b/python/functions/core/calculate_page_offset.md
@@ -0,0 +1,40 @@
+---
+name: calculate_page_offset
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def calculate_page_offset(pairs: list[dict]) -> int"
+description: "Calcula offset entre numeros de pagina logicos y fisicos usando pares de referencia (moda de diferencias)."
+tags: [pagination, offset, calculation]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/page_index.py"
+---
+
+## Ejemplo
+
+```python
+pairs = [
+    {"page": 1, "physical_index": 5},
+    {"page": 2, "physical_index": 6},
+    {"page": 10, "physical_index": 14},
+]
+calculate_page_offset(pairs)
+# 4 (la moda de las diferencias physical_index - page)
+```
+
+## Notas
+
+Funcion pura. Cada par necesita campos 'page' (numero logico) y 'physical_index' (indice fisico). Retorna la diferencia mas frecuente (moda). Retorna 0 si no hay pares validos.
diff --git a/python/functions/core/call_batch_with_retry.md b/python/functions/core/call_batch_with_retry.md
new file mode 100644
index 00000000..effa58d3
--- /dev/null
+++ b/python/functions/core/call_batch_with_retry.md
@@ -0,0 +1,55 @@
+---
+name: call_batch_with_retry
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def call_batch_with_retry(items: list[T], process_func: Callable[[T], R], max_retries: int = 3, initial_delay: float = 1.0, max_delay: float = 30.0, backoff_factor: float = 2.0, exceptions: tuple[type[Exception], ...] = (Exception,), continue_on_failure: bool = True) -> tuple[list[R], list[dict]]"
+description: "Procesa una lista de items con retry individual por item y exponential backoff. Los fallos individuales no bloquean el resto del batch. Retorna (results, failures) donde failures contiene index, item y error de cada item que agoto sus reintentos."
+tags: [retry, batch, backoff, resilience, error-handling, core]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["time", "random", "typing.Callable", "typing.TypeVar"]
+tested: true
+tests:
+  - "todos los items exito"
+  - "item falla permanentemente, continue True"
+  - "item falla, abort continue False"
+  - "item falla luego exito retry funciona"
+  - "failures contiene index correcto"
+test_file_path: "python/functions/core/call_batch_with_retry_test.py"
+file_path: "python/functions/core/call_batch_with_retry.py"
+---
+
+## Ejemplo
+
+```python
+results, failures = call_batch_with_retry(
+    items=["url1", "url2", "url3"],
+    process_func=fetch_url,
+    max_retries=3,
+    initial_delay=1.0,
+    max_delay=30.0,
+    backoff_factor=2.0,
+    exceptions=(ConnectionError, TimeoutError),
+    continue_on_failure=True,
+)
+
+for r in results:
+    print("OK:", r)
+
+for f in failures:
+    print(f"FAIL index={f['index']} item={f['item']} error={f['error']}")
+```
+
+## Notas
+
+Diferencia con `retry_sync_py_core`: ese reintenta una sola llamada. Este maneja listas completas donde cada item se reintenta independientemente — los fallos individuales quedan registrados en `failures` sin interrumpir el procesamiento del batch (cuando `continue_on_failure=True`).
+
+El backoff usa la formula `min(initial_delay * backoff_factor^attempt, max_delay)` con jitter de hasta el 10% del delay calculado para evitar thundering herd. El primer intento es siempre inmediato — el delay se aplica antes del primer retry (attempt=0).
+
+Cuando `continue_on_failure=False`, el primer item que agota sus reintentos re-lanza la excepcion inmediatamente, abortando el batch.
diff --git a/python/functions/core/call_batch_with_retry.py b/python/functions/core/call_batch_with_retry.py
new file mode 100644
index 00000000..7dd01d8f
--- /dev/null
+++ b/python/functions/core/call_batch_with_retry.py
@@ -0,0 +1,81 @@
+"""Process a batch of items with per-item exponential backoff retry."""
+
+import time
+import random
+from typing import Callable, TypeVar
+
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+def call_batch_with_retry(
+    items: list,
+    process_func: Callable,
+    max_retries: int = 3,
+    initial_delay: float = 1.0,
+    max_delay: float = 30.0,
+    backoff_factor: float = 2.0,
+    exceptions: tuple = (Exception,),
+    continue_on_failure: bool = True,
+) -> tuple:
+    """Process a list of items with independent per-item retry and exponential backoff.
+
+    Each item is passed to process_func. If the call raises one of the given
+    exceptions it is retried up to max_retries times with exponential backoff;
+    an item that exhausts its retries is recorded as a failure.
+
+    Args:
+        items: List of items to process.
+        process_func: Callable that takes a single item and returns a result.
+        max_retries: Maximum number of retry attempts per item after first failure.
+            Negative values are treated as 0, so every item is tried at least once.
+        initial_delay: Initial delay in seconds before the first retry.
+        max_delay: Maximum delay cap in seconds between retries.
+        backoff_factor: Multiplier applied to delay on each successive retry.
+        exceptions: Tuple of exception types to catch and retry on.
+        continue_on_failure: If True, continue processing remaining items when an
+            item exhausts all retries. If False, re-raise the exception immediately.
+
+    Returns:
+        A tuple (results, failures): results lists the successful return values of
+        process_func; failures lists one dict with keys "index", "item" and
+        "error" (the exception message) per item that failed after all retries.
+
+    Raises:
+        Exception: The last exception for a failed item when continue_on_failure
+            is False.
+    """
+    # A negative max_retries would skip the attempt loop entirely and leave no
+    # exception to record or re-raise, so clamp it to zero retries.
+    max_retries = max(max_retries, 0)
+    results = []
+    failures = []
+
+    for index, item in enumerate(items):
+        # The first attempt is immediate; backoff applies only between retries.
+        for attempt in range(max_retries + 1):
+            try:
+                results.append(process_func(item))
+                break
+            except exceptions as exc:
+                last_exc = exc
+                if attempt < max_retries:
+                    delay = min(
+                        initial_delay * (backoff_factor ** attempt),
+                        max_delay,
+                    )
+                    # Add small jitter (up to 10% of delay) to avoid thundering herd
+                    delay += random.uniform(0, delay * 0.1)
+                    time.sleep(delay)
+        else:
+            # for/else: reached only when no attempt hit `break`, i.e. the item
+            # failed on every try and last_exc holds the final exception.
+            if not continue_on_failure:
+                raise last_exc
+            failures.append({
+                "index": index,
+                "item": item,
+                "error": str(last_exc),
+            })
+
+    return results, failures
diff --git a/python/functions/core/call_batch_with_retry_test.py b/python/functions/core/call_batch_with_retry_test.py
new file mode 100644
index 00000000..54d38291
--- /dev/null
+++ b/python/functions/core/call_batch_with_retry_test.py
@@ -0,0 +1,102 @@
+"""Tests para call_batch_with_retry."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from call_batch_with_retry import call_batch_with_retry
+
+
+def test_todos_los_items_exito():
+    results, failures = call_batch_with_retry(
+        items=[1, 2, 3],
+        process_func=lambda x: x * 2,
+        max_retries=3,
+    )
+    assert results == [2, 4, 6]
+    assert failures == []
+
+
+def test_item_falla_permanentemente_continue_true():
+    def process(x):
+        if x == 2:
+            raise ValueError("fallo permanente")
+        return x * 10
+
+    results, failures = call_batch_with_retry(
+        items=[1, 2, 3],
+        process_func=process,
+        max_retries=2,
+        initial_delay=0.0,
+        continue_on_failure=True,
+    )
+    assert results == [10, 30]
+    assert len(failures) == 1
+    assert failures[0]["index"] == 1
+    assert failures[0]["item"] == 2
+    assert "fallo permanente" in failures[0]["error"]
+
+
+def test_item_falla_abort_continue_false():
+    call_count = {"n": 0}
+
+    def process(x):
+        call_count["n"] += 1
+        if x == 2:
+            raise RuntimeError("error fatal")
+        return x
+
+    try:
+        call_batch_with_retry(
+            items=[1, 2, 3],
+            process_func=process,
+            max_retries=1,
+            initial_delay=0.0,
+            continue_on_failure=False,
+        )
+        assert False, "Deberia haber lanzado excepcion"
+    except RuntimeError as e:
+        assert "error fatal" in str(e)
+    # Item 3 must never be processed: 1 call for item 1 + 2 attempts for item 2.
+    assert call_count["n"] == 3
+
+
+def test_item_falla_luego_exito_retry_funciona():
+    attempt_counts = {}
+
+    def process(x):
+        attempt_counts[x] = attempt_counts.get(x, 0) + 1
+        # item 5 falla las primeras 2 veces, exito en la tercera
+        if x == 5 and attempt_counts[x] < 3:
+            raise ValueError("fallo temporal")
+        return x * 2
+
+    results, failures = call_batch_with_retry(
+        items=[1, 5, 9],
+        process_func=process,
+        max_retries=3,
+        initial_delay=0.0,
+        continue_on_failure=True,
+    )
+    assert results == [2, 10, 18]
+    assert failures == []
+    assert attempt_counts[5] == 3
+
+
+def test_failures_contiene_index_correcto():
+    def process(x):
+        if x in (0, 2, 4):
+            raise ValueError(f"fallo en {x}")
+        return x
+
+    results, failures = call_batch_with_retry(
+        items=[0, 1, 2, 3, 4],
+        process_func=process,
+        max_retries=0,
+        initial_delay=0.0,
+        continue_on_failure=True,
+    )
+    assert results == [1, 3]
+    assert [f["index"] for f in failures] == [0, 2, 4]
+    assert [f["item"] for f in failures] == [0, 2, 4]
diff --git a/python/functions/core/circuit_breaker.md b/python/functions/core/circuit_breaker.md
new file mode 100644
index 00000000..6d36d5a7
--- /dev/null
+++ b/python/functions/core/circuit_breaker.md
@@ -0,0 +1,66 @@
+---
+name: circuit_breaker
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "class CircuitBreaker:\n  def __init__(self, failure_threshold: int = 5, reset_timeout: float = 300.0): ...\n  def check(self) -> None: ...\n  def record_success(self) -> None: ...\n  def record_failure(self, error: Exception) -> None: ...\n  @property\n  def retry_after(self) -> float: ..."
+description: "Patron circuit breaker thread-safe para proteger llamadas a APIs externas. Tres estados: CLOSED (normal), OPEN (bloqueando), HALF_OPEN (permitiendo 1 request de prueba). Integra con classify_api_error para distinguir errores permanentes de transitorios."
+tags: [circuit-breaker, resilience, api, retry, error-handling, thread-safe]
+uses_functions: [classify_api_error_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [threading, time, enum]
+tested: true
+tests:
+  - "Transicion CLOSED → OPEN despues de N fallos"
+  - "Transicion OPEN → HALF_OPEN despues de timeout"
+  - "Transicion HALF_OPEN → CLOSED en exito"
+  - "Transicion HALF_OPEN → OPEN en fallo"
+  - "Error permanente abre inmediatamente"
+  - "Thread safety (concurrencia)"
+  - "retry_after retorna 0 cuando no esta OPEN"
+test_file_path: "python/functions/core/circuit_breaker_test.py"
+file_path: "python/functions/core/circuit_breaker.py"
+---
+
+## Ejemplo
+
+```python
+from circuit_breaker import CircuitBreaker, CircuitBreakerOpen
+
+cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)
+
+def call_api() -> dict:
+    cb.check()  # raises CircuitBreakerOpen if circuit is open
+    try:
+        result = requests.get("https://api.example.com/data").json()
+        cb.record_success()
+        return result
+    except Exception as exc:
+        cb.record_failure(exc)
+        raise
+
+# After 3 consecutive failures the circuit opens:
+# CircuitBreakerOpen: Circuit breaker is open. Retry after 30.0s
+try:
+    cb.check()
+except CircuitBreakerOpen as e:
+    print(f"Circuit open, retry in {e.retry_after}s")
+
+# retry_after property (capped at 30s):
+print(cb.retry_after)  # e.g. 28.4
+```
+
+## Notas
+
+- **CLOSED**: Requests pasan normalmente. Tras `failure_threshold` fallos consecutivos transiciona a OPEN.
+- **OPEN**: Requests bloqueados con `CircuitBreakerOpen`. Tras `reset_timeout` segundos transiciona a HALF_OPEN.
+- **HALF_OPEN**: Permite 1 request de prueba. Exito → CLOSED. Fallo → OPEN.
+- Errores permanentes (400, 401, 403) abren el circuito inmediatamente sin esperar al umbral.
+- `retry_after` devuelve 0.0 cuando el estado no es OPEN; en OPEN devuelve el tiempo restante, cap 30s.
+- Thread-safe via `threading.Lock` protegiendo todo el estado interno.
+- La dependencia en `classify_api_error` es opcional: si no se puede importar, hay fallback de texto.
diff --git a/python/functions/core/circuit_breaker.py b/python/functions/core/circuit_breaker.py
new file mode 100644
index 00000000..b68ccfba
--- /dev/null
+++ b/python/functions/core/circuit_breaker.py
@@ -0,0 +1,141 @@
+"""Circuit breaker pattern for protecting external API calls."""
+
+import threading
+import time
+from enum import Enum
+
+
+class CircuitBreakerState(Enum):
+    CLOSED = "closed"
+    OPEN = "open"
+    HALF_OPEN = "half_open"
+
+
+class CircuitBreakerOpen(Exception):
+    """Signals that the breaker is open and the request was rejected."""
+
+    def __init__(self, retry_after: float) -> None:
+        super().__init__(f"Circuit breaker is open. Retry after {retry_after:.1f}s")
+        self.retry_after = retry_after
+
+
+def _is_permanent_error(error: Exception) -> bool:
+    """Return True if the error is permanent (should open circuit immediately)."""
+    try:
+        from classify_api_error import classify_api_error
+
+        return classify_api_error(error) == "permanent"
+    except ImportError:
+        # Fallback: scan the error text; codes must be whole numbers ("14012" is not 401).
+        import re
+
+        text = str(error) if error.__cause__ is None else f"{error} {error.__cause__}"
+        codes = {"400", "401", "403"} & set(re.findall(r"\d+", text))
+        return bool(codes) or "Forbidden" in text or "Unauthorized" in text
+
+
+class CircuitBreaker:
+    """Thread-safe circuit breaker for protecting external API calls.
+
+    Implements three states:
+    - CLOSED: requests pass through normally.
+    - OPEN: requests are blocked with CircuitBreakerOpen.
+    - HALF_OPEN: one probe request is allowed through; the rest are blocked.
+
+    Args:
+        failure_threshold: Consecutive failures before opening. Default 5.
+        reset_timeout: Seconds to wait in OPEN before trying HALF_OPEN. Default 300.0.
+    """
+
+    def __init__(
+        self,
+        failure_threshold: int = 5,
+        reset_timeout: float = 300.0,
+    ) -> None:
+        self._failure_threshold = failure_threshold
+        self._reset_timeout = reset_timeout
+        self._lock = threading.Lock()
+
+        self._state = CircuitBreakerState.CLOSED
+        self._failure_count = 0
+        self._opened_at: float | None = None
+        # Monotonic time at which the HALF_OPEN probe slot was handed out.
+        self._probe_started_at: float | None = None
+
+    # ---- Public interface ----
+
+    def check(self) -> None:
+        """Check whether a request is allowed through.
+
+        In HALF_OPEN only the caller that triggered the transition may pass;
+        the slot is handed out again if no outcome is recorded within
+        reset_timeout, so a probe that never reports cannot wedge the breaker.
+
+        Raises:
+            CircuitBreakerOpen: If the circuit is open and reset_timeout has not
+                elapsed yet, or a HALF_OPEN probe is already in flight.
+        """
+        with self._lock:
+            if self._state is CircuitBreakerState.CLOSED:
+                return
+
+            now = time.monotonic()
+            if self._state is CircuitBreakerState.OPEN:
+                since = self._opened_at
+            else:  # HALF_OPEN: measure from when the probe slot was taken
+                since = self._probe_started_at
+            elapsed = now - since  # type: ignore[operator]
+            if elapsed < self._reset_timeout:
+                raise CircuitBreakerOpen(min(self._reset_timeout - elapsed, 30.0))
+            # Timeout elapsed: this caller becomes the (new) probe.
+            self._state = CircuitBreakerState.HALF_OPEN
+            self._probe_started_at = now
+
+    def record_success(self) -> None:
+        """Record a successful request. Resets the breaker to CLOSED."""
+        with self._lock:
+            self._state = CircuitBreakerState.CLOSED
+            self._failure_count = 0
+            self._opened_at = None
+            self._probe_started_at = None
+
+    def record_failure(self, error: Exception) -> None:
+        """Record a failed request.
+
+        If the error is permanent (e.g. 401/403) or the failed request was
+        the HALF_OPEN probe, opens immediately. Otherwise increments the
+        failure counter and opens once it reaches failure_threshold.
+
+        Args:
+            error: The exception that was raised.
+        """
+        with self._lock:
+            if _is_permanent_error(error) or self._state is CircuitBreakerState.HALF_OPEN:
+                self._trip()
+                return
+
+            self._failure_count += 1
+            if self._failure_count >= self._failure_threshold:
+                self._trip()
+
+    @property
+    def retry_after(self) -> float:
+        """Seconds until the circuit transitions to HALF_OPEN.
+
+        Returns 0.0 when not in OPEN state, capped at 30 seconds.
+        """
+        with self._lock:
+            if self._state is not CircuitBreakerState.OPEN:
+                return 0.0
+            elapsed = time.monotonic() - self._opened_at  # type: ignore[operator]
+            remaining = self._reset_timeout - elapsed
+            return min(max(remaining, 0.0), 30.0)
+
+    # ---- Internal helpers ----
+
+    def _trip(self) -> None:
+        """Open the circuit (must be called with _lock held)."""
+        self._state = CircuitBreakerState.OPEN
+        self._failure_count = 0
+        self._opened_at = time.monotonic()
+        self._probe_started_at = None
diff --git a/python/functions/core/circuit_breaker_test.py b/python/functions/core/circuit_breaker_test.py
new file mode 100644
index 00000000..e5f5650c
--- /dev/null
+++ b/python/functions/core/circuit_breaker_test.py
@@ -0,0 +1,156 @@
+"""Tests para circuit_breaker."""
+
+import sys
+import os
+import threading
+import time
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from circuit_breaker import CircuitBreaker, CircuitBreakerOpen, CircuitBreakerState
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _transient_error() -> Exception:
+    return Exception("HTTP 503 Service Unavailable")
+
+
+def _permanent_error() -> Exception:
+    return Exception("HTTP 401 Unauthorized")
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_closed_to_open_after_n_failures() -> None:
+    """Transicion CLOSED → OPEN despues de N fallos"""
+    cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)
+
+    cb.check()  # Should not raise
+
+    cb.record_failure(_transient_error())
+    cb.record_failure(_transient_error())
+    assert cb._state is CircuitBreakerState.CLOSED  # Still closed after 2
+
+    cb.record_failure(_transient_error())
+    assert cb._state is CircuitBreakerState.OPEN
+
+    try:
+        cb.check()
+        assert False, "Should have raised CircuitBreakerOpen"
+    except CircuitBreakerOpen:
+        pass
+
+    print("PASS: Transicion CLOSED → OPEN despues de N fallos")
+
+
+def test_open_to_half_open_after_timeout() -> None:
+    """Transicion OPEN → HALF_OPEN despues de timeout"""
+    cb = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
+    cb.record_failure(_transient_error())
+    assert cb._state is CircuitBreakerState.OPEN
+
+    time.sleep(0.1)
+
+    cb.check()  # Should not raise — transitions to HALF_OPEN
+    assert cb._state is CircuitBreakerState.HALF_OPEN
+
+    print("PASS: Transicion OPEN → HALF_OPEN despues de timeout")
+
+
+def test_half_open_to_closed_on_success() -> None:
+    """Transicion HALF_OPEN → CLOSED en exito"""
+    cb = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
+    cb.record_failure(_transient_error())
+    time.sleep(0.1)
+    cb.check()  # enters HALF_OPEN
+    assert cb._state is CircuitBreakerState.HALF_OPEN
+
+    cb.record_success()
+    assert cb._state is CircuitBreakerState.CLOSED
+
+    cb.check()  # Should not raise
+
+    print("PASS: Transicion HALF_OPEN → CLOSED en exito")
+
+
+def test_half_open_to_open_on_failure() -> None:
+    """Transicion HALF_OPEN → OPEN en fallo"""
+    cb = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
+    cb.record_failure(_transient_error())
+    time.sleep(0.1)
+    cb.check()  # enters HALF_OPEN
+    assert cb._state is CircuitBreakerState.HALF_OPEN
+
+    cb.record_failure(_transient_error())
+    assert cb._state is CircuitBreakerState.OPEN
+
+    print("PASS: Transicion HALF_OPEN → OPEN en fallo")
+
+
+def test_permanent_error_opens_immediately() -> None:
+    """Error permanente abre inmediatamente"""
+    cb = CircuitBreaker(failure_threshold=10, reset_timeout=60.0)
+    assert cb._state is CircuitBreakerState.CLOSED
+
+    cb.record_failure(_permanent_error())
+    assert cb._state is CircuitBreakerState.OPEN
+
+    print("PASS: Error permanente abre inmediatamente")
+
+
+def test_thread_safety() -> None:
+    """Thread safety (concurrencia)"""
+    cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
+    errors: list[Exception] = []
+
+    def worker() -> None:
+        try:
+            for _ in range(10):
+                cb.check()
+                cb.record_failure(_transient_error())
+        except CircuitBreakerOpen:
+            pass
+        except Exception as exc:
+            errors.append(exc)
+
+    threads = [threading.Thread(target=worker) for _ in range(20)]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()
+
+    assert not errors, f"Thread errors: {errors}"
+    # No success is ever recorded and the 60s reset_timeout cannot elapse here: must be OPEN.
+    assert cb._state is CircuitBreakerState.OPEN
+
+    print("PASS: Thread safety (concurrencia)")
+
+
+def test_retry_after_returns_zero_when_not_open() -> None:
+    """retry_after retorna 0 cuando no esta OPEN"""
+    cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
+    assert cb.retry_after == 0.0
+
+    cb.record_failure(_transient_error())
+    # Still CLOSED (threshold not reached)
+    assert cb.retry_after == 0.0
+
+    print("PASS: retry_after retorna 0 cuando no esta OPEN")
+
+
+if __name__ == "__main__":
+    test_closed_to_open_after_n_failures()
+    test_open_to_half_open_after_timeout()
+    test_half_open_to_closed_on_success()
+    test_half_open_to_open_on_failure()
+    test_permanent_error_opens_immediately()
+    test_thread_safety()
+    test_retry_after_returns_zero_when_not_open()
+    print("\nAll tests passed.")
diff --git a/python/functions/core/classify_api_error.md b/python/functions/core/classify_api_error.md
new file mode 100644
index 00000000..b9e25891
--- /dev/null
+++ b/python/functions/core/classify_api_error.md
@@ -0,0 +1,41 @@
+---
+name: classify_api_error
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def classify_api_error(error: Exception) -> str"
+description: "Clasifica un error de API como permanente (no reintentar), transitorio (reintentar) o desconocido. Permanente tiene prioridad sobre transitorio."
+tags: [retry, error, classification, api, backoff]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
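+tests: ["error 429 es transitorio", "error 401 es permanente", "error timeout es transitorio", "error desconocido retorna unknown", "error con __cause__ transitorio", "permanente tiene prioridad sobre transitorio", "error 403 forbidden es permanente", "error 500 es transitorio"]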
+test_file_path: "python/functions/core/classify_api_error_test.py"
+file_path: "python/functions/core/classify_api_error.py"
+---
+
+## Ejemplo
+
+```python
+err = Exception("HTTP 429 TooManyRequests")
+classify_api_error(err)  # "transient"
+
+err = Exception("HTTP 401 Unauthorized")
+classify_api_error(err)  # "permanent"
+
+err = Exception("Connection timeout")
+classify_api_error(err)  # "transient"
+
+err = Exception("Something unexpected happened")
+classify_api_error(err)  # "unknown"
+```
+
+## Notas
+
+Funcion pura: solo inspecciona el texto del error y su causa directa (`__cause__`). No tiene I/O ni dependencias externas. La prioridad permanente > transitorio evita reintentar errores 400/401/403 que nunca tendran exito.
diff --git a/python/functions/core/classify_api_error.py b/python/functions/core/classify_api_error.py
new file mode 100644
index 00000000..facdced9
--- /dev/null
+++ b/python/functions/core/classify_api_error.py
@@ -0,0 +1,38 @@
+"""Classify an API exception as permanent, transient, or unknown."""
+
+
+def classify_api_error(error: Exception) -> str:
+    """Classify an API error as permanent, transient, or unknown.
+
+    Permanent errors should not be retried (e.g. auth failures, bad requests).
+    Transient errors are safe to retry (e.g. rate limits, timeouts, server errors).
+    Permanent classification takes priority over transient. The text inspected is
+    the error message plus the message of its direct __cause__, if any.
+
+    Args:
+        error: The exception to classify.
+
+    Returns:
+        "permanent" | "transient" | "unknown"
+    """
+    import re  # local import: the module has no top-level import block
+
+    cause = error.__cause__
+    text = str(error) if cause is None else f"{error} {cause}"
+    # Status codes must match a whole number: the "500" inside "15000 ms" or
+    # the "401" inside "order 14012" says nothing about the HTTP status.
+    numbers = set(re.findall(r"\d+", text))
+
+    permanent_codes = {"400", "401", "403"}
+    permanent_words = ("Forbidden", "Unauthorized")
+    transient_codes = {"429", "500", "502", "503", "504"}
+    transient_words = (
+        "TooManyRequests", "RateLimit", "timeout", "Timeout",
+        "ConnectionError", "Connection refused", "Connection reset",
+    )
+
+    if numbers & permanent_codes or any(w in text for w in permanent_words):
+        return "permanent"
+    if numbers & transient_codes or any(w in text for w in transient_words):
+        return "transient"
+    return "unknown"
diff --git a/python/functions/core/classify_api_error_test.py b/python/functions/core/classify_api_error_test.py
new file mode 100644
index 00000000..3f40c62c
--- /dev/null
+++ b/python/functions/core/classify_api_error_test.py
@@ -0,0 +1,50 @@
+"""Tests para classify_api_error."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from classify_api_error import classify_api_error
+
+
+def test_error_429_es_transitorio():
+    err = Exception("HTTP 429 TooManyRequests")
+    assert classify_api_error(err) == "transient"
+
+
+def test_error_401_es_permanente():
+    err = Exception("HTTP 401 Unauthorized")
+    assert classify_api_error(err) == "permanent"
+
+
+def test_error_timeout_es_transitorio():
+    err = Exception("Connection timeout occurred")
+    assert classify_api_error(err) == "transient"
+
+
+def test_error_desconocido_retorna_unknown():
+    err = Exception("Something completely unexpected happened")
+    assert classify_api_error(err) == "unknown"
+
+
+def test_error_con___cause___transitorio():
+    cause = Exception("Connection reset by peer")
+    err = Exception("Request failed")
+    err.__cause__ = cause
+    assert classify_api_error(err) == "transient"
+
+
+def test_permanente_tiene_prioridad_sobre_transitorio():
+    # Mensaje que contiene patrones de ambos tipos: 401 (permanent) y 503 (transient)
+    err = Exception("401 503 mixed error")
+    assert classify_api_error(err) == "permanent"
+
+
+def test_error_403_forbidden_es_permanente():
+    err = Exception("403 Forbidden")
+    assert classify_api_error(err) == "permanent"
+
+
+def test_error_500_es_transitorio():
+    err = Exception("Internal server error 500")
+    assert classify_api_error(err) == "transient"
diff --git a/python/functions/core/coerce_types.md b/python/functions/core/coerce_types.md
new file mode 100644
index 00000000..77e8555b
--- /dev/null
+++ b/python/functions/core/coerce_types.md
@@ -0,0 +1,49 @@
+---
+name: coerce_types
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def coerce_types(data: dict, schema: dict[str, str]) -> tuple[dict, list[str]]"
+description: "Convierte valores de un dict a los tipos esperados segun un schema declarativo. Soporta int, float, str, bool, datetime, list[str]. Util para normalizar datos de CSV, JSON o query params. Nunca muta el original. Coerciones imposibles generan warning y mantienen el valor original."
+tags: [coercion, types, normalization, pure, core, csv, json]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [datetime]
+tested: true
+tests:
+  - "string 42 a int 42"
+  - "string 3.14 a float 3.14"
+  - "string true a bool true"
+  - "string iso8601 a datetime"
+  - "coercion fallida genera warning sin crash"
+  - "dict con mix de tipos ya correctos y strings"
+  - "campo ausente en schema pass through sin tocar"
+  - "string lista a list str"
+test_file_path: "python/functions/core/coerce_types_test.py"
+file_path: "python/functions/core/coerce_types.py"
+---
+
+## Ejemplo
+
+```python
+data = {"age": "25", "score": "9.5", "active": "yes", "tags": "go, python"}
+schema = {"age": "int", "score": "float", "active": "bool", "tags": "list[str]"}
+
+result, warnings = coerce_types(data, schema)
+# result = {"age": 25, "score": 9.5, "active": True, "tags": ["go", "python"]}
+# warnings = []
+
+# Coercion fallida — mantiene original y avisa
+result2, warnings2 = coerce_types({"n": "abc"}, {"n": "int"})
+# result2 = {"n": "abc"}
+# warnings2 = ["n: cannot coerce 'abc' to int: cannot parse 'abc' as int"]
+```
+
+## Notas
+
+Funcion pura. Solo usa `datetime` de la stdlib. No muta el dict original — retorna uno nuevo. Schema es flat (no anidado); para validacion de estructura compleja combinar con `validate_json_schema`. Lossy coercions (float "3.7" → int 3) generan warning adicional. Campo ausente en schema se copia sin tocar.
diff --git a/python/functions/core/coerce_types.py b/python/functions/core/coerce_types.py
new file mode 100644
index 00000000..639863b5
--- /dev/null
+++ b/python/functions/core/coerce_types.py
@@ -0,0 +1,135 @@
+"""Coercion de valores de un dict a tipos esperados segun un schema declarativo."""
+
+from datetime import datetime, timezone
+
+
+def coerce_types(
+    data: dict, schema: dict[str, str]
+) -> tuple[dict, list[str]]:
+    """Convierte valores de un dict a los tipos esperados segun el schema.
+
+    Schema es un dict de {campo: tipo} donde tipo es uno de:
+    "int", "float", "str", "bool", "datetime", "list[str]".
+
+    Coerciones soportadas (todas desde str):
+    - str → int: int(v), warning si tenia decimales
+    - str → float: float(v)
+    - str → bool: "true/1/yes" → True, "false/0/no" → False (case-insensitive)
+    - str → datetime: ISO 8601 parse
+    - str → list[str]: split por "," y strip de cada elemento
+    - Valor ya del tipo correcto → pass through
+    - Campo ausente en schema → pass through sin tocar
+    - Coercion imposible → mantener original + warning
+
+    Args:
+        data: Dict con los valores a coercionar.
+        schema: Dict de {campo: tipo_esperado}.
+
+    Returns:
+        (coerced_data, warnings) — nuevo dict con tipos corregidos (no muta el
+        original), lista de warnings para coerciones lossy o fallidas.
+    """
+    result = dict(data)
+    warnings: list[str] = []
+
+    for field, target_type in schema.items():
+        if field not in data:
+            continue
+
+        value = data[field]
+        try:
+            result[field] = _coerce_value(value, target_type, field, warnings)
+        except Exception as exc:
+            warnings.append(
+                f"{field}: cannot coerce {value!r} to {target_type}: {exc}"
+            )
+            result[field] = value
+
+    return result, warnings
+
+
+_BOOL_TRUE = {"true", "1", "yes"}
+_BOOL_FALSE = {"false", "0", "no"}
+
+
+def _coerce_value(
+    value: object, target: str, field: str, warnings: list[str]
+) -> object:
+    """Coerce one value to ``target``, appending lossy-coercion notes to
+    ``warnings`` and raising ValueError/TypeError when coercion is impossible."""
+    # --- int ---
+    if target == "int":
+        if isinstance(value, int) and not isinstance(value, bool):
+            return value
+        if isinstance(value, float):
+            if value != int(value):
+                warnings.append(
+                    f"{field}: lossy coercion float→int: {value} → {int(value)}"
+                )
+            return int(value)
+        if isinstance(value, str):
+            stripped = value.strip()
+            try:
+                return int(stripped)  # exact: a float round-trip corrupts ints above 2**53
+            except ValueError:
+                pass  # not a plain integer literal; try float notation below
+            try:
+                as_float = float(stripped)
+                if as_float != int(as_float):
+                    warnings.append(
+                        f"{field}: lossy coercion str→int: {value!r} → {int(as_float)}"
+                    )
+                return int(as_float)
+            except ValueError:
+                raise ValueError(f"cannot parse {value!r} as int")
+        raise TypeError(f"cannot coerce {type(value).__name__} to int")
+
+    # --- float ---
+    if target == "float":
+        if isinstance(value, float):
+            return value
+        if isinstance(value, int) and not isinstance(value, bool):
+            return float(value)
+        if isinstance(value, str):
+            return float(value.strip())
+        raise TypeError(f"cannot coerce {type(value).__name__} to float")
+
+    # --- str ---
+    if target == "str":
+        return value if isinstance(value, str) else str(value)
+
+    # --- bool ---
+    if target == "bool":
+        if isinstance(value, bool):
+            return value
+        if isinstance(value, str):
+            low = value.strip().lower()
+            if low in _BOOL_TRUE or low in _BOOL_FALSE:
+                return low in _BOOL_TRUE
+            raise ValueError(
+                f"cannot parse {value!r} as bool; expected true/false/1/0/yes/no"
+            )
+        if isinstance(value, int):
+            return bool(value)
+        raise TypeError(f"cannot coerce {type(value).__name__} to bool")
+
+    # --- datetime ---
+    if target == "datetime":
+        if isinstance(value, datetime):
+            return value
+        if isinstance(value, str):
+            s = value.strip()
+            if s.endswith("Z"):  # rewrite the "Z" suffix as an explicit UTC offset
+                s = s[:-1] + "+00:00"
+            return datetime.fromisoformat(s)
+        raise TypeError(f"cannot coerce {type(value).__name__} to datetime")
+
+    # --- list[str] ---
+    if target == "list[str]":
+        if isinstance(value, list):
+            return [str(item) for item in value]
+        if isinstance(value, str):
+            return [item.strip() for item in value.split(",")]
+        raise TypeError(f"cannot coerce {type(value).__name__} to list[str]")
+
+    raise ValueError(f"unknown target type: {target!r}")
diff --git a/python/functions/core/coerce_types_test.py b/python/functions/core/coerce_types_test.py
new file mode 100644
index 00000000..39496bc2
--- /dev/null
+++ b/python/functions/core/coerce_types_test.py
@@ -0,0 +1,84 @@
+"""Tests para coerce_types."""
+
+import sys
+import os
+from datetime import datetime, timezone
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from coerce_types import coerce_types
+
+
+def test_string_42_a_int_42():
+    result, warnings = coerce_types({"n": "42"}, {"n": "int"})
+    assert result["n"] == 42
+    assert isinstance(result["n"], int)
+    assert warnings == []
+
+
+def test_string_3_14_a_float_3_14():
+    result, warnings = coerce_types({"x": "3.14"}, {"x": "float"})
+    assert abs(result["x"] - 3.14) < 1e-9
+    assert warnings == []
+
+
+def test_string_true_a_bool_true():
+    result, warnings = coerce_types({"flag": "true"}, {"flag": "bool"})
+    assert result["flag"] is True
+    assert warnings == []
+
+    result2, _ = coerce_types({"flag": "yes"}, {"flag": "bool"})
+    assert result2["flag"] is True
+
+    result3, _ = coerce_types({"flag": "1"}, {"flag": "bool"})
+    assert result3["flag"] is True
+
+    result4, _ = coerce_types({"flag": "false"}, {"flag": "bool"})
+    assert result4["flag"] is False
+
+
+def test_string_iso8601_a_datetime():
+    result, warnings = coerce_types(
+        {"ts": "2024-01-15T10:30:00Z"}, {"ts": "datetime"}
+    )
+    assert isinstance(result["ts"], datetime)
+    assert result["ts"].year == 2024
+    assert result["ts"].month == 1
+    assert result["ts"].day == 15
+    assert warnings == []
+
+
+def test_coercion_fallida_genera_warning_sin_crash():
+    result, warnings = coerce_types({"n": "not-a-number"}, {"n": "int"})
+    # mantiene el original
+    assert result["n"] == "not-a-number"
+    assert len(warnings) == 1
+    assert "n" in warnings[0]
+
+
+def test_dict_con_mix_de_tipos_ya_correctos_y_strings():
+    data = {"a": "10", "b": 3.14, "c": True, "d": "hello"}
+    schema = {"a": "int", "b": "float", "c": "bool", "d": "str"}
+    result, warnings = coerce_types(data, schema)
+    assert result["a"] == 10
+    assert abs(result["b"] - 3.14) < 1e-9
+    assert result["c"] is True
+    assert result["d"] == "hello"
+    assert warnings == []
+
+
+def test_campo_ausente_en_schema_pass_through_sin_tocar():
+    data = {"a": "42", "b": [1, 2, 3]}
+    schema = {"a": "int"}  # "b" no esta en schema
+    result, warnings = coerce_types(data, schema)
+    assert result["a"] == 42
+    assert result["b"] == [1, 2, 3]
+    assert warnings == []
+
+
+def test_string_lista_a_list_str():
+    result, warnings = coerce_types(
+        {"tags": "python, go, bash"}, {"tags": "list[str]"}
+    )
+    assert result["tags"] == ["python", "go", "bash"]
+    assert warnings == []
diff --git a/python/functions/core/compute_backoff_delay.md b/python/functions/core/compute_backoff_delay.md
new file mode 100644
index 00000000..51e6d99a
--- /dev/null
+++ b/python/functions/core/compute_backoff_delay.md
@@ -0,0 +1,41 @@
+---
+name: compute_backoff_delay
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def compute_backoff_delay(attempt: int, base_delay: float = 0.5, max_delay: float = 8.0, jitter: bool = True) -> float"
+description: "Calcula el delay para exponential backoff con jitter opcional. delay = min(base_delay * 2^attempt, max_delay). Con jitter anade random.uniform(0, min(base_delay, delay)), por lo que el resultado puede superar max_delay en hasta base_delay."
+tags: [retry, backoff, exponential, delay, jitter]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [random]
+tested: true
+tests: ["attempt 0 retorna base_delay sin jitter", "attempt alto se cappea a max_delay", "sin jitter es determinista"]
+test_file_path: "python/functions/core/compute_backoff_delay_test.py"
+file_path: "python/functions/core/compute_backoff_delay.py"
+---
+
+## Ejemplo
+
+```python
+# Primer reintento (attempt=0): delay = 0.5 * 2^0 = 0.5s
+compute_backoff_delay(0, jitter=False)  # 0.5
+
+# Tercer reintento (attempt=2): delay = 0.5 * 2^2 = 2.0s
+compute_backoff_delay(2, jitter=False)  # 2.0
+
+# Intento alto, capped a 8.0s
+compute_backoff_delay(10, jitter=False)  # 8.0
+
+# Con jitter (no determinista)
+compute_backoff_delay(1)  # entre 1.0 y 1.5
+```
+
+## Notas
+
+Usa `random` de la stdlib. Con jitter=True el resultado no es determinista, pero la funcion es clasificada como pura conceptualmente dado que el jitter es intencional y no hay I/O. Para tests deterministicos usar jitter=False.
diff --git a/python/functions/core/compute_backoff_delay.py b/python/functions/core/compute_backoff_delay.py
new file mode 100644
index 00000000..b75be0dc
--- /dev/null
+++ b/python/functions/core/compute_backoff_delay.py
@@ -0,0 +1,26 @@
+"""Compute exponential backoff delay with optional jitter."""
+
+import random
+
+
+def compute_backoff_delay(
+    attempt: int,
+    base_delay: float = 0.5,
+    max_delay: float = 8.0,
+    jitter: bool = True,
+) -> float:
+    """Compute exponential backoff delay for a given attempt number.
+
+    Args:
+        attempt: Zero-based attempt index (0 = first retry); any size is accepted.
+        base_delay: Base delay in seconds before exponential scaling.
+        max_delay: Cap on the exponential part; jitter is added on top of it.
+        jitter: If True, adds up to min(base_delay, delay) of random jitter.
+
+    Returns:
+        Delay in seconds to wait before the next attempt.
+    """
+    delay = min(base_delay * 2.0 ** min(attempt, 1023), max_delay)  # avoid OverflowError
+    if jitter:
+        delay += random.uniform(0, min(base_delay, delay))
+    return delay
diff --git a/python/functions/core/compute_backoff_delay_test.py b/python/functions/core/compute_backoff_delay_test.py
new file mode 100644
index 00000000..35a8c6e9
--- /dev/null
+++ b/python/functions/core/compute_backoff_delay_test.py
@@ -0,0 +1,42 @@
+"""Tests para compute_backoff_delay."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from compute_backoff_delay import compute_backoff_delay
+
+
+def test_attempt_0_retorna_base_delay_sin_jitter():
+    result = compute_backoff_delay(0, base_delay=0.5, max_delay=8.0, jitter=False)
+    assert result == 0.5
+
+
+def test_attempt_alto_se_cappea_a_max_delay():
+    result = compute_backoff_delay(10, base_delay=0.5, max_delay=8.0, jitter=False)
+    assert result == 8.0
+
+
+def test_sin_jitter_es_determinista():
+    r1 = compute_backoff_delay(3, base_delay=1.0, max_delay=16.0, jitter=False)
+    r2 = compute_backoff_delay(3, base_delay=1.0, max_delay=16.0, jitter=False)
+    assert r1 == r2
+    # attempt=3: 1.0 * 2^3 = 8.0
+    assert r1 == 8.0
+
+
+def test_escala_exponencial():
+    d0 = compute_backoff_delay(0, base_delay=1.0, max_delay=100.0, jitter=False)
+    d1 = compute_backoff_delay(1, base_delay=1.0, max_delay=100.0, jitter=False)
+    d2 = compute_backoff_delay(2, base_delay=1.0, max_delay=100.0, jitter=False)
+    assert d0 == 1.0
+    assert d1 == 2.0
+    assert d2 == 4.0
+
+
+def test_con_jitter_no_excede_max_delay_mas_base():
+    # Con jitter, delay base + jitter <= max_delay + base_delay
+    for attempt in range(5):
+        result = compute_backoff_delay(attempt, base_delay=0.5, max_delay=8.0, jitter=True)
+        assert result >= 0.5
+        assert result <= 8.0 + 0.5
diff --git a/python/functions/core/convert_github_to_raw_url.md b/python/functions/core/convert_github_to_raw_url.md
new file mode 100644
index 00000000..6a815b51
--- /dev/null
+++ b/python/functions/core/convert_github_to_raw_url.md
@@ -0,0 +1,59 @@
+---
+name: convert_github_to_raw_url
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "convert_github_to_raw_url(url: str) -> str"
+description: "Convierte una URL de blob de GitHub/GitLab a su URL raw. Ej: github.com/org/repo/blob/main/file.py → raw.githubusercontent.com/org/repo/main/file.py. Retorna la URL sin cambios si no aplica."
+tags: [github, gitlab, url, raw, blob, convert, transform]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["urllib.parse"]
+tested: true
+tests:
+  - "URL GitHub blob"
+  - "URL GitLab blob"
+  - "URL que no es blob retorna sin cambios"
+  - "URL no-GitHub retorna sin cambios"
+test_file_path: "python/functions/core/convert_github_to_raw_url_test.py"
+file_path: "python/functions/core/convert_github_to_raw_url.py"
+---
+
+## Ejemplo
+
+```python
+from core.convert_github_to_raw_url import convert_github_to_raw_url
+
+# GitHub blob → raw.githubusercontent.com
+url = convert_github_to_raw_url(
+    "https://github.com/openai/whisper/blob/main/README.md"
+)
+# "https://raw.githubusercontent.com/openai/whisper/main/README.md"
+
+# GitLab blob → raw
+url = convert_github_to_raw_url(
+    "https://gitlab.com/org/repo/-/blob/main/file.py"
+)
+# "https://gitlab.com/org/repo/-/raw/main/file.py"
+
+# URL sin blob → sin cambios
+url = convert_github_to_raw_url("https://github.com/org/repo")
+# "https://github.com/org/repo"
+```
+
+## Notas
+
+Algoritmo:
+1. Parsear la URL con `urllib.parse.urlparse`.
+2. Si host es `github.com`: buscar segmento `blob` en el path.
+   - Si existe: eliminar el segmento `blob` y cambiar el dominio a `raw.githubusercontent.com`.
+3. Si host es `gitlab.com` o empieza con `gitlab.`: reemplazar `/-/blob/` por `/-/raw/`
+   o `/blob/` por `/raw/`.
+4. Cualquier otro host: retornar la URL sin cambios.
+
+Funcion pura. No hace I/O ni tiene efectos secundarios.
diff --git a/python/functions/core/convert_github_to_raw_url.py b/python/functions/core/convert_github_to_raw_url.py
new file mode 100644
index 00000000..22f49614
--- /dev/null
+++ b/python/functions/core/convert_github_to_raw_url.py
@@ -0,0 +1,69 @@
+"""Convierte URLs de blob de GitHub/GitLab a su equivalente raw."""
+
+from urllib.parse import urlparse, urlunparse
+
+
+def convert_github_to_raw_url(url: str) -> str:
+    """Convert a GitHub or GitLab blob URL into its raw-content URL.
+
+    GitHub blob:
+        https://github.com/org/repo/blob/main/path/file.py
+        → https://raw.githubusercontent.com/org/repo/main/path/file.py
+
+    GitLab blob:
+        https://gitlab.com/org/repo/-/blob/main/path/file.py
+        → https://gitlab.com/org/repo/-/raw/main/path/file.py
+
+    Only the first blob marker is rewritten (on GitHub it must directly follow
+    /org/repo), so a later path component named "blob" is preserved.
+
+    Args:
+        url: GitHub or GitLab URL, possibly pointing at a blob.
+
+    Returns:
+        The raw URL when the transformation applies; otherwise the stripped input.
+    """
+    url = url.strip()
+    if not url:
+        return url
+
+    parsed = urlparse(url)
+    host = parsed.hostname or ""
+
+    # --- GitHub ---
+    if host in ("github.com", "www.github.com"):
+        # Expected path: /org/repo/blob/ref/path/to/file, so after splitting
+        # on "/" the marker must sit at index 3 (index 0 is the empty prefix).
+        segments = parsed.path.split("/")
+        if len(segments) > 3 and segments[3] == "blob":
+            # Drop the "blob" segment: /org/repo/ref/path/...
+            new_path = "/".join(segments[:3] + segments[4:])
+            return urlunparse((
+                "https",
+                "raw.githubusercontent.com",
+                new_path,
+                parsed.params,
+                parsed.query,
+                parsed.fragment,
+            ))
+        return url
+
+    # --- GitLab ---
+    if host in ("gitlab.com", "www.gitlab.com") or host.startswith("gitlab."):
+        # Expected path: /org/repo/-/blob/ref/path or /org/repo/blob/ref/path.
+        # Rewrite only the first marker so "/blob/" inside the file path survives.
+        marker = "/-/blob/" if "/-/blob/" in parsed.path else "/blob/"
+        new_path = parsed.path.replace(marker, marker.replace("blob", "raw"), 1)
+        if new_path != parsed.path:
+            return urlunparse((
+                parsed.scheme,
+                parsed.netloc,
+                new_path,
+                parsed.params,
+                parsed.query,
+                parsed.fragment,
+            ))
+        return url
+
+    # No transformation applies.
+    return url
diff --git a/python/functions/core/convert_github_to_raw_url_test.py b/python/functions/core/convert_github_to_raw_url_test.py
new file mode 100644
index 00000000..16a38835
--- /dev/null
+++ b/python/functions/core/convert_github_to_raw_url_test.py
@@ -0,0 +1,77 @@
+"""Tests para convert_github_to_raw_url."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from core.convert_github_to_raw_url import convert_github_to_raw_url
+
+
+def test_url_github_blob():
+    """URL de GitHub blob se convierte correctamente a raw.githubusercontent.com."""
+    url = "https://github.com/openai/whisper/blob/main/README.md"
+    result = convert_github_to_raw_url(url)
+    assert result == "https://raw.githubusercontent.com/openai/whisper/main/README.md"
+
+
+def test_url_github_blob_subdirectorio():
+    """URL de GitHub blob con subdirectorio se convierte correctamente."""
+    url = "https://github.com/org/repo/blob/main/src/utils/helper.py"
+    result = convert_github_to_raw_url(url)
+    assert result == "https://raw.githubusercontent.com/org/repo/main/src/utils/helper.py"
+
+
+def test_url_github_blob_otra_rama():
+    """URL de GitHub blob con rama distinta a main se convierte correctamente."""
+    url = "https://github.com/org/repo/blob/develop/config.yaml"
+    result = convert_github_to_raw_url(url)
+    assert result == "https://raw.githubusercontent.com/org/repo/develop/config.yaml"
+
+
+def test_url_gitlab_blob():
+    """URL de GitLab blob se convierte a raw."""
+    url = "https://gitlab.com/org/repo/-/blob/main/README.md"
+    result = convert_github_to_raw_url(url)
+    assert result == "https://gitlab.com/org/repo/-/raw/main/README.md"
+
+
+def test_url_gitlab_blob_sin_guion():
+    """URL de GitLab blob sin '/-/' tambien se convierte."""
+    url = "https://gitlab.com/org/repo/blob/main/README.md"
+    result = convert_github_to_raw_url(url)
+    assert result == "https://gitlab.com/org/repo/raw/main/README.md"
+
+
+def test_url_que_no_es_blob_retorna_sin_cambios():
+    """URL de GitHub sin blob retorna sin cambios."""
+    url = "https://github.com/org/repo"
+    result = convert_github_to_raw_url(url)
+    assert result == url
+
+
+def test_url_github_tree_retorna_sin_cambios():
+    """URL de GitHub tree (no blob) retorna sin cambios."""
+    url = "https://github.com/org/repo/tree/main/src"
+    result = convert_github_to_raw_url(url)
+    assert result == url
+
+
+def test_url_no_github_retorna_sin_cambios():
+    """URL de otro dominio retorna sin cambios."""
+    url = "https://example.com/org/repo/blob/main/file.py"
+    result = convert_github_to_raw_url(url)
+    assert result == url
+
+
+def test_url_vacia_retorna_sin_cambios():
+    """URL vacia retorna string vacio."""
+    result = convert_github_to_raw_url("")
+    assert result == ""
+
+
+def test_url_raw_githubusercontent_retorna_sin_cambios():
+    """URL ya en raw.githubusercontent.com no se modifica."""
+    url = "https://raw.githubusercontent.com/org/repo/main/file.py"
+    result = convert_github_to_raw_url(url)
+    assert result == url
diff --git a/python/functions/core/core.py b/python/functions/core/core.py
index 0b6a6e7e..30d0c637 100644
--- a/python/functions/core/core.py
+++ b/python/functions/core/core.py
@@ -1,7 +1,9 @@
 """Core functional programming utilities — pure functions for list/collection operations."""
 
+import hashlib
+import re
 from functools import reduce as _reduce
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 
 def filter_list(xs: list, pred: Callable) -> list:
@@ -133,3 +135,680 @@ def compose(*fns: Callable) -> Callable:
             result = fn(result)
         return result
     return composed
+
+
+# ── Tree manipulation ────────────────────────────────────────────────────────
+
+
+def flatten_tree(structure: Any) -> List[Dict]:
+    """Flatten a hierarchical tree (dict with 'nodes') to a list without children."""
+    import copy
+    if isinstance(structure, dict):
+        node = copy.deepcopy(structure)
+        node.pop('nodes', None)
+        nodes = [node]
+        for key in list(structure.keys()):
+            if 'nodes' in key:
+                nodes.extend(flatten_tree(structure[key]))
+        return nodes
+    elif isinstance(structure, list):
+        nodes = []
+        for item in structure:
+            nodes.extend(flatten_tree(item))
+        return nodes
+    return []
+
+
+def tree_to_flat_list(structure: Any) -> List[Dict]:
+    """Convert hierarchical tree to flat list preserving DFS order (keeps internal nodes)."""
+    if isinstance(structure, dict):
+        nodes = [structure]
+        if 'nodes' in structure:
+            nodes.extend(tree_to_flat_list(structure['nodes']))
+        return nodes
+    elif isinstance(structure, list):
+        nodes = []
+        for item in structure:
+            nodes.extend(tree_to_flat_list(item))
+        return nodes
+    return []
+
+
+def get_leaf_nodes(structure: Any) -> List[Dict]:
+    """Extract only leaf nodes (no children) from a hierarchical tree."""
+    import copy
+    if isinstance(structure, dict):
+        if not structure.get('nodes'):
+            node = copy.deepcopy(structure)
+            node.pop('nodes', None)
+            return [node]
+        leaf_nodes = []
+        for key in list(structure.keys()):
+            if 'nodes' in key:
+                leaf_nodes.extend(get_leaf_nodes(structure[key]))
+        return leaf_nodes
+    elif isinstance(structure, list):
+        leaf_nodes = []
+        for item in structure:
+            leaf_nodes.extend(get_leaf_nodes(item))
+        return leaf_nodes
+    return []
+
+
+def write_node_ids(data: Any, node_id: int = 0) -> int:
+    """Assign zero-padded sequential IDs in place ("0000", "0001", ... when node_id=0). Returns next counter."""
+    if isinstance(data, dict):
+        data['node_id'] = str(node_id).zfill(4)
+        node_id += 1
+        for key in list(data.keys()):
+            if 'nodes' in key:
+                node_id = write_node_ids(data[key], node_id)
+    elif isinstance(data, list):
+        for item in data:
+            node_id = write_node_ids(item, node_id)
+    return node_id
+
+
+def list_to_tree(data: List[Dict]) -> List[Dict]:
+    """Convert flat list with structure codes ('1.2.3') to nested tree."""
+    def get_parent_structure(structure):
+        if not structure:
+            return None
+        parts = str(structure).split('.')
+        return '.'.join(parts[:-1]) if len(parts) > 1 else None
+
+    nodes = {}
+    root_nodes = []
+
+    for item in data:
+        structure = item.get('structure')
+        node = {
+            'title': item.get('title'),
+            'start_index': item.get('start_index'),
+            'end_index': item.get('end_index'),
+            'nodes': []
+        }
+        nodes[structure] = node
+        parent_structure = get_parent_structure(structure)
+
+        if parent_structure and parent_structure in nodes:
+            nodes[parent_structure]['nodes'].append(node)
+        else:
+            root_nodes.append(node)
+
+    def clean_node(node):
+        if not node['nodes']:
+            del node['nodes']
+        else:
+            for child in node['nodes']:
+                clean_node(child)
+        return node
+
+    return [clean_node(node) for node in root_nodes]
+
+
+def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
+    """Recursively remove specified fields from a tree (dict/list)."""
+    if fields is None:
+        fields = ['text']
+    if isinstance(data, dict):
+        return {k: remove_tree_fields(v, fields) for k, v in data.items() if k not in fields}
+    elif isinstance(data, list):
+        return [remove_tree_fields(item, fields) for item in data]
+    return data
+
+
+def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
+    """Reorder fields of each node in a tree according to specified key order."""
+    if not order:
+        return structure
+    if isinstance(structure, dict):
+        if 'nodes' in structure:
+            structure['nodes'] = format_tree_structure(structure['nodes'], order)
+        if not structure.get('nodes'):
+            structure.pop('nodes', None)
+        return {key: structure[key] for key in order if key in structure}
+    elif isinstance(structure, list):
+        return [format_tree_structure(item, order) for item in structure]
+    return structure
+
+
+def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
+    """Create flat dict mapping node_id to node for O(1) lookup."""
+    mapping = {}
+    def _traverse(nodes):
+        for node in nodes:
+            if node.get('node_id'):
+                mapping[node['node_id']] = node
+            if node.get('nodes'):
+                _traverse(node['nodes'])
+    _traverse(tree)
+    return mapping
+
+
+# ── Text / JSON extraction ───────────────────────────────────────────────────
+
+
+def extract_json_from_llm(content: str) -> Dict:
+    """Extract and parse JSON from an LLM reply; returns {} when nothing parses."""
+    import json
+    if not isinstance(content, str):
+        return {}
+    start = content.find("```json")
+    if start != -1:
+        start += len("```json")
+        end = content.rfind("```")
+        # A truncated reply has no closing fence: rfind then finds the opener.
+        raw = content[start:end if end >= start else None].strip()
+    else:
+        raw = content.strip()
+    # Try the text untouched first so valid JSON is never altered, then fall back
+    # to lossier repairs: Python None, collapsed whitespace, trailing commas.
+    repaired = ' '.join(raw.replace('None', 'null').split())
+    candidates = (raw, repaired, repaired.replace(',]', ']').replace(',}', '}'))
+    for candidate in candidates:
+        try:
+            return json.loads(candidate)
+        except (ValueError, RecursionError):
+            continue
+    return {}
+
+
+def parse_page_range(pages: str) -> List[int]:
+    """Parse page range string ('5-7', '3,8', '12') into sorted list of unique ints."""
+    result = []
+    for part in pages.split(','):
+        part = part.strip()
+        if '-' in part:
+            start, end = int(part.split('-', 1)[0].strip()), int(part.split('-', 1)[1].strip())
+            if start > end:
+                raise ValueError(f"Invalid range '{part}': start must be <= end")
+            result.extend(range(start, end + 1))
+        else:
+            result.append(int(part))
+    return sorted(set(result))
+
+
+# ── Markdown parsing ─────────────────────────────────────────────────────────
+
+
+def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
+    """Extract all headers (h1-h6) from markdown with line numbers, skipping code blocks."""
+    import re
+    header_pattern = r'^(#{1,6})\s+(.+)$'
+    code_block_pattern = r'^```'
+    node_list = []
+    lines = markdown_content.split('\n')
+    in_code_block = False
+
+    for line_num, line in enumerate(lines, 1):
+        stripped_line = line.strip()
+        if re.match(code_block_pattern, stripped_line):
+            in_code_block = not in_code_block
+            continue
+        if not stripped_line:
+            continue
+        if not in_code_block:
+            match = re.match(header_pattern, stripped_line)
+            if match:
+                level = len(match.group(1))
+                title = match.group(2).strip()
+                node_list.append({'title': title, 'level': level, 'line_num': line_num})
+
+    return node_list, lines
+
+
+def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
+    """Build nested tree from flat list of headers with levels (h1>h2>h3)."""
+    if not node_list:
+        return []
+
+    stack = []
+    root_nodes = []
+    node_counter = 1
+
+    for node in node_list:
+        current_level = node['level']
+        tree_node = {
+            'title': node['title'],
+            'node_id': str(node_counter).zfill(4),
+            'line_num': node['line_num'],
+            'nodes': []
+        }
+        node_counter += 1
+
+        while stack and stack[-1][1] >= current_level:
+            stack.pop()
+
+        if not stack:
+            root_nodes.append(tree_node)
+        else:
+            parent_node, _ = stack[-1]
+            parent_node['nodes'].append(tree_node)
+
+        stack.append((tree_node, current_level))
+
+    def clean_empty_nodes(nodes):
+        for n in nodes:
+            if n['nodes']:
+                clean_empty_nodes(n['nodes'])
+            else:
+                del n['nodes']
+        return nodes
+
+    return clean_empty_nodes(root_nodes)
+
+
+# ── Pagination / chunking ────────────────────────────────────────────────────
+
+
+def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
+                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
+    """Group pages into text chunks under a token budget, repeating overlap_pages pages between chunks."""
+    import math
+    num_tokens = sum(token_lengths)
+
+    if num_tokens <= max_tokens:
+        return ["".join(page_contents)]
+
+    subsets = []
+    current_subset = []
+    current_token_count = 0
+    # Per-chunk budget: midpoint between an even split and the hard max_tokens limit.
+    expected_parts = math.ceil(num_tokens / max_tokens)
+    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)
+    # Flush only a non-empty chunk: an oversized first page must not emit an empty leading chunk.
+    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
+        if current_subset and current_token_count + page_tokens > avg_tokens:
+            subsets.append(''.join(current_subset))
+            overlap_start = max(i - overlap_pages, 0)
+            current_subset = list(page_contents[overlap_start:i])
+            current_token_count = sum(token_lengths[overlap_start:i])
+
+        current_subset.append(page_content)
+        current_token_count += page_tokens
+
+    if current_subset:
+        subsets.append(''.join(current_subset))
+
+    return subsets
+
+
+def calculate_page_offset(pairs: List[Dict]) -> int:
+    """Calculate offset between logical page numbers and physical indices using reference pairs."""
+    differences = []
+    for pair in pairs:
+        try:
+            difference = pair['physical_index'] - pair['page']
+            differences.append(difference)
+        except (KeyError, TypeError):
+            continue
+
+    if not differences:
+        return 0
+
+    counts: Dict[int, int] = {}
+    for diff in differences:
+        counts[diff] = counts.get(diff, 0) + 1
+
+    return max(counts.items(), key=lambda x: x[1])[0]
+
+
+# ── Text preprocessing ───────────────────────────────────────────────────────
+
+
+def preprocess_text(text: str) -> str:
+    """Normalize whitespace and newlines in raw text.
+
+    Args:
+        text: Raw text to normalize.
+
+    Returns:
+        Normalized text with consistent newlines, stripped lines, and no
+        excessive blank lines.
+    """
+    # Normalize line endings: \r\n and \r -> \n
+    text = text.replace('\r\n', '\n').replace('\r', '\n')
+    # Strip whitespace from each line first, so whitespace-only lines become truly blank
+    text = '\n'.join(line.strip() for line in text.split('\n'))
+    # Reduce 3+ consecutive newlines to at most 2 (now also covers formerly whitespace-only lines)
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    # Strip globally
+    return text.strip()
+
+
+def get_text_stats(text: str) -> dict:
+    """Compute basic statistics of a text: characters, lines, words.
+
+    Args:
+        text: Input text to analyze.
+
+    Returns:
+        Dict with keys total_chars (int), total_lines (int), total_words (int).
+    """
+    return {
+        'total_chars': len(text),
+        'total_lines': text.count('\n') + 1,
+        'total_words': len(text.split()),
+    }
+
+
+# ── Git URL parsing ──────────────────────────────────────────────────────────
+
+_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
+
+
+def _sanitize_git_segment(segment: str) -> str:
+    """Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
+    if segment.endswith(".git"):
+        segment = segment[:-4]
+    return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)
+
+
+def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
+    """Parse a code-hosting URL and return the 'org/repo' path component.
+
+    Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path).
+    Returns None if the URL does not match any known host or is malformed.
+
+    Args:
+        url: Repository URL in any supported format.
+        known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com.
+
+    Returns:
+        'org/repo' string or None.
+    """
+    from urllib.parse import urlparse
+
+    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
+    url = url.strip()
+
+    if url.startswith("git@"):
+        # git@github.com:org/repo.git
+        rest = url[len("git@"):]
+        if ":" not in rest:
+            return None
+        host, path = rest.split(":", 1)
+        if host not in hosts:
+            return None
+        segments = [s for s in path.split("/") if s]
+        if len(segments) < 2:
+            return None
+        org = _sanitize_git_segment(segments[0])
+        repo = _sanitize_git_segment(segments[1])
+        if not org or not repo:
+            return None
+        return f"{org}/{repo}"
+
+    for prefix in ("http://", "https://", "git://", "ssh://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            netloc = parsed.hostname or ""
+            if netloc not in hosts:
+                return None
+            segments = [s for s in parsed.path.split("/") if s]
+            if len(segments) < 2:
+                return None
+            org = _sanitize_git_segment(segments[0])
+            repo = _sanitize_git_segment(segments[1])
+            if not org or not repo:
+                return None
+            return f"{org}/{repo}"
+
+    return None
+
+
+def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
+    """Return True only if url points to a clonable git repository.
+
+    Accepts org/repo and org/repo/tree/<ref> paths.
+    Rejects paths that navigate to sub-resources (issues, blobs, PRs, etc.).
+
+    Args:
+        url: URL to verify.
+        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.
+
+    Returns:
+        True if url is a clonable repository URL.
+    """
+    from urllib.parse import urlparse
+
+    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
+    url = url.strip()
+
+    # SSH shorthand — always repo-level if host matches
+    if url.startswith("git@"):
+        rest = url[len("git@"):]
+        if ":" not in rest:
+            return False
+        host, _ = rest.split(":", 1)
+        return host in hosts
+
+    # git:// and ssh:// — always repo-level if host matches
+    for prefix in ("ssh://", "git://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            return (parsed.hostname or "") in hosts
+
+    # http:// and https:// — must have exactly org/repo or org/repo/tree/<ref>
+    for prefix in ("http://", "https://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            if (parsed.hostname or "") not in hosts:
+                return False
+            segments = [s for s in parsed.path.split("/") if s]
+            if len(segments) == 2:
+                return True
+            if len(segments) == 4 and segments[2] == "tree":
+                return True
+            return False
+
+    return False
+
+
+def validate_git_ssh_uri(url: str) -> None:
+    """Validate a git SSH URI of the form git@host:path.
+
+    Raises ValueError with a descriptive message if the URI is malformed.
+
+    Args:
+        url: URI string to validate.
+
+    Raises:
+        ValueError: If the URI does not conform to git SSH format.
+    """
+    if not url.startswith("git@"):
+        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")
+    rest = url[len("git@"):]
+    if ":" not in rest:
+        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
+    _, path = rest.split(":", 1)
+    if not path:
+        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
+
+
+# ---------------------------------------------------------------------------
+# Markdown parsing utilities
+# ---------------------------------------------------------------------------
+
+
+def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
+    """Extract YAML frontmatter delimited by '---' from the start of a markdown string.
+
+    Args:
+        content: Raw markdown string, optionally starting with YAML frontmatter.
+
+    Returns:
+        Tuple of (content_without_frontmatter, frontmatter_dict).
+        frontmatter_dict is None when no frontmatter is found.
+    """
+    pattern = re.compile(r'^---\n(.*?)\n---\n', re.DOTALL)
+    match = pattern.match(content)
+    if not match:
+        return content, None
+
+    raw = match.group(1)
+    remaining = content[match.end():]
+
+    try:
+        import yaml  # type: ignore
+        data = yaml.safe_load(raw)
+        if not isinstance(data, dict):
+            data = None
+    except Exception:
+        # Fallback: simple key: value parser (no yaml dependency)
+        data = {}
+        for line in raw.splitlines():
+            if ':' in line:
+                key, _, value = line.partition(':')
+                data[key.strip()] = value.strip()
+
+    return remaining, data
+
+
+def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
+    """Find all markdown headings (# to ######), excluding those inside code blocks,
+    HTML comments, and indented blocks.
+
+    Args:
+        content: Markdown text to search.
+
+    Returns:
+        List of (start_pos, end_pos, title, level) for each heading found.
+    """
+    excluded: List[Tuple[int, int]] = []
+
+    # Code blocks (triple backtick)
+    for m in re.finditer(r'```.*?```', content, re.DOTALL):
+        excluded.append((m.start(), m.end()))
+
+    # HTML comments
+    for m in re.finditer(r'<!--.*?-->', content, re.DOTALL):
+        excluded.append((m.start(), m.end()))
+
+    # Indented blocks (lines starting with 4 spaces or a tab)
+    for m in re.finditer(r'^(    |\t).+$', content, re.MULTILINE):
+        excluded.append((m.start(), m.end()))
+
+    def is_excluded(pos: int) -> bool:
+        return any(start <= pos < end for start, end in excluded)
+    # Separator is [ \t]+, not \s+: \s also matches newlines and would let a bare '#' line swallow the next paragraph.
+    results: List[Tuple[int, int, str, int]] = []
+    for m in re.finditer(r'^(#{1,6})[ \t]+(.+)$', content, re.MULTILINE):
+        # Skip escaped headings (\#)
+        before = content[m.start() - 1] if m.start() > 0 else ''
+        if before == '\\':
+            continue
+        if is_excluded(m.start()):
+            continue
+        level = len(m.group(1))
+        title = m.group(2).strip()
+        results.append((m.start(), m.end(), title, level))
+
+    return results
+
+
+def estimate_token_count(content: str) -> int:
+    """Estimate token count without a tokenizer.
+
+    CJK characters count as ~0.7 tokens each; other non-whitespace characters
+    count as ~0.3 tokens each.
+
+    Args:
+        content: Text to estimate.
+
+    Returns:
+        Estimated integer token count.
+    """
+    cjk = re.findall(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', content)
+    without_cjk = re.sub(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', '', content)
+    others = re.findall(r'\S', without_cjk)
+    return int(len(cjk) * 0.7 + len(others) * 0.3)
+
+
+def smart_split_content(
+    content: str,
+    max_tokens: int = 1024,
+    max_chars: int = 8000,
+) -> List[str]:
+    """Split large content into parts respecting token and character limits.
+
+    Splits by paragraphs (double newline). If a single paragraph exceeds the
+    limit it is force-cut into chunks of max_chars.
+
+    Args:
+        content: Text to split.
+        max_tokens: Maximum estimated tokens per part.
+        max_chars: Maximum characters per part.
+
+    Returns:
+        List of string parts.
+    """
+    paragraphs = content.split('\n\n')
+    parts: List[str] = []
+    current_parts: List[str] = []
+    current_tokens = 0
+    current_chars = 0
+
+    def flush() -> None:
+        if current_parts:
+            parts.append('\n\n'.join(current_parts))
+            current_parts.clear()
+
+    for para in paragraphs:
+        para_tokens = estimate_token_count(para)
+        para_chars = len(para)
+
+        # Single paragraph exceeds limits — force-cut it
+        if para_tokens > max_tokens or para_chars > max_chars:
+            flush()
+            current_tokens = 0
+            current_chars = 0
+            for i in range(0, len(para), max_chars):
+                parts.append(para[i:i + max_chars])
+            continue
+
+        # Would exceed limits if added — flush first
+        if (current_tokens + para_tokens > max_tokens or
+                current_chars + para_chars > max_chars):
+            flush()
+            current_tokens = 0
+            current_chars = 0
+        # current_chars includes the 2-char paragraph separator, so the joined part never exceeds max_chars
+        current_parts.append(para)
+        current_tokens += para_tokens
+        current_chars += para_chars + 2
+
+    flush()
+    return parts if parts else [content]
+
+
+def sanitize_for_path(text: str, max_length: int = 50) -> str:
+    """Convert text to a safe string for use in file paths.
+
+    Keeps word characters, CJK characters, spaces and hyphens. Replaces spaces
+    with underscores. Truncates with a sha256 suffix if the result exceeds
+    max_length.
+
+    Args:
+        text: Input text to sanitize.
+        max_length: Maximum length of the returned string.
+
+    Returns:
+        Safe path-friendly string.
+    """
+    cleaned = re.sub(
+        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
+        '',
+        text,
+    )
+    cleaned = cleaned.replace(' ', '_').strip('_')
+
+    if not cleaned:
+        return 'section'
+
+    if len(cleaned) <= max_length:
+        return cleaned
+    # Clamp both slices: when max_length is shorter than the 9-char suffix a negative index would overshoot the limit.
+    suffix = '_' + hashlib.sha256(text.encode()).hexdigest()[:8]
+    return (cleaned[:max(max_length - len(suffix), 0)] + suffix)[:max(max_length, 0)]
diff --git a/python/functions/core/create_node_mapping.md b/python/functions/core/create_node_mapping.md
new file mode 100644
index 00000000..afca2a61
--- /dev/null
+++ b/python/functions/core/create_node_mapping.md
@@ -0,0 +1,36 @@
+---
+name: create_node_mapping
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def create_node_mapping(tree: list[dict]) -> dict[str, dict]"
+description: "Crea dict plano node_id->node para lookup O(1) en un arbol jerarquico."
+tags: [tree, mapping, index, lookup]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = [{"node_id": "0001", "title": "A", "nodes": [{"node_id": "0002", "title": "B"}]}]
+mapping = create_node_mapping(tree)
+mapping["0002"]["title"]  # "B"
+```
+
+## Notas
+
+Funcion pura. Los valores son referencias a los nodos originales, no copias.
diff --git a/python/functions/core/cursor_paginate.md b/python/functions/core/cursor_paginate.md
new file mode 100644
index 00000000..45bb8efe
--- /dev/null
+++ b/python/functions/core/cursor_paginate.md
@@ -0,0 +1,66 @@
+---
+name: cursor_paginate
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def cursor_paginate(fetch_page: Callable[..., list[T]], get_cursor: Callable[[T], str | None], page_size: int = 100, max_items: int = 2000, max_retries: int = 3, retry_delay: float = 2.0, retryable_exceptions: tuple[type[Exception], ...] = (ConnectionError, TimeoutError, OSError)) -> list[T]"
+description: "Paginador generico basado en cursor que funciona con cualquier API que use cursor-based pagination. Cada pagina se obtiene con retry automatico con exponential backoff. Se detiene cuando la pagina esta vacia, el batch es menor que page_size, se alcanza max_items, o el cursor del ultimo item es None."
+tags: [pagination, cursor, retry, generic, api, backoff]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["time", "typing.Callable", "typing.TypeVar"]
+tested: true
+tests:
+  - "API que retorna 3 paginas de 10 items"
+  - "API que falla 1 vez por pagina (retry funciona)"
+  - "max_items limita correctamente"
+  - "API que retorna pagina parcial (ultima pagina)"
+  - "Cursor None en ultimo item (se detiene)"
+test_file_path: "python/functions/core/cursor_paginate_test.py"
+file_path: "python/functions/core/cursor_paginate.py"
+---
+
+## Ejemplo
+
+```python
+from cursor_paginate import cursor_paginate
+
+def fetch_users(limit: int, cursor: str | None) -> list[dict]:
+    params = {"limit": limit}
+    if cursor:
+        params["cursor"] = cursor
+    return requests.get("https://api.example.com/users", params=params).json()["items"]
+
+def get_cursor(user: dict) -> str | None:
+    return user.get("next_cursor")
+
+users = cursor_paginate(
+    fetch_page=fetch_users,
+    get_cursor=get_cursor,
+    page_size=100,
+    max_items=5000,
+    max_retries=3,
+    retry_delay=2.0,
+)
+```
+
+## Notas
+
+El caller solo necesita proveer dos callables:
+- `fetch_page(limit, cursor)`: recibe `limit` y `cursor` como kwargs, retorna lista de items.
+- `get_cursor(item)`: extrae el cursor del ultimo item de la pagina; retornar None indica fin de datos.
+
+El exponential backoff interno aplica `retry_delay * 2^attempt` sin jitter. Solo se reintentan las excepciones en `retryable_exceptions`; cualquier otra excepcion propaga inmediatamente.
+
+Condiciones de parada (cualquiera de ellas):
+1. La pagina retornada esta vacia.
+2. La pagina retornada tiene menos items que `page_size` (pagina parcial = ultima pagina).
+3. El total acumulado alcanza o supera `max_items` (se trunca y se para).
+4. `get_cursor(batch[-1])` retorna `None`.
+
+Funcion impura: llama a `fetch_page` que tipicamente hace I/O de red y usa `time.sleep` en los reintentos.
diff --git a/python/functions/core/cursor_paginate.py b/python/functions/core/cursor_paginate.py
new file mode 100644
index 00000000..69d78ae1
--- /dev/null
+++ b/python/functions/core/cursor_paginate.py
@@ -0,0 +1,105 @@
+"""Generic cursor-based paginator for any API that uses cursor pagination."""
+
+import time
+from typing import Callable, TypeVar
+
+T = TypeVar("T")
+
+
+def cursor_paginate(
+    fetch_page: Callable[..., list[T]],
+    get_cursor: Callable[[T], str | None],
+    page_size: int = 100,
+    max_items: int = 2000,
+    max_retries: int = 3,
+    retry_delay: float = 2.0,
+    retryable_exceptions: tuple[type[Exception], ...] = (
+        ConnectionError,
+        TimeoutError,
+        OSError,
+    ),
+) -> list[T]:
+    """Paginate through a cursor-based API, collecting all items.
+
+    Fetches pages one at a time by calling fetch_page with limit and cursor
+    kwargs. Retries each page on transient errors using exponential backoff.
+    Stops when a page is empty, a partial page is returned, max_items is
+    reached, or the cursor from the last item is None.
+
+    Args:
+        fetch_page: Callable that accepts ``limit`` and ``cursor`` as keyword
+            arguments and returns a list of items for that page.
+        get_cursor: Callable that receives the last item of a page and returns
+            the cursor string to use for the next page, or None if there are
+            no more pages.
+        page_size: Number of items to request per page.
+        max_items: Hard cap on total items collected. Collection stops and the
+            list is truncated once this limit is reached.
+        max_retries: Maximum number of retry attempts per page after the first
+            failure.
+        retry_delay: Base delay in seconds between retries (doubled each
+            attempt — exponential backoff without jitter).
+        retryable_exceptions: Tuple of exception types that trigger a retry.
+            Any other exception propagates immediately.
+
+    Returns:
+        List of all collected items, in the order they were returned by the
+        API, truncated to max_items.
+
+    Raises:
+        Exception: Re-raises the last exception if all retries for a page are
+            exhausted.
+    """
+    all_items: list[T] = []
+    cursor: str | None = None
+
+    while True:
+        batch = _fetch_with_retry(
+            fetch_page=fetch_page,
+            page_size=page_size,
+            cursor=cursor,
+            max_retries=max_retries,
+            retry_delay=retry_delay,
+            retryable_exceptions=retryable_exceptions,
+        )
+
+        if not batch:
+            break
+
+        all_items.extend(batch)
+
+        if len(all_items) >= max_items:
+            del all_items[max_items:]
+            break
+
+        if len(batch) < page_size:
+            break
+
+        cursor = get_cursor(batch[-1])
+        if cursor is None:
+            break
+
+    return all_items
+
+
+def _fetch_with_retry(
+    fetch_page: Callable[..., list[T]],
+    page_size: int,
+    cursor: str | None,
+    max_retries: int,
+    retry_delay: float,
+    retryable_exceptions: tuple[type[Exception], ...],
+) -> list[T]:
+    """Call fetch_page once, retrying on retryable_exceptions with exponential backoff."""
+    last_exc: Exception | None = None
+    for attempt in range(max_retries + 1):
+        try:
+            return fetch_page(limit=page_size, cursor=cursor)
+        except retryable_exceptions as exc:
+            last_exc = exc
+            if attempt >= max_retries:
+                raise
+            delay = retry_delay * (2 ** attempt)
+            time.sleep(delay)
+
+    raise last_exc  # unreachable; satisfies type checkers
diff --git a/python/functions/core/cursor_paginate_test.py b/python/functions/core/cursor_paginate_test.py
new file mode 100644
index 00000000..38636530
--- /dev/null
+++ b/python/functions/core/cursor_paginate_test.py
@@ -0,0 +1,148 @@
+"""Tests para cursor_paginate."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+import pytest
+from cursor_paginate import cursor_paginate
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def make_api(pages: list[list[dict]]) -> callable:
+    """Return a fetch_page callable that serves pages from a pre-built list."""
+    call_count = [0]
+
+    def fetch_page(limit: int, cursor: str | None) -> list[dict]:
+        idx = call_count[0]
+        call_count[0] += 1
+        if idx >= len(pages):
+            return []
+        return pages[idx][:limit]
+
+    return fetch_page
+
+
+def get_cursor(item: dict) -> str | None:
+    return item.get("cursor")
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_api_retorna_3_paginas_de_10_items():
+    pages = [
+        [{"id": i, "cursor": str(i)} for i in range(0, 10)],
+        [{"id": i, "cursor": str(i)} for i in range(10, 20)],
+        [{"id": i, "cursor": str(i)} for i in range(20, 30)],
+        [],  # sentinel: empty page ends pagination
+    ]
+    api = make_api(pages)
+    result = cursor_paginate(
+        fetch_page=api,
+        get_cursor=get_cursor,
+        page_size=10,
+        max_items=2000,
+        max_retries=0,
+    )
+    assert len(result) == 30
+    assert result[0]["id"] == 0
+    assert result[-1]["id"] == 29
+
+
+def test_api_falla_1_vez_por_pagina_retry_funciona():
+    """fetch_page falla en el primer intento de cada llamada, pero el retry recupera."""
+    call_counter = [0]
+    # Cada pagina tiene 5 items. 2 paginas en total, luego vacio.
+    items_by_page = [
+        [{"id": i, "cursor": str(i)} for i in range(0, 5)],
+        [{"id": i, "cursor": str(i)} for i in range(5, 10)],
+    ]
+    page_idx = [0]
+    fail_flags = [True, True]  # falla una vez por pagina
+
+    def fetch_page(limit: int, cursor: str | None) -> list[dict]:
+        idx = page_idx[0]
+        if idx < len(fail_flags) and fail_flags[idx]:
+            fail_flags[idx] = False
+            raise ConnectionError("transient failure")
+        page_idx[0] += 1
+        if idx >= len(items_by_page):
+            return []
+        return items_by_page[idx]
+
+    result = cursor_paginate(
+        fetch_page=fetch_page,
+        get_cursor=get_cursor,
+        page_size=5,
+        max_items=2000,
+        max_retries=3,
+        retry_delay=0.0,
+        retryable_exceptions=(ConnectionError, TimeoutError, OSError),
+    )
+    assert len(result) == 10
+
+
+def test_max_items_limita_correctamente():
+    # 50 items disponibles en 5 paginas de 10, pero max_items=25
+    pages = [
+        [{"id": i, "cursor": str(i)} for i in range(j * 10, j * 10 + 10)]
+        for j in range(5)
+    ]
+    api = make_api(pages)
+    result = cursor_paginate(
+        fetch_page=api,
+        get_cursor=get_cursor,
+        page_size=10,
+        max_items=25,
+        max_retries=0,
+    )
+    assert len(result) == 25
+    assert result[-1]["id"] == 24
+
+
+def test_api_retorna_pagina_parcial_ultima_pagina():
+    pages = [
+        [{"id": i, "cursor": str(i)} for i in range(10)],  # full page
+        [{"id": i, "cursor": str(i)} for i in range(10, 17)],  # partial — 7 items
+    ]
+    api = make_api(pages)
+    result = cursor_paginate(
+        fetch_page=api,
+        get_cursor=get_cursor,
+        page_size=10,
+        max_items=2000,
+        max_retries=0,
+    )
+    assert len(result) == 17
+    assert result[-1]["id"] == 16
+
+
+def test_cursor_none_en_ultimo_item_se_detiene():
+    """Cuando el ultimo item no tiene cursor, la paginacion debe detenerse."""
+    pages = [
+        [{"id": i, "cursor": str(i)} for i in range(10)],
+        # last item has no cursor — signals end of data
+        [{"id": i, "cursor": (str(i) if i < 19 else None)} for i in range(10, 20)],
+    ]
+    api = make_api(pages)
+
+    def get_cursor_nullable(item: dict) -> str | None:
+        return item.get("cursor")
+
+    result = cursor_paginate(
+        fetch_page=api,
+        get_cursor=get_cursor_nullable,
+        page_size=10,
+        max_items=2000,
+        max_retries=0,
+    )
+    assert len(result) == 20
+    assert result[-1]["id"] == 19
diff --git a/python/functions/core/detect_headings_by_font.md b/python/functions/core/detect_headings_by_font.md
new file mode 100644
index 00000000..03d5040d
--- /dev/null
+++ b/python/functions/core/detect_headings_by_font.md
@@ -0,0 +1,37 @@
+---
+name: detect_headings_by_font
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def detect_headings_by_font(pdf, min_delta: float = 2.0, max_levels: int = 4) -> list[dict]"
+description: "Detecta headings en un PDF analizando la distribucion de font sizes. El font size mas comun es el body; sizes significativamente mayores se clasifican como heading levels. Filtra headers/footers repetitivos."
+tags: [pdf, headings, font, detection, parsing, pdfplumber]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [pdfplumber, collections]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/detect_headings_by_font.py"
+---
+
+## Ejemplo
+
+```python
+import pdfplumber
+from detect_headings_by_font import detect_headings_by_font
+
+with pdfplumber.open("document.pdf") as pdf:
+    headings = detect_headings_by_font(pdf, min_delta=2.0, max_levels=4)
+    for h in headings:
+        print(f"Page {h['page_num']}: {'#' * h['level']} {h['title']}")
+```
+
+## Notas
+
+Samplea cada 5ta pagina para construir el Counter de font sizes (optimizacion de rendimiento). El body_size es el font size mas frecuente. Los heading sizes deben ser >= body_size + min_delta Y tener frecuencia < 50% del body. Se limita a max_levels heading sizes ordenados desc (el mas grande = nivel 1). Titulos que aparecen en >30% de paginas son considerados headers/footers y se eliminan. Impure porque accede al estado interno de un objeto PDF ya abierto.
diff --git a/python/functions/core/detect_headings_by_font.py b/python/functions/core/detect_headings_by_font.py
new file mode 100644
index 00000000..346ead42
--- /dev/null
+++ b/python/functions/core/detect_headings_by_font.py
@@ -0,0 +1,135 @@
+"""Detect headings in a PDF by analyzing font size distribution."""
+
+from collections import Counter
+
+import pdfplumber
+
+
+def detect_headings_by_font(
+    pdf: pdfplumber.PDF,
+    min_delta: float = 2.0,
+    max_levels: int = 4,
+) -> list[dict]:
+    """Detect headings by analyzing font size distribution across pages.
+
+    The most common font size is treated as body text. Font sizes larger than
+    body by at least min_delta, whose sampled character count is below 50% of
+    the body-size character count, are classified as heading levels.
+
+    Args:
+        pdf: An open pdfplumber.PDF object.
+        min_delta: Minimum size difference above body size to qualify as heading.
+        max_levels: Maximum number of heading levels to detect.
+
+    Returns:
+        list[dict]: List of {"level": int, "title": str, "page_num": int}
+                    sorted by page number. Returns empty list if no headings detected.
+    """
+    if not pdf.pages:
+        return []
+
+    # Step 1: Sample font sizes from every 5th page to determine body size
+    size_counter: Counter = Counter()
+    sample_pages = [pdf.pages[i] for i in range(0, len(pdf.pages), 5)]
+    if not sample_pages:
+        sample_pages = [pdf.pages[0]]
+
+    for page in sample_pages:
+        try:
+            chars = page.chars
+            for ch in chars:
+                size = ch.get("size")
+                if size is not None:
+                    size_counter[round(float(size), 1)] += 1
+        except Exception:
+            continue
+
+    if not size_counter:
+        return []
+
+    # Step 2: Determine body size (most common font size)
+    body_size, body_count = size_counter.most_common(1)[0]
+
+    # Step 3: Identify heading sizes
+    # Must be >= body_size + min_delta and frequency < 50% of body count
+    heading_sizes = sorted(
+        [
+            size
+            for size, count in size_counter.items()
+            if size >= body_size + min_delta and count < body_count * 0.5
+        ],
+        reverse=True,
+    )[:max_levels]
+
+    if not heading_sizes:
+        return []
+
+    # Build size -> level mapping
+    size_to_level = {size: i + 1 for i, size in enumerate(heading_sizes)}
+
+    # Step 4: Collect heading text per page
+    raw_headings: list[dict] = []
+    total_pages = len(pdf.pages)
+
+    for page_idx, page in enumerate(pdf.pages):
+        page_num = page_idx + 1
+        try:
+            chars = page.chars
+        except Exception:
+            continue
+
+        # Group consecutive chars of same heading size into text blocks
+        current_size = None
+        current_text = []
+
+        for ch in chars:
+            size = ch.get("size")
+            if size is None:
+                continue
+            rounded = round(float(size), 1)
+            if rounded in size_to_level:
+                if rounded == current_size:
+                    current_text.append(ch.get("text", ""))
+                else:
+                    if current_text and current_size is not None:
+                        text = "".join(current_text).strip()
+                        if text:
+                            raw_headings.append({
+                                "level": size_to_level[current_size],
+                                "title": text,
+                                "page_num": page_num,
+                            })
+                    current_size = rounded
+                    current_text = [ch.get("text", "")]
+            else:
+                if current_text and current_size is not None:
+                    text = "".join(current_text).strip()
+                    if text:
+                        raw_headings.append({
+                            "level": size_to_level[current_size],
+                            "title": text,
+                            "page_num": page_num,
+                        })
+                current_size = None
+                current_text = []
+
+        # Flush remaining
+        if current_text and current_size is not None:
+            text = "".join(current_text).strip()
+            if text:
+                raw_headings.append({
+                    "level": size_to_level[current_size],
+                    "title": text,
+                    "page_num": page_num,
+                })
+
+    if not raw_headings:
+        return []
+
+    # Step 5: Deduplicate — remove titles appearing on > 30% of pages (headers/footers)
+    title_page_counts: Counter = Counter(t for t, _ in {(h["title"], h["page_num"]) for h in raw_headings})
+    threshold = max(total_pages * 0.3, 1)  # floor of 1: a title seen on one page is never a running header
+    # Counts are distinct pages per title; without the floor every heading of a <= 3 page PDF was dropped.
+    filtered = [h for h in raw_headings if title_page_counts[h["title"]] <= threshold]
+
+    return filtered
diff --git a/python/functions/core/detect_url_type.md b/python/functions/core/detect_url_type.md
new file mode 100644
index 00000000..53282928
--- /dev/null
+++ b/python/functions/core/detect_url_type.md
@@ -0,0 +1,59 @@
+---
+name: detect_url_type
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]"
+description: "Detecta el tipo de contenido de una URL. Retorna tipo ('webpage', 'pdf', 'markdown', 'text', 'code_repository') y metadata. Hace HTTP HEAD request solo si no puede determinarse por patron o extension."
+tags: [url, content-type, http, detect, classification, head-request]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["urllib.parse", "httpx"]
+tested: true
+tests:
+  - "URL .pdf por extension"
+  - "URL github repo"
+  - "URL markdown por extension"
+  - "URL SSH git"
+  - "URL .html por extension"
+test_file_path: "python/functions/core/detect_url_type_test.py"
+file_path: "python/functions/core/detect_url_type.py"
+---
+
+## Ejemplo
+
+```python
+from core.detect_url_type import detect_url_type
+
+# Por patron URL (sin HTTP request)
+url_type, meta = detect_url_type("https://github.com/openai/whisper")
+# url_type = "code_repository", meta = {"detection": "url_pattern", ...}
+
+# Por extension (sin HTTP request)
+url_type, meta = detect_url_type("https://example.com/doc.pdf")
+# url_type = "pdf", meta = {"detection": "extension", ...}
+
+# Por HTTP HEAD request (cuando no se puede determinar sin red)
+url_type, meta = detect_url_type("https://example.com/page")
+# url_type = "webpage", meta = {"detection": "content_type_header", "content_type": "text/html", ...}
+```
+
+## Notas
+
+Algoritmo en orden de prioridad:
+1. SSH git shorthand (`git@host:path`) → `code_repository` inmediatamente.
+2. Patron URL de repos conocidos (github.com/org/repo, gitlab.com/org/repo) → `code_repository`.
+3. Extension del path de la URL (.pdf, .md, .txt, .html, .git) → tipo correspondiente.
+4. HTTP HEAD request → leer `Content-Type` header.
+5. Default: `"webpage"`.
+
+Hosts reconocidos como repos de codigo: github.com, gitlab.com, bitbucket.org, codeberg.org.
+
+Sub-recursos (issues, pulls, blob, tree, etc.) NO se clasifican como `code_repository`.
+
+Lanza `Exception` con mensaje descriptivo si el HEAD request falla (timeout, DNS, red).
diff --git a/python/functions/core/detect_url_type.py b/python/functions/core/detect_url_type.py
new file mode 100644
index 00000000..9ef654da
--- /dev/null
+++ b/python/functions/core/detect_url_type.py
@@ -0,0 +1,144 @@
+"""Detecta el tipo de contenido de una URL (webpage, pdf, markdown, text, code_repository)."""
+
+import re
+from urllib.parse import urlparse
+
+
+# Hostnames treated as code-hosting services
+_CODE_REPO_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"}
+
+# Recognised URL path extensions → content type
+_EXT_TYPE_MAP = {
+    ".pdf": "pdf",
+    ".md": "markdown",
+    ".markdown": "markdown",
+    ".rst": "text",
+    ".txt": "text",
+    ".html": "webpage",
+    ".htm": "webpage",
+    ".xml": "text",
+    ".json": "text",
+    ".csv": "text",
+    ".py": "text",
+    ".js": "text",
+    ".ts": "text",
+    ".go": "text",
+    ".rs": "text",
+    ".cpp": "text",
+    ".c": "text",
+    ".java": "text",
+    ".rb": "text",
+    ".git": "code_repository",
+}
+
+# Content-Type header prefixes → content type
+_CONTENT_TYPE_MAP = {
+    "application/pdf": "pdf",
+    "text/markdown": "markdown",
+    "text/x-markdown": "markdown",
+    "text/plain": "text",
+    "text/html": "webpage",
+    "text/xml": "text",
+    "application/xml": "text",
+    "application/json": "text",
+}
+
+
+def _is_code_repo_url(parsed, path_segments: list[str]) -> bool:
+    """Return True when the URL points at the root of a code repository."""
+    if (parsed.hostname or "") not in _CODE_REPO_HOSTS:
+        return False
+    # Needs at least org/repo (org/repo/ and org/repo.git also qualify)
+    if len(path_segments) < 2:
+        return False
+    # Known sub-resources (issues, pulls, blob, tree, ...) are not repo roots
+    _SUB_RESOURCES = {"issues", "pulls", "blob", "tree", "releases", "tags",
+                      "commits", "compare", "wiki", "discussions", "actions",
+                      "security", "pulse", "graphs", "-", "settings"}
+    # removesuffix, not rstrip(".git"): rstrip strips a char set ("wiki" -> "wik")
+    if len(path_segments) >= 3 and path_segments[2].removesuffix(".git") in _SUB_RESOURCES:
+        return False
+    return True
+
+
+def _is_ssh_git_url(url: str) -> bool:
+    """Tell whether the URL uses the SSH git shorthand form (git@host:path)."""
+    return url.lstrip().startswith("git@")
+
+
+def _type_from_extension(path: str) -> str | None:
+    """Map the extension of a URL path to a content type; None when unrecognised."""
+    # Drop any query string / fragment before looking at the suffix
+    lowered = path.split("?")[0].split("#")[0].lower()
+    return next(
+        (kind for suffix, kind in _EXT_TYPE_MAP.items() if lowered.endswith(suffix)),
+        None,
+    )
+
+
+def _type_from_content_type(content_type_header: str) -> str:
+    """Map a Content-Type header value to a URL type, defaulting to "webpage"."""
+    # Keep only the media type: drop parameters such as "; charset=utf-8"
+    media_type = content_type_header.lower().split(";")[0].strip()
+    matches = (t for p, t in _CONTENT_TYPE_MAP.items() if media_type.startswith(p))
+    # First matching prefix wins, in _CONTENT_TYPE_MAP insertion order
+    return next(matches, "webpage")
+
+
+def detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]:
+    """Detect the content type of a URL.
+
+    Detection order (the network is only touched in step 3):
+    1. Code-repository patterns (git@ shorthand, github.com/org/repo, ...).
+    2. Extension of the URL path (.pdf, .md, .txt, .html, .git, ...).
+    3. HTTP HEAD request, mapping the Content-Type header.
+    4. Default: "webpage".
+
+    Args:
+        url: URL to analyse; surrounding whitespace is ignored.
+        timeout: Timeout in seconds for the HEAD request (when one is needed).
+
+    Returns:
+        Tuple (type, metadata); type is one of "webpage", "pdf", "markdown",
+        "text", "code_repository". metadata holds "url", "detection" and the
+        details of the deciding step (host, path, content_type, status_code).
+
+    Raises:
+        Exception: If the HEAD request is needed and fails; ImportError if httpx is missing.
+    """
+    url = url.strip()
+    metadata: dict = {"url": url}
+
+    # 1. SSH git shorthand
+    if _is_ssh_git_url(url):
+        metadata["detection"] = "ssh_pattern"
+        return "code_repository", metadata
+
+    parsed = urlparse(url)
+    path_segments = [s for s in parsed.path.split("/") if s]
+
+    # 2. Code repo by URL pattern
+    if _is_code_repo_url(parsed, path_segments):
+        metadata["detection"] = "url_pattern"
+        metadata["host"] = parsed.hostname
+        return "code_repository", metadata
+
+    # 3. Extension-based detection
+    ext_type = _type_from_extension(parsed.path)
+    if ext_type is not None:
+        metadata["detection"] = "extension"
+        metadata["path"] = parsed.path
+        return ext_type, metadata
+
+    # 4. HTTP HEAD request. httpx is imported only here so the offline steps
+    #    above keep working when it is not installed.
+    import httpx
+    try:
+        response = httpx.head(url, timeout=timeout, follow_redirects=True)
+    except Exception as exc:
+        raise Exception(f"detect_url_type: HEAD request failed for {url!r}: {exc}") from exc
+    content_type = response.headers.get("content-type", "")
+    metadata["detection"] = "content_type_header"
+    metadata["content_type"] = content_type
+    metadata["status_code"] = response.status_code
+    return _type_from_content_type(content_type), metadata
diff --git a/python/functions/core/detect_url_type_test.py b/python/functions/core/detect_url_type_test.py
new file mode 100644
index 00000000..14fe4d67
--- /dev/null
+++ b/python/functions/core/detect_url_type_test.py
@@ -0,0 +1,89 @@
+"""Tests para detect_url_type (tests que no requieren red)."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from core.detect_url_type import detect_url_type, _type_from_extension, _type_from_content_type, _is_ssh_git_url
+
+
+def test_url_pdf_por_extension():
+    """URL .pdf se detecta por extension sin hacer request HTTP."""
+    url_type, metadata = detect_url_type("https://example.com/report.pdf")
+    assert url_type == "pdf"
+    assert metadata["detection"] == "extension"
+
+
+def test_url_github_repo():
+    """URL de GitHub org/repo se detecta como code_repository por patron URL."""
+    url_type, metadata = detect_url_type("https://github.com/openai/whisper")
+    assert url_type == "code_repository"
+    assert metadata["detection"] == "url_pattern"
+
+
+def test_url_github_con_git_suffix():
+    """URL github terminada en .git se detecta como code_repository."""
+    url_type, metadata = detect_url_type("https://github.com/openai/whisper.git")
+    assert url_type == "code_repository"
+
+
+def test_url_markdown_por_extension():
+    """URL .md se detecta como markdown por extension."""
+    url_type, metadata = detect_url_type("https://example.com/README.md")
+    assert url_type == "markdown"
+    assert metadata["detection"] == "extension"
+
+
+def test_url_ssh_git():
+    """URL SSH git@ se detecta como code_repository."""
+    url_type, metadata = detect_url_type("git@github.com:openai/whisper.git")
+    assert url_type == "code_repository"
+    assert metadata["detection"] == "ssh_pattern"
+
+
+def test_url_html_por_extension():
+    """URL .html se detecta como webpage por extension."""
+    url_type, metadata = detect_url_type("https://example.com/page.html")
+    assert url_type == "webpage"
+    assert metadata["detection"] == "extension"
+
+
+def test_url_txt_por_extension():
+    """URL .txt se detecta como text por extension."""
+    url_type, metadata = detect_url_type("https://example.com/data.txt")
+    assert url_type == "text"
+
+
+def test_github_subrepo_no_es_repo():
+    """URL de GitHub apuntando a un issue/blob no se trata como code_repository."""
+    # Debe intentar HEAD request (que fallara sin red) — verificamos que no clasifica como repo
+    # Solo comprobamos que no devuelve code_repository por patron URL
+    url = "https://github.com/openai/whisper/blob/main/README.md"
+    # Extension .md deberia detectarse primero
+    url_type, metadata = detect_url_type(url)
+    assert url_type == "markdown"
+
+
+def test_helper_type_from_extension():
+    """_type_from_extension funciona para extensiones conocidas."""
+    assert _type_from_extension("/doc.pdf") == "pdf"
+    assert _type_from_extension("/README.md") == "markdown"
+    assert _type_from_extension("/notes.txt") == "text"
+    assert _type_from_extension("/unknown.xyz") is None
+
+
+def test_helper_type_from_content_type():
+    """_type_from_content_type mapea headers correctamente."""
+    assert _type_from_content_type("application/pdf; charset=utf-8") == "pdf"
+    assert _type_from_content_type("text/html; charset=utf-8") == "webpage"
+    assert _type_from_content_type("text/plain") == "text"
+    assert _type_from_content_type("text/markdown") == "markdown"
+    assert _type_from_content_type("application/octet-stream") == "webpage"
+
+
+def test_helper_is_ssh_git_url():
+    """_is_ssh_git_url detecta formato git@."""
+    assert _is_ssh_git_url("git@github.com:org/repo.git") is True
+    assert _is_ssh_git_url("https://github.com/org/repo") is False
+    assert _is_ssh_git_url("ssh://git@github.com/org/repo") is False
diff --git a/python/functions/core/docx_to_markdown.md b/python/functions/core/docx_to_markdown.md
new file mode 100644
index 00000000..91675658
--- /dev/null
+++ b/python/functions/core/docx_to_markdown.md
@@ -0,0 +1,40 @@
+---
+name: docx_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "docx_to_markdown(docx_path: str) -> str"
+description: "Convierte un documento Word (.docx) a markdown preservando estructura (headings), formato inline (bold, italic, underline) y tablas en su posicion original."
+tags: [docx, markdown, word, conversion, document, parsing, text]
+uses_functions: [format_table_to_markdown_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [python-docx, lxml]
+tested: true
+tests: ["docx con headings y parrafos", "docx con tablas intercaladas", "docx con formato bold/italic", "docx vacio", "archivo no encontrado lanza FileNotFoundError"]
+test_file_path: "python/functions/core/docx_to_markdown_test.py"
+file_path: "python/functions/core/docx_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+md = docx_to_markdown("informe.docx")
+# # Titulo
+#
+# Primer parrafo.
+#
+# | Col1 | Col2 |
+# | ---- | ---- |
+# | a    | b    |
+#
+# Parrafo despues de la tabla.
+```
+
+## Notas
+
+Recorre `doc.element.body` en orden (no `doc.paragraphs` + `doc.tables` por separado) para preservar la posicion original de las tablas. Construye un mapa `{id(tbl_element): Table}` para lookup O(1). El formato inline aplica underline (`<ins>`), italic (`*`) y bold (`**`) en ese orden de mas interno a mas externo. Los headings se detectan por el estilo del parrafo (`Heading 1`, `Heading 2`, etc.). Requiere `python-docx` instalado en el entorno.
diff --git a/python/functions/core/docx_to_markdown.py b/python/functions/core/docx_to_markdown.py
new file mode 100644
index 00000000..0513f2f5
--- /dev/null
+++ b/python/functions/core/docx_to_markdown.py
@@ -0,0 +1,153 @@
+"""Convert a Word .docx document to Markdown, preserving structure, inline
+formatting and tables in their original document order."""
+
+import os
+from lxml import etree
+
+from format_table_to_markdown import format_table_to_markdown
+
+
+# XML namespace used by python-docx element tags
+_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+_TAG_P = f"{{{_W}}}p"
+_TAG_TBL = f"{{{_W}}}tbl"
+_TAG_TR = f"{{{_W}}}tr"
+_TAG_TC = f"{{{_W}}}tc"
+_TAG_R = f"{{{_W}}}r"
+_TAG_T = f"{{{_W}}}t"
+_TAG_RPR = f"{{{_W}}}rPr"
+_TAG_B = f"{{{_W}}}b"
+_TAG_I = f"{{{_W}}}i"
+_TAG_U = f"{{{_W}}}u"
+_TAG_PSTYLE = f"{{{_W}}}pStyle"
+_TAG_PPR = f"{{{_W}}}pPr"
+
+
+def _heading_level(paragraph) -> int:
+    """Return the heading level (1-6) of a paragraph, or 0 if it is not a heading.
+
+    The level is read from the <w:pStyle> value, accepting both "Heading 1"
+    and "Heading1" spellings (case-insensitive). Levels above 6 are clamped
+    to 6 because markdown has no deeper heading.
+    """
+    pPr = paragraph._p.find(_TAG_PPR)
+    if pPr is None:
+        return 0
+    pStyle = pPr.find(_TAG_PSTYLE)
+    if pStyle is None:
+        return 0
+    val = pStyle.get(f"{{{_W}}}val", "")
+    if not val.lower().startswith("heading"):
+        return 0
+    # Whatever follows "heading" must be the level number, optionally spaced
+    suffix = val[len("heading"):].strip()
+    if not (suffix.isascii() and suffix.isdigit()):
+        return 0
+    return min(int(suffix), 6)
+
+
+def _run_to_md(run_elem) -> str:
+    """Convert a single <w:r> element to a markdown-formatted string.
+
+    <w:b>/<w:i> are on/off toggles: w:val "0", "false" or "off" switches them
+    off explicitly. Underline requires a w:val other than "none" or empty.
+    """
+    text = "".join(t.text or "" for t in run_elem.findall(_TAG_T))
+    if not text:
+        return ""
+
+    val_attr = f"{{{_W}}}val"
+    def _is_on(prop) -> bool:
+        # Element present and not explicitly switched off
+        return prop is not None and prop.get(val_attr, "true").lower() not in ("0", "false", "off")
+
+    bold = italic = underline = False
+    rPr = run_elem.find(_TAG_RPR)
+    if rPr is not None:
+        bold = _is_on(rPr.find(_TAG_B))
+        italic = _is_on(rPr.find(_TAG_I))
+        u_elem = rPr.find(_TAG_U)
+        underline = u_elem is not None and u_elem.get(val_attr, "") not in ("none", "")
+
+    # Apply markdown formatting (innermost first: underline → italic → bold)
+    if underline:
+        text = f"<ins>{text}</ins>"
+    if italic:
+        text = f"*{text}*"
+    if bold:
+        text = f"**{text}**"
+    return text
+
+
+def _paragraph_to_md(paragraph) -> str:
+    """Render a python-docx Paragraph as markdown, prefixing '#' marks for headings."""
+    depth = _heading_level(paragraph)
+    # Only direct <w:r> children of the paragraph element contribute text
+    pieces = [_run_to_md(node) for node in paragraph._p if node.tag == _TAG_R]
+    body = "".join(pieces)
+    # A level of 0 means the paragraph is not a heading
+    if not depth:
+        return body
+    prefix = "#" * depth
+    return f"{prefix} {body}"
+
+
+def _table_to_md(table) -> str:
+    """Render a python-docx Table via format_table_to_markdown(has_header=True)."""
+    grid: list[list[str]] = [
+        [
+            # A cell's paragraphs are flattened into one space-separated string
+            " ".join(par.text for par in cell.paragraphs).strip()
+            for cell in row.cells
+        ]
+        for row in table.rows
+    ]
+    return format_table_to_markdown(grid, has_header=True)
+
+
+def docx_to_markdown(docx_path: str) -> str:
+    """Convert a Word .docx document to Markdown.
+
+    Preserves document structure (headings), inline formatting (bold, italic,
+    underline) and tables in their original position.
+
+    Args:
+        docx_path: Absolute or relative path to the .docx file.
+
+    Returns:
+        Markdown string: the non-empty blocks joined by blank lines.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        Exception: If the file cannot be parsed as a .docx document.
+    """
+    # Deferred so the module is importable without python-docx installed
+    import docx
+    from docx.text.paragraph import Paragraph
+
+    if not os.path.exists(docx_path):
+        raise FileNotFoundError(f"File not found: {docx_path}")
+
+    doc = docx.Document(docx_path)
+
+    # Map id() of each table's XML element to its Table wrapper for direct lookup
+    table_map: dict[int, object] = {
+        id(table._tbl): table for table in doc.tables
+    }
+
+    parts: list[str] = []
+
+    # Walk the body in document order so tables stay between their paragraphs
+    for child in doc.element.body:
+        if child.tag == _TAG_P:
+            md = _paragraph_to_md(Paragraph(child, doc))
+            if md.strip():
+                parts.append(md)
+        elif child.tag == _TAG_TBL:
+            table = table_map.get(id(child))
+            if table is not None:
+                md = _table_to_md(table)
+                if md:
+                    parts.append(md)
+
+    return "\n\n".join(parts)
diff --git a/python/functions/core/docx_to_markdown_test.py b/python/functions/core/docx_to_markdown_test.py
new file mode 100644
index 00000000..10c689d1
--- /dev/null
+++ b/python/functions/core/docx_to_markdown_test.py
@@ -0,0 +1,129 @@
+"""Tests para docx_to_markdown."""
+
+import os
+import sys
+import tempfile
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+import docx as python_docx
+from docx_to_markdown import docx_to_markdown
+
+
+def _make_docx(builder_fn) -> str:
+    """Create a temporary .docx file using builder_fn(doc) and return its path."""
+    doc = python_docx.Document()
+    builder_fn(doc)
+    tmp = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
+    doc.save(tmp.name)
+    tmp.close()
+    return tmp.name
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_docx_con_headings_y_parrafos():
+    """docx con headings y parrafos"""
+
+    def build(doc):
+        doc.add_heading("Titulo Principal", level=1)
+        doc.add_paragraph("Primer parrafo de contenido.")
+        doc.add_heading("Seccion", level=2)
+        doc.add_paragraph("Segundo parrafo.")
+
+    path = _make_docx(build)
+    try:
+        result = docx_to_markdown(path)
+        assert "# Titulo Principal" in result
+        assert "## Seccion" in result
+        assert "Primer parrafo de contenido." in result
+        assert "Segundo parrafo." in result
+    finally:
+        os.unlink(path)
+
+
+def test_docx_con_tablas_intercaladas():
+    """docx con tablas intercaladas"""
+
+    def build(doc):
+        doc.add_paragraph("Texto antes de la tabla.")
+        table = doc.add_table(rows=2, cols=3)
+        table.cell(0, 0).text = "Col1"
+        table.cell(0, 1).text = "Col2"
+        table.cell(0, 2).text = "Col3"
+        table.cell(1, 0).text = "a"
+        table.cell(1, 1).text = "b"
+        table.cell(1, 2).text = "c"
+        doc.add_paragraph("Texto despues de la tabla.")
+
+    path = _make_docx(build)
+    try:
+        result = docx_to_markdown(path)
+        # Table must appear BETWEEN the two paragraphs
+        before_idx = result.index("Texto antes de la tabla.")
+        table_idx = result.index("| Col1")
+        after_idx = result.index("Texto despues de la tabla.")
+        assert before_idx < table_idx < after_idx
+        assert "| Col2" in result
+        assert "| a" in result
+    finally:
+        os.unlink(path)
+
+
+def test_docx_con_formato_bold_italic():
+    """docx con formato bold/italic"""
+
+    def build(doc):
+        para = doc.add_paragraph()
+        run_bold = para.add_run("negrita")
+        run_bold.bold = True
+        run_normal = para.add_run(" texto normal ")
+        run_italic = para.add_run("cursiva")
+        run_italic.italic = True
+
+    path = _make_docx(build)
+    try:
+        result = docx_to_markdown(path)
+        assert "**negrita**" in result
+        assert "*cursiva*" in result
+        assert "texto normal" in result
+    finally:
+        os.unlink(path)
+
+
+def test_docx_vacio():
+    """docx vacio"""
+
+    def build(doc):
+        # python-docx adds a default empty paragraph; remove all content
+        # by just not adding anything — the default empty paragraph will
+        # produce an empty string that gets filtered out.
+        pass
+
+    path = _make_docx(build)
+    try:
+        result = docx_to_markdown(path)
+        # Empty document should produce empty or whitespace-only output
+        assert result.strip() == ""
+    finally:
+        os.unlink(path)
+
+
+def test_archivo_no_encontrado():
+    """archivo no encontrado lanza FileNotFoundError"""
+    with pytest.raises(FileNotFoundError):
+        docx_to_markdown("/tmp/nonexistent_file_fn_registry.docx")
+
+
+if __name__ == "__main__":
+    test_docx_con_headings_y_parrafos()
+    test_docx_con_tablas_intercaladas()
+    test_docx_con_formato_bold_italic()
+    test_docx_vacio()
+    test_archivo_no_encontrado()
+    print("All tests passed.")
diff --git a/python/functions/core/epub_to_markdown.md b/python/functions/core/epub_to_markdown.md
new file mode 100644
index 00000000..8738a00a
--- /dev/null
+++ b/python/functions/core/epub_to_markdown.md
@@ -0,0 +1,52 @@
+---
+name: epub_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def epub_to_markdown(epub_path: str) -> str"
+description: "Convierte un ebook EPUB a markdown. Intenta ebooklib primero para extraccion estructurada (titulo, autor, documentos); fallback a extraccion manual con zipfile si ebooklib no esta instalado."
+tags: [epub, markdown, ebook, parsing, conversion, html, text-extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [zipfile, html, re, ebooklib]
+tested: true
+tests:
+  - "conversion de headings h1-h3"
+  - "conversion de bold e italic"
+  - "script y style se eliminan del output"
+  - "HTML entities se convierten a caracteres"
+  - "epub sin ebooklib extrae texto de archivos html"
+  - "epub con ebooklib incluye titulo y autor en el output"
+  - "epub corrupto lanza excepcion"
+test_file_path: "python/functions/core/epub_to_markdown_test.py"
+file_path: "python/functions/core/epub_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+md = epub_to_markdown("/path/to/book.epub")
+print(md[:500])
+# # Mi Libro
+# **Author:** Ana Perez
+#
+# # Introduccion
+# Primer parrafo...
+```
+
+## Notas
+
+Conversion HTML a markdown cubre: headings h1-h6, bold (`<strong>`/`<b>`), italic (`<em>`/`<i>`), paragraphs, line breaks. Elimina `<script>` y `<style>`. Desescapa entidades HTML y normaliza whitespace.
+
+Con ebooklib: extrae metadata DC (titulo, autor) del OPF y procesa todos los items de tipo ITEM_DOCUMENT del libro (via `get_items_of_type`, no necesariamente en el orden del spine).
+
+Sin ebooklib (fallback ZIP): lista archivos `.html`/`.xhtml`/`.htm` en orden alfabetico y extrae su contenido. No hay metadata de titulo/autor en este modo.
+
+Dependencia opcional: `pip install ebooklib`. Si no esta instalada la funcion sigue funcionando via zipfile.
+
+Reimplementacion conceptual desde OpenViking `openviking/parse/parsers/epub.py` (AGPL-3.0). El codigo es original.
diff --git a/python/functions/core/epub_to_markdown.py b/python/functions/core/epub_to_markdown.py
new file mode 100644
index 00000000..f9ffcf0c
--- /dev/null
+++ b/python/functions/core/epub_to_markdown.py
@@ -0,0 +1,128 @@
+"""Convert an EPUB file to markdown text."""
+
+import re
+import zipfile
+from html import unescape
+from html.parser import HTMLParser
+
+
+def _remove_tags(html: str, tag: str) -> str:
+    """Remove every <tag>...</tag> element, content included, from an HTML string."""
+    pattern = re.compile(rf'<{tag}[^>]*>.*?</{tag}>', re.IGNORECASE | re.DOTALL)
+    return pattern.sub('', html)
+
+
+def _html_to_markdown(html: str) -> str:
+    """Convert basic HTML to markdown.
+
+    Handles headings, bold, italic, paragraphs and line breaks, then strips
+    the remaining tags. Tag names end on a word boundary: <b> is not <body>.
+
+    Args:
+        html: HTML string to convert.
+
+    Returns:
+        Markdown-formatted string.
+    """
+    # Remove script and style blocks
+    text = _remove_tags(html, 'script')
+    text = _remove_tags(text, 'style')
+
+    # Headings h1-h6, padded with blank lines so neighbouring blocks stay apart
+    for level in range(6, 0, -1):
+        text = re.sub(
+            rf'<h{level}\b[^>]*>(.*?)</h{level}>',
+            lambda m, h='#' * level: f'\n\n{h} {m.group(1).strip()}\n\n',
+            text,
+            flags=re.IGNORECASE | re.DOTALL,
+        )
+
+    # \b keeps <b>/<i>/<em>/<p> from also matching <body>, <img>, <embed>, <pre>, ...
+    # Bold
+    text = re.sub(r'<strong\b[^>]*>(.*?)</strong>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
+    text = re.sub(r'<b\b[^>]*>(.*?)</b>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
+
+    # Italic
+    text = re.sub(r'<em\b[^>]*>(.*?)</em>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)
+    text = re.sub(r'<i\b[^>]*>(.*?)</i>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)
+
+    # Paragraphs — append double newline after content
+    text = re.sub(r'<p\b[^>]*>(.*?)</p>', lambda m: m.group(1).strip() + '\n\n', text, flags=re.IGNORECASE | re.DOTALL)
+
+    # Line breaks
+    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
+
+    # Strip remaining HTML tags
+    text = re.sub(r'<[^>]+>', '', text)
+
+    # Unescape HTML entities
+    text = unescape(text)
+
+    # Normalize whitespace: collapse multiple blank lines into two
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    text = re.sub(r'[ \t]+', ' ', text)
+
+    return text.strip()
+
+
+def _epub_via_ebooklib(epub_path: str) -> str:
+    """Render an EPUB as markdown through ebooklib (title/author header + documents)."""
+    import ebooklib
+    from ebooklib import epub
+
+    book = epub.read_epub(epub_path)
+
+    # Dublin Core metadata: the value is the first element of the first entry
+    titles = book.get_metadata('DC', 'title')
+    creators = book.get_metadata('DC', 'creator')
+    heading = f"# {titles[0][0] if titles else 'Unknown Title'}"
+    byline = f"**Author:** {creators[0][0] if creators else 'Unknown Author'}"
+
+    # One markdown chunk per document item; items that convert to nothing are skipped
+    chunks = (
+        _html_to_markdown(doc.get_content().decode('utf-8', errors='replace'))
+        for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
+    )
+    sections = [heading, byline]
+    sections.extend(chunk for chunk in chunks if chunk)
+
+    return '\n\n'.join(sections)
+
+
+def _epub_via_zipfile(epub_path: str) -> str:
+    """Fallback extraction: read the EPUB as a plain ZIP and convert its HTML members."""
+    html_suffixes = ('.html', '.xhtml', '.htm')
+    with zipfile.ZipFile(epub_path, 'r') as archive:
+        # Members are visited in sorted-name order
+        members = [n for n in archive.namelist() if n.lower().endswith(html_suffixes)]
+        members.sort()
+        converted = [
+            _html_to_markdown(archive.read(member).decode('utf-8', errors='replace'))
+            for member in members
+        ]
+
+    # Drop members that produced no markdown at all
+    sections = [chunk for chunk in converted if chunk]
+    return '\n\n'.join(sections)
+
+
+def epub_to_markdown(epub_path: str) -> str:
+    """Convert an EPUB ebook to markdown.
+
+    Tries the ebooklib-based extractor first (adds a title/author header).
+    If that path raises ImportError, i.e. ebooklib cannot be imported, the
+    HTML members of the archive are read directly with zipfile instead.
+
+    Args:
+        epub_path: Path to the .epub file.
+
+    Returns:
+        Markdown string with the book content.
+
+    Raises:
+        Exception: If the file cannot be read or is not a valid EPUB.
+    """
+    try:
+        return _epub_via_ebooklib(epub_path)
+    except ImportError:
+        return _epub_via_zipfile(epub_path)
diff --git a/python/functions/core/epub_to_markdown_test.py b/python/functions/core/epub_to_markdown_test.py
new file mode 100644
index 00000000..73df18a6
--- /dev/null
+++ b/python/functions/core/epub_to_markdown_test.py
@@ -0,0 +1,163 @@
+"""Tests para epub_to_markdown."""
+
+import io
+import os
+import struct
+import sys
+import zipfile
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+from epub_to_markdown import _html_to_markdown, _epub_via_zipfile, epub_to_markdown
+
+
+# ---------------------------------------------------------------------------
+# Helpers para construir EPUBs minimos en memoria
+# ---------------------------------------------------------------------------
+
+def _build_epub(files: dict[str, str]) -> str:
+    """Crea un EPUB minimo como ZIP en disco y retorna el path."""
+    import tempfile
+    tmp = tempfile.NamedTemporaryFile(suffix='.epub', delete=False)
+    with zipfile.ZipFile(tmp, 'w') as zf:
+        for name, content in files.items():
+            zf.writestr(name, content)
+    tmp.close()
+    return tmp.name
+
+
+def _build_epub_with_opf(title: str, author: str, body_html: str) -> str:
+    """Crea un EPUB con OPF y un documento HTML valido para ebooklib."""
+    opf = f"""<?xml version='1.0' encoding='utf-8'?>
+<package xmlns='http://www.idpf.org/2007/opf' unique-identifier='uid' version='2.0'>
+  <metadata xmlns:dc='http://purl.org/dc/elements/1.1/'>
+    <dc:title>{title}</dc:title>
+    <dc:creator>{author}</dc:creator>
+    <dc:identifier id='uid'>test-uid</dc:identifier>
+    <dc:language>en</dc:language>
+  </metadata>
+  <manifest>
+    <item id='ch1' href='chapter1.xhtml' media-type='application/xhtml+xml'/>
+    <item id='ncx' href='toc.ncx' media-type='application/x-dtbncx+xml'/>
+  </manifest>
+  <spine toc='ncx'>
+    <itemref idref='ch1'/>
+  </spine>
+</package>"""
+
+    ncx = """<?xml version='1.0' encoding='utf-8'?>
+<ncx xmlns='http://www.daisy.org/z3986/2005/ncx/' version='2005-1'>
+  <head><meta name='dtb:uid' content='test-uid'/></head>
+  <docTitle><text>Test</text></docTitle>
+  <navMap/>
+</ncx>"""
+
+    chapter = f"""<?xml version='1.0' encoding='utf-8'?>
+<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>
+<html xmlns='http://www.w3.org/1999/xhtml'>
+<head><title>Chapter</title></head>
+<body>{body_html}</body>
+</html>"""
+
+    return _build_epub({
+        'mimetype': 'application/epub+zip',
+        'META-INF/container.xml': """<?xml version='1.0'?>
+<container version='1.0' xmlns='urn:oasis:names:tc:opendocument:xmlns:container'>
+  <rootfiles>
+    <rootfile full-path='content.opf' media-type='application/oebps-package+xml'/>
+  </rootfiles>
+</container>""",
+        'content.opf': opf,
+        'toc.ncx': ncx,
+        'chapter1.xhtml': chapter,
+    })
+
+
+# ---------------------------------------------------------------------------
+# Tests de _html_to_markdown (pura, sin disco)
+# ---------------------------------------------------------------------------
+
+def test_html_heading_conversion():
+    """conversion de headings h1-h3."""
+    html = '<h1>Titulo</h1><h2>Subtitulo</h2><h3>Seccion</h3>'
+    result = _html_to_markdown(html)
+    assert '# Titulo' in result
+    assert '## Subtitulo' in result
+    assert '### Seccion' in result
+
+
+def test_html_bold_italic():
+    """conversion de bold e italic."""
+    html = '<p><strong>negrita</strong> y <em>italica</em></p>'
+    result = _html_to_markdown(html)
+    assert '**negrita**' in result
+    assert '*italica*' in result
+
+
+def test_html_script_style_removed():
+    """script y style se eliminan del output."""
+    html = '<script>alert(1)</script><style>body{}</style><p>Contenido</p>'
+    result = _html_to_markdown(html)
+    assert 'alert' not in result
+    assert 'body{}' not in result
+    assert 'Contenido' in result
+
+
+def test_html_entities_unescaped():
+    """HTML entities se convierten a caracteres."""
+    html = '<p>Tom &amp; Jerry &lt;show&gt;</p>'
+    result = _html_to_markdown(html)
+    assert 'Tom & Jerry' in result
+    assert '<show>' in result
+
+
+# ---------------------------------------------------------------------------
+# Tests de epub_via_zipfile (sin ebooklib)
+# ---------------------------------------------------------------------------
+
+def test_epub_via_zipfile_extrae_html():
+    """epub sin ebooklib extrae texto de archivos html."""
+    path = _build_epub({
+        'chapter.html': '<html><body><h1>Capitulo Uno</h1><p>Hola mundo.</p></body></html>',
+    })
+    try:
+        result = _epub_via_zipfile(path)
+        assert 'Capitulo Uno' in result
+        assert 'Hola mundo' in result
+    finally:
+        os.unlink(path)
+
+
+# ---------------------------------------------------------------------------
+# Tests de epub_to_markdown (integracion)
+# ---------------------------------------------------------------------------
+
+def test_epub_con_ebooklib_metadata():
+    """epub con ebooklib incluye titulo y autor en el output."""
+    pytest.importorskip('ebooklib')
+    path = _build_epub_with_opf(
+        title='Mi Libro',
+        author='Ana Perez',
+        body_html='<h1>Introduccion</h1><p>Primer parrafo.</p>',
+    )
+    try:
+        result = epub_to_markdown(path)
+        assert '# Mi Libro' in result
+        assert 'Ana Perez' in result
+        assert 'Introduccion' in result
+    finally:
+        os.unlink(path)
+
+
+def test_epub_corrupto_lanza_excepcion():
+    """epub corrupto lanza Exception."""
+    import tempfile
+    tmp = tempfile.NamedTemporaryFile(suffix='.epub', delete=False)
+    tmp.write(b'esto no es un epub valido')
+    tmp.close()
+    try:
+        with pytest.raises(Exception):
+            epub_to_markdown(tmp.name)
+    finally:
+        os.unlink(tmp.name)
diff --git a/python/functions/core/estimate_token_count.md b/python/functions/core/estimate_token_count.md
new file mode 100644
index 00000000..e81e6acd
--- /dev/null
+++ b/python/functions/core/estimate_token_count.md
@@ -0,0 +1,37 @@
+---
+name: estimate_token_count
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def estimate_token_count(content: str) -> int"
+description: "Estimacion rapida de tokens sin tokenizer. CJK chars cuentan ~0.7 token/char, otros non-whitespace ~0.3 token/char."
+tags: [tokens, estimation, nlp, cjk, text]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests:
+  - "texto vacio retorna cero"
+  - "solo latin"
+  - "solo CJK"
+  - "texto mixto"
+test_file_path: "python/functions/core/parse_markdown_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+estimate_token_count("hello world")  # 3
+estimate_token_count("中文语")        # 2  (3 * 0.7 = 2.1 -> 2)
+estimate_token_count("")             # 0
+```
+
+## Notas
+
+Funcion pura. No requiere ninguna dependencia externa. Precision aproximada: util para guardianes de limite de contexto antes de llamar a LLMs, no para conteo exacto de tokens BPE. CJK range: `[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]` (CJK unificado, Hiragana/Katakana, Hangul).
diff --git a/python/functions/core/excel_to_markdown.md b/python/functions/core/excel_to_markdown.md
new file mode 100644
index 00000000..22f5c4a6
--- /dev/null
+++ b/python/functions/core/excel_to_markdown.md
@@ -0,0 +1,58 @@
+---
+name: excel_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str"
+description: "Convierte un archivo Excel (.xlsx, .xls, .xlsm) a markdown con cada sheet como seccion H2. Soporta tipos de celda: fechas ISO, booleanos, errores Excel, numeros enteros y flotantes. Trunca sheets que superen max_rows_per_sheet."
+tags: [excel, markdown, xlsx, xls, conversion, parser, io]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["openpyxl", "xlrd"]
+tested: true
+tests:
+  - "xlsx con multiples sheets produce una seccion H2 por sheet"
+  - "sheet vacio produce nota de sheet vacio"
+  - "sheet truncado con nota de filas omitidas"
+  - "sheet con formulas data_only muestra valores calculados"
+  - "extension no soportada lanza ValueError"
+  - "archivo inexistente lanza FileNotFoundError"
+  - "dimensiones del sheet en metadata"
+  - "tabla markdown con formato correcto"
+test_file_path: "python/functions/core/excel_to_markdown_test.py"
+file_path: "python/functions/core/excel_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+from excel_to_markdown import excel_to_markdown
+
+md = excel_to_markdown("report.xlsx")
+print(md)
+# ## Sheet: Ventas
+#
+# **Dimensions:** 101 x 4
+#
+# | Producto | Precio | Cantidad | Total |
+# | --- | --- | --- | --- |
+# | Manzana | 1 | 100 | 100 |
+# ...
+
+# Con limite de filas
+md = excel_to_markdown("big_file.xlsx", max_rows_per_sheet=50)
+```
+
+## Notas
+
+- `.xlsx` y `.xlsm`: usa `openpyxl` con `data_only=True` (lee valores calculados, no formulas).
+- `.xls` (legacy): usa `xlrd`. Manejo de tipos especiales: EMPTY/BLANK → "", DATE → ISO 8601, BOOLEAN → "TRUE"/"FALSE", ERROR → codigo Excel (#NULL!, #DIV/0!, etc.), NUMBER → entero si no tiene decimales.
+- Fechas sin hora se formatean como `YYYY-MM-DD`; con hora como `YYYY-MM-DDTHH:MM:SS`.
+- Los pipes `|` dentro de celdas se escapan como `\|`.
+- Si `xlwt` no esta disponible, los tests .xls se saltan (xlwt solo se necesita para crear fixtures, no para leer).
+- Reimplementacion desde cero, inspirada conceptualmente en OpenViking (AGPL-3.0). Sin codigo copiado.
diff --git a/python/functions/core/excel_to_markdown.py b/python/functions/core/excel_to_markdown.py
new file mode 100644
index 00000000..3d454079
--- /dev/null
+++ b/python/functions/core/excel_to_markdown.py
@@ -0,0 +1,211 @@
+"""Convierte archivos Excel a Markdown con cada sheet como seccion H2."""
+
+import os
+from pathlib import Path
+
+
+# Codigos de error Excel para xlrd
+_XL_ERROR_CODES = {
+    0: "#NULL!",
+    7: "#DIV/0!",
+    15: "#VALUE!",
+    23: "#REF!",
+    29: "#NAME?",
+    36: "#NUM!",
+    42: "#N/A",
+}
+
+
+def _rows_to_markdown_table(rows: list[list[str]]) -> str:
+    """Render rows of strings as a markdown table; the first row is the header."""
+    if not rows:
+        return ""
+
+    header = rows[0]
+    col_count = len(header)
+
+    # Pad or truncate every row to the header's column count
+    normalized = []
+    for row in rows:
+        if len(row) < col_count:
+            row = row + [""] * (col_count - len(row))
+        normalized.append(row[:col_count])
+
+    # Escape pipes and flatten newlines so a cell cannot break its table row
+    def escape(cell: str) -> str:
+        return cell.replace("|", "\\|").replace("\n", " ")
+
+    lines = []
+    # Header
+    lines.append("| " + " | ".join(escape(c) for c in normalized[0]) + " |")
+    # Separator
+    lines.append("| " + " | ".join("---" for _ in range(col_count)) + " |")
+    # Data rows
+    for row in normalized[1:]:
+        lines.append("| " + " | ".join(escape(c) for c in row) + " |")
+
+    return "\n".join(lines)
+
+
+def _cell_value_xlrd(cell, workbook) -> str:
+    """Convert an xlrd cell to a string according to its cell type."""
+    import xlrd
+
+    ctype = cell.ctype
+
+    if ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK):
+        return ""
+    elif ctype == xlrd.XL_CELL_DATE:
+        try:
+            dt = xlrd.xldate_as_datetime(cell.value, workbook.datemode)
+            if dt.hour == 0 and dt.minute == 0 and dt.second == 0:
+                return dt.date().isoformat()
+            return dt.isoformat()
+        except Exception:
+            return str(cell.value)
+    elif ctype == xlrd.XL_CELL_BOOLEAN:
+        return "TRUE" if cell.value else "FALSE"
+    elif ctype == xlrd.XL_CELL_ERROR:
+        return _XL_ERROR_CODES.get(int(cell.value), "#ERROR!")
+    elif ctype == xlrd.XL_CELL_NUMBER:
+        v = cell.value
+        if float(v).is_integer():  # False for inf/nan, where int(v) would raise
+            return str(int(v))
+        return str(v)
+    elif ctype == xlrd.XL_CELL_TEXT:
+        return str(cell.value)
+    else:
+        return str(cell.value)
+
+
+def _sheet_xlrd(sheet, workbook, max_rows: int) -> str:
+    """Render an xlrd sheet as markdown: H2 title, dimensions, table, truncation note."""
+    nrows = sheet.nrows
+    ncols = sheet.ncols
+
+    lines = []
+    lines.append(f"## Sheet: {sheet.name}")
+    lines.append("")
+    lines.append(f"**Dimensions:** {nrows} x {ncols}")
+    lines.append("")
+
+    if nrows == 0 or ncols == 0:
+        lines.append("*(empty sheet)*")
+        return "\n".join(lines)
+
+    display_rows = min(nrows, max_rows)
+    rows = []
+    for r in range(display_rows):
+        row_data = [_cell_value_xlrd(sheet.cell(r, c), workbook) for c in range(ncols)]
+        rows.append(row_data)
+
+    lines.append(_rows_to_markdown_table(rows))
+
+    if nrows > max_rows:
+        omitted = nrows - max_rows
+        lines.append("")
+        lines.append(f"*{omitted} rows omitted (max_rows_per_sheet={max_rows})*")
+
+    return "\n".join(lines)
+
+
+def _cell_value_openpyxl(cell) -> str:
+    """Convert an openpyxl cell value to a string (empty string for None)."""
+    v = cell.value
+    if v is None:
+        return ""
+    if isinstance(v, bool):  # checked before int: bool is a subclass of int
+        return "TRUE" if v else "FALSE"
+    if isinstance(v, float):
+        if v.is_integer():  # False for inf/nan, where int(v) would raise
+            return str(int(v))
+        return str(v)
+    if isinstance(v, int):
+        return str(v)
+    # Dates and datetimes
+    import datetime
+    if isinstance(v, datetime.datetime):
+        if v.hour == 0 and v.minute == 0 and v.second == 0:
+            return v.date().isoformat()
+        return v.isoformat()
+    if isinstance(v, datetime.date):
+        return v.isoformat()
+    return str(v)
+
+
+def _sheet_openpyxl(ws, max_rows: int) -> str:
+    """Render an openpyxl worksheet as markdown: H2 title, dimensions, table, truncation note."""
+    all_rows = list(ws.iter_rows())
+    nrows = len(all_rows)
+    ncols = ws.max_column or 0
+
+    lines = []
+    lines.append(f"## Sheet: {ws.title}")
+    lines.append("")
+    lines.append(f"**Dimensions:** {nrows} x {ncols}")
+    lines.append("")
+
+    if nrows == 0 or ncols == 0:
+        lines.append("*(empty sheet)*")
+        return "\n".join(lines)
+
+    display_rows = min(nrows, max_rows)
+    rows = []
+    for row in all_rows[:display_rows]:
+        row_data = [_cell_value_openpyxl(cell) for cell in row]
+        rows.append(row_data)
+
+    lines.append(_rows_to_markdown_table(rows))
+
+    if nrows > max_rows:
+        omitted = nrows - max_rows
+        lines.append("")
+        lines.append(f"*{omitted} rows omitted (max_rows_per_sheet={max_rows})*")
+
+    return "\n".join(lines)
+
+
+def excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str:
+    """Convert an Excel file (.xlsx, .xls, .xlsm) to markdown.
+
+    Each sheet becomes an H2 section and its rows are rendered as a
+    markdown table. When a sheet has more rows than max_rows_per_sheet,
+    it is truncated and a note with the omitted row count is appended.
+
+    Args:
+        path: Path to the Excel file (.xlsx, .xls, .xlsm).
+        max_rows_per_sheet: Maximum number of rows to include per sheet (default 1000).
+
+    Returns:
+        Markdown string with all the sheets of the file.
+
+    Raises:
+        ValueError: If the extension is not supported.
+        FileNotFoundError: If the file does not exist.
+        Exception: If the underlying reader fails to load the file.
+    """
+    p = Path(path)
+    if not p.exists():
+        raise FileNotFoundError(f"File not found: {path}")
+
+    ext = p.suffix.lower()
+
+    if ext == ".xls":
+        import xlrd
+        wb = xlrd.open_workbook(path)
+        sections = []
+        for sheet_name in wb.sheet_names():
+            sheet = wb.sheet_by_name(sheet_name)
+            sections.append(_sheet_xlrd(sheet, wb, max_rows_per_sheet))
+        return "\n\n".join(sections)
+
+    elif ext in (".xlsx", ".xlsm"):
+        import openpyxl
+        wb = openpyxl.load_workbook(path, data_only=True)
+        sections = []
+        for ws in wb.worksheets:
+            sections.append(_sheet_openpyxl(ws, max_rows_per_sheet))
+        return "\n\n".join(sections)
+
+    else:
+        raise ValueError(f"Unsupported extension '{ext}'. Use .xlsx, .xls, or .xlsm.")
diff --git a/python/functions/core/excel_to_markdown_test.py b/python/functions/core/excel_to_markdown_test.py
new file mode 100644
index 00000000..1b798439
--- /dev/null
+++ b/python/functions/core/excel_to_markdown_test.py
@@ -0,0 +1,142 @@
+"""Tests para excel_to_markdown."""
+
+import datetime
+import os
+import sys
+import tempfile
+
+import openpyxl
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+from excel_to_markdown import excel_to_markdown
+
+
+def _make_xlsx(sheets: dict, filename: str) -> str:
+    """Create an .xlsx file in a fresh temp directory with the given sheets; return its path."""
+    wb = openpyxl.Workbook()
+    first = True
+    for sheet_name, rows in sheets.items():
+        if first:
+            ws = wb.active
+            ws.title = sheet_name
+            first = False
+        else:
+            ws = wb.create_sheet(sheet_name)
+        for row in rows:
+            ws.append(row)
+    path = os.path.join(tempfile.mkdtemp(), filename)
+    wb.save(path)
+    return path
+
+
+def test_xlsx_multiples_sheets():
+    """An xlsx with several sheets produces one H2 section per sheet."""
+    path = _make_xlsx(
+        {
+            "Ventas": [["Producto", "Precio", "Cantidad"], ["Manzana", 1.5, 100], ["Pera", 2.0, 50]],
+            "Resumen": [["Total", "Importe"], ["150", "225.0"]],
+        },
+        "multi.xlsx",
+    )
+    result = excel_to_markdown(path)
+
+    assert "## Sheet: Ventas" in result
+    assert "## Sheet: Resumen" in result
+    assert "Producto" in result
+    assert "Manzana" in result
+    assert "Total" in result
+
+
+def test_sheet_vacio():
+    """A sheet without rows produces the empty-sheet note."""
+    path = _make_xlsx({"Vacio": []}, "empty.xlsx")
+    result = excel_to_markdown(path)
+
+    assert "## Sheet: Vacio" in result
+    assert "empty sheet" in result
+
+
+def test_sheet_truncado():
+    """A sheet with more rows than max_rows_per_sheet is truncated with a note."""
+    rows = [["col"]] + [[str(i)] for i in range(20)]
+    path = _make_xlsx({"Data": rows}, "big.xlsx")
+    result = excel_to_markdown(path, max_rows_per_sheet=5)
+
+    assert "omitted" in result
+    # 21 rows in total, 5 shown -> 16 omitted
+    assert "16 rows omitted" in result
+
+
+def test_sheet_con_formulas_data_only():
+    """An xlsx opened with data_only=True shows computed values (or None if none were saved)."""
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "Formulas"
+    ws.append(["A", "B", "Suma"])
+    ws.append([1, 2, "=A2+B2"])
+    path = os.path.join(tempfile.mkdtemp(), "formulas.xlsx")
+    wb.save(path)
+
+    result = excel_to_markdown(path)
+    assert "## Sheet: Formulas" in result
+    # With data_only=True the formula cell may be None if no cached value was saved
+    assert "Suma" in result
+
+
+def test_xls_legacy_con_fechas():
+    """Legacy xls: the function must accept .xls (via xlrd) and handle dates."""
+    # Build the .xls with xlwt when available; otherwise skip the test
+    pytest.importorskip("xlwt", reason="xlwt no disponible para crear .xls de prueba")
+    import xlwt
+
+    wb = xlwt.Workbook()
+    ws = wb.add_sheet("Fechas")
+    ws.write(0, 0, "Nombre")
+    ws.write(0, 1, "Fecha")
+    ws.write(1, 0, "Evento A")
+
+    date_format = xlwt.XFStyle()
+    date_format.num_format_str = "YYYY-MM-DD"
+    ws.write(1, 1, datetime.date(2024, 1, 15).toordinal() - 693594, date_format)
+
+    path = os.path.join(tempfile.mkdtemp(), "legacy.xls")
+    wb.save(path)
+
+    result = excel_to_markdown(path)
+    assert "## Sheet: Fechas" in result
+    assert "Evento A" in result
+
+
+def test_extension_no_soportada():
+    """An unsupported extension raises ValueError."""
+    path = os.path.join(tempfile.mkdtemp(), "data.csv")
+    with open(path, "w") as f:
+        f.write("a,b\n1,2\n")
+
+    with pytest.raises(ValueError, match="Unsupported extension"):
+        excel_to_markdown(path)
+
+
+def test_archivo_no_existe():
+    """A nonexistent file raises FileNotFoundError."""
+    with pytest.raises(FileNotFoundError):
+        excel_to_markdown("/tmp/no_existe_para_nada.xlsx")
+
+
+def test_dimensiones_en_metadata():
+    """The markdown includes the sheet dimensions."""
+    path = _make_xlsx({"Hoja1": [["A", "B"], [1, 2], [3, 4]]}, "dims.xlsx")
+    result = excel_to_markdown(path)
+    assert "**Dimensions:**" in result
+    assert "3 x 2" in result
+
+
+def test_tabla_markdown_formato():
+    """The table is well-formed: a header row followed by a separator row."""
+    path = _make_xlsx({"Datos": [["Col1", "Col2"], ["val1", "val2"]]}, "fmt.xlsx")
+    result = excel_to_markdown(path)
+    # A two-column sheet must yield exactly a two-column separator row
+    assert "| --- | --- |" in result
+    assert "Col1" in result
+    assert "val1" in result
diff --git a/python/functions/core/extract_frontmatter.md b/python/functions/core/extract_frontmatter.md
new file mode 100644
index 00000000..e0754c37
--- /dev/null
+++ b/python/functions/core/extract_frontmatter.md
@@ -0,0 +1,43 @@
+---
+name: extract_frontmatter
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def extract_frontmatter(content: str) -> tuple[str, dict | None]"
+description: "Extrae YAML frontmatter (delimitado por ---) del inicio de un string markdown. Retorna el contenido sin frontmatter y el dict parseado (o None si no hay)."
+tags: [markdown, frontmatter, yaml, parsing]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re, yaml]
+tested: true
+tests:
+  - "contenido con frontmatter"
+  - "sin frontmatter retorna None"
+  - "frontmatter vacio"
+  - "frontmatter con listas"
+test_file_path: "python/functions/core/parse_markdown_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+content = "---\ntitle: Hello\nauthor: Alice\n---\n# Body\n"
+remaining, data = extract_frontmatter(content)
+# remaining = "# Body\n"
+# data = {"title": "Hello", "author": "Alice"}
+
+no_fm = "# Just markdown\n\nNo frontmatter."
+remaining, data = extract_frontmatter(no_fm)
+# remaining == no_fm
+# data is None
+```
+
+## Notas
+
+Funcion pura. Usa `yaml.safe_load` si PyYAML esta disponible; si no, recurre a un parser simple de `key: value`. Solo reconoce frontmatter al inicio estricto del string (posicion 0). El bloque debe estar delimitado por `---\n` de apertura y `\n---\n` de cierre.
diff --git a/python/functions/core/extract_json_from_llm.md b/python/functions/core/extract_json_from_llm.md
new file mode 100644
index 00000000..0fedff75
--- /dev/null
+++ b/python/functions/core/extract_json_from_llm.md
@@ -0,0 +1,36 @@
+---
+name: extract_json_from_llm
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def extract_json_from_llm(content: str) -> dict"
+description: "Extrae y parsea JSON de respuestas LLM. Maneja bloques ```json, trailing commas, None->null."
+tags: [json, llm, parsing, extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [json]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+raw = '```json\n{"key": "value", "items": [1, 2, 3,]}\n```'
+result = extract_json_from_llm(raw)
+# {"key": "value", "items": [1, 2, 3]}
+```
+
+## Notas
+
+Funcion pura. Maneja errores comunes de LLMs: trailing commas, `None` en lugar de `null`, whitespace extra. Retorna dict vacio si el JSON es irrecuperable.
diff --git a/python/functions/core/extract_markdown_headers.md b/python/functions/core/extract_markdown_headers.md
new file mode 100644
index 00000000..2983e59d
--- /dev/null
+++ b/python/functions/core/extract_markdown_headers.md
@@ -0,0 +1,36 @@
+---
+name: extract_markdown_headers
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def extract_markdown_headers(markdown_content: str) -> tuple[list[dict], list[str]]"
+description: "Extrae todos los headers (h1-h6) de markdown con nivel y numero de linea, ignorando code blocks."
+tags: [markdown, parsing, headers, extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/page_index_md.py"
+---
+
+## Ejemplo
+
+```python
+md = "# Title\n\nSome text\n\n## Section\n\n```\n# not a header\n```"
+headers, lines = extract_markdown_headers(md)
+# headers = [{"title": "Title", "level": 1, "line_num": 1}, {"title": "Section", "level": 2, "line_num": 5}]
+```
+
+## Notas
+
+Funcion pura. Detecta y omite bloques de codigo (triple backtick). Retorna tupla: (lista de headers, lista de lineas originales).
diff --git a/python/functions/core/extract_pdf_bookmarks.md b/python/functions/core/extract_pdf_bookmarks.md
new file mode 100644
index 00000000..0da39039
--- /dev/null
+++ b/python/functions/core/extract_pdf_bookmarks.md
@@ -0,0 +1,37 @@
+---
+name: extract_pdf_bookmarks
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def extract_pdf_bookmarks(pdf) -> list[dict]"
+description: "Extrae la estructura de bookmarks/outlines de un PDF abierto con pdfplumber. Retorna lista de dicts con level (1-6), title y page_num."
+tags: [pdf, bookmarks, outlines, parsing, pdfplumber]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [pdfplumber]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/extract_pdf_bookmarks.py"
+---
+
+## Ejemplo
+
+```python
+import pdfplumber
+from extract_pdf_bookmarks import extract_pdf_bookmarks
+
+with pdfplumber.open("document.pdf") as pdf:
+    bookmarks = extract_pdf_bookmarks(pdf)
+    for bm in bookmarks:
+        print(f"{'#' * bm['level']} {bm['title']} (page {bm['page_num']})")
+```
+
+## Notas
+
+Recibe un objeto `pdfplumber.PDF` ya abierto (no un path). Construye un mapping interno `objid -> page_number` desde `pdf.pages` para resolver los destinos de outline. El nivel se limita al rango [1, 6] para compatibilidad markdown. Retorna lista vacia si el PDF no tiene outlines o si `get_outlines()` falla. Impure porque accede al estado interno de un objeto PDF ya abierto.
diff --git a/python/functions/core/extract_pdf_bookmarks.py b/python/functions/core/extract_pdf_bookmarks.py
new file mode 100644
index 00000000..f192ffee
--- /dev/null
+++ b/python/functions/core/extract_pdf_bookmarks.py
@@ -0,0 +1,63 @@
+"""Extract the bookmark/outline structure from a PDF opened with pdfplumber."""
+
+import pdfplumber
+
+
+def extract_pdf_bookmarks(pdf: pdfplumber.PDF) -> list[dict]:
+    """Extract bookmarks/outlines from an open pdfplumber PDF object.
+
+    Best-effort: any failure while reading the outline tree yields an empty
+    list, and a bookmark whose destination cannot be matched to a page is
+    still returned with ``page_num`` set to ``None``.
+
+    Args:
+        pdf: An open pdfplumber.PDF object.
+
+    Returns:
+        list[dict]: List of {"level": int, "title": str, "page_num": int | None}.
+                    Level is clamped to [1, 6]. Returns empty list if no outlines.
+    """
+    try:
+        # Materialize inside the try: entries may be produced lazily, so errors
+        # in a malformed outline tree can surface only while iterating.
+        outlines = list(pdf.doc.get_outlines())
+    except Exception:
+        return []
+
+    if not outlines:
+        return []
+
+    # Build page id -> page_number mapping. pdfminer's PDFPage stores its object
+    # id as `pageid`; `objid` is kept as a fallback for other page objects.
+    objid_to_page: dict[int, int] = {}
+    for i, page in enumerate(pdf.pages):
+        try:
+            obj = page.page_obj
+            page_id = obj.pageid if hasattr(obj, "pageid") else obj.objid
+            objid_to_page[page_id] = i + 1  # 1-indexed page numbers
+        except Exception:
+            pass  # best-effort: skip pages without a usable id
+
+    bookmarks = []
+    for item in outlines:
+        try:
+            level = item[0]  # integer level from get_outlines
+            title = item[1]
+            dest = item[2]  # destination: page object or list
+
+            # Clamp level to [1, 6]
+            level = max(1, min(6, level))
+
+            # Resolve destination to page number: for a non-empty list the page
+            # reference is dest[0]; otherwise the destination itself is tried.
+            target = dest[0] if isinstance(dest, list) and dest else dest
+            try:
+                page_num = objid_to_page.get(target.objid)
+            except Exception:
+                page_num = None  # no usable object id -> page stays unknown
+
+            bookmarks.append({"level": level, "title": str(title), "page_num": page_num})
+        except Exception:
+            continue
+
+    return bookmarks
diff --git a/python/functions/core/extract_pdf_text.md b/python/functions/core/extract_pdf_text.md
new file mode 100644
index 00000000..23f7278f
--- /dev/null
+++ b/python/functions/core/extract_pdf_text.md
@@ -0,0 +1,35 @@
+---
+name: extract_pdf_text
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def extract_pdf_text(pdf_path: str) -> str"
+description: "Extrae todo el texto de un PDF concatenando todas las paginas. Usa PyPDF2."
+tags: [pdf, text, extraction, parsing]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [PyPDF2]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/extract_pdf_text.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+text = extract_pdf_text("/path/to/document.pdf")
+print(len(text))  # total characters
+```
+
+## Notas
+
+Requiere `pip install PyPDF2`. Extraccion basica de texto — no maneja OCR ni PDFs escaneados. Para PDFs complejos considerar PyMuPDF.
diff --git a/python/functions/core/extract_pdf_text.py b/python/functions/core/extract_pdf_text.py
new file mode 100644
index 00000000..b86a6134
--- /dev/null
+++ b/python/functions/core/extract_pdf_text.py
@@ -0,0 +1,19 @@
+"""Extract all text from a PDF file using PyPDF2."""
+
+import PyPDF2
+
+
+def extract_pdf_text(pdf_path: str) -> str:
+    """Extract all text from a PDF file.
+
+    Args:
+        pdf_path: Path to the PDF file.
+
+    Returns:
+        str: Concatenated text from all pages.
+    """
+    pdf_reader = PyPDF2.PdfReader(pdf_path)
+    # Join once instead of growing a string page by page; a page whose
+    # extract_text() returns nothing contributes an empty string. No
+    # separator is inserted between pages.
+    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
diff --git a/python/functions/core/extract_text_from_file.md b/python/functions/core/extract_text_from_file.md
new file mode 100644
index 00000000..02c66e93
--- /dev/null
+++ b/python/functions/core/extract_text_from_file.md
@@ -0,0 +1,51 @@
+---
+name: extract_text_from_file
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "extract_text_from_file(file_path: str) -> str"
+description: "Extrae texto plano de un archivo. Soporta PDF (PyMuPDF), Markdown y TXT con deteccion automatica de encoding."
+tags: [text, pdf, markdown, txt, encoding, extraction, file, io]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["os", "fitz (PyMuPDF)", "charset_normalizer", "chardet"]
+tested: true
+tests:
+  - "PDF con texto extrae contenido correctamente"
+  - "archivo MD UTF-8 retorna contenido"
+  - "archivo TXT latin-1 detecta encoding"
+  - "archivo inexistente lanza FileNotFoundError"
+  - "extension no soportada lanza ValueError"
+test_file_path: "python/functions/core/extract_text_from_file_test.py"
+file_path: "python/functions/core/extract_text_from_file.py"
+---
+
+## Ejemplo
+
+```python
+# PDF
+text = extract_text_from_file("report.pdf")
+
+# Markdown
+text = extract_text_from_file("README.md")
+
+# TXT con encoding desconocido
+text = extract_text_from_file("notes.txt")
+```
+
+## Notas
+
+Para PDF usa PyMuPDF (`fitz`) que produce mejor texto que PyPDF2, especialmente en PDFs con columnas o layout complejo. Las paginas se unen con `\n\n`.
+
+La deteccion de encoding para archivos de texto sigue este orden de prioridad:
+1. Intenta UTF-8 directamente
+2. `charset_normalizer.from_bytes().best().encoding`
+3. `chardet.detect(data)["encoding"]`
+4. UTF-8 con `errors='replace'` como ultimo recurso
+
+Diferencia con `extract_pdf_text_py_core`: esa funcion usa PyPDF2 y solo soporta PDF. Esta funcion usa PyMuPDF y soporta ademas MD y TXT con deteccion de encoding.
diff --git a/python/functions/core/extract_text_from_file.py b/python/functions/core/extract_text_from_file.py
new file mode 100644
index 00000000..747c3545
--- /dev/null
+++ b/python/functions/core/extract_text_from_file.py
@@ -0,0 +1,92 @@
+"""Extract plain text from PDF, Markdown, or TXT files."""
+
+
+SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
+
+
+def _detect_encoding(data: bytes) -> str:
+    """Return an encoding name for raw bytes, trying several detectors and defaulting to UTF-8."""
+    # Strategy 1: UTF-8
+    try:
+        data.decode("utf-8")
+        return "utf-8"
+    except UnicodeDecodeError:
+        pass
+
+    # Strategy 2: charset_normalizer
+    try:
+        from charset_normalizer import from_bytes
+
+        result = from_bytes(data).best()
+        if result is not None and result.encoding:
+            return result.encoding
+    except ImportError:
+        pass
+
+    # Strategy 3: chardet
+    try:
+        import chardet
+
+        detected = chardet.detect(data)
+        if detected and detected.get("encoding"):
+            return detected["encoding"]
+    except ImportError:
+        pass
+
+    # Last resort: report UTF-8 (this function only names it; it does no decoding)
+    return "utf-8"
+
+
+def extract_text_from_file(file_path: str) -> str:
+    """Extract plain text from a file. Supports PDF, Markdown and TXT.
+
+    For PDF files uses PyMuPDF (fitz) to extract text from each page,
+    joining them with double newlines. For text-based files (.md, .markdown,
+    .txt) reads the file with automatic encoding detection.
+
+    Args:
+        file_path: Absolute or relative path to the file.
+
+    Returns:
+        str: Extracted plain text content.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        ValueError: If the file extension is not supported.
+        ImportError: If PyMuPDF is not installed and a PDF is provided.
+    """
+    import os
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    _, ext = os.path.splitext(file_path.lower())
+
+    if ext == ".pdf":
+        try:
+            import fitz  # PyMuPDF
+        except ImportError as e:
+            raise ImportError(
+                "PyMuPDF is required for PDF extraction. "
+                "Install it with: pip install PyMuPDF"
+            ) from e
+
+        with fitz.open(file_path) as doc:  # close the document even if extraction fails
+            pages = [page.get_text() for page in doc]
+        return "\n\n".join(pages)
+
+    elif ext in {".md", ".markdown", ".txt"}:
+        with open(file_path, "rb") as f:
+            raw = f.read()
+
+        encoding = _detect_encoding(raw)
+        try:
+            return raw.decode(encoding)
+        except (UnicodeDecodeError, LookupError):
+            return raw.decode("utf-8", errors="replace")
+
+    else:
+        raise ValueError(
+            f"Unsupported file extension: '{ext}'. "
+            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+        )
diff --git a/python/functions/core/extract_text_from_file_test.py b/python/functions/core/extract_text_from_file_test.py
new file mode 100644
index 00000000..dbe1dcfe
--- /dev/null
+++ b/python/functions/core/extract_text_from_file_test.py
@@ -0,0 +1,83 @@
+"""Tests para extract_text_from_file."""
+
+import os
+import sys
+import tempfile
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+from extract_text_from_file import extract_text_from_file
+
+
+def test_pdf_con_texto_extrae_contenido_correctamente():
+    """A PDF containing text has its content extracted correctly."""
+    try:
+        import fitz
+    except ImportError:
+        pytest.skip("PyMuPDF no instalado")
+
+    # Create a minimal in-memory PDF using PyMuPDF and write it to a temp file
+    doc = fitz.open()
+    page = doc.new_page()
+    page.insert_text((72, 72), "Hello from PDF")
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
+        tmp_path = f.name
+    try:
+        doc.save(tmp_path)
+        doc.close()
+        result = extract_text_from_file(tmp_path)
+        assert "Hello from PDF" in result
+    finally:
+        os.unlink(tmp_path)
+
+
+def test_archivo_md_utf8_retorna_contenido():
+    """A UTF-8 Markdown file is returned with its non-ASCII characters intact."""
+    content = "# Titulo\n\nParrafo con texto UTF-8: caf\u00e9, se\u00f1or, jap\u00f3n.\n"
+    with tempfile.NamedTemporaryFile(
+        suffix=".md", mode="wb", delete=False
+    ) as f:
+        f.write(content.encode("utf-8"))
+        tmp_path = f.name
+    try:
+        result = extract_text_from_file(tmp_path)
+        assert "# Titulo" in result
+        assert "caf\u00e9" in result
+    finally:
+        os.unlink(tmp_path)
+
+
+def test_archivo_txt_latin1_detecta_encoding():
+    """A latin-1 TXT file that is not valid UTF-8 is still decoded to text."""
+    content = "Texto en latin-1: caf\u00e9, se\u00f1or, na\u00efve\n"
+    with tempfile.NamedTemporaryFile(
+        suffix=".txt", mode="wb", delete=False
+    ) as f:
+        f.write(content.encode("latin-1"))
+        tmp_path = f.name
+    try:
+        result = extract_text_from_file(tmp_path)
+        # Non-ASCII bytes force the detection fallbacks; the ASCII prefix must
+        # survive whichever single-byte encoding (or replacement) is chosen.
+        assert "Texto en latin-1: caf" in result
+    finally:
+        os.unlink(tmp_path)
+
+
+def test_archivo_inexistente_lanza_filenotfounderror():
+    """A nonexistent file raises FileNotFoundError."""
+    with pytest.raises(FileNotFoundError):
+        extract_text_from_file("/tmp/no_existe_este_archivo_12345.txt")
+
+
+def test_extension_no_soportada_lanza_valueerror():
+    """An unsupported extension raises ValueError."""
+    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
+        f.write(b"fake docx content")
+        tmp_path = f.name
+    try:
+        with pytest.raises(ValueError, match="Unsupported file extension"):
+            extract_text_from_file(tmp_path)
+    finally:
+        os.unlink(tmp_path)
diff --git a/python/functions/core/fetch_and_parse_url.md b/python/functions/core/fetch_and_parse_url.md
new file mode 100644
index 00000000..ce6e817f
--- /dev/null
+++ b/python/functions/core/fetch_and_parse_url.md
@@ -0,0 +1,50 @@
+---
+name: fetch_and_parse_url
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "fetch_and_parse_url(url: str, timeout: float = 30.0) -> str"
+description: "Descarga una pagina web y la convierte a markdown. Combina detect_url_type + fetch HTML + html_to_markdown en una sola operacion."
+tags: [http, fetch, html, markdown, parse, url, scraping]
+uses_functions:
+  - detect_url_type_py_core
+  - html_to_markdown_py_core
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["httpx"]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/fetch_and_parse_url.py"
+---
+
+## Ejemplo
+
+```python
+from core.fetch_and_parse_url import fetch_and_parse_url
+
+# Descargar y convertir una pagina web
+md = fetch_and_parse_url("https://example.com")
+print(md)
+
+# Con timeout personalizado
+md = fetch_and_parse_url("https://en.wikipedia.org/wiki/Python", timeout=15.0)
+```
+
+## Notas
+
+Algoritmo:
+1. `detect_url_type(url)` determina el tipo de contenido (por patron, extension o HEAD request).
+2. Si es `code_repository` → lanza Exception (requiere git clone, no HTTP fetch).
+3. Si es `pdf` → lanza Exception (requiere pdfminer/pypdf, no incluido).
+4. `httpx.get(url)` descarga el contenido con follow_redirects.
+5. Si es `webpage` o Content-Type HTML → `html_to_markdown(raw_html)`.
+6. Si es `markdown`, `text` o codigo → retorna el texto directamente.
+
+Lanza `Exception` con mensaje descriptivo en cualquier fallo de red o tipo no soportado.
+
+Funcion impura: hace I/O (HTTP requests).
diff --git a/python/functions/core/fetch_and_parse_url.py b/python/functions/core/fetch_and_parse_url.py
new file mode 100644
index 00000000..c645b7b9
--- /dev/null
+++ b/python/functions/core/fetch_and_parse_url.py
@@ -0,0 +1,64 @@
+"""Descarga una pagina web y la convierte a markdown."""
+
+from __future__ import annotations
+
+
+def fetch_and_parse_url(url: str, timeout: float = 30.0) -> str:
+    """Download a URL and return its content as markdown text.
+
+    The URL is first classified with ``detect_url_type`` and then handled
+    according to the resulting type:
+    - webpage: the HTML is fetched and passed through ``html_to_markdown``
+    - markdown, text or code: the response text is returned unchanged
+    - pdf: not supported, raises
+    - code_repository: not supported, raises
+
+    Args:
+        url: URL to download and parse.
+        timeout: Timeout in seconds, used for both the type detection
+            call and the GET request.
+
+    Returns:
+        Markdown for HTML responses; the raw response text otherwise.
+
+    Raises:
+        Exception: If the URL type is ``code_repository`` or ``pdf``, if
+            the server answers with an HTTP error status, or if the
+            request itself fails.
+    """
+    import httpx
+
+    from detect_url_type import detect_url_type
+    from html_to_markdown import html_to_markdown
+
+    # Classify first: some types are rejected before any download happens.
+    kind, _ = detect_url_type(url, timeout=timeout)
+
+    # URL types this function refuses, with the message raised for each.
+    rejected = {
+        "code_repository": (
+            f"fetch_and_parse_url: code_repository URLs require git clone, not supported. url={url!r}"
+        ),
+        "pdf": (
+            f"fetch_and_parse_url: PDF parsing requires external dependency (pdfminer/pypdf). url={url!r}"
+        ),
+    }
+    if kind in rejected:
+        raise Exception(rejected[kind])
+
+    # Any download failure is re-raised as a plain Exception naming the URL.
+    try:
+        reply = httpx.get(url, timeout=timeout, follow_redirects=True)
+        reply.raise_for_status()
+    except httpx.HTTPStatusError as exc:
+        message = f"fetch_and_parse_url: HTTP {exc.response.status_code} for {url!r}"
+        raise Exception(message) from exc
+    except Exception as exc:
+        raise Exception(f"fetch_and_parse_url: request failed for {url!r}: {exc}") from exc
+
+    declared_type = reply.headers.get("content-type", "").lower()
+    body = reply.text
+    is_html = kind == "webpage" or "text/html" in declared_type
+
+    # Only HTML is converted; markdown, text and code files pass through as-is.
+    return html_to_markdown(body) if is_html else body
diff --git a/python/functions/core/find_headings.md b/python/functions/core/find_headings.md
new file mode 100644
index 00000000..61aa5f65
--- /dev/null
+++ b/python/functions/core/find_headings.md
@@ -0,0 +1,38 @@
+---
+name: find_headings
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def find_headings(content: str) -> list[tuple[int, int, str, int]]"
+description: "Encuentra todos los headings markdown (# a ######), excluyendo los que estan dentro de code blocks, HTML comments y bloques indentados. Retorna lista de (start_pos, end_pos, title, level)."
+tags: [markdown, headings, parsing, extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests:
+  - "headings normales detectados correctamente"
+  - "headings dentro de code blocks no detectados"
+  - "headings escapados ignorados"
+  - "headings en HTML comments ignorados"
+test_file_path: "python/functions/core/parse_markdown_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+content = "# Title\n\nSome text\n\n## Section\n\n```\n# Ignored\n```\n"
+headings = find_headings(content)
+# [(0, 7, "Title", 1), (22, 33, "Section", 2)]
+# (positions approximated)
+```
+
+## Notas
+
+Funcion pura. Excluye tres tipos de contextos: bloques de codigo triple backtick, comentarios HTML (`<!-- ... -->`), y lineas indentadas con 4 espacios o tabulacion. Tambien filtra headings precedidos por backslash (`\#`). Diferencia clave respecto a `extract_markdown_headers`: esta funcion retorna posiciones de caracter, no numeros de linea, lo que facilita la extraccion de contenido entre headings.
diff --git a/python/functions/core/flatten_tree.md b/python/functions/core/flatten_tree.md
new file mode 100644
index 00000000..2708363c
--- /dev/null
+++ b/python/functions/core/flatten_tree.md
@@ -0,0 +1,36 @@
+---
+name: flatten_tree
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def flatten_tree(structure: Any) -> list[dict]"
+description: "Aplana un arbol jerarquico (dict con 'nodes') a lista plana sin hijos. Deep copy de cada nodo."
+tags: [tree, flatten, hierarchy, functional]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [copy]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}]}]
+flatten_tree(tree)
+# [{"title": "A"}, {"title": "A1"}]
+```
+
+## Notas
+
+Funcion pura. Usa deep copy para no mutar el arbol original. Elimina el campo 'nodes' de cada nodo aplanado.
diff --git a/python/functions/core/format_iso8601.md b/python/functions/core/format_iso8601.md
new file mode 100644
index 00000000..0fa79940
--- /dev/null
+++ b/python/functions/core/format_iso8601.md
@@ -0,0 +1,49 @@
+---
+name: format_iso8601
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "format_iso8601(dt: datetime) -> str"
+description: "Formatea un datetime a ISO 8601 UTC con milisegundos. Formato: yyyy-MM-ddTHH:mm:ss.SSSZ. Si naive asume UTC, si aware convierte a UTC."
+tags: [datetime, iso8601, format, time, utc]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["datetime"]
+tested: true
+tests:
+  - "datetime naive formateado como UTC"
+  - "datetime con timezone convertido a UTC"
+  - "datetime UTC sin conversion"
+test_file_path: "python/functions/core/format_iso8601_test.py"
+file_path: "python/functions/core/format_iso8601.py"
+---
+
+## Ejemplo
+
+```python
+from datetime import datetime, timezone, timedelta
+from format_iso8601 import format_iso8601
+
+# Naive (asume UTC)
+s = format_iso8601(datetime(2026, 2, 21, 13, 20, 23, 147000))
+# "2026-02-21T13:20:23.147Z"
+
+# Con timezone +8
+tz8 = timezone(timedelta(hours=8))
+s = format_iso8601(datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=tz8))
+# "2026-02-21T13:20:23.147Z"
+```
+
+## Notas
+
+Algoritmo:
+1. Si naive: `dt.replace(tzinfo=timezone.utc)`.
+2. Si aware: `dt.astimezone(timezone.utc)`.
+3. `dt.isoformat(timespec="milliseconds").replace("+00:00", "Z")`.
+
+Funcion pura. No hace I/O ni tiene efectos secundarios.
diff --git a/python/functions/core/format_iso8601.py b/python/functions/core/format_iso8601.py
new file mode 100644
index 00000000..3ed803df
--- /dev/null
+++ b/python/functions/core/format_iso8601.py
@@ -0,0 +1,24 @@
+"""Formatea un datetime a ISO 8601 UTC con milisegundos."""
+
+from datetime import datetime, timezone
+
+
+def format_iso8601(dt: datetime) -> str:
+    """Render a datetime as an ISO 8601 UTC string with milliseconds.
+
+    Output shape: ``yyyy-MM-ddTHH:mm:ss.SSSZ``
+
+    A naive datetime is taken to already be in UTC; an aware datetime is
+    converted to UTC first.
+
+    Args:
+        dt: The datetime to format, naive or aware.
+
+    Returns:
+        The UTC timestamp with millisecond precision and a ``Z`` suffix.
+    """
+    is_naive = dt.tzinfo is None
+    utc_dt = dt.replace(tzinfo=timezone.utc) if is_naive else dt.astimezone(timezone.utc)
+    stamp = utc_dt.isoformat(timespec="milliseconds")
+    # isoformat() spells UTC as "+00:00"; swap it for the compact "Z" form.
+    return stamp.replace("+00:00", "Z")
diff --git a/python/functions/core/format_iso8601_test.py b/python/functions/core/format_iso8601_test.py
new file mode 100644
index 00000000..763b2882
--- /dev/null
+++ b/python/functions/core/format_iso8601_test.py
@@ -0,0 +1,28 @@
+"""Tests para format_iso8601."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from datetime import datetime, timezone, timedelta
+from format_iso8601 import format_iso8601
+
+
+def test_datetime_naive_formateado_como_utc():
+    """A naive datetime is formatted as if it were already UTC."""
+    naive = datetime(2026, 2, 21, 13, 20, 23, 147000)
+    assert format_iso8601(naive) == "2026-02-21T13:20:23.147Z"
+
+
+def test_datetime_con_timezone_convertido_a_utc():
+    """An aware datetime at UTC+8 is shifted back eight hours to UTC."""
+    plus_eight = timezone(timedelta(hours=8))
+    aware = datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=plus_eight)
+    assert format_iso8601(aware) == "2026-02-21T13:20:23.147Z"
+
+
+def test_datetime_utc_sin_conversion():
+    """A datetime already in UTC keeps its wall-clock time."""
+    already_utc = datetime(2026, 6, 15, 9, 0, 0, 500000, tzinfo=timezone.utc)
+    assert format_iso8601(already_utc) == "2026-06-15T09:00:00.500Z"
diff --git a/python/functions/core/format_simplified.md b/python/functions/core/format_simplified.md
new file mode 100644
index 00000000..71f5a8b1
--- /dev/null
+++ b/python/functions/core/format_simplified.md
@@ -0,0 +1,54 @@
+---
+name: format_simplified
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "format_simplified(dt: datetime, now: datetime) -> str"
+description: "Formato humano simplificado: si dt esta a menos de 24 horas de now muestra HH:MM:SS, si no muestra YYYY-MM-DD (ventana de 24 horas, no dia calendario)."
+tags: [datetime, format, time, human, display]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["datetime"]
+tested: true
+tests:
+  - "mismo dia muestra formato hora"
+  - "dia anterior muestra formato fecha"
+  - "exactamente 24h muestra formato fecha"
+test_file_path: "python/functions/core/format_simplified_test.py"
+file_path: "python/functions/core/format_simplified.py"
+---
+
+## Ejemplo
+
+```python
+from datetime import datetime
+from format_simplified import format_simplified
+
+now = datetime(2026, 2, 21, 15, 0, 0)
+
+# Mismo dia
+s = format_simplified(datetime(2026, 2, 21, 9, 30, 0), now)
+# "09:30:00"
+
+# Dia anterior
+s = format_simplified(datetime(2026, 2, 20, 9, 30, 0), now)
+# "2026-02-20"
+```
+
+## Notas
+
+Algoritmo:
+1. Remover tzinfo de ambos datetimes para comparacion simple (`replace(tzinfo=None)`).
+2. Si `(now - dt).days < 1`: retornar `dt.strftime("%H:%M:%S")`.
+3. Si no: retornar `dt.strftime("%Y-%m-%d")`.
+
+El umbral de 1 dia en `timedelta.days` significa que cualquier diferencia
+menor a 24 horas se muestra como hora. Un dt exactamente 24h atras
+tendra `days == 1`, mostrando fecha.
+
+Funcion pura. No hace I/O ni tiene efectos secundarios.
diff --git a/python/functions/core/format_simplified.py b/python/functions/core/format_simplified.py
new file mode 100644
index 00000000..fe0ad6c6
--- /dev/null
+++ b/python/functions/core/format_simplified.py
@@ -0,0 +1,25 @@
+"""Formato humano simplificado de datetime: hora si es hoy, fecha si es otro dia."""
+
+from datetime import datetime
+
+
+def format_simplified(dt: datetime, now: datetime) -> str:
+    """Format ``dt`` as a time if it is within 24 hours of ``now``, else a date.
+
+    This is a 24-hour window on elapsed time, not a calendar-day check. Two
+    aware datetimes are compared as instants; if either is naive, both are
+    compared by their wall-clock fields.
+
+    Args:
+        dt: The datetime to format.
+        now: The reference datetime (the current moment).
+
+    Returns:
+        ``HH:MM:SS`` if under 24 hours apart (past or future), else ``YYYY-MM-DD``.
+    """
+    if dt.utcoffset() is None or now.utcoffset() is None:
+        # Naive and aware values cannot be subtracted, so compare wall clocks.
+        dt, now = dt.replace(tzinfo=None), now.replace(tzinfo=None)
+    # abs() keeps a far-future dt from being rendered as a bare time.
+    pattern = "%H:%M:%S" if abs(now - dt).days < 1 else "%Y-%m-%d"
+    return dt.strftime(pattern)
diff --git a/python/functions/core/format_simplified_test.py b/python/functions/core/format_simplified_test.py
new file mode 100644
index 00000000..d420d937
--- /dev/null
+++ b/python/functions/core/format_simplified_test.py
@@ -0,0 +1,30 @@
+"""Tests para format_simplified."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from datetime import datetime, timedelta
+from format_simplified import format_simplified
+
+
+def test_mismo_dia_muestra_formato_hora():
+    """A datetime a few hours before ``now`` is rendered as HH:MM:SS."""
+    reference = datetime(2026, 2, 21, 15, 0, 0)
+    earlier_today = datetime(2026, 2, 21, 9, 30, 45)
+    assert format_simplified(earlier_today, reference) == "09:30:45"
+
+
+def test_dia_anterior_muestra_formato_fecha():
+    """A datetime more than 24 hours before ``now`` is rendered as a date."""
+    reference = datetime(2026, 2, 21, 15, 0, 0)
+    previous_day = datetime(2026, 2, 20, 9, 30, 45)
+    assert format_simplified(previous_day, reference) == "2026-02-20"
+
+
+def test_exactamente_24h_muestra_formato_fecha():
+    """A datetime exactly 24 hours before ``now`` is rendered as a date."""
+    reference = datetime(2026, 2, 21, 15, 0, 0)
+    one_day_back = reference - timedelta(hours=24)
+    assert format_simplified(one_day_back, reference) == "2026-02-20"
diff --git a/python/functions/core/format_table_to_markdown.md b/python/functions/core/format_table_to_markdown.md
new file mode 100644
index 00000000..d7991f2c
--- /dev/null
+++ b/python/functions/core/format_table_to_markdown.md
@@ -0,0 +1,36 @@
+---
+name: format_table_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str"
+description: "Convierte una lista 2D de celdas a tabla markdown con alineacion de columnas. Escapa pipes en celdas y añade separador header."
+tags: [markdown, table, formatting, text, pure]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests: ["tabla normal", "tabla con celdas vacias", "tabla con 1 fila", "tabla vacia", "celdas con pipes", "sin header"]
+test_file_path: "python/functions/core/format_table_to_markdown_test.py"
+file_path: "python/functions/core/format_table_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+rows = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
+md = format_table_to_markdown(rows)
+# | Name  | Age |
+# | ----- | --- |
+# | Alice | 30  |
+# | Bob   | 25  |
+```
+
+## Notas
+
+Funcion pura. No tiene dependencias externas. Calcula el ancho maximo por columna para alinear. El separador usa minimo 3 guiones por columna para cumplir con la especificacion markdown. Escapa los pipes dentro de celdas con `\|`. Si `has_header=False`, omite la fila separadora.
diff --git a/python/functions/core/format_table_to_markdown.py b/python/functions/core/format_table_to_markdown.py
new file mode 100644
index 00000000..73f850dd
--- /dev/null
+++ b/python/functions/core/format_table_to_markdown.py
@@ -0,0 +1,52 @@
+"""Convert a 2D list of cells to a markdown table with column alignment."""
+
+
+def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str:
+    """Render a 2D list of cells as an aligned markdown table.
+
+    Short rows are padded with empty cells up to the widest row, every cell
+    is converted with ``str()``, and literal ``|`` characters are escaped as
+    ``\\|``. Each column is left-justified to its longest cell, with a
+    minimum width of 3 so the separator row always has three dashes.
+
+    Args:
+        rows: Table content, one inner list of cells per row.
+        has_header: When True, the first row is the header and a dashed
+            separator row is emitted right after it. When False, no
+            separator row is emitted.
+
+    Returns:
+        str: The table lines joined with newlines, or an empty string when
+        ``rows`` is empty.
+    """
+    if not rows:
+        return ""
+
+    n_cols = max(map(len, rows))
+
+    # Normalise: pad each row to n_cols cells, then escape pipes per cell.
+    table = []
+    for raw_row in rows:
+        full_row = raw_row + [""] * (n_cols - len(raw_row))
+        table.append([str(value).replace("|", "\\|") for value in full_row])
+
+    # Widest cell per column, floored at 3 characters.
+    widths = []
+    for col in range(n_cols):
+        longest = max(len(line[col]) for line in table)
+        widths.append(max(longest, 3))
+
+    def render(cells: list[str]) -> str:
+        aligned = (text.ljust(width) for text, width in zip(cells, widths))
+        return "| " + " | ".join(aligned) + " |"
+
+    # Without a header every row is a plain data row and no separator is added.
+    if not has_header:
+        return "\n".join(render(cells) for cells in table)
+
+    # Header row, dashed separator, then the remaining data rows.
+    header, *body = table
+    separator = ["-" * width for width in widths]
+    lines = [render(header), render(separator)]
+    lines.extend(render(cells) for cells in body)
+    return "\n".join(lines)
diff --git a/python/functions/core/format_table_to_markdown_test.py b/python/functions/core/format_table_to_markdown_test.py
new file mode 100644
index 00000000..3cdc2a8e
--- /dev/null
+++ b/python/functions/core/format_table_to_markdown_test.py
@@ -0,0 +1,63 @@
+"""Tests para format_table_to_markdown."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from format_table_to_markdown import format_table_to_markdown
+
+
+def test_tabla_normal():
+    """Cells are padded per column and the separator spans each column width."""
+    rows = [["Name", "Age", "City"], ["Alice", "30", "Madrid"], ["Bob", "25", "Berlin"]]
+    lines = format_table_to_markdown(rows).split("\n")
+    # Assert the separator verbatim: a loose '"---" in result' check passes for any table.
+    assert lines[:2] == ["| Name  | Age | City   |", "| ----- | --- | ------ |"]
+    assert lines[2:] == ["| Alice | 30  | Madrid |", "| Bob   | 25  | Berlin |"]
+
+
+def test_tabla_con_celdas_vacias():
+    """Empty cells still produce one output line per row plus the separator."""
+    rendered = format_table_to_markdown([["A", "B"], ["", "x"], ["y", ""]])
+    assert "|" in rendered
+    # header + separator + 2 data rows
+    assert len(rendered.split("\n")) == 4
+
+
+def test_tabla_con_1_fila():
+    """A single row is treated as a header and followed by a separator."""
+    rendered = format_table_to_markdown([["Solo", "Row"]])
+    header_line, *others = rendered.split("\n")
+    # Only the separator follows the header; there are no data rows.
+    assert len(others) == 1
+    assert "Solo" in header_line
+    assert "---" in others[0]
+
+
+def test_tabla_vacia():
+    """An empty list of rows renders as the empty string."""
+    assert format_table_to_markdown([]) == ""
+
+
+def test_celdas_con_pipes():
+    """Pipe characters inside a cell are backslash-escaped."""
+    rendered = format_table_to_markdown([["Header"], ["cell|with|pipes"]])
+    assert "\\|" in rendered
+
+
+def test_sin_header():
+    """With has_header=False no separator row is emitted."""
+    rendered = format_table_to_markdown([["A", "B"], ["C", "D"]], has_header=False)
+    assert "---" not in rendered
+    # Exactly the two input rows, nothing else.
+    assert len(rendered.split("\n")) == 2
+
+
+if __name__ == "__main__":
+    # Allow running this module directly, without pytest.
+    for _case in (
+        test_tabla_normal, test_tabla_con_celdas_vacias, test_tabla_con_1_fila,
+        test_tabla_vacia, test_celdas_con_pipes, test_sin_header,
+    ):
+        _case()
+    print("All tests passed.")
diff --git a/python/functions/core/format_tree_structure.md b/python/functions/core/format_tree_structure.md
new file mode 100644
index 00000000..3baf870e
--- /dev/null
+++ b/python/functions/core/format_tree_structure.md
@@ -0,0 +1,36 @@
+---
+name: format_tree_structure
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def format_tree_structure(structure: Any, order: list[str] = None) -> Any"
+description: "Reordena campos de cada nodo de un arbol segun orden de claves especificado."
+tags: [tree, format, order, structure]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = [{"text": "...", "title": "Intro", "node_id": "0001"}]
+format_tree_structure(tree, order=["title", "node_id", "text"])
+# [{"title": "Intro", "node_id": "0001", "text": "..."}]
+```
+
+## Notas
+
+Funcion pura. Elimina nodos vacios (nodes=[]) automaticamente. Claves no listadas en order se descartan.
diff --git a/python/functions/core/from_csv.md b/python/functions/core/from_csv.md
new file mode 100644
index 00000000..cb6cde11
--- /dev/null
+++ b/python/functions/core/from_csv.md
@@ -0,0 +1,49 @@
+---
+name: from_csv
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "from_csv(text: str, delimiter: str = ',', has_header: bool = True) -> list[dict]"
+description: "Parser CSV a datos tabulares. Complemento de to_csv. Soporta campos entre comillas con escaping RFC 4180. Si has_header=False, genera keys col_0, col_1, etc."
+tags: [csv, parser, import, tabular, format]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "csv simple con header"
+  - "campos con escaping"
+  - "sin header keys generadas"
+  - "lineas vacias ignoradas"
+  - "un solo campo por fila"
+test_file_path: "python/functions/core/from_csv_test.py"
+file_path: "python/functions/core/from_csv.py"
+---
+
+## Ejemplo
+
+```python
+text = "nombre,edad\r\nAna,30\r\nBob,25"
+rows = from_csv(text)
+# [{"nombre": "Ana", "edad": "30"}, {"nombre": "Bob", "edad": "25"}]
+
+# Sin header
+text = "Ana,30\nBob,25"
+rows = from_csv(text, has_header=False)
+# [{"col_0": "Ana", "col_1": "30"}, {"col_0": "Bob", "col_1": "25"}]
+
+# Con escaping
+text = 'a,b\r\n"dijo ""hola""","uno,dos"'
+rows = from_csv(text)
+# [{"a": 'dijo "hola"', "b": "uno,dos"}]
+```
+
+## Notas
+
+Parser manual sin el modulo csv de stdlib. Normaliza CRLF y CR a LF y luego divide el texto por lineas, por lo que no soporta saltos de linea dentro de campos entre comillas.
+Ignora lineas vacias. Todos los valores son strings — la conversion de tipos queda a cargo del caller.
diff --git a/python/functions/core/from_csv.py b/python/functions/core/from_csv.py
new file mode 100644
index 00000000..658d7bb7
--- /dev/null
+++ b/python/functions/core/from_csv.py
@@ -0,0 +1,83 @@
+"""Parser CSV a datos tabulares (RFC 4180). Complemento de to_csv."""
+
+
+def _parse_row(line: str, delimiter: str) -> list[str]:
+    """Split one CSV line into fields, honouring double-quoted fields.
+
+    Inside quotes ``""`` is a literal quote and a lone ``"`` ends the quoted
+    part. Outside quotes ``"`` opens a quoted part only if the field is still
+    empty; otherwise it is kept literally. Always returns at least one field.
+    """
+    out: list[str] = []
+    buf: list[str] = []
+    quoted = False
+    pos, end = 0, len(line)
+
+    while pos < end:
+        char = line[pos]
+        pos += 1
+        if quoted and char == '"':
+            if pos < end and line[pos] == '"':
+                buf.append('"')  # escaped quote: consume the second one too
+                pos += 1
+            else:
+                quoted = False
+        elif quoted:
+            buf.append(char)
+        elif char == '"' and not buf:
+            quoted = True
+        elif char == delimiter:
+            out.append("".join(buf))
+            buf = []
+        else:
+            buf.append(char)
+
+    out.append("".join(buf))
+    return out
+
+
+def from_csv(
+    text: str,
+    delimiter: str = ",",
+    has_header: bool = True,
+) -> list[dict]:
+    """Parse CSV text into a list of dicts, one per data row.
+
+    Counterpart of to_csv. Line endings are normalised to LF and the text is
+    then split into lines, so quoted fields are handled within a single line
+    only. Lines that are empty or whitespace-only are skipped. All values
+    are returned as strings.
+
+    Args:
+        text: The complete CSV content.
+        delimiter: Field separator; a comma by default.
+        has_header: If True, the first non-blank line supplies the keys.
+            If False, keys are generated as col_0, col_1, ... from the
+            number of fields in the first non-blank line.
+
+    Returns:
+        One dict per data row, keyed by header. Missing trailing fields are
+        filled with "" and extra fields are dropped. Empty list when there
+        are no data rows.
+    """
+    unified = text.replace("\r\n", "\n").replace("\r", "\n")
+    records = [chunk for chunk in unified.split("\n") if chunk.strip()]
+    if not records:
+        return []
+
+    first_fields = _parse_row(records[0], delimiter)
+    if has_header:
+        keys = first_fields
+        body = records[1:]
+    else:
+        keys = [f"col_{idx}" for idx in range(len(first_fields))]
+        body = records
+
+    parsed: list[dict] = []
+    for record in body:
+        values = _parse_row(record, delimiter)
+        # Pad short rows with "" so every dict carries every key.
+        values += [""] * (len(keys) - len(values))
+        parsed.append(dict(zip(keys, values)))
+
+    return parsed
diff --git a/python/functions/core/from_csv_test.py b/python/functions/core/from_csv_test.py
new file mode 100644
index 00000000..5e39185e
--- /dev/null
+++ b/python/functions/core/from_csv_test.py
@@ -0,0 +1,40 @@
+"""Tests para from_csv."""
+
+from from_csv import from_csv
+
+
+def test_csv_simple_con_header():
+    """CRLF-separated rows are keyed by the header line."""
+    parsed = from_csv("nombre,edad\r\nAna,30\r\nBob,25")
+    assert len(parsed) == 2
+    assert parsed[0] == {"nombre": "Ana", "edad": "30"}
+    assert parsed[1] == {"nombre": "Bob", "edad": "25"}
+
+
+def test_campos_con_escaping():
+    """Doubled quotes and quoted delimiters are unescaped inside fields."""
+    first_row = from_csv('a,b\r\n"dijo ""hola""","uno,dos"')[0]
+    assert first_row["a"] == 'dijo "hola"'
+    assert first_row["b"] == "uno,dos"
+
+
+def test_sin_header_keys_generadas():
+    """Without a header, keys are generated as col_0, col_1, ..."""
+    parsed = from_csv("foo,bar\nbaz,qux", has_header=False)
+    assert parsed[0] == {"col_0": "foo", "col_1": "bar"}
+    assert parsed[1] == {"col_0": "baz", "col_1": "qux"}
+
+
+def test_lineas_vacias_ignoradas():
+    """Blank lines between and after rows do not produce rows."""
+    parsed = from_csv("x,y\n\n1,2\n\n3,4\n")
+    assert len(parsed) == 2
+    assert parsed[0] == {"x": "1", "y": "2"}
+
+
+def test_un_solo_campo_por_fila():
+    """A single-column CSV yields one-key dicts."""
+    parsed = from_csv("valor\nhola\nmundo")
+    assert len(parsed) == 2
+    assert parsed[0] == {"valor": "hola"}
+    assert parsed[1] == {"valor": "mundo"}
diff --git a/python/functions/core/from_jsonl.md b/python/functions/core/from_jsonl.md
new file mode 100644
index 00000000..4e5e0736
--- /dev/null
+++ b/python/functions/core/from_jsonl.md
@@ -0,0 +1,49 @@
+---
+name: from_jsonl
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "from_jsonl(text: str) -> list[dict]"
+description: "Parser JSONL a lista de dicts. Ignora lineas vacias. Lanza ValueError con el numero de linea si una linea contiene JSON invalido. Complemento de to_jsonl."
+tags: [jsonl, json, parser, import, streaming, format]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["json"]
+tested: true
+tests:
+  - "jsonl valido"
+  - "lineas vacias intercaladas"
+  - "linea invalida raise con numero"
+test_file_path: "python/functions/core/from_jsonl_test.py"
+file_path: "python/functions/core/from_jsonl.py"
+---
+
+## Ejemplo
+
+```python
+text = '{"id": 1}\n{"id": 2}'
+rows = from_jsonl(text)
+# [{"id": 1}, {"id": 2}]
+
+# Lineas vacias ignoradas
+text = '{"id": 1}\n\n{"id": 2}\n'
+rows = from_jsonl(text)
+# [{"id": 1}, {"id": 2}]
+
+# JSON invalido levanta error con numero de linea
+try:
+    from_jsonl('{"ok": 1}\nnot-json')
+except ValueError as e:
+    print(e)  # "JSON invalido en linea 2: ..."
+```
+
+## Notas
+
+Aunque se declara pure (no hace I/O), puede lanzar ValueError para JSON invalido.
+Esto es consistente con la convencion del registry: funciones puras pueden lanzar
+excepciones de validacion — solo las funciones impuras retornan error como valor.
diff --git a/python/functions/core/from_jsonl.py b/python/functions/core/from_jsonl.py
new file mode 100644
index 00000000..66771b77
--- /dev/null
+++ b/python/functions/core/from_jsonl.py
@@ -0,0 +1,35 @@
+"""Parser JSON Lines (JSONL) a lista de dicts. Complemento de to_jsonl."""
+
+import json
+
+
+def from_jsonl(text: str) -> list[dict]:
+    """Parse JSON Lines text into a list of decoded values.
+
+    Counterpart of to_jsonl. Blank lines are skipped but still counted, so
+    reported line numbers match the input.
+
+    Args:
+        text: JSONL content, one JSON document per line.
+
+    Returns:
+        The decoded value of every non-blank line, in input order.
+
+    Raises:
+        ValueError: If a line is not valid JSON (message has the line number).
+    """
+    result: list[dict] = []
+    # Split on CR/LF only. str.splitlines() also breaks on U+2028, U+2029 and
+    # U+0085, which are legal unescaped inside a JSON string, so a valid
+    # record containing one of them would be cut in two.
+    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+    for line_num, line in enumerate(lines, start=1):
+        stripped = line.strip()
+        if not stripped:
+            continue
+        try:
+            parsed = json.loads(stripped)
+        except json.JSONDecodeError as exc:
+            raise ValueError(f"JSON invalido en linea {line_num}: {exc}") from exc
+        result.append(parsed)
+    return result
diff --git a/python/functions/core/from_jsonl_test.py b/python/functions/core/from_jsonl_test.py
new file mode 100644
index 00000000..27eea21a
--- /dev/null
+++ b/python/functions/core/from_jsonl_test.py
@@ -0,0 +1,25 @@
+"""Tests para from_jsonl."""
+
+import pytest
+
+from from_jsonl import from_jsonl
+
+
+def test_jsonl_valido():
+    """Two JSON objects on two lines decode to two dicts, in order."""
+    decoded = from_jsonl('{"a": 1}\n{"b": 2}')
+    assert decoded == [{"a": 1}, {"b": 2}]
+
+
+def test_lineas_vacias_intercaladas():
+    """Blank lines between and after records are ignored."""
+    decoded = from_jsonl('{"x": 1}\n\n{"x": 2}\n\n')
+    assert len(decoded) == 2
+    assert decoded[0] == {"x": 1}
+    assert decoded[1] == {"x": 2}
+
+
+def test_linea_invalida_raise_con_numero():
+    # The second line is not JSON, so the error must name line 2.
+    with pytest.raises(ValueError, match="linea 2"):
+        from_jsonl('{"ok": 1}\nnot-json\n{"ok": 3}')
diff --git a/python/functions/core/generate_html_report.md b/python/functions/core/generate_html_report.md
new file mode 100644
index 00000000..8928cf7d
--- /dev/null
+++ b/python/functions/core/generate_html_report.md
@@ -0,0 +1,70 @@
+---
+name: generate_html_report
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "generate_html_report(title: str, sections: list[dict]) -> str"
+description: "Genera un reporte HTML autocontenido con CSS inline. Soporta secciones de tipo table (list[dict]), text (str con markdown basico), kpi (cards con label/value/delta) y list (list[str]). Para exportar resultados de pipelines sin servidor."
+tags: [html, report, export, table, kpi, template, format]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["re"]
+tested: true
+tests:
+  - "reporte con una tabla"
+  - "reporte con multiples secciones mixtas"
+  - "kpi con deltas positivos y negativos"
+  - "caracteres especiales html escapados en data"
+  - "titulo con caracteres especiales"
+test_file_path: "python/functions/core/generate_html_report_test.py"
+file_path: "python/functions/core/generate_html_report.py"
+---
+
+## Ejemplo
+
+```python
+sections = [
+    {
+        "heading": "Resumen ejecutivo",
+        "type": "kpi",
+        "data": [
+            {"label": "Revenue", "value": "$1.2M", "delta": "+15%"},
+            {"label": "Churn", "value": "3.2%", "delta": "-0.5%"},
+        ],
+    },
+    {
+        "heading": "Top usuarios",
+        "type": "table",
+        "data": [
+            {"usuario": "ana@example.com", "compras": 42},
+            {"usuario": "bob@example.com", "compras": 38},
+        ],
+    },
+    {
+        "heading": "Notas",
+        "type": "text",
+        "data": "Datos del **trimestre Q1**. Ver [dashboard](https://example.com).",
+    },
+]
+
+html = generate_html_report("Reporte Mensual", sections)
+# Retorna string HTML completo con DOCTYPE, head con CSS inline, body con secciones
+```
+
+## Tipos de seccion
+
+- **table**: `data` es `list[dict]` — renderiza `<table>` con headers extraidos de las keys
+- **text**: `data` es `str` — soporta `**bold**` y `[text](url)`, escapa HTML
+- **kpi**: `data` es `list[{"label", "value", "delta"}]` — cards con colores para delta positivo/negativo
+- **list**: `data` es `list[str]` — renderiza `<ul><li>...</li></ul>`
+
+## Notas
+
+CSS completamente inline en `<style>`. Tema minimalista con max-width 960px, sans-serif,
+tabla con zebra stripes, cards KPI con colores verde/rojo para deltas.
+Todo el contenido del usuario pasa por HTML escape para proteger contra XSS.
diff --git a/python/functions/core/generate_html_report.py b/python/functions/core/generate_html_report.py
new file mode 100644
index 00000000..e5a1fd7e
--- /dev/null
+++ b/python/functions/core/generate_html_report.py
@@ -0,0 +1,164 @@
+"""Genera reportes HTML autocontenidos con CSS inline."""
+
+
+_HTML_ESCAPES = {  # character -> HTML entity; _esc applies the entries in this order
+    "&": "&amp;",  # listed first so the "&" of the entities below is not re-escaped
+    "<": "&lt;",
+    ">": "&gt;",
+    '"': "&quot;",
+    "'": "&#x27;",
+}
+
+
+def _esc(value: str) -> str:
+    """Return *value* with the characters & < > " ' replaced by their HTML entities."""
+    # One translate() pass maps each character once, so emitted entities are never re-escaped.
+    return value.translate(str.maketrans(_HTML_ESCAPES))
+
+
+def _render_table(data: list[dict]) -> str:
+    """Render rows as a <table>; columns are the first row's keys, odd rows get class zebra."""
+    if not data:
+        return "<p><em>(sin datos)</em></p>"
+    headers = list(data[0].keys())
+    # str() the keys like the cell values: _esc needs a str and keys may be ints.
+    head = "".join(f"<th>{_esc(str(h))}</th>" for h in headers)
+    rows = []
+    for i, row in enumerate(data):
+        cls = ' class="zebra"' if i % 2 == 1 else ""
+        cells = "".join(f"<td>{_esc(str(row.get(h, '')))}</td>" for h in headers)
+        rows.append(f"<tr{cls}>{cells}</tr>\n")
+    body = "".join(rows)
+    return f"<table>\n<thead><tr>{head}</tr></thead>\n<tbody>\n{body}</tbody>\n</table>"
+
+
+def _render_text(data: str) -> str:
+    """Render text as one <p>: escape HTML, then expand **bold** and [text](url) links."""
+    import re
+
+    def _link(m):  # escaping alone would leave href="javascript:..." clickable
+        url = m.group(2)
+        ok = ":" not in url or re.match(r"(?i)https?://|mailto:|[/#?]|\.\.?/", url)
+        return f'<a href="{url}">{m.group(1)}</a>' if ok else m.group(0)
+    text = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", _esc(str(data)))
+    return "<p>" + re.sub(r"\[(.+?)\]\((.+?)\)", _link, text) + "</p>"
+
+
+def _render_kpi(data: list[dict]) -> str:
+    """Render KPI dicts (label, value, optional delta) as a grid of cards.
+
+    A delta whose text starts with "+" gets class delta-pos, one starting with
+    "-" gets delta-neg, anything else delta-neutral; a None delta adds no span.
+    """
+    rendered = []
+    for entry in data:
+        label = _esc(str(entry.get("label", "")))
+        value = _esc(str(entry.get("value", "")))
+        badge = ""
+        raw_delta = entry.get("delta")
+        if raw_delta is not None:
+            shown = str(raw_delta)
+            tone = "pos" if shown.startswith("+") else "neg" if shown.startswith("-") else "neutral"
+            badge = f'<span class="delta-{tone}">{_esc(shown)}</span>'
+        rendered.append(
+            '<div class="kpi-card">'
+            f'<div class="kpi-label">{label}</div>'
+            f'<div class="kpi-value">{value}</div>'
+            f"{badge}</div>\n"
+        )
+    return '<div class="kpi-grid">\n' + "".join(rendered) + "</div>"
+
+
+def _render_list(data: list[str]) -> str:
+    bullets = [f"<li>{_esc(str(entry))}</li>\n" for entry in data]  # one <li> line per item
+    return "<ul>\n" + "".join(bullets) + "</ul>"
+
+
+_CSS = """
+body {
+    font-family: sans-serif;
+    max-width: 960px;
+    margin: 2rem auto;
+    padding: 0 1rem;
+    color: #222;
+    background: #fff;
+}
+h1 { font-size: 1.8rem; border-bottom: 2px solid #ddd; padding-bottom: .5rem; }
+h2 { font-size: 1.3rem; margin-top: 2rem; color: #333; }
+table { border-collapse: collapse; width: 100%; margin: 1rem 0; font-size: .95rem; }
+th { background: #f0f0f0; text-align: left; padding: .5rem .75rem; border: 1px solid #ddd; }
+td { padding: .45rem .75rem; border: 1px solid #ddd; }
+tr.zebra { background: #f9f9f9; }
+ul { padding-left: 1.5rem; }
+li { margin: .3rem 0; }
+p { line-height: 1.6; }
+a { color: #0066cc; }
+.kpi-grid { display: flex; flex-wrap: wrap; gap: 1rem; margin: 1rem 0; }
+.kpi-card {
+    border: 1px solid #ddd;
+    border-radius: 6px;
+    padding: 1rem 1.5rem;
+    min-width: 140px;
+    background: #fafafa;
+}
+.kpi-label { font-size: .85rem; color: #666; margin-bottom: .25rem; }
+.kpi-value { font-size: 1.6rem; font-weight: bold; }
+.delta-pos { color: #16a34a; font-size: .9rem; }
+.delta-neg { color: #dc2626; font-size: .9rem; }
+.delta-neutral { color: #888; font-size: .9rem; }
+""".strip()  # stylesheet that generate_html_report embeds verbatim in <style>
+
+
+def generate_html_report(title: str, sections: list[dict]) -> str:
+    """Build a self-contained HTML report whose CSS is embedded in a <style> tag.
+
+    Each section is a dict with the keys:
+        heading: str, the section title
+        type: "table" | "text" | "kpi" | "list"; a missing or unrecognised
+            value is rendered as text
+        data: payload matching the type:
+            table -> list[dict]
+            text  -> str (supports **bold** and [links](url))
+            kpi   -> list[{"label": str, "value": str|number, "delta": str|None}]
+            list  -> list[str]
+
+    A falsy data value is replaced by an empty list (empty string for text).
+
+    Args:
+        title: Report title, emitted in both <title> and <h1>.
+        sections: Sections to render, in order.
+
+    Returns:
+        The complete HTML document as a string, starting with the DOCTYPE.
+    """
+    # (type name, renderer) pairs for the list-shaped payloads; any other
+    # type falls through to the text renderer.
+    renderers = (("table", _render_table), ("kpi", _render_kpi), ("list", _render_list))
+    blocks = []
+    for section in sections:
+        heading = _esc(str(section.get("heading", "")))
+        kind = section.get("type", "text")
+        payload = section.get("data")
+        render = next((fn for name, fn in renderers if kind == name), None)
+        if render is None:
+            body = _render_text(str(payload or ""))
+        else:
+            body = render(payload or [])
+        blocks.append(f"<section>\n<h2>{heading}</h2>\n{body}\n</section>\n")
+
+    document = [
+        "<!DOCTYPE html>",
+        "<html lang='es'>",
+        "<head>",
+        "<meta charset='UTF-8'>",
+        "<meta name='viewport' content='width=device-width, initial-scale=1'>",
+        f"<title>{_esc(title)}</title>",
+        f"<style>\n{_CSS}\n</style>",
+        "</head>",
+        "<body>",
+        f"<h1>{_esc(title)}</h1>",
+        # Every section block already ends with a newline, so </body> follows directly.
+        "".join(blocks) + "</body>",
+        "</html>",
+    ]
+    return "\n".join(document)
diff --git a/python/functions/core/generate_html_report_test.py b/python/functions/core/generate_html_report_test.py
new file mode 100644
index 00000000..fd314cc9
--- /dev/null
+++ b/python/functions/core/generate_html_report_test.py
@@ -0,0 +1,71 @@
+"""Tests para generate_html_report."""
+
+from generate_html_report import generate_html_report
+
+
+def test_reporte_con_una_tabla():
+    """A table section yields the page skeleton, header cells, data cells and zebra rows."""
+    rows = [{"nombre": "Ana", "score": 99}, {"nombre": "Bob", "score": 87}]
+    section = {"heading": "Datos", "type": "table", "data": rows}
+    report = generate_html_report("Reporte", [section])
+    expected_fragments = (
+        "<!DOCTYPE html>",
+        "<title>Reporte</title>",
+        "<th>nombre</th>",
+        "<td>Ana</td>",
+        "zebra",  # the second row carries class zebra
+    )
+    for fragment in expected_fragments:
+        assert fragment in report
+
+
+def test_reporte_con_multiples_secciones_mixtas():
+    """Text, list and KPI sections can be combined in one report."""
+    text_section = {"heading": "Texto", "type": "text", "data": "Hola mundo"}
+    list_section = {"heading": "Lista", "type": "list", "data": ["uno", "dos", "tres"]}
+    kpi_section = {
+        "heading": "KPIs",
+        "type": "kpi",
+        "data": [{"label": "Revenue", "value": "1M", "delta": None}],
+    }
+    report = generate_html_report("Multi", [text_section, list_section, kpi_section])
+    assert all(s in report for s in ("<p>Hola mundo</p>", "<li>uno</li>", "Revenue", "1M"))
+
+
+def test_kpi_con_deltas_positivos_y_negativos():
+    """Deltas starting with "+", "-" or neither get the pos / neg / neutral class."""
+    kpis = [
+        {"label": "Ganancia", "value": "5K", "delta": "+12%"},
+        {"label": "Perdida", "value": "2K", "delta": "-5%"},
+        {"label": "Estable", "value": "1K", "delta": "0%"},
+    ]
+    section = {
+        "heading": "Metricas",
+        "type": "kpi",
+        "data": kpis,
+    }
+    report = generate_html_report("KPIs", [section])
+
+    for css_class in ("delta-pos", "delta-neg", "delta-neutral"):
+        assert f'class="{css_class}"' in report
+    for shown in ("+12%", "-5%"):
+        assert shown in report
+
+
+def test_caracteres_especiales_html_escapados_en_data():
+    """Markup inside table cells is escaped rather than emitted as live tags."""
+    payload = "<script>alert('xss')</script>"
+    section = {
+        "heading": "Codigo",
+        "type": "table",
+        "data": [{"expr": payload}],
+    }
+    report = generate_html_report("Seguro", [section])
+    assert "<script>" not in report
+    assert "&lt;script&gt;" in report
+
+
+def test_titulo_con_caracteres_especiales():
+    """The title is HTML-escaped where it is emitted, including inside <title>."""
+    escaped = "Reporte &amp; Analisis &lt;2024&gt;"
+    assert f"<title>{escaped}</title>" in generate_html_report("Reporte & Analisis <2024>", [])
diff --git a/python/functions/core/get_leaf_nodes.md b/python/functions/core/get_leaf_nodes.md
new file mode 100644
index 00000000..d698be21
--- /dev/null
+++ b/python/functions/core/get_leaf_nodes.md
@@ -0,0 +1,36 @@
+---
+name: get_leaf_nodes
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def get_leaf_nodes(structure: Any) -> list[dict]"
+description: "Extrae solo nodos hoja (sin hijos) de un arbol jerarquico. Deep copy de cada nodo."
+tags: [tree, leaf, hierarchy, functional]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [copy]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}, {"title": "A2", "nodes": []}]}]
+get_leaf_nodes(tree)
+# [{"title": "A1"}, {"title": "A2"}]
+```
+
+## Notas
+
+Funcion pura. Usa deep copy. Un nodo es hoja si su campo 'nodes' es falsy (vacio o ausente).
diff --git a/python/functions/core/get_pdf_page_tokens.md b/python/functions/core/get_pdf_page_tokens.md
new file mode 100644
index 00000000..344b8045
--- /dev/null
+++ b/python/functions/core/get_pdf_page_tokens.md
@@ -0,0 +1,40 @@
+---
+name: get_pdf_page_tokens
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def get_pdf_page_tokens(pdf_path, model: str = None, pdf_parser: str = 'PyPDF2') -> list[tuple[str, int]]"
+description: "Extrae texto y cuenta tokens por pagina de un PDF. Soporta PyPDF2 y PyMuPDF como backends."
+tags: [pdf, tokens, extraction, litellm, parsing]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [litellm, PyPDF2]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/get_pdf_page_tokens.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+pages = get_pdf_page_tokens("report.pdf", model="gpt-4o")
+for text, tokens in pages:
+    print(f"{tokens} tokens")
+
+# Con PyMuPDF (mejor para PDFs complejos)
+pages = get_pdf_page_tokens("report.pdf", pdf_parser="PyMuPDF")
+total = sum(t for _, t in pages)
+```
+
+## Notas
+
+Requiere `pip install litellm PyPDF2` (o `pymupdf` para backend PyMuPDF). Acepta path string o BytesIO. Util para estimar costos de procesamiento LLM y para page_list_to_groups.
diff --git a/python/functions/core/get_pdf_page_tokens.py b/python/functions/core/get_pdf_page_tokens.py
new file mode 100644
index 00000000..62c15a05
--- /dev/null
+++ b/python/functions/core/get_pdf_page_tokens.py
@@ -0,0 +1,47 @@
+"""Extract text and token count per page from a PDF. Supports PyPDF2 and PyMuPDF."""
+
+import os
+from io import BytesIO
+
+import litellm
+
+
+def get_pdf_page_tokens(pdf_path, model: str = None,
+                        pdf_parser: str = "PyPDF2") -> list[tuple[str, int]]:
+    """Extract the text and token count of each page of a PDF.
+
+    Args:
+        pdf_path: Path to the PDF file (str or os.PathLike), or a BytesIO.
+        model: Model name forwarded to litellm.token_counter.
+        pdf_parser: Parser backend, 'PyPDF2' or 'PyMuPDF'.
+
+    Returns:
+        list[tuple[str, int]]: One (page_text, token_count) pair per page.
+
+    Raises:
+        ValueError: If pdf_parser is unsupported, or (PyMuPDF only) pdf_path
+            is neither a BytesIO nor an existing file.
+    """
+    if pdf_parser == "PyPDF2":
+        import PyPDF2
+        reader = PyPDF2.PdfReader(pdf_path)
+        texts = [page.extract_text() or "" for page in reader.pages]
+
+    elif pdf_parser == "PyMuPDF":
+        import pymupdf
+        if isinstance(pdf_path, BytesIO):
+            doc = pymupdf.open(stream=pdf_path, filetype="pdf")
+        elif isinstance(pdf_path, (str, os.PathLike)) and os.path.isfile(pdf_path):
+            doc = pymupdf.open(pdf_path)
+        else:
+            raise ValueError(f"Invalid pdf_path: {pdf_path}")
+        # Close the document (and its file handle) even if text extraction
+        # raises; without the with-block it would stay open.
+        with doc:
+            texts = [page.get_text() for page in doc]
+
+    else:
+        raise ValueError(f"Unsupported PDF parser: {pdf_parser}. Use 'PyPDF2' or 'PyMuPDF'.")
+
+    # Token counting is backend-independent, so it is done once for both paths.
+    return [(text, litellm.token_counter(model=model, text=text)) for text in texts]
diff --git a/python/functions/core/get_text_stats.md b/python/functions/core/get_text_stats.md
new file mode 100644
index 00000000..9bf8935a
--- /dev/null
+++ b/python/functions/core/get_text_stats.md
@@ -0,0 +1,32 @@
+---
+name: get_text_stats
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def get_text_stats(text: str) -> dict"
+description: "Estadisticas basicas de un texto: total de caracteres, lineas y palabras."
+tags: [text, statistics, stats, characters, words, lines]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests: ["texto normal con palabras y lineas", "texto vacio retorna ceros", "texto con solo newlines"]
+test_file_path: "python/functions/core/get_text_stats_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+stats = get_text_stats("hello world\nfoo bar")
+# {"total_chars": 19, "total_lines": 2, "total_words": 4}
+```
+
+## Notas
+
+Funcion pura sin dependencias externas. `total_lines` cuenta newlines + 1, por lo que un texto vacio cuenta como 1 linea (comportamiento consistente con `wc -l` + 1). `total_words` usa `str.split()` que separa por cualquier whitespace y descarta vacios, equivalente a contar tokens separados por espacios.
diff --git a/python/functions/core/get_text_stats_test.py b/python/functions/core/get_text_stats_test.py
new file mode 100644
index 00000000..f5a2d5b4
--- /dev/null
+++ b/python/functions/core/get_text_stats_test.py
@@ -0,0 +1,21 @@
+"""Tests para get_text_stats."""
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from core import get_text_stats
+
+
+def test_texto_normal_con_palabras_y_lineas():
+    expected = {"total_chars": 19, "total_lines": 2, "total_words": 4}  # 2 lines, 4 words
+    assert get_text_stats("hello world\nfoo bar") == expected
+
+
+def test_texto_vacio_retorna_ceros():
+    expected = {"total_chars": 0, "total_lines": 1, "total_words": 0}  # "" is still 1 line
+    assert get_text_stats("") == expected
+
+
+def test_texto_con_solo_newlines():
+    expected = {"total_chars": 2, "total_lines": 3, "total_words": 0}  # 2 newlines -> 3 lines
+    assert get_text_stats("\n\n") == expected
diff --git a/python/functions/core/html_to_markdown.md b/python/functions/core/html_to_markdown.md
new file mode 100644
index 00000000..a1cf4737
--- /dev/null
+++ b/python/functions/core/html_to_markdown.md
@@ -0,0 +1,66 @@
+---
+name: html_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "html_to_markdown(html: str) -> str"
+description: "Convierte HTML a markdown. Usa readabilipy para extraer contenido principal (filtra nav, ads, boilerplate), luego markdownify para convertir a markdown. Si las librerias opcionales no estan disponibles, usa un parser stdlib como fallback."
+tags: [html, markdown, parse, convert, readabilipy, markdownify, content-extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["re", "html.parser"]
+tested: true
+tests:
+  - "HTML con nav/footer filtra boilerplate"
+  - "HTML limpio se convierte correctamente"
+  - "HTML con imagenes lazy-loaded"
+test_file_path: "python/functions/core/html_to_markdown_test.py"
+file_path: "python/functions/core/html_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+from core.html_to_markdown import html_to_markdown
+
+html = """
+<html>
+  <body>
+    <nav><a href="/">Home</a></nav>
+    <main>
+      <h1>Titulo del articulo</h1>
+      <p>Contenido <strong>relevante</strong> aqui.</p>
+    </main>
+    <footer>Copyright 2026</footer>
+  </body>
+</html>
+"""
+
+md = html_to_markdown(html)
+# "# Titulo del articulo\n\nContenido **relevante** aqui."
+```
+
+## Notas
+
+Algoritmo:
+1. Preprocesar HTML: manejar contenido oculto WeChat (`js_content` con display:none),
+   lazy loading images (`data-src` → `src`).
+2. Extraer contenido principal con `readabilipy` (basado en Mozilla Readability).
+   Si no esta disponible, usa el HTML completo.
+3. Convertir a markdown con `markdownify` (headings ATX, strip script/style).
+   Si no esta disponible, usa el parser stdlib de la misma funcion.
+
+Dependencias opcionales (mejoran la calidad si estan instaladas):
+- `readabilipy` — extraccion del contenido principal (filtra nav, ads, boilerplate)
+- `markdownify` — conversion HTML→markdown de alta fidelidad
+- `beautifulsoup4` — requerida por readabilipy
+
+Sin las dependencias opcionales la funcion sigue siendo pura y funcional,
+usando `html.parser` de stdlib como fallback.
+
+Funcion pura. No hace I/O ni tiene efectos secundarios.
diff --git a/python/functions/core/html_to_markdown.py b/python/functions/core/html_to_markdown.py
new file mode 100644
index 00000000..52349ccc
--- /dev/null
+++ b/python/functions/core/html_to_markdown.py
@@ -0,0 +1,272 @@
+"""Convierte HTML a markdown usando readabilipy + markdownify, con fallback a stdlib."""
+
+import re
+from html.parser import HTMLParser
+from typing import Optional
+
+
+# ---------------------------------------------------------------------------
+# Stdlib fallback parser (no external deps)
+# ---------------------------------------------------------------------------
+
+_BLOCK_TAGS = {  # block-level tag names; nothing else in this module reads the set
+    "p", "div", "article", "section", "main", "header", "footer", "aside",
+    "nav", "figure", "figcaption", "blockquote", "pre", "ul", "ol", "li",
+    "table", "thead", "tbody", "tr", "th", "td", "h1", "h2", "h3",
+    "h4", "h5", "h6", "br", "hr",
+}
+
+_SKIP_TAGS = {  # elements whose entire subtree the fallback parser drops
+    "script", "style", "noscript", "iframe", "svg", "canvas",
+    "nav", "footer", "header", "aside",
+}
+
+_HEADING_TAGS = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}  # tag -> count of "#"
+
+
+class _HTMLToMarkdownParser(HTMLParser):
+    """Minimal HTML → Markdown parser using only stdlib.
+
+    Feed it HTML with feed(), then read the result with get_markdown().
+    Everything inside a _SKIP_TAGS element is dropped. A link is rebuilt when
+    its </a> arrives, from the output produced since the matching <a>.
+    """
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self._parts: list[str] = []  # markdown fragments, joined by get_markdown()
+        self._skip_depth = 0  # > 0 while inside a _SKIP_TAGS element
+        self._in_pre = False
+        self._list_stack: list[str] = []  # "ul"/"ol" for each open list
+        # One (index into _parts where the link text starts, href) per open <a>.
+        self._link_stack: list[tuple[int, str]] = []
+
+    def handle_starttag(self, tag: str, attrs: list) -> None:
+        """Emit the markdown that opens *tag*; skipped subtrees emit nothing."""
+        tag = tag.lower()
+
+        if self._skip_depth > 0:
+            if tag in _SKIP_TAGS:
+                self._skip_depth += 1
+            return
+
+        if tag in _SKIP_TAGS:
+            self._skip_depth += 1
+            return
+
+        attrs_dict = dict(attrs)
+
+        if tag in _HEADING_TAGS:
+            level = _HEADING_TAGS[tag]
+            self._parts.append(f"\n\n{'#' * level} ")
+
+        elif tag == "p":
+            self._parts.append("\n\n")
+
+        elif tag == "br":
+            self._parts.append("  \n")
+
+        elif tag == "hr":
+            self._parts.append("\n\n---\n\n")
+
+        elif tag == "pre":
+            self._in_pre = True
+            self._parts.append("\n\n```\n")
+
+        elif tag == "code" and not self._in_pre:
+            self._parts.append("`")
+
+        elif tag in ("strong", "b"):
+            self._parts.append("**")
+
+        elif tag in ("em", "i"):
+            self._parts.append("*")
+
+        elif tag == "a":
+            # Only record where the link text begins; handle_endtag builds the
+            # markdown link. A valueless href attribute arrives as None.
+            self._link_stack.append((len(self._parts), attrs_dict.get("href") or ""))
+
+        elif tag == "img":
+            # Handle lazy-loaded images: prefer data-src over src
+            src = attrs_dict.get("data-src") or attrs_dict.get("src") or ""
+            alt = attrs_dict.get("alt") or ""
+            self._parts.append(f"\n\n![{alt}]({src})\n\n")
+
+        elif tag == "ul":
+            self._list_stack.append("ul")
+            self._parts.append("\n")
+
+        elif tag == "ol":
+            self._list_stack.append("ol")
+            self._parts.append("\n")
+
+        elif tag == "li":
+            prefix = "-" if (not self._list_stack or self._list_stack[-1] == "ul") else "1."
+            self._parts.append(f"\n{prefix} ")
+
+        elif tag in ("blockquote",):
+            self._parts.append("\n\n> ")
+
+        elif tag in ("th", "td"):
+            self._parts.append("| ")
+
+        elif tag == "tr":
+            self._parts.append("\n")
+
+    def handle_endtag(self, tag: str) -> None:
+        """Emit the markdown that closes *tag* and leave skip mode when due."""
+        tag = tag.lower()
+
+        if self._skip_depth > 0:
+            if tag in _SKIP_TAGS:
+                self._skip_depth -= 1
+            return
+
+        if tag in _HEADING_TAGS:
+            self._parts.append("\n\n")
+
+        elif tag == "p":
+            self._parts.append("\n\n")
+
+        elif tag == "pre":
+            self._in_pre = False
+            self._parts.append("\n```\n\n")
+
+        elif tag == "code" and not self._in_pre:
+            self._parts.append("`")
+
+        elif tag in ("strong", "b"):
+            self._parts.append("**")
+
+        elif tag in ("em", "i"):
+            self._parts.append("*")
+
+        elif tag == "a":
+            # A stray </a> with no open link is ignored; it must not swallow
+            # the output produced so far.
+            if self._link_stack:
+                start, href = self._link_stack.pop()
+                link_text = "".join(self._parts[start:]).strip()
+                del self._parts[start:]
+                self._parts.append(f"[{link_text}]({href})")
+
+        elif tag in ("ul", "ol"):
+            if self._list_stack:
+                self._list_stack.pop()
+            self._parts.append("\n")
+
+    def handle_data(self, data: str) -> None:
+        """Append text verbatim unless it lies inside a skipped element."""
+        if self._skip_depth > 0:
+            return
+        self._parts.append(data)
+
+    def get_markdown(self) -> str:
+        """Return the accumulated markdown, trimmed and with blank runs collapsed."""
+        # close() makes HTMLParser deliver text it is still buffering, e.g. a
+        # trailing "AT&T" that feed() holds back as a possibly incomplete charref.
+        self.close()
+        raw = "".join(self._parts)
+        # Collapse 3+ consecutive newlines to 2
+        raw = re.sub(r"\n{3,}", "\n\n", raw)
+        return raw.strip()
+
+
+def _stdlib_html_to_markdown(html: str) -> str:
+    """Run the stdlib-only fallback parser over *html* and return its markdown."""
+    converter = _HTMLToMarkdownParser()
+    converter.feed(html)
+    return converter.get_markdown()
+
+
+# ---------------------------------------------------------------------------
+# Public function
+# ---------------------------------------------------------------------------
+
+
+def html_to_markdown(html: str) -> str:
+    """Convert an HTML page to markdown.
+
+    The conversion is a three-stage pipeline:
+
+    1. _preprocess_html rewrites the raw markup (hidden WeChat js_content
+       wrapper, lazy-loaded image sources).
+    2. _extract_main_content reduces the page to its main content with
+       readabilipy when that package can be imported; otherwise the page
+       is passed on unchanged.
+    3. _convert_to_markdown renders the result with markdownify when it
+       can be imported; otherwise with the stdlib-only fallback parser.
+
+    Args:
+        html: Full HTML of the page.
+
+    Returns:
+        The page content as markdown.
+    """
+    stages = (
+        _preprocess_html,  # step 1
+        _extract_main_content,  # step 2
+        _convert_to_markdown,  # step 3
+    )
+    current = html
+    for stage in stages:
+        # Each stage takes the previous stage's output as its only argument.
+        current = stage(current)
+    return current
+
+
+def _preprocess_html(html: str) -> str:
+    """Rewrite raw HTML so hidden or lazy-loaded content survives conversion.
+
+    - Drops display:none from the WeChat js_content wrapper <div>.
+    - Gives every <img> carrying a non-empty data-src a real src with that URL.
+    """
+    # WeChat js_content: replace hidden wrapper divs
+    html = re.sub(
+        r'<div[^>]*id=["\']js_content["\'][^>]*style=["\'][^"\']*display\s*:\s*none[^"\']*["\'][^>]*>',
+        '<div id="js_content">',
+        html,
+        flags=re.IGNORECASE,
+    )
+    # (?<![\w-]) keeps "src=" from matching the tail of "data-src=": a plain \b
+    # matches there, so images without a real src never received one.
+    src_attr = re.compile(r'(?<![\w-])src=["\'][^"\']*["\']', re.IGNORECASE)
+
+    def replace_lazy_src(m: re.Match) -> str:
+        tag = m.group(0)
+        lazy = re.search(r'(?<![\w-])data-src=["\']([^"\']+)["\']', tag, re.IGNORECASE)
+        if not lazy:
+            return tag
+        new_src = f'src="{lazy.group(1)}"'
+        if src_attr.search(tag):
+            # A callable replacement keeps backslashes in the URL literal.
+            return src_attr.sub(lambda _: new_src, tag)
+        return f"{tag[:4]} {new_src}{tag[4:]}"  # tag[:4] is "<img" in its original case
+
+    return re.sub(r"<img[^>]+>", replace_lazy_src, html, flags=re.IGNORECASE)
+
+
+def _extract_main_content(html: str) -> str:
+    """Return readabilipy's main-content HTML, or *html* itself as the fallback."""
+    extracted = None
+    try:
+        from readabilipy import simple_json_from_html_string as extract  # type: ignore
+        extracted = extract(html, use_readability=True).get("content")
+    except ImportError:
+        pass  # optional dependency missing: fall through to the untouched input
+    return extracted or html
+
+
+def _convert_to_markdown(html: str) -> str:
+    """Render *html* as markdown with markdownify, or with the stdlib parser as fallback."""
+    try:
+        import markdownify as md  # type: ignore
+
+        # Options: ATX headings; script/style handed to markdownify's strip list.
+        options = {"heading_style": "ATX", "strip": ["script", "style"]}
+        rendered = md.markdownify(html, **options)
+    except ImportError:
+        # markdownify is optional; the fallback needs no third-party package.
+        rendered = _stdlib_html_to_markdown(html)
+    return rendered
diff --git a/python/functions/core/html_to_markdown_test.py b/python/functions/core/html_to_markdown_test.py
new file mode 100644
index 00000000..e477985b
--- /dev/null
+++ b/python/functions/core/html_to_markdown_test.py
@@ -0,0 +1,90 @@
+"""Tests para html_to_markdown."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from core.html_to_markdown import html_to_markdown, _preprocess_html
+
+
+def test_html_con_nav_y_footer_filtra_boilerplate():
+    """Page with nav/footer boilerplate: the main heading and paragraph survive conversion."""
+    page = """
+    <html>
+      <body>
+        <nav><a href="/">Home</a><a href="/about">About</a></nav>
+        <main>
+          <h1>Titulo principal</h1>
+          <p>Este es el contenido relevante del articulo.</p>
+        </main>
+        <footer><p>Copyright 2026</p></footer>
+      </body>
+    </html>
+    """
+    markdown = html_to_markdown(page)
+    for expected in ("Titulo principal", "contenido relevante"):
+        assert expected in markdown
+
+
+def test_html_limpio_se_convierte_correctamente():
+    """Clean HTML without boilerplate: heading and paragraph text reach the output."""
+    page = """
+    <html>
+      <body>
+        <h1>Hello World</h1>
+        <p>Parrafo de prueba con <strong>texto en negrita</strong>.</p>
+        <h2>Seccion dos</h2>
+        <p>Mas contenido aqui.</p>
+      </body>
+    </html>
+    """
+    markdown = html_to_markdown(page)
+    expected = ("Hello World", "Parrafo de prueba", "Seccion dos")
+    for fragment in expected:
+        assert fragment in markdown
+
+
+def test_html_con_imagenes_lazy_loaded():
+    """Lazy-loaded image: the data-src URL must appear after preprocessing and in the output."""
+    page = """
+    <html>
+      <body>
+        <p>Articulo con imagen</p>
+        <img src="placeholder.gif" data-src="imagen-real.jpg" alt="foto real" />
+      </body>
+    </html>
+    """
+    real_url = "imagen-real.jpg"
+    # The preprocessing step alone must already expose the real URL...
+    assert real_url in _preprocess_html(page)
+    # ...and so must the final markdown.
+    markdown = html_to_markdown(page)
+    assert real_url in markdown
+
+
+def test_preprocess_lazy_loading_reemplaza_src():
+    """_preprocess_html overwrites the placeholder src with the data-src URL."""
+    tag = '<img src="placeholder.gif" data-src="real.jpg" alt="x" />'
+    rewritten = _preprocess_html(tag)
+    assert 'src="real.jpg"' in rewritten
+
+
+def test_preprocess_lazy_loading_sin_src_anade_src():
+    """_preprocess_html is expected to add a src when the image only has data-src."""
+    rewritten = _preprocess_html('<img data-src="real.jpg" alt="foto" />')
+    # NOTE(review): this substring also occurs inside data-src="real.jpg", so the check is weak.
+    assert 'src="real.jpg"' in rewritten
+
+
+def test_html_vacio_retorna_string():
+    """Empty input is converted without raising and yields a str."""
+    converted = html_to_markdown("")
+    assert isinstance(converted, str)
+
+
+def test_html_solo_texto():
+    """A lone paragraph converts without error and keeps its text."""
+    snippet = "<p>Solo texto</p>"
+    markdown = html_to_markdown(snippet)
+    assert "Solo texto" in markdown
diff --git a/python/functions/core/is_git_repo_url.md b/python/functions/core/is_git_repo_url.md
new file mode 100644
index 00000000..8cb04574
--- /dev/null
+++ b/python/functions/core/is_git_repo_url.md
@@ -0,0 +1,48 @@
+---
+name: is_git_repo_url
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def is_git_repo_url(url: str, known_hosts: list[str] | None = None) -> bool"
+description: "Verifica si una URL apunta a un repositorio git clonable. Acepta org/repo y org/repo/tree/<ref>. Rechaza issues, blobs, PRs y otros sub-recursos."
+tags: [git, url, validation, github, gitlab, repository]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [urllib.parse]
+tested: true
+tests:
+  - "URL repo valida"
+  - "URL de issue (False)"
+  - "URL de blob/file (False)"
+  - "URL con tree/branch (True)"
+test_file_path: "python/functions/core/parse_git_url_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+is_git_repo_url("https://github.com/psf/requests")
+# True
+
+is_git_repo_url("https://github.com/psf/requests/issues/123")
+# False
+
+is_git_repo_url("https://github.com/psf/requests/blob/main/README.md")
+# False
+
+is_git_repo_url("https://github.com/psf/requests/tree/main")
+# True
+
+is_git_repo_url("git@github.com:psf/requests.git")
+# True
+```
+
+## Notas
+
+Funcion pura. Para SSH y git:// se acepta cualquier path siempre que el host sea conocido (los protocolos de clonacion no navegan a sub-recursos). Para HTTP/HTTPS se exige exactamente 2 segmentos (org/repo) o 4 segmentos con `tree` en posicion 3.
diff --git a/python/functions/core/join_by_key.md b/python/functions/core/join_by_key.md
new file mode 100644
index 00000000..ddeb66bf
--- /dev/null
+++ b/python/functions/core/join_by_key.md
@@ -0,0 +1,47 @@
+---
+name: join_by_key
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def join_by_key(left: list[dict], right: list[dict], key: str, how: str = 'inner') -> list[dict]"
+description: "Join de dos listas de dicts por una clave comun. Soporta inner, left, right y outer. Campos duplicados del right se sufijan con _right. Algoritmo O(n+m)."
+tags: [tabular, join, merge, python, core]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "Inner join solo matches"
+  - "Left join todos los left con None para right sin match"
+  - "Right join"
+  - "Outer join"
+  - "Campos duplicados con sufijo _right"
+  - "Key ausente en alguna fila"
+test_file_path: "python/functions/core/join_by_key_test.py"
+file_path: "python/functions/core/join_by_key.py"
+---
+
+## Ejemplo
+
+```python
+left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+right = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
+
+join_by_key(left, right, key="id", how="inner")
+# [{"id": 1, "name": "Alice", "dept": "eng"}]
+
+join_by_key(left, right, key="id", how="left")
+# [{"id": 1, "name": "Alice", "dept": "eng"},
+#  {"id": 2, "name": "Bob", "dept": None}]
+```
+
+## Notas
+
+Funcion pura sin dependencias externas.
+El algoritmo indexa right en O(n) y luego itera left en O(m), total O(n+m).
+Los campos de right que colisionan con campos de left (excepto la clave) se renombran con sufijo _right.
diff --git a/python/functions/core/join_by_key.py b/python/functions/core/join_by_key.py
new file mode 100644
index 00000000..a726ee2b
--- /dev/null
+++ b/python/functions/core/join_by_key.py
@@ -0,0 +1,95 @@
+"""Join de dos tablas tabulares por una clave comun."""
+
+
+def join_by_key(
+    left: list[dict],
+    right: list[dict],
+    key: str,
+    how: str = "inner",
+) -> list[dict]:
+    """Join two lists of dicts on a shared key field.
+
+    Matching rows are merged into one dict: left fields first, then right
+    fields. A right field (other than the key) whose name also occurs in any
+    left row is stored as ``<name>_right`` so the left value is preserved.
+    Rows that lack the key are treated as having the key value ``None``.
+
+    The right table is indexed once and each left row does a single lookup,
+    so the cost is linear in the input fields plus the size of the output.
+
+    Args:
+        left: Rows of the left-hand table.
+        right: Rows of the right-hand table.
+        key: Name of the field to join on.
+        how: ``"inner"``, ``"left"``, ``"right"`` or ``"outer"``. Any other
+            value keeps matched rows only, exactly like ``"inner"``.
+
+    Returns:
+        Matched rows in left order, followed (for ``right``/``outer``) by
+        the unmatched right rows. Unmatched left rows get ``None`` for every
+        right column; unmatched right rows get ``None`` for every left column.
+
+    Example:
+        >>> join_by_key([{"id": 1, "n": "a"}], [{"id": 2, "d": "x"}], "id", "outer")
+        [{'id': 1, 'n': 'a', 'd': None}, {'id': 2, 'n': None, 'd': 'x'}]
+    """
+    # Index the right rows by key value so each left row needs one lookup.
+    right_index: dict[object, list[dict]] = {}
+    for row in right:
+        right_index.setdefault(row.get(key), []).append(row)
+
+    # Every column seen in any left row, in first-seen order, mapped to None.
+    left_blank: dict = dict.fromkeys(col for row in left for col in row)
+
+    # Output name of each non-key right column ("_right" suffix on collision).
+    right_names: dict = {}
+    for row in right:
+        for col in row:
+            if col == key or col in right_names:
+                continue
+            right_names[col] = f"{col}_right" if col in left_blank else col
+
+    # Built once; rebuilding it per unmatched row would make the join quadratic.
+    right_blank: dict = dict.fromkeys(right_names.values())
+
+    def _right_fields(r_row: dict) -> dict:
+        """Return r_row without the key, under its output column names."""
+        return {right_names[col]: val for col, val in r_row.items() if col != key}
+
+    def _merge(l_row: dict, r_row: dict) -> dict:
+        """Combine a matching pair; the key field is taken from the left row."""
+        merged = dict(l_row)
+        merged.update(_right_fields(r_row))
+        return merged
+
+    def _right_only(r_row: dict) -> dict:
+        """Build the output row for a right row with no partner on the left."""
+        # Start from all-None left columns so the row exposes the left schema,
+        # which is what the Returns section promises for right/outer joins.
+        padded = dict(left_blank)
+        padded.update(right_blank)
+        padded[key] = r_row.get(key)
+        padded.update(_right_fields(r_row))
+        return padded
+
+    keep_left = how in ("left", "outer")
+    keep_right = how in ("right", "outer")
+    matched_keys: set = set()
+    output: list[dict] = []
+
+    for l_row in left:
+        value = l_row.get(key)
+        r_rows = right_index.get(value)
+        if r_rows:
+            matched_keys.add(value)
+            output.extend(_merge(l_row, r_row) for r_row in r_rows)
+        elif keep_left:
+            # No partner: keep the left row and null out the right columns.
+            output.append({**l_row, **right_blank})
+
+    if keep_right:
+        output.extend(
+            _right_only(r) for r in right if r.get(key) not in matched_keys
+        )
+
+    return output
diff --git a/python/functions/core/join_by_key_test.py b/python/functions/core/join_by_key_test.py
new file mode 100644
index 00000000..909ee483
--- /dev/null
+++ b/python/functions/core/join_by_key_test.py
@@ -0,0 +1,72 @@
+"""Tests para join_by_key."""
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(__file__))
+
+from join_by_key import join_by_key
+
+
+def test_inner_join_solo_matches():
+    """Inner join returns only the rows whose key exists on both sides."""
+    left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+    right = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
+    result = join_by_key(left, right, key="id", how="inner")
+    assert len(result) == 1
+    assert result[0]["id"] == 1
+    assert result[0]["name"] == "Alice"
+    assert result[0]["dept"] == "eng"
+
+
+def test_left_join_todos_los_left_con_none_para_right_sin_match():
+    """Left join keeps every left row; an unmatched row gets None for right fields."""
+    left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+    right = [{"id": 1, "dept": "eng"}]
+    result = join_by_key(left, right, key="id", how="left")
+    assert len(result) == 2
+    alice = next(r for r in result if r["id"] == 1)
+    bob = next(r for r in result if r["id"] == 2)
+    assert alice["dept"] == "eng"
+    assert bob["dept"] is None
+
+
+def test_right_join():
+    """Right join keeps every right row, whether or not it has a left match."""
+    left = [{"id": 1, "name": "Alice"}]
+    right = [{"id": 1, "dept": "eng"}, {"id": 2, "dept": "sales"}]
+    result = join_by_key(left, right, key="id", how="right")
+    assert len(result) == 2
+    eng = next(r for r in result if r["id"] == 1)
+    sales = next(r for r in result if r["id"] == 2)
+    assert eng["name"] == "Alice"
+    assert sales.get("name") is None  # holds both when "name" is None and when it is absent
+
+
+def test_outer_join():
+    """Outer join yields the key values of both sides."""
+    left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+    right = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
+    result = join_by_key(left, right, key="id", how="outer")
+    ids = {r["id"] for r in result}
+    assert ids == {1, 2, 3}
+
+
+def test_campos_duplicados_con_sufijo_right():
+    """A non-key field present on both sides keeps the left value and adds <name>_right."""
+    left = [{"id": 1, "name": "Alice", "score": 90}]
+    right = [{"id": 1, "score": 85, "dept": "eng"}]
+    result = join_by_key(left, right, key="id", how="inner")
+    assert len(result) == 1
+    assert result[0]["score"] == 90
+    assert result[0]["score_right"] == 85
+    assert result[0]["dept"] == "eng"
+
+
+def test_key_ausente_en_alguna_fila():
+    """A left row without the key field is left out of an inner join."""
+    left = [{"id": 1, "name": "Alice"}, {"name": "Bob"}]  # Bob has no id
+    right = [{"id": 1, "dept": "eng"}]
+    result = join_by_key(left, right, key="id", how="inner")
+    # Only Alice matches
+    assert len(result) == 1
+    assert result[0]["name"] == "Alice"
diff --git a/python/functions/core/list_to_tree.md b/python/functions/core/list_to_tree.md
new file mode 100644
index 00000000..640ffb13
--- /dev/null
+++ b/python/functions/core/list_to_tree.md
@@ -0,0 +1,41 @@
+---
+name: list_to_tree
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def list_to_tree(data: list[dict]) -> list[dict]"
+description: "Convierte lista plana con codigos de estructura ('1.2.3') a arbol jerarquico anidado."
+tags: [tree, hierarchy, structure, conversion]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+flat = [
+    {"structure": "1", "title": "Intro", "start_index": 1, "end_index": 5},
+    {"structure": "1.1", "title": "Background", "start_index": 1, "end_index": 3},
+    {"structure": "1.2", "title": "Scope", "start_index": 3, "end_index": 5},
+    {"structure": "2", "title": "Methods", "start_index": 5, "end_index": 10},
+]
+tree = list_to_tree(flat)
+# [{"title": "Intro", "nodes": [{"title": "Background"}, {"title": "Scope"}]}, {"title": "Methods"}]
+```
+
+## Notas
+
+Funcion pura. Cada item necesita campo 'structure' con codigo jerarquico separado por puntos. Nodos huerfanos se promueven a raiz.
diff --git a/python/functions/core/llm_acompletion_retry.md b/python/functions/core/llm_acompletion_retry.md
new file mode 100644
index 00000000..a7f9834d
--- /dev/null
+++ b/python/functions/core/llm_acompletion_retry.md
@@ -0,0 +1,40 @@
+---
+name: llm_acompletion_retry
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10, temperature: float = 0) -> str"
+description: "Completion LLM asincrono con retry automatico. Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
+tags: [llm, completion, retry, async, litellm, api]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [litellm, asyncio, logging]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/llm_acompletion_retry.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+import asyncio
+
+async def main():
+    response = await llm_acompletion_retry("gpt-4o", "Summarize this text: ...")
+    print(response)
+
+asyncio.run(main())
+```
+
+## Notas
+
+Requiere `pip install litellm`. Version async de llm_completion_retry. Usa asyncio.sleep entre retries. Ideal para procesar multiples prompts en paralelo con asyncio.gather.
diff --git a/python/functions/core/llm_acompletion_retry.py b/python/functions/core/llm_acompletion_retry.py
new file mode 100644
index 00000000..f1032cbe
--- /dev/null
+++ b/python/functions/core/llm_acompletion_retry.py
@@ -0,0 +1,43 @@
+"""Async LLM completion with retry logic via litellm. Supports 100+ models."""
+
+import asyncio
+import logging
+
+import litellm
+
+litellm.drop_params = True
+
+
+async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10,
+                                temperature: float = 0) -> str:
+    """Asynchronous LLM completion with retry. Multi-model support via litellm.
+
+    Args:
+        model: Model name (e.g. 'gpt-4o', 'claude-sonnet-4-20250514'). Strips 'litellm/' prefix.
+        prompt: User prompt text.
+        max_retries: Max number of attempts; sleeps one second between failed attempts.
+        temperature: Sampling temperature.
+
+    Returns:
+        str: Response content. Empty string if no attempt succeeds (including max_retries <= 0).
+    """
+    if model:
+        model = model.removeprefix("litellm/")
+
+    messages = [{"role": "user", "content": prompt}]
+
+    for i in range(max_retries):
+        try:
+            response = await litellm.acompletion(
+                model=model,
+                messages=messages,
+                temperature=temperature,
+            )
+            return response.choices[0].message.content
+        except Exception as e:
+            logging.error(f"Async LLM completion error (attempt {i+1}/{max_retries}): {e}")
+            if i < max_retries - 1:
+                await asyncio.sleep(1)
+    # Reached when every attempt failed, or when max_retries <= 0 and the loop never ran.
+    logging.error(f"Max retries reached for model={model}")
+    return ""
diff --git a/python/functions/core/llm_completion_retry.md b/python/functions/core/llm_completion_retry.md
new file mode 100644
index 00000000..a7bae27d
--- /dev/null
+++ b/python/functions/core/llm_completion_retry.md
@@ -0,0 +1,43 @@
+---
+name: llm_completion_retry
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def llm_completion_retry(model: str, prompt: str, chat_history: list = None, return_finish_reason: bool = False, max_retries: int = 10, temperature: float = 0) -> str | tuple[str, str]"
+description: "Completion LLM sincrono con retry automatico (max 10). Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
+tags: [llm, completion, retry, litellm, api]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [litellm, logging, time]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/llm_completion_retry.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+response = llm_completion_retry("gpt-4o", "Explain quantum computing in one sentence")
+# "Quantum computing uses quantum bits..."
+
+# Con historial de chat
+history = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
+response = llm_completion_retry("claude-sonnet-4-20250514", "What's 2+2?", chat_history=history)
+
+# Con finish reason
+content, reason = llm_completion_retry("gpt-4o", "...", return_finish_reason=True)
+# reason: "finished" | "max_output_reached" | "error"
+```
+
+## Notas
+
+Requiere `pip install litellm`. Soporta 100+ modelos via litellm. Retry con sleep(1) entre intentos. Si todos los intentos fallan retorna string vacio, o la tupla `("", "error")` cuando `return_finish_reason=True`.
diff --git a/python/functions/core/llm_completion_retry.py b/python/functions/core/llm_completion_retry.py
new file mode 100644
index 00000000..48072ccb
--- /dev/null
+++ b/python/functions/core/llm_completion_retry.py
@@ -0,0 +1,52 @@
+"""LLM completion with retry logic via litellm. Supports 100+ models."""
+
+import logging
+import time
+
+import litellm
+
+litellm.drop_params = True
+
+
+def llm_completion_retry(model: str, prompt: str, chat_history: list = None,
+                         return_finish_reason: bool = False, max_retries: int = 10,
+                         temperature: float = 0):
+    """Synchronous LLM completion with retry. Multi-model support via litellm.
+
+    Args:
+        model: Model name (e.g. 'gpt-4o', 'claude-sonnet-4-20250514'). Strips 'litellm/' prefix.
+        prompt: User prompt text.
+        chat_history: Optional list of prior messages [{"role": ..., "content": ...}]; it is copied, not mutated.
+        return_finish_reason: If True, returns (content, reason); reason is "finished", "max_output_reached" or "error".
+        max_retries: Max number of attempts; sleeps one second between failed attempts.
+        temperature: Sampling temperature.
+
+    Returns:
+        str or (str, str): Response content, optionally with finish reason. "" (or ("", "error")) if no attempt succeeds.
+    """
+    if model:
+        model = model.removeprefix("litellm/")
+
+    messages = list(chat_history or []) + [{"role": "user", "content": prompt}]
+
+    for i in range(max_retries):
+        try:
+            response = litellm.completion(
+                model=model,
+                messages=messages,
+                temperature=temperature,
+            )
+            content = response.choices[0].message.content
+            if return_finish_reason:
+                reason = "max_output_reached" if response.choices[0].finish_reason == "length" else "finished"
+                return content, reason
+            return content
+        except Exception as e:
+            logging.error(f"LLM completion error (attempt {i+1}/{max_retries}): {e}")
+            if i < max_retries - 1:
+                time.sleep(1)
+    # Reached when every attempt failed, or when max_retries <= 0 and the loop never ran.
+    logging.error(f"Max retries reached for model={model}")
+    if return_finish_reason:
+        return "", "error"
+    return ""
diff --git a/python/functions/core/load_translations.md b/python/functions/core/load_translations.md
new file mode 100644
index 00000000..5b31ab05
--- /dev/null
+++ b/python/functions/core/load_translations.md
@@ -0,0 +1,43 @@
+---
+name: load_translations
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def load_translations(locales_dir: str) -> dict[str, dict]"
+description: "Carga todos los archivos JSON de un directorio de locales. Cada archivo {locale}.json se indexa por nombre sin extension. Retorna {} si el directorio no existe o esta vacio."
+tags: [i18n, translation, locale, json, files]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [json, os]
+tested: true
+tests: ["carga multiples locales", "directorio inexistente retorna dict vacio", "ignora archivos no json", "locale con estructura anidada"]
+test_file_path: "python/functions/core/load_translations_test.py"
+file_path: "python/functions/core/load_translations.py"
+---
+
+## Ejemplo
+
+```python
+from load_translations import load_translations
+from t import _set_translations, t
+
+# Estructura de archivos:
+# locales/
+#   en.json  →  {"report": {"done": "Done", "sectionStart": "Section: {title}"}}
+#   es.json  →  {"report": {"done": "Listo"}}
+
+translations = load_translations("locales/")
+_set_translations(translations, default_locale="en")
+
+t("report.done", locale="es")
+# → "Listo"
+```
+
+## Notas
+
+Lee el filesystem, por eso es impura. Los errores de JSON malformado se propagan directamente (`json.JSONDecodeError`). Los errores de acceso al directorio se propagan como `OSError`. Companera natural de `t_py_core` — el flujo tipico es: `load_translations` al inicio de la app → `_set_translations` → llamadas a `t` durante la ejecucion. Inspirada conceptualmente en el modulo `locale.py` de MiroFish (AGPL-3.0); reimplementada desde cero.
diff --git a/python/functions/core/load_translations.py b/python/functions/core/load_translations.py
new file mode 100644
index 00000000..69579c5a
--- /dev/null
+++ b/python/functions/core/load_translations.py
@@ -0,0 +1,46 @@
+"""Carga de archivos JSON de un directorio de locales."""
+
+import json
+import os
+
+
+def load_translations(locales_dir: str) -> dict[str, dict]:
+    """Load every ``<locale>.json`` file found directly inside a directory.
+
+    Each file is parsed as JSON and keyed by its name without ``.json``.
+    Names are visited in sorted order and sub-directories are skipped.
+
+    Args:
+        locales_dir: Path of the directory holding the locale JSON files.
+
+    Returns:
+        Mapping ``{locale: translations}``. Empty when the directory does
+        not exist or holds no JSON files.
+
+    Raises:
+        OSError: If the directory or one of its files cannot be read.
+        json.JSONDecodeError: If a JSON file is malformed.
+
+    Example:
+        >>> # locales/en.json = {"greeting": "Hello"}
+        >>> # locales/es.json = {"greeting": "Hola"}
+        >>> translations = load_translations("locales/")
+        >>> translations["en"]["greeting"]
+        'Hello'
+        >>> translations["es"]["greeting"]
+        'Hola'
+    """
+    translations: dict[str, dict] = {}
+
+    if not os.path.isdir(locales_dir):
+        return translations
+
+    # Sorted for a stable result order; a directory named "x.json" is not a locale file.
+    for filename in sorted(os.listdir(locales_dir)):
+        filepath = os.path.join(locales_dir, filename)
+        if not filename.endswith(".json") or os.path.isdir(filepath):
+            continue
+        with open(filepath, encoding="utf-8") as f:
+            translations[filename.removesuffix(".json")] = json.load(f)
+
+    return translations
diff --git a/python/functions/core/load_translations_test.py b/python/functions/core/load_translations_test.py
new file mode 100644
index 00000000..ceeac60c
--- /dev/null
+++ b/python/functions/core/load_translations_test.py
@@ -0,0 +1,80 @@
+"""Tests para load_translations."""
+
+import json
+import os
+import sys
+import tempfile
+import shutil
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from load_translations import load_translations
+
+
+def test_carga_multiples_locales():
+    tmp = tempfile.mkdtemp()
+    try:
+        with open(os.path.join(tmp, "en.json"), "w") as f:
+            json.dump({"greeting": "Hello"}, f)
+        with open(os.path.join(tmp, "es.json"), "w") as f:
+            json.dump({"greeting": "Hola"}, f)
+        # Each file must be indexed under its name without the ".json" extension.
+        result = load_translations(tmp)
+        assert "en" in result, "Debe contener locale 'en'"
+        assert "es" in result, "Debe contener locale 'es'"
+        assert result["en"]["greeting"] == "Hello"
+        assert result["es"]["greeting"] == "Hola"
+    finally:
+        shutil.rmtree(tmp)
+
+
+def test_directorio_inexistente_retorna_dict_vacio():
+    result = load_translations("/tmp/directorio_que_no_existe_xyz_12345")  # path is assumed not to exist
+    assert result == {}, f"Expected {{}}, got {result}"
+
+
+def test_ignora_archivos_no_json():
+    tmp = tempfile.mkdtemp()
+    try:
+        with open(os.path.join(tmp, "en.json"), "w") as f:
+            json.dump({"key": "value"}, f)
+        with open(os.path.join(tmp, "README.md"), "w") as f:
+            f.write("# Locales")
+        with open(os.path.join(tmp, "notes.txt"), "w") as f:
+            f.write("some notes")
+        # Only the .json file may show up; the .md and .txt files must be ignored.
+        result = load_translations(tmp)
+        assert list(result.keys()) == ["en"], f"Expected only 'en', got {list(result.keys())}"
+    finally:
+        shutil.rmtree(tmp)
+
+
+def test_locale_con_estructura_anidada():
+    tmp = tempfile.mkdtemp()
+    try:
+        nested = {"report": {"sectionStart": "Section: {title}", "done": "Done"}}
+        with open(os.path.join(tmp, "en.json"), "w") as f:
+            json.dump(nested, f)
+        # Nested objects must come back intact, placeholders such as {title} included.
+        result = load_translations(tmp)
+        assert result["en"]["report"]["done"] == "Done"
+        assert result["en"]["report"]["sectionStart"] == "Section: {title}"
+    finally:
+        shutil.rmtree(tmp)
+
+
+if __name__ == "__main__":  # lets the file be executed directly as a script
+    test_carga_multiples_locales()
+    print("PASS: carga multiples locales")
+
+    test_directorio_inexistente_retorna_dict_vacio()
+    print("PASS: directorio inexistente retorna dict vacio")
+
+    test_ignora_archivos_no_json()
+    print("PASS: ignora archivos no json")
+
+    test_locale_con_estructura_anidada()
+    print("PASS: locale con estructura anidada")
+    # Only reached when none of the calls above raised AssertionError.
+    print("---")
+    print("All tests passed.")
diff --git a/python/functions/core/merge_entity_attributes.md b/python/functions/core/merge_entity_attributes.md
new file mode 100644
index 00000000..8ae0bf3d
--- /dev/null
+++ b/python/functions/core/merge_entity_attributes.md
@@ -0,0 +1,67 @@
+---
+name: merge_entity_attributes
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def merge_entity_attributes(attr_list: list[dict]) -> dict"
+description: "Combina atributos de multiples candidatos de la misma entidad. Aplica heuristicas de resolucion por tipo de campo: max para numericos, min/max para fechas, union para listas, OR para booleanos, mas largo para strings."
+tags: [merge, entity, attributes, resolution, deduplication, fuzzygraph, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "Atributos complementarios (A tiene full_name, B tiene nationality) -> ambos"
+  - "Atributos conflictivos en risk_score -> max"
+  - "Atributos first_seen conflictivos -> min"
+  - "Todos null -> null"
+  - "Listas -> union sin duplicados"
+  - "Boolean verified -> True si alguno es True"
+  - "String conflictivo -> usar el mas largo"
+  - "Valores iguales -> usar ese valor"
+  - "Un solo candidato -> retorna sus atributos tal cual"
+  - "Lista vacia -> retorna dict vacio"
+  - "last_seen conflictivo -> max (mas reciente)"
+  - "Un candidato tiene null, otro tiene valor -> usar el valor"
+test_file_path: "python/functions/core/merge_entity_attributes_test.py"
+file_path: "python/functions/core/merge_entity_attributes.py"
+---
+
+## Ejemplo
+
+```python
+a = {"risk_score": 3.5, "first_seen": "2022-05-15", "verified": False}
+b = {"risk_score": 7.2, "first_seen": "2023-01-01", "verified": True, "alias": "Alice"}
+
+result = merge_entity_attributes([a, b])
+# {
+#   "risk_score": 7.2,           # max
+#   "first_seen": "2022-05-15",  # min (mas antigua)
+#   "verified": True,            # OR logico
+#   "alias": "Alice"             # solo en b
+# }
+```
+
+## Heuristicas de resolucion
+
+| Campo / tipo | Conflicto | Resolucion |
+|---|---|---|
+| `risk_score`, `balance`, `cvss` | numerico | `max` |
+| `first_seen`, `created_date` | fecha | `min` (mas antigua) |
+| `last_seen`, `expires_date` | fecha | `max` (mas reciente) |
+| `verified`, `exploited` | booleano | `any` (OR logico) |
+| cualquier `list` | lista | union sin duplicados |
+| cualquier `str` u otro | string | el mas largo |
+
+Los campos fuera de las listas conocidas usan la heuristica por tipo Python (`list`, `bool`, luego `str`/otro).
+
+## Notas
+
+Funcion pura. No tiene dependencias externas. Las listas conocidas de campos especiales (`_NUMERIC_FIELDS`, `_DATE_MIN_FIELDS`, etc.) pueden extenderse si el dominio crece.
+
+Disenada originalmente para el grafo de entidades de fuzzygraph, donde multiples fuentes pueden describir la misma entidad con datos complementarios o contradictorios.
diff --git a/python/functions/core/merge_entity_attributes.py b/python/functions/core/merge_entity_attributes.py
new file mode 100644
index 00000000..6c864581
--- /dev/null
+++ b/python/functions/core/merge_entity_attributes.py
@@ -0,0 +1,78 @@
+"""Combina atributos de multiples candidatos de la misma entidad."""
+
+from __future__ import annotations
+
+_NUMERIC_FIELDS = {"risk_score", "balance", "cvss"}
+_DATE_MIN_FIELDS = {"first_seen", "created_date"}
+_DATE_MAX_FIELDS = {"last_seen", "expires_date"}
+_BOOL_FIELDS = {"verified", "exploited"}
+
+
+def merge_entity_attributes(attr_list: list[dict]) -> dict:
+    """Merge the attribute dicts of several candidates for the same entity.
+
+    For every key present in any candidate the non-null values are collected.
+    No value gives None; a single value or full agreement gives that value.
+    Conflicting values are resolved by the first rule that applies:
+    - risk_score, balance, cvss: max
+    - first_seen, created_date: min (earliest, if values sort chronologically)
+    - last_seen, expires_date: max (latest, if values sort chronologically)
+    - verified, exploited, or every value is a bool: logical OR
+    - every value is a list: union without duplicates, in order of appearance
+    - anything else: the value whose str() is longest, returned unchanged
+
+    Args:
+        attr_list: One attribute dict per candidate.
+
+    Returns:
+        Dict with the merged attributes, keys in first-seen order.
+
+    Example:
+        >>> merge_entity_attributes([{"tags": ["a"]}, {"tags": ["a", "b"]}])
+        {'tags': ['a', 'b']}
+    """
+    if not attr_list:
+        return {}
+
+    # dict.fromkeys keeps first-seen order, so the output key order is stable.
+    all_keys = dict.fromkeys(key for attrs in attr_list for key in attrs)
+
+    merged: dict = {}
+
+    for key in all_keys:
+        # Only non-null values take part in the resolution.
+        values = [attrs[key] for attrs in attr_list if attrs.get(key) is not None]
+
+        if not values:
+            merged[key] = None
+            continue
+
+        # A single value or full agreement needs no resolution.
+        if len(values) == 1 or all(v == values[0] for v in values):
+            merged[key] = values[0]
+            continue
+
+        # Resolve the conflict; field-name rules win over type-based rules.
+        if key in _NUMERIC_FIELDS:
+            merged[key] = max(values)
+        elif key in _DATE_MIN_FIELDS:
+            merged[key] = min(values)
+        elif key in _DATE_MAX_FIELDS:
+            merged[key] = max(values)
+        elif key in _BOOL_FIELDS or all(isinstance(v, bool) for v in values):
+            # Also covers boolean fields that are not in the known-name set.
+            merged[key] = any(values)
+        elif all(isinstance(v, list) for v in values):
+            # List scan instead of a set so unhashable items still work.
+            union: list = []
+            for lst in values:
+                for item in lst:
+                    if item not in union:
+                        union.append(item)
+            merged[key] = union
+        else:
+            # Choose by string length but return the value itself, so a
+            # non-string keeps its type instead of becoming its str().
+            merged[key] = max(values, key=lambda v: len(str(v)))
+
+    return merged
diff --git a/python/functions/core/merge_entity_attributes_test.py b/python/functions/core/merge_entity_attributes_test.py
new file mode 100644
index 00000000..2f4811cb
--- /dev/null
+++ b/python/functions/core/merge_entity_attributes_test.py
@@ -0,0 +1,102 @@
+"""Tests para merge_entity_attributes."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from merge_entity_attributes import merge_entity_attributes
+
+
+def test_atributos_complementarios():
+    """Attributes present on only one candidate each (full_name, nationality) are all kept."""
+    with_name = {"full_name": "Alice Smith"}
+    with_nationality = {"nationality": "US"}
+    merged = merge_entity_attributes([with_name, with_nationality])
+    assert merged["full_name"] == "Alice Smith"
+    assert merged["nationality"] == "US"
+
+
+def test_atributos_conflictivos_risk_score_max():
+    """Conflicting risk_score values resolve to the maximum."""
+    low = {"risk_score": 3.5}
+    high = {"risk_score": 7.2}
+    merged = merge_entity_attributes([low, high])
+    assert merged["risk_score"] == 7.2
+
+
+def test_atributos_first_seen_conflictivos_min():
+    """Conflicting first_seen values resolve to the minimum (earliest)."""
+    later = {"first_seen": "2023-01-01"}
+    earlier = {"first_seen": "2022-05-15"}
+    merged = merge_entity_attributes([later, earlier])
+    assert merged["first_seen"] == "2022-05-15"
+
+
+def test_todos_null():
+    """A key that is None in every candidate stays None."""
+    first = {"score": None}
+    second = {"score": None}
+    merged = merge_entity_attributes([first, second])
+    assert merged["score"] is None
+
+
+def test_listas_union_sin_duplicados():
+    """Lists are unioned without duplicates, keeping first-appearance order."""
+    first = {"name_servers": ["ns1.example.com", "ns2.example.com"]}
+    second = {"name_servers": ["ns2.example.com", "ns3.example.com"]}
+    merged = merge_entity_attributes([first, second])
+    # Exact list comparison pins both the de-duplication and the ordering.
+    assert merged["name_servers"] == ["ns1.example.com", "ns2.example.com", "ns3.example.com"]
+
+
+def test_boolean_or():
+    """Boolean verified is True when any candidate has it True."""
+    unverified = {"verified": False}
+    verified = {"verified": True}
+    merged = merge_entity_attributes([unverified, verified])
+    assert merged["verified"] is True
+
+
+def test_string_mas_largo():
+    """Conflicting strings resolve to the longest one."""
+    brief = {"description": "Short desc"}
+    detailed = {"description": "A much longer and more detailed description"}
+    merged = merge_entity_attributes([brief, detailed])
+    assert merged["description"] == "A much longer and more detailed description"
+
+
+def test_valores_iguales():
+    """Identical values across candidates resolve to that value."""
+    first = {"alias": "Alice"}
+    second = {"alias": "Alice"}
+    merged = merge_entity_attributes([first, second])
+    assert merged["alias"] == "Alice"
+
+
+def test_un_solo_candidato():
+    """A single candidate is returned with its attributes unchanged."""
+    only = {"risk_score": 5.0, "verified": True}
+    merged = merge_entity_attributes([only])
+    assert merged == only
+
+
+def test_lista_vacia():
+    """An empty candidate list yields an empty dict."""
+    merged = merge_entity_attributes([])
+    assert merged == {}
+
+
+def test_last_seen_max():
+    """Conflicting last_seen values resolve to the maximum (most recent)."""
+    older = {"last_seen": "2023-12-01"}
+    newer = {"last_seen": "2024-03-15"}
+    merged = merge_entity_attributes([older, newer])
+    assert merged["last_seen"] == "2024-03-15"
+
+
+def test_un_valor_non_null():
+    """When one candidate has None and another a value, the value wins."""
+    missing = {"cvss": None}
+    present = {"cvss": 8.5}
+    merged = merge_entity_attributes([missing, present])
+    assert merged["cvss"] == 8.5
diff --git a/python/functions/core/next_cron_time.md b/python/functions/core/next_cron_time.md
new file mode 100644
index 00000000..fcac2049
--- /dev/null
+++ b/python/functions/core/next_cron_time.md
@@ -0,0 +1,49 @@
+---
+name: next_cron_time
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "next_cron_time(schedule: dict, after: datetime) -> datetime | None"
+description: "Calcula la proxima ejecucion de un cron schedule despues de un tiempo dado. Avanza minuto a minuto saltando campos no coincidentes. Retorna None si no hay match en 366 dias (schedule imposible)."
+tags: [cron, scheduling, time, next, pure]
+uses_functions: [parse_cron_expr_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [datetime]
+tested: true
+tests:
+  - "0 * * * * desde :30 retorna la proxima hora en punto"
+  - "0 9 * * 1-5 desde viernes retorna proximo lunes a las 9"
+  - "schedule imposible retorna None"
+  - "paso */15 desde :05 retorna :15"
+test_file_path: "python/functions/core/next_cron_time_test.py"
+file_path: "python/functions/core/next_cron_time.py"
+---
+
+## Ejemplo
+
+```python
+from parse_cron_expr import parse_cron_expr
+from next_cron_time import next_cron_time
+from datetime import datetime
+
+sched = parse_cron_expr("0 * * * *")
+after = datetime(2024, 1, 15, 14, 30)
+nxt = next_cron_time(sched, after)
+# nxt = datetime(2024, 1, 15, 15, 0)
+
+weekdays = parse_cron_expr("0 9 * * 1-5")
+friday = datetime(2024, 1, 19, 10, 0)  # viernes
+nxt2 = next_cron_time(weekdays, friday)
+# nxt2 = datetime(2024, 1, 22, 9, 0)  # lunes
+```
+
+## Notas
+
+Misma semantica que la version Go (next_cron_time_go_core). Usa AND para day_of_month y day_of_week.
+El campo day_of_week usa la convencion cron (0=domingo) y se convierte internamente a la de weekday() de Python (0=lunes).
+Retorna None en lugar de lanzar excepcion para mantener purity (sin side effects).
diff --git a/python/functions/core/next_cron_time.py b/python/functions/core/next_cron_time.py
new file mode 100644
index 00000000..63d09fa8
--- /dev/null
+++ b/python/functions/core/next_cron_time.py
@@ -0,0 +1,105 @@
+"""Calcula la proxima ejecucion de un cron schedule."""
+
+from __future__ import annotations
+
+from datetime import datetime, timedelta
+
+
+def next_cron_time(schedule: dict, after: datetime) -> datetime | None:
+    """Return the first time strictly after `after` that matches a cron schedule.
+
+    The candidate starts at the minute following `after`. Whenever a field does
+    not match, it jumps to the next value that could match (next listed month,
+    next day, next listed hour, next listed minute) instead of testing every
+    minute. day_of_month and day_of_week must both match (AND semantics).
+
+    Args:
+        schedule: Dict with keys minute, hour, day_of_month, month, day_of_week,
+            each a list of allowed integers; day_of_week is cron-style (0=Sunday).
+        after: Reference time; the search starts at the next whole minute.
+
+    Returns:
+        The next matching datetime, or None if nothing matches within 366 days.
+    """
+    minutes: list[int] = schedule["minute"]
+    hours: list[int] = schedule["hour"]
+    doms: list[int] = schedule["day_of_month"]
+    months: list[int] = schedule["month"]
+    weekdays = _python_dows(schedule["day_of_week"])  # loop-invariant: convert once
+
+    # First candidate: the whole minute right after `after`.
+    t = after.replace(second=0, microsecond=0) + timedelta(minutes=1)
+    limit = after + timedelta(days=366)
+
+    while t < limit:
+        if t.month not in months:
+            # Jump to the 1st of the next listed month (possibly next year).
+            t = _next_valid_month(t, months)
+            if t is None:
+                return None
+            continue
+
+        if t.day not in doms or t.weekday() not in weekdays:
+            # Wrong day: retry from midnight of the following day.
+            t = t.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)
+            continue
+
+        if t.hour not in hours:
+            # Next listed hour today, otherwise midnight of the following day.
+            nxt = _next_valid_hour(t, hours)
+            if nxt is None:
+                nxt = t.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)
+            t = nxt
+            continue
+
+        if t.minute not in minutes:
+            # Next listed minute this hour, otherwise the start of the next hour.
+            nxt = _next_valid_minute(t, minutes)
+            if nxt is None:
+                nxt = t.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)
+            t = nxt
+            continue
+
+        return t
+
+    return None
+
+
+def _python_dows(dows: list[int]) -> set[int]:
+    """Translate cron day-of-week numbers into `datetime.weekday()` numbers.
+
+    Cron counts from Sunday (0=Sun ... 6=Sat) while `weekday()` counts from
+    Monday (0=Mon ... 6=Sun), so every value is shifted back by one, modulo 7.
+    A cron value of 7 therefore also maps to Sunday.
+    """
+    return {(cron_day - 1) % 7 for cron_day in dows}
+
+
+def _next_valid_month(t: datetime, months: list[int]) -> datetime | None:
+    """Move `t` to 00:00 on day 1 of the next listed month (wrapping a year), or None."""
+    midnight = dict(day=1, hour=0, minute=0, second=0, microsecond=0)
+    target = next((m for m in months if m > t.month), None)
+    if target is not None:
+        return t.replace(month=target, **midnight)
+    if not months:
+        return None
+    try:  # replace() raises ValueError when the resulting date is out of range
+        return t.replace(year=t.year + 1, month=months[0], **midnight)
+    except ValueError:
+        return None
+
+
+def _next_valid_hour(t: datetime, hours: list[int]) -> datetime | None:
+    """Return `t` at minute 0 of the first listed hour greater than `t.hour`, else None."""
+    target = next((hh for hh in hours if hh > t.hour), None)
+    if target is None:
+        return None
+    return t.replace(hour=target, minute=0, second=0, microsecond=0)
+
+
+def _next_valid_minute(t: datetime, minutes: list[int]) -> datetime | None:
+    """Return `t` at the first listed minute greater than `t.minute`, else None."""
+    target = next((mm for mm in minutes if mm > t.minute), None)
+    if target is None:
+        return None
+    return t.replace(minute=target, second=0, microsecond=0)
diff --git a/python/functions/core/next_cron_time_test.py b/python/functions/core/next_cron_time_test.py
new file mode 100644
index 00000000..fbf64351
--- /dev/null
+++ b/python/functions/core/next_cron_time_test.py
@@ -0,0 +1,41 @@
+"""Tests para next_cron_time."""
+
+from datetime import datetime
+
+from next_cron_time import next_cron_time
+from parse_cron_expr import parse_cron_expr
+
+
+def test_0_star_desde_30_retorna_proxima_hora_en_punto():
+    """`0 * * * *` from :30 returns the next top of the hour."""
+    hourly = parse_cron_expr("0 * * * *")
+    start = datetime(2024, 1, 15, 14, 30)
+    upcoming = next_cron_time(hourly, start)
+    assert upcoming == datetime(2024, 1, 15, 15, 0)
+
+
+def test_0_9_weekdays_desde_viernes_retorna_proximo_lunes():
+    """`0 9 * * 1-5` from a Friday returns the following Monday at 09:00."""
+    weekdays_at_nine = parse_cron_expr("0 9 * * 1-5")
+    # 2024-01-19 is a Friday
+    start = datetime(2024, 1, 19, 10, 0)
+    upcoming = next_cron_time(weekdays_at_nine, start)
+    # 2024-01-22 is a Monday
+    assert upcoming == datetime(2024, 1, 22, 9, 0)
+
+
+def test_schedule_imposible_retorna_none():
+    """An impossible schedule returns None."""
+    # February 31st does not exist, so nothing matches within 366 days
+    feb_31 = parse_cron_expr("0 0 31 2 *")
+    start = datetime(2024, 1, 1)
+    upcoming = next_cron_time(feb_31, start)
+    assert upcoming is None
+
+
+def test_paso_15_desde_05_retorna_15():
+    """Step `*/15` from :05 returns :15."""
+    every_quarter = parse_cron_expr("*/15 * * * *")
+    start = datetime(2024, 3, 10, 8, 5)
+    upcoming = next_cron_time(every_quarter, start)
+    assert upcoming == datetime(2024, 3, 10, 8, 15)
diff --git a/python/functions/core/normalize_entity_name.md b/python/functions/core/normalize_entity_name.md
new file mode 100644
index 00000000..05e563af
--- /dev/null
+++ b/python/functions/core/normalize_entity_name.md
@@ -0,0 +1,77 @@
+---
+name: normalize_entity_name
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def normalize_entity_name(name: str, entity_type: str = \"\") -> str"
+description: "Normaliza el nombre de una entidad para comparacion y deduplicacion. Aplica reglas distintas segun el tipo: ip/email/domain/crypto_wallet/phone usan normalizacion tecnica; person remueve titulos y convierte formato Apellido-Nombre; organization elimina sufijos legales; default aplica lower+strip+colapso de espacios."
+tags: [normalize, entity, name, deduplication, osint, fuzzygraph, nlp, text]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests:
+  - "person strip whitespace"
+  - "person apellido nombre"
+  - "person titulo"
+  - "organization llc"
+  - "organization corp"
+  - "ip sin cambio"
+  - "email lowercase"
+  - "domain www y trailing dot"
+  - "phone formato"
+  - "default lowercase"
+  - "default colapsa espacios"
+  - "crypto wallet case sensitive"
+  - "domain solo trailing dot"
+  - "person prof titulo"
+test_file_path: "python/functions/core/normalize_entity_name_test.py"
+file_path: "python/functions/core/normalize_entity_name.py"
+---
+
+## Ejemplo
+
+```python
+# Persona con titulo y whitespace
+normalize_entity_name("  Dr. Jane Doe  ", "person")   # "Jane Doe"
+
+# Formato apellido-nombre
+normalize_entity_name("Smith, John", "person")          # "John Smith"
+
+# Organizacion con sufijo legal
+normalize_entity_name("Google LLC", "organization")     # "Google"
+normalize_entity_name("ACME Corp.", "organization")     # "Acme"
+
+# Tecnico: dominio con www y trailing dot
+normalize_entity_name("www.example.com.", "domain")     # "example.com"
+
+# Email case-insensitive
+normalize_entity_name("user@GMAIL.com", "email")        # "user@gmail.com"
+
+# Telefono: solo digitos y +
+normalize_entity_name("+1 (555) 123-4567", "phone")     # "+15551234567"
+
+# Crypto wallet es case-sensitive
+normalize_entity_name("1A1zP1eP5QGefi2DMPTfTL5SLmv7Divf", "crypto_wallet")
+# "1A1zP1eP5QGefi2DMPTfTL5SLmv7Divf" (sin cambio de case)
+
+# Default
+normalize_entity_name("  Hello   WORLD  ")              # "hello world"
+```
+
+## Notas
+
+Disenada para el modulo fuzzygraph donde entidades de distintos tipos necesitan normalizacion previa a la comparacion y deduplicacion.
+
+El parametro `entity_type` es case-insensitive. Valores soportados: `ip`, `email`, `domain`, `crypto_wallet`, `phone`, `person`, `organization`. Cualquier otro valor (incluido string vacio) activa el comportamiento default.
+
+Bitcoin y otras crypto addresses son case-sensitive por diseno — el tipo `crypto_wallet` solo aplica strip sin cambiar el case.
+
+Los titulos de persona reconocidos incluyen: Dr, Mr, Mrs, Ms, Miss, Prof, Sr, Jr, Ing, Lic, Gen, Col, Maj, Capt, Sgt, Rev, Hon (con o sin punto final).
+
+Los sufijos legales de organizacion reconocidos incluyen: Inc, LLC, Ltd, Corp, Co, SA, GmbH, BV, NV, PLC, AG, SRL, SL, Pty, LP, LLP, LLLP, PC, PA, PLLC, Foundation, Group, Holdings, Enterprises, International, Industries, Services, Solutions, Systems, Technologies. Tambien se reconocen las formas singulares Enterprise, Service, Solution y System, y las variantes con puntos S.A, B.V, N.V y S.L.
diff --git a/python/functions/core/normalize_entity_name.py b/python/functions/core/normalize_entity_name.py
new file mode 100644
index 00000000..a01b14a9
--- /dev/null
+++ b/python/functions/core/normalize_entity_name.py
@@ -0,0 +1,81 @@
+"""Normaliza el nombre de una entidad para comparacion y deduplicacion."""
+
+import re
+
+
+_TITLES = re.compile(  # leading honorifics; the group repeats so stacked titles ("Prof. Dr.") all go
+    r"^(?:(?:Dr|Mr|Mrs|Ms|Miss|Prof|Sr|Jr|Ing|Lic|Gen|Col|Maj|Capt|Sgt|Rev|Hon)\.?\s+)+",
+    re.IGNORECASE,
+)
+
+_LEGAL_SUFFIXES = re.compile(  # one trailing legal-form word, with an optional final dot
+    r"\b(?:Inc|LLC|Ltd|Corp|Co|S\.?A|GmbH|B\.?V|N\.?V|PLC|AG|SRL|S\.?L|Pty|"
+    r"LP|LLP|LLLP|PC|PA|PLLC|Foundation|Group|Holdings|Enterprises?|"
+    r"International|Industries|Services?|Solutions?|Systems?|Technolog(?:y|ies?))\.?\s*$",
+    re.IGNORECASE,
+)
+
+_MULTI_SPACE = re.compile(r"\s+")  # any run of whitespace
+
+
+def normalize_entity_name(name: str, entity_type: str = "") -> str:
+    """Normalize an entity name for comparison and deduplication.
+
+    Surrounding whitespace is always stripped first. The rest depends on
+    `entity_type`, which is matched case-insensitively:
+    - ip / email: lowercased.
+    - domain: lowercased, trailing dots and a leading "www." removed.
+    - crypto_wallet: returned as is (case is preserved).
+    - phone: everything except digits and "+" removed.
+    - person: leading title removed, "Last, First" reordered to
+      "First Last", whitespace collapsed, title-cased.
+    - organization: trailing legal suffix (and a comma left in front of it)
+      removed, whitespace collapsed, title-cased.
+    - anything else, including "": lowercased with whitespace collapsed.
+
+    Args:
+        name: Entity name to normalize.
+        entity_type: One of ip, email, domain, crypto_wallet, phone, person,
+            organization; any other value selects the default rule.
+
+    Returns:
+        The normalized name.
+    """
+    name = name.strip()
+    kind = entity_type.lower().strip()
+
+    if kind in ("ip", "email"):
+        return name.lower()
+
+    if kind == "domain":
+        host = name.lower().rstrip(".")
+        return host[4:] if host.startswith("www.") else host
+
+    if kind == "crypto_wallet":
+        # Case is significant for these addresses, so nothing beyond the strip.
+        return name
+
+    if kind == "phone":
+        # Keep only digits and the "+" sign.
+        return re.sub(r"[^\d+]", "", name)
+
+    if kind == "person":
+        person = _TITLES.sub("", name).strip()
+        if "," in person:
+            # "Last, First" -> "First Last"; only the first comma splits.
+            last, first = (part.strip() for part in person.split(",", 1))
+            person = f"{first} {last}"
+        return _MULTI_SPACE.sub(" ", person).strip().title()
+
+    if kind == "organization":
+        # "Acme, Inc." must normalize like "Acme Inc", so the comma that
+        # preceded the removed suffix is dropped as well.
+        org = _LEGAL_SUFFIXES.sub("", name).strip().rstrip(",").rstrip()
+        if not org:
+            # The name consisted only of a suffix word (e.g. "Group"): keep
+            # it rather than returning an empty, collision-prone key.
+            org = name
+        return _MULTI_SPACE.sub(" ", org).strip().title()
+
+    # Default: lowercase and collapse whitespace runs.
+    return _MULTI_SPACE.sub(" ", name.lower()).strip()
diff --git a/python/functions/core/normalize_entity_name_test.py b/python/functions/core/normalize_entity_name_test.py
new file mode 100644
index 00000000..d2d9e03e
--- /dev/null
+++ b/python/functions/core/normalize_entity_name_test.py
@@ -0,0 +1,70 @@
+"""Tests para normalize_entity_name."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from normalize_entity_name import normalize_entity_name
+
+
+def test_person_strip_whitespace():
+    assert normalize_entity_name(name="  John Smith  ", entity_type="person") == "John Smith"  # outer spaces dropped
+
+
+def test_person_apellido_nombre():
+    assert normalize_entity_name(name="Smith, John", entity_type="person") == "John Smith"  # "Last, First" reordered
+
+
+def test_person_titulo():
+    assert normalize_entity_name(name="Dr. Jane Doe", entity_type="person") == "Jane Doe"  # title removed
+
+
+def test_organization_llc():
+    assert normalize_entity_name(name="Google LLC", entity_type="organization") == "Google"  # legal suffix removed
+
+
+def test_organization_corp():
+    assert normalize_entity_name(name="ACME Corp.", entity_type="organization") == "Acme"  # suffix removed, title-cased
+
+
+def test_ip_sin_cambio():
+    assert normalize_entity_name(name="192.168.1.1", entity_type="ip") == "192.168.1.1"  # unchanged
+
+
+def test_email_lowercase():
+    assert normalize_entity_name(name="user@GMAIL.com", entity_type="email") == "user@gmail.com"  # lowercased
+
+
+def test_domain_www_y_trailing_dot():
+    assert normalize_entity_name(name="www.example.com.", entity_type="domain") == "example.com"  # www. and final dot removed
+
+
+def test_phone_formato():
+    assert normalize_entity_name(name="+1 (555) 123-4567", entity_type="phone") == "+15551234567"  # digits and + only
+
+
+def test_default_lowercase():
+    assert normalize_entity_name(name="  Hello WORLD  ") == "hello world"  # no entity_type: default rule
+
+
+def test_default_colapsa_espacios():
+    assert normalize_entity_name(name="foo   bar") == "foo bar"  # space run collapsed to one
+
+
+def test_crypto_wallet_case_sensitive():
+    address = "1A1zP1eP5QGefi2DMPTfTL5SLmv7Divf"  # mixed case must survive
+    assert normalize_entity_name(name=address, entity_type="crypto_wallet") == address
+
+
+def test_domain_solo_trailing_dot():
+    assert normalize_entity_name(name="example.com.", entity_type="domain") == "example.com"  # final dot removed
+
+
+def test_person_prof_titulo():
+    assert normalize_entity_name(name="Prof. Alan Turing", entity_type="person") == "Alan Turing"  # title removed
+
+
+if __name__ == "__main__":
+    import pytest  # imported here so it is only required when this file is run as a script
+    pytest.main([__file__, "-v"])
diff --git a/python/functions/core/page_list_to_groups.md b/python/functions/core/page_list_to_groups.md
new file mode 100644
index 00000000..6534f800
--- /dev/null
+++ b/python/functions/core/page_list_to_groups.md
@@ -0,0 +1,37 @@
+---
+name: page_list_to_groups
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def page_list_to_groups(page_contents: list[str], token_lengths: list[int], max_tokens: int = 20000, overlap_pages: int = 1) -> list[str]"
+description: "Agrupa paginas en chunks de texto respetando limite de tokens con overlap configurable entre grupos."
+tags: [chunking, tokens, pagination, grouping]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [math]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/page_index.py"
+---
+
+## Ejemplo
+
+```python
+contents = ["page1 text", "page2 text", "page3 text", "page4 text"]
+tokens = [5000, 6000, 7000, 4000]
+groups = page_list_to_groups(contents, tokens, max_tokens=12000, overlap_pages=1)
+# 2 groups con overlap de 1 pagina entre ellos
+```
+
+## Notas
+
+Funcion pura. Si el total de tokens cabe en max_tokens, retorna un solo grupo. El overlap permite contexto compartido entre chunks adyacentes. Util para procesar documentos largos con LLMs.
diff --git a/python/functions/core/parse_code_ast.md b/python/functions/core/parse_code_ast.md
new file mode 100644
index 00000000..4905ddfd
--- /dev/null
+++ b/python/functions/core/parse_code_ast.md
@@ -0,0 +1,115 @@
+---
+name: parse_code_ast
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def parse_code_ast(source_code: str, language: str) -> list[CodeEntity]"
+description: "Extrae entidades de codigo (funciones, clases, metodos, tipos, interfaces, structs, traits) de codigo fuente usando tree-sitter. Soporta Python, JavaScript, TypeScript, Go, Rust, Java y C++. Opera sobre strings: no accede al disco."
+tags: [ast, parsing, tree-sitter, code, symbol, extraction, analysis, multi-language]
+uses_functions: []
+uses_types: [code_entity_py_core]
+returns: [code_entity_py_core]
+returns_optional: false
+error_type: ""
+imports: [tree-sitter, tree-sitter-python, tree-sitter-go, tree-sitter-javascript, tree-sitter-typescript, tree-sitter-rust, tree-sitter-java, tree-sitter-cpp]
+tested: true
+tests:
+  - "lenguaje no soportado lanza value error"
+  - "lenguaje no soportado lanza value error vacio"
+  - "codigo vacio retorna lista vacia"
+  - "codigo solo espacios retorna lista vacia"
+  - "alias py funciona"
+  - "alias ts funciona"
+  - "lenguajes soportados incluyen go y python"
+  - "python extrae funcion simple"
+  - "python extrae clase con metodos"
+  - "python extrae docstring"
+  - "go extrae funciones y tipos"
+  - "typescript extrae interfaces"
+  - "codigo con errores de sintaxis es tolerante"
+test_file_path: "python/functions/core/parse_code_ast_test.py"
+file_path: "python/functions/core/parse_code_ast.py"
+---
+
+## Ejemplo
+
+```python
+import sys
+sys.path.insert(0, "python/functions/core")
+sys.path.insert(0, "python/types/core")
+
+from parse_code_ast import parse_code_ast
+
+source = '''
+def greet(name: str) -> str:
+    """Saluda al usuario."""
+    return f"Hello, {name}!"
+
+class Animal:
+    def speak(self) -> str:
+        return ""
+'''
+
+entities = parse_code_ast(source, "python")
+for e in entities:
+    print(e.kind, e.name, f"lines {e.start_line}-{e.end_line}")
+    for child in e.children:
+        print("  ->", child.kind, child.name)
+
+# function greet lines 2-4
+# class Animal lines 6-8
+#   -> method speak
+```
+
+## Lenguajes soportados
+
+| Lenguaje | Alias | Grammar pip | Entidades extraidas |
+|---|---|---|---|
+| `python` | `py` | `tree-sitter-python` | function, class, method |
+| `javascript` | `js` | `tree-sitter-javascript` | function, class, method |
+| `typescript` | `ts` | `tree-sitter-typescript` | function, class, method, interface, type |
+| `go` | — | `tree-sitter-go` | function, method, type |
+| `rust` | `rs` | `tree-sitter-rust` | function, struct, class (impl), trait |
+| `java` | — | `tree-sitter-java` | class, method, interface, function (constructor) |
+| `cpp` | `c++`, `cxx` | `tree-sitter-cpp` | function, class, struct, template |
+
+## Algoritmo
+
+1. Normalizar alias de lenguaje (`py` → `python`, `ts` → `typescript`, etc.)
+2. Importar `tree-sitter` y el grammar del lenguaje (falla con ImportError claro si no instalado)
+3. Construir `Language` desde el modulo del grammar (API tree-sitter >= 0.22)
+4. Parsear `source_code.encode("utf-8")` → AST (tree-sitter tolera errores de sintaxis)
+5. Walk recursivo del AST — para cada nodo relevante:
+   - Extraer nombre desde hijo `identifier` / `type_identifier`
+   - Extraer firma: texto del nodo hasta el primer nodo body
+   - Extraer docstring: primer string literal en Python; comentario `//` / `///` / `/** */` previo en otros lenguajes
+   - Para clases/structs/traits/interfaces: extraer metodos como `children`
+6. Retornar lista plana de entidades de nivel superior
+
+## Instalacion de dependencias
+
+```bash
+# Base (obligatorio)
+pip install tree-sitter
+
+# Grammars (instalar solo los que necesites)
+pip install tree-sitter-python
+pip install tree-sitter-go
+pip install tree-sitter-javascript
+pip install tree-sitter-typescript
+pip install tree-sitter-rust
+pip install tree-sitter-java
+pip install tree-sitter-cpp
+```
+
+## Notas
+
+- **Pura sobre strings**: opera exclusivamente sobre `source_code: str`. No accede a disco, red ni estado global. Es determinista: la misma entrada produce siempre la misma salida.
+- **Tolerancia a errores**: tree-sitter parsea incluso codigo con errores de sintaxis, produciendo nodos ERROR en el AST pero continuando el parsing. La funcion retorna lo que pueda extraer.
+- **Imports lazy**: `tree-sitter` y los grammars se importan en el momento de llamar a la funcion, no al importar el modulo. Esto permite importar `parse_code_ast` sin tener tree-sitter instalado.
+- **Firma completa**: la firma incluye decoradores (Python), generic params (Go/TypeScript), visibility modifiers (Rust/Java). Es todo el texto desde el inicio de la definicion hasta el inicio del body.
+- **Children un nivel**: solo se extraen metodos directos de clases. No se hace recursion profunda (un impl de Rust con traits anidados producira children del impl, no sub-children).
+- **API tree-sitter >= 0.22**: usa `Language(mod.language())` en lugar del antiguo `Language.build_library()`. Compatible con los paquetes pip actuales.
+- Reimplementado desde cero. Inspirado conceptualmente en `openviking/parse/parsers/code/ast/` (AGPL-3.0) pero sin copiar codigo.
diff --git a/python/functions/core/parse_code_ast.py b/python/functions/core/parse_code_ast.py
new file mode 100644
index 00000000..9a7edf63
--- /dev/null
+++ b/python/functions/core/parse_code_ast.py
@@ -0,0 +1,384 @@
+"""parse_code_ast — extrae entidades de codigo fuente usando tree-sitter."""
+
+from __future__ import annotations
+
+import sys
+import os
+
+# Make CodeEntity importable from python/types/core/
+_TYPES_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "..", "python", "types", "core")
+_TYPES_DIR = os.path.abspath(_TYPES_DIR)
+if _TYPES_DIR not in sys.path:
+    sys.path.insert(0, _TYPES_DIR)
+
+try:
+    from code_entity import CodeEntity
+except ImportError:
+    # Fallback: define the type locally so this module can still be imported
+    from dataclasses import dataclass, field
+
+    @dataclass
+    class CodeEntity:  # type: ignore[no-redef]
+        kind: str
+        name: str
+        start_line: int
+        end_line: int
+        signature: str
+        docstring: str | None
+        language: str
+        children: list["CodeEntity"] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Supported languages
+# ---------------------------------------------------------------------------
+
+SUPPORTED_LANGUAGES: dict[str, str] = {  # language name -> grammar module to import
+    "python": "tree_sitter_python",
+    "javascript": "tree_sitter_javascript",
+    "typescript": "tree_sitter_typescript",
+    "go": "tree_sitter_go",
+    "rust": "tree_sitter_rust",
+    "java": "tree_sitter_java",
+    "cpp": "tree_sitter_cpp",
+}
+
+# Accepted aliases -> canonical language name
+_ALIASES: dict[str, str] = {
+    "py": "python",
+    "js": "javascript",
+    "ts": "typescript",
+    "rs": "rust",
+    "c++": "cpp",
+    "cxx": "cpp",
+}
+
+# AST node types of interest per language, mapped to the entity kind they yield
+_ENTITY_NODES: dict[str, dict[str, str]] = {
+    "python": {
+        "function_definition": "function",
+        "class_definition": "class",
+        "decorated_definition": "function",  # resolved at runtime
+    },
+    "javascript": {
+        "function_declaration": "function",
+        "function_expression": "function",
+        "arrow_function": "function",
+        "class_declaration": "class",
+        "method_definition": "method",
+    },
+    "typescript": {
+        "function_declaration": "function",
+        "function_expression": "function",
+        "arrow_function": "function",
+        "class_declaration": "class",
+        "method_definition": "method",
+        "interface_declaration": "interface",
+        "type_alias_declaration": "type",
+    },
+    "go": {
+        "function_declaration": "function",
+        "method_declaration": "method",
+        "type_declaration": "type",
+    },
+    "rust": {
+        "function_item": "function",
+        "struct_item": "struct",
+        "impl_item": "class",  # impl block treated as a class with methods
+        "trait_item": "trait",
+    },
+    "java": {
+        "class_declaration": "class",
+        "method_declaration": "method",
+        "interface_declaration": "interface",
+        "constructor_declaration": "function",
+    },
+    "cpp": {
+        "function_definition": "function",
+        "class_specifier": "class",
+        "template_declaration": "function",
+        "struct_specifier": "struct",
+    },
+}
+
+
+# ---------------------------------------------------------------------------
+# Extraction helpers
+# ---------------------------------------------------------------------------
+
+def _node_text(node, source_bytes: bytes) -> str:
+    """Return the node's raw source text decoded as UTF-8 (invalid bytes replaced)."""
+    return source_bytes[slice(node.start_byte, node.end_byte)].decode("utf-8", "replace")
+
+
+def _extract_name(node, source_bytes: bytes, lang: str) -> str:
+    """Return the entity name, or "<anonymous>" when no identifier is found."""
+    for child in node.children:
+        if child.type in ("identifier", "name", "type_identifier", "field_identifier"):
+            return _node_text(child, source_bytes)
+    # No direct identifier: the name lives one level down in a wrapper child
+    # (inner def of a Python decorated_definition, Go type_spec/type_alias,
+    # C++ function_declarator, also reached through a template_declaration).
+    wrappers = {"function_definition", "class_definition", "type_spec",
+                "type_alias", "function_declarator"}
+    nested = (_extract_name(c, source_bytes, lang) for c in node.children if c.type in wrappers)
+    return next(nested, "<anonymous>")
+
+
+def _extract_signature(node, source_bytes: bytes, lang: str) -> str:
+    """Return the signature: the source text from the node start up to its body.
+
+    Wrappers (Python decorated_definition, C++ template_declaration) keep the
+    body inside their inner definition, so that level is searched too and the
+    decorators/template header stay in the signature. Nodes without any body
+    (e.g. type aliases) fall back to their first line.
+    """
+    body_types = {
+        "block", "body", "statement_block", "compound_statement",
+        "declaration_list", "field_declaration_list", "class_body",
+        "enum_body", "struct_body", "impl_body",
+    }
+    inner_defs = {"function_definition", "class_definition", "class_specifier", "struct_specifier"}
+    inner = [g for c in node.children if c.type in inner_defs for g in c.children]
+    # Direct children are listed first, so a node's own body always wins.
+    body = next((c for c in [*node.children, *inner] if c.type in body_types), None)
+    if body is None:
+        return _node_text(node, source_bytes).split("\n")[0].strip()
+    return source_bytes[node.start_byte:body.start_byte].decode("utf-8", errors="replace").strip()
+
+
+def _extract_docstring_python(node, source_bytes: bytes) -> str | None:
+    """Return the docstring (first string literal of the body) or None."""
+    for child in node.children:
+        if child.type in ("block", "body"):
+            for stmt in child.children:
+                if stmt.type == "expression_statement":
+                    for inner in stmt.children:
+                        if inner.type in ("string", "concatenated_string"):
+                            # Drop string prefixes (r, u, b, f) so raw docstrings unquote too.
+                            raw = _node_text(inner, source_bytes).strip().lstrip("rRuUbBfF")
+                            for q in ('"""', "'''", '"', "'"):
+                                # >= so an empty literal ('""""""') also loses its quotes.
+                                if raw.startswith(q) and raw.endswith(q) and len(raw) >= 2 * len(q):
+                                    return raw[len(q):-len(q)].strip()
+                            return raw
+                # Only the first real statement can be the docstring.
+                if stmt.type not in ("comment", "\n", "pass_statement"):
+                    break
+    return None
+
+
+def _extract_docstring_comment(node, source_bytes: bytes, all_bytes: bytes) -> str | None:
+    """Collect the comment lines directly above the node (Go, Rust, Java, C++).
+
+    The text before the node is scanned bottom-up, line by line: `//`/`///`
+    lines and `/** ... */` block lines are gathered. Blank lines are skipped
+    until a comment line has been found and end the scan afterwards; any
+    other line ends it at once. Returns None when nothing was collected.
+    The `source_bytes` argument is not used; the scan runs on `all_bytes`.
+    """
+    collected: list[str] = []
+    text_before = all_bytes[:node.start_byte].decode("utf-8", errors="replace")
+    for raw_line in text_before.split("\n")[::-1]:
+        line = raw_line.strip()
+        if line.startswith("//"):
+            collected.append(line.lstrip("/").strip())
+        elif line.startswith(("*", "/**", "*/")):
+            # lstrip/rstrip take character sets: this peels the comment markers.
+            body = line.lstrip("/*").rstrip("*/").strip()
+            if body:
+                collected.append(body)
+        elif line or collected:
+            # Code line, or a blank line after the comment block: stop scanning.
+            break
+    return "\n".join(reversed(collected)) if collected else None
+
+
+def _extract_docstring(node, source_bytes: bytes, lang: str) -> str | None:
+    """Return the entity's documentation using the strategy for *lang*.
+
+    Python reads the body's string literal; other languages use the comment above.
+    """
+    if lang != "python":
+        return _extract_docstring_comment(node, source_bytes, source_bytes)
+    inner = [c for c in node.children if c.type in ("function_definition", "class_definition")]
+    # A decorated definition keeps its docstring in the wrapped def/class.
+    target = inner[0] if node.type == "decorated_definition" and inner else node
+    return _extract_docstring_python(target, source_bytes)
+
+
+def _resolve_kind(node_type: str, node, source_bytes: bytes, lang: str, entity_map: dict) -> str:
+    """Return the entity kind for a node type ("function" when it is unmapped)."""
+    if node_type == "decorated_definition":
+        # The wrapper itself says nothing; the wrapped definition decides the kind.
+        wrapped_kinds = {"class_definition": "class", "function_definition": "function"}
+        for child in node.children:
+            if child.type in wrapped_kinds:
+                return wrapped_kinds[child.type]
+    # Also reached for a decorated node without a recognizable inner definition.
+    return entity_map.get(node_type, "function")
+
+
+def _extract_children(node, source_bytes: bytes, lang: str, entity_map: dict) -> list[CodeEntity]:
+    """Collect the direct methods/functions of a class, impl, trait or interface.
+
+    One level only (results have empty ``children``). Decorated Python members
+    (e.g. ``@property``) are included and a decorated class is unwrapped first.
+    """
+    method_kinds = {
+        "method_definition", "method_declaration", "function_definition",
+        "function_declaration", "function_item", "decorated_definition",
+    }
+    body_types = {
+        "block", "body", "statement_block", "class_body", "declaration_list",
+        "impl_body", "field_declaration_list",
+    }
+    if node.type == "decorated_definition":
+        node = next((c for c in node.children if c.type == "class_definition"), node)
+    children: list[CodeEntity] = []
+
+    def _walk_for_methods(n):
+        for child in n.children:
+            if child.type in body_types:
+                _walk_for_methods(child)
+            elif child.type in method_kinds and child.type in entity_map:
+                children.append(CodeEntity(
+                    kind=_resolve_kind(child.type, child, source_bytes, lang, entity_map),
+                    name=_extract_name(child, source_bytes, lang),
+                    start_line=child.start_point[0] + 1,  # tree-sitter rows are 0-based
+                    end_line=child.end_point[0] + 1,
+                    signature=_extract_signature(child, source_bytes, lang),
+                    docstring=_extract_docstring(child, source_bytes, lang),
+                    language=lang,
+                    children=[],
+                ))
+
+    _walk_for_methods(node)
+    return children
+
+
+# ---------------------------------------------------------------------------
+# Main function
+# ---------------------------------------------------------------------------
+
+def parse_code_ast(source_code: str, language: str) -> list[CodeEntity]:
+    """Extract code entities from source code using tree-sitter.
+
+    Parses the source and returns a flat list of top-level entities
+    (functions, classes, types, interfaces). Classes carry their
+    methods as `children`.
+
+    Args:
+        source_code: Source code as a string.
+        language: Language of the code. Supported values: "python",
+            "javascript", "typescript", "go", "rust", "java", "cpp".
+            Accepted aliases: "py", "js", "ts", "rs", "c++", "cxx".
+
+    Returns:
+        List of CodeEntity with the extracted entities. Empty list if the
+        code is empty or contains no recognizable entity.
+
+    Raises:
+        ValueError: If the language is not supported.
+        ImportError: If tree-sitter or the language grammar is not
+            installed. Install with:
+            `pip install tree-sitter tree-sitter-python tree-sitter-go` etc.
+    """
+    if not source_code or not source_code.strip():
+        return []
+
+    # Normalize the language name
+    lang = language.lower().strip()
+    lang = _ALIASES.get(lang, lang)
+
+    if lang not in SUPPORTED_LANGUAGES:
+        supported = ", ".join(sorted(SUPPORTED_LANGUAGES))
+        raise ValueError(
+            f"Lenguaje '{language}' no soportado. "
+            f"Soportados: {supported}"
+        )
+
+    # Import tree-sitter lazily, only when the function is actually called
+    try:
+        import tree_sitter  # noqa: F401
+        from tree_sitter import Language, Parser
+    except ImportError as exc:
+        raise ImportError(
+            "tree-sitter no esta instalado. Instalar con: "
+            "pip install tree-sitter tree-sitter-python tree-sitter-go "
+            "tree-sitter-javascript tree-sitter-typescript tree-sitter-rust "
+            "tree-sitter-java tree-sitter-cpp"
+        ) from exc
+
+    # Import the grammar module of the requested language
+    grammar_module = SUPPORTED_LANGUAGES[lang]
+    try:
+        import importlib
+        mod = importlib.import_module(grammar_module)
+    except ImportError as exc:
+        pip_name = grammar_module.replace("_", "-")
+        raise ImportError(
+            f"Grammar '{grammar_module}' no instalado. Instalar con: "
+            f"pip install {pip_name}"
+        ) from exc
+
+    # Build the Language object following the tree-sitter >= 0.22 API
+    try:
+        # New API: the module exposes a language() function that returns the Language
+        if lang == "typescript":
+            # tree-sitter-typescript exposes typescript and tsx separately
+            ts_lang = Language(mod.language_typescript())
+        elif hasattr(mod, "language"):
+            ts_lang = Language(mod.language())
+        else:
+            # Fallback: some grammars expose Language directly
+            ts_lang = Language(mod.Language)
+    except Exception as exc:
+        raise ImportError(
+            f"No se pudo inicializar el grammar para '{lang}': {exc}. "
+            f"Verifica que {grammar_module} es compatible con esta version de tree-sitter."
+        ) from exc
+
+    parser = Parser(ts_lang)
+
+    source_bytes = source_code.encode("utf-8")
+    tree = parser.parse(source_bytes)
+
+    entity_map = _ENTITY_NODES.get(lang, {})
+    top_level_kinds = {"class", "struct", "trait", "interface"}
+
+    results: list[CodeEntity] = []
+
+    def _walk(node, depth: int = 0) -> None:
+        """Recursive AST walk that collects the top-level entities."""
+        if node.type in entity_map and depth <= 2:  # deeper matches are ignored
+            kind = _resolve_kind(node.type, node, source_bytes, lang, entity_map)
+            name = _extract_name(node, source_bytes, lang)
+            start_line = node.start_point[0] + 1
+            end_line = node.end_point[0] + 1
+            sig = _extract_signature(node, source_bytes, lang)
+            doc = _extract_docstring(node, source_bytes, lang)
+
+            # Extract children only for container kinds
+            children: list[CodeEntity] = []
+            if kind in top_level_kinds:
+                children = _extract_children(node, source_bytes, lang, entity_map)
+
+            results.append(CodeEntity(
+                kind=kind,
+                name=name,
+                start_line=start_line,
+                end_line=end_line,
+                signature=sig,
+                docstring=doc,
+                language=lang,
+                children=children,
+            ))
+            # Do not descend further once the entity is extracted (children already included)
+            return
+
+        for child in node.children:
+            _walk(child, depth + 1)
+
+    _walk(tree.root_node)
+    return results
diff --git a/python/functions/core/parse_code_ast_test.py b/python/functions/core/parse_code_ast_test.py
new file mode 100644
index 00000000..c1a3b8bd
--- /dev/null
+++ b/python/functions/core/parse_code_ast_test.py
@@ -0,0 +1,220 @@
+"""Tests para parse_code_ast."""
+
+import pytest
+
+try:
+    import tree_sitter  # noqa: F401
+    HAS_TREE_SITTER = True
+except ImportError:
+    HAS_TREE_SITTER = False
+
+from parse_code_ast import parse_code_ast, SUPPORTED_LANGUAGES
+
+# ---------------------------------------------------------------------------
+# Tests that do not require tree-sitter
+# ---------------------------------------------------------------------------
+
+def test_lenguaje_no_soportado_lanza_value_error():
+    """An unsupported language raises ValueError with a clear message."""
+    with pytest.raises(ValueError, match="no soportado"):
+        parse_code_ast("x = 1", "cobol")
+
+
+def test_lenguaje_no_soportado_lanza_value_error_vacio():
+    """An empty language string raises ValueError."""
+    with pytest.raises(ValueError):
+        parse_code_ast("x = 1", "")
+
+
+def test_codigo_vacio_retorna_lista_vacia():
+    """Empty source returns an empty list without invoking tree-sitter."""
+    result = parse_code_ast("", "python")
+    assert result == []
+
+
+def test_codigo_solo_espacios_retorna_lista_vacia():
+    """Source made only of spaces/newlines returns an empty list."""
+    result = parse_code_ast("   \n\n\t  \n", "python")
+    assert result == []
+
+
+def test_alias_py_funciona():
+    """The alias 'py' is accepted as 'python'."""
+    # We only check that the alias does not raise ValueError.
+    # Without tree-sitter installed the call raises ImportError (not ValueError).
+    try:
+        parse_code_ast("def f(): pass", "py")
+    except ImportError:
+        pass  # expected when tree-sitter is not installed
+    except ValueError:
+        pytest.fail("'py' deberia ser alias valido de 'python'")
+
+
+def test_alias_ts_funciona():
+    """The alias 'ts' is accepted as 'typescript'."""
+    try:
+        parse_code_ast("const x = 1;", "ts")
+    except ImportError:
+        pass  # tolerated: tree-sitter or the grammar may be missing
+    except ValueError:
+        pytest.fail("'ts' deberia ser alias valido de 'typescript'")
+
+
+def test_lenguajes_soportados_incluyen_go_y_python():
+    """SUPPORTED_LANGUAGES contains the main languages."""
+    assert "python" in SUPPORTED_LANGUAGES
+    assert "go" in SUPPORTED_LANGUAGES
+    assert "typescript" in SUPPORTED_LANGUAGES
+    assert "rust" in SUPPORTED_LANGUAGES
+
+
+# ---------------------------------------------------------------------------
+# Tests that require tree-sitter
+# ---------------------------------------------------------------------------
+
+pytestmark_ts = pytest.mark.skipif(  # NOTE(review): not referenced by the tests visible here
+    not HAS_TREE_SITTER,
+    reason="tree-sitter no instalado"
+)
+
+
+@pytest.mark.skipif(not HAS_TREE_SITTER, reason="tree-sitter no instalado")
+def test_python_extrae_funcion_simple():
+    """Python source with a single function yields one `function` entity."""
+    source = '''\
+def greet(name: str) -> str:
+    """Saluda al usuario."""
+    return f"Hello, {name}!"
+'''
+    try:
+        import tree_sitter_python  # noqa: F401
+    except ImportError:
+        pytest.skip("tree-sitter-python no instalado")
+
+    result = parse_code_ast(source, "python")
+    assert len(result) == 1
+    entity = result[0]
+    assert entity.kind == "function"
+    assert entity.name == "greet"
+    assert entity.start_line == 1
+    assert entity.language == "python"
+    assert "greet" in entity.signature
+
+
+@pytest.mark.skipif(not HAS_TREE_SITTER, reason="tree-sitter no instalado")
+def test_python_extrae_clase_con_metodos():
+    """Python source with a class yields the class and its methods as children."""
+    source = '''\
+class Animal:
+    """Clase base de animales."""
+
+    def __init__(self, name: str) -> None:
+        self.name = name
+
+    def speak(self) -> str:
+        return ""
+'''
+    try:
+        import tree_sitter_python  # noqa: F401
+    except ImportError:
+        pytest.skip("tree-sitter-python no instalado")
+
+    result = parse_code_ast(source, "python")
+    classes = [e for e in result if e.kind == "class"]
+    assert len(classes) == 1
+    cls = classes[0]
+    assert cls.name == "Animal"
+    assert len(cls.children) >= 1
+    method_names = [c.name for c in cls.children]
+    assert "speak" in method_names or "__init__" in method_names
+
+
+@pytest.mark.skipif(not HAS_TREE_SITTER, reason="tree-sitter no instalado")
+def test_python_extrae_docstring():
+    """A Python entity with a docstring has it captured."""
+    source = '''\
+def documented():
+    """Esta es la documentacion."""
+    pass
+'''
+    try:
+        import tree_sitter_python  # noqa: F401
+    except ImportError:
+        pytest.skip("tree-sitter-python no instalado")
+
+    result = parse_code_ast(source, "python")
+    assert len(result) == 1
+    assert result[0].docstring is not None
+    assert "documentacion" in result[0].docstring
+
+
+@pytest.mark.skipif(not HAS_TREE_SITTER, reason="tree-sitter no instalado")
+def test_go_extrae_funciones_y_tipos():
+    """Go source yields both functions and type declarations."""
+    source = '''\
+package main
+
+// Add suma dos enteros.
+func Add(a, b int) int {
+    return a + b
+}
+
+type Point struct {
+    X float64
+    Y float64
+}
+'''
+    try:
+        import tree_sitter_go  # noqa: F401
+    except ImportError:
+        pytest.skip("tree-sitter-go no instalado")
+
+    result = parse_code_ast(source, "go")
+    names = [e.name for e in result]
+    assert "Add" in names
+    assert "Point" in names
+
+
+@pytest.mark.skipif(not HAS_TREE_SITTER, reason="tree-sitter no instalado")
+def test_typescript_extrae_interfaces():
+    """TypeScript source yields an interface or a function entity."""
+    source = '''\
+interface User {
+    id: number;
+    name: string;
+}
+
+function greetUser(user: User): string {
+    return `Hello, ${user.name}`;
+}
+'''
+    try:
+        import tree_sitter_typescript  # noqa: F401
+    except ImportError:
+        pytest.skip("tree-sitter-typescript no instalado")
+
+    result = parse_code_ast(source, "typescript")
+    kinds = [e.kind for e in result]
+    assert "interface" in kinds or "function" in kinds
+
+
+@pytest.mark.skipif(not HAS_TREE_SITTER, reason="tree-sitter no instalado")
+def test_codigo_con_errores_de_sintaxis_es_tolerante():
+    """Syntax errors are tolerated: whatever can be parsed is still returned."""
+    source = '''\
+def valid_function():
+    pass
+
+def broken_function(
+    # error: parentesis no cerrado
+'''
+    try:
+        import tree_sitter_python  # noqa: F401
+    except ImportError:
+        pytest.skip("tree-sitter-python no instalado")
+
+    # Must not raise: tree-sitter is error-tolerant
+    result = parse_code_ast(source, "python")
+    # At least the valid function should be extracted
+    names = [e.name for e in result]
+    assert "valid_function" in names
diff --git a/python/functions/core/parse_cron_expr.md b/python/functions/core/parse_cron_expr.md
new file mode 100644
index 00000000..0160223f
--- /dev/null
+++ b/python/functions/core/parse_cron_expr.md
@@ -0,0 +1,48 @@
+---
+name: parse_cron_expr
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "parse_cron_expr(expr: str) -> dict"
+description: "Parsea una expresion cron estandar de 5 campos. Soporta *, rangos (1-5), listas (1,3,5), pasos (*/15) y aliases (@hourly, @daily, @weekly, @monthly, @yearly). Retorna dict con claves minute, hour, day_of_month, month, day_of_week con listas de enteros validos."
+tags: [cron, scheduling, parsing, time, pure]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "*/15 expande minutos a [0 15 30 45]"
+  - "@daily resuelve a minuto 0 hora 0"
+  - "0 9 1,15 * * expande dias a [1 15]"
+  - "0 9 * * 1-5 expande dia de semana a [1 2 3 4 5]"
+  - "expresion con 4 campos lanza ValueError"
+  - "minuto fuera de rango lanza ValueError"
+test_file_path: "python/functions/core/parse_cron_expr_test.py"
+file_path: "python/functions/core/parse_cron_expr.py"
+---
+
+## Ejemplo
+
+```python
+from parse_cron_expr import parse_cron_expr
+
+sched = parse_cron_expr("*/15 * * * *")
+# sched["minute"] = [0, 15, 30, 45]
+# sched["hour"]   = list(range(24))
+
+sched2 = parse_cron_expr("@daily")
+# sched2["minute"] = [0]
+# sched2["hour"]   = [0]
+
+sched3 = parse_cron_expr("0 9 * * 1-5")
+# sched3["day_of_week"] = [1, 2, 3, 4, 5]
+```
+
+## Notas
+
+Misma semantica que la version Go (parse_cron_expr_go_core). Usa dict en lugar de struct. Limites de campos: minute [0,59], hour [0,23], day_of_month [1,31], month [1,12], day_of_week [0,6] (0=domingo). Lanza ValueError con mensaje descriptivo para entradas invalidas.
diff --git a/python/functions/core/parse_cron_expr.py b/python/functions/core/parse_cron_expr.py
new file mode 100644
index 00000000..caffc56c
--- /dev/null
+++ b/python/functions/core/parse_cron_expr.py
@@ -0,0 +1,112 @@
+"""Parser de expresiones cron estandar de 5 campos."""
+
+from __future__ import annotations
+
+_ALIASES: dict[str, str] = {  # alias -> equivalent 5-field expression
+    "@yearly": "0 0 1 1 *",
+    "@annually": "0 0 1 1 *",
+    "@monthly": "0 0 1 * *",
+    "@weekly": "0 0 * * 0",
+    "@daily": "0 0 * * *",
+    "@midnight": "0 0 * * *",
+    "@hourly": "0 * * * *",
+}
+
+# (lo, hi) inclusive for each field: minute, hour, day_of_month, month, day_of_week
+_FIELD_LIMITS: list[tuple[int, int]] = [
+    (0, 59),
+    (0, 23),
+    (1, 31),
+    (1, 12),
+    (0, 6),
+]
+_FIELD_NAMES = ["minute", "hour", "day_of_month", "month", "day_of_week"]  # result keys, parallel to _FIELD_LIMITS
+
+
+def parse_cron_expr(expr: str) -> dict[str, list[int]]:
+    """Parse a standard 5-field cron expression.
+
+    Supports *, ranges (1-5), lists (1,3,5), steps (*/15) and the aliases
+    defined in _ALIASES (@hourly, @daily, @weekly, @monthly, @yearly, ...).
+    Surrounding whitespace is ignored.
+
+    Args:
+        expr: A 5-field cron expression or an alias.
+
+    Returns:
+        Dict with the keys minute, hour, day_of_month, month and day_of_week,
+        each mapped to the list of integers allowed for that field.
+
+    Raises:
+        ValueError: If the expression is malformed or a value is out of range.
+    """
+    stripped = expr.strip()
+    resolved = _ALIASES.get(stripped, stripped)
+    fields = resolved.split()
+    if len(fields) != 5:
+        raise ValueError(
+            f"parse_cron_expr: expected 5 fields, got {len(fields)} in {resolved!r}"
+        )
+    # Field order is fixed: names and limits run parallel to the split fields.
+    return {
+        name: _parse_field(field, lo, hi, name)
+        for field, name, (lo, hi) in zip(fields, _FIELD_NAMES, _FIELD_LIMITS)
+    }
+
+
+def _parse_field(field: str, lo: int, hi: int, name: str) -> list[int]:
+    """Expand one cron field into its sorted, de-duplicated list of values.
+
+    Raises ValueError, prefixed with the field *name*, if any part is invalid.
+    """
+    if field == "*":
+        return list(range(lo, hi + 1))
+    try:
+        # A set drops values repeated across list items (e.g. "1,1-3").
+        return sorted({v for part in field.split(",") for v in _parse_part(part, lo, hi)})
+    except ValueError as exc:
+        raise ValueError(f"{name}: {exc}") from exc
+
+
+def _parse_part(part: str, lo: int, hi: int) -> list[int]:
+    """Expand one list item (value, range "a-b", or step "base/n") into its values."""
+    base, slash, step_str = part.partition("/")
+    if not slash:
+        if "-" in part:
+            first, last = _parse_range(part, lo, hi)
+            return list(range(first, last + 1))
+        return [_parse_value(part, lo, hi)]
+    try:
+        step = int(step_str)
+    except ValueError:
+        raise ValueError(f"invalid step {step_str!r}")
+    if step <= 0:
+        raise ValueError(f"step must be positive, got {step}")
+    if base == "*":
+        first, last = lo, hi
+    elif "-" in base:
+        first, last = _parse_range(base, lo, hi)
+    else:
+        first, last = _parse_value(base, lo, hi), hi
+    return list(range(first, last + 1, step))
+
+
+def _parse_range(s: str, lo: int, hi: int) -> tuple[int, int]:
+    """Parse "A-B" into validated (start, end) bounds; start must not exceed end."""
+    first, dash, last = s.partition("-")
+    if not dash:
+        raise ValueError(f"invalid range {s!r}")
+    start, end = _parse_value(first, lo, hi), _parse_value(last, lo, hi)
+    if start > end:
+        raise ValueError(f"range start {start} > end {end}")
+    return start, end
+
+
+def _parse_value(s: str, lo: int, hi: int) -> int:
+    """Parse a plain decimal integer within [lo, hi]; raise ValueError otherwise."""
+    if not (s.isascii() and s.isdigit()):  # int() alone also takes "+5", "1_0", " 5"
+        raise ValueError(f"invalid value {s!r}: not an integer")
+    v = int(s)
+    if v < lo or v > hi:
+        raise ValueError(f"value {v} out of range [{lo}, {hi}]")
+    return v
diff --git a/python/functions/core/parse_cron_expr_test.py b/python/functions/core/parse_cron_expr_test.py
new file mode 100644
index 00000000..59fb78f2
--- /dev/null
+++ b/python/functions/core/parse_cron_expr_test.py
@@ -0,0 +1,45 @@
+"""Tests para parse_cron_expr."""
+
+import pytest
+from parse_cron_expr import parse_cron_expr
+
+
+def test_expande_minutos_a_0_15_30_45():
+    """*/15 expands the minutes to [0 15 30 45]"""
+    sched = parse_cron_expr("*/15 * * * *")
+    assert sched["minute"] == [0, 15, 30, 45]
+    assert sched["hour"] == list(range(24))
+
+
+def test_daily_resuelve_a_minuto_0_hora_0():
+    """@daily resolves to minute 0, hour 0"""
+    sched = parse_cron_expr("@daily")
+    assert sched["minute"] == [0]
+    assert sched["hour"] == [0]
+    assert sched["day_of_month"] == list(range(1, 32))
+
+
+def test_0_9_1_15_expande_dias_a_1_15():
+    """0 9 1,15 * * expands the days of the month to [1 15]"""
+    sched = parse_cron_expr("0 9 1,15 * *")
+    assert sched["minute"] == [0]
+    assert sched["hour"] == [9]
+    assert sched["day_of_month"] == [1, 15]
+
+
+def test_0_9_1_5_expande_dia_semana_a_1_2_3_4_5():
+    """0 9 * * 1-5 expands the day of week to [1 2 3 4 5]"""
+    sched = parse_cron_expr("0 9 * * 1-5")
+    assert sched["day_of_week"] == [1, 2, 3, 4, 5]
+
+
+def test_expresion_con_4_campos_lanza_valueerror():
+    """An expression with 4 fields raises ValueError"""
+    with pytest.raises(ValueError):
+        parse_cron_expr("0 9 * *")
+
+
+def test_minuto_fuera_de_rango_lanza_valueerror():
+    """A minute out of range raises ValueError"""
+    with pytest.raises(ValueError):
+        parse_cron_expr("60 * * * *")
diff --git a/python/functions/core/parse_git_url.md b/python/functions/core/parse_git_url.md
new file mode 100644
index 00000000..bbf87519
--- /dev/null
+++ b/python/functions/core/parse_git_url.md
@@ -0,0 +1,46 @@
+---
+name: parse_git_url
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def parse_git_url(url: str, known_hosts: list[str] | None = None) -> str | None"
+description: "Parsea una URL de code hosting (GitHub, GitLab, etc.) y retorna el path org/repo. Soporta HTTPS, HTTP, SSH (git@), git:// y ssh://."
+tags: [git, url, parsing, github, gitlab, repository]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: true
+error_type: ""
+imports: [urllib.parse]
+tested: true
+tests:
+  - "URL HTTPS GitHub"
+  - "URL SSH git@"
+  - "URL con .git suffix"
+  - "URL con path extra (issues/123)"
+  - "URL no reconocida"
+test_file_path: "python/functions/core/parse_git_url_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+parse_git_url("https://github.com/psf/requests")
+# "psf/requests"
+
+parse_git_url("git@github.com:psf/requests.git")
+# "psf/requests"
+
+parse_git_url("https://github.com/psf/requests/issues/123")
+# "psf/requests"  (ignora segmentos extra)
+
+parse_git_url("https://bitbucket.org/org/repo")
+# None  (host desconocido)
+```
+
+## Notas
+
+Funcion pura. Sanitiza org y repo dejando solo `[a-zA-Z0-9_-]`. Los hosts por defecto son `github.com` y `gitlab.com`; se pueden extender via `known_hosts`. No hace requests de red.
diff --git a/python/functions/core/parse_git_url_test.py b/python/functions/core/parse_git_url_test.py
new file mode 100644
index 00000000..e0fe36d0
--- /dev/null
+++ b/python/functions/core/parse_git_url_test.py
@@ -0,0 +1,104 @@
+"""Tests para parse_git_url, is_git_repo_url, validate_git_ssh_uri."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from core import parse_git_url, is_git_repo_url, validate_git_ssh_uri
+
+
+# ── parse_git_url ─────────────────────────────────────────────────────────────
+
+
+def test_parse_git_url_https_github():
+    """A plain HTTPS GitHub URL resolves to its ``org/repo`` path."""
+    assert parse_git_url("https://github.com/psf/requests") == "psf/requests"
+
+
+def test_parse_git_url_ssh_git_at():
+    """The scp-like ``git@host:org/repo.git`` form resolves to ``org/repo``."""
+    assert parse_git_url("git@github.com:psf/requests.git") == "psf/requests"
+
+
+def test_parse_git_url_git_suffix_stripped():
+    """A trailing ``.git`` on an HTTPS URL is not part of the result."""
+    assert parse_git_url("https://github.com/psf/requests.git") == "psf/requests"
+
+
+def test_parse_git_url_path_extra_ignored():
+    """Segments after ``org/repo`` (here ``issues/123``) are dropped."""
+    issue_url = "https://github.com/psf/requests/issues/123"
+    assert parse_git_url(issue_url) == "psf/requests"
+
+
+def test_parse_git_url_unknown_host_returns_none():
+    """With no ``known_hosts`` override, a bitbucket.org URL yields None."""
+    assert parse_git_url("https://bitbucket.org/org/repo") is None
+
+
+# ── is_git_repo_url ───────────────────────────────────────────────────────────
+
+
+def test_is_git_repo_url_valid_repo():
+    """A bare repository URL is accepted."""
+    verdict = is_git_repo_url("https://github.com/psf/requests")
+    assert verdict is True
+
+
+def test_is_git_repo_url_issue_rejected():
+    """An issue page inside a repository is not a repo URL."""
+    verdict = is_git_repo_url("https://github.com/psf/requests/issues/123")
+    assert verdict is False
+
+
+def test_is_git_repo_url_blob_rejected():
+    """A ``blob`` (single file view) URL is not a repo URL."""
+    blob_url = "https://github.com/psf/requests/blob/main/README.md"
+    assert is_git_repo_url(blob_url) is False
+
+
+def test_is_git_repo_url_tree_accepted():
+    """A ``tree/<branch>`` URL still counts as a repo URL."""
+    verdict = is_git_repo_url("https://github.com/psf/requests/tree/main")
+    assert verdict is True
+
+
+# ── validate_git_ssh_uri ──────────────────────────────────────────────────────
+
+
+def test_validate_git_ssh_uri_valid():
+    """A well-formed ``git@host:path`` URI passes validation without raising."""
+    well_formed = "git@github.com:org/repo.git"
+    validate_git_ssh_uri(well_formed)
+
+
+def test_validate_git_ssh_uri_no_git_at():
+    """An HTTPS URL is rejected and the error message mentions ``git@``."""
+    try:
+        validate_git_ssh_uri("https://github.com/org/repo")
+    except ValueError as err:
+        assert "git@" in str(err)
+    else:
+        # Explicit raise: `assert False` is stripped under `python -O`, which
+        # would let a missing exception go unnoticed.
+        raise AssertionError("Expected ValueError")
+
+
+def test_validate_git_ssh_uri_no_colon():
+    """A URI using ``/`` instead of ``:`` after the host is rejected."""
+    try:
+        validate_git_ssh_uri("git@github.com/org/repo")
+    except ValueError as err:
+        assert ":" in str(err)
+    else:
+        # Explicit raise: `assert False` is stripped under `python -O`, which
+        # would let a missing exception go unnoticed.
+        raise AssertionError("Expected ValueError")
+
+
+def test_validate_git_ssh_uri_empty_path():
+    """Nothing after the colon is rejected with a ``non-empty path`` message."""
+    try:
+        validate_git_ssh_uri("git@github.com:")
+    except ValueError as err:
+        assert "non-empty path" in str(err)
+    else:
+        # Explicit raise: `assert False` is stripped under `python -O`, which
+        # would let a missing exception go unnoticed.
+        raise AssertionError("Expected ValueError")
+
+
+if __name__ == "__main__":
+    # Plain-Python runner: execute every test in definition order, then report.
+    for _case in (
+        test_parse_git_url_https_github,
+        test_parse_git_url_ssh_git_at,
+        test_parse_git_url_git_suffix_stripped,
+        test_parse_git_url_path_extra_ignored,
+        test_parse_git_url_unknown_host_returns_none,
+        test_is_git_repo_url_valid_repo,
+        test_is_git_repo_url_issue_rejected,
+        test_is_git_repo_url_blob_rejected,
+        test_is_git_repo_url_tree_accepted,
+        test_validate_git_ssh_uri_valid,
+        test_validate_git_ssh_uri_no_git_at,
+        test_validate_git_ssh_uri_no_colon,
+        test_validate_git_ssh_uri_empty_path,
+    ):
+        _case()
+    print("All tests passed.")
diff --git a/python/functions/core/parse_iso_datetime.md b/python/functions/core/parse_iso_datetime.md
new file mode 100644
index 00000000..827ffc24
--- /dev/null
+++ b/python/functions/core/parse_iso_datetime.md
@@ -0,0 +1,49 @@
+---
+name: parse_iso_datetime
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def parse_iso_datetime(value: str) -> datetime"
+description: "Parsea un datetime ISO 8601 tolerando fracciones de segundo con mas de 6 digitos. Windows produce timestamps como 2026-02-21T13:20:23.1470042+08:00 donde la fraccion excede los 6 digitos que acepta datetime.fromisoformat."
+tags: [datetime, iso8601, parse, time, windows]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["re", "datetime"]
+tested: true
+tests:
+  - "ISO normal sin fraccion"
+  - "ISO con Z como UTC"
+  - "ISO con fraccion de mas de 6 digitos"
+  - "ISO con timezone offset"
+test_file_path: "python/functions/core/parse_iso_datetime_test.py"
+file_path: "python/functions/core/parse_iso_datetime.py"
+---
+
+## Ejemplo
+
+```python
+from parse_iso_datetime import parse_iso_datetime
+
+# Timestamp normal
+dt = parse_iso_datetime("2026-02-21T13:20:23+08:00")
+
+# Timestamp Windows con 7 digitos en fraccion
+dt = parse_iso_datetime("2026-02-21T13:20:23.1470042+08:00")
+
+# UTC con Z
+dt = parse_iso_datetime("2026-02-21T13:20:23Z")
+```
+
+## Notas
+
+Algoritmo:
+1. Regex `(\.\d{6})\d+` trunca la fraccion de segundo a exactamente 6 digitos.
+2. Reemplaza `Z` final por `+00:00` para compatibilidad con `fromisoformat`.
+3. Llama a `datetime.fromisoformat(normalized)`.
+
+Funcion pura. No hace I/O ni tiene efectos secundarios.
diff --git a/python/functions/core/parse_iso_datetime.py b/python/functions/core/parse_iso_datetime.py
new file mode 100644
index 00000000..08755bc3
--- /dev/null
+++ b/python/functions/core/parse_iso_datetime.py
@@ -0,0 +1,25 @@
+"""Parsea un datetime ISO 8601 tolerando fracciones de segundo con mas de 6 digitos."""
+
+import re
+from datetime import datetime
+
+# Captures the first 6 fractional digits and matches (to discard) any extras.
+_FRAC_RE = re.compile(r"(\.\d{6})\d+")
+
+
+def parse_iso_datetime(value: str) -> datetime:
+    """Parse an ISO 8601 datetime, tolerating fractions longer than 6 digits.
+
+    Windows emits timestamps such as ``2026-02-21T13:20:23.1470042+08:00``
+    whose fractional part exceeds the 6 digits ``datetime.fromisoformat``
+    accepts, so the fraction is truncated (not rounded) to 6 digits first.
+
+    Args:
+        value: ISO 8601 string. May end in ``Z`` as the UTC designator and may
+               carry a fractional second with more than 6 digits.
+
+    Returns:
+        The parsed datetime; it carries tzinfo when the string had an offset
+        or a trailing ``Z``.
+
+    Raises:
+        ValueError: If the normalized string is rejected by
+            ``datetime.fromisoformat``.
+    """
+    normalized = _FRAC_RE.sub(r"\1", value)
+    # Only a *trailing* "Z" is the UTC designator. Rewriting every "Z" would
+    # mangle malformed input and make the resulting error message misleading.
+    if normalized.endswith("Z"):
+        normalized = normalized[:-1] + "+00:00"
+    return datetime.fromisoformat(normalized)
diff --git a/python/functions/core/parse_iso_datetime_test.py b/python/functions/core/parse_iso_datetime_test.py
new file mode 100644
index 00000000..9f81717e
--- /dev/null
+++ b/python/functions/core/parse_iso_datetime_test.py
@@ -0,0 +1,41 @@
+"""Tests para parse_iso_datetime."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from datetime import datetime, timezone, timedelta
+from parse_iso_datetime import parse_iso_datetime
+
+
+def test_iso_normal_sin_fraccion():
+    """A timestamp without a fraction keeps every field and carries tzinfo."""
+    parsed = parse_iso_datetime("2026-02-21T13:20:23+08:00")
+    date_part = (parsed.year, parsed.month, parsed.day)
+    time_part = (parsed.hour, parsed.minute, parsed.second)
+    assert date_part == (2026, 2, 21)
+    assert time_part == (13, 20, 23)
+    assert parsed.tzinfo is not None
+
+
+def test_iso_con_z_como_utc():
+    """A trailing ``Z`` is read as a zero UTC offset."""
+    parsed = parse_iso_datetime("2026-02-21T13:20:23Z")
+    assert parsed.tzinfo is not None
+    zero_offset = timedelta(0)
+    assert parsed.utcoffset() == zero_offset
+
+
+def test_iso_con_fraccion_de_mas_de_6_digitos():
+    """A 7-digit fraction (``.1470042``) is truncated to 147004 microseconds."""
+    windows_stamp = "2026-02-21T13:20:23.1470042+08:00"
+    parsed = parse_iso_datetime(windows_stamp)
+    assert parsed.year == 2026
+    assert parsed.microsecond == 147004
+    assert parsed.tzinfo is not None
+
+
+def test_iso_con_timezone_offset():
+    """A negative offset is preserved alongside a 6-digit fraction."""
+    parsed = parse_iso_datetime("2026-06-15T09:00:00.000000-05:00")
+    assert parsed.hour == 9
+    assert parsed.utcoffset() == timedelta(hours=-5)
diff --git a/python/functions/core/parse_llm_json.md b/python/functions/core/parse_llm_json.md
new file mode 100644
index 00000000..1956f97a
--- /dev/null
+++ b/python/functions/core/parse_llm_json.md
@@ -0,0 +1,42 @@
+---
+name: parse_llm_json
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def parse_llm_json(response: str) -> dict"
+description: "Parsea una respuesta LLM como JSON, limpiando primero think tags y markdown codeblocks. Combina strip_think_tags + strip_markdown_codeblock + json.loads."
+tags: [llm, json, parsing, cleaning, think, codeblock]
+uses_functions: [strip_think_tags_py_core, strip_markdown_codeblock_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [json]
+tested: true
+tests: ["JSON limpio es parseado correctamente", "JSON con think tags y codeblock es parseado correctamente", "JSON invalido lanza ValueError"]
+test_file_path: "python/functions/core/parse_llm_json_test.py"
+file_path: "python/functions/core/parse_llm_json.py"
+---
+
+## Ejemplo
+
+```python
+# Respuesta limpia
+result = parse_llm_json('{"status": "ok"}')
+# {"status": "ok"}
+
+# Respuesta con think tags y codeblock
+raw = '<think>I need to return JSON.</think>\n```json\n{"status": "ok"}\n```'
+result = parse_llm_json(raw)
+# {"status": "ok"}
+
+# JSON invalido lanza ValueError
+parse_llm_json("not json at all")
+# ValueError: parse_llm_json: invalid JSON after cleaning. Cleaned text: 'not json at all'
+```
+
+## Notas
+
+Funcion impura porque puede lanzar ValueError. El mensaje de error incluye el texto ya limpiado (post strip_think_tags y strip_markdown_codeblock) para facilitar el debugging. Composicion de tres funciones: strip_think_tags → strip_markdown_codeblock → json.loads.
diff --git a/python/functions/core/parse_llm_json.py b/python/functions/core/parse_llm_json.py
new file mode 100644
index 00000000..905f2aef
--- /dev/null
+++ b/python/functions/core/parse_llm_json.py
@@ -0,0 +1,33 @@
+"""Parse JSON from LLM responses, cleaning think tags and markdown code blocks."""
+
+import json
+
+from strip_think_tags import strip_think_tags
+from strip_markdown_codeblock import strip_markdown_codeblock
+
+
+def parse_llm_json(response: str) -> dict:
+    """Clean an LLM response and decode it as JSON.
+
+    The response goes through ``strip_think_tags`` and then
+    ``strip_markdown_codeblock`` before being handed to ``json.loads``.
+
+    Args:
+        response: Raw LLM response string.
+
+    Returns:
+        The decoded JSON value. It is annotated as ``dict``; the decoded type
+        is not checked here.
+
+    Raises:
+        ValueError: If the cleaned text is not valid JSON. The message embeds
+            the repr of the cleaned text to aid debugging.
+    """
+    cleaned = strip_markdown_codeblock(strip_think_tags(response))
+    try:
+        parsed = json.loads(cleaned)
+    except json.JSONDecodeError as exc:
+        message = (
+            f"parse_llm_json: invalid JSON after cleaning. "
+            f"Cleaned text: {cleaned!r}"
+        )
+        raise ValueError(message) from exc
+    return parsed
diff --git a/python/functions/core/parse_llm_json_test.py b/python/functions/core/parse_llm_json_test.py
new file mode 100644
index 00000000..80ea6a5d
--- /dev/null
+++ b/python/functions/core/parse_llm_json_test.py
@@ -0,0 +1,25 @@
+"""Tests para parse_llm_json."""
+
+import sys
+import os
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from parse_llm_json import parse_llm_json
+
+
+def test_JSON_limpio_es_parseado_correctamente():
+    """Already-clean JSON is decoded as-is."""
+    expected = {"status": "ok", "value": 42}
+    assert parse_llm_json('{"status": "ok", "value": 42}') == expected
+
+
+def test_JSON_con_think_tags_y_codeblock_es_parseado_correctamente():
+    """A think block followed by a fenced json block still decodes."""
+    wrapped = '<think>I need to return JSON.</think>\n```json\n{"status": "ok"}\n```'
+    assert parse_llm_json(wrapped) == {"status": "ok"}
+
+
+def test_JSON_invalido_lanza_ValueError():
+    """Non-JSON text raises ValueError whose message names parse_llm_json."""
+    expectation = pytest.raises(ValueError, match="parse_llm_json")
+    with expectation:
+        parse_llm_json("not json at all")
diff --git a/python/functions/core/parse_markdown_test.py b/python/functions/core/parse_markdown_test.py
new file mode 100644
index 00000000..4ab12594
--- /dev/null
+++ b/python/functions/core/parse_markdown_test.py
@@ -0,0 +1,184 @@
+"""Tests para extract_frontmatter, find_headings, estimate_token_count,
+smart_split_content y sanitize_for_path."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from core.core import (
+    estimate_token_count,
+    extract_frontmatter,
+    find_headings,
+    sanitize_for_path,
+    smart_split_content,
+)
+
+
+# ---------------------------------------------------------------------------
+# extract_frontmatter
+# ---------------------------------------------------------------------------
+
+
+def test_extract_frontmatter_con_frontmatter():
+    """Frontmatter is split off into a mapping and the body is preserved."""
+    content = "---\ntitle: Hello\nauthor: Alice\n---\n# Body\n"
+    remaining, data = extract_frontmatter(content)
+    assert data is not None
+    # Plain equality: a membership test against ("Hello", "Hello") said the
+    # same thing in a roundabout way.
+    assert data.get("title") == "Hello"
+    assert "# Body" in remaining
+
+
+def test_extract_frontmatter_sin_frontmatter():
+    """Without a frontmatter block the content comes back untouched and data is None."""
+    original = "# Just a heading\n\nSome text."
+    body, meta = extract_frontmatter(original)
+    assert meta is None
+    assert body == original
+
+
+def test_extract_frontmatter_frontmatter_vacio():
+    """An empty frontmatter block still leaves the body intact."""
+    body, _meta = extract_frontmatter("---\n\n---\n# Body\n")
+    # Only the body is pinned: the parsed data may be None or an empty dict.
+    assert "# Body" in body
+
+
+def test_extract_frontmatter_con_listas():
+    """Frontmatter holding an inline list is still parsed and the body kept."""
+    document = "---\ntags: [go, python, bash]\nauthor: Bob\n---\nContent here\n"
+    body, meta = extract_frontmatter(document)
+    assert meta is not None
+    assert "Content here" in body
+
+
+# ---------------------------------------------------------------------------
+# find_headings
+# ---------------------------------------------------------------------------
+
+
+def test_find_headings_normales():
+    """Headings are reported in order with their title and level."""
+    found = find_headings("# Title\n\nSome text\n\n## Section\n\n### Sub\n")
+    # Index 2 of each entry is the title and index 3 the level.
+    title_and_level = [(entry[2], entry[3]) for entry in found]
+    assert title_and_level == [("Title", 1), ("Section", 2), ("Sub", 3)]
+
+
+def test_find_headings_dentro_de_code_blocks():
+    """A ``#`` line inside a fenced code block is not reported as a heading."""
+    document = "# Real\n\n```\n# Not a heading\n```\n\n## Also real\n"
+    titles = [entry[2] for entry in find_headings(document)]
+    assert "Not a heading" not in titles
+    for expected in ("Real", "Also real"):
+        assert expected in titles
+
+
+def test_find_headings_escapados():
+    """A backslash-escaped ``\\#`` line is not reported as a heading."""
+    document = "# Normal\n\n\\# Escaped heading\n\n## Another\n"
+    titles = [entry[2] for entry in find_headings(document)]
+    assert "Escaped heading" not in titles
+    assert "Normal" in titles
+
+
+def test_find_headings_en_html_comments():
+    """A ``#`` line wrapped in an HTML comment is not reported as a heading."""
+    document = "# Visible\n\n<!-- # Hidden -->\n\n## Also visible\n"
+    titles = [entry[2] for entry in find_headings(document)]
+    assert "Hidden" not in titles
+    assert "Visible" in titles
+
+
+# ---------------------------------------------------------------------------
+# estimate_token_count
+# ---------------------------------------------------------------------------
+
+
+def test_estimate_token_count_texto_vacio():
+    """Empty text is estimated at zero tokens."""
+    estimate = estimate_token_count("")
+    assert estimate == 0
+
+
+def test_estimate_token_count_solo_latin():
+    """Latin-only text yields a non-negative int (the exact value is not pinned)."""
+    estimate = estimate_token_count("hello")
+    assert isinstance(estimate, int)
+    assert not estimate < 0
+
+
+def test_estimate_token_count_solo_cjk():
+    """Three CJK characters are estimated at one token or more."""
+    three_cjk_chars = "\u4e2d\u6587\u8bed"
+    estimate = estimate_token_count(three_cjk_chars)
+    assert isinstance(estimate, int)
+    assert estimate >= 1
+
+
+def test_estimate_token_count_texto_mixto():
+    """Mixed Latin and CJK text yields a positive int estimate."""
+    mixed = "Hello \u4e16\u754c"
+    estimate = estimate_token_count(mixed)
+    assert isinstance(estimate, int)
+    assert estimate > 0
+
+
+# ---------------------------------------------------------------------------
+# smart_split_content
+# ---------------------------------------------------------------------------
+
+
+def test_smart_split_content_contenido_corto():
+    """Content well under both limits comes back as a single unchanged part."""
+    short_text = "Short paragraph.\n\nAnother short one."
+    chunks = smart_split_content(short_text, max_tokens=1024, max_chars=8000)
+    assert len(chunks) == 1
+    assert chunks[0] == short_text
+
+
+def test_smart_split_content_contenido_largo():
+    """Many small paragraphs exceeding max_tokens are split into several parts."""
+    one_paragraph = "word " * 10
+    document = "\n\n".join([one_paragraph] * 200)
+    chunks = smart_split_content(document, max_tokens=50, max_chars=8000)
+    assert len(chunks) > 1
+    # Each part may overshoot the budget by at most one paragraph's estimate.
+    for chunk in chunks:
+        assert estimate_token_count(chunk) <= 50 + estimate_token_count(one_paragraph)
+
+
+def test_smart_split_content_parrafo_gigante():
+    """A single paragraph longer than max_chars is force-cut into bounded parts."""
+    oversized = "x" * 20000
+    chunks = smart_split_content(oversized, max_tokens=1024, max_chars=8000)
+    # 20000 characters at 8000 per part need at least 3 parts.
+    assert len(chunks) >= 3
+    for chunk in chunks:
+        assert len(chunk) <= 8000
+
+
+# ---------------------------------------------------------------------------
+# sanitize_for_path
+# ---------------------------------------------------------------------------
+
+
+def test_sanitize_for_path_texto_normal():
+    """A space between words becomes an underscore."""
+    sanitized = sanitize_for_path("Hello World")
+    assert sanitized == "Hello_World"
+
+
+def test_sanitize_for_path_caracteres_especiales():
+    """``!``, ``@`` and ``#`` never survive sanitization."""
+    sanitized = sanitize_for_path("Hello! @World#2024")
+    for forbidden in ("!", "@", "#"):
+        assert forbidden not in sanitized
+
+
+def test_sanitize_for_path_texto_muy_largo():
+    """Text over max_length is shortened and gains an underscore-joined suffix."""
+    sanitized = sanitize_for_path("a" * 100, max_length=50)
+    assert len(sanitized) <= 50
+    # The input has no underscore, so one in the output marks the added suffix.
+    assert "_" in sanitized
+
+
+def test_sanitize_for_path_texto_vacio():
+    """Empty text falls back to the placeholder ``section``."""
+    sanitized = sanitize_for_path("")
+    assert sanitized == "section"
+
+
+def test_sanitize_for_path_texto_cjk():
+    """CJK text yields a non-empty result other than the ``section`` placeholder."""
+    sanitized = sanitize_for_path("\u4e2d\u6587\u6807\u9898")
+    assert sanitized
+    assert sanitized != "section"
diff --git a/python/functions/core/parse_page_range.md b/python/functions/core/parse_page_range.md
new file mode 100644
index 00000000..71f5e12c
--- /dev/null
+++ b/python/functions/core/parse_page_range.md
@@ -0,0 +1,38 @@
+---
+name: parse_page_range
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def parse_page_range(pages: str) -> list[int]"
+description: "Parsea string de rangos de paginas ('5-7', '3,8', '12') a lista de enteros ordenada y sin duplicados."
+tags: [parsing, range, pages, string]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/retrieve.py"
+---
+
+## Ejemplo
+
+```python
+parse_page_range("1-3,7,10-12")
+# [1, 2, 3, 7, 10, 11, 12]
+
+parse_page_range("5")
+# [5]
+```
+
+## Notas
+
+Funcion pura. Lanza ValueError si un rango tiene start > end.
diff --git a/python/functions/core/parser_registry.md b/python/functions/core/parser_registry.md
new file mode 100644
index 00000000..7dc19d03
--- /dev/null
+++ b/python/functions/core/parser_registry.md
@@ -0,0 +1,65 @@
+---
+name: parser_registry
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "class ParserRegistry: register(name: str, parser: BaseParser) -> None; unregister(name: str) -> None; get_parser(name: str) -> BaseParser | None; get_parser_for_file(path: str) -> BaseParser | None; async parse(source: str, **kwargs) -> ParseResult; list_parsers() -> list[str]; list_supported_extensions() -> list[str]"
+description: "Registry extensible que despacha parsing de archivos al parser correcto basado en extension. Patron plugin: registrar parsers por nombre y extensiones, resolver automaticamente. Mantiene estado mutable (mapa extension→parser). Singleton global disponible via get_registry()."
+tags: [parser, registry, plugin, extensible, dispatch, pattern, singleton]
+uses_functions: []
+uses_types: [base_parser_py_core, parse_result_py_core]
+returns: [parse_result_py_core]
+returns_optional: false
+error_type: "error_go_core"
+imports: []
+tested: true
+tests:
+  - "registrar parser custom"
+  - "resolver parser por extension de archivo"
+  - "parsear archivo existente con extension registrada"
+  - "parsear string de contenido sin path"
+  - "desregistrar parser elimina nombre y extensiones"
+  - "extension desconocida hace fallback al parser text"
+test_file_path: "python/functions/core/parser_registry_test.py"
+file_path: "python/functions/core/parser_registry.py"
+---
+
+## Ejemplo
+
+```python
+from parser_registry import ParserRegistry, get_registry
+
+# Usar instancia propia
+registry = ParserRegistry()
+registry.register("markdown", MarkdownParser())
+registry.register("pdf", PDFParser())
+registry.register("text", TextParser())
+
+result = await registry.parse("documento.md")
+result2 = await registry.parse("Contenido directo\ncon varias lineas")
+
+# Usar singleton global
+global_registry = get_registry()
+global_registry.register("markdown", MarkdownParser())
+```
+
+## Algoritmo de despacho
+
+`parse(source)` sigue este orden de decision:
+
+1. Si `source` es URL de repositorio de codigo (github.com, gitlab.com, bitbucket.org) → delega al parser `"code"`.
+2. Si `source` parece path de archivo (longitud ≤ 512, sin newlines) y existe en disco:
+   - Si es directorio → delega al parser `"directory"`.
+   - Resuelve parser por extension → delega.
+   - Sin parser para la extension → fallback al parser `"text"` via `parse_content`.
+3. Si no parece path (contenido largo o con newlines), o parece path pero no existe en disco → parsea como string con parser `"text"` via `parse_content`.
+
+## Notas
+
+Patron plugin extensible: cada parser se registra por nombre y declara sus extensiones. El mapa extension→nombre se construye automaticamente al registrar. Unregister limpia el mapa solo si el parser registrado sigue siendo el mismo (evita colisiones si dos parsers comparten extension).
+
+El singleton `get_registry()` usa lazy initialization thread-unsafe — suficiente para la mayoria de casos. Para uso multi-threaded, crear instancias propias de ParserRegistry.
+
+Los parsers especiales `"text"`, `"directory"` y `"code"` son convencion — el registry no los crea automaticamente, deben registrarse explicitamente.
diff --git a/python/functions/core/parser_registry.py b/python/functions/core/parser_registry.py
new file mode 100644
index 00000000..2abaf106
--- /dev/null
+++ b/python/functions/core/parser_registry.py
@@ -0,0 +1,225 @@
+"""ParserRegistry — registry extensible que despacha parsing por extension de archivo."""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from base_parser import BaseParser
+    from parse_result import ParseResult
+
+# Patron para detectar URLs de repositorios de codigo.
+_CODE_REPO_PATTERN = re.compile(
+    r"^https?://(github\.com|gitlab\.com|bitbucket\.org)/[\w.\-]+/[\w.\-]+"
+)
+
+# Maximum length a string may have to be considered a file path (heuristic).
+_MAX_PATH_LEN = 512
+
+
+def _looks_like_path(source: str) -> bool:
+    """Heuristic: True when *source* could be a file path rather than content.
+
+    A candidate path is at most ``_MAX_PATH_LEN`` characters long and contains
+    no newline.
+    """
+    if "\n" in source:
+        return False
+    return len(source) <= _MAX_PATH_LEN
+
+
+class ParserRegistry:
+    """Extensible registry that dispatches parsing to a parser chosen by extension.
+
+    Plugin pattern: parsers are registered under a name and declare the
+    extensions they support. The registry keeps an extension→name map so a
+    file is resolved with a single dict lookup.
+
+    Typical use:
+        registry = ParserRegistry()
+        registry.register("markdown", MarkdownParser())
+        registry.register("pdf", PDFParser())
+        result = await registry.parse("doc.md")
+    """
+
+    def __init__(self) -> None:
+        # name → parser instance
+        self._parsers: dict[str, "BaseParser"] = {}
+        # lowercased extension → name of the parser that handles it
+        self._ext_map: dict[str, str] = {}
+
+    # ------------------------------------------------------------------
+    # Registration
+    # ------------------------------------------------------------------
+
+    def register(self, name: str, parser: "BaseParser") -> None:
+        """Register *parser* under *name*, replacing any previous registration.
+
+        When a parser was already registered under the same name, the
+        extensions it contributed are removed first, so extensions the new
+        parser does not declare stop resolving to this name.
+
+        Args:
+            name:   Unique parser name (e.g. "markdown", "pdf").
+            parser: Parser instance; its ``supported_extensions`` are mapped
+                    (lowercased) to *name*.
+        """
+        previous = self._parsers.get(name)
+        if previous is not None:
+            self._drop_extensions(name, previous)
+        self._parsers[name] = parser
+        for ext in parser.supported_extensions:
+            self._ext_map[ext.lower()] = name
+
+    def unregister(self, name: str) -> None:
+        """Remove the parser registered under *name*, if any.
+
+        Its extensions are removed from the map as well, except those that
+        have since been claimed by a different parser name.
+
+        Args:
+            name: Name of the parser to remove. Unknown names are ignored.
+        """
+        parser = self._parsers.pop(name, None)
+        if parser is not None:
+            self._drop_extensions(name, parser)
+
+    # ------------------------------------------------------------------
+    # Resolution
+    # ------------------------------------------------------------------
+
+    def get_parser(self, name: str) -> "BaseParser | None":
+        """Return the parser registered under *name*, or None if there is none."""
+        return self._parsers.get(name)
+
+    def get_parser_for_file(self, path: str) -> "BaseParser | None":
+        """Resolve the parser for a file from its (lowercased) suffix.
+
+        Args:
+            path: File path, relative or absolute.
+
+        Returns:
+            The matching parser, or None when no parser handles the suffix.
+        """
+        name = self._ext_map.get(Path(path).suffix.lower())
+        if name is None:
+            return None
+        return self._parsers.get(name)
+
+    # ------------------------------------------------------------------
+    # Parsing
+    # ------------------------------------------------------------------
+
+    async def parse(self, source: str, **kwargs) -> "ParseResult":
+        """Parse *source*, dispatching on the kind of input it is.
+
+        Dispatch order:
+        1. *source* is a code-repository URL → parser "code".
+        2. *source* looks like a path (short, no newlines) and exists on disk:
+           - a directory → parser "directory";
+           - a file whose extension has a parser → that parser;
+           - otherwise → fallback to parser "text" via ``parse_content``.
+        3. Anything else (content, or a path that does not exist) → parsed as
+           a content string with parser "text".
+
+        Args:
+            source:   File path, directory, URL or content string.
+            **kwargs: Extra arguments forwarded to the selected parser.
+
+        Returns:
+            Whatever the selected parser returns.
+
+        Raises:
+            ValueError: If the required parser is not registered.
+        """
+        # 1. Code-repository URL.
+        if _CODE_REPO_PATTERN.match(source):
+            return await self._dispatch("code", source, **kwargs)
+
+        # 2. Looks like a path and actually exists on disk.
+        if _looks_like_path(source) and self._exists_on_disk(source):
+            if Path(source).is_dir():
+                return await self._dispatch("directory", source, **kwargs)
+            parser = self.get_parser_for_file(source)
+            if parser is not None:
+                return await parser.parse(source, **kwargs)
+            # No parser for this extension: fall back to "text".
+            # NOTE(review): this passes the *path string* to parse_content(),
+            # not the file's contents — confirm that this is intended.
+            return await self._dispatch_content("text", source, **kwargs)
+
+        # 3. Direct content (or a path-like string that does not exist).
+        text_parser = self._parsers.get("text")
+        if text_parser is None:
+            raise ValueError(
+                "No hay parser 'text' registrado. Registra un TextParser primero."
+            )
+        return await text_parser.parse_content(source, **kwargs)
+
+    # ------------------------------------------------------------------
+    # Introspection
+    # ------------------------------------------------------------------
+
+    def list_parsers(self) -> list[str]:
+        """Return the names of all registered parsers, sorted alphabetically."""
+        return sorted(self._parsers)
+
+    def list_supported_extensions(self) -> list[str]:
+        """Return every mapped extension (lowercased), sorted alphabetically."""
+        return sorted(self._ext_map)
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _drop_extensions(self, name: str, parser: "BaseParser") -> None:
+        """Unmap the extensions of *parser* that still point at *name*."""
+        for ext in parser.supported_extensions:
+            key = ext.lower()
+            # Leave entries another parser name has taken over since.
+            if self._ext_map.get(key) == name:
+                del self._ext_map[key]
+
+    @staticmethod
+    def _exists_on_disk(source: str) -> bool:
+        """Return True if *source* names an existing path; never raises.
+
+        ``Path.exists()`` can raise for strings that are not usable as paths
+        (for example a single component longer than the filesystem allows).
+        Such a string is content, not a path, so it is reported as missing.
+        """
+        try:
+            return Path(source).exists()
+        except (OSError, ValueError):
+            return False
+
+    def _require(self, name: str) -> "BaseParser":
+        """Return the parser registered under *name* or raise ValueError."""
+        parser = self._parsers.get(name)
+        if parser is None:
+            raise ValueError(
+                f"Parser '{name}' no esta registrado. "
+                f"Parsers disponibles: {self.list_parsers()}"
+            )
+        return parser
+
+    async def _dispatch(self, name: str, source: str, **kwargs) -> "ParseResult":
+        """Dispatch to the named parser through its ``parse()`` method."""
+        return await self._require(name).parse(source, **kwargs)
+
+    async def _dispatch_content(
+        self, name: str, content: str, **kwargs
+    ) -> "ParseResult":
+        """Dispatch to the named parser through its ``parse_content()`` method."""
+        return await self._require(name).parse_content(content, **kwargs)
+
+
+# ------------------------------------------------------------------
+# Singleton global
+# ------------------------------------------------------------------
+
+_registry: ParserRegistry | None = None
+
+
+def get_registry() -> ParserRegistry:
+    """Return the module-wide ParserRegistry, creating it on first use.
+
+    Returns:
+        The shared ParserRegistry instance.
+    """
+    global _registry
+    if _registry is not None:
+        return _registry
+    _registry = ParserRegistry()
+    return _registry
diff --git a/python/functions/core/parser_registry_test.py b/python/functions/core/parser_registry_test.py
new file mode 100644
index 00000000..60e90b2d
--- /dev/null
+++ b/python/functions/core/parser_registry_test.py
@@ -0,0 +1,162 @@
+"""Tests para parser_registry."""
+
+from __future__ import annotations
+
+import asyncio
+import sys
+import os
+import tempfile
+from pathlib import Path
+from unittest.mock import MagicMock, AsyncMock
+
+# parents[3] is the repo root (core -> functions -> python -> root); expose types/core and functions/core.
+_registry_root = Path(__file__).resolve().parents[3]
+sys.path.insert(0, str(_registry_root / "python" / "types" / "core"))
+sys.path.insert(0, str(_registry_root / "python" / "functions" / "core"))
+
+from parser_registry import ParserRegistry, get_registry
+
+
+# ------------------------------------------------------------------
+# Helpers: parser stub que implementa BaseParser
+# ------------------------------------------------------------------
+
+class StubParser:
+    """Minimal duck-typed parser for the tests; does not inherit the ABC, to avoid dependencies."""
+
+    def __init__(self, extensions: list[str], name: str = "stub") -> None:
+        self._name, self._extensions = name, extensions
+
+    @property
+    def supported_extensions(self) -> list[str]:
+        return self._extensions
+
+    def can_parse(self, path: str) -> bool:
+        suffix = Path(path).suffix.lower()  # match case-insensitively on the extension
+        return suffix in self._extensions
+
+    async def parse(self, source, **kwargs):
+        return MagicMock(source_path=str(source), parser_name=self._name)
+
+    async def parse_content(self, content: str, source_path=None, **kwargs):
+        return MagicMock(source_path=source_path, parser_name=self._name)
+
+
+def _run(coro):
+    """Run ``coro`` to completion; asyncio.run avoids the deprecated implicit get_event_loop()."""
+    return asyncio.run(coro)
+
+
+# ------------------------------------------------------------------
+# Tests
+# ------------------------------------------------------------------
+
+def test_registrar_parser_custom():
+    """registrar parser custom"""
+    foo_parser = StubParser([".foo"])
+    reg = ParserRegistry()
+    reg.register("foo", foo_parser)
+
+    assert reg.get_parser("foo") is foo_parser
+    assert "foo" in reg.list_parsers()
+    assert ".foo" in reg.list_supported_extensions()
+
+
+def test_resolver_por_extension():
+    """resolver parser por extension de archivo"""
+    markdown_stub = StubParser([".md", ".markdown"], "markdown")
+    reg = ParserRegistry()
+    reg.register("markdown", markdown_stub)
+
+    # Both extensions declared by the stub must resolve
+    # to the very same parser instance.
+    for filename in ("documento.md", "readme.markdown"):
+        resolved = reg.get_parser_for_file(filename)
+        assert resolved is markdown_stub
+
+
+def test_parse_file():
+    """parsear archivo existente con extension registrada"""
+    reg = ParserRegistry()
+    reg.register("text", StubParser([".txt"], "text"))
+
+    # Write a real .txt file on disk so parse() receives an existing path.
+    handle = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
+    with handle:
+        handle.write(b"hola mundo")
+    txt_path = handle.name
+
+    try:
+        parsed = _run(reg.parse(txt_path))
+        assert parsed.parser_name == "text"
+    finally:
+        os.unlink(txt_path)
+
+
+def test_parse_string():
+    """parsear string de contenido sin path"""
+    reg = ParserRegistry()
+    reg.register("text", StubParser([".txt"], "text"))
+
+    # A multi-line string is expected to be handled as inline content, not as a path.
+    inline_text = "\n".join(["linea 1", "linea 2", "linea 3"])
+    parsed = _run(reg.parse(inline_text))
+
+    assert parsed.parser_name == "text"
+
+
+def test_unregister():
+    """desregistrar parser elimina nombre y extensiones"""
+    reg = ParserRegistry()
+    reg.register("bar", StubParser([".bar"]))
+    assert "bar" in reg.list_parsers()
+
+    # After removal neither the name nor its extension may remain visible.
+    reg.unregister("bar")
+    assert reg.get_parser("bar") is None
+    assert "bar" not in reg.list_parsers()
+    assert ".bar" not in reg.list_supported_extensions()
+
+
+def test_extension_desconocida_fallback_text():
+    """extension desconocida hace fallback al parser text"""
+    reg = ParserRegistry()
+    reg.register("text", StubParser([".txt"], "text"))
+
+    # ".xyz" has no registered parser, so the registry must fall back to "text".
+    handle = tempfile.NamedTemporaryFile(suffix=".xyz", delete=False)
+    with handle:
+        handle.write(b"contenido")
+    unknown_path = handle.name
+
+    try:
+        parsed = _run(reg.parse(unknown_path))
+        assert parsed.parser_name == "text"
+    finally:
+        os.unlink(unknown_path)
+
+
+if __name__ == "__main__":
+    # Minimal runner so the file can also be executed directly, without pytest.
+    cases = (
+        test_registrar_parser_custom,
+        test_resolver_por_extension,
+        test_parse_file,
+        test_parse_string,
+        test_unregister,
+        test_extension_desconocida_fallback_text,
+    )
+    passed = failed = 0
+    for case in cases:
+        try:
+            case()
+            print(f"PASS: {case.__doc__}")
+            passed += 1
+        except Exception as e:
+            print(f"FAIL: {case.__doc__} — {e}")
+            failed += 1
+
+    # A non-zero exit status signals failure to the calling shell / CI.
+    print(f"---\nResults: {passed} passed, {failed} failed")
+    if failed:
+        sys.exit(1)
diff --git a/python/functions/core/pdf_to_markdown.md b/python/functions/core/pdf_to_markdown.md
new file mode 100644
index 00000000..41fac848
--- /dev/null
+++ b/python/functions/core/pdf_to_markdown.md
@@ -0,0 +1,46 @@
+---
+name: pdf_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def pdf_to_markdown(pdf_path: str, heading_detection: str = 'auto') -> tuple[str, dict]"
+description: "Convierte un PDF a markdown. Extrae texto, tablas e inyecta headings detectados desde bookmarks o analisis de fuentes. Retorna (markdown_content, metadata_dict)."
+tags: [pdf, markdown, conversion, headings, tables, pdfplumber, parsing]
+uses_functions: [extract_pdf_bookmarks_py_core, detect_headings_by_font_py_core, format_table_to_markdown_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [pdfplumber, os, sys]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/pdf_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+from pdf_to_markdown import pdf_to_markdown
+
+md, meta = pdf_to_markdown("report.pdf")
+print(f"Pages: {meta['pages']}, Headings: {meta['headings_found']}, Tables: {meta['tables_found']}")
+print(md[:500])
+
+# Con estrategia especifica
+md, meta = pdf_to_markdown("report.pdf", heading_detection="font")
+print(f"Strategy used: {meta['heading_strategy']}")
+```
+
+## Notas
+
+Estrategias de heading_detection:
+- `"bookmarks"`: extrae outlines/bookmarks del PDF via pdfplumber. Rapido y preciso cuando el PDF tiene bookmarks bien formados.
+- `"font"`: analiza distribucion de font sizes para detectar headings. Util para PDFs sin bookmarks.
+- `"auto"`: intenta bookmarks primero; si no hay, usa analisis de font. Estrategia por defecto.
+
+El metadata_dict retornado contiene: `pages` (int), `heading_strategy` (str: "bookmarks"|"font"|"none"), `headings_found` (int), `tables_found` (int).
+
+Tablas se extraen via `page.extract_tables()` de pdfplumber y se formatean como tablas markdown. Requiere `pip install pdfplumber`.
diff --git a/python/functions/core/pdf_to_markdown.py b/python/functions/core/pdf_to_markdown.py
new file mode 100644
index 00000000..9b0dc3fd
--- /dev/null
+++ b/python/functions/core/pdf_to_markdown.py
@@ -0,0 +1,121 @@
+"""Convert a PDF file to markdown, with heading detection and table extraction."""
+
+import os
+import sys
+
+import pdfplumber
+
+# Allow import from same directory when run directly
+_DIR = os.path.dirname(os.path.abspath(__file__))
+if _DIR not in sys.path:
+    sys.path.insert(0, _DIR)
+
+from extract_pdf_bookmarks import extract_pdf_bookmarks
+from detect_headings_by_font import detect_headings_by_font
+from format_table_to_markdown import format_table_to_markdown
+
+
+def pdf_to_markdown(
+    pdf_path: str,
+    heading_detection: str = "auto",
+) -> tuple[str, dict]:
+    """Convert a PDF file to markdown.
+
+    Page text and tables are emitted page by page; headings found via the PDF
+    outline (bookmarks) or via font analysis are injected at the top of the
+    page they belong to.
+
+    Args:
+        pdf_path: Path to the PDF file.
+        heading_detection: Strategy for detecting headings:
+            - "bookmarks": use PDF outline/bookmarks only.
+            - "font": analyze font size distribution.
+            - "auto": try bookmarks first, fall back to font analysis.
+            Any other value disables heading detection.
+
+    Returns:
+        tuple[str, dict]: (markdown_content, metadata_dict).
+            metadata_dict contains: {"pages": int, "heading_strategy": str,
+            "headings_found": int, "tables_found": int}. "heading_strategy"
+            is always one of "bookmarks", "font" or "none".
+
+    Raises:
+        FileNotFoundError: If pdf_path does not exist.
+        Exception: If the PDF cannot be opened or is corrupted.
+    """
+    if not os.path.exists(pdf_path):
+        raise FileNotFoundError(f"PDF not found: {pdf_path}")
+
+    with pdfplumber.open(pdf_path) as pdf:
+        total_pages = len(pdf.pages)
+
+        # Step 1: Detect headings
+        headings: list[dict] = []
+        if heading_detection in ("bookmarks", "auto"):
+            headings = extract_pdf_bookmarks(pdf)
+            strategy_used = "bookmarks"
+            if not headings and heading_detection == "auto":
+                headings = detect_headings_by_font(pdf)
+                strategy_used = "font" if headings else "none"
+        elif heading_detection == "font":
+            headings = detect_headings_by_font(pdf)
+            strategy_used = "font"
+        else:
+            # Unknown strategy: report "none" instead of echoing the caller's
+            # value, so the metadata stays within its documented set.
+            strategy_used = "none"
+
+        # Step 2: Group headings by page number
+        headings_by_page: dict[int, list[dict]] = {}
+        for h in headings:
+            pn = h.get("page_num")
+            if pn is not None:
+                headings_by_page.setdefault(pn, []).append(h)
+
+        # Step 3: Build markdown page by page
+        sections: list[str] = []
+        tables_found = 0
+
+        for page_num, page in enumerate(pdf.pages, start=1):
+            page_parts: list[str] = []
+
+            # Inject headings for this page; markdown only defines levels 1-6.
+            for h in headings_by_page.get(page_num, []):
+                level = min(max(h["level"], 1), 6)
+                page_parts.append(f"{'#' * level} {h['title']}")
+
+            # Extract page text
+            text = (page.extract_text() or "").strip()
+            if text:
+                page_parts.append(text)
+
+            # Extract tables (best effort): if table detection fails on a page,
+            # the page still contributes its headings and text.
+            try:
+                tables = page.extract_tables()
+            except Exception:
+                tables = []
+
+            for table in tables:
+                if not table:
+                    continue
+                # Convert all cells to strings, treating None as empty
+                str_table = [
+                    ["" if cell is None else str(cell) for cell in row]
+                    for row in table
+                ]
+                md_table = format_table_to_markdown(str_table)
+                if md_table:
+                    page_parts.append(md_table)
+                    tables_found += 1
+
+            if page_parts:
+                sections.append("\n\n".join(page_parts))
+
+        metadata = {
+            "pages": total_pages,
+            "heading_strategy": strategy_used,
+            "headings_found": len(headings),
+            "tables_found": tables_found,
+        }
+        return "\n\n".join(sections), metadata
diff --git a/python/functions/core/preprocess_text.md b/python/functions/core/preprocess_text.md
new file mode 100644
index 00000000..b0db1c08
--- /dev/null
+++ b/python/functions/core/preprocess_text.md
@@ -0,0 +1,33 @@
+---
+name: preprocess_text
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def preprocess_text(text: str) -> str"
+description: "Normaliza whitespace y newlines de un texto crudo. Util como paso previo a chunking o indexing."
+tags: [text, preprocessing, normalization, whitespace, newlines]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests: ["texto con saltos de linea Windows CRLF", "texto con multiples newlines consecutivos", "texto con espacios leading y trailing en cada linea"]
+test_file_path: "python/functions/core/preprocess_text_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+raw = "  hello  \r\n  world  \r\n\n\n\n  end  "
+result = preprocess_text(raw)
+# "hello\nworld\n\nend"
+```
+
+## Notas
+
+Funcion pura. Aplica tres pasos en orden: normalizar line endings (CRLF y CR a LF), colapsar 3+ newlines consecutivos a 2, strip de cada linea. El strip global final elimina whitespace al inicio y al final del texto completo.
diff --git a/python/functions/core/preprocess_text_test.py b/python/functions/core/preprocess_text_test.py
new file mode 100644
index 00000000..6e63d27f
--- /dev/null
+++ b/python/functions/core/preprocess_text_test.py
@@ -0,0 +1,24 @@
+"""Tests para preprocess_text."""
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from core import preprocess_text
+
+
+def test_texto_con_saltos_de_linea_windows_crlf():
+    """Windows CRLF line endings are normalised to LF."""
+    cleaned = preprocess_text("hello\r\nworld\r\nend")
+    assert cleaned == "hello\nworld\nend"
+
+
+def test_texto_con_multiples_newlines_consecutivos():
+    """A run of five newlines collapses to a single blank line."""
+    cleaned = preprocess_text("first" + "\n" * 5 + "second")
+    assert cleaned == "first\n\nsecond"
+
+
+def test_texto_con_espacios_leading_y_trailing_en_cada_linea():
+    """Leading and trailing spaces are stripped from every line."""
+    padded_lines = ["  hello  ", "  world  ", "  end  "]
+    assert preprocess_text("\n".join(padded_lines)) == "hello\nworld\nend"
diff --git a/python/functions/core/react_loop.md b/python/functions/core/react_loop.md
new file mode 100644
index 00000000..df6c7ed6
--- /dev/null
+++ b/python/functions/core/react_loop.md
@@ -0,0 +1,70 @@
+---
+name: react_loop
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def react_loop(llm_chat: Callable[[list[dict]], str], tools: dict[str, Callable[..., str]], system_prompt: str, user_prompt: str, max_iterations: int = 5, on_thought: Callable[[str], None] | None = None, on_action: Callable[[str, dict], None] | None = None, on_observation: Callable[[str], None] | None = None) -> str"
+description: "Implementa el patron ReACT (Reasoning + Acting) para agentes LLM. El agente razona, decide usar herramientas, observa resultados, y repite hasta producir una respuesta final."
+tags: [llm, agent, react, reasoning, tools, loop]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [json, re, typing]
+tested: true
+tests:
+  - "agente que usa 1 tool y da respuesta final"
+  - "agente que usa multiples tools iterativamente"
+  - "agente que da respuesta directa sin tools"
+  - "max iterations alcanzado fallback"
+  - "tool que falla error handling"
+test_file_path: "python/functions/core/react_loop_test.py"
+file_path: "python/functions/core/react_loop.py"
+---
+
+## Ejemplo
+
+```python
+import openai
+
+client = openai.OpenAI()
+
+def llm_chat(messages: list[dict]) -> str:
+    resp = client.chat.completions.create(model="gpt-4o", messages=messages)
+    return resp.choices[0].message.content
+
+def search(query: str) -> str:
+    """Search the knowledge base."""
+    return f"Results for: {query}"
+
+def analyze(entity_id: str) -> str:
+    """Deep dive into an entity."""
+    return f"Analysis of entity {entity_id}"
+
+answer = react_loop(
+    llm_chat=llm_chat,
+    tools={"search": search, "analyze": analyze},
+    system_prompt="You are a research assistant.",
+    user_prompt="What is quantum entanglement?",
+    max_iterations=5,
+    on_thought=lambda t: print(f"[thought] {t}"),
+    on_action=lambda a, i: print(f"[action] {a}({i})"),
+    on_observation=lambda o: print(f"[obs] {o}"),
+)
+print(answer)
+```
+
+## Notas
+
+El formato de parseo sigue el patron ReACT clasico (Thought/Action/Action Input/Observation/Final Answer), compatible con LangChain y agentes basados en este patron.
+
+Los callbacks (on_thought, on_action, on_observation) permiten logging, UI y tracing sin acoplar al agente.
+
+La funcion es completamente generica: funciona con cualquier LLM (OpenAI, Anthropic, litellm) y cualquier conjunto de tools. El tool description se extrae del primer renglon del docstring de cada callable.
+
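+Si max_iterations se agota sin Final Answer, retorna el ultimo response crudo del LLM como fallback. Si una respuesta no contiene ni Action ni Final Answer, el loop termina de inmediato y retorna esa respuesta tal cual.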
+
+Los errores de tools se capturan y se entregan como Observation al LLM para que pueda recuperarse. Los errores del LLM propio se propagan hacia arriba.
diff --git a/python/functions/core/react_loop.py b/python/functions/core/react_loop.py
new file mode 100644
index 00000000..573c03c0
--- /dev/null
+++ b/python/functions/core/react_loop.py
@@ -0,0 +1,133 @@
+"""ReACT (Reasoning + Acting) agent loop for LLM-based agents."""
+
+import json
+import re
+from typing import Callable
+
+
+def react_loop(
+    llm_chat: Callable[[list[dict]], str],
+    tools: dict[str, Callable[..., str]],
+    system_prompt: str,
+    user_prompt: str,
+    max_iterations: int = 5,
+    on_thought: Callable[[str], None] | None = None,
+    on_action: Callable[[str, dict], None] | None = None,
+    on_observation: Callable[[str], None] | None = None,
+) -> str:
+    """Execute a ReACT (Reasoning + Acting) agent loop.
+
+    Each reply either ends the loop with a "Final Answer:" or names a tool in
+    "Action:" whose result is appended to the conversation as an "Observation:".
+
+    Args:
+        llm_chat: Callable that takes a list of messages and returns a string response.
+        tools: Mapping of tool name to callable. Each tool returns a string.
+        system_prompt: System instruction for the agent.
+        user_prompt: Initial user request.
+        max_iterations: Maximum reasoning iterations before fallback.
+        on_thought: Optional callback invoked with each Thought string.
+        on_action: Optional callback invoked with (action_name, action_input_dict).
+        on_observation: Optional callback invoked with each Observation string.
+
+    Returns:
+        The Final Answer string; the raw reply when it has neither an Action nor a
+        Final Answer; or the last raw reply once max_iterations is exhausted.
+
+    Raises:
+        Exception: Whatever ``llm_chat`` or a callback raises; tool errors are caught.
+    """
+    augmented_system = (
+        f"{system_prompt}\n\n"
+        "You have access to the following tools:\n"
+        f"{_format_tools(tools)}\n\n"
+        "Use this format:\n"
+        "Thought: <your reasoning>\n"
+        "Action: <tool name>\n"
+        "Action Input: <JSON object with tool arguments>\n"
+        "Observation: <tool result will appear here>\n"
+        "... (repeat as needed)\n"
+        "Final Answer: <your final response to the user>"
+    )
+    messages: list[dict] = [
+        {"role": "system", "content": augmented_system},
+        {"role": "user", "content": user_prompt},
+    ]
+    last_response = ""
+
+    for _ in range(max_iterations):
+        response = last_response = llm_chat(messages)
+        final_answer = _parse_final_answer(response)
+        if final_answer is not None:
+            return final_answer
+
+        thought = _parse_section(response, "Thought")
+        if thought and on_thought:
+            on_thought(thought)
+
+        action = _parse_section(response, "Action")
+        if not action:
+            return response  # neither an action nor a final answer: hand the reply back as-is
+        # Only the first line names the tool; models sometimes add commentary below it.
+        action = action.splitlines()[0].strip()
+        action_input = _parse_action_input(_parse_section(response, "Action Input"))
+        if on_action:
+            on_action(action, action_input)
+
+        tool_fn = tools.get(action)
+        if tool_fn is None:
+            observation = f"Error: tool '{action}' not found. Available tools: {list(tools.keys())}"
+        else:
+            try:
+                # str() keeps the Observation a string even if a tool returns something else.
+                observation = str(tool_fn(**action_input))
+            except Exception as exc:
+                observation = f"Error executing tool '{action}': {exc}"
+        if on_observation:
+            on_observation(observation)
+
+        # Append assistant turn + observation to continue the conversation
+        messages.append({"role": "assistant", "content": response})
+        messages.append({"role": "user", "content": f"Observation: {observation}"})
+
+    return last_response
+
+
+# ---------------------------------------------------------------------------
+# Private helpers
+# ---------------------------------------------------------------------------
+
+def _format_tools(tools: dict[str, Callable[..., str]]) -> str:
+    """Build a "- name: first docstring line" listing, one line per tool."""
+    lines = []
+    for name, fn in tools.items():
+        doc = (fn.__doc__ or "").strip().split("\n")[0]
+        lines.append(f"- {name}: {doc}" if doc else f"- {name}")
+    return "\n".join(lines)
+
+
+def _parse_action_input(raw: str | None) -> dict:
+    """Parse "Action Input" into tool kwargs; tolerates trailing prose, wraps non-dict values."""
+    if not raw:
+        return {}
+    try:
+        parsed, end = json.JSONDecoder().raw_decode(raw)  # stops after the first JSON value
+    except json.JSONDecodeError:
+        return {"input": raw}
+    if isinstance(parsed, dict):
+        return parsed
+    return {"input": raw if raw[end:].strip() else parsed}  # tools are called with **kwargs
+
+
+def _parse_section(text: str, label: str) -> str | None:
+    """Extract the value after a 'Label:' marker, stopping at the next label."""
+    # \Z, not $: under re.MULTILINE, $ would cut multi-line values at the first line end.
+    pattern = rf"^{re.escape(label)}:\s*(.+?)(?=\n(?:Thought|Action Input|Action|Observation|Final Answer):|\Z)"
+    match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+    return match.group(1).strip() if match else None
+
+
+def _parse_final_answer(text: str) -> str | None:
+    """Return everything after "Final Answer:" (stripped), or None if the marker is absent."""
+    match = re.search(r"Final Answer:\s*(.+)", text, re.IGNORECASE | re.DOTALL)
+    return match.group(1).strip() if match else None
diff --git a/python/functions/core/react_loop_test.py b/python/functions/core/react_loop_test.py
new file mode 100644
index 00000000..8b9666e1
--- /dev/null
+++ b/python/functions/core/react_loop_test.py
@@ -0,0 +1,127 @@
+"""Tests para react_loop."""
+
+from react_loop import react_loop
+
+
+def _make_llm(responses: list[str]) -> object:
+    """Build a fake ``llm_chat`` that replays ``responses`` one per call.
+
+    Raises StopIteration if called more often than there are responses.
+    """
+    pending = iter(responses)
+    # The messages argument is ignored: replies are fully scripted.
+    return lambda messages: next(pending)
+
+
+def test_agente_que_usa_1_tool_y_da_respuesta_final():
+    """One tool call followed by a Final Answer returns that answer."""
+    queries: list[str] = []
+
+    def search(query: str) -> str:
+        queries.append(query)
+        return "Paris"
+
+    scripted = _make_llm([
+        'Thought: I need to search for the capital.\nAction: search\nAction Input: {"query": "capital of France"}\n',
+        "Thought: I have the answer.\nFinal Answer: The capital of France is Paris.",
+    ])
+    answer = react_loop(
+        llm_chat=scripted,
+        tools={"search": search},
+        system_prompt="You are a helpful assistant.",
+        user_prompt="What is the capital of France?",
+    )
+
+    assert answer == "The capital of France is Paris."
+    assert queries[-1:] == ["capital of France"]
+
+
+def test_agente_que_usa_multiples_tools_iterativamente():
+    """Two different tools are invoked in order before the Final Answer."""
+    trace = []
+
+    def lookup(entity: str) -> str:
+        trace.append(("lookup", entity))
+        return f"Info about {entity}"
+
+    def summarize(text: str) -> str:
+        trace.append(("summarize", text))
+        return "Summary: " + text[:20]
+
+    scripted = _make_llm([
+        'Thought: Lookup first.\nAction: lookup\nAction Input: {"entity": "Python"}',
+        'Thought: Now summarize.\nAction: summarize\nAction Input: {"text": "Info about Python"}',
+        "Final Answer: Done.",
+    ])
+
+    answer = react_loop(
+        llm_chat=scripted,
+        tools={"lookup": lookup, "summarize": summarize},
+        system_prompt="You are an assistant.",
+        user_prompt="Tell me about Python.",
+        max_iterations=5,
+    )
+
+    assert answer == "Done."
+    # Exactly two tool calls, in the order the scripted replies requested them.
+    assert trace == [("lookup", "Python"), ("summarize", "Info about Python")]
+
+
+def test_agente_que_da_respuesta_directa_sin_tools():
+    """A reply that is already a Final Answer is returned without any tool."""
+    scripted = _make_llm(["Final Answer: 42 is the answer."])
+
+    answer = react_loop(
+        llm_chat=scripted,
+        tools={},
+        system_prompt="You are a calculator.",
+        user_prompt="What is 6*7?",
+    )
+
+    # The "Final Answer:" marker itself is stripped from the result.
+    assert answer == "42 is the answer."
+
+
+def test_max_iterations_alcanzado_fallback():
+    """Without a Final Answer the loop stops after max_iterations LLM calls."""
+    llm_calls = []
+
+    def noop(q: str) -> str:
+        return "nothing"
+
+    def llm_chat(messages: list[dict]) -> str:
+        llm_calls.append(len(messages))
+        return 'Thought: Thinking...\nAction: noop\nAction Input: {"q": "x"}'
+
+    answer = react_loop(
+        llm_chat=llm_chat,
+        tools={"noop": noop},
+        system_prompt=".",
+        user_prompt="Loop forever.",
+        max_iterations=3,
+    )
+    assert len(llm_calls) == 3
+    assert "Thought: Thinking" in answer
+
+
+def test_tool_que_falla_error_handling():
+    """A tool exception is reported as an Observation instead of propagating."""
+    seen = []
+
+    def broken_tool(x: str) -> str:
+        raise ValueError("tool exploded")
+
+    scripted = _make_llm([
+        'Action: broken_tool\nAction Input: {"x": "test"}',
+        "Final Answer: Tool failed gracefully.",
+    ])
+
+    answer = react_loop(
+        llm_chat=scripted,
+        tools={"broken_tool": broken_tool},
+        system_prompt=".",
+        user_prompt="Use broken tool.",
+        on_observation=seen.append,
+    )
+    assert answer == "Tool failed gracefully."
+    assert any("Error" in text for text in seen)
diff --git a/python/functions/core/remove_tree_fields.md b/python/functions/core/remove_tree_fields.md
new file mode 100644
index 00000000..7a83cccd
--- /dev/null
+++ b/python/functions/core/remove_tree_fields.md
@@ -0,0 +1,36 @@
+---
+name: remove_tree_fields
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def remove_tree_fields(data: Any, fields: list[str] = None) -> Any"
+description: "Elimina campos especificados recursivamente de un arbol (dict/list). Por defecto elimina 'text'."
+tags: [tree, filter, fields, recursive]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+data = {"title": "A", "text": "largo...", "nodes": [{"title": "B", "text": "otro..."}]}
+remove_tree_fields(data, ["text"])
+# {"title": "A", "nodes": [{"title": "B"}]}
+```
+
+## Notas
+
+Funcion pura. Crea nueva estructura sin los campos indicados. No muta el original.
diff --git a/python/functions/core/render_template.md b/python/functions/core/render_template.md
new file mode 100644
index 00000000..14a2f489
--- /dev/null
+++ b/python/functions/core/render_template.md
@@ -0,0 +1,67 @@
+---
+name: render_template
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "render_template(template: str, context: dict, missing: str = '') -> str"
+description: "Motor de templates minimalista sin dependencias. Soporta {{var}}, {{{raw}}}, {{obj.field}} dot-path, {% for x in list %}...{% endfor %}, {% if cond %}...{% endif %}, {% if not cond %}...{% endif %}. HTML-escaped por defecto."
+tags: [template, render, html, string, format, jinja]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["re"]
+tested: true
+tests:
+  - "sustitucion simple"
+  - "dot path en nested dicts"
+  - "for loop con lista de strings"
+  - "for loop con lista de dicts"
+  - "if endif condicional"
+  - "variable missing retorna string vacio"
+  - "variable missing configurable"
+  - "html escaping por defecto"
+  - "triple braces sin escape"
+test_file_path: "python/functions/core/render_template_test.py"
+file_path: "python/functions/core/render_template.py"
+---
+
+## Ejemplo
+
+```python
+# Sustitucion simple
+result = render_template("Hola {{nombre}}!", {"nombre": "Ana"})
+# "Hola Ana!"
+
+# Dot-path
+ctx = {"user": {"name": "Bob", "city": "Madrid"}}
+result = render_template("{{user.name}} en {{user.city}}", ctx)
+# "Bob en Madrid"
+
+# For loop
+ctx = {"items": ["a", "b", "c"]}
+result = render_template("{% for x in items %}-{{x}}{% endfor %}", ctx)
+# "-a-b-c"
+
+# If condicional
+result = render_template("{% if show %}visible{% endif %}", {"show": True})
+# "visible"
+
+# Sin escape (raw)
+result = render_template("{{{html}}}", {"html": "<b>bold</b>"})
+# "<b>bold</b>"
+
+# Variable ausente con valor configurable
+result = render_template("{{falta}}", {}, missing="N/A")
+# "N/A"
+```
+
+## Notas
+
+No pretende reemplazar Jinja2. Cubre el 80% de casos simples sin dependencias.
+HTML escape por defecto protege contra XSS al generar HTML desde datos de usuario.
+Usa {{{triple}}} cuando el valor ya es HTML confiable.
+Los bloques for/if pueden anidarse pero el parsing es iterativo, no recursivo profundo.
diff --git a/python/functions/core/render_template.py b/python/functions/core/render_template.py
new file mode 100644
index 00000000..50617e7d
--- /dev/null
+++ b/python/functions/core/render_template.py
@@ -0,0 +1,142 @@
+"""Motor de templates minimalista sin dependencias externas."""
+
+import re
+
+
+_HTML_ESCAPES = {
+    "&": "&amp;",
+    "<": "&lt;",
+    ">": "&gt;",
+    '"': "&quot;",
+    "'": "&#x27;",
+}
+# One-pass translation table: output of one replacement is never escaped again.
+_HTML_ESCAPE_TABLE = str.maketrans(_HTML_ESCAPES)
+
+# Block tags: {% for x in path %}, {% if [not] path %}, {% endfor %}, {% endif %}.
+_TAG_RE = re.compile(
+    r"\{%\s*(?:for\s+(?P<var>\w+)\s+in\s+(?P<seq>\w+(?:\.\w+)*)"
+    r"|if\s+(?P<neg>not\s+)?(?P<cond>\w+(?:\.\w+)*)"
+    r"|(?P<end>endfor|endif))\s*%\}"
+)
+# {{{raw}}} (group 1) is tried before {{escaped}} (group 2) at every position.
+_VAR_RE = re.compile(r"\{\{\{(\s*[\w.]+\s*)\}\}\}|\{\{(\s*[\w.]+\s*)\}\}")
+
+
+def _html_escape(value: str) -> str:
+    """Escape &, <, >, double and single quotes for safe inclusion in HTML."""
+    return value.translate(_HTML_ESCAPE_TABLE)
+
+
+def _resolve_raw(context: dict, path: str):
+    """Resolve a dot-path in nested dicts; return the raw value, or None if absent."""
+    current = context
+    for part in path.strip().split("."):
+        if not (isinstance(current, dict) and part in current):
+            return None
+        current = current[part]
+    return current
+
+
+def _resolve_path(context: dict, path: str, missing: str) -> str:
+    """Resolve a dot-path to text: ``missing`` if absent, "" if the value is None."""
+    current = context
+    for part in path.strip().split("."):
+        if not (isinstance(current, dict) and part in current):
+            return missing
+        current = current[part]
+    return "" if current is None else str(current)
+
+
+def _substitute(text: str, context: dict, missing: str) -> str:
+    """Replace {{var}} / {{{var}}} in ``text`` in one pass; values are not re-scanned."""
+    def repl(m: re.Match) -> str:
+        if m.group(1) is not None:
+            return _resolve_path(context, m.group(1), missing)
+        return _html_escape(_resolve_path(context, m.group(2), missing))
+
+    return _VAR_RE.sub(repl, text)
+
+
+def _pair_tags(tags: list) -> dict:
+    """Map each opening tag index to its closing tag index; unpaired tags are omitted."""
+    partner = {}
+    open_stack = []
+    for idx, tag in enumerate(tags):
+        closer = tag.group("end")
+        if closer is None:
+            open_stack.append(idx)
+            continue
+        # Walk down the stack to the nearest opener of the matching kind.
+        for depth in range(len(open_stack) - 1, -1, -1):
+            is_for = tags[open_stack[depth]].group("var") is not None
+            if is_for == (closer == "endfor"):
+                partner[open_stack[depth]] = idx
+                del open_stack[depth:]
+                break
+    return partner
+
+
+def _process_block(template: str, context: dict, missing: str) -> str:
+    """Render ``template`` against ``context``, honouring nested for/if blocks.
+
+    Tags are paired with a stack, so blocks of the same kind can nest. Unpaired
+    tags are emitted as literal text. Substituted values are never parsed again.
+    """
+    tags = list(_TAG_RE.finditer(template))
+    partner = _pair_tags(tags)
+
+    def render(start: int, stop: int, first: int, last: int, ctx: dict) -> str:
+        # Renders template[start:stop]; tags[first:last] are the tags inside it.
+        out = []
+        idx = first
+        while idx < last:
+            if idx not in partner:
+                idx += 1  # unpaired tag: stays in the surrounding text
+                continue
+            opener, closer = tags[idx], tags[partner[idx]]
+            out.append(_substitute(template[start:opener.start()], ctx, missing))
+            body = (opener.end(), closer.start(), idx + 1, partner[idx])
+            if opener.group("var") is not None:
+                items = _resolve_raw(ctx, opener.group("seq"))
+                # Only real lists are iterated; anything else renders nothing.
+                for item in (items if isinstance(items, list) else ()):
+                    out.append(render(*body, {**ctx, opener.group("var"): item}))
+            else:
+                truthy = bool(_resolve_raw(ctx, opener.group("cond")))
+                if truthy != bool(opener.group("neg")):  # "not" inverts the test
+                    out.append(render(*body, ctx))
+            start, idx = closer.end(), partner[idx] + 1
+        out.append(_substitute(template[start:stop], ctx, missing))
+        return "".join(out)
+
+    return render(0, len(template), 0, len(tags), context)
+
+
+def render_template(template: str, context: dict, missing: str = "") -> str:
+    """Minimal dependency-free template engine.
+
+    Supports variable substitution, dot-path traversal, for loops and
+    if / if not conditionals. It is not meant to replace Jinja2; it covers
+    the simple cases. Blocks may be nested. Values are inserted verbatim:
+    template syntax that appears inside a value is not evaluated.
+
+    Syntax:
+        {{var}}             — substitution with HTML escaping
+        {{{var}}}           — substitution without escaping (raw)
+        {{obj.key.subkey}}  — dot-path traversal in nested dicts
+        {% for x in items %}...{% endfor %} — iterate over a list
+        {% if cond %}...{% endif %}          — rendered when cond is truthy
+        {% if not cond %}...{% endif %}      — rendered when cond is falsy
+
+    Args:
+        template: Template string.
+        context: Dict with the values available for substitution.
+        missing: Text used for variables that are not found.
+                 Defaults to the empty string.
+
+    Returns:
+        The template with all placeholders and blocks processed.
+    """
+    return _process_block(template, context, missing)
diff --git a/python/functions/core/render_template_test.py b/python/functions/core/render_template_test.py
new file mode 100644
index 00000000..c68b379a
--- /dev/null
+++ b/python/functions/core/render_template_test.py
@@ -0,0 +1,57 @@
+"""Tests para render_template."""
+
+from render_template import render_template
+
+
+def test_sustitucion_simple():
+    # A double-brace placeholder is replaced by the matching context value.
+    assert render_template("Hola {{nombre}}!", {"nombre": "Ana"}) == "Hola Ana!"
+
+
+def test_dot_path_en_nested_dicts():
+    template = "{{user.name}} vive en {{user.address.city}}"
+    nested = {"user": {"name": "Bob", "address": {"city": "Madrid"}}}
+    assert render_template(template, nested) == "Bob vive en Madrid"
+
+
+def test_for_loop_con_lista_de_strings():
+    template = "{% for x in items %}{{x}},{% endfor %}"
+    rendered = render_template(template, {"items": ["a", "b", "c"]})
+    assert rendered == "a,b,c,"
+
+
+def test_for_loop_con_lista_de_dicts():
+    template = "{% for row in rows %}{{row.name}} {% endfor %}"
+    rendered = render_template(template, {"rows": [{"name": "Ana"}, {"name": "Bob"}]})
+    assert rendered == "Ana Bob "
+
+
+def test_if_endif_condicional():
+    template = "{% if show %}visible{% endif %}"
+    # A truthy flag keeps the block body.
+    assert render_template(template, {"show": True}) == "visible"
+    # A falsy flag drops the body entirely.
+    assert render_template(template, {"show": False}) == ""
+
+
+def test_variable_missing_retorna_string_vacio():
+    # Unknown names fall back to the default `missing` value: the empty string.
+    assert render_template("{{inexistente}}", {}) == ""
+
+
+def test_variable_missing_configurable():
+    # A caller-supplied `missing` replaces names absent from the context.
+    assert render_template("{{inexistente}}", {}, missing="N/A") == "N/A"
+
+
+def test_html_escaping_por_defecto():
+    # Double braces must HTML-escape the value instead of emitting raw markup.
+    escaped = render_template("{{code}}", {"code": "<script>alert('xss')</script>"})
+    assert "&lt;script&gt;" in escaped
+    assert "<script>" not in escaped
+
+
+def test_triple_braces_sin_escape():
+    # Triple braces emit the value verbatim, with no HTML escaping.
+    markup = "<b>bold</b>"
+    assert render_template("{{{html}}}", {"html": markup}) == markup
diff --git a/python/functions/core/retry_async.md b/python/functions/core/retry_async.md
new file mode 100644
index 00000000..0123ea55
--- /dev/null
+++ b/python/functions/core/retry_async.md
@@ -0,0 +1,45 @@
+---
+name: retry_async
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "async def retry_async(func: Callable[[], Awaitable[T]], max_retries: int, base_delay: float = 0.5, max_delay: float = 8.0, jitter: bool = True, is_retryable: Callable[[Exception], bool] | None = None) -> T"
+description: "Version async de retry_sync. Reintenta una corrutina en errores transitorios con exponential backoff. Usa asyncio.sleep en vez de time.sleep."
+tags: [retry, backoff, exponential, async, resilience, api, asyncio]
+uses_functions: [classify_api_error_py_core, compute_backoff_delay_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [asyncio, typing]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/retry_async.py"
+---
+
+## Ejemplo
+
+```python
+import httpx
+
+async def fetch():
+    async with httpx.AsyncClient() as client:
+        return await client.get("https://api.example.com/data")
+
+# Reintentar hasta 3 veces con backoff
+response = await retry_async(func=fetch, max_retries=3)
+
+# Con predicado personalizado
+result = await retry_async(
+    func=my_async_db_query,
+    max_retries=5,
+    is_retryable=lambda e: "deadlock" in str(e),
+)
+```
+
+## Notas
+
+Version async de retry_sync: usa `asyncio.sleep` en lugar de `time.sleep`, permitiendo que el event loop atienda otras tareas durante la espera. El comportamiento de clasificacion y backoff es identico a retry_sync.
diff --git a/python/functions/core/retry_async.py b/python/functions/core/retry_async.py
new file mode 100644
index 00000000..98e4a09c
--- /dev/null
+++ b/python/functions/core/retry_async.py
@@ -0,0 +1,52 @@
+"""Retry an async callable with exponential backoff."""
+
+import asyncio
+from typing import Awaitable, Callable, TypeVar
+
+from classify_api_error import classify_api_error
+from compute_backoff_delay import compute_backoff_delay
+
+T = TypeVar("T")
+
+
+async def retry_async(
+    func: Callable[[], Awaitable[T]],
+    max_retries: int,
+    base_delay: float = 0.5,
+    max_delay: float = 8.0,
+    jitter: bool = True,
+    is_retryable: Callable[[Exception], bool] | None = None,
+) -> T:
+    """Retry an async function on transient errors with exponential backoff.
+
+    Args:
+        func: Zero-argument async callable to execute and retry.
+        max_retries: Retries allowed after the first call; must be >= 0.
+        base_delay: Base delay in seconds for backoff calculation.
+        max_delay: Maximum delay cap in seconds.
+        jitter: If True, adds random jitter to backoff delay.
+        is_retryable: Optional predicate; if None, uses classify_api_error to
+                      classify transient errors as retryable.
+
+    Returns:
+        The return value of func on success.
+
+    Raises:
+        ValueError: If max_retries is negative (func is never awaited).
+        Exception: The last error from func, if retries run out or it is not retryable.
+    """
+    if is_retryable is None:
+        is_retryable = lambda e: classify_api_error(e) == "transient"
+
+    if max_retries < 0:
+        raise ValueError(f"max_retries must be >= 0, got {max_retries}")
+    for attempt in range(max_retries + 1):
+        try:
+            return await func()
+        except Exception as exc:
+            if not is_retryable(exc) or attempt >= max_retries:
+                raise
+            delay = compute_backoff_delay(attempt, base_delay, max_delay, jitter)
+            await asyncio.sleep(delay)
+
+    raise AssertionError("unreachable")  # the loop always returns or re-raises
diff --git a/python/functions/core/retry_sync.md b/python/functions/core/retry_sync.md
new file mode 100644
index 00000000..cccda366
--- /dev/null
+++ b/python/functions/core/retry_sync.md
@@ -0,0 +1,44 @@
+---
+name: retry_sync
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def retry_sync(func: Callable[[], T], max_retries: int, base_delay: float = 0.5, max_delay: float = 8.0, jitter: bool = True, is_retryable: Callable[[Exception], bool] | None = None) -> T"
+description: "Reintenta una funcion sincrona en errores transitorios con exponential backoff. Por defecto usa classify_api_error para decidir si reintentar. Hace time.sleep entre intentos."
+tags: [retry, backoff, exponential, sync, resilience, api]
+uses_functions: [classify_api_error_py_core, compute_backoff_delay_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [time, typing]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/retry_sync.py"
+---
+
+## Ejemplo
+
+```python
+import httpx
+
+# Reintentar una llamada HTTP transitoria hasta 3 veces
+response = retry_sync(
+    func=lambda: httpx.get("https://api.example.com/data"),
+    max_retries=3,
+)
+
+# Con predicado personalizado
+retry_sync(
+    func=my_db_query,
+    max_retries=5,
+    is_retryable=lambda e: "deadlock" in str(e),
+)
+```
+
+## Notas
+
+Impure: hace `time.sleep` entre reintentos. Usa `classify_api_error` como clasificador por defecto — solo reintenta errores marcados como "transient". Si `is_retryable` devuelve False en la primera excepcion, re-raise inmediato sin esperar. El parametro `max_retries` es el numero de reintentos despues del primer intento (total de llamadas = max_retries + 1).
diff --git a/python/functions/core/retry_sync.py b/python/functions/core/retry_sync.py
new file mode 100644
index 00000000..40ddec0e
--- /dev/null
+++ b/python/functions/core/retry_sync.py
@@ -0,0 +1,52 @@
+"""Retry a synchronous callable with exponential backoff."""
+
+import time
+from typing import Callable, TypeVar
+
+from classify_api_error import classify_api_error
+from compute_backoff_delay import compute_backoff_delay
+
+T = TypeVar("T")
+
+
+def retry_sync(
+    func: Callable[[], T],
+    max_retries: int,
+    base_delay: float = 0.5,
+    max_delay: float = 8.0,
+    jitter: bool = True,
+    is_retryable: Callable[[Exception], bool] | None = None,
+) -> T:
+    """Retry a synchronous function on transient errors with exponential backoff.
+
+    Args:
+        func: Zero-argument callable to execute and retry.
+        max_retries: Retries allowed after the first call; must be >= 0.
+        base_delay: Base delay in seconds for backoff calculation.
+        max_delay: Maximum delay cap in seconds.
+        jitter: If True, adds random jitter to backoff delay.
+        is_retryable: Optional predicate; if None, uses classify_api_error to
+                      classify transient errors as retryable.
+
+    Returns:
+        The return value of func on success.
+
+    Raises:
+        ValueError: If max_retries is negative (func is never called).
+        Exception: The last error from func, if retries run out or it is not retryable.
+    """
+    if is_retryable is None:
+        is_retryable = lambda e: classify_api_error(e) == "transient"
+
+    if max_retries < 0:
+        raise ValueError(f"max_retries must be >= 0, got {max_retries}")
+    for attempt in range(max_retries + 1):
+        try:
+            return func()
+        except Exception as exc:
+            if not is_retryable(exc) or attempt >= max_retries:
+                raise
+            delay = compute_backoff_delay(attempt, base_delay, max_delay, jitter)
+            time.sleep(delay)
+
+    raise AssertionError("unreachable")  # the loop always returns or re-raises
diff --git a/python/functions/core/retry_with_backoff.md b/python/functions/core/retry_with_backoff.md
new file mode 100644
index 00000000..2e263487
--- /dev/null
+++ b/python/functions/core/retry_with_backoff.md
@@ -0,0 +1,50 @@
+---
+name: retry_with_backoff
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def retry_with_backoff(max_retries: int = 3, initial_delay: float = 1.0, max_delay: float = 30.0, backoff_factor: float = 2.0, jitter: bool = True, exceptions: tuple[type[Exception], ...] = (Exception,), on_retry: Callable[[Exception, int], None] | None = None) -> Callable"
+description: "Decorador que reintenta una funcion sincrona con exponential backoff cuando lanza excepciones del tipo especificado. Soporta jitter, callback on_retry y filtrado por tipo de excepcion."
+tags: [retry, backoff, exponential, decorator, sync, resilience, jitter]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [functools, time, random, asyncio, collections.abc, typing]
+tested: true
+tests:
+  - "funcion que falla dos veces y luego exito"
+  - "funcion que siempre falla agota retries y lanza"
+  - "on_retry callback se llama"
+  - "jitter produce delays variables"
+  - "solo reintenta excepciones especificadas"
+test_file_path: "python/functions/core/retry_with_backoff_test.py"
+file_path: "python/functions/core/retry_with_backoff.py"
+---
+
+## Ejemplo
+
+```python
+from retry_with_backoff import retry_with_backoff
+
+@retry_with_backoff(max_retries=3, exceptions=(ConnectionError, TimeoutError))
+def call_api():
+    response = requests.get("https://api.example.com/data")
+    response.raise_for_status()
+    return response.json()
+
+# Con callback para logging
+def log_retry(exc: Exception, attempt: int) -> None:
+    print(f"Attempt {attempt} failed: {exc}. Retrying...")
+
+@retry_with_backoff(max_retries=5, on_retry=log_retry)
+def fetch_resource():
+    ...
+```
+
+## Notas
+
+Impure: hace `time.sleep` entre reintentos. El delay entre intentos se calcula como `min(delay, max_delay)`, opcionalmente multiplicado por un factor aleatorio en `[0.5, 1.0]` si `jitter=True`. Tras cada intento fallido, `delay *= backoff_factor`. El parametro `max_retries` es el numero de reintentos despues del primer intento (total de llamadas = max_retries + 1). Solo reintenta excepciones que sean instancias de los tipos en `exceptions`. Implementado en el mismo archivo que `retry_with_backoff_async`.
diff --git a/python/functions/core/retry_with_backoff.py b/python/functions/core/retry_with_backoff.py
new file mode 100644
index 00000000..59f822ca
--- /dev/null
+++ b/python/functions/core/retry_with_backoff.py
@@ -0,0 +1,100 @@
+"""Retry decorators with exponential backoff for sync and async functions."""
+
+import asyncio
+import functools
+import random
+import time
+from collections.abc import Callable
+from typing import Any
+
+
+def retry_with_backoff(
+    max_retries: int = 3,
+    initial_delay: float = 1.0,
+    max_delay: float = 30.0,
+    backoff_factor: float = 2.0,
+    jitter: bool = True,
+    exceptions: tuple[type[Exception], ...] = (Exception,),
+    on_retry: Callable[[Exception, int], None] | None = None,
+) -> Callable:
+    """Decorator that retries a synchronous function with exponential backoff.
+
+    Args:
+        max_retries: Maximum number of retry attempts after the first failure.
+        initial_delay: Initial delay in seconds before the first retry.
+        max_delay: Maximum delay cap in seconds; no single sleep exceeds it.
+        backoff_factor: Multiplier applied to delay after each retry.
+        jitter: If True, multiplies the capped delay by a random factor in [0.5, 1.0).
+        exceptions: Tuple of exception types that trigger a retry.
+        on_retry: Optional callback invoked with (exception, 0-based attempt) before sleeping.
+
+    Returns:
+        Decorator that wraps the target function with retry logic.
+
+    Raises:
+        Exception: The last exception raised after all retries are exhausted.
+    """
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            delay = initial_delay
+            for attempt in range(max_retries + 1):
+                try:
+                    return func(*args, **kwargs)
+                except exceptions as exc:
+                    if attempt >= max_retries:
+                        raise
+                    capped = min(delay, max_delay)
+                    actual_delay = capped * (0.5 + 0.5 * random.random()) if jitter else capped
+                    if on_retry is not None:
+                        on_retry(exc, attempt)
+                    time.sleep(actual_delay)
+                    delay *= backoff_factor
+        return wrapper
+    return decorator
+
+
+def retry_with_backoff_async(
+    max_retries: int = 3,
+    initial_delay: float = 1.0,
+    max_delay: float = 30.0,
+    backoff_factor: float = 2.0,
+    jitter: bool = True,
+    exceptions: tuple[type[Exception], ...] = (Exception,),
+    on_retry: Callable[[Exception, int], None] | None = None,
+) -> Callable:
+    """Decorator that retries an async function with exponential backoff.
+
+    Args:
+        max_retries: Maximum number of retry attempts after the first failure.
+        initial_delay: Initial delay in seconds before the first retry.
+        max_delay: Maximum delay cap in seconds; no single sleep exceeds it.
+        backoff_factor: Multiplier applied to delay after each retry.
+        jitter: If True, multiplies the capped delay by a random factor in [0.5, 1.0).
+        exceptions: Tuple of exception types that trigger a retry.
+        on_retry: Optional callback invoked with (exception, 0-based attempt) before sleeping.
+
+    Returns:
+        Decorator that wraps the target async function with retry logic.
+
+    Raises:
+        Exception: The last exception raised after all retries are exhausted.
+    """
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        async def wrapper(*args: Any, **kwargs: Any) -> Any:
+            delay = initial_delay
+            for attempt in range(max_retries + 1):
+                try:
+                    return await func(*args, **kwargs)
+                except exceptions as exc:
+                    if attempt >= max_retries:
+                        raise
+                    capped = min(delay, max_delay)
+                    actual_delay = capped * (0.5 + 0.5 * random.random()) if jitter else capped
+                    if on_retry is not None:
+                        on_retry(exc, attempt)
+                    await asyncio.sleep(actual_delay)
+                    delay *= backoff_factor
+        return wrapper
+    return decorator
diff --git a/python/functions/core/retry_with_backoff_async.md b/python/functions/core/retry_with_backoff_async.md
new file mode 100644
index 00000000..98c80e76
--- /dev/null
+++ b/python/functions/core/retry_with_backoff_async.md
@@ -0,0 +1,51 @@
+---
+name: retry_with_backoff_async
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def retry_with_backoff_async(max_retries: int = 3, initial_delay: float = 1.0, max_delay: float = 30.0, backoff_factor: float = 2.0, jitter: bool = True, exceptions: tuple[type[Exception], ...] = (Exception,), on_retry: Callable[[Exception, int], None] | None = None) -> Callable"
+description: "Decorador que reintenta una funcion async con exponential backoff cuando lanza excepciones del tipo especificado. Usa asyncio.sleep en vez de time.sleep."
+tags: [retry, backoff, exponential, decorator, async, asyncio, resilience, jitter]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [functools, asyncio, random, collections.abc, typing]
+tested: true
+tests:
+  - "async funcion que falla dos veces y luego exito"
+  - "async funcion que siempre falla agota retries y lanza"
+  - "async on_retry callback se llama"
+  - "async jitter produce delays variables"
+  - "async solo reintenta excepciones especificadas"
+test_file_path: "python/functions/core/retry_with_backoff_test.py"
+file_path: "python/functions/core/retry_with_backoff.py"
+---
+
+## Ejemplo
+
+```python
+from retry_with_backoff import retry_with_backoff_async
+
+@retry_with_backoff_async(max_retries=3, exceptions=(ConnectionError, TimeoutError))
+async def fetch_data():
+    async with httpx.AsyncClient() as client:
+        response = await client.get("https://api.example.com/data")
+        response.raise_for_status()
+        return response.json()
+
+# Con callback para logging
+def log_retry(exc: Exception, attempt: int) -> None:
+    print(f"Async attempt {attempt} failed: {exc}. Retrying...")
+
+@retry_with_backoff_async(max_retries=5, on_retry=log_retry)
+async def call_service():
+    ...
+```
+
+## Notas
+
+Impure: usa `asyncio.sleep` entre reintentos — no bloquea el event loop. Tiene la misma logica de backoff que `retry_with_backoff` pero para coroutines. El wrapper generado es una coroutine (`async def`), por lo que es compatible con `await` y `asyncio.gather`. Definido en el mismo archivo que `retry_with_backoff`.
diff --git a/python/functions/core/retry_with_backoff_test.py b/python/functions/core/retry_with_backoff_test.py
new file mode 100644
index 00000000..2095eb20
--- /dev/null
+++ b/python/functions/core/retry_with_backoff_test.py
@@ -0,0 +1,195 @@
+"""Tests for retry_with_backoff and retry_with_backoff_async."""
+
+import asyncio
+
+import pytest
+
+from retry_with_backoff import retry_with_backoff, retry_with_backoff_async
+
+
+# ---------------------------------------------------------------------------
+# retry_with_backoff (sync)
+# ---------------------------------------------------------------------------
+
+
+def test_funcion_que_falla_dos_veces_y_luego_exito():
+    """Must succeed on the third attempt after 2 failures."""
+    calls = []
+
+    @retry_with_backoff(max_retries=3, initial_delay=0.0, jitter=False)
+    def flaky():
+        calls.append(1)
+        if len(calls) < 3:
+            raise ConnectionError("transient")
+        return "ok"
+
+    result = flaky()
+    assert result == "ok"
+    assert len(calls) == 3
+
+
+def test_funcion_que_siempre_falla_agota_retries_y_lanza():
+    """Must raise the exception once every retry has been used up."""
+    @retry_with_backoff(max_retries=2, initial_delay=0.0, jitter=False)
+    def always_fails():
+        raise ValueError("permanent")
+
+    with pytest.raises(ValueError, match="permanent"):
+        always_fails()
+
+
+def test_on_retry_callback_se_llama():
+    """The on_retry callback must be invoked once per retry."""
+    retries_seen: list[tuple[Exception, int]] = []
+
+    def on_retry(exc: Exception, attempt: int) -> None:
+        retries_seen.append((exc, attempt))
+
+    @retry_with_backoff(max_retries=2, initial_delay=0.0, jitter=False, on_retry=on_retry)
+    def flaky():
+        raise RuntimeError("boom")
+
+    with pytest.raises(RuntimeError):
+        flaky()
+
+    assert len(retries_seen) == 2
+    assert all(isinstance(e, RuntimeError) for e, _ in retries_seen)
+    assert [a for _, a in retries_seen] == [0, 1]
+
+
+def test_jitter_produce_delays_variables():
+    """With jitter=True every sleep is positive and stays near its backoff step."""
+    import unittest.mock as mock
+
+    delays: list[float] = []
+
+    def capture_sleep(d: float) -> None:
+        delays.append(d)
+
+    @retry_with_backoff(max_retries=3, initial_delay=1.0, max_delay=30.0, jitter=True)
+    def always_fails():
+        raise RuntimeError("x")
+
+    with mock.patch("time.sleep", side_effect=capture_sleep):
+        with pytest.raises(RuntimeError):
+            always_fails()
+
+    # One sleep per retry; none after the final failed attempt.
+    assert len(delays) == 3
+    # All delays should be > 0
+    assert all(d > 0 for d in delays)
+    # Un-jittered steps are 1.0, 2.0, 4.0; jitter must keep each sleep within
+    # half a step of its nominal value (seed not fixed, so no exact values).
+    for step, d in zip((1.0, 2.0, 4.0), delays):
+        assert 0.5 * step <= d <= 1.5 * step
+
+
+def test_solo_reintenta_excepciones_especificadas():
+    """Only the exception types listed in the tuple may trigger a retry."""
+    calls = []
+
+    @retry_with_backoff(
+        max_retries=3,
+        initial_delay=0.0,
+        jitter=False,
+        exceptions=(ConnectionError,),
+    )
+    def raises_value_error():
+        calls.append(1)
+        raise ValueError("not retryable")
+
+    with pytest.raises(ValueError):
+        raises_value_error()
+
+    # Should have been called exactly once — no retries for ValueError
+    assert len(calls) == 1
+
+
+# ---------------------------------------------------------------------------
+# retry_with_backoff_async
+# ---------------------------------------------------------------------------
+
+
+def test_async_funcion_que_falla_dos_veces_y_luego_exito():
+    """Async: must succeed on the third attempt after 2 failures."""
+    calls = []
+
+    @retry_with_backoff_async(max_retries=3, initial_delay=0.0, jitter=False)
+    async def flaky():
+        calls.append(1)
+        if len(calls) < 3:
+            raise ConnectionError("transient")
+        return "ok"
+
+    result = asyncio.run(flaky())
+    assert result == "ok"
+    assert len(calls) == 3
+
+
+def test_async_funcion_que_siempre_falla_agota_retries_y_lanza():
+    """Async: must raise the exception once every retry has been used up."""
+    @retry_with_backoff_async(max_retries=2, initial_delay=0.0, jitter=False)
+    async def always_fails():
+        raise ValueError("permanent")
+
+    with pytest.raises(ValueError, match="permanent"):
+        asyncio.run(always_fails())
+
+
+def test_async_on_retry_callback_se_llama():
+    """Async: the on_retry callback must be invoked once per retry."""
+    retries_seen: list[tuple[Exception, int]] = []
+
+    def on_retry(exc: Exception, attempt: int) -> None:
+        retries_seen.append((exc, attempt))
+
+    @retry_with_backoff_async(max_retries=2, initial_delay=0.0, jitter=False, on_retry=on_retry)
+    async def flaky():
+        raise RuntimeError("boom")
+
+    with pytest.raises(RuntimeError):
+        asyncio.run(flaky())
+
+    assert len(retries_seen) == 2
+    assert [a for _, a in retries_seen] == [0, 1]
+
+
+def test_async_jitter_produce_delays_variables():
+    """Async: with jitter=True every recorded delay must be greater than zero."""
+    import unittest.mock as mock
+
+    delays: list[float] = []
+
+    async def capture_sleep(d: float) -> None:
+        delays.append(d)
+
+    @retry_with_backoff_async(max_retries=3, initial_delay=1.0, max_delay=30.0, jitter=True)
+    async def always_fails():
+        raise RuntimeError("x")
+
+    with mock.patch("asyncio.sleep", side_effect=capture_sleep):
+        with pytest.raises(RuntimeError):
+            asyncio.run(always_fails())
+
+    assert all(d > 0 for d in delays)
+    assert len(delays) == 3
+
+
+def test_async_solo_reintenta_excepciones_especificadas():
+    """Async: only the exception types listed in the tuple may trigger a retry."""
+    calls = []
+
+    @retry_with_backoff_async(
+        max_retries=3,
+        initial_delay=0.0,
+        jitter=False,
+        exceptions=(ConnectionError,),
+    )
+    async def raises_value_error():
+        calls.append(1)
+        raise ValueError("not retryable")
+
+    with pytest.raises(ValueError):
+        asyncio.run(raises_value_error())
+
+    assert len(calls) == 1
diff --git a/python/functions/core/sanitize_for_path.md b/python/functions/core/sanitize_for_path.md
new file mode 100644
index 00000000..a3b843ce
--- /dev/null
+++ b/python/functions/core/sanitize_for_path.md
@@ -0,0 +1,40 @@
+---
+name: sanitize_for_path
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def sanitize_for_path(text: str, max_length: int = 50) -> str"
+description: "Convierte texto a nombre seguro para uso en paths. Remueve caracteres especiales, reemplaza espacios con _, trunca con hash suffix si excede max_length."
+tags: [path, sanitize, string, filesystem, slug]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re, hashlib]
+tested: true
+tests:
+  - "texto normal produce slug con underscores"
+  - "caracteres especiales son removidos"
+  - "texto muy largo trunca con hash suffix"
+  - "texto vacio retorna section"
+  - "texto CJK es preservado"
+test_file_path: "python/functions/core/parse_markdown_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+sanitize_for_path("Hello World!")       # "Hello_World"
+sanitize_for_path("中文标题")             # "中文标题"
+sanitize_for_path("a" * 100)           # "aaa...aaa_<sha256[:8]>" (max 50 chars)
+sanitize_for_path("")                  # "section"
+sanitize_for_path("price: $100 @2024") # "price_100_2024"
+```
+
+## Notas
+
+Funcion pura. Caracteres permitidos: `\w` (letras, digitos, _), rangos CJK `[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]`, espacios (convertidos a `_`) y guiones. El sufijo hash es `_` + primeros 8 caracteres del sha256 del texto original, lo que hace la truncacion determinista y evita colisiones entre textos largos similares. Si el texto es todo caracteres especiales y queda vacio tras limpiar, retorna `"section"` como fallback seguro.
diff --git a/python/functions/core/smart_split_content.md b/python/functions/core/smart_split_content.md
new file mode 100644
index 00000000..133a4f08
--- /dev/null
+++ b/python/functions/core/smart_split_content.md
@@ -0,0 +1,41 @@
+---
+name: smart_split_content
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def smart_split_content(content: str, max_tokens: int = 1024, max_chars: int = 8000) -> list[str]"
+description: "Divide contenido grande en partes respetando limites de tokens y caracteres. Divide por parrafos (doble newline). Si un parrafo individual excede el limite, lo corta por caracteres."
+tags: [chunking, splitting, tokens, text, markdown, llm]
+uses_functions: [estimate_token_count_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests:
+  - "contenido corto produce una sola parte"
+  - "contenido largo divide en multiples partes"
+  - "parrafo gigante que requiere forzar corte"
+test_file_path: "python/functions/core/parse_markdown_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+# Short content — stays as one chunk
+parts = smart_split_content("Hello.\n\nWorld.", max_tokens=1024, max_chars=8000)
+# ["Hello.\n\nWorld."]
+
+# Long content — split at paragraph boundaries
+long_md = "\n\n".join(["paragraph text"] * 300)
+parts = smart_split_content(long_md, max_tokens=100, max_chars=8000)
+# [<chunk1>, <chunk2>, ...]
+```
+
+## Notas
+
+Funcion pura. Compone `estimate_token_count` para la estimacion de tokens. La estrategia de split es conservadora: acumula parrafos mientras quepan bajo AMBOS limites (tokens Y caracteres). Si un parrafo individual excede cualquier limite, se fuerza un corte cada `max_chars` caracteres (por caracteres, no por bytes). Util para partir documentos grandes antes de enviarlos a LLMs con ventana de contexto limitada.
diff --git a/python/functions/core/split_text_into_chunks.md b/python/functions/core/split_text_into_chunks.md
new file mode 100644
index 00000000..b3c053a2
--- /dev/null
+++ b/python/functions/core/split_text_into_chunks.md
@@ -0,0 +1,46 @@
+---
+name: split_text_into_chunks
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def split_text_into_chunks(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]"
+description: "Divide texto en chunks de tamaño fijo con overlap, intentando cortar en límites de oración para no romper frases a mitad. Soporta separadores CJK (。！？) y occidentales (. ! ?)."
+tags: [text, chunking, nlp, split, overlap, sentence, boundary, rag]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests: ["texto corto cabe en 1 chunk", "texto largo con oraciones corta en punto", "texto cjk con separador ideografico", "texto sin separadores corta en chunk size exacto", "overlap funciona", "texto vacio retorna lista vacia"]
+test_file_path: "python/functions/core/split_text_into_chunks_test.py"
+file_path: "python/functions/core/split_text_into_chunks.py"
+---
+
+## Ejemplo
+
+```python
+text = "First sentence is here. Second sentence follows. Third ends it."
+chunks = split_text_into_chunks(text, chunk_size=40, overlap=10)
+# ['First sentence is here.', 'is here. Second sentence follows.', ...]
+
+# Texto vacío
+split_text_into_chunks("")  # []
+
+# Texto corto (cabe en un chunk)
+split_text_into_chunks("Hello world.", chunk_size=500)  # ['Hello world.']
+```
+
+## Notas
+
+Algoritmo de sliding window con detección de límites de oración. Los separadores se evalúan en orden de prioridad:
+`。`, `！`, `？`, `.\n`, `!\n`, `?\n`, `\n\n`, `. `, `! `, `? `
+
+Un separador solo se acepta si el corte resultante está después del 30% del chunk (evita chunks muy cortos). Si no se encuentra ningún separador válido, el texto se corta en `chunk_size` exacto.
+
+El siguiente chunk empieza en `end - overlap`, garantizando solapamiento entre chunks consecutivos. Esto es útil para RAG (Retrieval-Augmented Generation) donde el contexto de límite de chunk puede ser crítico.
+
+Diferencia con `page_list_to_groups_py_core`: ese agrupa páginas por tokens con overlap configurable entre grupos de páginas. Este divide texto plano por caracteres con detección de oraciones.
diff --git a/python/functions/core/split_text_into_chunks.py b/python/functions/core/split_text_into_chunks.py
new file mode 100644
index 00000000..bbdc572f
--- /dev/null
+++ b/python/functions/core/split_text_into_chunks.py
@@ -0,0 +1,66 @@
+"""Split text into overlapping chunks with sentence-boundary awareness."""
+
+
+def split_text_into_chunks(
+    text: str, chunk_size: int = 500, overlap: int = 50
+) -> list[str]:
+    """Split text into overlapping chunks, preferring sentence boundaries.
+
+    A window of ``chunk_size`` characters slides over ``text``. The cut is moved
+    back to the latest sentence separator in the window when that cut lies past
+    the first 30% of the window; otherwise the window is cut at ``chunk_size``.
+    Chunks are whitespace-stripped and empty ones are dropped.
+
+    Args:
+        text: Text to split.
+        chunk_size: Maximum chunk length in characters. Must be positive.
+        overlap: Characters shared by consecutive chunks. Negative values count
+            as 0; the overlap is skipped for a step whenever applying it would
+            not move the window forward (e.g. ``overlap >= chunk_size``).
+
+    Returns:
+        List of chunks; empty if ``text`` is empty or only whitespace.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive.
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    text_len = len(text)
+    if text_len <= chunk_size:
+        stripped = text.strip()
+        return [stripped] if stripped else []
+
+    # CJK terminators, then Western terminators followed by whitespace.
+    separators = ("。", "！", "？", ".\n", "!\n", "?\n", "\n\n", ". ", "! ", "? ")
+    overlap = max(overlap, 0)
+    min_offset = int(chunk_size * 0.30)
+    chunks: list[str] = []
+    start = 0
+
+    while start < text_len:
+        end = min(start + chunk_size, text_len)
+
+        if end < text_len:
+            # Latest cut (index just past a separator) inside text[start:end].
+            best_end = -1
+            for sep in separators:
+                pos = text.rfind(sep, start, end)
+                if pos != -1:
+                    best_end = max(best_end, pos + len(sep))
+            # Only accept it beyond the 30% mark to avoid tiny chunks.
+            if best_end > start + min_offset:
+                end = best_end
+
+        chunk = text[start:end].strip()
+        if chunk:
+            chunks.append(chunk)
+        if end >= text_len:
+            break  # tail emitted; another step would only repeat its suffix
+
+        # Step back by `overlap`, but never to or before the current start,
+        # which would loop forever or walk backwards.
+        next_start = end - overlap
+        start = next_start if next_start > start else end
+
+    return chunks
diff --git a/python/functions/core/split_text_into_chunks_test.py b/python/functions/core/split_text_into_chunks_test.py
new file mode 100644
index 00000000..e1b994e0
--- /dev/null
+++ b/python/functions/core/split_text_into_chunks_test.py
@@ -0,0 +1,64 @@
+"""Tests para split_text_into_chunks."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from split_text_into_chunks import split_text_into_chunks
+
+
+def test_texto_corto_cabe_en_1_chunk():
+    text = "Hello world. This is a short sentence."  # well under chunk_size
+    result = split_text_into_chunks(text, chunk_size=500)
+    assert result == [text], f"Expected [text], got {result}"
+
+
+def test_texto_largo_con_oraciones_corta_en_punto():
+    # Two clear sentences that together exceed chunk_size
+    s1 = "This is the first sentence which ends with a period. "
+    s2 = "This is the second sentence which also ends with a period."
+    text = s1 + s2
+    result = split_text_into_chunks(text, chunk_size=60, overlap=10)
+    assert len(result) >= 2, f"Expected at least 2 chunks, got {result}"
+    # The first chunk must end at the period that closes the first sentence
+    assert result[0].endswith("period."), f"First chunk should end at sentence boundary: {result[0]!r}"
+
+
+def test_texto_cjk_con_separador_ideografico():
+    # CJK text using the ideographic full stop 。 as separator
+    s1 = "これは最初の文です。"
+    s2 = "これは二番目の文です。"
+    text = s1 + s2
+    # Small chunk_size to force a split
+    result = split_text_into_chunks(text, chunk_size=len(s1) + 2, overlap=2)
+    assert len(result) >= 2, f"Expected at least 2 chunks for CJK text, got {result}"
+    assert result[0].endswith("。"), f"First chunk should end with CJK period: {result[0]!r}"
+
+
+def test_texto_sin_separadores_corta_en_chunk_size_exacto():
+    # Text without any sentence separator
+    text = "a" * 200
+    result = split_text_into_chunks(text, chunk_size=50, overlap=0)
+    # Every chunk must be exactly 50 chars long (except the last one)
+    for chunk in result[:-1]:
+        assert len(chunk) == 50, f"Chunk should be 50 chars, got {len(chunk)}: {chunk!r}"
+    # The last one may be shorter
+    assert len(result[-1]) <= 50
+
+
+def test_overlap_funciona():
+    # Overlap is checked indirectly, through the total number of characters
+    text = "a" * 300
+    chunk_size = 100
+    overlap = 20
+    result = split_text_into_chunks(text, chunk_size=chunk_size, overlap=overlap)
+    assert len(result) >= 2, "Need at least 2 chunks to test overlap"
+    # Shared characters are counted twice, so the chunk lengths must sum to more than len(text)
+    total_chars = sum(len(c) for c in result)
+    assert total_chars > len(text), f"With overlap, total chars ({total_chars}) should exceed text length ({len(text)})"
+
+
+def test_texto_vacio_retorna_lista_vacia():
+    assert split_text_into_chunks("") == []
+    assert split_text_into_chunks("   ") == []  # whitespace-only text yields no chunk either
diff --git a/python/functions/core/strip_markdown_codeblock.md b/python/functions/core/strip_markdown_codeblock.md
new file mode 100644
index 00000000..ce6f8e5f
--- /dev/null
+++ b/python/functions/core/strip_markdown_codeblock.md
@@ -0,0 +1,37 @@
+---
+name: strip_markdown_codeblock
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def strip_markdown_codeblock(text: str) -> str"
+description: "Remueve wrapping de code blocks markdown que modelos a veces anaden al responder JSON. Ej: ```json\\n{...}\\n``` → {...}"
+tags: [llm, text, cleaning, markdown, codeblock, json]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests: ["JSON sin codeblock no cambia", "JSON con wrapper ```json es desenvuelto", "JSON con wrapper ``` solo es desenvuelto", "texto que no es JSON pasa sin cambio"]
+test_file_path: "python/functions/core/strip_markdown_codeblock_test.py"
+file_path: "python/functions/core/strip_markdown_codeblock.py"
+---
+
+## Ejemplo
+
+```python
+wrapped = '```json\n{"key": "value"}\n```'
+result = strip_markdown_codeblock(wrapped)
+# '{"key": "value"}'
+
+plain = '{"key": "value"}'
+result = strip_markdown_codeblock(plain)
+# '{"key": "value"}'
+```
+
+## Notas
+
+Funcion pura. Aplica cuatro pasos: strip inicial, remover prefijo ` ``` ` o ` ```json ` (case-insensitive), remover sufijo ` ``` `, strip final. Maneja tanto ` ```json ` como ` ``` ` generico. El flag IGNORECASE permite ` ```JSON ` o ` ```Json `.
diff --git a/python/functions/core/strip_markdown_codeblock.py b/python/functions/core/strip_markdown_codeblock.py
new file mode 100644
index 00000000..a4e58908
--- /dev/null
+++ b/python/functions/core/strip_markdown_codeblock.py
@@ -0,0 +1,24 @@
+"""Remove markdown code block wrapping from LLM responses."""
+
+import re
+
+
+def strip_markdown_codeblock(text: str) -> str:
+    """Remove a markdown code fence wrapped around an LLM response.
+
+    LLMs sometimes wrap JSON (or other content) in markdown code fences like:
+        ```json
+        {"key": "value"}
+        ```
+    The opening fence is removed together with its language tag: ``json`` in
+    any position, or any other tag that stands alone on the fence line.
+
+    Args:
+        text: Text that may be wrapped in a markdown code block.
+
+    Returns:
+        Unwrapped text with leading/trailing whitespace stripped.
+    """
+    opening = r'^```(?:[\w+.#-]+(?=[ \t]*\r?\n)|json\b)?\s*\n?'
+    text = re.sub(opening, '', text.strip(), flags=re.IGNORECASE)
+    return re.sub(r'\n?```\s*$', '', text).strip()
diff --git a/python/functions/core/strip_markdown_codeblock_test.py b/python/functions/core/strip_markdown_codeblock_test.py
new file mode 100644
index 00000000..d3098fea
--- /dev/null
+++ b/python/functions/core/strip_markdown_codeblock_test.py
@@ -0,0 +1,28 @@
+"""Tests para strip_markdown_codeblock."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from strip_markdown_codeblock import strip_markdown_codeblock
+
+
+def test_JSON_sin_codeblock_no_cambia():
+    """Plain JSON without a fence comes back unchanged."""
+    assert strip_markdown_codeblock('{"key": "value"}') == '{"key": "value"}'
+
+
+def test_JSON_con_wrapper_json_es_desenvuelto():
+    """A fence tagged ``json`` is removed, leaving only the payload."""
+    assert strip_markdown_codeblock('```json\n{"key": "value"}\n```') == '{"key": "value"}'
+
+
+def test_JSON_con_wrapper_solo_es_desenvuelto():
+    """A bare fence without a language tag is removed as well."""
+    assert strip_markdown_codeblock('```\n{"key": "value"}\n```') == '{"key": "value"}'
+
+
+def test_texto_que_no_es_JSON_pasa_sin_cambio():
+    """Unfenced non-JSON text passes through untouched."""
+    assert strip_markdown_codeblock("hello world") == "hello world"
diff --git a/python/functions/core/strip_think_tags.md b/python/functions/core/strip_think_tags.md
new file mode 100644
index 00000000..b641b6cb
--- /dev/null
+++ b/python/functions/core/strip_think_tags.md
@@ -0,0 +1,37 @@
+---
+name: strip_think_tags
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def strip_think_tags(text: str) -> str"
+description: "Remueve tags <think>...</think> que algunos modelos (MiniMax, DeepSeek) incluyen en sus respuestas como chain of thought interno."
+tags: [llm, text, cleaning, think, chain-of-thought, deepseek, minimax]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests: ["texto sin tags no cambia", "texto con think tags los remueve", "tags multilinea son removidos", "multiples tags son removidos"]
+test_file_path: "python/functions/core/strip_think_tags_test.py"
+file_path: "python/functions/core/strip_think_tags.py"
+---
+
+## Ejemplo
+
+```python
+raw = "<think>Let me reason step by step...</think>La respuesta es 42."
+result = strip_think_tags(raw)
+# "La respuesta es 42."
+
+multiline = "<think>\nPrimero analizo...\nLuego concluyo...\n</think>Respuesta final."
+result = strip_think_tags(multiline)
+# "Respuesta final."
+```
+
+## Notas
+
+Funcion pura. Usa `re.sub` con el patron `[\s\S]*?` (lazy) para manejar contenido multilinea sin capturar de mas cuando hay multiples tags. El `.strip()` final elimina whitespace residual alrededor del texto util.
diff --git a/python/functions/core/strip_think_tags.py b/python/functions/core/strip_think_tags.py
new file mode 100644
index 00000000..644a5348
--- /dev/null
+++ b/python/functions/core/strip_think_tags.py
@@ -0,0 +1,20 @@
+"""Remove <think>...</think> tags from LLM responses."""
+
+import re
+
+
+def strip_think_tags(text: str) -> str:
+    """Drop every <think>...</think> block from an LLM response.
+
+    Some models (MiniMax, DeepSeek) emit their chain of thought wrapped in
+    <think> tags as part of the output. Each block is matched lazily, so
+    several blocks are removed independently, and a block may span lines.
+
+    Args:
+        text: Raw LLM response that may contain <think>...</think> blocks.
+
+    Returns:
+        The text without those blocks and without outer whitespace.
+    """
+    without_blocks = re.sub(r'<think>[\s\S]*?</think>', '', text)
+    return without_blocks.strip()
diff --git a/python/functions/core/strip_think_tags_test.py b/python/functions/core/strip_think_tags_test.py
new file mode 100644
index 00000000..06332496
--- /dev/null
+++ b/python/functions/core/strip_think_tags_test.py
@@ -0,0 +1,30 @@
+"""Tests para strip_think_tags."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from strip_think_tags import strip_think_tags
+
+
+def test_texto_sin_tags_no_cambia():
+    """Text without think tags is returned as is."""
+    assert strip_think_tags("La respuesta es 42.") == "La respuesta es 42."
+
+
+def test_texto_con_think_tags_los_remueve():
+    """A single leading think block is stripped from the answer."""
+    assert strip_think_tags("<think>razonamiento interno</think>La respuesta es 42.") == "La respuesta es 42."
+
+
+def test_tags_multilinea_son_removidos():
+    """A think block spanning several lines is removed entirely."""
+    raw = "<think>\nPrimero analizo...\nLuego concluyo...\n</think>Respuesta final."
+    assert strip_think_tags(raw) == "Respuesta final."
+
+
+def test_multiples_tags_son_removidos():
+    """Every think block is removed; the text between them is kept."""
+    raw = "<think>paso 1</think>Parte A<think>paso 2</think>Parte B"
+    assert strip_think_tags(raw) == "Parte AParte B"
diff --git a/python/functions/core/t.md b/python/functions/core/t.md
new file mode 100644
index 00000000..bd408f0a
--- /dev/null
+++ b/python/functions/core/t.md
@@ -0,0 +1,58 @@
+---
+name: t
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def t(key: str, locale: str | None = None, **kwargs) -> str"
+description: "Traduce una clave con dot-path notation al idioma activo. Prioridad: parametro locale > thread-local > default. Soporta interpolacion {variable}. Fallback al locale default si la clave no existe; si tampoco existe, retorna la key."
+tags: [i18n, translation, locale, dot-path, interpolation]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [threading]
+tested: true
+tests: ["key existente retorna traduccion", "key inexistente retorna la key", "interpolacion de variables", "dot-path profundo", "fallback a locale default"]
+test_file_path: "python/functions/core/t_test.py"
+file_path: "python/functions/core/t.py"
+---
+
+## Ejemplo
+
+```python
+from t import t, _set_translations
+
+translations = {
+    "en": {
+        "report": {
+            "sectionStart": "Generating section: {title}",
+            "done": "Done"
+        }
+    },
+    "es": {
+        "report": {
+            "done": "Listo"
+        }
+    }
+}
+_set_translations(translations, default_locale="en")
+
+t("report.sectionStart", locale="en", title="Introduction")
+# → "Generating section: Introduction"
+
+t("report.done", locale="es")
+# → "Listo"
+
+t("report.sectionStart", locale="es", title="Intro")
+# → "Generating section: Intro"  (fallback a en)
+
+t("nonexistent.key", locale="en")
+# → "nonexistent.key"
+```
+
+## Notas
+
+Lee estado global (modulo-level `_translations` y `_locale_local` thread-local), por eso es impura. Configurar con `_set_translations()` al inicio de la aplicacion y `_set_locale()` por thread/request. La interpolacion usa `str.format(**kwargs)` — si hay un placeholder faltante, retorna el string sin interpolar para no romper el flujo. Inspirada conceptualmente en el modulo `locale.py` de MiroFish (AGPL-3.0); reimplementada desde cero.
diff --git a/python/functions/core/t.py b/python/functions/core/t.py
new file mode 100644
index 00000000..bebcf702
--- /dev/null
+++ b/python/functions/core/t.py
@@ -0,0 +1,91 @@
+"""Traduccion de claves con dot-path notation e interpolacion de variables."""
+
+import threading
+
+_locale_local = threading.local()  # per-thread storage for the active locale
+_translations: dict = {}  # locale code -> nested translation dict
+_default_locale: str = "en"  # locale used when a thread has none, and as fallback
+
+
+def _set_translations(translations: dict, default_locale: str = "en") -> None:
+    """Install the module-wide translations dict and the default locale.
+
+    Call it from load_translations or at application start-up.
+    """
+    global _translations, _default_locale
+    _default_locale = default_locale
+    _translations = translations
+
+
+def _set_locale(locale: str) -> None:
+    """Set the locale for the current thread."""
+    _locale_local.locale = locale
+
+
+def _get_locale() -> str:
+    """Return the current thread's locale, or the default locale if none was set."""
+    return getattr(_locale_local, "locale", _default_locale)
+
+
+def _resolve_key(translations: dict, locale: str, key: str) -> str | None:
+    """Walk a dot-path through one locale's translation tree.
+
+    Args:
+        translations: Mapping of locale code to nested translation dicts.
+        locale: Locale whose tree is searched.
+        key: Dot-separated path, one dict level per segment.
+
+    Returns:
+        The translation when the path ends on a string, otherwise None.
+    """
+    current = translations.get(locale)
+    if not current:
+        return None
+
+    for segment in key.split("."):
+        current = current.get(segment) if isinstance(current, dict) else None
+        if current is None:
+            return None
+    return current if isinstance(current, str) else None
+
+
+def t(key: str, locale: str | None = None, **kwargs) -> str:
+    """Translate a dot-path key into the active locale.
+
+    The locale is chosen by priority: ``locale`` argument, then the calling
+    thread's locale, then the default locale. A key missing from the chosen
+    locale is looked up in the default locale; if it is missing there too,
+    the key itself is returned.
+
+    Args:
+        key: Translation key in dot-path notation (e.g. "report.taskStarted").
+        locale: Locale to use. None means the calling thread's locale.
+        **kwargs: Values interpolated into ``{name}`` placeholders.
+
+    Returns:
+        The translated string with placeholders filled in, or ``key`` when no
+        translation exists. If any placeholder cannot be resolved, the
+        translation is returned without interpolation.
+
+    Example:
+        >>> # translations = {"en": {"report": {"sectionStart": "Section: {title}"}}}
+        >>> t("report.sectionStart", locale="en", title="Introduction")
+        'Section: Introduction'
+        >>> t("nonexistent.key", locale="en")
+        'nonexistent.key'
+    """
+    resolved_locale = locale if locale is not None else _get_locale()
+    value = _resolve_key(_translations, resolved_locale, key)
+    if value is None and resolved_locale != _default_locale:
+        value = _resolve_key(_translations, _default_locale, key)
+    if value is None:
+        return key
+    if not kwargs:
+        return value
+
+    try:
+        return value.format(**kwargs)
+    except (AttributeError, LookupError, TypeError, ValueError):
+        # Best effort: a positional ({0}) or otherwise unresolvable placeholder
+        # must not break the caller, so the raw template is returned instead.
+        return value
diff --git a/python/functions/core/t_test.py b/python/functions/core/t_test.py
new file mode 100644
index 00000000..1f760604
--- /dev/null
+++ b/python/functions/core/t_test.py
@@ -0,0 +1,83 @@
+"""Tests para t (translate)."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from t import t, _set_translations, _set_locale
+
+TRANSLATIONS = {  # fixture shared by every test in this module
+    "en": {
+        "report": {
+            "sectionStart": "Generating section: {title}",
+            "done": "Done",
+            "deep": {
+                "nested": {
+                    "value": "Deep value"
+                }
+            }
+        },
+        "greeting": "Hello"
+    },
+    "es": {  # only defines report.done; the fallback test relies on that
+        "report": {
+            "done": "Listo"
+        }
+    }
+}
+
+
+def setup():  # (re)installs the shared fixture; each test calls it explicitly
+    _set_translations(TRANSLATIONS, default_locale="en")
+
+
+def test_key_existente_retorna_traduccion():
+    setup()
+    result = t("report.done", locale="en")  # key present in the requested locale
+    assert result == "Done", f"Expected 'Done', got '{result}'"
+
+
+def test_key_inexistente_retorna_la_key():
+    setup()
+    result = t("nonexistent.key", locale="en")  # missing everywhere: the key is echoed back
+    assert result == "nonexistent.key", f"Expected 'nonexistent.key', got '{result}'"
+
+
+def test_interpolacion_de_variables():
+    setup()
+    result = t("report.sectionStart", locale="en", title="Introduction")  # fills {title}
+    assert result == "Generating section: Introduction", f"Got '{result}'"
+
+
+def test_dot_path_profundo():
+    setup()
+    result = t("report.deep.nested.value", locale="en")  # four-segment path
+    assert result == "Deep value", f"Expected 'Deep value', got '{result}'"
+
+
+def test_fallback_a_locale_default():
+    setup()
+    # "report.sectionStart" does not exist in "es", so it must fall back to "en"
+    result = t("report.sectionStart", locale="es", title="Intro")
+    assert result == "Generating section: Intro", f"Got '{result}'"
+
+
+if __name__ == "__main__":  # lets the file run as a plain script, one test after another
+    test_key_existente_retorna_traduccion()
+    print("PASS: key existente retorna traduccion")
+
+    test_key_inexistente_retorna_la_key()
+    print("PASS: key inexistente retorna la key")
+
+    test_interpolacion_de_variables()
+    print("PASS: interpolacion de variables")
+
+    test_dot_path_profundo()
+    print("PASS: dot-path profundo")
+
+    test_fallback_a_locale_default()
+    print("PASS: fallback a locale default")
+
+    print("---")
+    print("All tests passed.")
diff --git a/python/functions/core/task_manager.md b/python/functions/core/task_manager.md
new file mode 100644
index 00000000..4cdbbff6
--- /dev/null
+++ b/python/functions/core/task_manager.md
@@ -0,0 +1,58 @@
+---
+name: task_manager
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "class TaskManager: create_task(task_type: str, metadata: dict | None = None) -> str; get_task(task_id: str) -> Task | None; update_task(task_id: str, ...) -> None; complete_task(task_id: str, result: dict) -> None; fail_task(task_id: str, error: str) -> None; list_tasks(task_type: str | None = None) -> list[dict]; cleanup_old_tasks(max_age_hours: int = 24) -> None"
+description: "Gestor de tareas background en memoria, thread-safe, con patron singleton. Permite crear, actualizar, completar y limpiar tareas de larga duracion. Patron tipico: hilo background actualiza progreso via update_task(), frontend hace polling via get_task()."
+tags: [task, manager, singleton, thread-safe, background, async, polling, progress]
+uses_functions: []
+uses_types: [task_py_core, task_status_py_core]
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [threading, uuid, datetime]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/task_manager.py"
+---
+
+## Ejemplo
+
+```python
+manager = TaskManager()
+
+# Crear tarea
+task_id = manager.create_task("graph_build", metadata={"nodes": 500})
+
+# Hilo background actualiza progreso
+manager.update_task(task_id, status=TaskStatus.PROCESSING, progress=25, message="Cargando datos...")
+manager.update_task(task_id, progress=75, message="Construyendo grafo...")
+manager.complete_task(task_id, result={"graph_id": "abc-123"})
+
+# Frontend hace polling
+task = manager.get_task(task_id)
+print(task.to_dict())
+# {"task_id": "...", "status": "completed", "progress": 100, ...}
+
+# Listar todas las tareas de un tipo
+tasks = manager.list_tasks(task_type="graph_build")
+
+# Limpiar tareas antiguas (> 24h)
+manager.cleanup_old_tasks(max_age_hours=24)
+```
+
+## Notas
+
+Singleton con double-checked locking: la primera instancia se crea una sola vez aunque multiples hilos llamen a `TaskManager()` simultaneamente.
+
+El lock `_tasks_lock` protege el diccionario interno `_tasks` en todas las operaciones de lectura y escritura.
+
+`update_task()` solo actualiza los campos que recibe como no-None, lo que permite actualizaciones parciales sin sobreescribir el estado existente.
+
+`cleanup_old_tasks()` solo elimina tareas en estado terminal (COMPLETED o FAILED). Las tareas PENDING o PROCESSING no se eliminan independientemente de su edad.
+
+El `error_type` referencia `error_go_core` como convencion del registry — los errores reales en Python se lanzan como excepciones, no se retornan.
diff --git a/python/functions/core/task_manager.py b/python/functions/core/task_manager.py
new file mode 100644
index 00000000..1c18eb6a
--- /dev/null
+++ b/python/functions/core/task_manager.py
@@ -0,0 +1,176 @@
+"""TaskManager — gestor de tareas background thread-safe con patron singleton."""
+
+import threading
+import uuid
+from datetime import datetime, timezone
+
+from task_status import TaskStatus
+from task import Task
+
+
+class TaskManager:
+    """In-memory, thread-safe background-task registry, shared as a singleton.
+
+    Typical usage:
+        manager = TaskManager()
+        task_id = manager.create_task("graph_build", metadata={"nodes": 500})
+        # in a background thread:
+        manager.update_task(task_id, progress=50, message="Processing nodes...")
+        manager.complete_task(task_id, result={"graph_id": "abc"})
+        # from the frontend (polling):
+        task = manager.get_task(task_id)
+    """
+
+    _instance: "TaskManager | None" = None
+    _lock: threading.Lock = threading.Lock()
+
+    def __new__(cls) -> "TaskManager":
+        if cls._instance is None:  # fast path: skip the lock once created
+            with cls._lock:
+                if cls._instance is None:  # re-check: another thread may have won
+                    instance = super().__new__(cls)
+                    instance._tasks: dict[str, Task] = {}
+                    instance._tasks_lock = threading.Lock()
+                    cls._instance = instance
+        return cls._instance
+
+    # ------------------------------------------------------------------
+    # Public operations
+    # ------------------------------------------------------------------
+
+    def create_task(self, task_type: str, metadata: dict | None = None) -> str:
+        """Create a new task in PENDING state and return its task_id.
+
+        Args:
+            task_type: Free-form identifier of the task kind ("graph_build", etc.).
+            metadata:  Extra data attached to the task (input parameters).
+
+        Returns:
+            task_id — UUID string of the created task.
+        """
+        task_id = str(uuid.uuid4())
+        now = datetime.now(timezone.utc)
+        task = Task(
+            task_id=task_id,
+            task_type=task_type,
+            status=TaskStatus.PENDING,
+            created_at=now,
+            updated_at=now,
+            metadata=metadata or {},
+        )
+        with self._tasks_lock:
+            self._tasks[task_id] = task
+        return task_id
+
+    def get_task(self, task_id: str) -> Task | None:
+        """Return the task with the given id, or None if it does not exist.
+
+        Args:
+            task_id: UUID of the task.
+
+        Returns:
+            The stored Task object itself (not a copy), or None.
+        """
+        with self._tasks_lock:
+            return self._tasks.get(task_id)
+
+    def update_task(
+        self,
+        task_id: str,
+        status: TaskStatus | None = None,
+        progress: int | None = None,
+        message: str | None = None,
+        result: dict | None = None,
+        error: str | None = None,
+        progress_detail: dict | None = None,
+    ) -> None:
+        """Update fields of an existing task; unknown task ids are ignored.
+
+        Only arguments that are not None are applied. updated_at is always refreshed.
+
+        Args:
+            task_id:         UUID of the task to update.
+            status:          New state (TaskStatus), optional.
+            progress:        New percentage 0-100, optional.
+            message:         New status message, optional.
+            result:          Result of the task, optional.
+            error:           Error message, optional.
+            progress_detail: Fine-grained progress details, optional.
+        """
+        with self._tasks_lock:
+            task = self._tasks.get(task_id)
+            if task is None:
+                return
+            if status is not None:
+                task.status = status
+            if progress is not None:
+                task.progress = progress
+            if message is not None:
+                task.message = message
+            if result is not None:
+                task.result = result
+            if error is not None:
+                task.error = error
+            if progress_detail is not None:
+                task.progress_detail = progress_detail
+            task.updated_at = datetime.now(timezone.utc)
+
+    def complete_task(self, task_id: str, result: dict) -> None:
+        """Mark the task as COMPLETED, at 100% progress, with the given result.
+
+        Args:
+            task_id: UUID of the task.
+            result:  Serializable result of the finished task.
+        """
+        self.update_task(
+            task_id,
+            status=TaskStatus.COMPLETED,
+            progress=100,
+            result=result,
+        )
+
+    def fail_task(self, task_id: str, error: str) -> None:
+        """Mark the task as FAILED with the given error message.
+
+        Args:
+            task_id: UUID of the task.
+            error:   Description of the error.
+        """
+        self.update_task(
+            task_id,
+            status=TaskStatus.FAILED,
+            error=error,
+        )
+
+    def list_tasks(self, task_type: str | None = None) -> list[dict]:
+        """List tasks as dicts (via Task.to_dict()).
+
+        Args:
+            task_type: When given, only tasks of this type are listed;
+                None lists every task.
+        """
+        with self._tasks_lock:
+            # Serialize while holding the lock so a concurrent update_task()
+            # cannot modify a task halfway through to_dict().
+            return [
+                task.to_dict() for task in self._tasks.values()
+                if task_type is None or task.task_type == task_type
+            ]
+
+    def cleanup_old_tasks(self, max_age_hours: int = 24) -> None:
+        """Delete finished tasks (COMPLETED or FAILED) not updated for max_age_hours.
+
+        Args:
+            max_age_hours: Maximum age in hours, measured from updated_at. Default: 24.
+        """
+        now = datetime.now(timezone.utc)
+        terminal = {TaskStatus.COMPLETED, TaskStatus.FAILED}
+        with self._tasks_lock:
+            to_delete = [
+                tid
+                for tid, task in self._tasks.items()
+                if task.status in terminal
+                and (now - task.updated_at).total_seconds() > max_age_hours * 3600
+            ]
+            for tid in to_delete:
+                del self._tasks[tid]
diff --git a/python/functions/core/to_csv.md b/python/functions/core/to_csv.md
new file mode 100644
index 00000000..bfedfbeb
--- /dev/null
+++ b/python/functions/core/to_csv.md
@@ -0,0 +1,48 @@
+---
+name: to_csv
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "to_csv(rows: list[dict], columns: list[str] | None = None, delimiter: str = ',', include_header: bool = True) -> str"
+description: "Serializa datos tabulares a CSV. Si columns es None, usa las keys de la primera fila. Escapa campos con comillas, newlines y delimiters correctamente (RFC 4180). Sin dependencias externas."
+tags: [csv, serialization, export, tabular, format]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "lista simple csv con header"
+  - "campos con comas comillas newlines"
+  - "columns explicitas reordena y filtra"
+  - "include header false"
+  - "lista vacia retorna string vacio"
+  - "valores none campo vacio"
+test_file_path: "python/functions/core/to_csv_test.py"
+file_path: "python/functions/core/to_csv.py"
+---
+
+## Ejemplo
+
+```python
+rows = [{"nombre": "Ana", "edad": 30}, {"nombre": "Bob", "edad": 25}]
+csv = to_csv(rows)
+# "nombre,edad\r\nAna,30\r\nBob,25"
+
+# Con columnas especificas
+csv = to_csv(rows, columns=["edad", "nombre"])
+# "edad,nombre\r\n30,Ana\r\n25,Bob"
+
+# Sin header
+csv = to_csv(rows, include_header=False)
+# "Ana,30\r\nBob,25"
+```
+
+## Notas
+
+Implementacion manual sin el modulo csv de stdlib para control total del escaping.
+Cumple RFC 4180: escapa con comillas dobles cualquier campo que contenga el delimiter, comillas o saltos de linea. Los valores None se serializan como campo vacio.
diff --git a/python/functions/core/to_csv.py b/python/functions/core/to_csv.py
new file mode 100644
index 00000000..664237ec
--- /dev/null
+++ b/python/functions/core/to_csv.py
@@ -0,0 +1,53 @@
+"""Serializa datos tabulares a formato CSV (RFC 4180)."""
+
+
+def _escape_field(value: str, delimiter: str) -> str:
+    """Quote *value* per RFC 4180 when it holds the delimiter, a quote, CR or LF."""
+    specials = (delimiter, '"', "\n", "\r")
+    needs_quotes = any(token in value for token in specials)
+    return '"' + value.replace('"', '""') + '"' if needs_quotes else value
+
+
+def to_csv(
+    rows: list[dict],
+    columns: list[str] | None = None,
+    delimiter: str = ",",
+    include_header: bool = True,
+) -> str:
+    """Serialize tabular data to a CSV string.
+
+    Hand-rolled (no ``csv`` module) to keep full control over escaping.
+    Fields holding the delimiter, quotes or line breaks are quoted (RFC 4180).
+
+    Args:
+        rows: Dicts holding the data; a key absent from a row, or mapped to
+              None, yields an empty field.
+        columns: Columns to emit, in order. When None, the keys of the first
+                 row are used in insertion order.
+        delimiter: Field separator. Defaults to a comma.
+        include_header: When True, the first record is the header row.
+
+    Returns:
+        The CSV text, records joined by CRLF with no trailing line break.
+        Empty string when rows is empty.
+    """
+    if not rows:
+        return ""
+
+    cols = columns if columns is not None else list(rows[0].keys())
+
+    records: list[str] = []
+
+    if include_header:
+        # str() mirrors the cell handling below, so non-string keys (e.g. ints
+        # taken from the first row) no longer crash inside _escape_field.
+        records.append(delimiter.join(_escape_field(str(c), delimiter) for c in cols))
+
+    for row in rows:
+        cells = []
+        for col in cols:
+            raw = row.get(col)
+            cells.append("" if raw is None else _escape_field(str(raw), delimiter))
+        records.append(delimiter.join(cells))
+
+    return "\r\n".join(records)
diff --git a/python/functions/core/to_csv_test.py b/python/functions/core/to_csv_test.py
new file mode 100644
index 00000000..3ff40889
--- /dev/null
+++ b/python/functions/core/to_csv_test.py
@@ -0,0 +1,48 @@
+"""Tests para to_csv."""
+
+from to_csv import to_csv
+
+
+def test_lista_simple_csv_con_header():
+    """Header row first, then one CRLF-separated record per dict."""
+    people = [{"nombre": "Ana", "edad": 30}, {"nombre": "Bob", "edad": 25}]
+    records = to_csv(people).split("\r\n")
+    assert records[0] == "nombre,edad"
+    assert records[1] == "Ana,30"
+    assert records[2] == "Bob,25"
+
+
+def test_campos_con_comas_comillas_newlines():
+    """Quotes, delimiters and newlines force quoting; quotes are doubled."""
+    tricky = [{"a": 'dijo "hola"', "b": "uno,dos", "c": "linea1\nlinea2"}]
+    data_row = to_csv(tricky).split("\r\n")[1]
+    assert '"dijo ""hola"""' in data_row
+    assert '"uno,dos"' in data_row
+    assert '"linea1\nlinea2"' in data_row
+
+
+def test_columns_explicitas_reordena_y_filtra():
+    """Explicit columns set both the order and the subset that is emitted."""
+    source = [{"a": 1, "b": 2, "c": 3}]
+    records = to_csv(source, columns=["c", "a"]).split("\r\n")
+    assert records[0] == "c,a"
+    assert records[1] == "3,1"
+
+
+def test_include_header_false():
+    """With include_header=False only the data record is produced."""
+    source = [{"x": "foo", "y": "bar"}]
+    records = to_csv(source, include_header=False).split("\r\n")
+    assert len(records) == 1
+    assert records[0] == "foo,bar"
+
+
+def test_lista_vacia_retorna_string_vacio():  # no rows -> empty string
+    assert "" == to_csv([])
+
+
+def test_valores_none_campo_vacio():
+    """None is written as an empty field."""
+    records = to_csv([{"a": "ok", "b": None}]).split("\r\n")
+    data_row = records[1]
+    assert data_row == "ok,"
diff --git a/python/functions/core/to_jsonl.md b/python/functions/core/to_jsonl.md
new file mode 100644
index 00000000..f7a26ca1
--- /dev/null
+++ b/python/functions/core/to_jsonl.md
@@ -0,0 +1,43 @@
+---
+name: to_jsonl
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "to_jsonl(rows: list[dict]) -> str"
+description: "Serializa a JSON Lines (newline-delimited JSON). Cada dict se serializa como una linea JSON independiente. Util para streaming, logging estructurado y formatos de intercambio."
+tags: [jsonl, json, serialization, export, streaming, format]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["json"]
+tested: true
+tests:
+  - "lista de dicts jsonl"
+  - "valores unicode none nested"
+  - "lista vacia retorna string vacio"
+  - "cada linea es json parseable"
+test_file_path: "python/functions/core/to_jsonl_test.py"
+file_path: "python/functions/core/to_jsonl.py"
+---
+
+## Ejemplo
+
+```python
+rows = [{"id": 1, "name": "Ana"}, {"id": 2, "name": "Bob"}]
+jsonl = to_jsonl(rows)
+# '{"id": 1, "name": "Ana"}\n{"id": 2, "name": "Bob"}'
+
+# Unicode preservado
+rows = [{"emoji": "🔥", "val": None}]
+jsonl = to_jsonl(rows)
+# '{"emoji": "🔥", "val": null}'
+```
+
+## Notas
+
+Usa json.dumps con ensure_ascii=False para preservar Unicode. No incluye newline al final.
+Cada linea es un JSON valido independiente, parseable con json.loads.
diff --git a/python/functions/core/to_jsonl.py b/python/functions/core/to_jsonl.py
new file mode 100644
index 00000000..e1e7e059
--- /dev/null
+++ b/python/functions/core/to_jsonl.py
@@ -0,0 +1,23 @@
+"""Serializa lista de dicts a formato JSON Lines (JSONL)."""
+
+import json
+
+
+def to_jsonl(rows: list[dict]) -> str:
+    """Serialize dicts to JSON Lines (newline-delimited JSON).
+
+    Each dict becomes one standalone JSON document and the documents are
+    joined with \\n. Non-ASCII text is kept as-is (ensure_ascii=False).
+
+    Args:
+        rows: Dicts to serialize; values may be nested, Unicode, None,
+              numbers, etc.
+
+    Returns:
+        One JSON document per line, with no trailing newline.
+        Empty string when rows is empty.
+    """
+    if not rows:
+        return ""
+    encoded = [json.dumps(row, ensure_ascii=False) for row in rows]
+    return "\n".join(encoded)
diff --git a/python/functions/core/to_jsonl_test.py b/python/functions/core/to_jsonl_test.py
new file mode 100644
index 00000000..0c7356ec
--- /dev/null
+++ b/python/functions/core/to_jsonl_test.py
@@ -0,0 +1,34 @@
+"""Tests para to_jsonl."""
+
+import json
+
+from to_jsonl import to_jsonl
+
+
+def test_lista_de_dicts_jsonl():
+    """Two dicts produce two lines that decode back to the originals."""
+    source = [{"a": 1}, {"b": 2}]
+    encoded = to_jsonl(source).split("\n")
+    assert len(encoded) == 2
+    assert json.loads(encoded[0]) == {"a": 1}
+    assert json.loads(encoded[1]) == {"b": 2}
+
+
+def test_valores_unicode_none_nested():
+    """Unicode, None and nested dicts survive a dump/load round trip."""
+    source = [{"emoji": "🔥", "nulo": None, "nested": {"x": 1}}]
+    decoded = json.loads(to_jsonl(source))
+    assert decoded["emoji"] == "🔥"
+    assert decoded["nulo"] is None
+    assert decoded["nested"] == {"x": 1}
+
+
+def test_lista_vacia_retorna_string_vacio():  # no rows -> empty string
+    assert "" == to_jsonl([])
+
+
+def test_cada_linea_es_json_parseable():
+    """Every output line is a standalone JSON document, in input order."""
+    encoded = to_jsonl([{"k": i} for i in range(5)]).split("\n")
+    for index, line in enumerate(encoded):
+        assert json.loads(line) == {"k": index}
diff --git a/python/functions/core/to_pascal_case.md b/python/functions/core/to_pascal_case.md
new file mode 100644
index 00000000..d860def3
--- /dev/null
+++ b/python/functions/core/to_pascal_case.md
@@ -0,0 +1,48 @@
+---
+name: to_pascal_case
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def to_pascal_case(name: str) -> str"
+description: "Convierte cualquier formato de nombre (snake_case, camelCase, kebab-case, mixto) a PascalCase. Retorna \"Unknown\" para entrada vacia."
+tags: [string, naming, pascal-case, case-conversion, formatting, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests:
+  - "snake_case"
+  - "camelCase"
+  - "UPPER_SNAKE"
+  - "kebab-case"
+  - "pascal_case_idempotente"
+  - "string_vacio"
+  - "con_numeros"
+test_file_path: "python/functions/core/to_pascal_case_test.py"
+file_path: "python/functions/core/to_pascal_case.py"
+---
+
+## Ejemplo
+
+```python
+to_pascal_case("works_for")        # "WorksFor"
+to_pascal_case("camelCaseExample") # "CamelCaseExample"
+to_pascal_case("SCREAMING_SNAKE")  # "ScreamingSnake"
+to_pascal_case("kebab-case")       # "KebabCase"
+to_pascal_case("")                 # "Unknown"
+```
+
+## Notas
+
+Funcion pura, sin dependencias externas (solo `re` de stdlib).
+
+Algoritmo en dos pasos:
+1. Split por caracteres no alfanumericos: `re.split(r'[^a-zA-Z0-9]+', name)`
+2. Split adicional por boundary camelCase en cada parte: `re.sub(r'([a-z])([A-Z])', r'\1_\2', part).split('_')`
+
+Cada word resultante se capitaliza (primera letra upper, resto lower) y se concatenan. Si no hay words validas, retorna `"Unknown"`.
diff --git a/python/functions/core/to_pascal_case.py b/python/functions/core/to_pascal_case.py
new file mode 100644
index 00000000..71971e14
--- /dev/null
+++ b/python/functions/core/to_pascal_case.py
@@ -0,0 +1,28 @@
+"""Convert any name format to PascalCase."""
+
+import re
+
+
+def to_pascal_case(name: str) -> str:
+    """Return *name* rewritten in PascalCase.
+
+    Accepts snake_case, camelCase, kebab-case, SCREAMING_SNAKE and mixtures.
+    The input is cut on non-alphanumeric runs and on lower-to-upper
+    transitions; each piece is capitalized and the pieces are concatenated.
+
+    Args:
+        name: Identifier in any of the formats above.
+
+    Returns:
+        str: The PascalCase form, or "Unknown" when no word is found.
+    """
+    words = []
+    # First cut: runs of anything that is not an ASCII letter or digit.
+    for chunk in re.split(r'[^a-zA-Z0-9]+', name):
+        if not chunk:
+            continue
+        # Second cut: lower->upper transitions ("camelCase" -> "camel_Case").
+        marked = re.sub(r'([a-z])([A-Z])', r'\1_\2', chunk)
+        pieces = [w for w in marked.split('_') if w]
+        words.extend(w[:1].upper() + w[1:].lower() for w in pieces)
+    return ''.join(words) or 'Unknown'
diff --git a/python/functions/core/to_pascal_case_test.py b/python/functions/core/to_pascal_case_test.py
new file mode 100644
index 00000000..1c8c491f
--- /dev/null
+++ b/python/functions/core/to_pascal_case_test.py
@@ -0,0 +1,35 @@
+"""Tests para to_pascal_case."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from to_pascal_case import to_pascal_case
+
+
+def test_snake_case():  # underscore-separated words
+    assert "WorksFor" == to_pascal_case("works_for")
+
+
+def test_camel_case():  # words split at lower->upper transitions
+    assert "CamelCaseExample" == to_pascal_case("camelCaseExample")
+
+
+def test_upper_snake():  # all-caps words keep only their first letter upper
+    assert "ScreamingSnake" == to_pascal_case("SCREAMING_SNAKE")
+
+
+def test_kebab_case():  # hyphen-separated words
+    assert "KebabCase" == to_pascal_case("kebab-case")
+
+
+def test_pascal_case_idempotente():  # PascalCase input comes back unchanged
+    assert "PersonName" == to_pascal_case("PersonName")
+
+
+def test_string_vacio():  # empty input falls back to the placeholder
+    assert "Unknown" == to_pascal_case("")
+
+
+def test_con_numeros():  # a lone digit counts as its own word
+    assert "Field2Value" == to_pascal_case("field_2_value")
diff --git a/python/functions/core/tree_to_flat_list.md b/python/functions/core/tree_to_flat_list.md
new file mode 100644
index 00000000..33616c47
--- /dev/null
+++ b/python/functions/core/tree_to_flat_list.md
@@ -0,0 +1,36 @@
+---
+name: tree_to_flat_list
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def tree_to_flat_list(structure: Any) -> list[dict]"
+description: "Convierte arbol jerarquico a lista plana en orden DFS. Mantiene nodos internos con sus campos."
+tags: [tree, flatten, dfs, hierarchy]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = {"title": "Root", "nodes": [{"title": "Child", "nodes": []}]}
+tree_to_flat_list(tree)
+# [{"title": "Root", "nodes": [...]}, {"title": "Child", "nodes": []}]
+```
+
+## Notas
+
+Funcion pura. A diferencia de flatten_tree, mantiene los nodos internos con su campo 'nodes' intacto (referencias, no copias).
diff --git a/python/functions/core/validate_git_ssh_uri.md b/python/functions/core/validate_git_ssh_uri.md
new file mode 100644
index 00000000..da1d0310
--- /dev/null
+++ b/python/functions/core/validate_git_ssh_uri.md
@@ -0,0 +1,45 @@
+---
+name: validate_git_ssh_uri
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def validate_git_ssh_uri(url: str) -> None"
+description: "Valida el formato de una URI SSH de git (git@host:path). Lanza ValueError si la URI es invalida."
+tags: [git, ssh, validation, uri]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "URI valida"
+  - "URI sin git@"
+  - "URI sin colon"
+  - "URI con path vacio"
+test_file_path: "python/functions/core/parse_git_url_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+validate_git_ssh_uri("git@github.com:org/repo.git")
+# None (sin excepcion)
+
+validate_git_ssh_uri("https://github.com/org/repo")
+# ValueError: git SSH URI must start with 'git@', got: 'https://github.com/org/repo'
+
+validate_git_ssh_uri("git@github.com/org/repo")
+# ValueError: git SSH URI must contain ':'
+
+validate_git_ssh_uri("git@github.com:")
+# ValueError: git SSH URI must have a non-empty path after ':'
+```
+
+## Notas
+
+Funcion pura. Valida solo la estructura sintactica del formato SSH de git: prefijo `git@`, separador `:`, y path no vacio. No verifica que el host sea alcanzable ni que el repo exista. Usar antes de `subprocess.run(["git", "clone", uri])` para dar mensajes de error claros al usuario.
diff --git a/python/functions/core/validate_json_schema.md b/python/functions/core/validate_json_schema.md
new file mode 100644
index 00000000..b5c17238
--- /dev/null
+++ b/python/functions/core/validate_json_schema.md
@@ -0,0 +1,58 @@
+---
+name: validate_json_schema
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def validate_json_schema(data: dict, schema: dict) -> tuple[bool, list[str]]"
+description: "Valida un dict contra un subset practico de JSON Schema (draft 2020-12) sin dependencias externas. Soporta type, required, properties, items, minItems/maxItems, minimum/maximum, exclusiveMinimum/exclusiveMaximum, minLength/maxLength, pattern, enum. Acumula todos los errores con paths descriptivos."
+tags: [validation, schema, json-schema, pure, core]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests:
+  - "schema simple con types y required"
+  - "nested object validation"
+  - "array con items schema"
+  - "pattern validation en strings"
+  - "numeric ranges"
+  - "multiples errores acumulados con paths correctos"
+  - "schema vacio acepta todo"
+  - "data none contra required field"
+test_file_path: "python/functions/core/validate_json_schema_test.py"
+file_path: "python/functions/core/validate_json_schema.py"
+---
+
+## Ejemplo
+
+```python
+schema = {
+    "type": "object",
+    "required": ["name", "age"],
+    "properties": {
+        "name": {"type": "string"},
+        "age": {"type": "integer", "minimum": 0},
+        "email": {"type": "string", "pattern": r"^[^@]+@[^@]+$"},
+    },
+}
+
+valid, errors = validate_json_schema({"name": "Alice", "age": 30}, schema)
+# valid = True, errors = []
+
+valid, errors = validate_json_schema({"name": 123, "age": -1}, schema)
+# valid = False
+# errors = [
+#   "$.email: required field missing or empty",  (solo si email esta en required; los required van primero)
+#   "$.name: expected string, got integer",
+#   "$.age: -1 < minimum 0",
+# ]
+```
+
+## Notas
+
+Funcion pura. Solo usa `re` de la stdlib. No cubre el spec completo de JSON Schema — esta disenada para validacion practica de metadata de entities y resultados de queries. El path usa notacion `$.field[i]` para facilitar debugging. Schema vacio acepta cualquier valor (comportamiento estandar JSON Schema).
diff --git a/python/functions/core/validate_json_schema.py b/python/functions/core/validate_json_schema.py
new file mode 100644
index 00000000..1ab57553
--- /dev/null
+++ b/python/functions/core/validate_json_schema.py
@@ -0,0 +1,134 @@
+"""Validacion de dicts contra un subset de JSON Schema (draft 2020-12) sin dependencias externas."""
+
+import re
+
+
+def validate_json_schema(data: dict, schema: dict) -> tuple[bool, list[str]]:
+    """Check *data* against a practical subset of JSON Schema (draft 2020-12).
+
+    Supported keywords: type, required, properties, items, minItems/maxItems,
+    minimum/maximum, exclusiveMinimum/exclusiveMaximum,
+    minLength/maxLength, pattern, enum.
+
+    Args:
+        data: Value to validate.
+        schema: The schema, as a plain dict.
+
+    Returns:
+        ``(valid, errors)``: True with an empty list when the data conforms,
+        else False with path-prefixed messages (e.g. "$.address.zip").
+    """
+    found: list[str] = []
+    _validate(data, schema, "$", found)
+    return (not found, found)
+
+
+_TYPE_CHECKS = {  # JSON Schema type name -> predicate over a Python value
+    "string": lambda x: isinstance(x, str),
+    "number": lambda x: not isinstance(x, bool) and isinstance(x, (int, float)),
+    "integer": lambda x: not isinstance(x, bool) and isinstance(x, int),
+    "boolean": lambda x: isinstance(x, bool),
+    "array": lambda x: isinstance(x, list),
+    "object": lambda x: isinstance(x, dict),
+    "null": lambda x: x is None,
+}
+
+_TYPE_NAMES = {  # exact Python type -> JSON Schema name used in messages
+    str: "string",
+    int: "integer",
+    float: "number",
+    bool: "boolean",
+    list: "array",
+    dict: "object",
+    type(None): "null",
+}
+
+
+def _type_name(v: object) -> str:  # JSON Schema name for v's exact type
+    return _TYPE_NAMES.get(type(v), type(v).__name__)  # else the class name
+
+
+def _validate(data: object, schema: dict, path: str, errors: list[str]) -> None:
+    """Recursively check *data* against *schema*, appending to *errors*.
+
+    The keyword checks applied depend on the runtime type of ``data``.
+
+    Args:
+        data: Value under validation (any JSON-like Python value).
+        schema: Schema fragment applying to ``data``; empty means "accept".
+        path: Location of ``data`` in ``$.field[i]`` notation, for messages.
+        errors: Accumulator mutated in place; nothing is returned.
+    """
+    if not schema:
+        return
+
+    # --- type ---
+    declared = schema.get("type")
+    if declared is not None:
+        # "type" may be one name or a list of alternatives; a list used to
+        # crash the dict lookup (unhashable). Unknown names are ignored.
+        names = declared if isinstance(declared, (list, tuple)) else [declared]
+        known = [n for n in names if isinstance(n, str) and n in _TYPE_CHECKS]
+        if known and not any(_TYPE_CHECKS[n](data) for n in known):
+            wanted = " or ".join(map(str, names))
+            errors.append(f"{path}: expected {wanted}, got {_type_name(data)}")
+            # The base type is wrong, so the keyword checks below are skipped.
+            return
+
+    # --- enum ---
+    if "enum" in schema and data not in schema["enum"]:
+        errors.append(f"{path}: value {data!r} not in enum {schema['enum']}")
+
+    # --- object ---
+    if isinstance(data, dict):
+        for field in schema.get("required", []):
+            # None and "" count as missing (stricter than plain JSON Schema).
+            if field not in data or data[field] is None or data[field] == "":
+                errors.append(f"{path}.{field}: required field missing or empty")
+
+        for field, sub_schema in schema.get("properties", {}).items():
+            if field in data:
+                _validate(data[field], sub_schema, f"{path}.{field}", errors)
+
+    # --- array ---
+    elif isinstance(data, list):
+        size = len(data)
+        lo, hi = schema.get("minItems"), schema.get("maxItems")
+        if lo is not None and size < lo:
+            errors.append(f"{path}: array length {size} < minItems {lo}")
+        if hi is not None and size > hi:
+            errors.append(f"{path}: array length {size} > maxItems {hi}")
+
+        items_schema = schema.get("items")
+        if items_schema:
+            for i, item in enumerate(data):
+                _validate(item, items_schema, f"{path}[{i}]", errors)
+
+    # --- string ---
+    elif isinstance(data, str):
+        size = len(data)
+        lo, hi = schema.get("minLength"), schema.get("maxLength")
+        if lo is not None and size < lo:
+            errors.append(f"{path}: string length {size} < minLength {lo}")
+        if hi is not None and size > hi:
+            errors.append(f"{path}: string length {size} > maxLength {hi}")
+        pattern = schema.get("pattern")
+        if pattern is not None and not re.search(pattern, data):
+            errors.append(
+                f"{path}: value {data!r} does not match pattern {pattern!r}"
+            )
+
+    # --- number / integer ---
+    elif isinstance(data, (int, float)) and not isinstance(data, bool):
+        minimum = schema.get("minimum")
+        maximum = schema.get("maximum")
+        ex_min = schema.get("exclusiveMinimum")
+        ex_max = schema.get("exclusiveMaximum")
+        if minimum is not None and data < minimum:
+            errors.append(f"{path}: {data} < minimum {minimum}")
+        if maximum is not None and data > maximum:
+            errors.append(f"{path}: {data} > maximum {maximum}")
+        if ex_min is not None and data <= ex_min:
+            errors.append(f"{path}: {data} <= exclusiveMinimum {ex_min}")
+        if ex_max is not None and data >= ex_max:
+            errors.append(f"{path}: {data} >= exclusiveMaximum {ex_max}")
diff --git a/python/functions/core/validate_json_schema_test.py b/python/functions/core/validate_json_schema_test.py
new file mode 100644
index 00000000..aeefa7cc
--- /dev/null
+++ b/python/functions/core/validate_json_schema_test.py
@@ -0,0 +1,129 @@
+"""Tests para validate_json_schema."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from validate_json_schema import validate_json_schema
+
+
+def test_schema_simple_con_types_y_required():
+    """A conforming object yields (True, [])."""
+    person_schema = {
+        "type": "object",
+        "required": ["name", "age"],
+        "properties": {
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
+        },
+    }
+    ok, problems = validate_json_schema({"name": "Alice", "age": 30}, person_schema)
+    assert ok is True and problems == []
+
+
+def test_nested_object_validation():
+    """A wrong type inside a nested object is reported with its full path."""
+    address_schema = {
+        "type": "object",
+        "required": ["zip"],
+        "properties": {
+            "zip": {"type": "string"},
+            "street": {"type": "string"},
+        },
+    }
+    schema = {
+        "type": "object",
+        "properties": {"address": address_schema},
+    }
+    payload = {"address": {"zip": 12345, "street": "Main St"}}
+    ok, problems = validate_json_schema(payload, schema)
+
+    assert ok is False
+    assert any("$.address.zip" in msg for msg in problems)
+
+
+def test_array_con_items_schema():
+    """Items are checked one by one; a failure names the offending index."""
+    strings_1_to_3 = {
+        "type": "array",
+        "items": {"type": "string"},
+        "minItems": 1,
+        "maxItems": 3,
+    }
+    ok, _ = validate_json_schema(["a", "b"], strings_1_to_3)
+    assert ok is True
+
+    ok, problems = validate_json_schema(["a", 2, "c"], strings_1_to_3)
+    assert ok is False and any("[1]" in msg for msg in problems)
+
+
+def test_pattern_validation_en_strings():
+    """A string is accepted only when the pattern regex matches it."""
+    email_schema = {"type": "string", "pattern": r"^[^@]+@[^@]+\.[^@]+$"}
+    schema = {
+        "type": "object",
+        "properties": {"email": email_schema},
+    }
+    ok, _ = validate_json_schema({"email": "user@example.com"}, schema)
+    assert ok is True
+
+    ok, problems = validate_json_schema({"email": "not-an-email"}, schema)
+    assert ok is False
+    assert any("pattern" in msg for msg in problems)
+
+
+def test_numeric_ranges():
+    """A value within the bounds passes; one above maximum is reported."""
+    percent = {
+        "type": "integer",
+        "minimum": 0,
+        "maximum": 100,
+    }
+    ok, _ = validate_json_schema(50, percent)
+    assert ok is True
+
+    ok, problems = validate_json_schema(150, percent)
+    assert ok is False and any("maximum" in msg for msg in problems)
+
+
+def test_multiples_errores_acumulados_con_paths_correctos():
+    """Independent violations are all collected, each tagged with its path."""
+    schema = {
+        "type": "object",
+        "required": ["name", "age", "email"],
+        "properties": {
+            "name": {"type": "string"},
+            "age": {"type": "integer", "minimum": 0, "maximum": 120},
+            "email": {"type": "string", "pattern": r"^[^@]+@[^@]+$"},
+        },
+    }
+    data = {"name": 123, "age": -5, "email": "bad"}
+    valid, errors = validate_json_schema(data, schema)
+    assert valid is False
+    # name fails type, age fails minimum, email fails pattern: exactly one
+    # error per field, reported in the order the properties are declared.
+    assert [e.split(":")[0] for e in errors] == ["$.name", "$.age", "$.email"]
+
+
+def test_schema_vacio_acepta_todo():
+    """An empty schema accepts any value, dict or list alike."""
+    ok, problems = validate_json_schema({"any": "thing"}, {})
+    assert ok is True and problems == []
+
+    ok, problems = validate_json_schema([1, 2, 3], {})
+    assert ok is True
+    assert problems == []
+
+
+def test_data_none_contra_required_field():
+    """A required field present but set to None is rejected."""
+    name_required = {
+        "type": "object",
+        "required": ["name"],
+        "properties": {"name": {"type": "string"}},
+    }
+    ok, problems = validate_json_schema({"name": None}, name_required)
+
+    assert ok is False
+    assert any("name" in msg for msg in problems)
diff --git a/python/functions/core/write_node_ids.md b/python/functions/core/write_node_ids.md
new file mode 100644
index 00000000..9892748e
--- /dev/null
+++ b/python/functions/core/write_node_ids.md
@@ -0,0 +1,37 @@
+---
+name: write_node_ids
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def write_node_ids(data: Any, node_id: int = 0) -> int"
+description: "Asigna IDs secuenciales zero-padded de 4 digitos (0000, 0001...) a todos los nodos de un arbol, empezando en node_id (0 por defecto). Retorna siguiente contador."
+tags: [tree, id, hierarchy, mutation]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = [{"title": "A", "nodes": [{"title": "B", "nodes": []}]}]
+next_id = write_node_ids(tree)
+# tree[0]["node_id"] == "0000", tree[0]["nodes"][0]["node_id"] == "0001"
+# next_id == 2
+```
+
+## Notas
+
+Funcion pura en el sentido de no tener I/O, pero muta el arbol de entrada in-place. IDs son strings de 4 digitos.