Files
ontology_graph/lib/split_text_into_chunks.py
T
fn-registry agent 40bea81603 chore: initial sync
2026-04-28 22:13:08 +02:00

67 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Split text into overlapping chunks with sentence-boundary awareness."""
def split_text_into_chunks(
    text: str, chunk_size: int = 500, overlap: int = 50
) -> list[str]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Each chunk spans at most ``chunk_size`` characters of ``text``. When a
    chunk does not reach the end of the text, the cut is moved back to just
    after the last sentence separator found in the window, provided that cut
    lies beyond the first 30% of the window; otherwise the cut is made at the
    hard ``chunk_size`` limit. Chunks are stripped of surrounding whitespace
    and empty chunks are dropped.

    Args:
        text: Text to split.
        chunk_size: Maximum size of each chunk, in characters. Must be > 0.
        overlap: Number of characters shared between consecutive chunks.
            Negative values are treated as 0. If the overlap is so large that
            the next chunk would not start after the current one, the next
            chunk starts where the current one ended (no overlap).

    Returns:
        The list of chunks; empty if ``text`` is empty or whitespace only.

    Raises:
        ValueError: If ``text`` is non-empty and ``chunk_size`` is not
            positive.
    """
    if not text:
        return []
    if chunk_size <= 0:
        # A non-positive window can never make progress through the text.
        raise ValueError("chunk_size must be a positive integer")
    if len(text) <= chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    # Sentence separators. Order is irrelevant: the latest cut inside the
    # window wins regardless of which separator produced it.
    # NOTE(review): the first three entries reached this review as empty
    # strings (the repository viewer flags the file for ambiguous Unicode
    # characters). An empty separator matches at the very end of every window
    # and disables boundary detection entirely, so they are written here as
    # escaped full-width sentence terminators - confirm against the original.
    separators = (
        "\u3002",
        "\uff01",
        "\uff1f",
        ".\n",
        "!\n",
        "?\n",
        "\n\n",
        ". ",
        "! ",
        "? ",
    )

    overlap = max(overlap, 0)
    # A boundary cut is only accepted past the first 30% of the window, so a
    # separator near the start cannot produce a tiny chunk.
    min_offset = int(chunk_size * 0.30)

    chunks: list[str] = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = min(start + chunk_size, text_len)
        if end < text_len:
            # Find the latest position just after a separator that lies
            # entirely inside text[start:end].
            best_end = -1
            for sep in separators:
                pos = text.rfind(sep, start, end)
                if pos != -1:
                    best_end = max(best_end, pos + len(sep))
            if best_end > start + min_offset:
                end = best_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_len:
            # The text is exhausted; stepping back by the overlap would only
            # emit a chunk that repeats the tail of the one just produced.
            break

        next_start = end - overlap
        # Always move forward: an overlap at least as long as the chunk just
        # cut would otherwise stall or rewind the window forever.
        start = next_start if next_start > start else end
    return chunks