# ontology_graph/lib/extract_text_from_file.py

"""Extract plain text from PDF, Markdown, or TXT files."""


# Lowercase file extensions (with leading dot) that extract_text_from_file
# reports as supported in its "Unsupported file extension" error message.
# NOTE: the extension dispatch in extract_text_from_file compares against its
# own literals, so keep this set in sync with those branches.
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}


def _detect_encoding(data: bytes) -> str:
    """Detect the text encoding of raw bytes using layered fallbacks.

    Strategies are tried in order and the first one that yields an answer
    wins:

    1. Byte-order mark (UTF-8, UTF-32, UTF-16). The returned codec name is
       one that consumes the BOM while decoding, so the decoded text does
       not start with a stray U+FEFF character.
    2. Strict UTF-8 decoding.
    3. ``charset_normalizer``, if installed.
    4. ``chardet``, if installed.
    5. ``"utf-8"`` as a last resort.

    Args:
        data: Raw file contents.

    Returns:
        str: A codec name to pass to ``bytes.decode``. The name coming from
        the last-resort fallback (or from a third-party detector) is not
        guaranteed to decode ``data`` without errors; callers should be
        prepared to fall back to ``errors="replace"``.
    """
    import codecs

    # Strategy 1: byte-order mark. UTF-32 must be checked before UTF-16
    # because the UTF-32-LE BOM begins with the UTF-16-LE BOM bytes.
    bom_candidates = (
        (codecs.BOM_UTF8, "utf-8-sig"),
        (codecs.BOM_UTF32_LE, "utf-32"),
        (codecs.BOM_UTF32_BE, "utf-32"),
        (codecs.BOM_UTF16_LE, "utf-16"),
        (codecs.BOM_UTF16_BE, "utf-16"),
    )
    for bom, codec_name in bom_candidates:
        if not data.startswith(bom):
            continue
        try:
            # Validate: bytes that merely look like a BOM (e.g. binary junk)
            # must not short-circuit the remaining strategies.
            data.decode(codec_name)
        except UnicodeDecodeError:
            continue
        return codec_name

    # Strategy 2: UTF-8
    try:
        data.decode("utf-8")
        return "utf-8"
    except UnicodeDecodeError:
        pass

    # Strategy 3: charset_normalizer (optional dependency)
    try:
        from charset_normalizer import from_bytes

        result = from_bytes(data).best()
        if result is not None and result.encoding:
            return result.encoding
    except ImportError:
        pass

    # Strategy 4: chardet (optional dependency)
    try:
        import chardet

        detected = chardet.detect(data)
        if detected and detected.get("encoding"):
            return detected["encoding"]
    except ImportError:
        pass

    # Last resort: report UTF-8; the caller is expected to decode with
    # errors="replace" if strict decoding fails.
    return "utf-8"


def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.

    For PDF files uses PyMuPDF (fitz) to extract text from each page,
    joining them with double newlines; the document is always closed before
    returning. For text-based files (.md, .markdown, .txt) reads the raw
    bytes and decodes them with automatic encoding detection, falling back
    to UTF-8 with replacement characters if the detected encoding fails.

    The extension check is case-insensitive.

    Args:
        file_path: Absolute or relative path to the file. ``os.PathLike``
            objects (e.g. ``pathlib.Path``) are accepted as well.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the file does not exist. This is checked
            before the extension, so a missing file with an unsupported
            extension raises FileNotFoundError, not ValueError.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    # Normalise path-like objects so the string operations below work.
    file_path = os.fspath(file_path)

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e

        doc = fitz.open(file_path)
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Release the underlying file handle even if a page fails.
            doc.close()
        return "\n\n".join(pages)

    elif ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()

        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Detection can be wrong or name a codec Python does not know;
            # degrade to lossy UTF-8 rather than failing the extraction.
            return raw.decode("utf-8", errors="replace")

    else:
        raise ValueError(
            f"Unsupported file extension: '{ext}'. "
            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )