"""Extract plain text from PDF, Markdown, or TXT files.""" SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"} def _detect_encoding(data: bytes) -> str: """Detect encoding of raw bytes using multiple fallback strategies.""" # Strategy 1: UTF-8 try: data.decode("utf-8") return "utf-8" except UnicodeDecodeError: pass # Strategy 2: charset_normalizer try: from charset_normalizer import from_bytes result = from_bytes(data).best() if result is not None and result.encoding: return result.encoding except ImportError: pass # Strategy 3: chardet try: import chardet detected = chardet.detect(data) if detected and detected.get("encoding"): return detected["encoding"] except ImportError: pass # Last resort: UTF-8 with replacement return "utf-8" def extract_text_from_file(file_path: str) -> str: """Extract plain text from a file. Supports PDF, Markdown and TXT. For PDF files uses PyMuPDF (fitz) to extract text from each page, joining them with double newlines. For text-based files (.md, .markdown, .txt) reads the file with automatic encoding detection. Args: file_path: Absolute or relative path to the file. Returns: str: Extracted plain text content. Raises: FileNotFoundError: If the file does not exist. ValueError: If the file extension is not supported. ImportError: If PyMuPDF is not installed and a PDF is provided. """ import os if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") _, ext = os.path.splitext(file_path.lower()) if ext == ".pdf": try: import fitz # PyMuPDF except ImportError as e: raise ImportError( "PyMuPDF is required for PDF extraction. " "Install it with: pip install PyMuPDF" ) from e doc = fitz.open(file_path) pages = [page.get_text() for page in doc] return "\n\n".join(pages) elif ext in {".md", ".markdown", ".txt"}: with open(file_path, "rb") as f: raw = f.read() encoding = _detect_encoding(raw) try: return raw.decode(encoding) except (UnicodeDecodeError, LookupError): return raw.decode("utf-8", errors="replace") else: raise ValueError( f"Unsupported file extension: '{ext}'. " f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}" )