Files
ontology_graph/lib/extract_text_from_file.py
T
fn-registry agent 40bea81603 chore: initial sync
2026-04-28 22:13:08 +02:00

93 lines
2.6 KiB
Python

"""Extract plain text from PDF, Markdown, or TXT files."""
# File extensions (lowercase, including the leading dot) that
# extract_text_from_file lists as supported in its "unsupported extension"
# error message.
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
def _detect_encoding(data: bytes) -> str:
"""Detect encoding of raw bytes using multiple fallback strategies."""
# Strategy 1: UTF-8
try:
data.decode("utf-8")
return "utf-8"
except UnicodeDecodeError:
pass
# Strategy 2: charset_normalizer
try:
from charset_normalizer import from_bytes
result = from_bytes(data).best()
if result is not None and result.encoding:
return result.encoding
except ImportError:
pass
# Strategy 3: chardet
try:
import chardet
detected = chardet.detect(data)
if detected and detected.get("encoding"):
return detected["encoding"]
except ImportError:
pass
# Last resort: UTF-8 with replacement
return "utf-8"
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.

    For PDF files uses PyMuPDF (fitz) to extract text from each page,
    joining the pages with double newlines; the document is closed again
    before returning. For text-based files (.md, .markdown, .txt) the raw
    bytes are read and decoded with the encoding reported by
    ``_detect_encoding``; if that decode fails, the bytes are decoded as
    UTF-8 with undecodable sequences replaced.

    Args:
        file_path: Absolute or relative path to the file. ``os.PathLike``
            objects are accepted as well as ``str``. The extension is
            matched case-insensitively.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    # Normalise path-like objects up front so the string operations below
    # (``.lower()``, f-string formatting) behave the same for every caller.
    file_path = os.fspath(file_path)

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Lower-case only for extension matching; the original path is what
    # gets opened.
    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e
        doc = fitz.open(file_path)
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Release the underlying file handle even if a page fails to
            # extract.
            doc.close()
        return "\n\n".join(pages)

    if ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()
        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # The detected codec was wrong or unknown to Python: degrade to
            # a lossy UTF-8 decode rather than failing the extraction.
            return raw.decode("utf-8", errors="replace")

    raise ValueError(
        f"Unsupported file extension: '{ext}'. "
        f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    )