93 lines
2.6 KiB
Python
93 lines
2.6 KiB
Python
"""Extract plain text from PDF, Markdown, or TXT files."""
|
|
|
|
|
|
# Extensions (lower-case, with leading dot) listed in the "unsupported file
# extension" error message. The dispatch in extract_text_from_file compares
# against its own literals, so keep the two in sync when adding a format.
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
|
|
|
|
|
|
def _detect_encoding(data: bytes) -> str:
    """Guess the text encoding of raw bytes using several fallback strategies.

    Strategies are tried in order and the first one that yields an answer wins:

    1. A leading byte-order mark (UTF-8, UTF-32 or UTF-16).
    2. A strict UTF-8 trial decode.
    3. ``charset_normalizer``, when it is installed.
    4. ``chardet``, when it is installed.

    Args:
        data: Raw bytes to inspect.

    Returns:
        A codec name intended for ``bytes.decode``. If no strategy produces an
        answer, ``"utf-8"`` is returned even though the strict UTF-8 trial
        failed, so callers should be ready to fall back to
        ``errors="replace"``. Names reported by the optional third-party
        detectors are passed through unchanged and are not validated here.
    """
    import codecs

    # Strategy 1: byte-order mark. Without this check a UTF-8 file with a BOM
    # is reported as plain "utf-8" and the decoded text keeps a stray U+FEFF.
    # The "utf-8-sig", "utf-16" and "utf-32" codecs all consume the BOM.
    # UTF-32 must be tested before UTF-16 because the UTF-32 LE BOM begins
    # with the same two bytes as the UTF-16 LE BOM.
    bom_codecs = (
        (codecs.BOM_UTF8, "utf-8-sig"),
        (codecs.BOM_UTF32_LE, "utf-32"),
        (codecs.BOM_UTF32_BE, "utf-32"),
        (codecs.BOM_UTF16_LE, "utf-16"),
        (codecs.BOM_UTF16_BE, "utf-16"),
    )
    for bom, codec_name in bom_codecs:
        if data.startswith(bom):
            return codec_name

    # Strategy 2: strict UTF-8 (also covers pure ASCII and empty input).
    try:
        data.decode("utf-8")
    except UnicodeDecodeError:
        pass
    else:
        return "utf-8"

    # Strategy 3: charset_normalizer (optional dependency).
    try:
        from charset_normalizer import from_bytes
    except ImportError:
        pass
    else:
        result = from_bytes(data).best()
        if result is not None and result.encoding:
            return result.encoding

    # Strategy 4: chardet (optional dependency).
    try:
        import chardet
    except ImportError:
        pass
    else:
        detected = chardet.detect(data)
        if detected and detected.get("encoding"):
            return detected["encoding"]

    # Last resort: report UTF-8; the caller decodes with replacement.
    return "utf-8"
|
|
|
|
|
|
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.

    For PDF files uses PyMuPDF (fitz) to extract text from each page,
    joining the pages with double newlines. For text-based files (.md,
    .markdown, .txt) the raw bytes are read and decoded with the encoding
    reported by ``_detect_encoding``; if that decode fails, or the reported
    codec name is unknown to Python, the bytes are decoded as UTF-8 with
    undecodable sequences replaced.

    Args:
        file_path: Absolute or relative path to the file. The extension is
            matched case-insensitively.

    Returns:
        Extracted plain text content.

    Raises:
        FileNotFoundError: If the path does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Lower-case the path so that ".PDF", ".Md", ... are recognised too.
    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e

        doc = fitz.open(file_path)
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Always release the document, even if a page fails to extract;
            # otherwise the underlying file handle stays open.
            doc.close()
        return "\n\n".join(pages)

    if ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()

        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Detection was wrong (UnicodeDecodeError) or named a codec that
            # Python does not provide (LookupError): degrade gracefully.
            return raw.decode("utf-8", errors="replace")

    raise ValueError(
        f"Unsupported file extension: '{ext}'. "
        f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    )
|