chore: initial sync
This commit is contained in:
@@ -0,0 +1,92 @@
|
||||
"""Extract plain text from PDF, Markdown, or TXT files."""
|
||||
|
||||
|
||||
# File extensions (lowercase, with leading dot) that extract_text_from_file
# lists in its "Unsupported file extension" error message.
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
|
||||
|
||||
|
||||
def _detect_encoding(data: bytes) -> str:
|
||||
"""Detect encoding of raw bytes using multiple fallback strategies."""
|
||||
# Strategy 1: UTF-8
|
||||
try:
|
||||
data.decode("utf-8")
|
||||
return "utf-8"
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
|
||||
# Strategy 2: charset_normalizer
|
||||
try:
|
||||
from charset_normalizer import from_bytes
|
||||
|
||||
result = from_bytes(data).best()
|
||||
if result is not None and result.encoding:
|
||||
return result.encoding
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Strategy 3: chardet
|
||||
try:
|
||||
import chardet
|
||||
|
||||
detected = chardet.detect(data)
|
||||
if detected and detected.get("encoding"):
|
||||
return detected["encoding"]
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Last resort: UTF-8 with replacement
|
||||
return "utf-8"
|
||||
|
||||
|
||||
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.

    For PDF files uses PyMuPDF (fitz) to extract text from each page,
    joining them with double newlines. For text-based files (.md, .markdown,
    .txt) reads the file with automatic encoding detection.

    The extension is matched case-insensitively (the path is lower-cased
    before the extension is split off).

    Args:
        file_path: Absolute or relative path to the file.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e

        # Open the document as a context manager so the underlying file
        # handle is released even if text extraction raises on some page.
        with fitz.open(file_path) as doc:
            pages = [page.get_text() for page in doc]
        return "\n\n".join(pages)

    elif ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()

        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # The detector may be wrong (UnicodeDecodeError) or may name a
            # codec Python does not know (LookupError); degrade to lossy
            # UTF-8 rather than failing the whole extraction.
            return raw.decode("utf-8", errors="replace")

    else:
        raise ValueError(
            f"Unsupported file extension: '{ext}'. "
            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )
|
||||
Reference in New Issue
Block a user