chore: initial sync

2026-04-28 22:13:08 +02:00
commit 40bea81603
30 changed files with 6675 additions and 0 deletions
@@ -0,0 +1,92 @@
+"""Extract plain text from PDF, Markdown, or TXT files."""
+
+
+# File extensions (lower-case, including the leading dot) that this module
+# knows how to turn into plain text.
+SUPPORTED_EXTENSIONS = {
+    ".pdf",
+    ".md",
+    ".markdown",
+    ".txt",
+}
+
+
+def _detect_encoding(data: bytes) -> str:
+    """Guess the text encoding of raw bytes using several fallback strategies.
+
+    Strategies are tried in order and the first one that yields an answer wins:
+
+    1. A Unicode byte-order mark (BOM) at the start of the data.
+    2. Strict UTF-8 decoding of the whole buffer.
+    3. ``charset_normalizer``, if it is installed.
+    4. ``chardet``, if it is installed.
+
+    Args:
+        data: Raw bytes read from a text file.
+
+    Returns:
+        str: A codec name suitable for ``bytes.decode``. When no strategy
+        produces an answer, ``"utf-8"`` is returned; the result is a guess,
+        so decoding with it may still raise ``UnicodeDecodeError``.
+    """
+    import codecs
+
+    # A BOM is an explicit signal, so honour it before anything else.
+    # The BOM-aware codecs ("utf-8-sig", "utf-16", "utf-32") strip the mark
+    # while decoding, which keeps a stray U+FEFF out of the resulting text.
+    # UTF-32 must be tested before UTF-16: the UTF-32 LE mark begins with
+    # the same two bytes as the UTF-16 LE mark.
+    bom_encodings = (
+        (codecs.BOM_UTF32_LE, "utf-32"),
+        (codecs.BOM_UTF32_BE, "utf-32"),
+        (codecs.BOM_UTF8, "utf-8-sig"),
+        (codecs.BOM_UTF16_LE, "utf-16"),
+        (codecs.BOM_UTF16_BE, "utf-16"),
+    )
+    for bom, encoding in bom_encodings:
+        if data.startswith(bom):
+            return encoding
+
+    # Strategy 1: UTF-8 (also covers pure ASCII and empty input)
+    try:
+        data.decode("utf-8")
+        return "utf-8"
+    except UnicodeDecodeError:
+        pass
+
+    # Strategy 2: charset_normalizer (optional dependency)
+    try:
+        from charset_normalizer import from_bytes
+
+        result = from_bytes(data).best()
+        if result is not None and result.encoding:
+            return result.encoding
+    except ImportError:
+        pass
+
+    # Strategy 3: chardet (optional dependency)
+    try:
+        import chardet
+
+        detected = chardet.detect(data)
+        if detected and detected.get("encoding"):
+            return detected["encoding"]
+    except ImportError:
+        pass
+
+    # Last resort: report UTF-8. The data is known not to be valid UTF-8 at
+    # this point, so the caller has to decode with an error handler.
+    return "utf-8"
+
+
+def extract_text_from_file(file_path: str) -> str:
+    """Extract plain text from a file. Supports PDF, Markdown and TXT.
+
+    For PDF files uses PyMuPDF (fitz) to extract text from each page,
+    joining them with double newlines. For text-based files (.md, .markdown,
+    .txt) reads the file with automatic encoding detection; if the detected
+    encoding cannot decode the data, UTF-8 with replacement characters is
+    used instead. The extension is matched case-insensitively.
+
+    Args:
+        file_path: Absolute or relative path to the file.
+
+    Returns:
+        str: Extracted plain text content.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        ValueError: If the file extension is not supported.
+        ImportError: If PyMuPDF is not installed and a PDF is provided.
+    """
+    import os
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    # Lower-case the path before splitting so ".PDF" or ".Md" also match.
+    _, ext = os.path.splitext(file_path.lower())
+
+    if ext == ".pdf":
+        try:
+            import fitz  # PyMuPDF
+        except ImportError as e:
+            raise ImportError(
+                "PyMuPDF is required for PDF extraction. "
+                "Install it with: pip install PyMuPDF"
+            ) from e
+
+        doc = fitz.open(file_path)
+        try:
+            pages = [page.get_text() for page in doc]
+        finally:
+            # Always release the underlying file handle, even when a page
+            # fails to extract.
+            doc.close()
+        return "\n\n".join(pages)
+
+    elif ext in {".md", ".markdown", ".txt"}:
+        with open(file_path, "rb") as f:
+            raw = f.read()
+
+        encoding = _detect_encoding(raw)
+        try:
+            return raw.decode(encoding)
+        except (UnicodeDecodeError, LookupError):
+            # The detected encoding was wrong or names a codec Python does
+            # not know; degrade to lossy UTF-8 instead of failing.
+            return raw.decode("utf-8", errors="replace")
+
+    else:
+        raise ValueError(
+            f"Unsupported file extension: '{ext}'. "
+            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+        )