# Source code for repo2xml.bundler

"""Core repository bundling logic."""

import sys
from enum import EnumMeta
from enum import StrEnum
from functools import lru_cache
from pathlib import Path
from typing import Annotated
from typing import Any
from typing import Final
from typing import cast
from typing import override

from beartype.vale import Is
from lazi.core import lazi

with lazi:
    import importlib.metadata
    import xml.etree.ElementTree as ET

    import pathspec
    from alive_progress import alive_bar
    from identify.identify import tags_from_path
    from magika import Magika
    from puremagic import PureError
    from puremagic import from_file as puremagic_from_file


def _is_positive_int(x: Any) -> bool:
    return isinstance(x, int) and x > 0


# beartype validator built from ``_is_positive_int``; used in this module as
# ``Annotated[int, Bytes]`` metadata on byte-count values.
Bytes = Is[_is_positive_int]


class ContainsEnumMeta(EnumMeta):
    @override
    def __contains__(cls, value):
        return value in cls._value2member_map_


class TextEncodings(StrEnum, metaclass=ContainsEnumMeta):
    """Fallback text encodings; ``_read_text_file`` tries them in declaration order."""

    UTF8 = "utf-8"
    UTF8_SIG = "utf-8-sig"
    CP1252 = "cp1252"
    LATIN1 = "latin-1"


class TextApplicationMimeTypes(StrEnum, metaclass=ContainsEnumMeta):
    """``application/*`` MIME types that ``_is_text_file`` accepts as text.

    ``_is_text_file`` lower-cases the detected MIME type and tests it for
    membership in this enum.
    """

    JSON = "application/json"
    XML = "application/xml"
    ECMASCRIPT = "application/ecmascript"
    JAVASCRIPT = "application/javascript"
    X_JAVASCRIPT = "application/x-javascript"
    RTF = "application/rtf"
    X_RTF = "application/x-rtf"
    X_TEX = "application/x-tex"
    X_TEXINFO = "application/x-texinfo"
    X_LATEX = "application/x-latex"
    X_TCL = "application/x-tcl"
    X_CSH = "application/x-csh"
    X_KSH = "application/x-ksh"
    X_LISP = "application/x-lisp"
    X_SH = "application/x-sh"
    X_SHELLSCRIPT = "application/x-shellscript"
    X_WAIS_SOURCE = "application/x-wais-source"
    X_YAML = "application/x-yaml"
    YAML = "application/yaml"
    TOML = "application/toml"
    SQL = "application/sql"


class DefaultExclude(StrEnum, metaclass=ContainsEnumMeta):
    """Path component names skipped by default in ``RepoBundler._candidate_files``.

    A file is skipped when any part of its repository-relative path equals one
    of these values, unless an include pattern matches the file.
    """

    GIT = ".git"
    VENV = "venv"
    DOTVENV = ".venv"
    TYPINGS = "typings"
    UV_LOCK = "uv.lock"
    PY_CACHE = "__pycache__"
    PYTEST_CACHE = ".pytest_cache"
    MYPY_CACHE = ".mypy_cache"
    RUFF_CACHE = ".ruff_cache"
    PYRIGHT = ".pyright"
    NODE_MODULES = "node_modules"
    EGGS = ".eggs"
    DIST = "dist"
    BUILD = "build"
    HTMLCOV = "htmlcov"


class BundleReadError(RuntimeError):
    """Raised when a repository file cannot be read during bundling.

    The helpers in this module raise it when a file cannot be read, identified
    or decoded; when an ``OSError`` triggered the failure it is chained as the
    cause.
    """


@lru_cache(maxsize=1)
def get_version() -> str:
    """Return the installed ``repo2xml`` distribution version.

    The result is cached after the first call.

    Returns:
        The version string reported by :mod:`importlib.metadata`, or
        ``"unknown"`` when the distribution is not installed.
    """
    try:
        return importlib.metadata.version("repo2xml")
    except importlib.metadata.PackageNotFoundError:
        return "unknown"
@lru_cache(maxsize=1) def _get_magika() -> Magika: """Return a cached Magika instance for file type detection.""" return Magika() def _load_gitignore_patterns(repo_path: Path) -> list[str]: gitignore = repo_path / ".gitignore" if not gitignore.exists(): return [] try: lines = gitignore.read_text(encoding="utf-8", errors="replace").splitlines() except OSError as exc: msg = f"failed to read {gitignore}: {exc}" raise BundleReadError(msg) from exc return [line.strip() for line in lines if line.strip() and not line.lstrip().startswith("#")] def _load_gitignore_spec(repo_path: Path) -> pathspec.PathSpec: """Load .gitignore patterns from *repo_path* and return a :class:`pathspec.PathSpec`.""" patterns = _load_gitignore_patterns(repo_path) return pathspec.PathSpec.from_lines("gitignore", patterns) def _is_text_file(path: Path) -> bool: """Return ``True`` if *path* is likely a text file.""" try: result = _get_magika().identify_path(path) except OSError as exc: msg = f"failed to read {path}: {exc}" raise BundleReadError(msg) from exc if result.status.value != "ok": msg = f"failed to read {path}: magika status {result.status.value}" raise BundleReadError(msg) if result.prediction.output.is_text: return True try: mime = puremagic_from_file(path, mime=True) except (PureError, OSError, ValueError): mime = "" mime = mime.lower() if mime.startswith("text/"): return True if mime in TextApplicationMimeTypes: return True if mime.endswith("+json") or mime.endswith("+xml") or mime.endswith("+yaml"): return True try: tags = tags_from_path(str(path)) except (OSError, ValueError): tags = set() if "text" in tags: return True if "binary" in tags: return False return False def _read_text_file(path: Path) -> str: """Read *path* using UTF-8 first, then common fallback encodings.""" try: data = path.read_bytes() except OSError as exc: msg = f"failed to read {path}: {exc}" raise BundleReadError(msg) from exc # Decode UTF-16 only when BOM is present to avoid false positives on 8-bit text. 
if data.startswith((b"\xff\xfe", b"\xfe\xff")): try: return data.decode("utf-16") except UnicodeDecodeError: pass for encoding in TextEncodings: try: return data.decode(encoding) except UnicodeDecodeError: continue msg = f"failed to decode {path} with supported encodings" raise BundleReadError(msg) def _create_markdown_converter() -> Any: # Import on demand to avoid import-time failures from optional converter extras. from markitdown import MarkItDown return MarkItDown() def _convert_file_to_markdown(path: Path, converter: Any) -> str | None: """Convert *path* to Markdown when supported; return ``None`` when unsupported.""" try: result = converter.convert_local(str(path)) except Exception: return None markdown = (result.markdown or result.text_content or "").strip() return markdown or None def _build_tree_str(files: list[Path], repo_path: Path) -> str: """Return a text tree diagram for *files* relative to *repo_path*.""" rel_paths = sorted(f.relative_to(repo_path) for f in files) # Build a nested dict: directories map to dicts, files map to None. tree: dict[str, object] = {} for rel in rel_paths: node: dict[str, object] = tree for part in rel.parts[:-1]: existing = node.get(part) if not isinstance(existing, dict): child: dict[str, object] = {} node[part] = child existing = child node = cast(dict[str, object], existing) node[rel.parts[-1]] = None lines: list[str] = [repo_path.name + "/"] def _render(subtree: dict[str, object], indent: str) -> None: entries = sorted(subtree.items()) for idx, (name, val) in enumerate(entries): is_last = idx == len(entries) - 1 branch = "└── " if is_last else "├── " suffix = "/" if isinstance(val, dict) else "" lines.append(indent + branch + name + suffix) if isinstance(val, dict): new_indent = indent + (" " if is_last else "│ ") _render(cast(dict[str, object], val), new_indent) _render(tree, "") return "\n".join(lines)
class RepoBundler:
    """Bundle a repository directory into a single XML representation for LLMs.

    >>> import tempfile, pathlib
    >>> with tempfile.TemporaryDirectory() as d:
    ...     p = pathlib.Path(d)
    ...     _ = (p / "hello.py").write_text("x = 1\\n")
    ...     xml = RepoBundler(p).bundle()
    ...     "hello.py" in xml
    True
    """

    #: Default maximum file size in bytes for included files.
    DEFAULT_MAX_FILE_SIZE: Final[Annotated[int, Bytes]] = 1 * 1024 * 1024

    def __init__(
        self,
        repo_path: Path,
        *,
        respect_gitignore: bool = True,
        max_file_size: Annotated[int, Bytes] = DEFAULT_MAX_FILE_SIZE,
        extra_ignore_patterns: list[str] | None = None,
        include_patterns: list[str] | None = None,
    ) -> None:
        """Initialize the repository bundler.

        Args:
            repo_path: Path to the repository root.
            respect_gitignore: Whether to respect patterns in ``.gitignore``.
            max_file_size: Maximum file size in bytes to include in output.
            extra_ignore_patterns: Additional gitignore-style patterns to exclude.
            include_patterns: Patterns that override default and extra excludes.
        """
        #: Absolute path to the repository root.
        self._repo_path: Path = repo_path.resolve()
        #: Whether ``.gitignore`` patterns are applied while collecting files.
        self._respect_gitignore: bool = respect_gitignore
        #: Maximum file size in bytes accepted into the bundle.
        self._max_file_size: int = max_file_size
        # The pattern lists are copied so later mutation of the caller's lists
        # cannot desynchronize them from the compiled specs below.
        #: Additional user-defined ignore patterns.
        self._extra_ignore_patterns: list[str] = list(extra_ignore_patterns or [])
        #: Include patterns that override default and extra excludes.
        self._include_patterns: list[str] = list(include_patterns or [])
        #: Compiled matcher for extra ignore patterns.
        self._extra_spec: pathspec.PathSpec = pathspec.PathSpec.from_lines("gitignore", self._extra_ignore_patterns)
        #: Compiled matcher for include override patterns.
        self._include_spec: pathspec.PathSpec = pathspec.PathSpec.from_lines("gitignore", self._include_patterns)

    def _candidate_files(self) -> list[Path]:
        """Return candidate files after ignore and size filtering.

        Files are visited in sorted path order. A file is dropped when it sits
        under a default-excluded name or matches an extra ignore pattern
        (unless an include pattern matches it), when ``.gitignore`` matches
        it, when it cannot be stat'ed, when it is empty, or when it exceeds
        the size limit.
        """
        gitignore_spec = (
            _load_gitignore_spec(self._repo_path)
            if self._respect_gitignore
            else pathspec.PathSpec.from_lines("gitignore", [])
        )

        results: list[Path] = []
        for path in sorted(self._repo_path.rglob("*")):
            if not path.is_file():
                continue

            rel = path.relative_to(self._repo_path)
            rel_str = str(rel)
            is_included = self._include_spec.match_file(rel_str)

            if not is_included and any(part in DefaultExclude for part in rel.parts):
                continue
            # Include patterns do not override .gitignore matches.
            if gitignore_spec.match_file(rel_str):
                continue
            if not is_included and self._extra_spec.match_file(rel_str):
                continue

            try:
                file_size = path.stat().st_size
            except OSError:
                # Unreadable metadata: silently skip the file.
                continue
            if file_size == 0 or file_size > self._max_file_size:
                continue

            results.append(path)
        return results

    def collect_files(self) -> list[Path]:
        """Return a sorted list of text files to include in the bundle.

        >>> import tempfile, pathlib
        >>> with tempfile.TemporaryDirectory() as d:
        ...     p = pathlib.Path(d)
        ...     _ = (p / "a.py").write_text("x = 1\\n")
        ...     _ = (p / ".gitignore").write_text("secret.txt\\n")
        ...     _ = (p / "secret.txt").write_text("s\\n")
        ...     files = RepoBundler(p).collect_files()
        ...     sorted(f.name for f in files)
        ['.gitignore', 'a.py']
        """
        return [path for path in self._candidate_files() if _is_text_file(path)]

    def build_file_tree(self, files: list[Path]) -> str:
        """Return a text tree diagram for *files*.

        >>> import tempfile, pathlib
        >>> with tempfile.TemporaryDirectory() as d:
        ...     p = pathlib.Path(d)
        ...     (p / "src").mkdir()
        ...     _ = (p / "src" / "main.py").write_text("print('x')\\n")
        ...     _ = (p / "README.md").write_text("# Readme\\n")
        ...     bundler = RepoBundler(p)
        ...     files = bundler.collect_files()
        ...     tree = bundler.build_file_tree(files)
        ...     "src/" in tree and "main.py" in tree
        True
        """
        return _build_tree_str(files, self._repo_path)

    def bundle(self, *, show_progress: bool = False) -> str:
        """Bundle the repository and return a well-formed XML string.

        Text files are embedded as decoded text; other candidate files are
        embedded as Markdown when the converter supports them and are skipped
        otherwise.

        Args:
            show_progress: Show a progress bar on ``stderr`` while files are
                processed.

        Returns:
            The XML document, including the XML declaration.

        Raises:
            BundleReadError: If a candidate file or ``.gitignore`` cannot be
                read or decoded.

        >>> import tempfile, pathlib
        >>> with tempfile.TemporaryDirectory() as d:
        ...     p = pathlib.Path(d)
        ...     _ = (p / "hi.py").write_text("print('hi')\\n")
        ...     xml = RepoBundler(p).bundle()
        ...     xml.startswith("<?xml")
        True
        """
        included_files: list[tuple[Path, str]] = []
        candidate_files = self._candidate_files()
        # Created on first use so that bundling an all-text repository neither
        # imports nor instantiates the Markdown converter.
        converter: Any = None

        def _process_file(file_path: Path) -> None:
            nonlocal converter
            if _is_text_file(file_path):
                content = _read_text_file(file_path)
            else:
                if converter is None:
                    converter = _create_markdown_converter()
                markdown = _convert_file_to_markdown(file_path, converter)
                if markdown is None:
                    return
                content = markdown
            included_files.append((file_path, content))

        if show_progress and candidate_files:
            with alive_bar(
                len(candidate_files),
                title="Bundling files",
                file=sys.stderr,
                enrich_print=False,
            ) as bar:
                progress = cast(Any, bar)
                for file_path in candidate_files:
                    _process_file(file_path)
                    progress()
        else:
            for file_path in candidate_files:
                _process_file(file_path)

        files = [path for path, _ in included_files]
        tree_str = self.build_file_tree(files)
        gitignore_flag = str(self._respect_gitignore).lower()

        root = ET.Element("repository")

        # --- settings used to produce this bundle ---
        settings = ET.SubElement(root, "repo2xml_settings")
        ET.SubElement(settings, "repo2xml_version").text = get_version()
        bundler_settings = ET.SubElement(settings, "bundler")
        ET.SubElement(bundler_settings, "respect_gitignore").text = gitignore_flag
        ET.SubElement(bundler_settings, "max_file_size_bytes").text = str(self._max_file_size)

        ignored = ET.SubElement(bundler_settings, "ignored_patterns")
        gitignore_patterns_elem = ET.SubElement(
            ignored,
            "gitignore_patterns",
            enabled=gitignore_flag,
        )
        if self._respect_gitignore:
            for pattern in _load_gitignore_patterns(self._repo_path):
                ET.SubElement(gitignore_patterns_elem, "pattern").text = pattern

        extra_patterns_elem = ET.SubElement(ignored, "extra_ignore_patterns")
        for pattern in self._extra_ignore_patterns:
            ET.SubElement(extra_patterns_elem, "pattern").text = pattern

        include_patterns_elem = ET.SubElement(bundler_settings, "include_patterns")
        for pattern in self._include_patterns:
            ET.SubElement(include_patterns_elem, "pattern").text = pattern

        # --- fixed guidance for the consuming model ---
        summary = ET.SubElement(root, "file_summary")
        ET.SubElement(summary, "purpose").text = (
            "This file is a merged representation of the entire codebase, "
            "combined into a single document by repo2xml for analysis by AI language models."
        )
        ET.SubElement(summary, "usage_guidelines").text = (
            "When working with this file, an AI model should:\n"
            "1. Treat the content as a read-only snapshot of the repository.\n"
            "2. Use the directory structure to understand the project layout.\n"
            "3. Reference file paths when discussing specific code."
        )

        # --- repository metadata and layout ---
        info = ET.SubElement(root, "repository_info")
        ET.SubElement(info, "name").text = self._repo_path.name
        ET.SubElement(info, "path").text = str(self._repo_path)
        ET.SubElement(info, "file_count").text = str(len(files))

        ET.SubElement(root, "directory_structure").text = tree_str

        # --- file contents ---
        # NOTE(review): ElementTree escapes markup characters but does not
        # remove control characters that XML 1.0 forbids; confirm whether
        # content needs sanitizing for strict parsers.
        files_elem = ET.SubElement(root, "files")
        for file_path, content in included_files:
            rel = str(file_path.relative_to(self._repo_path))
            file_elem = ET.SubElement(files_elem, "file", path=rel)
            ET.SubElement(file_elem, "content").text = content

        # NOTE(review): indent width kept exactly as found (one space) — confirm
        # the intended width.
        ET.indent(root, space=" ")
        return '<?xml version="1.0" encoding="UTF-8"?>\n' + ET.tostring(root, encoding="unicode")
def bundle_repo(
    repo_path: str | Path = ".",
    *,
    output_path: str | Path | None = None,
    respect_gitignore: bool = True,
    max_file_size: int = RepoBundler.DEFAULT_MAX_FILE_SIZE,
    extra_ignore_patterns: list[str] | None = None,
    include_patterns: list[str] | None = None,
) -> str:
    """Bundle a repository into a single XML string.

    Convenience wrapper around :class:`RepoBundler`.

    Args:
        repo_path: Path to the repository directory (default: current directory).
        output_path: If given, also write the XML to this file.
        respect_gitignore: Whether to respect ``.gitignore`` rules.
        max_file_size: Maximum file size in bytes to include.
        extra_ignore_patterns: Additional gitignore-style patterns to exclude.
        include_patterns: Patterns that override default and extra excludes.

    Returns:
        The XML representation of the repository as a string.

    Raises:
        BundleReadError: If a repository file cannot be read while bundling.

    >>> import tempfile, pathlib
    >>> with tempfile.TemporaryDirectory() as d:
    ...     p = pathlib.Path(d)
    ...     _ = (p / "hello.py").write_text("x = 1\\n")
    ...     xml = bundle_repo(p)
    ...     "hello.py" in xml
    True
    """
    bundler = RepoBundler(
        Path(repo_path),
        respect_gitignore=respect_gitignore,
        max_file_size=max_file_size,
        extra_ignore_patterns=extra_ignore_patterns,
        include_patterns=include_patterns,
    )
    xml_content = bundler.bundle()
    if output_path is not None:
        # Written as UTF-8 to match the encoding named in the XML declaration.
        Path(output_path).write_text(xml_content, encoding="utf-8")
    return xml_content