"""
hunk_aware_code_parser.py
=========================

A reference implementation of a custom **code chunking** Parser SKILL built
on top of `ragbase-parser-sdk`.

The default `code-aware` parser splits source files at function / class
boundaries (ATOMIC tree-sitter nodes). This example shows how to override
that with a *commit-hunk-aware* strategy: every chunk corresponds to a
contiguous range of lines that were last modified by the same commit.

Why you might want this
-----------------------
- PR / MR review: each chunk maps 1:1 to a hunk in the diff
- Blame / ownership analysis: chunks carry author + commit metadata
- Hot-path discovery: tag chunks by file's churn rate

How to run locally
------------------
1.  Install the SDK (request from info@nox-lumen.com):

        pip install ragbase-parser-sdk

2.  Drop this file into a fresh project:

        ragbase-cli init parser hunk-aware-code-parser
        cp hunk_aware_code_parser.py hunk-aware-code-parser/src/hunk_aware_code_parser/main.py

3.  Edit `manifest.json` so the platform routes the right files to it:

        {
          "name": "hunk-aware-code-parser",
          "version": "1.0.0",
          "kind": "parser",
          "capabilities": {
            "extensions": [".py", ".java", ".go", ".rs", ".kt"],
            "priority": 50
          },
          "entry": "python -m hunk_aware_code_parser"
        }

4.  Test:

        ragbase-cli parse-test --input examples/sample.py --output ./out
        cat ./out/chunks.jsonl

5.  Push to platform:

        ragbase-cli skill validate
        ragbase-cli skill build
        ragbase-cli skill push

How it interacts with built-in `code-aware`
-------------------------------------------
- A given KB activates ONE code parser at a time (set in KB config).
- This SKILL is interchangeable with the built-in `code-aware` — they
  produce chunks with the SAME contract (`outline_path`, `function_decls`,
  `class_decls`, `imports`, `references`), so all downstream code-retrieval
  tiers (zero / light / heavy index) keep working.
- The only thing that changes is HOW chunks are AGGREGATED.

License
-------
This sample is provided as-is for reference. Adapt freely.
"""

from __future__ import annotations

import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List

# ragbase-parser-sdk public surface (request package from info@nox-lumen.com)
from ragbase.parser_sdk import (
    Chunk,
    ParseContext,
    ParseResult,
    Parser,
    make_cli,
)
from ragbase.parser_sdk.decorators import incremental_update


# ---------------------------------------------------------------------------
# Hunk model
# ---------------------------------------------------------------------------


@dataclass
class Hunk:
    """One contiguous range of lines last modified by the same commit.

    Line numbers are 1-indexed and inclusive at both ends. As populated by
    `split_into_hunks`, the byte fields are offsets into the UTF-8 encoded
    source: `start_byte` is the first byte of the first line and `end_byte`
    is the offset just past the last line (exclusive).
    """

    start_line: int          # 1-indexed, inclusive
    end_line: int            # 1-indexed, inclusive
    start_byte: int          # offset of the hunk's first byte
    end_byte: int            # offset one past the hunk's last byte
    commit: str              # full sha (truncate when storing); "untracked" when blame is unavailable
    author: str              # blame author name; "unknown" when blame gave none
    author_email: str        # blame author e-mail without angle brackets; may be ""


def _git_blame_porcelain(repo_root: Path, file_path: Path) -> Iterable[dict]:
    """Yield one porcelain blame entry per line of *file_path*.

    Each entry is a dict with ``commit`` and, when git reported them,
    ``author``, ``author_mail`` and ``lineno`` (the 1-indexed line number
    in the blamed file).

    Falls back to a single synthetic entry
    ``{"commit": "untracked", "author": "unknown", "author_mail": "",
    "lineno": 1}`` when blame is unavailable: the file is not under
    *repo_root*, the directory is not a git repository, the file is not
    tracked (e.g. the parser runs on an unpacked archive), or git cannot
    be executed.
    """
    fallback = {"commit": "untracked", "author": "unknown",
                "author_mail": "", "lineno": 1}

    try:
        rel = file_path.relative_to(repo_root)
    except ValueError:
        # File lives outside the configured repo root — degrade gracefully
        # instead of propagating the path error.
        yield fallback
        return

    try:
        raw = subprocess.check_output(
            # "--" stops a path that begins with "-" from being read as an option.
            ["git", "-C", str(repo_root), "blame", "--porcelain", "--", str(rel)],
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # Not a git repo, untracked file, or git missing / not executable.
        yield fallback
        return

    yield from _parse_blame_porcelain(raw.decode("utf-8", errors="replace"))


def _parse_blame_porcelain(out: str) -> Iterable[dict]:
    """Turn ``git blame --porcelain`` text into one dict per blamed line."""
    # Porcelain prints a commit's author headers only the FIRST time that
    # commit shows up; remember them so later lines of the same commit
    # carry the same author information.
    known: dict = {}
    cur: dict = {}

    # Split on "\n" only: str.splitlines() would also break on form feeds
    # and other separators that can occur inside a blamed source line.
    for line in out.split("\n"):
        if not line or line.startswith("\t"):
            continue                                     # blank / source line

        head, _, rest = line.partition(" ")
        is_sha = (len(head) in (40, 64)                  # SHA-1 / SHA-256
                  and all(ch in "0123456789abcdef" for ch in head))

        if is_sha and rest:
            # Header: <sha> <orig-line> <final-line> [<lines-in-group>]
            if cur:
                yield cur
            cur = {"commit": head}
            cur.update(known.get(head, {}))
            fields = rest.split(" ")
            if len(fields) >= 2 and fields[1].isdigit():
                cur["lineno"] = int(fields[1])
        elif cur and head == "author":
            cur["author"] = rest
            known.setdefault(cur["commit"], {})["author"] = rest
        elif cur and head == "author-mail":
            mail = rest.strip("<>")
            cur["author_mail"] = mail
            known.setdefault(cur["commit"], {})["author_mail"] = mail
        # other porcelain headers ignored for brevity

    if cur:
        yield cur


def split_into_hunks(source: str, repo_root: Path, file_path: Path) -> List[Hunk]:
    """Group consecutive lines with the same commit into Hunks.

    Args:
        source: Full text of the file.
        repo_root: Directory passed to ``git -C`` when blaming.
        file_path: Path of the file being blamed.

    Returns:
        Hunks in file order. Line numbers are 1-indexed inclusive; byte
        offsets index the UTF-8 encoding of *source*, with ``end_byte``
        exclusive. When there are no blame entries, or their count does
        not match the number of lines, a single Hunk covering the whole
        file is returned (``end_line`` is 0 for empty *source*).

    A real implementation would use a streaming porcelain parser that knows
    line numbers; this is intentionally simplified to keep the example
    readable. Replace with `git_blame_repo`, `pygit2`, or your own walker.
    """
    # Blame entries are matched to lines by position, so count lines the
    # way git does: on "\n" only. str.splitlines() also breaks on form
    # feeds, lone "\r", U+2028, ... which would push such files into the
    # whole-file fallback below.
    pieces = source.split("\n")
    lines = [piece + "\n" for piece in pieces[:-1]]
    if pieces[-1]:
        lines.append(pieces[-1])                 # last line without a newline

    blame = list(_git_blame_porcelain(repo_root, file_path))

    if not blame or len(blame) != len(lines):
        # Fallback: one Hunk for the whole file.
        first = blame[0] if blame else {}
        return [Hunk(
            start_line=1, end_line=len(lines),
            start_byte=0, end_byte=len(source.encode()),
            commit=first.get("commit", "untracked"),
            author=first.get("author", "unknown"),
            author_email=first.get("author_mail", ""),
        )]

    # Porcelain blame reports author details only on a commit's first
    # occurrence; index them so a later hunk of the same commit does not
    # degrade to "unknown".
    author_info: dict = {}
    for entry in blame:
        if "author" in entry:
            author_info.setdefault(entry["commit"], entry)

    # line_starts[k] is the byte offset of line k; the extra final element
    # is the total byte length, i.e. the exclusive end of the last line.
    line_starts: List[int] = []
    offset = 0
    for text in lines:
        line_starts.append(offset)
        offset += len(text.encode())
    line_starts.append(offset)

    hunks: List[Hunk] = []
    run_start = 0
    for idx in range(1, len(blame) + 1):
        same_run = (idx < len(blame)
                    and blame[idx]["commit"] == blame[run_start]["commit"])
        if same_run:
            continue
        commit = blame[run_start]["commit"]
        info = author_info.get(commit, blame[run_start])
        hunks.append(Hunk(
            start_line=run_start + 1,
            end_line=idx,
            start_byte=line_starts[run_start],
            end_byte=line_starts[idx],
            commit=commit,
            author=info.get("author", "unknown"),
            author_email=info.get("author_mail", ""),
        ))
        run_start = idx
    return hunks


# ---------------------------------------------------------------------------
# Symbol enrichment (so chunks satisfy the code-aware contract)
# ---------------------------------------------------------------------------


def extract_symbols_in_range(source_bytes: bytes, language: str,
                             start_byte: int, end_byte: int) -> List[dict]:
    """Best-effort extractor for symbols overlapping a byte range.

    Args:
        source_bytes: Encoded source of the whole file.
        language: Language name handed to ``tree_sitter_languages.get_parser``.
        start_byte: Start of the range (inclusive).
        end_byte: End of the range (exclusive).

    Returns:
        Function / class symbols whose syntax node overlaps
        ``[start_byte, end_byte)``, in pre-order. Each is a dict with
        ``kind``, ``name``, ``scope_chain`` (names of enclosing classes),
        ``start_line`` and ``end_line`` (1-indexed). Returns ``[]`` when
        ``tree_sitter_languages`` is not installed or parsing fails.

    A production implementation should reuse `ragbase.parser_sdk.utils`
    helpers — they wrap tree-sitter and apply the same rules as built-in
    `code-aware`. This stub keeps the example self-contained.
    """
    try:
        from tree_sitter_languages import get_parser
    except ImportError:
        return []

    try:
        tree = get_parser(language).parse(source_bytes)
    except Exception:
        # Deliberate best-effort: unknown language or parser failure just
        # means "no symbols" for this chunk.
        return []

    kind_by_node_type = {
        "function_definition": "function",
        "function_declaration": "function",
        "method_declaration": "function",
        "class_definition": "class",
        "class_declaration": "class",
    }

    def node_name(node) -> str:
        # Prefer the grammar's `name` field (covers names that are not
        # plain `identifier` nodes); fall back to the first identifier child.
        named = node.child_by_field_name("name")
        if named is None:
            named = next(
                (c for c in node.children if c.type == "identifier"), None)
        if named is None:
            return "<anonymous>"
        return source_bytes[named.start_byte:named.end_byte].decode(
            "utf-8", errors="replace")

    symbols: List[dict] = []
    # Explicit stack instead of recursion so deeply nested syntax trees
    # cannot hit Python's recursion limit. Children are pushed reversed to
    # keep the pre-order the recursive walk produced.
    stack = [(tree.root_node, [])]
    while stack:
        node, scope_chain = stack.pop()
        # Both ranges are half-open, so a node that merely touches the
        # boundary (e.g. a `def` starting exactly at end_byte, which is the
        # first line of the NEXT hunk) does not overlap.
        if node.start_byte >= end_byte or node.end_byte <= start_byte:
            continue
        kind = kind_by_node_type.get(node.type)
        if kind is not None:
            name = node_name(node)
            symbols.append({
                "kind": kind,
                "name": name,
                "scope_chain": scope_chain,
                "start_line": node.start_point[0] + 1,
                "end_line": node.end_point[0] + 1,
            })
            if kind == "class":
                # Only classes extend the scope seen by nested symbols.
                scope_chain = scope_chain + [name]
        stack.extend((child, scope_chain) for child in reversed(node.children))

    return symbols


# ---------------------------------------------------------------------------
# The Parser
# ---------------------------------------------------------------------------


class HunkAwareCodeParser(Parser):
    """Chunk source files by git commit hunks instead of by ATOMIC nodes.

    Each chunk carries the keys of the code-aware chunk contract described
    in the module docstring — `outline_path`, `function_decls`,
    `class_decls`, `imports`, `references` — plus `tags` and hunk-specific
    blame metadata. `imports` and `references` are emitted as empty
    placeholders in this example.
    """

    # File extension (lower-case, with dot) -> tree-sitter language name.
    LANGUAGE_BY_EXT = {
        ".py": "python",
        ".java": "java",
        ".go": "go",
        ".rs": "rust",
        ".kt": "kotlin",
        ".ts": "typescript",
        ".js": "javascript",
    }

    # NOTE(review): `incremental_update` comes from the SDK; its effect on
    # this method is not visible in this file.
    @incremental_update
    def parse(self, ctx: ParseContext) -> ParseResult:
        """Split the file at ``ctx.path`` into one chunk per blame hunk.

        Reads the text through ``ctx.read_text()`` and takes the git root
        from ``ctx.metadata["repo_root"]`` when present.

        Returns:
            A ParseResult whose ``chunks`` follow file order and whose
            ``stats`` report the language, hunk count and line count.
        """
        path = Path(ctx.path)
        language = self.LANGUAGE_BY_EXT.get(path.suffix.lower(), "")
        source = ctx.read_text()
        source_bytes = source.encode()

        # Determine repo root — fall back to file's parent if not in a repo
        # (also when the key is present but empty / None).
        repo_root = Path(ctx.metadata.get("repo_root") or path.parent)

        hunks = split_into_hunks(source, repo_root=repo_root, file_path=path)

        chunks: List[Chunk] = []
        for h in hunks:
            symbols = (extract_symbols_in_range(
                           source_bytes, language, h.start_byte, h.end_byte)
                       if language else [])

            outline = ""
            if symbols:
                tail = symbols[-1]
                outline = ".".join(tail["scope_chain"] + [tail["name"]])

            # Hunk offsets are BYTE offsets, so slice the encoded source;
            # slicing the str would drift on any non-ASCII character.
            content = source_bytes[h.start_byte:h.end_byte].decode(
                "utf-8", errors="replace")

            chunks.append(Chunk(
                content=content,
                metadata={
                    # Required by the code-aware contract:
                    "outline_path": outline,
                    "function_decls": [s for s in symbols if s["kind"] == "function"],
                    "class_decls":    [s for s in symbols if s["kind"] == "class"],
                    "imports": [],            # fill from the symbol extractor in production
                    "references": [],         # placeholder, same as imports
                    # Hunk-specific extras:
                    "commit_sha": h.commit,
                    "hunk_author": h.author,
                    "hunk_author_email": h.author_email,
                    "start_line": h.start_line,
                    "end_line": h.end_line,
                    # Tags downstream search can filter by:
                    "tags": [
                        f"author:{h.author}",
                        f"commit:{h.commit[:8]}",
                        f"language:{language or 'unknown'}",
                        "chunking:hunk-aware",
                    ],
                    # Source ref so platform can highlight the original lines:
                    "source_ref": {
                        "file": str(path),
                        "byte_start": h.start_byte,
                        "byte_end": h.end_byte,
                        "line_start": h.start_line,
                        "line_end": h.end_line,
                    },
                },
            ))

        # A trailing newline terminates the last line rather than starting
        # a new one, so only add one for an unterminated final line.
        line_count = source.count("\n")
        if source and not source.endswith("\n"):
            line_count += 1

        return ParseResult(
            chunks=chunks,
            stats={
                "language": language,
                "hunk_count": len(hunks),
                "line_count": line_count,
            },
        )


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Build the CLI wrapper around the Parser, then hand control to it.
    # `make_cli` produces the §4.3-compliant command set:
    #   python -m hunk_aware_code_parser parse        --input ... --output ...
    #   python -m hunk_aware_code_parser partial-parse ...
    #   python -m hunk_aware_code_parser info
    cli = make_cli(HunkAwareCodeParser)
    cli.run()
