# Source code for ggblab.parser

"""Lightweight tokenizer utilities used by `ggblab`.

This module provides a compact `ggb_parser` class exposing only the
tokenization and token-reconstruction helpers required by the core package.
For richer parser features and DataFrame-based construction helpers, install
the optional `ggblab_extra` package and import the full implementations from
`ggblab_extra.parser` or `ggblab_extra.construction_parser`.
"""

import re
import logging
from ggblab.persistent_counter import PersistentCounter



class ggb_parser:
    """Minimal parser exposing only `tokenize_with_commas` and `reconstruct_from_tokens`.

    This lightweight class keeps the tokenizer and its inverse while leaving
    out other parser functionality. For richer parser features or
    DataFrame-based construction helpers, install ``ggblab_extra``.
    The core implementation intentionally keeps a compact surface area so
    that importing ``ggblab`` remains lightweight.

    Attributes:
        command_cache: Object with an ``increment(commands)`` method that is
            called with the set of command names found by
            ``tokenize_with_commas(..., extract_commands=True)``.
    """

    def __init__(self, cache_path=None, cache_enabled=True):
        """Initialize the lightweight parser and optional command cache.

        Args:
            cache_path (str, optional): Path handed to ``PersistentCounter``.
                Falsy values fall back to ``'.ggblab_command_cache'``.
            cache_enabled (bool, optional): Forwarded to ``PersistentCounter``
                as ``enabled``. Default: True.

        Note:
            Constructing the cache is best-effort: if ``PersistentCounter``
            raises for any reason, a no-op stand-in is used so the parser
            itself stays usable.
        """
        cache_path = cache_path or '.ggblab_command_cache'
        try:
            self.command_cache = PersistentCounter(cache_path=cache_path, enabled=cache_enabled)
        except Exception:
            # Deliberate best-effort fallback; leave a trace for debugging
            # instead of failing silently.
            logging.getLogger(__name__).debug(
                "PersistentCounter unavailable; using no-op command cache",
                exc_info=True,
            )

            class _Noop:
                """Stand-in cache whose ``increment`` does nothing."""

                def increment(self, *args, **kwargs):
                    return

            self.command_cache = _Noop()

    def tokenize_with_commas(self, cmd_string, extract_commands=False):  # register_expr=False
        """Tokenize a GeoGebra command string into a nested list.

        The string is split into brackets, commas and runs of other
        non-whitespace characters. Every opening ``(`` or ``[`` starts a new
        nested list and every closing ``)`` or ``]`` ends the current one;
        commas are kept as explicit ``','`` tokens.

        Why commas are preserved:
            GeoGebra prints commands with implicit multiplication, e.g. the
            internal ``Circle(2 * a, b)`` is emitted as ``Circle(2a, b)``.
            The ``*`` cannot be reliably recovered from the output alone
            ("2 times a" vs. an object named "2a"). Parameter lists, however,
            are always comma-separated, so keeping commas in the token stream
            preserves parameter boundaries for later analysis. This is the
            core design decision of the tokenizer, not a quirk.

        Args:
            cmd_string (str): Input command string
                (e.g., "Circle(A, Distance(A, B))").
            extract_commands (bool, optional): If True, also collect command
                name candidates, i.e. tokens that directly precede ``(`` or
                ``[`` and start with a letter, and return a dict. If False
                (default), return only the token list.

        Returns:
            list or dict:
                - If extract_commands=False (default): nested list of tokens.
                - If extract_commands=True: dict with keys
                  ``'tokens'`` (the nested list) and ``'commands'`` (a set of
                  command name candidates, possibly empty).

        Raises:
            ValueError: If opening and closing brackets are unbalanced.

        Examples:
            >>> parser = ggb_parser()
            >>> parser.tokenize_with_commas("Circle(A, 2)")
            ['Circle', ['A', ',', '2']]

            >>> parser.tokenize_with_commas("Circle(A, 2)", extract_commands=True)
            {'tokens': ['Circle', ['A', ',', '2']], 'commands': {'Circle'}}

            >>> parser.tokenize_with_commas("Distance(Point(1, 2), B)")
            ['Distance', ['Point', ['1', ',', '2'], ',', 'B']]

            >>> parser.tokenize_with_commas("A + B", extract_commands=True)
            {'tokens': ['A', '+', 'B'], 'commands': set()}

        Note:
            Empty or non-string input returns an empty list (or a dict with an
            empty token list and empty command set if extract_commands=True)
            without raising an error.

            Only the nesting depth is checked: ``(`` and ``[`` are treated
            alike, so ``"(a]"`` is accepted and both kinds produce the same
            nested-list structure.

            Future (``register_expr``, not implemented): object references
            could be replaced by construction-order labels such as ``${0}``,
            ``${1}``, because GeoGebra applets may rename objects at runtime
            while the generation order stays stable.
        """
        if not cmd_string or not isinstance(cmd_string, str):
            if extract_commands:
                return {'tokens': [], 'commands': set()}
            return []

        # Match (1) a single bracket or comma, or (2) any run of characters
        # that are neither brackets, commas nor whitespace.
        tokens = re.findall(r'[()\[\],]|[^()\[\]\s,]+', cmd_string)

        stack = [[]]  # stack[-1] is the list currently being filled
        commands = set()
        prev_token = None  # last plain token, reset by brackets and commas

        for token in tokens:
            if token in ('(', '['):
                # A plain token starting with a letter right before an opening
                # bracket is a command name candidate.
                if extract_commands and prev_token and prev_token[0].isalpha():
                    commands.add(prev_token)
                nested = []
                stack[-1].append(nested)
                stack.append(nested)
                prev_token = None
            elif token in (')', ']'):
                if len(stack) == 1:
                    # Closing bracket without a matching opener.
                    raise ValueError("Mismatched parentheses/brackets in input string.")
                stack.pop()
                prev_token = None
            elif token == ',':
                stack[-1].append(',')
                prev_token = None
            else:
                # Future: if register_expr and token in rd:
                #     token = f"${rd[token]}"  # abstract order-based label
                stack[-1].append(token)
                prev_token = token

        if len(stack) != 1:
            # At least one opening bracket was never closed.
            raise ValueError("Mismatched parentheses/brackets in input string.")

        if not extract_commands:
            return stack[0]

        if commands:
            # Usage counting is auxiliary; never let it break tokenization.
            try:
                self.command_cache.increment(commands)
            except Exception:
                logging.getLogger(__name__).exception("Failed to increment command_cache with commands")

        # Always honour the documented dict shape, even with no commands found.
        return {'tokens': stack[0], 'commands': commands}

    def reconstruct_from_tokens(self, parsed_tokens):
        """Rebuild a command string from a nested token list.

        Takes a structure as produced by ``tokenize_with_commas()`` and joins
        it back into a string: nested lists are wrapped in parentheses,
        tokens are separated by single spaces, commas are rendered as
        ``", "``, and the space between a token and a following ``(`` is
        removed unless that token ends in ``+``, ``-``, ``*``, ``/`` or ``,``.

        Args:
            parsed_tokens (list or str): Tokenized structured list, or a single
                token as a string (returned unchanged).

        Returns:
            str: Reconstructed command string.

        Raises:
            ValueError: If ``parsed_tokens`` is neither a list nor a string,
                or a list contains an element that is neither.

        Examples:
            >>> parser = ggb_parser()
            >>> parser.reconstruct_from_tokens(['Circle', ['A', ',', '2']])
            'Circle(A, 2)'

            >>> parser.reconstruct_from_tokens(
            ...     ['Distance', ['Point', ['1', ',', '2'], ',', 'B']])
            'Distance(Point(1, 2), B)'

        Note:
            Nested lists are always emitted with parentheses, so square
            brackets in the original input come back as ``(...)`` and the
            original whitespace is normalized rather than reproduced.

            A ``register_expr`` option (not implemented) was intended to let
            applet-assigned labels be replaced by construction-order-based
            expressions like ``${n}``, since GeoGebra may reassign object
            labels while construction order remains stable.
        """
        if isinstance(parsed_tokens, str):
            return parsed_tokens
        if not isinstance(parsed_tokens, list):
            raise ValueError("Unexpected token type in parsed_tokens.")

        parts = []
        for token in parsed_tokens:
            if isinstance(token, list):
                # Nested group: rebuild recursively and wrap in parentheses.
                parts.append(f"({self.reconstruct_from_tokens(token)})")
            elif isinstance(token, str):
                parts.append(token)
            else:
                # Fail with the documented exception instead of a TypeError
                # from str.join further down.
                raise ValueError("Unexpected token type in parsed_tokens.")

        # "a , b" -> "a, b"
        text = ' '.join(parts).replace(' , ', ', ')
        # "Circle (A)" -> "Circle(A)"; keep the space after an arithmetic
        # operator ("a + (b)") and after a comma ("A, (1, 2)").
        text = re.sub(r'([^+\-*/,]) \(', r'\1(', text)
        # Leading unary minus: "- x" -> "-x"
        return re.sub(r'^\- ', '-', text)