Source code for ggblab.parser

import re
import polars as pl
import networkx as nx
from itertools import combinations, chain
from .persistent_counter import PersistentCounter
from .utils import flatten


[docs] class ggb_parser: """Dependency graph parser for GeoGebra constructions. Analyzes object relationships in GeoGebra constructions by building directed graphs using NetworkX. Provides two graph representations: - G (full dependency graph): Complete construction dependencies - G2 (simplified subgraph): Minimal construction sequences (DEPRECATED) The parse() method builds the forward/backward dependency graph (G). The parse_subgraph() method attempts minimal extraction but has critical performance limitations (see method docstring and ARCHITECTURE.md). Command learning: - Automatically extracts and caches GeoGebra commands from construction protocols - Persists command names to a shelve database for cross-project learning - Supports enable/disable of persistence via cache_enabled flag Attributes: df (polars.DataFrame): Construction protocol dataframe G (nx.DiGraph): Full dependency graph G2 (nx.DiGraph): Simplified subgraph (from parse_subgraph) roots (list): Objects with no dependencies (in-degree = 0) leaves (list): Terminal objects (out-degree = 0) rd (dict): Reverse mapping from object name to DataFrame row number ft (dict): Tokenized function definitions, flattened command_cache (shelve.DbfilenameShelf): Persistent command database cache_enabled (bool): Enable/disable automatic persistence Example: >>> parser = ggb_parser() >>> parser.df = construction_dataframe >>> parser.parse() >>> print(parser.roots) # Independent objects >>> print(parser.leaves) # Terminal constructions >>> commands = parser.get_known_commands() # Retrieved cached commands See: docs/architecture.md § Dependency Parser Architecture """ pl.Config.set_tbl_rows(-1) COLUMNS = ["Type", "Command", "Value", "Caption", "Layer"] SHAPES = ["point", "segment", "vector", "ray", "line", "circle", "polygon", "triangle", "quadrilateral"] def __init__(self, cache_path=None, cache_enabled=True): """Initialize the parser with optional command caching. Args: cache_path (str, optional): Path to shelve database for command persistence. Defaults to '.ggblab_command_cache' in current directory. cache_enabled (bool): Enable automatic persistence of discovered commands. Default: True """ cache_path = cache_path or '.ggblab_command_cache' self.command_cache = PersistentCounter(cache_path=cache_path, enabled=cache_enabled)
[docs] def parse(self): """Build the full dependency graph (G) from construction protocol. Analyzes the construction dataframe (self.df) and builds: - Forward dependencies: Object A depends on B (B → A edge) - Backward dependencies: Object A is used by B (A → B edge) The graph nodes are GeoGebra object names; edges represent dependencies. Attributes set: - self.G: NetworkX DiGraph of dependencies - self.roots: Objects with no dependencies (starting points) - self.leaves: Objects with no dependents (endpoints) - self.rd: Reverse dict (name → DataFrame row index) - self.ft: Tokenized function calls for each object Also extracts and persists command names if caching is enabled. Example: >>> parser.df = polars.DataFrame(construction_protocol) >>> parser.parse() >>> print(list(parser.G.edges())) # [(A, B), (B, C), ...] """ # reverse dict from name to row number of dataframe self.rd = {v: k for k, v in enumerate(self.df["Name"])} # tokenized function, flattened self.ft = {n: list([e for e in flatten(self.tokenize_with_commas(c)) if e != ',']) for n, c in self.df.filter(pl.col("Type").is_in(self.SHAPES)).select(["Name", "Command"]).iter_rows()} # Extract and cache command names from all commands in the dataframe for command_str in self.df["Command"]: if command_str: result = self.tokenize_with_commas(command_str, extract_commands=True) self.command_cache.increment(result['commands']) # graph in forward/backward dependency # self.graph = {k: self.ffd(k) for k in self.df.filter(pl.col("Type") != "text")["Name"]} # self.rgraph = {k: self.fbd(k) for k in self.ft} self.G = nx.DiGraph() self.G.clear() for n in self.ft: for o in self.ft[n]: if o in self.rd: # print(n, o) self.G.add_edge(o, n) for o in self.fbd(n): # print(o, ggb.ft[o]) if n in self.ft[o]: # print(o, n) self.G.add_edge(n, o) self.roots = [v for v, d in self.G.in_degree() if d == 0] self.leaves = [v for v, d in self.G.out_degree() if d == 0]
[docs] def parse_subgraph(self): """ Extract a simplified dependency subgraph (G2) from the full graph (G). WARNING: This implementation has significant performance limitations and should be replaced in v1.0. See ARCHITECTURE.md for details. Algorithm: - Enumerates all combinations of root objects (O(2^n) combinations) - For each combination, identifies dependent objects that exclusively depend on that combination - Adds edges to G2 when dependencies are uniquely determined KNOWN LIMITATIONS (Critical): 1. **Combinatorial Explosion**: O(2^n) time complexity where n = number of root objects. - With 15 roots: ~32,000 paths (manageable) - With 20 roots: ~1,000,000 paths (slow) - With 25+ roots: computation becomes intractable 2. **Infinite Loop Risk**: The while loop may not terminate under certain graph topologies where _nodes1 is not updated in each iteration. 3. **Limited N-ary Dependency Support**: Only handles 1-2 parents. Constructions where 3+ objects jointly create one output (e.g., polygon from 3+ points) have incomplete representation in G2 (these edges are silently skipped). 4. **Redundant Computation**: Neighbor lists are recomputed on every iteration of inner loops, causing O(n) redundant work. 5. **Debug Output**: Contains print() statements that should be removed for production. WORKAROUND: - Use with constructions having <15 independent root objects - For larger constructions, consider implementing the optimized algorithm described in ARCHITECTURE.md § Dependency Parser Architecture FUTURE: Replace with topological sort + reachability pruning in v1.0 for O(n(n+m)) complexity. See: https://github.com/[repo]/ARCHITECTURE.md#dependency-parser-architecture """ self.G2 = nx.DiGraph() self.G2.clear() _nodes0 = set() _nodes1 = {n for n in self.roots if n in self.ft} # set(['C', 'A']) while _nodes1: # print(f"path: {_nodes0} {_nodes1}") _paths = [] for __p in (list(chain.from_iterable(combinations(_nodes1, r) for r in range(1, len(_nodes1) + 1)))): _paths.append(_nodes0 | set(__p)) for _nodes2 in _paths: # _nodes2 = set(__p) # print(f"to: {_nodes2 - _nodes0}") _nodes3 = set() for n1 in _nodes2: _n = [set(self.G.neighbors(__n)) for __n in _nodes2] # print(set().union(*_n)) for n0 in set().union(*_n): # print(f"{n0} {ggb.ft[n0]}") d = {n: nx.descendants(self.G, n) for n in self.G.neighbors(n0)} for n1 in sorted(d.keys(), key=lambda e: len(d[e]), reverse=True): # if len(d[n1]) and not ggb.fbd(n0) - (_nodes2 | {n1}): if len(d[n1]) and not nx.ancestors(self.G, n0) - (_nodes2 | {n1}): _nodes3 |= {n0} for n in _nodes3 - _nodes2 - _nodes1: match len(_nodes2 - _nodes0): case 1: o, = tuple(_nodes2 - _nodes0) print(f"found: '{o}' => '{n}'") self.G2.add_edge(o, n) case 2: o1, o2, = tuple(_nodes2 - _nodes0) if o1 in self.G2 and n in self.G2.neighbors(o1): pass elif o2 in self.G2 and n in self.G2.neighbors(o2): pass else: print(f"found: '{o1}', '{o2}' => '{n}'") self.G2.add_edge(o1, n) self.G2.add_edge(o2, n) case _: pass _nodes0 |= _nodes1 _nodes1 = _nodes3 - _nodes2 - _nodes1
# def parse_subgraph_improved(self): # """ # Identify minimal construction sequences by analyzing the dependency graph. # Uses a topological sort + pruning approach instead of exhaustive path enumeration. # """ # self.G2 = nx.DiGraph() # # Identify which nodes are essential (no alternative path) # for node in self.G.nodes(): # direct_parents = list(self.G.predecessors(node)) # if not direct_parents: # continue # # Check if all direct parents are needed # # A parent is needed if removing it disconnects node from any root # parents_to_keep = [] # for parent in direct_parents: # # Check if there's an alternative path without this parent # G_without = self.G.copy() # G_without.remove_edge(parent, node) # has_alternative = nx.has_path(G_without, parent, node) # if not has_alternative: # parents_to_keep.append(parent) # # Add edges for essential parents # for parent in parents_to_keep: # self.G2.add_edge(parent, node)
[docs] def ffd(self, k, recursive=True): if recursive: def _ffd(k): if k in self.ft: # regular polygon contain not much dependency (includes new vertices and auxiliary edges) # return [[e, _ffd(e)] for e in ft if k in (ft[e] + find_returns(k)[1:])] return ([[e, _ffd(e)] for e in self.ft if k in self.ft[e]] + [[e, _ffd(e)] for e in self.find_returns(k)[1:]]) else: return [] return set(flatten(_ffd(k))) else: return {e for e in self.ft if k in self.ft[e]}
[docs] def fbd(self, k, recursive=True): if recursive: def _fbd(k): if k in self.ft: return [[e, _fbd(e)] for e in self.ft[k] if e in self.ft] + [self.vertex_on_regular_polygon(k)] else: return [] return set(flatten(_fbd(k))) - {k} else: return {e for e in self.ft[k] if e in self.ft}
[docs] def initialize_dataframe(self, df=None, file=None): if df is not None: self.df = df elif file is not None: self.df = pl.read_parquet(file) else: raise ValueError("Either df or file must be provided.") self.df = (self.df .transpose(include_header=True, header_name="Name", column_names=self.COLUMNS) .with_columns(pl.col("Layer").cast(pl.Int64).fill_null(0))) return self
[docs] def write_parquet(self, file=None): if file is not None: self.df.write_parquet(file) return self
[docs] def vertex_on_regular_polygon(self, v): try: if self.ft[v][0] == "Polygon" and int(self.ft[v][3]): return [self.df.filter((pl.col("Command") == self.df[self.rd[v]]["Command"]) & (pl.col("Type") == "polygon"))["Name"].item()] except (IndexError, ValueError): return [] else: return []
[docs] def tokenize_with_commas(self, cmd_string, extract_commands=False): # register_expr=False """Tokenize a GeoGebra command string into a structured list representation. Parses a mathematical or GeoGebra-like command string and converts it into a nested list structure that preserves parentheses, brackets, and commas. This is useful for analyzing GeoGebra command syntax and extracting object dependencies. === COMMA PRESERVATION AND GEOGEBRA'S IMPLICIT MULTIPLICATION === This tokenizer preserves commas as explicit tokens for a critical reason: GeoGebra outputs commands with implicit multiplication operators omitted. Example: Internal representation: Circle(2 * a, b) GeoGebra output: Circle(2a, b) <- Information loss! The '*' operator is completely omitted, destroying information. This is a one-way transformation: we can't reliably reconstruct "2*a" from "2a" without external context (is it "2 times a" or "variable named 2a"?). BUT: GeoGebra ALWAYS uses comma-separation for parameter lists. We exploit this invariant. By preserving commas in the token stream, we can: 1. Identify parameter boundaries (comma = separator) 2. Use whitespace/context to infer where implicit multiplication occurred This is a workaround for GeoGebra's poor design. So the question becomes: - BLAME GeoGebra for being a one-way encoder (lose the *? Why?) - PRAISE the developer who recognized the comma-separation invariant Engineering lesson: deal with imperfect systems and find creative solutions. GeoGebra didn't help us. We had to be smarter than it. Args: cmd_string (str): Input command string (e.g., "Circle(A, Distance(A, B))"). extract_commands (bool, optional): If True, also extract command name candidates (tokens preceding '(' or '['). Returns a dict with 'tokens' and 'commands' keys. If False (default), returns only the token list for backward compatibility. Default: False # register_expr (bool, optional): Future feature - if True, replace object references # with abstract labels like ${0}, ${1}, etc. based on # generation order in the construction protocol. # This is useful because GeoGebra applets may rename # objects at runtime, but the generation order remains # stable within a construction. Not yet implemented. Returns: list or dict: - If extract_commands=False (default): Nested list structure with tokens. Parentheses/brackets create nested lists; commas are preserved as ','. - If extract_commands=True: Dict with keys: - 'tokens': Nested list structure (as above) - 'commands': Set of command name candidates (tokens preceding '(' or '[') Raises: ValueError: If parentheses/brackets are mismatched. Examples: >>> tokenize_with_commas("Circle(A, 2)") ['Circle', ['A', ',', '2']] >>> tokenize_with_commas("Circle(A, 2)", extract_commands=True) {'tokens': ['Circle', ['A', ',', '2']], 'commands': {'Circle'}} >>> tokenize_with_commas("Distance(Point(1, 2), B)") ['Distance', [['Point', ['1', ',', '2']], ',', 'B']] >>> tokenize_with_commas("Distance(Point(1, 2), B)", extract_commands=True) {'tokens': ['Distance', [['Point', ['1', ',', '2']], ',', 'B']], 'commands': {'Distance', 'Point'}} Note: Empty or non-string input returns an empty list (or empty dict if extract_commands=True) without raising an error. Commas are INTENTIONALLY preserved as tokens to work around GeoGebra's implicit multiplication. This is not a quirk; it's the core design decision. Future (register_expr parameter): When implemented, would enable stable object references by using construction order indices instead of runtime labels. Example output: ['Circle', ['${0}', ',', '${1}']] if register_expr=True and the objects were the 0th and 1st in the protocol. """ if not cmd_string or not isinstance(cmd_string, str): # raise ValueError("Input must be a non-empty string.") if extract_commands: return {'tokens': [], 'commands': set()} return [] # Regex pattern to match (1) parentheses, (2) commas, or (3) any sequence of non-spacing characters. tokens = re.findall(r'[()\[\],]|[^()\[\]\s,]+', cmd_string) stack = [[]] commands = set() if extract_commands else None prev_token = None for token in tokens: if token in ['(', '[']: # If extracting commands and previous token looks like a command name, save it if extract_commands and prev_token and isinstance(prev_token, str) and prev_token[0].isalpha(): commands.add(prev_token) # Begin a new nested list new_list = [] stack[-1].append(new_list) stack.append(new_list) prev_token = None elif token in [')', ']']: # Close an active nested list if len(stack) > 1: stack.pop() else: raise ValueError("Mismatched parentheses/brackets in input string.") prev_token = None elif token == ',': # Treat commas as tokens stack[-1].append(',') prev_token = None else: # Normal token gets added to the current list # Future: if register_expr and token in rd: # token = f"${rd[token]}" # Replace with abstract order-based label stack[-1].append(token) prev_token = token if len(stack) != 1: raise ValueError("Mismatched parentheses/brackets in input string.") # Auto-cache commands if extract_commands is True if extract_commands and commands: self.command_cache.increment(commands) return {'tokens': stack[0], 'commands': commands} return stack[0]
[docs] def reconstruct_from_tokens(self, parsed_tokens): """Reconstruct the original command string from tokenized structured list. Takes a nested list structure produced by tokenize_with_commas() and reconstructs the original command string with proper parentheses, commas, and spacing. Args: parsed_tokens (list or str): Tokenized structured list, or a single token as a string. Returns: str: Reconstructed command string matching the original input structure. Raises: ValueError: If parsed_tokens contains unexpected types. Examples: >>> parser.reconstruct_from_tokens(['Circle', ['A', ',', '2']]) 'Circle(A, 2)' >>> parser.reconstruct_from_tokens(['Distance', [['Point', ['1', ',', '2']], ',', 'B']]) 'Distance(Point(1, 2), B)' Note: This function is the inverse of tokenize_with_commas(). It handles proper spacing around operators and parentheses. The 'register_expr' parameter (commented out) was intended for register expressions, where applet-assigned labels could be replaced with construction-order-based abstract expressions like '${n}', since GeoGebra may reassign object labels but construction order remains stable. """ if isinstance(parsed_tokens, str): # If the token is a string, return it directly return parsed_tokens elif isinstance(parsed_tokens, list): result = [] for i, token in enumerate(parsed_tokens): if isinstance(token, list): # For nested lists, recursively reconstruct and wrap in parentheses result.append(f"({self.reconstruct_from_tokens(token)})") elif token == ',': # Append a comma directly result.append(',') else: # For normal tokens, add them to the result list result.append(token) # Reconstruct the final string with proper spacing and joining rules return re.sub(r'^\- ', '-', re.sub(r'([^+\-*/]) \(', r'\1(', ' '.join(result).replace(' , ', ', '))) else: raise ValueError("Unexpected token type in parsed_tokens.")