From 9fc96ebb3deadd33557db030bb6582b81c024d83 Mon Sep 17 00:00:00 2001 From: Tony Stark Date: Fri, 30 May 2025 00:03:50 -0500 Subject: [PATCH] Support multi-word dictionary entries in lexer The lexer now matches sequences of words against the dictionary instead of just single words, allowing for phrases and compound terms to be recognized as a single token. --- packages/contracts/src/lib/a_lexer.cairo | 72 +++++++++++++++------ packages/contracts/src/lib/dictionary.cairo | 6 +- 2 files changed, 57 insertions(+), 21 deletions(-) diff --git a/packages/contracts/src/lib/a_lexer.cairo b/packages/contracts/src/lib/a_lexer.cairo index 91985f50..b172e9cf 100644 --- a/packages/contracts/src/lib/a_lexer.cairo +++ b/packages/contracts/src/lib/a_lexer.cairo @@ -181,29 +181,61 @@ pub mod lexer { fn match_tokens(world: WorldStorage, words: Array) -> Array { let mut tokens: Array = array![]; - for i in 0..words.len() { - // iterate over the words in the string and find a dictionary match - let mut token = Token { - position: i, - text: words[i].clone(), - token_type: TokenType::Unknown, - token_value: 0, - target: 0, + let mut i: u32 = 0; + + while i < words.len() { + let mut matched = false; + let mut max_match_length = 1; + let mut best_match: Option = Option::None; + + // Try matching sequences of increasing length from current position + for len in 1..(words.len() - i + 1) { + // Build the sequence of words to try matching + let mut sequence = ""; + for j in 0..len { + if j > 0 { + sequence = sequence + " "; + } + sequence = sequence + words[i + j].clone(); + }; + // Try to match this sequence in the dictionary + let dict_entry = get_dict_entry(world, sequence.clone()); + if dict_entry.is_some() { + let dict_entry = dict_entry.unwrap(); + max_match_length = len; + best_match = + Option::Some( + Token { + position: i, + text: sequence.clone(), + token_type: dict_entry.tokenType.clone(), + token_value: dict_entry.n_value, + target: 0, + }, + ); + matched = true; + } }; - let dict_entry = get_dict_entry(world, words[i].clone()); - if dict_entry.is_some() { - let dict_entry = dict_entry.unwrap(); - token = - Token { - position: i, - text: words[i].clone(), - token_type: dict_entry.tokenType.clone(), - token_value: dict_entry.n_value, - target: 0, - }; + if matched { + // Add the best match we found + tokens.append(best_match.unwrap()); + i += max_match_length; + } else { + // No match found, add as unknown token + tokens + .append( + Token { + position: i, + text: words[i].clone(), + token_type: TokenType::Unknown, + token_value: 0, + target: 0, + }, + ); + i += 1; } - tokens.append(token); }; + tokens } diff --git a/packages/contracts/src/lib/dictionary.cairo b/packages/contracts/src/lib/dictionary.cairo index 2fc9efda..17cda052 100644 --- a/packages/contracts/src/lib/dictionary.cairo +++ b/packages/contracts/src/lib/dictionary.cairo @@ -31,7 +31,11 @@ pub fn add_to_dictionary( } pub fn get_dict_entry(world: WorldStorage, word: ByteArray) -> Option { - let dict_key: felt252 = word.clone().to_felt252_word().unwrap(); + let dict_key_result = word.clone().to_felt252_word(); + if dict_key_result.is_err() { + return Option::None; + } + let dict_key: felt252 = dict_key_result.unwrap(); let entry: Dict = world.read_model(dict_key); if (entry.word == "") { return Option::None;