From 14e8f564c95aa252314101662f237409dc68397a Mon Sep 17 00:00:00 2001 From: yukimemi Date: Thu, 26 Mar 2026 22:33:48 +0900 Subject: [PATCH] fix: include raw remaining input as suffix in romaji_to_hiragana_predictively MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the trailing characters of a query are an incomplete romaji syllable (e.g. "s" at the end of "infurawbs"), the predictive suffix generator only produced kana alternatives ([さしすせそ…]) and never the plain ASCII character. This caused mixed kana+ASCII filenames like "インフラWBS" to be missed when querying "infurawbs": the generated pattern contained `インフラwb[サシスセソ…]` but not `インフラwbs`, so the case-insensitive match against "WBS" failed. Fix: after building the kana suffix set, also insert the raw remaining input (`query.to_vec()`) as one extra suffix option. This lets callers (e.g. query_a_word_with_generator) generate the literal pattern "インフラwbs" in addition to the kana alternatives, which then matches "インフラWBS" via the (?i) flag. Existing tests for romaji_to_hiragana_predictively_2/3/4 are updated to reflect that the raw remaining input is now included in the suffix list alongside the kana alternatives. --- src/migemo/romaji_processor.rs | 55 ++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/src/migemo/romaji_processor.rs b/src/migemo/romaji_processor.rs index aab59c1..2b7bc4a 100644 --- a/src/migemo/romaji_processor.rs +++ b/src/migemo/romaji_processor.rs @@ -167,6 +167,12 @@ impl RomajiProcessor { set.insert(value_buffer.clone()); } } + // Also include the raw remaining input as a suffix option. + // This ensures that e.g. querying "infurawbs" against "インフラWBS" works: + // the trailing "s" (a romaji prefix for sa/si/su/…) must also match as the + // plain ASCII character 's', otherwise the generated pattern only contains + // kana alternatives like [サシスセソ…] and misses the ASCII 'S'. 
+ set.insert(query.to_vec()); return RomajiPredictiveResult { prefix: hiragana, suffixes: set.into_iter().collect(), @@ -398,7 +404,9 @@ mod tests { #[test] fn romaji_to_hiragana_predictively_2() { let (prefix, suffixes) = romaji_to_hiragana_predictively("ky"); - let mut expected_suffixes = vec!["きゃ", "きぃ", "きぇ", "きゅ", "きょ"]; + // "ky" itself is included as a raw-ASCII suffix so callers can also generate + // a literal "ky" pattern alongside the kana alternatives. + let mut expected_suffixes = vec!["ky", "きゃ", "きぃ", "きぇ", "きゅ", "きょ"]; expected_suffixes.sort(); assert_eq!(prefix, ""); assert_eq!(suffixes, expected_suffixes); @@ -407,7 +415,8 @@ mod tests { #[test] fn romaji_to_hiragana_predictively_3() { let (prefix, suffixes) = romaji_to_hiragana_predictively("kky"); - let mut expected_suffixes = vec!["きゃ", "きぃ", "きぇ", "きゅ", "きょ"]; + // "ky" is included as a raw-ASCII suffix (same reason as test 2). + let mut expected_suffixes = vec!["ky", "きゃ", "きぃ", "きぇ", "きゅ", "きょ"]; expected_suffixes.sort(); assert_eq!(prefix, "っ"); assert_eq!(suffixes, expected_suffixes); @@ -416,8 +425,9 @@ mod tests { #[test] fn romaji_to_hiragana_predictively_4() { let (prefix, suffixes) = romaji_to_hiragana_predictively("n"); + // "n" itself is included as a raw-ASCII suffix alongside kana alternatives. let mut expected_suffixes = vec![ - "にょ", "の", "にゃ", "ぬ", "ね", "な", "にぇ", "にゅ", "に", "ん", "にぃ", + "n", "にょ", "の", "にゃ", "ぬ", "ね", "な", "にぇ", "にゅ", "に", "ん", "にぃ", ]; expected_suffixes.sort(); assert_eq!(prefix, ""); @@ -435,4 +445,43 @@ mod tests { fn romaji_to_hiragana_predictively_w() { let (_, _) = romaji_to_hiragana_predictively("w"); } + + // --- regression: trailing romaji consonant must include the raw ASCII suffix --- + // + // When a query like "infurawbs" is processed, the trailing "s" is an incomplete + // romaji syllable (prefix of sa/si/su/…). 
Before the fix, the suffix set only + // contained kana alternatives ([さしすせそ…]) and never the plain ASCII "s", so + // the query could not match a filename like "インフラWBS". + // + // After the fix the raw remaining input ("s") is added as one extra suffix, which + // lets query_a_word generate the pattern "インフラwbs" (matched case-insensitively). + + #[test] + fn predictive_suffix_includes_raw_ascii_trailing_consonant() { + // "infurawbs": after consuming "infura"→"いんふら", remaining is "wbs". + // "w" and "b" pass through as-is; "s" triggers suffix generation. + // The suffix set must contain the raw "s" so that "インフラwbs" is generated. + let (prefix, suffixes) = romaji_to_hiragana_predictively("infurawbs"); + assert_eq!(prefix, "いんふらwb"); + assert!( + suffixes.iter().any(|s| s == "s"), + "expected raw 's' in suffixes, got: {:?}", + suffixes + ); + // at least one kana alternative must still be present + assert!(suffixes.iter().any(|s| s == "す" || s == "さ" || s == "せ")); + } + + #[test] + fn predictive_suffix_includes_raw_for_single_trailing_consonant() { + // Simplest case: the whole query is the lone consonant "s", so the prefix is + // empty and the raw "s" must appear among the suffixes. + let (prefix, suffixes) = romaji_to_hiragana_predictively("s"); + assert_eq!(prefix, ""); + assert!( + suffixes.iter().any(|s| s == "s"), + "expected raw 's' in suffixes, got: {:?}", + suffixes + ); + } }