From 7bc351d8435a0d6cd5be33a3cc2ad80ed1ef2450 Mon Sep 17 00:00:00 2001 From: KMiNT21 Date: Fri, 1 Mar 2019 22:55:32 +0200 Subject: [PATCH] Fix ignoring non-Cyrillic in RussianStemmer --- summa/preprocessing/snowball.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/summa/preprocessing/snowball.py b/summa/preprocessing/snowball.py index 64a166f..6d597a5 100644 --- a/summa/preprocessing/snowball.py +++ b/summa/preprocessing/snowball.py @@ -2969,8 +2969,11 @@ def stem(self, word): chr_exceeded = True break - if chr_exceeded: - word = self.__cyrillic_to_roman(word) + # If there are no Cyrillic chars -> return as is + if not chr_exceeded: + return word + + word = self.__cyrillic_to_roman(word) step1_success = False adjectival_removed = False @@ -3115,8 +3118,7 @@ def stem(self, word): if word.endswith("'"): word = word[:-1] - if chr_exceeded: - word = self.__roman_to_cyrillic(word) + word = self.__roman_to_cyrillic(word) return word