diff --git a/unit_testing.py b/unit_testing.py index 928eac6..82aa3f1 100644 --- a/unit_testing.py +++ b/unit_testing.py @@ -11,32 +11,63 @@ def test_positives(self): self.assertEqual(w2n.word_to_num('thousand'), 1000) self.assertEqual(w2n.word_to_num('million'), 1000000) self.assertEqual(w2n.word_to_num('billion'), 1000000000) - self.assertEqual(w2n.word_to_num('two million three thousand nine hundred and eighty four'), 2003984) + self.assertEqual(w2n.word_to_num( + 'two million three thousand nine hundred and eighty four'), 2003984) self.assertEqual(w2n.word_to_num('two thousand and nineteen'), 2019) - self.assertEqual(w2n.word_to_num('two million three thousand and nineteen'), 2003019) + self.assertEqual(w2n.word_to_num( + 'two million three thousand and nineteen'), 2003019) self.assertEqual(w2n.word_to_num('three billion'), 3000000000) self.assertEqual(w2n.word_to_num('three million'), 3000000) self.assertEqual(w2n.word_to_num('three hundred thousand'), 300000) - self.assertEqual(w2n.word_to_num('one hundred twenty three million four hundred fifty six thousand seven hundred eighty nine'), 123456789) - self.assertEqual(w2n.word_to_num('two million twenty three thousand and forty nine'), 2023049) - + self.assertEqual(w2n.word_to_num( + 'one hundred twenty three million four hundred fifty six thousand seven hundred eighty nine'), 123456789) + self.assertEqual(w2n.word_to_num( + 'two million twenty three thousand and forty nine'), 2023049) + # Handling metric suffixes self.assertEqual(w2n.word_to_num('$150k'), 150000) self.assertEqual(w2n.word_to_num('310 M'), 310000000) - + + # Handling metric suffixes without spaces + self.assertEqual(w2n.word_to_num('12million'), 12000000) + self.assertEqual(w2n.word_to_num('12Million'), 12000000) + self.assertEqual(w2n.word_to_num('12m'), 12000000) + self.assertEqual(w2n.word_to_num('12M'), 12000000) + self.assertEqual(w2n.word_to_num('12mn'), 12000000) + self.assertEqual(w2n.word_to_num('12Mn'), 12000000) + self.assertEqual(w2n.word_to_num('12MN'), 12000000) + self.assertEqual(w2n.word_to_num('12mil'), 12000000) + self.assertEqual(w2n.word_to_num('12Mil'), 12000000) + + self.assertEqual(w2n.word_to_num('12.3million'), 12300000) + self.assertEqual(w2n.word_to_num('12.3Million'), 12300000) + self.assertEqual(w2n.word_to_num('12.3m'), 12300000) + self.assertEqual(w2n.word_to_num('12.3M'), 12300000) + self.assertEqual(w2n.word_to_num('12.3mn'), 12300000) + self.assertEqual(w2n.word_to_num('12.3Mn'), 12300000) + self.assertEqual(w2n.word_to_num('12.3MN'), 12300000) + self.assertEqual(w2n.word_to_num('12.3mil'), 12300000) + self.assertEqual(w2n.word_to_num('12.3Mil'), 12300000) + # Minus/negative self.assertEqual(w2n.word_to_num('negative ten'), -10) self.assertEqual(w2n.word_to_num('-10'), -10) self.assertEqual(w2n.word_to_num('minus 10'), -10) self.assertEqual(w2n.word_to_num('minus ten point five'), -10.5) self.assertEqual(w2n.word_to_num('minus point five'), -0.5) - + # Excess spaces testing self.assertEqual(w2n.word_to_num('nineteen'), 19) self.assertEqual(w2n.word_to_num(' nineteen'), 19) self.assertEqual(w2n.word_to_num('nineteen '), 19) self.assertEqual(w2n.word_to_num('nineteen '), 19) - + + # Thousand mark testing + self.assertEqual(w2n.word_to_num('12,345,678'), 12345678) + self.assertEqual(w2n.word_to_num('12 345 678'), 12345678) + self.assertEqual(w2n.word_to_num('12,345,678.9'), 12345678.9) + self.assertEqual(w2n.word_to_num('12 345 678.9'), 12345678.9) + # Excess punctuation testing self.assertEqual(w2n.word_to_num('nineteen;'), 19) self.assertEqual(w2n.word_to_num('nineteen,'), 19) @@ -48,45 +79,50 @@ def test_positives(self): self.assertEqual(w2n.word_to_num('19 ;'), 19) self.assertEqual(w2n.word_to_num('19 ,'), 19) self.assertEqual(w2n.word_to_num('$ 19'), 19) - + # Joining words/symbols - self.assertEqual(w2n.word_to_num('nineteen billion and nineteen'), 19000000019) + self.assertEqual(w2n.word_to_num( + 'nineteen billion and nineteen'), 19000000019) self.assertEqual(w2n.word_to_num('one hundred and forty two'), 142) self.assertEqual(w2n.word_to_num('one hundred & forty two'), 142) self.assertEqual(w2n.word_to_num('one hundred thirty-five'), 135) self.assertEqual(w2n.word_to_num('six-four'), 64) - + # Handling regular digits alone self.assertEqual(w2n.word_to_num('112'), 112) self.assertEqual(w2n.word_to_num('11211234'), 11211234) self.assertEqual(w2n.word_to_num('2.3'), 2.3) - + # Handling decimals self.assertEqual(w2n.word_to_num('two point three'), 2.3) self.assertEqual(w2n.word_to_num('two point thirteen'), 2.13) self.assertEqual(w2n.word_to_num('nine point nine nine nine'), 9.999) - self.assertEqual(w2n.word_to_num('two million twenty three thousand and forty nine point two three six nine'), 2023049.2369) - self.assertEqual(w2n.word_to_num('one billion two million twenty three thousand and forty nine point two three six nine'), 1002023049.2369) - self.assertEqual(w2n.word_to_num('1 billion 2 million 23 thousand and 49.2369'), 1002023049.2369) + self.assertEqual(w2n.word_to_num( + 'two million twenty three thousand and forty nine point two three six nine'), 2023049.2369) + self.assertEqual(w2n.word_to_num( + 'one billion two million twenty three thousand and forty nine point two three six nine'), 1002023049.2369) + self.assertEqual(w2n.word_to_num( + '1 billion 2 million 23 thousand and 49.2369'), 1002023049.2369) # I don't even know if this should be allowed, but it works so we'll go with it self.assertEqual(w2n.word_to_num('one.one'), 1.1) - + # Handling decimals as multipliers self.assertEqual(w2n.word_to_num('four point nine million'), 4900000) self.assertEqual(w2n.word_to_num('1.3 thousand'), 1300) - + # Handling ommitted expected terms self.assertEqual(w2n.word_to_num('hundred twenty'), 120) self.assertEqual(w2n.word_to_num('point one'), 0.1) self.assertEqual(w2n.word_to_num('point nineteen'), 0.19) self.assertEqual(w2n.word_to_num('thousand million'), 1000000000) - + # Handling spelling out numbers by digit self.assertEqual(w2n.word_to_num('one niner niner'), 199) self.assertEqual(w2n.word_to_num('minus one niner niner'), -199) self.assertEqual(w2n.word_to_num('four four eight seven eight'), 44878) - self.assertEqual(w2n.word_to_num('four four eight seven eight point eight nine'), 44878.89) - + self.assertEqual(w2n.word_to_num( + 'four four eight seven eight point eight nine'), 44878.89) + # Less common number names self.assertEqual(w2n.word_to_num('two dozen'), 24) self.assertEqual(w2n.word_to_num('a gross'), 144) @@ -94,19 +130,27 @@ def test_positives(self): self.assertEqual(w2n.word_to_num('naught point five'), 0.5) self.assertEqual(w2n.word_to_num('two naught one one'), 2011) self.assertEqual(w2n.word_to_num('one decimal niner'), 1.9) - self.assertEqual(w2n.word_to_num('zero eight five decimal niner'), 85.9) - self.assertEqual(w2n.word_to_num('one zero eight five decimal niner'), 1085.9) - + self.assertEqual(w2n.word_to_num( + 'zero eight five decimal niner'), 85.9) + self.assertEqual(w2n.word_to_num( + 'one zero eight five decimal niner'), 1085.9) + # Testing the boring indexing method - self.assertEqual(w2n.num_word_indices('zero one two three four five six seven'), [ 0, 1, 2, 3, 4, 5, 6, 7 ]) - self.assertEqual(w2n.num_word_indices('four'), [ 0 ]) - self.assertEqual(w2n.num_word_indices('fourteen rats and three mice'), [ 0, 3 ]) - self.assertEqual(w2n.num_word_indices('1.5 rats (ew!) and -2 mice'), [ 0, 4 ]) - self.assertEqual(w2n.num_word_indices('Who wants to win $150,000'), [ 4 ]) - self.assertEqual(w2n.num_word_indices('There are no numbers in this sentence'), [ ]) + self.assertEqual(w2n.num_word_indices( + 'zero one two three four five six seven'), [0, 1, 2, 3, 4, 5, 6, 7]) + self.assertEqual(w2n.num_word_indices('four'), [0]) + self.assertEqual(w2n.num_word_indices( + 'fourteen rats and three mice'), [0, 3]) + self.assertEqual(w2n.num_word_indices( + '1.5 rats (ew!) and -2 mice'), [0, 4]) + self.assertEqual(w2n.num_word_indices( + 'Who wants to win $150,000'), [4]) + self.assertEqual(w2n.num_word_indices( + 'There are no numbers in this sentence'), []) def test_negatives(self): - self.assertRaises(ValueError, w2n.word_to_num, 'seventh point nineteen') + self.assertRaises(ValueError, w2n.word_to_num, + 'seventh point nineteen') self.assertRaises(ValueError, w2n.word_to_num, '19 calculators') self.assertRaises(ValueError, w2n.word_to_num, '-') self.assertRaises(ValueError, w2n.word_to_num, '19-') @@ -119,13 +163,17 @@ def test_negatives(self): self.assertRaises(ValueError, w2n.word_to_num, 'million. million.') self.assertRaises(ValueError, w2n.word_to_num, 'million & million') self.assertRaises(ValueError, w2n.word_to_num, 'three million million') - self.assertRaises(ValueError, w2n.word_to_num, 'one billion point two million twenty three thousand and forty nine point two three six nine') - self.assertRaises(ValueError, w2n.word_to_num, 'one decimal niner decimal eight') - self.assertRaises(ValueError, w2n.word_to_num, 'zero point eight five decimal niner') + self.assertRaises(ValueError, w2n.word_to_num, + 'one billion point two million twenty three thousand and forty nine point two three six nine') + self.assertRaises(ValueError, w2n.word_to_num, + 'one decimal niner decimal eight') + self.assertRaises(ValueError, w2n.word_to_num, + 'zero point eight five decimal niner') self.assertRaises(ValueError, w2n.word_to_num, 112) self.assertRaises(ValueError, w2n.word_to_num, False) self.assertRaises(ValueError, w2n.word_to_num, 'point') self.assertRaises(ValueError, w2n.word_to_num, '.') + if __name__ == '__main__': unittest.main() diff --git a/word2number/w2n.py b/word2number/w2n.py index 3676b2a..3efca16 100644 --- a/word2number/w2n.py +++ b/word2number/w2n.py @@ -1,3 +1,6 @@ +import re + + num_names = { 'zero': 0, 'naught': 0, @@ -35,7 +38,11 @@ place_abbrev = { 'k': 1000, 'm': 1000000, + 'mn': 1000000, + 'mil': 1000000, 'b': 1000000000, + 'bn': 1000000000, + 'bil': 1000000000, } place_names = { @@ -66,27 +73,40 @@ '-': '-', } -ignore_chars = [ '$', ';', ',' ] -ignore_words = [ 'a', 'and', '&', '' ] +ignore_chars = ['$', ';', ','] +ignore_words = ['a', 'and', '&', ''] + +word_to_number = {**num_names, **place_names, **dec_names, **neg_names} + -word_to_number = { **num_names, **place_names, **dec_names, **neg_names } - def num_generator(phrase): # remove dirty characters - commonly put in numbers but not "part of" the number cleanphrase = ''.join(char for char in phrase if char not in ignore_chars) # make . its own word so we can treat it like the other decimal words splitphrase = cleanphrase.replace('.', ' . ').lower() - - words = [ ] + + # remove spaces used as thousand markers here for numbers (e.g. 12 345 678 -> 12345678) + # any irregular spacing will be ignored (e.g. 123 45 678 -> 123 45678) + splitphrase = re.sub(r'([0-9]) (?=([0-9]{3}))', r'\g<1>', splitphrase) + + words = [] # remove dirty words - commonly put in number words but not "part of" the number for word in (word for word in splitphrase.split(' ') if word not in ignore_words): - # separate suffixes (e.g. 150k -> 150 k) - if word[:-1].isdigit() and word[-1] in place_abbrev: - words.append(word[:-1]) - words.append(word[-1]) + # separate suffixes (e.g. 150k -> 150 k or 12million -> 12 million) + words_added = False + for place_name in place_names: + if word.endswith(place_name) and word[:-len(place_name)].isdigit(): + words.append(word[:-len(place_name)]) + words.append(place_name) + words_added = True + break + + if words_added: + continue + # - is confusing, since it can be a separator (sixty-six) or a negative (-10) # fortunately, to be a negative it must be at the start of a word - elif '-' in word: + if '-' in word: i = word.index('-') if i == 0: words.append('-') @@ -96,30 +116,32 @@ def num_generator(phrase): words.append(word[i+1:]) else: words.append(word) - + if len(words) == 0: raise ValueError('No valid words provided') - - countDec = sum( words.count(dec) for dec in dec_names ) - countNeg = sum( words.count(neg) for neg in neg_names ) - + + countDec = sum(words.count(dec) for dec in dec_names) + countNeg = sum(words.count(neg) for neg in neg_names) + # Check if there are any valid number words if len(words) == countDec + countNeg: raise ValueError('No valid number words provided') - + # Check if there are any illegal duplicates if 1 < countDec: - raise ValueError('At most one of the following allowed: {}'.format(dec_names)) - + raise ValueError( + 'At most one of the following allowed: {}'.format(dec_names)) + if 1 < countNeg: - raise ValueError('At most one of the following allowed: {}'.format(countNeg)) - + raise ValueError( + 'At most one of the following allowed: {}'.format(countNeg)) + for place in place_names: # Hundred is a special case, since "one hundred thousand one hundred" is a valid number if place != 'hundred' and 1 < words.count(place): - raise ValueError('Duplicate number word provided: {}'.format(place)) - return 0 - + raise ValueError( + 'Duplicate number word provided: {}'.format(place)) + # Iterate over the words, yielding them consecutively as numbers for word in words: if word in word_to_number: @@ -131,72 +153,78 @@ def num_generator(phrase): try: yield float(word) except: - raise ValueError('Non-number words provided: {}'.format(word)) - return 0 - + raise ValueError( + 'Non-number words provided: {}'.format(word)) + + def word_to_num(phrase): if type(phrase) is not str: - raise ValueError('Type of input is not string! Please enter a valid number word (eg. \'two million twenty three thousand and forty nine\')') - - running_total = [ 0 ] + raise ValueError( + 'Type of input is not string! Please enter a valid number word (eg. \'two million twenty three thousand and forty nine\')') + + running_total = [0] postDecimalCount = 0 sign = 1 - + for num in num_generator(phrase): if num == '.': postDecimalCount = -1 - + elif num == '-': if running_total[0] != 0: raise ValueError('Negating word must be first word') sign = -1 - + elif num in place_names.values(): # Get the next index which is smaller than the current item - index = next((i for i, x in enumerate(running_total) if x < num), -1) - + index = next((i for i, x in enumerate( + running_total) if x < num), -1) + # Sum all the smaller parts # e.g. if we are parsing 'one million four hundred thirty six thousand', we'll have - # [ 1000000, 400, 36 ] when handling 1000; since 400 and 36 are both smaller than + # [ 1000000, 400, 36 ] when handling 1000; since 400 and 36 are both smaller than # 1000 but 1000000 is not, we'll sum the smaller stuff to give [ 1000000, 436 ]. # We'll later multiply the last item by this place name - running_total = running_total[:index] + [ sum(running_total[index:]) ] - + running_total = running_total[:index] + \ + [sum(running_total[index:])] + # Special case if someone starts with a place name, e.g. 'hundred twenty' rather than # 'one hundred twenty' if running_total[-1] == 0: running_total[-1] = 1 - + running_total[-1] *= num - + # Append a new item after this - we've just handled a place name, and need to separate # the remaining content in case we have another place name coming running_total.append(0) postDecimalCount = 0 - + elif len(str(num)) != len(str(running_total[-1])) or postDecimalCount: - # Special case to pre-adjust the decimal value, in case someone puts something like + # Special case to pre-adjust the decimal value, in case someone puts something like # 'point nineteen' if postDecimalCount: postDecimalCount -= len(str(num)) - 1 running_total[-1] += num * 10**postDecimalCount if postDecimalCount: postDecimalCount -= 1 - + else: running_total.append(0) running_total[-1] = num - - if all( num < 10 for num in running_total ): - return sign * sum( num * 10**i for i, num in enumerate(reversed(running_total)) ) + + if all(num < 10 for num in running_total): + return sign * sum(num * 10**i for i, num in enumerate(reversed(running_total))) else: return sign * sum(running_total) - + + def num_word_indices(phrase): indices = [] for i, word in enumerate(phrase.lower().split(' ')): - cleanWord = ''.join(char for char in word if char not in ignore_chars + [ '.', '-' ]) + cleanWord = ''.join( + char for char in word if char not in ignore_chars + ['.', '-']) if cleanWord.isdigit() or cleanWord in word_to_number: indices.append(i) - - return indices \ No newline at end of file + + return indices