diff --git a/general-tests/deptest.py b/general-tests/deptest.py index 343dacc..db1f7a7 100644 --- a/general-tests/deptest.py +++ b/general-tests/deptest.py @@ -10,11 +10,11 @@ assert reinflect("mökkiammeemme", model="talossa") == {"mökkiammeessa"} assert reinflect("esijuosta", model="katselemme") == {'esijuoksemme'} assert reinflect("mökkiammeemme", new_form="+sg+nom") == {'mökkiamme'} -assert reinflect("löhkö", new_form="+pl+ine+ko") == {'löhköissäkö'} +assert reinflect("löhkö", orig_form="+sg+nom", new_form="+pl+ine+ko") == {'löhköissäkö'} assert reinflect("viinissä", model="talot") == {'viinet'} assert reinflect("viinissä", model="talot", orig_form="+sg+ine") == {'viinit'} -assert reinflect("hömppäämme", model="juokset", pos="verb") == {'hömppäät'} -assert reinflect("hömppäämme", model="juokset", pos="noun") == {'hömpät'} +assert reinflect("homppaamme", model="juokset", pos="verb") == {'homppaat'} +assert reinflect("homppaamme", model="talot", pos="noun") == {'hompat'} from pypykko.utils import analyze assert analyze("hätkähtäneet") == [('hätkähtäneet', 'Lexicon', 'hätkähtää', 'verb', '', '', '+past+conneg+pl', 0.0), ('hätkähtäneet', 'Lexicon', 'hätkähtää', 'verb', '', '', '+part_past+pl+nom', 0.0), ('hätkähtäneet', 'Lexicon', 'hätkähtänyt', 'participle', '', ' ← verb:hätkähtää:+part_past', '+pl+nom', 0.0)] diff --git a/general-tests/test_alignment_pypykko.py b/general-tests/test_alignment_pypykko.py index 7da0c7c..c701773 100644 --- a/general-tests/test_alignment_pypykko.py +++ b/general-tests/test_alignment_pypykko.py @@ -4,238 +4,16 @@ isyms = PARSER_FST.split_to_symbols(text) gold = [ - [ - (0, '@0@', 'Lexicon'), - (0, '@0@', '\t'), - (0, 'i', 'i'), - (1, 's', 's'), - (2, 'o', 'o'), - (4, '@0@', '|'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'v'), - (8, 'a', 'a'), - (9, 'a', 's'), - (10, '@0@', '\tnoun\t'), - (10, '@0@', '\t'), - (10, '@0@', '\t'), - (10, '@0@', '+sg'), - (10, 'n', '+gen') - ], - [ - (0, '@0@', 'Lexicon|Pfx'), - (0, 
'@0@', '\t'), - (0, 'i', 'i'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, '@0@', '⁅BOUNDARY⁆'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'v'), - (8, 'a', 'a'), - (9, 'a', 's'), - (10, '@0@', '\tnoun\t'), - (10, '@0@', '\t'), - (10, '@0@', '\t'), - (10, '@0@', '+sg'), - (10, 'n', '+gen') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'i'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (10, '@0@', 't'), - (10, '@0@', 'a'), - (10, '@0@', '\tverb\t'), - (10, '@0@', '\t'), - (10, '@0@', '\t'), - (10, '@0@', '+pres'), - (10, 'n', '+1sg') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'I'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (10, '@0@', 't'), - (10, '@0@', 'a'), - (10, '@0@', '\tverb\t'), - (10, '@0@', '\t'), - (10, '@0@', '\t'), - (10, '@0@', '+pres'), - (10, 'n', '+1sg') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'i'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (9, '@0@', '\tnoun\t'), - (9, '@0@', '\t'), - (9, '@0@', '\t'), - (9, '@0@', '+sg'), - (9, 'a', '+ill') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'i'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (9, 'a', 's'), - (10, '@0@', '\tnoun\t'), - (10, '@0@', '\t'), - (10, '@0@', '\t'), - (10, '@0@', '+sg'), - (10, 'n', '+gen') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'i'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (9, 'a', 'a'), - (10, '@0@', '\tnoun\t'), - (10, '@0@', '\t'), - (10, '@0@', 
'\t'), - (10, '@0@', '+sg'), - (10, 'n', '+gen') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'i'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (9, 'a', 'a'), - (10, 'n', 'n'), - (11, '@0@', '\tnoun\t'), - (11, '@0@', '\t'), - (11, '@0@', '\t'), - (11, '@0@', '+sg'), - (11, '@0@', '+nom') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'I'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (9, '@0@', '\tnoun\t'), - (9, '@0@', '\t'), - (9, '@0@', '\t'), - (9, '@0@', '+sg'), - (9, 'a', '+ill') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'I'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (9, 'a', 's'), - (10, '@0@', '\tnoun\t'), - (10, '@0@', '\t'), - (10, '@0@', '\t'), - (10, '@0@', '+sg'), - (10, 'n', '+gen') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'I'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (9, 'a', 'a'), - (10, '@0@', '\tnoun\t'), - (10, '@0@', '\t'), - (10, '@0@', '\t'), - (10, '@0@', '+sg'), - (10, 'n', '+gen') - ], - [ - (0, '@0@', 'Guesser|Any'), - (0, '@0@', '\t'), - (0, 'i', 'I'), - (1, 's', 's'), - (2, 'o', 'o'), - (3, 'n', 'n'), - (4, 'v', 'v'), - (5, 'a', 'a'), - (6, 'r', 'r'), - (7, 'p', 'p'), - (8, 'a', 'a'), - (9, 'a', 'a'), - (10, 'n', 'n'), - (11, '@0@', '\tnoun\t'), - (11, '@0@', '\t'), - (11, '@0@', '\t'), - (11, '@0@', '+sg'), - (11, '@0@', '+nom') - ] + [(0, '@0@', 'Lexicon'), (0, '@0@', '\t'), (0, 'i', 'i'), (1, 's', 's'), (2, 'o', 'o'), (4, '@0@', '|'), (4, 'v', 'v'), (5, 'a', 'a'), (6, 'r', 'r'), (7, 'p', 'v'), (8, 'a', 'a'), (9, 'a', 's'), (10, '@0@', '\tnoun\t'), (10, '@0@', 
'\t'), (10, '@0@', '\t'), (10, '@0@', '+sg'), (10, 'n', '+gen')], + [(0, '@0@', 'Lexicon|Pfx'), (0, '@0@', '\t'), (0, 'i', 'i'), (1, 's', 's'), (2, 'o', 'o'), (3, 'n', 'n'), (4, '@0@', '⁅BOUNDARY⁆'), (4, 'v', 'v'), (5, 'a', 'a'), (6, 'r', 'r'), (7, 'p', 'v'), (8, 'a', 'a'), (9, 'a', 's'), (10, '@0@', '\tnoun\t'), (10, '@0@', '\t'), (10, '@0@', '\t'), (10, '@0@', '+sg'), (10, 'n', '+gen')], + [(0, '@0@', 'Guesser|Any'), (0, '@0@', '\t'), (0, 'i', 'i'), (1, 's', 's'), (2, 'o', 'o'), (3, 'n', 'n'), (4, 'v', 'v'), (5, 'a', 'a'), (6, 'r', 'r'), (7, 'p', 'p'), (8, 'a', 'a'), (9, '@0@', '\tnoun\t'), (9, '@0@', '\t'), (9, '@0@', '\t'), (9, '@0@', '+sg'), (9, 'a', '+ill')], + [(0, '@0@', 'Guesser|Any'), (0, '@0@', '\t'), (0, 'i', 'i'), (1, 's', 's'), (2, 'o', 'o'), (3, 'n', 'n'), (4, 'v', 'v'), (5, 'a', 'a'), (6, 'r', 'r'), (7, 'p', 'p'), (8, 'a', 'a'), (9, 'a', 'a'), (10, '@0@', '\tnoun\t'), (10, '@0@', '\t'), (10, '@0@', '\t'), (10, '@0@', '+sg'), (10, 'n', '+gen')], + [(0, '@0@', 'Guesser|Any'), (0, '@0@', '\t'), (0, 'i', 'i'), (1, 's', 's'), (2, 'o', 'o'), (3, 'n', 'n'), (4, 'v', 'v'), (5, 'a', 'a'), (6, 'r', 'r'), (7, 'p', 'p'), (8, 'a', 'a'), (9, 'a', 'a'), (10, 'n', 'n'), (11, '@0@', '\tnoun\t'), (11, '@0@', '\t'), (11, '@0@', '\t'), (11, '@0@', '+sg'), (11, '@0@', '+nom')], + [(0, '@0@', 'Guesser|Any'), (0, '@0@', '\t'), (0, 'i', 'I'), (1, 's', 's'), (2, 'o', 'o'), (3, 'n', 'n'), (4, 'v', 'v'), (5, 'a', 'a'), (6, 'r', 'r'), (7, 'p', 'p'), (8, 'a', 'a'), (9, '@0@', '\tnoun\t'), (9, '@0@', '\t'), (9, '@0@', '\t'), (9, '@0@', '+sg'), (9, 'a', '+ill')], + [(0, '@0@', 'Guesser|Any'), (0, '@0@', '\t'), (0, 'i', 'I'), (1, 's', 's'), (2, 'o', 'o'), (3, 'n', 'n'), (4, 'v', 'v'), (5, 'a', 'a'), (6, 'r', 'r'), (7, 'p', 'p'), (8, 'a', 'a'), (9, 'a', 'a'), (10, '@0@', '\tnoun\t'), (10, '@0@', '\t'), (10, '@0@', '\t'), (10, '@0@', '+sg'), (10, 'n', '+gen')], + [(0, '@0@', 'Guesser|Any'), (0, '@0@', '\t'), (0, 'i', 'I'), (1, 's', 's'), (2, 'o', 'o'), (3, 'n', 'n'), (4, 'v', 'v'), 
(5, 'a', 'a'), (6, 'r', 'r'), (7, 'p', 'p'), (8, 'a', 'a'), (9, 'a', 'a'), (10, 'n', 'n'), (11, '@0@', '\tnoun\t'), (11, '@0@', '\t'), (11, '@0@', '\t'), (11, '@0@', '+sg'), (11, '@0@', '+nom')] ] + sys = [] for t, weight in PARSER_FST.lookup_aligned(text): diff --git a/pypykko/README.md b/pypykko/README.md index 15db3a4..4943d7a 100644 --- a/pypykko/README.md +++ b/pypykko/README.md @@ -108,4 +108,4 @@ Eg. when looking at "isonvarpaan", one might want to not only know that it is th PyPykko is licensed under the MIT license like Pykko itself, as it is mostly constituted of Pykko's files with minor modifications. See the LICENSE file for details. Note that kfst (and kfst-rs) have less permissive licenses. -Files from Pykko itself are modified from the version in commit 95f3d51f0e94a1e88ab7c750f2bedcb6b3fd5edd. The compiled transducers are from the same commit. +Files from Pykko itself are modified from the version in commit 9bf1f02a3b03046955a82643e273b6fc3b28174f. The compiled transducers are from the same commit. diff --git a/pypykko/pypykko/aux-abbreviations.tsv b/pypykko/pypykko/aux-abbreviations.tsv index a860b7a..6bc16ad 100644 --- a/pypykko/pypykko/aux-abbreviations.tsv +++ b/pypykko/pypykko/aux-abbreviations.tsv @@ -1,149 +1,133 @@ -- t. 0 conjunction - - - - abbr - -- A.A.A. 0 none - - - - abbr - -- Inc. 0 none - - - - abbr|foreign - -- Joh. 0 none - - - - abbr - -- Ltd. 0 none - - - - abbr|foreign - -- Luuk. 0 none - - - - abbr - -- Mark. 0 none - - - - abbr - -- Matt. 0 none - - - - abbr - -- Moos. 0 none - - - - abbr - -- Mr. 0 none - - - - abbr - -- Mrs. 0 none - - - - abbr - -- Ms. 0 none - - - - abbr - -- P.S. 0 none - - - - abbr - -- R.S.V.P. 0 none - - - - abbr - -- Room. 0 none - - - - abbr - -- Sananl. 0 none - - - - abbr - -- U.S. 0 none - - - - abbr|foreign - -- al. 0 none - - - - abbr|foreign - -- al. 0 none - - - - abbr - -- alk. 0 none - - - - abbr - -- alkup. 0 none - - - - abbr - -- ao. 0 none - - - - abbr - -- arab. 
0 none - - - - abbr - -- as. 0 none - - - - abbr - -- biol. 0 none - - - - abbr - -- d.o.o. 0 none - - - - abbr|foreign - -- e.g. 0 none - - - - abbr|foreign - -- eKr. 0 none - - - - abbr - -- eaa. 0 none - - - - abbr - -- ed. 0 none - - - - abbr - -- em. 0 none - - - - abbr - -- engl. 0 none - - - - abbr - -- ent. 0 none - - - - abbr - -- esim. 0 none - - - - abbr - -- esp. 0 none - - - - abbr - -- etc. 0 none - - - - abbr|foreign - -- evp 0 none - - - - abbr - -- evp. 0 none - - - - abbr - -- harv. 0 none - - - - abbr - -- hepr. 0 none - - - - abbr - -- hist. 0 none - - - - abbr - -- hl. 0 none - - - - abbr|foreign - -- huom. 0 none - - - - abbr - -- ital. 0 none - - - - abbr - -- jKr. 0 none - - - - abbr - -- jaa. 0 none - - - - abbr - -- jap. 0 none - - - - abbr - -- jne. 0 none - - - - abbr - -- k. 0 none - - - - abbr - -- kd 0 none - - - - abbr - -- ke 0 none - - - - abbr - -- kesk 0 none - - - - abbr - -- kesk. 0 none - - - - abbr - -- kft. 0 none - - - - abbr - -- kiin. 0 none - - - - abbr - -- kirj. 0 none - - - - abbr - -- kk 0 none - - - - abbr - -- klo 0 none - - - - abbr - -- ko. 0 none - - - - abbr - -- kok 0 none - - - - abbr - -- kpl 0 none - - - - abbr - -- kreik. 0 none - - - - abbr - -- kreikk. 0 none - - - - abbr - -- ks. 0 none - - - - abbr - -- kts. 0 none - - - - abbr - -- la 0 none - - - - abbr - -- lat. 0 none - - - - abbr - -- lääk. 0 none - - - - abbr - -- m. 0 none - - - - abbr - -- ma 0 none - - - - abbr - -- mat. 0 none - - - - abbr - -- milj. 0 none - - - - abbr - -- ml. 0 none - - - - abbr - -- mm. 0 none - - - - abbr - -- mon. 0 none - - - - abbr - -- mrd 0 none - - - - abbr - -- mrd. 0 none - - - - abbr - -- n. 0 none - - - - abbr - -- n/a 0 none - - - - abbr - -- nk. 0 none - - - - abbr - -- nro 0 none - - - - abbr - -- ns. 0 none - - - - abbr - -- nyk. 0 none - - - - abbr - -- o.s. 0 none - - - - abbr - -- oik. 0 none - - - - abbr - -- os. 0 none - - - - abbr - -- p. 0 none - - - - abbr - -- paal. 
0 none - - - - abbr - -- pe 0 none - - - - abbr - -- pp. 0 none - - - - abbr - -- ps. 0 none - - - - abbr - -- pvm 0 none - - - - abbr - -- päätoim. 0 none - - - - abbr - -- ransk. 0 none - - - - abbr - -- room. 0 none - - - - abbr - -- ruots. 0 none - - - - abbr - -- s. 0 none - - - - abbr - -- saks. 0 none - - - - abbr - -- sd 0 none - - - - abbr - -- sin 0 none - - - - abbr - -- so. 0 none - - - - abbr - -- su 0 none - - - - abbr - -- suom. 0 none - - - - abbr - -- terv. 0 none - - - - abbr - -- ti 0 none - - - - abbr - -- tms. 0 none - - - - abbr - -- to 0 none - - - - abbr - -- toim. 0 none - - - - abbr - +- t. - conjunction - - - - abbr - +- A.A.A. - none - - - - abbr - +- Dr. - none - - - - abbr - +- Joh. - none - - - - abbr - +- Luuk. - none - - - - abbr - +- Mark. - none - - - - abbr - +- Matt. - none - - - - abbr - +- Moos. - none - - - - abbr - +- Mr. - none - - - - abbr - +- Mrs. - none - - - - abbr - +- Ms. - none - - - - abbr - +- Mt. - none - - - - foreign - +- P.S. - none - - - - abbr - +- R.S.V.P. - none - - - - abbr - +- Room. - none - - - - abbr - +- Sananl. - none - - - - abbr - +- St. - none ! - - - foreign - +- al. - none - - - - abbr|foreign - +- alk. - none - - - - abbr - +- alkup. - none - - - - abbr - +- ao. - none - - - - abbr - +- arab. - none - - - - abbr - +- arkkit. - none - - - - abbr - +- as. - none - - - - abbr - +- biol. - none - - - - abbr - +- d.o.o. - none - - - - abbr|foreign - +- e.g. - none - - - - abbr|foreign - +- eKr. - none - - - - abbr - +- eaa. - none - - - - abbr - +- ed. - none - - - - abbr - +- em. - none - - - - abbr - +- engl. - none - - - - abbr - +- ent. - none - - - - abbr - +- esim. - none - - - - abbr - +- esp. - none - - - - abbr - +- etc. - none - - - - abbr|foreign - +- evp. - none - - - - abbr - +- farm. - none - - - - abbr - +- feat. - none - - - - abbr - +- fil. - none - - - - abbr - +- harv. - none - - - - abbr - +- hepr. - none - - - - abbr - +- hist. - none - - - - abbr - +- hl. 
- none - - - - abbr|foreign - +- hum. - none - - - - abbr - +- huom. - none - - - - abbr - +- ital. - none - - - - abbr - +- jKr. - none - - - - abbr - +- jaa. - none - - - - abbr - +- jap. - none - - - - abbr - +- jne. - none - - - - abbr - +- joht. - none - - - - abbr - +- k. - none - - - - abbr - +- kand. - none - - - - abbr - +- kasvatust. - none - - - - abbr - +- kauppat. - none - - - - abbr - +- kesk. - none - - - - abbr - +- kft. - none - - - - abbr - +- kiin. - none - - - - abbr - +- kirj. - none - - - - abbr - +- ko. - none - - - - abbr - +- kreik. - none - - - - abbr - +- kreikk. - none - - - - abbr - +- ks. - none - - - - abbr - +- kts. - none - - - - abbr - +- lat. - none - - - - abbr - +- liikuntat. - none - - - - abbr - +- lis. - none - - - - abbr - +- lyh. - none - - - - abbr - +- lääk. - none - - - - abbr - +- lääket. - none - - - - abbr - +- m. - none - - - - abbr - +- maist. - none - - - - abbr - +- mat. - none - - - - abbr - +- milj. - none - - - - abbr - +- ml. - none - - - - abbr - +- mm. - none - - - - abbr - +- mon. - none - - - - abbr - +- mrd. - none - - - - abbr - +- n. - none - - - - abbr - +- nk. - none - - - - abbr - +- ns. - none - - - - abbr - +- nyk. - none - - - - abbr - +- o.s. - none - - - - abbr - +- oik. - none - - - - abbr - +- os. - none - - - - abbr - +- p. - none - - - - abbr - +- paal. - none - - - - abbr - +- pp. - none - - - - abbr - +- ps. - none - - - - abbr - +- päätoim. - none - - - - abbr - +- ransk. - none - - - - abbr - +- room. - none - - - - abbr - +- ruots. - none - - - - abbr - +- s. - none - - - - abbr - +- saks. - none - - - - abbr - +- san. - none - - - - abbr - +- so. - none - - - - abbr - +- sov. - none - - - - abbr - +- suom. - none - - - - abbr - +- säv. - none - - - - abbr - +- taloust. - none - - - - abbr - +- tekn. - none - - - - abbr - +- teol. - none - - - - abbr - +- terv. - none - - - - abbr - +- tms. - none - - - - abbr - +- toht. - none - - - - abbr - +- toim. - none - - - - abbr - - torj. 
- none - - - - abbr - -- ts. 0 none - - - - abbr - -- v. 0 none - - - - abbr - -- vas 0 none - - - - abbr - -- vas. 0 none - - - - abbr - -- ven. 0 none - - - - abbr - -- vietn. 0 none - - - - abbr - -- vihr. 0 none - - - - abbr - -- vko 0 none - - - - abbr - -- vol. 0 none - - - - abbr - -- vrt. 0 none - - - - abbr - -- vs. 0 none - - - - abbr - -- yks. 0 none - - - - abbr - -- ym. 0 none - - - - abbr - -- yms. 0 none - - - - abbr - -- yo. 0 none - - - - abbr - -- yst. 0 none - - - - abbr - -- yst.terv. 0 none - - - - abbr - -- ADHD 0 noun 18B - - e abbr - -- BKT 0 noun 18B - front|back e abbr - -- CV 0 noun 18B - - e abbr - -- Co. - noun XX - - - abbr - -- DI 0 noun 18B - - i abbr - -- DNA 0 noun 18B - - a abbr - -- FM 0 noun 10B - - - abbr - -- FT 0 noun 18B - - e abbr - -- HuK 0 noun 18B - - o abbr - -- LuK 0 noun 18B - - - abbr - -- TV 0 noun 18B - - e abbr - -- ab 0 noun 18B - - e abbr - -- bkt 0 noun 18B - front|back e abbr - -- btm 0 noun 5 - - - abbr - -- ky 0 noun 18B - - y abbr - -- lng 0 noun 18B - - e abbr - -- oy 0 noun 18B - - y abbr - -- oyj 0 noun 18B - - i abbr - -- pj. 0 noun XX - - - abbr - -- ry 0 noun 18B - - y abbr - -- tj. 0 noun XX - - - abbr - -- tri 0 noun 18 - - - abbr - -- tv 0 noun 18B - - e abbr - +- trad. - none - - - - abbr - +- ts. - none - - - - abbr - +- v. - none - - - - abbr - +- valt. - none - - - - abbr - +- ven. - none - - - - abbr - +- vietn. - none - - - - abbr - +- vihr. - none - - - - abbr - +- vol. - none - - - - abbr - +- vrt. - none - - - - abbr - +- vs. - none - - - - abbr - +- vt. - none - - - - abbr - +- yks. - none - - - - abbr - +- ym. - none - - - - abbr - +- yms. - none - - - - abbr - +- yo. - none - - - - abbr - +- yst. - none - - - - abbr - +- yst.terv. - none - - - - abbr - +- Co. - noun XX - - - abbr|foreign - +- Inc. - noun XX - - - abbr|foreign - +- Ltd. - noun XX - - - abbr|foreign - +- Tl. - noun XX - - - abbr|dated - +- U.S. - noun ! - - - abbr|foreign - +- pj. - noun ! - - - abbr - +- tj. - noun ! 
- - - abbr - diff --git a/pypykko/pypykko/constants.py b/pypykko/pypykko/constants.py index 82f68be..c84e23d 100644 --- a/pypykko/pypykko/constants.py +++ b/pypykko/pypykko/constants.py @@ -19,35 +19,20 @@ 'ȘŅĻŖȚĶ' \ 'ØßÐĐÆŒŁĞŐŊÞ' -ALPHA_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖÜŠŽČĆ' +ALPHA_LOWER_BASIC = 'abcdefghijklmnopqrstuvwxyzåäö' ALPHA_LOWER = 'abcdefghijklmnopqrstuvwxyzåäöüšžčćı' +ALPHA_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖÜŠŽČĆ' PARSER_FST_PATH = os.path.join(scripts_path, 'fi-parser.kfst') GENERATOR_FST_PATH = os.path.join(scripts_path, 'fi-generator.kfst') LINE_BREAK = '@_LINEBREAK_@' SENT_BREAK = '@_SENTBREAK_@' -ZERO = '@_zero_@' +ZERO = '@_ZERO_@' TAB = '^TAB' OPENING_TAGS = [f'<{tag}>' for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']] -STYLE_TAGS = [ - '+arch', - '+child', - '+coll', - '+dated', - '+dial', - '+foreign', - '+jocul', - '+nstd', - '+poet', - '+rare', - '+slang', - '+vulg', -] -STYLE_TAG_REGEX = '|'.join(tag[1:] for tag in STYLE_TAGS) - POS_TAGS = [ 'noun', 'noun-pl', @@ -70,21 +55,37 @@ ] CLITICS = { -"+han", -"+ka", -"+kaan", -"+kin", -"+ko", -"+pa", -"+poss1pl", -"+poss1sg", -"+poss2pl", -"+poss2sg", -"+poss3", -"+poss3", -"+s", + "+han", + "+ka", + "+kaan", + "+kin", + "+ko", + "+pa", + "+poss1pl", + "+poss1sg", + "+poss2pl", + "+poss2sg", + "+poss3", + "+poss3", + "+s", } +STYLE_TAGS = [ + '+arch', + '+child', + '+coll', + '+dated', + '+dial', + '+foreign', + '+jocul', + '+nstd', + '+poet', + '+rare', + '+slang', + '+vulg', +] +STYLE_TAG_REGEX = '|'.join(tag[1:] for tag in STYLE_TAGS) + FIELDS = [ '', # 1. source '%s', # 2. 
lemma diff --git a/pypykko/pypykko/fi-generator.kfst b/pypykko/pypykko/fi-generator.kfst index d60495e..b41a53c 100644 Binary files a/pypykko/pypykko/fi-generator.kfst and b/pypykko/pypykko/fi-generator.kfst differ diff --git a/pypykko/pypykko/fi-parser.kfst b/pypykko/pypykko/fi-parser.kfst index 394ef37..2f3aa78 100644 Binary files a/pypykko/pypykko/fi-parser.kfst and b/pypykko/pypykko/fi-parser.kfst differ diff --git a/pypykko/pypykko/file_tools.py b/pypykko/pypykko/file_tools.py index 78c26d5..85bccce 100644 --- a/pypykko/pypykko/file_tools.py +++ b/pypykko/pypykko/file_tools.py @@ -12,20 +12,12 @@ def get_filepath(filename, directory): def read_tsv(filename, directory=''): - filename = get_filepath(filename, directory) - - table = [] - with open(filename, 'r') as file: - for line in file: - line = line.strip('\n') - if not line: - continue - if line.startswith('#'): - continue - row = ['' if val == '-' else val for val in line.split('\t')] - table.append(row) - return table + for line in read_list(filename): + if line.startswith('#'): + continue + row = ['' if val == '-' else val for val in line.split('\t')] + yield row def read_list_tsv(filename): @@ -52,17 +44,22 @@ def save_txt(filename, text: str, directory=''): def read_list(filename, directory=''): filename = get_filepath(filename, directory) - return [s for s in read_txt(filename).splitlines() if s] + with open(filename, 'r') as file: + for line in file: + line = line.strip('\n\r') + if not line: + continue + yield line def save_list(filename, items: list, sort=True, directory=''): filename = get_filepath(filename, directory) items = sorted(items) if sort else items - return save_txt(filename, '\n'.join(items)) + return save_txt(filename, ''.join(f'{item}\n' for item in items)) def load_json(filename, directory=''): filename = get_filepath(filename, directory) with open(filename, 'r') as file: data = json.load(file) - return data + return data \ No newline at end of file diff --git 
a/pypykko/pypykko/generate.py b/pypykko/pypykko/generate.py index 6f9a656..ca5ea1a 100644 --- a/pypykko/pypykko/generate.py +++ b/pypykko/pypykko/generate.py @@ -1,39 +1,163 @@ -from .constants import GENERATOR_FST_PATH, TAB -from .utils import add_compound_separators, inf +import os +from .file_tools import read_list import kfst +from .constants import GENERATOR_FST_PATH, TAB +from .scriptutils import is_valid_pos, is_uninflectable +from .utils import add_compound_separators, inf, pos_tag + +CURR = os.path.dirname(__file__) +POS_FST_SOURCES = { + 'noun-pl': ['Lexicon', 'Lexicon|Pfx', 'Lexicon|Hyp'], + 'noun': ['Lexicon', 'Lexicon|Pfx', 'Lexicon|Hyp'], + 'proper-pl': ['Lexicon', 'Lexicon|Pfx', 'Lexicon|Hyp'], + 'proper': ['Lexicon', 'Lexicon|Pfx', 'Lexicon|Hyp'], + 'adjective': ['Lexicon', 'Lexicon|Pfx', 'Lexicon|Hyp'], + 'pronoun': ['Lexicon', 'Lexicon|Hyp'], + 'pronoun-pl': ['Lexicon', 'Lexicon|Hyp'], + 'verb': ['Lexicon', 'Lexicon|Pfx'], + 'participle': ['Lexicon', 'Lexicon|Pfx'], + 'numeral': ['Lexicon|Num', 'Lexicon', 'Guesser|Any'], # (!) + 'ordinal': ['Lexicon|Num', 'Lexicon', 'Guesser|Any'], # (!) 
+ 'adverb': ['Lexicon'], + 'adposition': ['Lexicon'], + 'interjection': ['Lexicon'], + 'conjunction': ['Lexicon'], + 'conjunction+verb': ['Lexicon'], + 'adverb+verb': ['Lexicon'], + 'none': ['Lexicon'], +} + +HOMONYMOUS = { + ("ahtaus", "noun"), + ("ale", "noun"), + ("appi", "noun"), + ("g", "noun"), + ("haiku", "noun"), + ("halata", "verb"), + ("hepo", "noun"), + ("isota", "verb"), + ("joka", "pronoun"), + ("karvaus", "noun"), + ("keritä", "verb"), + ("koto", "noun"), + ("kuori", "noun"), + ("kuti", "noun"), + ("l", "noun"), + ("lahti", "noun"), + ("laki", "noun"), + ("lento", "noun"), + ("live", "noun"), + ("m", "noun"), + ("merirosvous", "noun"), + ("mutu", "noun"), + ("palvi", "noun"), + ("parka", "noun"), + ("peitsi", "noun"), + ("pokata", "verb"), + ("puola", "noun"), + ("raakata", "verb"), + ("raita", "noun"), + ("raueta", "verb"), + ("ripsi", "noun"), + ("riuku", "noun"), + ("rosvous", "noun"), + ("s", "noun"), + ("saksi", "noun"), + ("sietä", "verb"), + ("siivous", "noun"), + ("sini", "noun"), + ("soppi", "noun"), + ("syli", "noun"), + ("säkä", "noun"), + ("tavata", "verb"), + ("tutti", "noun"), + ("tyvetä", "verb"), + ("vakaus", "noun"), + ("veto", "noun"), + ("viini", "noun"), + ("vika", "noun"), + ("vuori", "noun"), +} generator_fst = kfst.FST.from_kfst_file(GENERATOR_FST_PATH) +POS_MORPHTAG_PATTERNS = { + pos: list(read_list(os.path.join(CURR, 'patterns', f'pos-{pos}-patterns.txt'))) or [''] + for pos in POS_FST_SOURCES +} + +def generate_inflection_paradigm(word: str, pos: str, homonym: str = ''): -def generate_wordform(word: str, pos: str, morphtags: str, homonym: str = '', source='Lexicon'): + """ + Return a mapping of morphological tags to worforms. 
+ """ - words = add_compound_separators(word, pos=pos, normalize_separators=False) - if not words: + if is_uninflectable(word): + return set() + if not is_valid_pos(pos): return set() - for word in words: - input_fields = source, word, f'^{pos}', str(homonym), '', morphtags - input_string = TAB.join(input_fields) + inflections = {} + for source in POS_FST_SOURCES[pos]: + for morphtags in POS_MORPHTAG_PATTERNS[pos]: + forms = generate_wordform(word, pos, morphtags, homonym, source) + if forms: + inflections[morphtags] = list(forms) + if inflections: + break + return inflections + +def generate_forms(word: str, pos: str | None = None, homonym: str = ''): + + """ + Return default set of unannotated standard inflected forms for given word (lemma). + """ + + if is_uninflectable(word): + return set() + if not is_valid_pos(pos): + return set() + + # Return all valid interpretations if POS tag has not been specified + if not pos: + return {form for pos in pos_tag(word) for form in generate_forms(word, pos, homonym)} + + for source in POS_FST_SOURCES[pos]: forms = set() + for morphtags in POS_MORPHTAG_PATTERNS[pos]: + forms.update(generate_wordform(word, pos, morphtags, homonym, source)) + if forms: + return forms + return set() + + +def generate_wordform(word: str, pos: str, morphtags: str, homonym: str = '', source: str ='Lexicon'): + + """ + Generate set of valid inflected form specified by the morphological tags for the given word (lemma). + """ + + if is_uninflectable(word): + return set() + if not is_valid_pos(pos): + return set() + + # TODO: Make this work for other sources as well? 
+ if not homonym and (word, pos) in HOMONYMOUS and source == 'Lexicon': + forms1 = generate_wordform(word, pos, morphtags, '1', source) + forms2 = generate_wordform(word, pos, morphtags, '2', source) + return forms1 | forms2 + + forms = set() + for word in add_compound_separators(word, pos=pos, normalize_separators=False): + input_fields = source, word, f'^{pos}', str(homonym), '', morphtags + input_string = TAB.join(input_fields) best = inf for form, weight in generator_fst.lookup(input_string): if weight > best: break forms.add(form) best = weight - if forms: - return forms - return set() - - -if __name__ == '__main__': - print(generate_wordform('suuri', 'adjective', '+sg+gen')) - print(generate_wordform('kissakoira', 'noun', '+pl+par', source='Lexicon|Pfx')) - print(generate_wordform('-rakenteinen', 'adjective', '+sg+ine', source='Lexicon|Hyp')) # FIXME! - print(generate_wordform('-valkoinen', 'adjective', '+sg+ine', source='Lexicon|Hyp')) - print(generate_wordform('a-rakenteinen', 'adjective', '+sg+ine', source='Lexicon|Hyp')) - print(generate_wordform('a-valkoinen', 'adjective', '+sg+ine', source='Lexicon|Hyp')) - print(generate_wordform('16', 'numeral', '+sg+ine', source='Lexicon')) - print(generate_wordform('16:s', 'ordinal', '+sg+ine', source='Lexicon')) \ No newline at end of file + return forms diff --git a/pypykko/pypykko/normalize.py b/pypykko/pypykko/normalize.py index 4ab4cb7..6fe5725 100644 --- a/pypykko/pypykko/normalize.py +++ b/pypykko/pypykko/normalize.py @@ -6,7 +6,7 @@ inf = float('inf') indices = defaultdict(float) -LEADING_PUNCTUATION = set('-–—"”„’([') +LEADING_PUNCTUATION = set('-–—"”“‟„’([') def is_lowercase(w): @@ -60,8 +60,8 @@ def process_analyses(analyses, sentence_initial=None): # if '+ins' in tags or '+com' in tags: # weight += 0.5 - # if not sentence_initial: - # lemma = fix_lettercase(wform, lemma) + if not sentence_initial and is_uppercase(wform) and is_lowercase(lemma) and pos.startswith('noun'): + lemma = fix_lettercase(wform, 
lemma) index = indices[lemma, pos] or inf pair = weight, index @@ -87,16 +87,23 @@ def process_analyses(analyses, sentence_initial=None): def main(): analyses = [] - analysis = '', '', '', '', '' + analysis = '', '', '', '', '', '', '', inf sentence_initial = True + prev_wform = '' for line in sys.stdin: line = line.strip('\n\r') if not line and analyses: + + wform, _, _, _, _, _, _, _ = analysis + if prev_wform == ':' and wform in '"”„': + sentence_initial = True + process_analyses(analyses, sentence_initial) - prev_wform = analysis[0] + + prev_wform = wform sentence_initial = ( sentence_initial if prev_wform in LEADING_PUNCTUATION else prev_wform in [SENT_BREAK] + OPENING_TAGS diff --git a/pypykko/pypykko/patterns/pos-adjective-patterns.txt b/pypykko/pypykko/patterns/pos-adjective-patterns.txt new file mode 100644 index 0000000..aadf6e1 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-adjective-patterns.txt @@ -0,0 +1,68 @@ ++sg+nom ++sg+gen ++sg+acc ++sg+par ++sg+ill ++sg+ine ++sg+ela ++sg+all ++sg+ade ++sg+abl ++sg+ess ++sg+tra ++sg+com ++sg+ins ++sg+abe ++pl+nom ++pl+gen ++pl+acc ++pl+par ++pl+ill ++pl+ine ++pl+ela ++pl+all ++pl+ade ++pl+abl ++pl+ess ++pl+tra ++pl+com ++pl+ins ++pl+abe ++sg+nom+rare ++sg+gen+rare ++sg+acc+rare ++sg+par+rare ++sg+ill+rare ++sg+ine+rare ++sg+ela+rare ++sg+all+rare ++sg+ade+rare ++sg+abl+rare ++sg+ess+rare ++sg+tra+rare ++sg+com+rare ++sg+ins+rare ++sg+abe+rare ++pl+nom+rare ++pl+gen+rare ++pl+acc+rare ++pl+par+rare ++pl+ill+rare ++pl+ine+rare ++pl+ela+rare ++pl+all+rare ++pl+ade+rare ++pl+abl+rare ++pl+ess+rare ++pl+tra+rare ++pl+com+rare ++pl+ins+rare ++pl+abe+rare ++comparative+sg+nom ++superlative+sg+nom ++comparative+rare+sg+nom ++superlative+rare+sg+nom ++comparative+pl+nom ++superlative+pl+nom ++comparative+rare+pl+nom ++superlative+rare+pl+nom diff --git a/pypykko/pypykko/patterns/pos-adposition-patterns.txt b/pypykko/pypykko/patterns/pos-adposition-patterns.txt new file mode 100644 index 0000000..6f08916 --- /dev/null 
+++ b/pypykko/pypykko/patterns/pos-adposition-patterns.txt @@ -0,0 +1,7 @@ ++poss1sg ++poss2sg ++poss3 ++poss1pl ++poss2pl ++comparative ++superlative diff --git a/pypykko/pypykko/patterns/pos-adverb+verb-patterns.txt b/pypykko/pypykko/patterns/pos-adverb+verb-patterns.txt new file mode 100644 index 0000000..d82a230 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-adverb+verb-patterns.txt @@ -0,0 +1,6 @@ ++1sg ++2sg ++3sg ++1pl ++2pl ++3pl diff --git a/pypykko/pypykko/patterns/pos-adverb-patterns.txt b/pypykko/pypykko/patterns/pos-adverb-patterns.txt new file mode 100644 index 0000000..6f08916 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-adverb-patterns.txt @@ -0,0 +1,7 @@ ++poss1sg ++poss2sg ++poss3 ++poss1pl ++poss2pl ++comparative ++superlative diff --git a/pypykko/pypykko/patterns/pos-conjunction+verb-patterns.txt b/pypykko/pypykko/patterns/pos-conjunction+verb-patterns.txt new file mode 100644 index 0000000..d82a230 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-conjunction+verb-patterns.txt @@ -0,0 +1,6 @@ ++1sg ++2sg ++3sg ++1pl ++2pl ++3pl diff --git a/pypykko/pypykko/patterns/pos-conjunction-patterns.txt b/pypykko/pypykko/patterns/pos-conjunction-patterns.txt new file mode 100644 index 0000000..e69de29 diff --git a/pypykko/pypykko/patterns/pos-interjection-patterns.txt b/pypykko/pypykko/patterns/pos-interjection-patterns.txt new file mode 100644 index 0000000..e69de29 diff --git a/pypykko/pypykko/patterns/pos-none-patterns.txt b/pypykko/pypykko/patterns/pos-none-patterns.txt new file mode 100644 index 0000000..e69de29 diff --git a/pypykko/pypykko/patterns/pos-noun-patterns.txt b/pypykko/pypykko/patterns/pos-noun-patterns.txt new file mode 100644 index 0000000..2b4fd97 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-noun-patterns.txt @@ -0,0 +1,90 @@ ++nom ++gen ++acc ++par ++ill ++ine ++ela ++all ++ade ++abl ++ess ++tra ++com ++ins ++abe ++sg+nom ++sg+gen ++sg+acc ++sg+par ++sg+ill ++sg+ine ++sg+ela ++sg+all ++sg+ade ++sg+abl ++sg+ess ++sg+tra 
++sg+com ++sg+ins ++sg+abe ++pl+nom ++pl+gen ++pl+acc ++pl+par ++pl+ill ++pl+ine ++pl+ela ++pl+all ++pl+ade ++pl+abl ++pl+ess ++pl+tra ++pl+com ++pl+ins ++pl+abe ++nom+rare ++gen+rare ++acc+rare ++par+rare ++ill+rare ++ine+rare ++ela+rare ++all+rare ++ade+rare ++abl+rare ++ess+rare ++tra+rare ++com+rare ++ins+rare ++abe+rare ++sg+nom+rare ++sg+gen+rare ++sg+acc+rare ++sg+par+rare ++sg+ill+rare ++sg+ine+rare ++sg+ela+rare ++sg+all+rare ++sg+ade+rare ++sg+abl+rare ++sg+ess+rare ++sg+tra+rare ++sg+com+rare ++sg+ins+rare ++sg+abe+rare ++pl+nom+rare ++pl+gen+rare ++pl+acc+rare ++pl+par+rare ++pl+ill+rare ++pl+ine+rare ++pl+ela+rare ++pl+all+rare ++pl+ade+rare ++pl+abl+rare ++pl+ess+rare ++pl+tra+rare ++pl+com+rare ++pl+ins+rare ++pl+abe+rare diff --git a/pypykko/pypykko/patterns/pos-noun-pl-patterns.txt b/pypykko/pypykko/patterns/pos-noun-pl-patterns.txt new file mode 100644 index 0000000..8b0f14d --- /dev/null +++ b/pypykko/pypykko/patterns/pos-noun-pl-patterns.txt @@ -0,0 +1,30 @@ ++nom ++gen ++acc ++par ++ill ++ine ++ela ++all ++ade ++abl ++ess ++tra ++com ++ins ++abe ++nom+rare ++gen+rare ++acc+rare ++par+rare ++ill+rare ++ine+rare ++ela+rare ++all+rare ++ade+rare ++abl+rare ++ess+rare ++tra+rare ++com+rare ++ins+rare ++abe+rare diff --git a/pypykko/pypykko/patterns/pos-numeral-patterns.txt b/pypykko/pypykko/patterns/pos-numeral-patterns.txt new file mode 100644 index 0000000..be26141 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-numeral-patterns.txt @@ -0,0 +1,60 @@ ++sg+nom ++sg+gen ++sg+acc ++sg+par ++sg+ill ++sg+ine ++sg+ela ++sg+all ++sg+ade ++sg+abl ++sg+ess ++sg+tra ++sg+com ++sg+ins ++sg+abe ++pl+nom ++pl+gen ++pl+acc ++pl+par ++pl+ill ++pl+ine ++pl+ela ++pl+all ++pl+ade ++pl+abl ++pl+ess ++pl+tra ++pl+com ++pl+ins ++pl+abe ++sg+nom+rare ++sg+gen+rare ++sg+acc+rare ++sg+par+rare ++sg+ill+rare ++sg+ine+rare ++sg+ela+rare ++sg+all+rare ++sg+ade+rare ++sg+abl+rare ++sg+ess+rare ++sg+tra+rare ++sg+com+rare ++sg+ins+rare ++sg+abe+rare ++pl+nom+rare 
++pl+gen+rare ++pl+acc+rare ++pl+par+rare ++pl+ill+rare ++pl+ine+rare ++pl+ela+rare ++pl+all+rare ++pl+ade+rare ++pl+abl+rare ++pl+ess+rare ++pl+tra+rare ++pl+com+rare ++pl+ins+rare ++pl+abe+rare diff --git a/pypykko/pypykko/patterns/pos-ordinal-patterns.txt b/pypykko/pypykko/patterns/pos-ordinal-patterns.txt new file mode 100644 index 0000000..2b4fd97 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-ordinal-patterns.txt @@ -0,0 +1,90 @@ ++nom ++gen ++acc ++par ++ill ++ine ++ela ++all ++ade ++abl ++ess ++tra ++com ++ins ++abe ++sg+nom ++sg+gen ++sg+acc ++sg+par ++sg+ill ++sg+ine ++sg+ela ++sg+all ++sg+ade ++sg+abl ++sg+ess ++sg+tra ++sg+com ++sg+ins ++sg+abe ++pl+nom ++pl+gen ++pl+acc ++pl+par ++pl+ill ++pl+ine ++pl+ela ++pl+all ++pl+ade ++pl+abl ++pl+ess ++pl+tra ++pl+com ++pl+ins ++pl+abe ++nom+rare ++gen+rare ++acc+rare ++par+rare ++ill+rare ++ine+rare ++ela+rare ++all+rare ++ade+rare ++abl+rare ++ess+rare ++tra+rare ++com+rare ++ins+rare ++abe+rare ++sg+nom+rare ++sg+gen+rare ++sg+acc+rare ++sg+par+rare ++sg+ill+rare ++sg+ine+rare ++sg+ela+rare ++sg+all+rare ++sg+ade+rare ++sg+abl+rare ++sg+ess+rare ++sg+tra+rare ++sg+com+rare ++sg+ins+rare ++sg+abe+rare ++pl+nom+rare ++pl+gen+rare ++pl+acc+rare ++pl+par+rare ++pl+ill+rare ++pl+ine+rare ++pl+ela+rare ++pl+all+rare ++pl+ade+rare ++pl+abl+rare ++pl+ess+rare ++pl+tra+rare ++pl+com+rare ++pl+ins+rare ++pl+abe+rare diff --git a/pypykko/pypykko/patterns/pos-participle-patterns.txt b/pypykko/pypykko/patterns/pos-participle-patterns.txt new file mode 100644 index 0000000..2b4fd97 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-participle-patterns.txt @@ -0,0 +1,90 @@ ++nom ++gen ++acc ++par ++ill ++ine ++ela ++all ++ade ++abl ++ess ++tra ++com ++ins ++abe ++sg+nom ++sg+gen ++sg+acc ++sg+par ++sg+ill ++sg+ine ++sg+ela ++sg+all ++sg+ade ++sg+abl ++sg+ess ++sg+tra ++sg+com ++sg+ins ++sg+abe ++pl+nom ++pl+gen ++pl+acc ++pl+par ++pl+ill ++pl+ine ++pl+ela ++pl+all ++pl+ade ++pl+abl ++pl+ess ++pl+tra ++pl+com ++pl+ins 
++pl+abe ++nom+rare ++gen+rare ++acc+rare ++par+rare ++ill+rare ++ine+rare ++ela+rare ++all+rare ++ade+rare ++abl+rare ++ess+rare ++tra+rare ++com+rare ++ins+rare ++abe+rare ++sg+nom+rare ++sg+gen+rare ++sg+acc+rare ++sg+par+rare ++sg+ill+rare ++sg+ine+rare ++sg+ela+rare ++sg+all+rare ++sg+ade+rare ++sg+abl+rare ++sg+ess+rare ++sg+tra+rare ++sg+com+rare ++sg+ins+rare ++sg+abe+rare ++pl+nom+rare ++pl+gen+rare ++pl+acc+rare ++pl+par+rare ++pl+ill+rare ++pl+ine+rare ++pl+ela+rare ++pl+all+rare ++pl+ade+rare ++pl+abl+rare ++pl+ess+rare ++pl+tra+rare ++pl+com+rare ++pl+ins+rare ++pl+abe+rare diff --git a/pypykko/pypykko/patterns/pos-pronoun-patterns.txt b/pypykko/pypykko/patterns/pos-pronoun-patterns.txt new file mode 100644 index 0000000..9e30ca6 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-pronoun-patterns.txt @@ -0,0 +1,98 @@ ++nom ++gen ++acc ++par ++ill ++ine ++ela ++all ++ade ++abl ++ess ++tra ++com ++ins ++abe ++sg+nom ++sg+gen ++sg+acc ++sg+par ++sg+ill ++sg+ine ++sg+ela ++sg+all ++sg+ade ++sg+abl ++sg+ess ++sg+tra ++sg+com ++sg+ins ++sg+abe ++pl+nom ++pl+gen ++pl+acc ++pl+par ++pl+ill ++pl+ine ++pl+ela ++pl+all ++pl+ade ++pl+abl ++pl+ess ++pl+tra ++pl+com ++pl+ins ++pl+abe ++nom+rare ++gen+rare ++acc+rare ++par+rare ++ill+rare ++ine+rare ++ela+rare ++all+rare ++ade+rare ++abl+rare ++ess+rare ++tra+rare ++com+rare ++ins+rare ++abe+rare ++sg+nom+rare ++sg+gen+rare ++sg+acc+rare ++sg+par+rare ++sg+ill+rare ++sg+ine+rare ++sg+ela+rare ++sg+all+rare ++sg+ade+rare ++sg+abl+rare ++sg+ess+rare ++sg+tra+rare ++sg+com+rare ++sg+ins+rare ++sg+abe+rare ++pl+nom+rare ++pl+gen+rare ++pl+acc+rare ++pl+par+rare ++pl+ill+rare ++pl+ine+rare ++pl+ela+rare ++pl+all+rare ++pl+ade+rare ++pl+abl+rare ++pl+ess+rare ++pl+tra+rare ++pl+com+rare ++pl+ins+rare ++pl+abe+rare ++comparative+sg+nom ++superlative+sg+nom ++comparative+rare+sg+nom ++superlative+rare+sg+nom ++comparative+pl+nom ++superlative+pl+nom ++comparative+rare+pl+nom ++superlative+rare+pl+nom diff --git 
a/pypykko/pypykko/patterns/pos-pronoun-pl-patterns.txt b/pypykko/pypykko/patterns/pos-pronoun-pl-patterns.txt new file mode 100644 index 0000000..8b0f14d --- /dev/null +++ b/pypykko/pypykko/patterns/pos-pronoun-pl-patterns.txt @@ -0,0 +1,30 @@ ++nom ++gen ++acc ++par ++ill ++ine ++ela ++all ++ade ++abl ++ess ++tra ++com ++ins ++abe ++nom+rare ++gen+rare ++acc+rare ++par+rare ++ill+rare ++ine+rare ++ela+rare ++all+rare ++ade+rare ++abl+rare ++ess+rare ++tra+rare ++com+rare ++ins+rare ++abe+rare diff --git a/pypykko/pypykko/patterns/pos-proper-patterns.txt b/pypykko/pypykko/patterns/pos-proper-patterns.txt new file mode 100644 index 0000000..2b4fd97 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-proper-patterns.txt @@ -0,0 +1,90 @@ ++nom ++gen ++acc ++par ++ill ++ine ++ela ++all ++ade ++abl ++ess ++tra ++com ++ins ++abe ++sg+nom ++sg+gen ++sg+acc ++sg+par ++sg+ill ++sg+ine ++sg+ela ++sg+all ++sg+ade ++sg+abl ++sg+ess ++sg+tra ++sg+com ++sg+ins ++sg+abe ++pl+nom ++pl+gen ++pl+acc ++pl+par ++pl+ill ++pl+ine ++pl+ela ++pl+all ++pl+ade ++pl+abl ++pl+ess ++pl+tra ++pl+com ++pl+ins ++pl+abe ++nom+rare ++gen+rare ++acc+rare ++par+rare ++ill+rare ++ine+rare ++ela+rare ++all+rare ++ade+rare ++abl+rare ++ess+rare ++tra+rare ++com+rare ++ins+rare ++abe+rare ++sg+nom+rare ++sg+gen+rare ++sg+acc+rare ++sg+par+rare ++sg+ill+rare ++sg+ine+rare ++sg+ela+rare ++sg+all+rare ++sg+ade+rare ++sg+abl+rare ++sg+ess+rare ++sg+tra+rare ++sg+com+rare ++sg+ins+rare ++sg+abe+rare ++pl+nom+rare ++pl+gen+rare ++pl+acc+rare ++pl+par+rare ++pl+ill+rare ++pl+ine+rare ++pl+ela+rare ++pl+all+rare ++pl+ade+rare ++pl+abl+rare ++pl+ess+rare ++pl+tra+rare ++pl+com+rare ++pl+ins+rare ++pl+abe+rare diff --git a/pypykko/pypykko/patterns/pos-proper-pl-patterns.txt b/pypykko/pypykko/patterns/pos-proper-pl-patterns.txt new file mode 100644 index 0000000..8b0f14d --- /dev/null +++ b/pypykko/pypykko/patterns/pos-proper-pl-patterns.txt @@ -0,0 +1,30 @@ ++nom ++gen ++acc ++par ++ill ++ine ++ela ++all ++ade ++abl 
++ess ++tra ++com ++ins ++abe ++nom+rare ++gen+rare ++acc+rare ++par+rare ++ill+rare ++ine+rare ++ela+rare ++all+rare ++ade+rare ++abl+rare ++ess+rare ++tra+rare ++com+rare ++ins+rare ++abe+rare diff --git a/pypykko/pypykko/patterns/pos-verb-patterns.txt b/pypykko/pypykko/patterns/pos-verb-patterns.txt new file mode 100644 index 0000000..94230a7 --- /dev/null +++ b/pypykko/pypykko/patterns/pos-verb-patterns.txt @@ -0,0 +1,96 @@ ++pres+1sg ++pres+1sg+rare ++pres+2sg ++pres+2sg+rare ++pres+3sg ++pres+3sg+rare ++pres+1pl ++pres+1pl+rare ++pres+2pl ++pres+2pl+rare ++pres+3pl ++pres+3pl+rare ++pres+conneg ++pres+conneg+rare ++past+1sg ++past+1sg+rare ++past+2sg ++past+2sg+rare ++past+3sg ++past+3sg+rare ++past+1pl ++past+1pl+rare ++past+2pl ++past+2pl+rare ++past+3pl ++past+3pl+rare ++past+conneg ++past+conneg+sg ++past+conneg+pl ++imper+1sg ++imper+2sg ++imper+2sg+rare ++imper+3sg ++imper+1pl ++imper+2pl ++imper+3pl ++imper+2sg+conneg ++imper+2sg+conneg+rare ++imper+3sg+conneg ++imper+pl+conneg ++cond+1sg ++cond+1sg+rare ++cond+2sg ++cond+2sg+rare ++cond+3sg ++cond+3sg+rare ++cond+1pl ++cond+1pl+rare ++cond+2pl ++cond+2pl+rare ++cond+3pl ++cond+3pl+rare ++cond+conneg ++cond+conneg+rare ++poten+1sg ++poten+2sg ++poten+3sg ++poten+1pl ++poten+2pl ++poten+3pl ++poten+conneg ++part_ma+sg+nom ++part_maton+sg+nom ++part_pres+sg+nom ++part_pres+pl+nom ++part_past+sg+nom ++part_past+pl+nom ++inf1 ++inf1+tra+poss3 ++inf2+ine ++inf2+ins ++inf3+ill ++inf3+ill+rare ++inf3+ine ++inf3+ine+rare ++inf3+ela ++inf3+ela+rare ++inf3+ade ++inf3+ade+rare ++inf3+abe ++inf3+abe+rare ++pass+pres ++pass+pres+conneg ++pass+past ++pass+past+conneg ++pass+imper ++pass+imper+conneg ++pass+cond ++pass+cond+conneg ++pass+poten ++pass+poten+conneg ++pass+part_pres+sg+nom ++pass+part_pres+pl+nom ++pass+part_past+sg+nom ++pass+part_past+pl+nom ++pass+inf2+ine diff --git a/pypykko/pypykko/scriptutils.py b/pypykko/pypykko/scriptutils.py index 07c5186..4539277 100644 --- a/pypykko/pypykko/scriptutils.py 
+++ b/pypykko/pypykko/scriptutils.py @@ -4,6 +4,8 @@ from .file_tools import load_json from .constants import POS_TAGS from collections import defaultdict +from .utils import syllabify +C = '[bcdfghjklmnpqrstvwxz]' try: ADVERB_INFLECTIONS = load_json(filename='adverbs.json', directory='scripts/inflection') @@ -26,12 +28,14 @@ 'kuinka', ] -def validate_pos(pos): + +def is_valid_pos(pos): if pos and pos not in POS_TAGS: print(sys.stderr.write(f'Warning! Unknown POS tag "{pos}"\n')) return False return True + def get_wordform(pairs): return ''.join(c for _, c in pairs if c != '0') @@ -86,21 +90,85 @@ def determine_separator(w1, w2, default='0', strip_zeros=True): return default -def determine_wordform_harmony(wordform, default=None): - if default: - return default.upper() +def get_parts(lemma): + return re.findall(r'[^-|% ]+[-|% ]?', lemma) or [lemma] + + +def get_base_lemma(lemma): + return get_parts(lemma)[-1] + + +def count_syllables(lemma): + syllabified = syllabify(lemma, compound=False) + return len(syllabified.split('·')) + + +def determine_lemma_vowel_harmony(lemma, kotus_class=None): + + lemma = get_parts(lemma).pop() + + # "onomatopoeettinen" + if re.fullmatch('.*(poeettinen)', lemma): + return 'back' + + # "prototyyppi", "prototyyppinen", "geotekninen", "bioteknisesti" + if re.fullmatch('.*(' + 'depressiivi|elementti|elementtisesti|kineettinen|kineettisesti|kliininen|kliinisesti|oeettinen|oeettisesti|semiitti|semiittinen|semitismi|semitisti|semitistinen|semitistisesti|sentrinen|sentrisesti|sentrismi|synteesi|synteettinen|synteettisesti|tekninen|teknisesti|tyyppi|tyyppinen|tyyppisesti|syklinen|' + 'syklisesti|psyykkinen|psyykkisesti|fyysinen|fyysisesti)', lemma): + return 'front' + + # "makromolekyyli", "psykoanalyyttinen" + if re.fullmatch('.*(aldehydi|analyysi|analyyttinen|analyyttisesti|molekyyli|molekyylinen)', lemma): + return 'front|back' + + # "porfyyri", "polyyppi", "dialyysi", "porfyriini", "molybdeeni" + if
re.fullmatch(f'.*[aou].*(y{C}{C}?i|y{C}{C}?inen|y{C}{C}?isesti|y{C}{C}?ismi|y{C}{C}?isti|y{C}{C}?ii{C}{C}?i|y{C}{C}?ee{C}{C}?i)', lemma): + return 'front|back' + + # "anglofiili", "karsinogeeni", "telomeeri", "ortopedi", "antisepti", "dynamometri"/"barometri" "hypoteesi" + if count_syllables(lemma) >= 4 and re.fullmatch('.*[aou].*(geeni|iili|meeri|metri|pedi|septi|teesi)', lemma): + return 'front|back' + + # "fylogeneesi", "fylogeneettisesti" + if re.fullmatch('.*[aou].*(elektrinen|elektisesti|fiili|fiilinen|fiilisesti|geeninen|geenisesti|geneesi|geneettinen|geneettisesti|metrinen|metrisesti|pedinen|pedisesti|septinen|septisesti|teismi|teisti|teistinen|teistisesti|terminen|termisesti|tsepiini)', lemma): + return 'front|back' + + # Initialisms and numbers + if re.fullmatch('.*[14579BCDEFGIJLMNPRSTVWXYÄÖÜÉ]', lemma): + return 'front' + if re.fullmatch('.*[2368AHKOQUZÅ]', lemma): + return 'back' + if re.fullmatch('.*[123456789]0(:s)?', lemma): + return 'front' + if re.fullmatch('.+oy', lemma): + return 'back' + if re.fullmatch(f'.*[aouAOU]{C}+y', lemma): + return 'back' + if re.fullmatch('.*[aouAOU].*y', lemma): + return 'front|back' + if kotus_class in {'18B', '10B'} and lemma[-1] in set('bcdefgijlmnprstvwxyzäöüé'): + return 'front' + if kotus_class in {'18B', '10B'}: + return 'back' + + return determine_wordform_harmony(lemma) + + +def determine_wordform_harmony(wordform, default_harmony=None): + if default_harmony in {'front', 'back'}: + return default_harmony for c in reversed(wordform.lower()): if c in set('y'): - return 'FRONT' - if c in set('aouáóúàòùâôû'): - return 'BACK' + return 'front' + if c in set('aouáóúàòùâôûå'): + return 'back' if c in set('äöüø'): - return 'FRONT' + return 'front' if c in set('14579'): - return 'FRONT' + return 'front' if c in set('2368'): - return 'BACK' - return 'FRONT' + return 'back' + return 'front' def unpack(classes='', gradations='', harmonies='', vowels='', ignore_styles=False): @@ -139,6 +207,14 @@ def ddict(d: dict): 
result.update(d) return result +def is_uninflectable(lemma): + + """ + Return True if string is or ends with punctuation. + """ + + return not lemma or lemma[-1] in set('.:;-') + """ def combine(obj1: dict, obj2: dict): @@ -154,4 +230,4 @@ def combine_objs(objs): for obj in objs: combined = combine(combined, obj) return combined -""" \ No newline at end of file +""" diff --git a/pypykko/pypykko/tokenizer.py b/pypykko/pypykko/tokenizer.py index 3587e03..c261e90 100644 --- a/pypykko/pypykko/tokenizer.py +++ b/pypykko/pypykko/tokenizer.py @@ -1,7 +1,7 @@ #! /usr/bin/env python3 """ -Tokenize text. +Text tokenization. For testing/debugging purposes only. """ @@ -25,30 +25,31 @@ REGEX_HASHTAG = r'#[A-Za-z0-9_]+' REGEX_HANDLE = r'@[A-Za-z0-9_]+' REGEX_REDDIT = r'r/[A-Za-z0-9_]+|u/[A-Za-z0-9_]+' -REGEX_THOUSANDS = r'[1-9][0-9]?[0-9]?(?: [0-9][0-9][0-9])+(?:-[a-zåäö-]+)?' -REGEX_THOUSANDS_RANGE = r'[1-9][0-9]?[0-9]?(?: [0-9][0-9][0-9])+[-–][1-9][0-9]?[0-9]?(?: [0-9][0-9][0-9])+' +REGEX_THOUSANDS = r'[1-9][0-9]?[0-9]?(?:[  ][0-9][0-9][0-9])+(?:-[a-zåäö-]+)?' +REGEX_THOUSANDS_RANGE = r'[1-9][0-9]?[0-9]?(?:[  ][0-9][0-9][0-9])+[-–][1-9][0-9]?[0-9]?(?:[  ][0-9][0-9][0-9])+' REGEX_XML_ELEM = r'<[^<>]+>' REGEX_HTML_ENTITY = r'&[^;\s]+;' -# REGEX_URL = '(?:https?://|file:///)[a-z0-9](?:[.][a-z0-9][a-z0-9]+)+' -# REGEX_EMAIL = '(?:https?://|file:///)[a-z0-9](?:[.][a-z0-9][a-z0-9]+)+' -# REGEX_CHORD = 'xxx' -# REGEX_IUPAC_NAME = 'xxx' - -REGEX_ALL = f'{LINE_BREAK}' \ - f'{REGEX_UNITS}|' \ - f'{REGEX_EMOTICON}|' \ - f'{REGEX_ABBREV}|' \ - f'{REGEX_INITIAL}|' \ - f'{REGEX_ORDINAL}|' \ - f'{REGEX_DATE}|' \ - f'{REGEX_CLOCK}|' \ - f'{REGEX_HASHTAG}|' \ - f'{REGEX_HANDLE}|' \ - f'{REGEX_REDDIT}|' \ - f'{REGEX_THOUSANDS}|' \ - f'{REGEX_THOUSANDS_RANGE}|' \ - f'{REGEX_XML_ELEM}|' \ - f'{REGEX_HTML_ENTITY}' +REGEX_URL = r'(?:https?://|file:///|www\.)(?:[a-z0-9]+\.)+\S+[^ \t\n)(:;,.]' +# REGEX_IUPAC_NAME = r'...' 
+ +REGEX_ALL = '|'.join(( + LINE_BREAK, + REGEX_UNITS, + REGEX_EMOTICON, + REGEX_ABBREV, + REGEX_INITIAL, + REGEX_ORDINAL, + REGEX_DATE, + REGEX_CLOCK, + REGEX_HASHTAG, + REGEX_HANDLE, + REGEX_REDDIT, + REGEX_THOUSANDS, + REGEX_THOUSANDS_RANGE, + REGEX_XML_ELEM, + REGEX_HTML_ENTITY, + REGEX_URL, +)) PUNCT_HEAD = '\t\n (/"“”„¿¡‹«»{[\'’' PUNCT_TAIL = '\t\n .…,;?!)/"“”„›»}\\]\'’' @@ -74,6 +75,7 @@ def separate_punct(s): separated = head + tail[::-1] return separated + text = re.sub('^( *[-–—])([A-ZÅÄÖ])', r'\1 \2', text) text = f' {text} ' text = re.sub(rf'({REGEX_XML_ELEM})', r' \1 ', text) text = text.replace('\n\n', f' {LINE_BREAK} ') @@ -145,10 +147,5 @@ def tokenize(text): if __name__ == '__main__': for line in sys.stdin: - line = line.replace('& ', '&').replace('< ', '<').replace('> ', '>') + # line = line.replace('& ', '&').replace('< ', '<').replace('> ', '>') print(tokenize(line), end="") - - - - - diff --git a/pypykko/pypykko/utils.py b/pypykko/pypykko/utils.py index 26580ab..2b499fd 100644 --- a/pypykko/pypykko/utils.py +++ b/pypykko/pypykko/utils.py @@ -1,7 +1,6 @@ import re from .constants import PARSER_FST_PATH, FIELD_STRING import kfst -from .scriptutils import validate_pos from typing import NamedTuple C = "[bcdfghjklmnpqrstvwxzšžčśźćń'’]" @@ -76,6 +75,8 @@ def compare_with_others(a_source, analyses): a_target[5] += f' ← {pos}:{lemma_source}:{participle_tag}' return 'has-participle' + return + class PykkoAnalysis(NamedTuple): wordform: str source: str @@ -86,7 +87,7 @@ class PykkoAnalysis(NamedTuple): morphtags: str weight: float -def analyze(word: str, only_best=True, normalize_separators=True, ignore_derivatives=True) -> list[PykkoAnalysis]: +def analyze(word, only_best=True, normalize_separators=True, ignore_derivatives=True) -> list[PykkoAnalysis]: """ Return list of tuples (morphological analyses) with duplicates removed. 
@@ -95,7 +96,7 @@ def analyze(word: str, only_best=True, normalize_separators=True, ignore_derivat analyses = [] taken = {} - for analysis_string, weight in list(PARSER_FST.lookup(word)) or [(unk_result(word), inf)]: + for analysis_string, weight in PARSER_FST.lookup(word): if normalize_separators: analysis_string = analysis_string.replace('⁅BOUNDARY⁆', '|').replace('⁅HYPHEN⁆', '-') @@ -124,14 +125,16 @@ def analyze(word: str, only_best=True, normalize_separators=True, ignore_derivat filtered.append(analysis) best = weight + filtered = filtered or [([word] + unk_result(word).split('\t') + [inf])] + return [PykkoAnalysis(*a) for a in filtered] -def add_compound_separators(word: str, pos=None, normalize_separators=True, pick_first=False) -> set[str] | str: +def add_compound_separators(word, pos=None, normalize_separators=True, pick_first=False): # TODO: Allow adding separators to non-lemma words? - valid: set[str] = set() + valid = set() best = inf for a in analyze(word, only_best=False, normalize_separators=normalize_separators): _, _, lemma, p, _, _, _, weight = a @@ -155,9 +158,11 @@ def is_plural(word): return lemma return False + def singularize(word): return is_plural(word) or word + def pos_tag(word, force_match=False, max_weight=inf): if force_match: @@ -189,15 +194,7 @@ def lemmatize(word, pos=None): def syllabify(word: str, pos=None, compound=True, big_words=False): - validate_pos(pos) - - # Type checker doesn't particularly enjoy add_compound_separators having a return type - # that depends on whether pick_first is True or not. 
- - if compound: - separated = add_compound_separators(word, pos, pick_first=True) - assert isinstance(separated, str) - word = separated + word = add_compound_separators(word, pos, pick_first=True) if compound else word # type: ignore # lito·grafia, mikro·skooppi (alternative syllabification) if big_words: diff --git a/pypykko/setup.cfg b/pypykko/setup.cfg index 92d2951..0f87c02 100644 --- a/pypykko/setup.cfg +++ b/pypykko/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = pypykko -version = 0.3.0 +version = 0.4.0-beta author = Théo Salmenkivi-Friberg author_email = theo.friberg@helsinki.f description = A pure-python wrapper for the pykko Finnish morphological analyser and inflector @@ -29,3 +29,4 @@ install_requires = pypykko = *.kfst *.tsv + patterns/*.txt