diff --git a/.gitignore b/.gitignore index 331c58f..688d850 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .idea +composer.lock vendor \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..2fc5b48 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,41 @@ +## Changelog + +### Version 4.0.2 +### Fixed +- Fixed another type issue where an int value was not always passed as the offset param to `HunspellResponse::__construct()`. + +### Version 4.0.1 +### Fixed +- Fixed a type error caused by possibly passing a non-int value to the `$offset` parameter when instantiating `HunspellResponse` objects. + +### Version 4.0.0 +### Updated +- Moved `getenv()` call to constructor and stored env data as a class property for caching to ensure the call is not made more than once per instance. +- Refactored `hunspellSuggest()` to work with space-separated list of words and return parsed result batch - This avoids needing to invoke the proc for each word which was a major performance issue. +- Updated `stem()`, `stemParse()` and `hunspellSuggest()` to ensure the stem branch of this library works correctly after the batch improvement. + +### Version 3.0.0 +#### Added +- New optional constructor argument `$custom_words_file` which takes a path to a custom word list to be merged with the dictionary at runtime. +- Windows/Linux environments now use the same process execution code. +- Hunspell process invocation is now handled through `proc_open` instead of `shell_exec`. +- Hunspell `stderr` output is now logged via `error_log()` call. +#### Changed +- Changed constructor argument `$encoding` default value from 'en_US.utf-8' to 'UTF-8'. + +### Version 2.0.0 +#### Added +- Added PHP8.0 typed class, +- Added constructor to main `HunspellPHP` class where the `$dictionary`, `$encoding` and `$dictionary_path` can be set/overridden during initialization.
+- Added `$dictionary_path` as a new argument where the dictionary files path may be specified (system default search locations are used otherwise). Additional `get()` and `set()` methods added. +- Added functionality to `findCommand` method via new `(bool)$stem_mode` argument. +#### Removed +- Removed `findStemCommand` method. +- Removed unused exception classes. +- Removed `HunspellPHP\Exception` namespace. +- Removed composer.lock from repo. +#### Fixed +- Renamed `$language` more appropriately `$dictionary` since that is what that property is referencing. +- Moved InvalidMatchTypeException up one directory to \HunspellPHP namespace. +- Fixed an issue where not all `$match` values were returned from the command response resulting in PHP warnings. +- Fixed a missing type `-` extraction from the matcher regex which resulted in PHP warnings and bad responses. \ No newline at end of file diff --git a/README.MD b/README.MD deleted file mode 100644 index 0d5f6d5..0000000 --- a/README.MD +++ /dev/null @@ -1,10 +0,0 @@ -##Hunspell PHP wrapper -This is a hunspell php wrapper. - -Example -=================== -```php -$hunspell = new \HunspellPHP\Hunspell(); - -var_dump($hunspell->find('otwórz')); -``` \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a7b63cd --- /dev/null +++ b/README.md @@ -0,0 +1,27 @@ +# Hunspell PHP wrapper +Forked from [johnzuk/HunspellPHP](https://github.com/johnzuk/HunspellPHP) + +### Version 4.x (Optimization +Batch Mode) +This version changes find() and possibly stem() (I'm not exactly sure how stem() functioned before as I did not use it, but I updated it to be compatible with the changes made under the hood to `hunspellSuggest()`). `hunspellSuggest()` can now take a space-separated string of words to batch process. This change allows a single process call to handle many spell checks (and stems) rather than having to invoke the process once for each word.
The update also ensures the 1000ms timeout "deadline" is not forcing the process to wait that time before ending which appeared to be the case in previous versions. + +### Version 3.0.0 (Very minor backward breaking change) +This version updates the constructor signature with a different (better?) default value for `$encoding`, so if anyone was using that this would be a backward breaking change. Otherwise, a new constructor argument `$custom_words_file` (path) has been added and will bind your provided custom word list with your dictionary in real time. + +The other change this version takes care of is using `proc_open` and better env/encoding handling in general. We also now emit an `error_log()` call so stderr output from the hunspell process is logged properly. + +### Version 2.x +Version 2.0.0 and above requires PHP ^8.0.0 and includes an important fix to the result matcher regex. If you need this for an older version of PHP I recommend that you fork 1.2 and update the regex matcher property of the Hunspell class to what is set in the current version of the code. + +[View Changelog](CHANGELOG.md) + +### The reason for this fork +This project was initially forked because the shell commands used were for a non-bash shell. This fork's main purpose was to convert the shell commands to a BASH compatible syntax and add support for Windows PowerShell. As such this fork will not work correctly outside of a bash or PowerShell environment. + +An additional change was made to the parsing of the return value as the `PHP_EOL` value used in the original source was not working in my testing. This was changed to "\n" which resolved the issue.
+ +Example +=================== +```php +$hunspell = new \HunspellPHP\Hunspell(); +var_dump($hunspell->find('otwórz')); +``` diff --git a/composer.json b/composer.json index 3f2c637..8f7d353 100644 --- a/composer.json +++ b/composer.json @@ -1,20 +1,21 @@ { - "name": "hunspell-php/hunspell-php", + "name": "belniakmedia/hunspell-php", "description": "Hunspell PHP wrapper", "minimum-stability": "dev", + "version": "4.0.2", "license": "MIT", "authors": [ { - "name": "Janusz Żukowicz", - "email": "john_zuk@wp.pl" + "name": "Richard Kukiela", + "email": "rick@belniakmedia.com" } ], "require": { - "php" : ">=5.6" + "php" : ">=8.0" }, "autoload": { "psr-4": { "HunspellPHP\\": "src/HunspellPHP" } } -} \ No newline at end of file +} diff --git a/composer.lock b/composer.lock deleted file mode 100644 index 3397b9a..0000000 --- a/composer.lock +++ /dev/null @@ -1,20 +0,0 @@ -{ - "_readme": [ - "This file locks the dependencies of your project to a known state", - "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", - "This file is @generated automatically" - ], - "hash": "ee1117e1d6b15d290db1f1130656045a", - "content-hash": "d60c6568f84d2836865f7849ece2b5b1", - "packages": [], - "packages-dev": [], - "aliases": [], - "minimum-stability": "dev", - "stability-flags": [], - "prefer-stable": false, - "prefer-lowest": false, - "platform": { - "php": ">=5.6" - }, - "platform-dev": [] -} diff --git a/src/HunspellPHP/Exception/InvalidResultException.php b/src/HunspellPHP/Exception/InvalidResultException.php deleted file mode 100644 index 6ad7dfb..0000000 --- a/src/HunspellPHP/Exception/InvalidResultException.php +++ /dev/null @@ -1,8 +0,0 @@ - 'OK', Hunspell::ROOT => 'ROOT', Hunspell::MISS => 'MISS', Hunspell::NONE => 'NONE', - Hunspell::COMPOUND => 'COMPOUND', + Hunspell::COMPOUND => 'COMPOUND' ]; + private array $env; + + protected string $encoding; + protected string $dictionary; + protected string $dictionary_path; + protected string 
$custom_words_file; + protected string $matcher = + '/(?P\*|\+|&|#|-)\s?(?P\w+)?\s?(?P\d+)?\s?(?P\d+)?:?\s?(?P.*+)?/u'; + /** - * @var string + * @param string $dictionary Dictionary name e.g.: 'en_US' (default) + * @param string $encoding Encoding e.g.: 'UTF-8' (default) + * @param string|null $dictionary_path Specify the directory of the dictionary file (optional) + * @param string|null $custom_words_file Specify the path to the custom words file (optional) */ - protected $language = "pl_PL"; + public function __construct( + string $dictionary = 'en_US', + string $encoding = 'UTF-8', + ?string $dictionary_path = null, + ?string $custom_words_file = null + ) { + $this->dictionary = $this->clear($dictionary); + $this->encoding = $this->clear($encoding); + $this->dictionary_path = $dictionary_path ?? ''; + $this->custom_words_file = $custom_words_file ?? ''; + + $this->env = getenv(); + } + /** - * @var string + * @return string */ - protected $encoding = "pl_PL.utf-8"; + public function getEncoding(): string + { + return $this->encoding; + } /** - * @var string + * @return string */ - protected $matcher = - "/(?P\*|\+|&|#)\s?(?P\w+)?\s?(?P\d+)?\s?(?P\d+)?:?\s?(?P.*+)?/u"; + public function getDictionary(): string + { + return $this->dictionary; + } /** * @return string */ - public function getLanguage() + public function getDictionaryPath(): string { - return $this->language; + return $this->dictionary_path; } /** - * @param string $language + * @param string $dictionary Language code e.g.: 'en_US' */ - public function setLanguage($language) + public function setDictionary(string $dictionary): void { - $this->language = $this->clear($language); + $this->dictionary = $this->clear($dictionary); } /** - * @return string + * @param string $dictionary_path The path to load the dictionary files from */ - public function getEncoding() + public function setDictionaryPath(string $dictionary_path): void { - return $this->encoding; + $this->dictionary_path = $dictionary_path; 
} + /** - * @param string $encoding + * @param string $encoding Encoding value (includes language code) e.g.: 'en_US.utf-8' */ - public function setEncoding($encoding) + public function setEncoding(string $encoding): void { $this->encoding = $this->clear($encoding); } /** - * @param $words + * @param string $words * @return array * @throws InvalidMatchTypeException */ - public function find($words) + public function find(string $words): array { - $matches = []; $results = $this->preParse($this->findCommand($words), $words); $response = []; foreach ($results as $word => $result) { - $matches = []; - $match = preg_match($this->matcher, $result, $matches); - + $matches = ['type' => null]; + preg_match($this->matcher, $result, $matches); $matches['input'] = $word; + $matches['type'] = $matches['type'] ?? null; + $matches['original'] = $matches['original'] ?? ''; + $matches['misses'] = $matches['misses'] ?? []; + $matches['offset'] = $matches['offset'] ?? null; + $matches['count'] = $matches['count'] ?? 
null; $response[] = $this->parse($matches); } - return $response; } /** - * @param string $word word to find + * @param string $words word to find * @return HunspellStemResponse - * @throws InvalidMatchTypeException - * @throws InvalidResultException - * @throws WordNotFoundException */ - public function stem($word) + public function stem(string $words): HunspellStemResponse { - $result = explode(PHP_EOL, $this->stemCommand($word)); - $result['input'] = $word; - $result = $this->stemParse($result); - return $result; + $raw = $this->findCommand($words, true); + + // Normalize newlines + $raw = str_replace(["\r\n", "\r"], "\n", $raw); + $lines = preg_split('/\n/', $raw) ?: []; + + // Keep only real stem result lines + $lines = array_values(array_filter(array_map('trim', $lines), static function (string $line): bool { + if ($line === '') { + return false; + } + if (str_starts_with($line, '@(#)')) { + return false; + } + // stem lines contain at least two tokens + return preg_match('/\S+\s+\S+/u', $line) === 1; + })); + + return $this->stemParse([ + 'input' => $words, + 'lines' => $lines, + ]); } /** * @param string $input - * @return mixed + * @return string */ - protected function clear($input) + protected function clear(string $input): string { - return preg_replace('[^a-zA-Z0-9_\-.]', '', $input); + return (string)preg_replace('[^a-zA-Z0-9_-\.]', '', $input); } - /** - * @return string - * @param string $input - */ - protected function findCommand($input) + protected function hunspellSuggest(string $input, bool $stemSwitch): array { - return shell_exec(sprintf("LANG=%s; echo '%s' | hunspell -d %s", $this->encoding, $input, $this->language)); + $timeoutMs = 1000; + + $encoding = strtoupper(trim($this->encoding)); + $dictionaryFile = $this->dictionary_path + ? rtrim($this->dictionary_path, DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . 
$this->dictionary + : $this->dictionary; + + // Build command + $cmd = ['hunspell']; + + if ($stemSwitch) { + // Stem mode + $cmd[] = '-s'; + } else { + // Spellcheck (interactive) mode + $cmd[] = '-a'; + } + + $cmd[] = '-d'; + $cmd[] = trim($dictionaryFile); + $cmd[] = '-i'; + $cmd[] = $encoding; + + if (!empty($this->custom_words_file) && file_exists($this->custom_words_file)) { + $cmd[] = '-p'; + $cmd[] = $this->custom_words_file; + } elseif (!empty($this->custom_words_file)) { + error_log('WARNING: HunspellPHP - $custom_words_file "' . $this->custom_words_file . '" not found.'); + } + + $tokens = preg_split('/\R+|\s+/u', trim($input), -1, PREG_SPLIT_NO_EMPTY) ?: []; + if (empty($tokens)) { + return ['', '', 0]; + } + $batchedInput = implode("\n", $tokens) . "\n"; + + $descriptors = [ + 0 => ['pipe', 'r'], // stdin + 1 => ['pipe', 'w'], // stdout + 2 => ['pipe', 'w'], // stderr + ]; + + // Build minimal env with locale data to pass to proc + $this->env['LC_ALL'] = $this->env['LANG'] = PHP_OS_FAMILY === 'Windows' + ? $this->dictionary . '.' . $encoding + : 'C.UTF-8'; + + $proc = proc_open($cmd, $descriptors, $pipes, null, $this->env); + if (!is_resource($proc)) { + return ['', 'proc_open failed', 1]; + } + + // Write all in one go and close stdin so hunspell can exit cleanly + fwrite($pipes[0], $batchedInput); + fclose($pipes[0]); + + // Non-blocking read + stream_set_blocking($pipes[1], false); + stream_set_blocking($pipes[2], false); + + // Enforce Deadline + $deadline = microtime(true) + ($timeoutMs / 1000); + $out = ''; + $err = ''; + + while (true) { + $out .= stream_get_contents($pipes[1]) ?: ''; + $err .= stream_get_contents($pipes[2]) ?: ''; + + $status = proc_get_status($proc); + if (!$status['running']) { + break; + } + + if (microtime(true) >= $deadline) { + // IMPORTANT: terminate, otherwise proc_close() can still block. 
+ proc_terminate($proc); + break; + } + + // Avoid hammer locking cpu during loop + usleep(1000); + } + + // Drain the pipes + $out .= stream_get_contents($pipes[1]) ?: ''; + $err .= stream_get_contents($pipes[2]) ?: ''; + + fclose($pipes[1]); + fclose($pipes[2]); + + $exit = proc_close($proc); + + return [$out, $err, $exit]; } /** - * @return string * @param string $input + * @param bool $stem_mode + * @return string */ - protected function stemCommand($input) + protected function findCommand(string $input, bool $stem_mode = false): string { - return shell_exec(sprintf("LANG=%s; echo '%s' | hunspell -d %s -s", $this->encoding, $input, $this->language)); + [$stdout, $stderr] = $this->hunspellSuggest($input, $stem_mode); + if($stderr !== '') { + error_log('hunspell stderr: ' . trim($stderr)); + } + return $stdout; } /** @@ -142,13 +279,86 @@ protected function stemCommand($input) * @param string $words * @return array */ - protected function preParse($input, $words) + protected function preParse(string $input, string $words): array { - $result = explode(PHP_EOL, trim($input)); - unset($result[0]); - $words = array_map('trim', explode(" ", $words)); + $input = str_replace(["\r\n", "\r"], "\n", $input); + + // Tokenize words the same way the batched hunspell call does: whitespace/newlines. + $tokens = preg_split('/\s+/u', trim($words), -1, PREG_SPLIT_NO_EMPTY) ?: []; + $tokens = array_values(array_map('trim', $tokens)); + + if (empty($tokens)) { + return []; + } + + // Split stdout into blocks separated by blank lines. + // Skip the hunspell banner/header lines starting with "@(#)". 
+ $rawLines = preg_split('/\n/', $input); + $blocks = []; + $current = []; + + foreach ($rawLines as $line) { + $t = trim($line); + + if ($t !== '' && str_starts_with($t, '@(#)')) { + continue; + } + + if ($t === '') { + if (!empty($current)) { + $blocks[] = $current; + $current = []; + } + continue; + } + + $current[] = $t; + } + + if (!empty($current)) { + $blocks[] = $current; + } + + if (count($blocks) !== count($tokens)) { + return []; + } + + // Normalize each block to a single line compatible with the existing matcher. + $out = []; + foreach ($tokens as $i => $token) { + $lines = $blocks[$i]; + $first = $lines[0] ?? ''; + + // Merge any extra lines (e.g. ", ASTM") into the "misses" list. + // We strip a leading comma/space and append as additional misses. + $extras = []; + for ($j = 1; $j < count($lines); $j++) { + $extra = trim($lines[$j]); + if ($extra === '') { + continue; + } - return array_combine($words, $result); + // Hunspell often prefixes extra suggestions with ",". + $extra = preg_replace('/^[,]\s*/u', '', $extra); + if ($extra !== '') { + $extras[] = $extra; + } + } + + if (!empty($extras) && preg_match('/^(?:&|#|\+|-)/u', $first)) { + // If the first line already has a ":" misses list, append to it. + if (str_contains($first, ':')) { + $first .= ', ' . implode(', ', $extras); + } else { + // Otherwise create a misses list. + $first .= ': ' . 
implode(', ', $extras); + } + } + + $out[$token] = $first; + } + + return $out; } /** @@ -156,7 +366,7 @@ protected function preParse($input, $words) * @return HunspellResponse * @throws InvalidMatchTypeException */ - protected function parse(array $matches) + protected function parse(array $matches): HunspellResponse { if ($matches['type'] == Hunspell::OK || $matches['type'] == Hunspell::COMPOUND) { return new HunspellResponse( @@ -164,27 +374,33 @@ protected function parse(array $matches) $matches['input'], $matches['type'] ); - } else if ($matches['type'] == Hunspell::ROOT) { - return new HunspellResponse( - $matches['original'], - $matches['input'], - $matches['type'] - ); - } else if ($matches['type'] == Hunspell::MISS) { - return new HunspellResponse( - '', - $matches['original'], - $matches['type'], - $matches['offset'], - explode(", ", $matches['misses']) - ); - } else if ($matches['type'] == Hunspell::NONE) { - return new HunspellResponse( - '', - $matches['input'], - $matches['type'], - $matches['count'] - ); + } else { + if ($matches['type'] == Hunspell::ROOT) { + return new HunspellResponse( + $matches['original'], + $matches['input'], + $matches['type'] + ); + } else { + if ($matches['type'] == Hunspell::MISS) { + return new HunspellResponse( + '', + $matches['original'], + $matches['type'], + intval($matches['offset']), + explode(", ", $matches['misses']) + ); + } else { + if ($matches['type'] == Hunspell::NONE) { + return new HunspellResponse( + '', + $matches['input'], + $matches['type'], + intval($matches['count']) + ); + } + } + } } throw new InvalidMatchTypeException(sprintf("Match type %s is invalid", $matches['type'])); @@ -193,26 +409,31 @@ protected function parse(array $matches) /** * @param array $matches * @return HunspellStemResponse - * @throws InvalidMatchTypeException - * @throws WordNotFoundException */ - protected function stemParse(array $matches) + protected function stemParse(array $matches): HunspellStemResponse { - $input = 
$matches['input']; - unset($matches['input']); + $input = (string)($matches['input'] ?? ''); + $lines = $matches['lines'] ?? []; + $stems = []; - foreach ($matches as $match) { - $stem = explode(' ', $match); - if (isset($stem[1]) && !empty($stem[1])) { - if (!in_array($stem[1], $stems)) { - $stems[] = $stem[1]; - } - } elseif (isset($stem[0]) && !empty($stem[0])) { - if (!in_array($stem[0], $stems)) { - $stems[] = $stem[0]; - } + foreach ($lines as $line) { + $line = trim((string)$line); + if ($line === '') { + continue; + } + + // Split by any whitespace; hunspell can separate with multiple spaces/tabs. + $parts = preg_split('/\s+/u', $line, -1, PREG_SPLIT_NO_EMPTY) ?: []; + if (count($parts) < 2) { + continue; + } + + $stem = $parts[1] ?? ''; + if ($stem !== '' && !in_array($stem, $stems, true)) { + $stems[] = $stem; } } + return new HunspellStemResponse($input, $stems); } diff --git a/src/HunspellPHP/HunspellResponse.php b/src/HunspellPHP/HunspellResponse.php index 158df10..8b5a092 100644 --- a/src/HunspellPHP/HunspellResponse.php +++ b/src/HunspellPHP/HunspellResponse.php @@ -3,40 +3,21 @@ class HunspellResponse { - /** - * @var string - */ - public $root; - - /** - * @var string - */ - public $original; - - /** - * @var int - */ - public $offset; - - /** - * @var array - */ - public $misses = []; - - /** - * @var string - */ - public $type; + public string $root; + public string $original; + public ?int $offset; + public array $misses = []; + public string $type; /** * HunspellResponse constructor. 
* @param string $root * @param string $original - * @param int $offset + * @param ?int $offset * @param array $misses * @param string $type */ - public function __construct($root, $original, $type = '', $offset = null, array $misses = []) + public function __construct(string $root, string $original, string $type = '', ?int $offset = null, array $misses = []) { $this->root = $root; $this->original = $original; diff --git a/src/HunspellPHP/HunspellStemResponse.php b/src/HunspellPHP/HunspellStemResponse.php index 0c7a628..5dc28c0 100644 --- a/src/HunspellPHP/HunspellStemResponse.php +++ b/src/HunspellPHP/HunspellStemResponse.php @@ -3,22 +3,16 @@ class HunspellStemResponse { - /** - * @var string - */ - public $original; - - /** - * @var string[] - */ - public $stems; + public string $original; + /** @var string[] */ + public array $stems; /** * HunspellStemResponse constructor. * @param string $original * @param string[] $stems */ - public function __construct($original, $stems = []) + public function __construct(string $original, array $stems = []) { $this->original = $original; $this->stems = $stems; diff --git a/src/HunspellPHP/Exception/InvalidMatchTypeException.php b/src/HunspellPHP/InvalidMatchTypeException.php similarity index 65% rename from src/HunspellPHP/Exception/InvalidMatchTypeException.php rename to src/HunspellPHP/InvalidMatchTypeException.php index 5890554..6022b98 100644 --- a/src/HunspellPHP/Exception/InvalidMatchTypeException.php +++ b/src/HunspellPHP/InvalidMatchTypeException.php @@ -1,5 +1,5 @@