diff --git a/.gitignore b/.gitignore index 7579f74..4f38912 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ +.idea vendor composer.lock diff --git a/composer.json b/composer.json index 9f0e401..bbb9388 100644 --- a/composer.json +++ b/composer.json @@ -8,7 +8,7 @@ } ], "require": { - + "pwfisher/command-line-php": "*" }, "minimum-stability": "dev", "require-dev": { @@ -23,5 +23,8 @@ "psr-0": { "LanguageDetector": "lib/" } - } + }, + "bin": [ + "create-language-detector" + ] } diff --git a/create-language-detector b/create-language-detector new file mode 100755 index 0000000..9fc30a2 --- /dev/null +++ b/create-language-detector @@ -0,0 +1,61 @@ +#!/usr/bin/env php + $arg) { + if (is_numeric($key)) { + $samples[$key] = $arg; + } else { + $options[$key] = $arg; + } +} + +if (count($samples) === 0 || isset($options['help']) || isset($options['?'])) { + echo "Usage: ".$_SERVER['argv'][0]." [options]\n"; + echo "Options:\n"; + echo " --output, -o Sets the output script file path. Defaults to 'language.php'\n"; + echo " --help, -? Print this help\n"; + exit; +} + + +$outputFile = 'language.php'; + +if (isset($options['output'])) { $outputFile = $options['output']; } +if (isset($options['o'])) { $outputFile = $options['o']; } + + +// we load the configuration (which will be serialized +// later into our language model file) +$config = new LanguageDetector\Config; + +$c = new LanguageDetector\Learn($config); +foreach ($samples as $sample) { + foreach (glob($sample) as $file) { + // feed with examples ('language', 'text'); + $c->addSample(basename($file), file_get_contents($file)); + } +} + +// some callback so we know where the process is +$c->addStepCallback(function($lang, $status) { + echo "Learning {$lang}: $status\n"; +}); + +// save it in `datafile`. +// we currently support the `php` serialization but it's trivial +// to add other formats, just extend `\LanguageDetector\Format\AbstractFormat`. 
+// You can check an example at https://github.com/crodas/LanguageDetector/blob/master/lib/LanguageDetector/Format/PHP.php +$c->save(\LanguageDetector\AbstractFormat::initFormatByPath($outputFile)); diff --git a/detect-language b/detect-language new file mode 100755 index 0000000..1d3fc12 --- /dev/null +++ b/detect-language @@ -0,0 +1,43 @@ +#!/usr/bin/env php + | ".$_SERVER['argv'][0]." [options]\n"; + echo "Options:\n"; + echo " --detector, -d Sets the language detector script file path. Defaults to 'language.php'\n"; + echo " --help, -? Print this help\n"; + exit; +} + + +$languageScript = 'language.php'; + +if (isset($args['detector'])) { $languageScript = $args['detector']; } +if (isset($args['d'])) { $languageScript = $args['d']; } + + +// we load the language model; it will create +// the $config object for us. +$detect = LanguageDetector\Detect::initByPath($languageScript); + +// get the 5 most probable guesses +$languages = $detect->detect($text); +$languages = array_slice($languages, 0, 5); + +// print result +echo "Detected languages:\n"; + +foreach ($languages as $candidate) { + $lang = $candidate['lang']; + $score = $candidate['score'] * 100; + echo " $lang:\t".number_format($score, 1)."%\n"; +} diff --git a/lib/LanguageDetector/Detect.php b/lib/LanguageDetector/Detect.php index 59e3390..93eb97b 100644 --- a/lib/LanguageDetector/Detect.php +++ b/lib/LanguageDetector/Detect.php @@ -108,8 +108,8 @@ public function getLanguages() { return array_keys($this->data); } - - public function detect($text, $limit = 300) + + public function detectLanguageScores($text, $limit = 300) { $chunks = $this->parser->splitText($text, $limit); $results = array(); @@ -162,6 +162,13 @@ public function detect($text, $limit = 300) usort($distance, function($a, $b) { return $a['score'] > $b['score'] ? 
-1 : 1; }); + + return $distance; + } + + public function detect($text, $limit = 300) + { + $distance = $this->detectLanguageScores($text, $limit); if ($distance[0]['score'] - $distance[1]['score'] <= $this->threshold) { /** We're not sure at all, we return the whole array then */