From 266edb9f19a89d892f652bd2446c500a5978d413 Mon Sep 17 00:00:00 2001 From: Andy Wermke Date: Sun, 21 Jun 2015 17:31:10 +0200 Subject: [PATCH 1/3] Add create-language-detector cli command --- composer.json | 7 +++-- create-language-detector | 59 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) create mode 100755 create-language-detector diff --git a/composer.json b/composer.json index 9f0e401..bbb9388 100644 --- a/composer.json +++ b/composer.json @@ -8,7 +8,7 @@ } ], "require": { - + "pwfisher/command-line-php": "*" }, "minimum-stability": "dev", "require-dev": { @@ -23,5 +23,8 @@ "psr-0": { "LanguageDetector": "lib/" } - } + }, + "bin": [ + "create-language-detector" + ] } diff --git a/create-language-detector b/create-language-detector new file mode 100755 index 0000000..d137c48 --- /dev/null +++ b/create-language-detector @@ -0,0 +1,59 @@ +#!/usr/bin/env php + $arg) { + if (is_numeric($key)) { + $samples[$key] = $arg; + } else { + $options[$key] = $arg; + } +} + +if (count($samples) === 0) { + echo "Usage: ".$_SERVER['argv'][0]." [options]\n"; + echo "Options:\n"; + echo " --output, -o Sets the output script file path. Defaults to 'language.php'\n"; + exit; +} + + +$outputFile = 'language.php'; + +if (isset($options['output'])) { $outputFile = $options['output']; } +if (isset($options['o'])) { $outputFile = $options['o']; } + + +// we load the configuration (which will be serialized +// later into our language model file +$config = new LanguageDetector\Config; + +$c = new LanguageDetector\Learn($config); +foreach ($samples as $sample) { + foreach (glob($sample) as $file) { + // feed with examples ('language', 'text'); + $c->addSample(basename($file), file_get_contents($file)); + } +} + +// some callback so we know where the process is +$c->addStepCallback(function($lang, $status) { + echo "Learning {$lang}: $status\n"; +}); + +// save it in `datafile`. +// we currently support the `php` serialization but it's trivial +// to add other formats, just extend `\LanguageDetector\Format\AbstractFormat`. +//You can check example at https://github.com/crodas/LanguageDetector/blob/master/lib/LanguageDetector/Format/PHP.php +$c->save(\LanguageDetector\AbstractFormat::initFormatByPath($outputFile)); From 15748973cf6ce84131007d643c5e5952649a786a Mon Sep 17 00:00:00 2001 From: Andy Wermke Date: Sun, 21 Jun 2015 18:28:41 +0200 Subject: [PATCH 2/3] Add detect-language cli command & add detectLanguageScores method to Detect --- create-language-detector | 4 ++- detect-language | 43 +++++++++++++++++++++++++++++++++ lib/LanguageDetector/Detect.php | 11 +++++++-- 3 files changed, 55 insertions(+), 3 deletions(-) create mode 100755 detect-language diff --git a/create-language-detector b/create-language-detector index d137c48..9fc30a2 100755 --- a/create-language-detector +++ b/create-language-detector @@ -8,6 +8,7 @@ require 'vendor/autoload.php'; // because this process runs once. ini_set('memory_limit', '1G'); +// parse parameters $args = CommandLine::parseArgs($_SERVER['argv']); $samples = array(); @@ -21,10 +22,11 @@ foreach ($args as $key => $arg) { } } -if (count($samples) === 0) { +if (count($samples) === 0 || isset($options['help']) || isset($options['?'])) { echo "Usage: ".$_SERVER['argv'][0]." [options]\n"; echo "Options:\n"; echo " --output, -o Sets the output script file path. Defaults to 'language.php'\n"; + echo " --help, -? Print this help\n"; exit; } diff --git a/detect-language b/detect-language new file mode 100755 index 0000000..1d3fc12 --- /dev/null +++ b/detect-language @@ -0,0 +1,43 @@ +#!/usr/bin/env php + | ".$_SERVER['argv'][0]." [options]\n"; + echo "Options:\n"; + echo " --detector, -d Sets the language detector script file path. Defaults to 'language.php'\n"; + echo " --help, -? Print this help\n"; + exit; +} + + +$languageScript = 'language.php'; + +if (isset($args['detector'])) { $languageScript = $args['detector']; } +if (isset($args['d'])) { $languageScript = $args['d']; } + + +// we load the language model, it would create +// the $config object for us. +$detect = LanguageDetector\Detect::initByPath($languageScript); + +// get the 5 most probable guesses +$languages = $detect->detect($text); +$languages = array_slice($languages, 0, 5); + +// print result +echo "Detected languages:\n"; + +foreach ($languages as $candidate) { + $lang = $candidate['lang']; + $score = $candidate['score'] * 100; + echo " $lang:\t".number_format($score, 1)."%\n"; +} diff --git a/lib/LanguageDetector/Detect.php b/lib/LanguageDetector/Detect.php index 59e3390..93eb97b 100644 --- a/lib/LanguageDetector/Detect.php +++ b/lib/LanguageDetector/Detect.php @@ -108,8 +108,8 @@ public function getLanguages() { return array_keys($this->data); } - - public function detect($text, $limit = 300) + + public function detectLanguageScores($text, $limit = 300) { $chunks = $this->parser->splitText($text, $limit); $results = array(); @@ -162,6 +162,13 @@ public function detect($text, $limit = 300) usort($distance, function($a, $b) { return $a['score'] > $b['score'] ? -1 : 1; }); + + return $distance; + } + + public function detect($text, $limit = 300) + { + $distance = $this->detectLanguageScores($text, $limit); if ($distance[0]['score'] - $distance[1]['score'] <= $this->threshold) { /** We're not sure at all, we return the whole array then */ From 807ffa6a2fc1c0c5f1043eea2a0287f7c4673caf Mon Sep 17 00:00:00 2001 From: Andy Wermke Date: Sun, 21 Jun 2015 18:29:37 +0200 Subject: [PATCH 3/3] Ignore .idea (family of IDEs) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7579f74..4f38912 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ +.idea vendor composer.lock