shuyo · antoniomo · Nov 27, 2013 · Nov 27, 2013 · Mar 11, 2014 · Mar 11, 2014
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,19 @@
+*.pyc
+
+# Generated by setup.py
+ldig.egg-info
+
+# Generated by cmake
+ldig/ldigcpp/CMakeCache.txt
+ldig/ldigcpp/CMakeFiles/
+ldig/ldigcpp/Makefile
+ldig/ldigcpp/bin/
+ldig/ldigcpp/cmake_install.cmake
+
+# Uncompressed models
+ldig/models/model.latin/
+ldig/ldigcpp/lang50.x64.model
+
+# Dependency that is not a submodule
+ldig/ldigcpp/cybozulib/
+
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,3 @@
+recursive-include ldig/maxsubst *
+recursive-include ldig/static *
+recursive-include ldig/models *
diff --git a/readme.md → README.md b/readme.md → README.md
@@ -1,16 +1,34 @@
 ldig (Language Detection with Infinity Gram)
 ======================
 
-
 This is a prototype of language detection for short message service (twitter).
 with 99.1% accuracy for 17 languages
 
+About this fork
+---------------
+
+In this fork we just add some conveniences so that ldig can be used as
+a library in Python programs.
+
+Changes include an updated `.gitignore` that ignores unpacked models, an
+`__init__.py` file so that ldig can be imported as a package, a new
+`ldig_standalone.py` file with a convenience class that detects the language
+of text rather than of files, updated model file permissions, and a `setup.py`
+for easy installation.
+
+The original `c++` branch is also merged with the original `master` for
+convenience, since it takes nothing away from the Python point of view, yet
+adds an extra `C++` version.
+
+All real work was done by the author of the original, Nakatani Shuyo / Cybozu
+Labs Inc., under an MIT License (see below or at https://github.com/shuyo/ldig).
+
 
 Usage
 ------
 
 1. Extract model directory
-    tar xf models/[select model archive]
+    tar xvzf models/[select model archive]
 
 2. Detect
     ldig.py -m [model directory] [text data file]
@@ -27,7 +45,7 @@ As input data, Each tweet is one line in text file as the below format.
 It is also optional as metadata.
 (ldig doesn't use metadata and label for detection, of course :D)
 
-The output data of lidg is as the below.
+The output data of ldig is as the below.
 
     [correct label]\t[detected label]\t[original metadata and text]
 

diff --git a/ldig/__init__.py b/ldig/__init__.py
diff --git a/da.py → ldig/da.py b/da.py → ldig/da.py
diff --git a/ldig.py → ldig/ldig.py b/ldig.py → ldig/ldig.py
diff --git a/ldig/ldig_standalone.py b/ldig/ldig_standalone.py
@@ -0,0 +1,41 @@
+import numpy as np
+from itertools import izip
+from operator import itemgetter
+import ldig
+
+
+class LdigDetector(object):
+    """In-memory language detector, based on `server.py:Detector`.
+
+    `detect(text)` returns `(label, probability)` pairs, most probable first.
+    With `normalize=False` it is bound to `_detect` and skips normalization,
+    which might be slightly faster for already-normalized text;
+    `_detect_normalize` and `_detect` may also be called directly.
+    """
+    def __init__(self, modeldir, normalize=True):
+        # Load the model pieces that `_detect` reads on every call.
+        model = ldig.ldig(modeldir)
+        self.ldig = model
+        self.features = model.load_features()
+        self.trie = model.load_da()
+        self.labels = model.load_labels()
+        self.param = np.load(model.param)
+        # Bind the public entry point according to the `normalize` flag.
+        self.detect = self._detect_normalize if normalize else self._detect
+
+    def _detect_normalize(self, text):
+        """Normalize `text` with `ldig.normalize_text`, then classify it."""
+        _label, normalized, _original = ldig.normalize_text(text)
+        return self._detect(normalized)
+
+    def _detect(self, text):
+        """Return `(label, probability)` pairs, most probable first."""
+        counts = self.trie.extract_features(u"\u0001" + text + u"\u0001")
+        scores = np.zeros(len(self.labels))
+        by_feature = sorted(counts, key=lambda fid: self.features[fid][0])
+        for fid in by_feature:
+            scores += self.param[fid, ] * counts[fid]
+        # Softmax; subtracting the max first keeps np.exp from overflowing.
+        weights = np.exp(scores - scores.max())
+        prob = weights / weights.sum()
+        return sorted(izip(self.labels, prob), key=itemgetter(1), reverse=True)
diff --git a/ldig/ldigcpp/CMakeLists.txt b/ldig/ldigcpp/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Build script for the C++ version of ldig: the `ldig` and `ldig_test` targets.
+cmake_minimum_required(VERSION 2.8)
+project(ldig)
+
+# NOTE(review): these are the C flag variables, but every source listed below
+# is a .cpp file -- confirm whether the CMAKE_CXX_FLAGS_* variants were meant.
+set(CMAKE_C_FLAGS_RELEASE "-Wall -O2")
+set(CMAKE_C_FLAGS_DEBUG "-g")
+# Unconditionally select the Release configuration.
+set(CMAKE_BUILD_TYPE Release)
+# Compile C++ as the GNU dialect of C++0x.
+set(CMAKE_CXX_FLAGS "-std=gnu++0x")
+# Put built executables under bin/.
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin/")
+
+# Header search paths: the project root, ldig/, and the cybozulib headers.
+include_directories("${PROJECT_SOURCE_DIR}")
+include_directories("${PROJECT_SOURCE_DIR}/ldig")
+include_directories("${PROJECT_SOURCE_DIR}/cybozulib/include")
+
+# NOTE(review): Boost.Regex (1.46 or newer) is looked up without REQUIRED, so
+# when it is not found neither target below is defined -- confirm this silent
+# skip is intended.
+find_package(Boost 1.46 COMPONENTS regex)
+if(Boost_FOUND)
+  include_directories(${Boost_INCLUDE_DIRS})
+  # Main `ldig` executable.
+  add_executable(ldig ldig/ldig.cpp)
+  target_link_libraries(ldig ${Boost_LIBRARIES})
+  # Test executable built from the model and da test sources.
+  add_executable(ldig_test ldigtest/test_model.cpp ldigtest/test_da.cpp)
+  target_link_libraries(ldig_test ${Boost_LIBRARIES})
+endif()
+
+
diff --git a/ldig/ldigcpp/esaxx/COPYING b/ldig/ldigcpp/esaxx/COPYING
@@ -0,0 +1,24 @@
+This is the esaxx copyright.
+
+Copyright (c) 2010 Daisuke Okanohara All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
diff --git a/ldig/ldigcpp/esaxx/README b/ldig/ldigcpp/esaxx/README
@@ -0,0 +1,34 @@
+ESAXX
+----------------------
+
+This library provides the implementation of enhanced suffix array.
+For an input text of length N, this library builds an enhanced suffix array in O(N) time
+using 20N bytes. 
+
+For suffix array construction, I use sais.hxx, the induced sorting algorithm 
+implemented by Yuta Mori. 
+
+It also provides the program to enumerate the statistics of all substrings in the text.
+
+> enum_substring
+  Enumerate all substrings
+> enum_substring -w 
+  Input is a sequence of words separated by spaces.
+
+Example: 
+------------------
+$ cat abra
+abracadabra
+$ enum_substring < abra
+    n:11
+alpha:256
+ node:5
+0       2       4       abra
+1       5       1       a
+2       2       3       bra
+3       2       2       ra
+4       11      0
+
+$ enum_substring -w < wiki.txt > 
+
+Daisuke Okanohara <daisuke dot okanohara at gmail.com>