From 84fc93f6d8e575c36645e77972d39466f207338c Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 29 Dec 2021 15:14:15 +0300 Subject: [PATCH 01/17] Add OCR engine as an Enum .gitignore - update to include .vscode/ files main.py - import Enum and unique from enum module to declare for the new OCREngine_VAL enum class - declare OCREngine_VAL enum class and insert two values, engine_1 which is equal to 1 and engine_2 which is equal to 2 - use the declared enum class in API.__init__, make engine_1 default - check if the passed value for ocrengine is an instance of OCREngine_VAL (to enforce usage of the enum class) and the value is no different from 1 or 2. - use the ocrengine variable inside payload example.py - update to use the OCREngine_VAL enum class when wanting to change the ocrengine of the api README.md - clean up the README.md file with spaces and well defined programming languages for the code blocks - insert a description on how to change the ocrengine with example --- .gitignore | 3 +++ README.md | 28 ++++++++++++++++++++++++---- example.py | 6 +++++- ocrspace/main.py | 25 ++++++++++++++++++++++++- 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 3fab126..3e31511 100644 --- a/.gitignore +++ b/.gitignore @@ -88,3 +88,6 @@ ENV/ # Rope project settings .ropeproject MANIFEST + +# vscode +.vscode/ diff --git a/README.md b/README.md index 6108ec4..712635f 100644 --- a/README.md +++ b/README.md @@ -3,34 +3,54 @@ > A Python wrapper for using the [ocr.space API](https://ocr.space/ocrapi). 
## Installation + Simply install from `pip`: + ```sh pip install ocrspace ``` ## Use + First you'll need to import and instantiate the API wrapper: -```py + +```python import ocrspace api = ocrspace.API() # Or if you have a custom API host, API key or desired language, pass those: api = ocrspace.API(endpoint='https://example.host', api_key='Insert key here', language=ocrspace.Language.Croatian) ``` + +If you wish to change the OCR used, you'll have to import the enum class OCREngine_VAL from ocrspace and pass the value of OCREngine_VAL.val_2 to the api instantiation. By default it uses OCREngine_VAL.val_1: + +```python +import ocrspace +from ocrspace import OCREngine_VAL +api = ocrspace.API(ocrengine=OCREngine_VAL.val_2) +# Or if you have a custom API host, API key or desired language, pass those: +api = ocrspace.API(endpoint='https://example.host', api_key='Insert key here', language=ocrspace.Language.Croatian, ocrengine=OCREngine_VAL.val_2) +``` + To perform recognition on an image hosted at some URL: -```py + +```python api.ocr_url('URL of image goes here') ``` + Or, if you have an image locally upon which to perform recognition: -```py + +```python api.ocr_file('image.jpg') # or: api.ocr_file(open('image.jpg', 'rb')) # or any other file pointer ``` -That's it! Look at [`example.py`](example.py) for a demonstration. +That's it! Look at [`example.py`](example.py) for a demonstration. ## Authorship + This package was created by [Ali Najafi](https://github.com/a4fr) and is maintained by [Erik Boesen](https://github.com/ErikBoesen). 
## License + [MIT](LICENSE) diff --git a/example.py b/example.py index 8250693..897ba54 100644 --- a/example.py +++ b/example.py @@ -1,8 +1,12 @@ import ocrspace +from ocrspace import OCREngine_VAL import requests - +# api with the default ocrengine aka engine 1 api = ocrspace.API() + +# api with engine 2 +api_with_engine_two = ocrspace.API(ocrengine=OCREngine_VAL.engine_2) TEST_IMAGE_URL = 'https://images-na.ssl-images-amazon.com/images/I/71ovNJN1URL._SL1244_.jpg' print('Testing URL-based OCR:') diff --git a/ocrspace/main.py b/ocrspace/main.py index eff38f4..cf870ab 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -1,4 +1,5 @@ import requests +from enum import (Enum, unique) class Language: @@ -28,25 +29,48 @@ class Language: Turkish = 'tur' +@unique +class OCREngine_VAL(Enum): + """ + OCRengine_VAL: the OCRengine to use, values can only be 1 or 2 + + Args: + Enum: python's generic enumeration that is used to define new enumerations. + """ + engine_1 = 1 + engine_2 = 2 + + class API: def __init__( self, endpoint='https://api.ocr.space/parse/image', api_key='helloworld', language=Language.English, + ocrengine=OCREngine_VAL.engine_1, **kwargs, ): """ :param endpoint: API endpoint to contact :param api_key: API key string :param language: document language + :param ocrengine: ocr engine to use :param **kwargs: other settings to API """ + if not isinstance(ocrengine, OCREngine_VAL): + raise ValueError( + "the value of ocrengine must be an instance of OCREngine_VAL enum class" + ) + if ocrengine.value != 1 and ocrengine.value != 2: + raise Exception( + "the value of ocrengine must be either 1 or 2, import & use OCREngine_VAL" + ) self.endpoint = endpoint self.payload = { 'isOverlayRequired': True, 'apikey': api_key, 'language': language, + 'OCREngine': ocrengine.value, **kwargs } @@ -57,7 +81,6 @@ def _parse(self, raw): raise Exception(raw['ErrorMessage'][0]) return raw['ParsedResults'][0]['ParsedText'] - def ocr_file(self, fp): """ Process image from a local 
path. From 404334c906db6645861e79a7feb62794653fa303 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 29 Dec 2021 15:22:12 +0300 Subject: [PATCH 02/17] correct typo main.py - correct the typo when checking if it's an instance of OCREngine_VAL --- ocrspace/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrspace/main.py b/ocrspace/main.py index cf870ab..e492ab6 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -59,7 +59,7 @@ def __init__( """ if not isinstance(ocrengine, OCREngine_VAL): raise ValueError( - "the value of ocrengine must be an instance of OCREngine_VAL enum class" + "ocrengine must be an instance of OCREngine_VAL enum class" ) if ocrengine.value != 1 and ocrengine.value != 2: raise Exception( From 6ade1cd25e7e2677eff27a057f78efc25f84c691 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 29 Dec 2021 15:29:19 +0300 Subject: [PATCH 03/17] Update example.py to include usage of engine_two main.py - give better description when throwing exception example.py - include example using api_with_engine_two --- example.py | 10 +++++++++- ocrspace/main.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/example.py b/example.py index 897ba54..b3869b3 100644 --- a/example.py +++ b/example.py @@ -11,8 +11,9 @@ print('Testing URL-based OCR:') print(api.ocr_url(TEST_IMAGE_URL)) +print('Testing URL-based OCR using engine_two:') +print(api_with_engine_two.ocr_url(TEST_IMAGE_URL)) -print('Testing file-based OCR:') # Download image for demo purposes TEST_FILENAME = '/tmp/test_image.jpg' with open(TEST_FILENAME, 'wb') as f: @@ -20,7 +21,14 @@ r.raw.decode_content = True f.write(r.content) +print('Testing file-based OCR:') # With file path print(api.ocr_file(TEST_FILENAME)) # With file pointer print(api.ocr_file(open(TEST_FILENAME, 'rb'))) + +print('Testing file-based OCR using engine_two:') +# With file path +print(api_with_engine_two.ocr_file(TEST_FILENAME)) +# With file pointer 
+print(api_with_engine_two.ocr_file(open(TEST_FILENAME, 'rb'))) diff --git a/ocrspace/main.py b/ocrspace/main.py index e492ab6..afa8d1a 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -63,7 +63,7 @@ def __init__( ) if ocrengine.value != 1 and ocrengine.value != 2: raise Exception( - "the value of ocrengine must be either 1 or 2, import & use OCREngine_VAL" + "the value of ocrengine must be either 1 or 2, import & use ocrspace.OCREngine_VAL" ) self.endpoint = endpoint self.payload = { From bf672d3c442041b255d3283b65821aaa106e3879 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 29 Dec 2021 15:34:23 +0300 Subject: [PATCH 04/17] Update raised error types main.py - change the error raised for isinstance() failure from ValueError to TypeError - change the error raised for a wrong value from Exception() to ValueError() --- ocrspace/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrspace/main.py b/ocrspace/main.py index afa8d1a..0a15c75 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -58,11 +58,11 @@ def __init__( :param **kwargs: other settings to API """ if not isinstance(ocrengine, OCREngine_VAL): - raise ValueError( + raise TypeError( "ocrengine must be an instance of OCREngine_VAL enum class" ) if ocrengine.value != 1 and ocrengine.value != 2: - raise Exception( + raise ValueError( "the value of ocrengine must be either 1 or 2, import & use ocrspace.OCREngine_VAL" ) self.endpoint = endpoint From 8398ee9a3354a47ce08d23199f90d8853a107474 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 29 Dec 2021 16:13:00 +0300 Subject: [PATCH 05/17] Update setup.py to include the needed package setup.py - include the dependency needed for the lib inside setup() --- setup.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 0245b81..91d2aac 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,14 @@ from distutils.core import setup setup( - name = 'ocrspace', - packages
= ['ocrspace'], # this must be the same as the name above - version = '2.3.0', - description = 'Perform OCR through ocr.space API', - author = ['Ali Najafi', 'Erik Boesen'], - author_email = 'me@erikboesen.com', - url = 'https://github.com/ErikBoesen/ocrspace', - keywords = ['ocr'], - classifiers = [], + name='ocrspace', + packages=['ocrspace'], # this must be the same as the name above + requires=['requests'], + version='2.3.0', + description='Perform OCR through ocr.space API', + author=['Ali Najafi', 'Erik Boesen'], + author_email='me@erikboesen.com', + url='https://github.com/ErikBoesen/ocrspace', + keywords=['ocr'], + classifiers=[], ) From 4a9fa29b1edb8b9902c97ea2b48cb0cac78a6a53 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 29 Dec 2021 16:36:35 +0300 Subject: [PATCH 06/17] update requirements.txt to list out all that are needed --- requirements.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f229360..a4973f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,6 @@ -requests +certifi==2021.10.8 +charset-normalizer==2.0.9 +idna==3.3 +pycodestyle==2.8.0 +requests==2.26.0 +urllib3==1.26.7 From 974318420ac9d7427f5d756cbb80318f4e09d407 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 29 Dec 2021 18:33:06 +0300 Subject: [PATCH 07/17] Timeout exception, update README README.md - update README to describe getting apikey from Free OCR API. main.py - refactor processing of files into its own method named query_api(). - pass timeout parameter to requests.post() inside query_api() to specify how long to wait before raising a Timeout error instead of hanging the process for an undefined amount of time. - call query_api() in all the ocr processors.
--- README.md | 15 ++++++++++--- ocrspace/main.py | 55 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 712635f..7cb9871 100644 --- a/README.md +++ b/README.md @@ -12,21 +12,30 @@ pip install ocrspace ## Use +Before anything else, you need to get your apikey from [Free OCR API](https://ocr.space/OCRAPI) otherwise the default apikey -- i.e **helloworld** -- will be used and according to the developers of [Free OCR API](https://ocr.space/faq#span12): +> This key is great for a quick test, but do not use it in your project, as it is severely rate limited. + First you'll need to import and instantiate the API wrapper: ```python import ocrspace -api = ocrspace.API() + +api_key = "apikey retrieved from Free OCR API" + +api = ocrspace.API(api_key=api_key) # Or if you have a custom API host, API key or desired language, pass those: api = ocrspace.API(endpoint='https://example.host', api_key='Insert key here', language=ocrspace.Language.Croatian) ``` -If you wish to change the OCR used, you'll have to import the enum class OCREngine_VAL from ocrspace and pass the value of OCREngine_VAL.val_2 to the api instantiation. By default it uses OCREngine_VAL.val_1: +If you wish to change the OCR engine used, you'll have to import the enum class OCREngine_VAL from ocrspace and pass the value of OCREngine_VAL.val_2 to the api instantiation. 
By default it uses OCREngine_VAL.val_1: ```python import ocrspace from ocrspace import OCREngine_VAL -api = ocrspace.API(ocrengine=OCREngine_VAL.val_2) + +api_key = "apikey retrieved from Free OCR API" + +api = ocrspace.API(api_key=api_key, ocrengine=OCREngine_VAL.val_2) # Or if you have a custom API host, API key or desired language, pass those: api = ocrspace.API(endpoint='https://example.host', api_key='Insert key here', language=ocrspace.Language.Croatian, ocrengine=OCREngine_VAL.val_2) ``` diff --git a/ocrspace/main.py b/ocrspace/main.py index 0a15c75..e2541d0 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -81,6 +81,42 @@ def _parse(self, raw): raise Exception(raw['ErrorMessage'][0]) return raw['ParsedResults'][0]['ParsedText'] + def query_api(self, url_data=None, pic_file=None): + """ + Process the provided parameter. + :param url_data: Either an Image url or base64image + :param pic_file: A path or pointer to image file + :return: Result in JSON format + :raise: request.exceptions or general Exception + """ + try: + if pic_file: + r = requests.post( + self.endpoint, + files={'filename': pic_file}, + data=self.payload, + timeout=30 + ) + if url_data: + r = requests.post( + self.endpoint, + data=url_data, + timeout=30 + ) + r.raise_for_status() + except requests.exceptions.Timeout as time_out: + raise time_out + except requests.exceptions.TooManyRedirects as too_man_redirects: + raise too_man_redirects + except requests.exceptions.HTTPError as http_error: + raise http_error + except requests.exceptions.RequestException as request_exception: + raise request_exception + except Exception as e: + raise e + else: + return self._parse(r.json()) + def ocr_file(self, fp): """ Process image from a local path. 
@@ -88,12 +124,7 @@ def ocr_file(self, fp): :return: Result in JSON format """ with (open(fp, 'rb') if type(fp) == str else fp) as f: - r = requests.post( - self.endpoint, - files={'filename': f}, - data=self.payload, - ) - return self._parse(r.json()) + return self.query_api(pic_file=f) def ocr_url(self, url): """ @@ -103,11 +134,7 @@ def ocr_url(self, url): """ data = self.payload data['url'] = url - r = requests.post( - self.endpoint, - data=data, - ) - return self._parse(r.json()) + return self.query_api(url_data=data) def ocr_base64(self, base64image): """ @@ -117,8 +144,4 @@ def ocr_base64(self, base64image): """ data = self.payload data['base64Image'] = base64image - r = requests.post( - self.endpoint, - data=data, - ) - return self._parse(r.json()) + return self.query_api(url_data=data) From 68fda597004293b354f81a2491e22265fc79a733 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 29 Dec 2021 18:43:36 +0300 Subject: [PATCH 08/17] update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7cb9871..dfa8e24 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ api_key = "apikey retrieved from Free OCR API" api = ocrspace.API(api_key=api_key) # Or if you have a custom API host, API key or desired language, pass those: -api = ocrspace.API(endpoint='https://example.host', api_key='Insert key here', language=ocrspace.Language.Croatian) +api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian) ``` If you wish to change the OCR engine used, you'll have to import the enum class OCREngine_VAL from ocrspace and pass the value of OCREngine_VAL.val_2 to the api instantiation. 
By default it uses OCREngine_VAL.val_1: @@ -37,7 +37,7 @@ api_key = "apikey retrieved from Free OCR API" api = ocrspace.API(api_key=api_key, ocrengine=OCREngine_VAL.val_2) # Or if you have a custom API host, API key or desired language, pass those: -api = ocrspace.API(endpoint='https://example.host', api_key='Insert key here', language=ocrspace.Language.Croatian, ocrengine=OCREngine_VAL.val_2) +api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian, ocrengine=OCREngine_VAL.val_2) ``` To perform recognition on an image hosted at some URL: From 77f17fb8d05996861998827e5fac2838db2b4850 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 5 Jan 2022 17:50:13 +0300 Subject: [PATCH 09/17] Per request change main.py - rename OCREngine_VAL to Engine - capitalize the values inside class Engine - rename parameter ocrengine inside __init__() to engine - simplify raised TypeError's message - rename parameter pic_file to image_file in query_api() example.py - update example.py in accordance with main.py --- example.py | 4 ++-- ocrspace/main.py | 33 +++++++++++++++------------------ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/example.py b/example.py index b3869b3..07b3b3d 100644 --- a/example.py +++ b/example.py @@ -1,12 +1,12 @@ import ocrspace -from ocrspace import OCREngine_VAL +from ocrspace import Engine import requests # api with the default ocrengine aka engine 1 api = ocrspace.API() # api with engine 2 -api_with_engine_two = ocrspace.API(ocrengine=OCREngine_VAL.engine_2) +api_with_engine_two = ocrspace.API(engine=Engine.ENGINE_2) TEST_IMAGE_URL = 'https://images-na.ssl-images-amazon.com/images/I/71ovNJN1URL._SL1244_.jpg' print('Testing URL-based OCR:') diff --git a/ocrspace/main.py b/ocrspace/main.py index e2541d0..3a05b54 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -30,15 +30,12 @@ class Language: @unique -class OCREngine_VAL(Enum): +class Engine(Enum): """ - OCRengine_VAL: the 
OCRengine to use, values can only be 1 or 2 - - Args: - Enum: python's generic enumeration that is used to define new enumerations. + Engine: Enum representing the OCR engine to use """ - engine_1 = 1 - engine_2 = 2 + ENGINE_1 = 1 + ENGINE_2 = 2 class API: @@ -47,7 +44,7 @@ def __init__( endpoint='https://api.ocr.space/parse/image', api_key='helloworld', language=Language.English, - ocrengine=OCREngine_VAL.engine_1, + engine=Engine.ENGINE_1, **kwargs, ): """ @@ -57,20 +54,20 @@ def __init__( :param ocrengine: ocr engine to use :param **kwargs: other settings to API """ - if not isinstance(ocrengine, OCREngine_VAL): + if not isinstance(engine, Engine): raise TypeError( - "ocrengine must be an instance of OCREngine_VAL enum class" + "engine must be an instance of Engine" ) - if ocrengine.value != 1 and ocrengine.value != 2: + if engine.value != 1 and engine.value != 2: raise ValueError( - "the value of ocrengine must be either 1 or 2, import & use ocrspace.OCREngine_VAL" + "the value of engine must be either 1 or 2, import & use ocrspace.Engine" ) self.endpoint = endpoint self.payload = { 'isOverlayRequired': True, 'apikey': api_key, 'language': language, - 'OCREngine': ocrengine.value, + 'OCREngine': engine.value, **kwargs } @@ -81,19 +78,19 @@ def _parse(self, raw): raise Exception(raw['ErrorMessage'][0]) return raw['ParsedResults'][0]['ParsedText'] - def query_api(self, url_data=None, pic_file=None): + def query_api(self, url_data=None, image_file=None): """ Process the provided parameter. 
:param url_data: Either an Image url or base64image - :param pic_file: A path or pointer to image file + :param image_file: A path or file pointer to the image file :return: Result in JSON format :raise: request.exceptions or general Exception """ try: - if pic_file: + if image_file: r = requests.post( self.endpoint, - files={'filename': pic_file}, + files={'filename': image_file}, data=self.payload, timeout=30 ) @@ -124,7 +121,7 @@ def ocr_file(self, fp): :return: Result in JSON format """ with (open(fp, 'rb') if type(fp) == str else fp) as f: - return self.query_api(pic_file=f) + return self.query_api(image_file=f) def ocr_url(self, url): """ From f093f67eac865e4617380d1cdeaa315e8d281ae1 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 5 Jan 2022 18:29:17 +0300 Subject: [PATCH 10/17] Per review #2 requirements.txt - remove unnecessarily listed dependencies - remove specific versioning of requests main.py - remove unnecessary except blocks --- ocrspace/main.py | 42 +++++++++++++++--------------------------- requirements.txt | 7 +------ 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/ocrspace/main.py b/ocrspace/main.py index 3a05b54..a45cf9e 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -86,33 +86,21 @@ def query_api(self, url_data=None, image_file=None): :return: Result in JSON format :raise: request.exceptions or general Exception """ - try: - if image_file: - r = requests.post( - self.endpoint, - files={'filename': image_file}, - data=self.payload, - timeout=30 - ) - if url_data: - r = requests.post( - self.endpoint, - data=url_data, - timeout=30 - ) - r.raise_for_status() - except requests.exceptions.Timeout as time_out: - raise time_out - except requests.exceptions.TooManyRedirects as too_man_redirects: - raise too_man_redirects - except requests.exceptions.HTTPError as http_error: - raise http_error - except requests.exceptions.RequestException as request_exception: - raise request_exception - except Exception as e: - raise 
e - else: - return self._parse(r.json()) + if image_file: + r = requests.post( + self.endpoint, + files={'filename': image_file}, + data=self.payload, + timeout=30 + ) + if url_data: + r = requests.post( + self.endpoint, + data=url_data, + timeout=30 + ) + r.raise_for_status() + return self._parse(r.json()) def ocr_file(self, fp): """ diff --git a/requirements.txt b/requirements.txt index a4973f2..f229360 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1 @@ -certifi==2021.10.8 -charset-normalizer==2.0.9 -idna==3.3 -pycodestyle==2.8.0 -requests==2.26.0 -urllib3==1.26.7 +requests From 8d58ee9640bc746cc30de63d8c1d54f449ee3806 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 5 Jan 2022 18:29:17 +0300 Subject: [PATCH 11/17] Per 2nd review requirements.txt - remove unnecessarily listed dependencies - remove specific versioning of requests main.py - remove unnecessary except blocks --- ocrspace/main.py | 42 +++++++++++++++--------------------------- requirements.txt | 7 +------ 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/ocrspace/main.py b/ocrspace/main.py index 3a05b54..a45cf9e 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -86,33 +86,21 @@ def query_api(self, url_data=None, image_file=None): :return: Result in JSON format :raise: request.exceptions or general Exception """ - try: - if image_file: - r = requests.post( - self.endpoint, - files={'filename': image_file}, - data=self.payload, - timeout=30 - ) - if url_data: - r = requests.post( - self.endpoint, - data=url_data, - timeout=30 - ) - r.raise_for_status() - except requests.exceptions.Timeout as time_out: - raise time_out - except requests.exceptions.TooManyRedirects as too_man_redirects: - raise too_man_redirects - except requests.exceptions.HTTPError as http_error: - raise http_error - except requests.exceptions.RequestException as request_exception: - raise request_exception - except Exception as e: - raise e - else: - return self._parse(r.json()) + if 
image_file: + r = requests.post( + self.endpoint, + files={'filename': image_file}, + data=self.payload, + timeout=30 + ) + if url_data: + r = requests.post( + self.endpoint, + data=url_data, + timeout=30 + ) + r.raise_for_status() + return self._parse(r.json()) def ocr_file(self, fp): """ diff --git a/requirements.txt b/requirements.txt index a4973f2..f229360 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1 @@ -certifi==2021.10.8 -charset-normalizer==2.0.9 -idna==3.3 -pycodestyle==2.8.0 -requests==2.26.0 -urllib3==1.26.7 +requests From 96a7654cb060145bd4e3c4798326cb008b8c72c6 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 5 Jan 2022 18:29:17 +0300 Subject: [PATCH 12/17] Per 2nd review requirements.txt - remove unnecessarily listed dependencies - remove specific versioning of requests main.py - remove unnecessary except blocks --- ocrspace/main.py | 42 +++++++++++++++--------------------------- requirements.txt | 7 +------ 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/ocrspace/main.py b/ocrspace/main.py index 3a05b54..a45cf9e 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -86,33 +86,21 @@ def query_api(self, url_data=None, image_file=None): :return: Result in JSON format :raise: request.exceptions or general Exception """ - try: - if image_file: - r = requests.post( - self.endpoint, - files={'filename': image_file}, - data=self.payload, - timeout=30 - ) - if url_data: - r = requests.post( - self.endpoint, - data=url_data, - timeout=30 - ) - r.raise_for_status() - except requests.exceptions.Timeout as time_out: - raise time_out - except requests.exceptions.TooManyRedirects as too_man_redirects: - raise too_man_redirects - except requests.exceptions.HTTPError as http_error: - raise http_error - except requests.exceptions.RequestException as request_exception: - raise request_exception - except Exception as e: - raise e - else: - return self._parse(r.json()) + if image_file: + r = requests.post( + self.endpoint, + 
files={'filename': image_file}, + data=self.payload, + timeout=30 + ) + if url_data: + r = requests.post( + self.endpoint, + data=url_data, + timeout=30 + ) + r.raise_for_status() + return self._parse(r.json()) def ocr_file(self, fp): """ diff --git a/requirements.txt b/requirements.txt index a4973f2..f229360 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1 @@ -certifi==2021.10.8 -charset-normalizer==2.0.9 -idna==3.3 -pycodestyle==2.8.0 -requests==2.26.0 -urllib3==1.26.7 +requests From 593ed79a87518cf20f23d57384b43c861f8ac9a4 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 5 Jan 2022 19:07:34 +0300 Subject: [PATCH 13/17] Add a headers parameter, move apikey to headers main.py - remove key api_key from payload dict - insert new self attribute named api_key - add a headers parameter in requests.post() and place api_key in headers This was done in accordance with the documentation and examples provided by Free OCR API https://ocr.space/ocrapi#postman --- ocrspace/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ocrspace/main.py b/ocrspace/main.py index a45cf9e..6fc6957 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -63,9 +63,9 @@ def __init__( "the value of engine must be either 1 or 2, import & use ocrspace.Engine" ) self.endpoint = endpoint + self.api_key = api_key self.payload = { 'isOverlayRequired': True, - 'apikey': api_key, 'language': language, 'OCREngine': engine.value, **kwargs @@ -86,9 +86,11 @@ def query_api(self, url_data=None, image_file=None): :return: Result in JSON format :raise: request.exceptions or general Exception """ + if image_file: r = requests.post( self.endpoint, + headers={'apikey': self.api_key}, files={'filename': image_file}, data=self.payload, timeout=30 @@ -96,6 +98,7 @@ def query_api(self, url_data=None, image_file=None): if url_data: r = requests.post( self.endpoint, + headers={'apikey': self.api_key}, data=url_data, timeout=30 ) From 
63929964b74343a113172bcbc3f3aeec5c72de65 Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Wed, 5 Jan 2022 19:53:20 +0300 Subject: [PATCH 14/17] Per 3rd review main.py - remove Engine: from docstring for Engine - rename ocreengine to engine in docstring for API - remove import & from ValueError message - update query_api() to if/elif/else statement when querying OCR api README.md - use clear & concise wording in the Use section - rename OCREngine_VAL to Engine - rename OCRENGINE_VAL.val_1 & OCRENGINE_VAL.val_2 to Engine.ENGINE_2 & ENGINE.ENGINE_1 --- README.md | 13 ++++++------- ocrspace/main.py | 10 ++++++---- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index dfa8e24..44983ca 100644 --- a/README.md +++ b/README.md @@ -12,32 +12,31 @@ pip install ocrspace ## Use -Before anything else, you need to get your apikey from [Free OCR API](https://ocr.space/OCRAPI) otherwise the default apikey -- i.e **helloworld** -- will be used and according to the developers of [Free OCR API](https://ocr.space/faq#span12): -> This key is great for a quick test, but do not use it in your project, as it is severely rate limited. +First, get an API key from [Free OCR API](https://ocr.space/OCRAPI), otherwise the default apikey ```helloworld``` will be used, which is [severely rate limited.](https://ocr.space/faq#span12) First you'll need to import and instantiate the API wrapper: ```python import ocrspace -api_key = "apikey retrieved from Free OCR API" +api_key = 'apikey retrieved from Free OCR API' api = ocrspace.API(api_key=api_key) # Or if you have a custom API host, API key or desired language, pass those: api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian) ``` -If you wish to change the OCR engine used, you'll have to import the enum class OCREngine_VAL from ocrspace and pass the value of OCREngine_VAL.val_2 to the api instantiation. 
By default it uses OCREngine_VAL.val_1: +If you wish to change the OCR engine used, you'll have to import the enum class Engine from ocrspace and pass the value of Engine.ENGINE_2 to the api instantiation. By default it uses Engine.val_1: ```python import ocrspace -from ocrspace import OCREngine_VAL +from ocrspace import Engine api_key = "apikey retrieved from Free OCR API" -api = ocrspace.API(api_key=api_key, ocrengine=OCREngine_VAL.val_2) +api = ocrspace.API(api_key=api_key, engine=Engine.ENGINE_2) # Or if you have a custom API host, API key or desired language, pass those: -api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian, ocrengine=OCREngine_VAL.val_2) +api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian, engine=Engine.ENGINE_2) ``` To perform recognition on an image hosted at some URL: diff --git a/ocrspace/main.py b/ocrspace/main.py index a45cf9e..e571550 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -32,7 +32,7 @@ class Language: @unique class Engine(Enum): """ - Engine: Enum representing the OCR engine to use + Enum representing the OCR engine to use """ ENGINE_1 = 1 ENGINE_2 = 2 @@ -51,7 +51,7 @@ def __init__( :param endpoint: API endpoint to contact :param api_key: API key string :param language: document language - :param ocrengine: ocr engine to use + :param engine: ocr engine to use :param **kwargs: other settings to API """ if not isinstance(engine, Engine): @@ -60,7 +60,7 @@ def __init__( ) if engine.value != 1 and engine.value != 2: raise ValueError( - "the value of engine must be either 1 or 2, import & use ocrspace.Engine" + "the value of engine must be either 1 or 2, use ocrspace.Engine" ) self.endpoint = endpoint self.payload = { @@ -93,12 +93,14 @@ def query_api(self, url_data=None, image_file=None): data=self.payload, timeout=30 ) - if url_data: + elif url_data: r = requests.post( self.endpoint, data=url_data, timeout=30 ) + 
else: + raise TypeError("either image_file or url_data must be provided") r.raise_for_status() return self._parse(r.json()) From dbcf91afd16977d60018f2985188080ed8b987ee Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Thu, 6 Jan 2022 19:39:34 +0300 Subject: [PATCH 15/17] Per the 4th review README.md - use single quotes instead of triple codes for describing the apikey in the Use section - remove description and import of Engine, use instead full package name - update example code to use full package name for Engine --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 44983ca..7889513 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ pip install ocrspace ## Use -First, get an API key from [Free OCR API](https://ocr.space/OCRAPI), otherwise the default apikey ```helloworld``` will be used, which is [severely rate limited.](https://ocr.space/faq#span12) +First, get an API key from [Free OCR API](https://ocr.space/OCRAPI), otherwise the default apikey `helloworld` will be used, which is [severely rate limited.](https://ocr.space/faq#span12) First you'll need to import and instantiate the API wrapper: @@ -26,17 +26,16 @@ api = ocrspace.API(api_key=api_key) api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian) ``` -If you wish to change the OCR engine used, you'll have to import the enum class Engine from ocrspace and pass the value of Engine.ENGINE_2 to the api instantiation. By default it uses Engine.val_1: +If you wish to change the OCR engine used, use the enum class Engine from ocrspace and pass the value of ocrspace.Engine.Engine.ENGINE_2 to the api instantiation. 
By default it uses ocrspace.Engine.Engine.ENGINE_1: ```python import ocrspace -from ocrspace import Engine api_key = "apikey retrieved from Free OCR API" -api = ocrspace.API(api_key=api_key, engine=Engine.ENGINE_2) +api = ocrspace.API(api_key=api_key, engine=ocrspace.Engine.Engine.ENGINE_2) # Or if you have a custom API host, API key or desired language, pass those: -api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian, engine=Engine.ENGINE_2) +api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian, engine=ocrspace.Engine.Engine.ENGINE_2) ``` To perform recognition on an image hosted at some URL: From 52c143b310e4f550cd68b6cbc820cdffbad2315d Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Thu, 19 May 2022 13:06:23 +0300 Subject: [PATCH 16/17] add maintainer's name --- LICENSE | 5 ++++- README.md | 2 +- setup.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/LICENSE b/LICENSE index 2f49a2e..f764634 100644 --- a/LICENSE +++ b/LICENSE @@ -1,8 +1,11 @@ -MIT License +# MIT License Copyright (c) 2017 Ali Najafi + Copyright (c) 2019 Erik Bøsen +Copyright (c) 2022 Negassa Berhanu + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights diff --git a/README.md b/README.md index 7889513..8d47ccc 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ That's it! Look at [`example.py`](example.py) for a demonstration. ## Authorship -This package was created by [Ali Najafi](https://github.com/a4fr) and is maintained by [Erik Boesen](https://github.com/ErikBoesen). +This package was created by [Ali Najafi]() and is maintained by [Erik Boesen] () and Negassa Berhanu (). 
## License diff --git a/setup.py b/setup.py index 91d2aac..8c537fa 100644 --- a/setup.py +++ b/setup.py @@ -6,8 +6,8 @@ requires=['requests'], version='2.3.0', description='Perform OCR through ocr.space API', - author=['Ali Najafi', 'Erik Boesen'], - author_email='me@erikboesen.com', + author=['Ali Najafi', 'Erik Boesen', 'Negassa Berhanu'], + author_email='forpurposes1435@gmail.com', url='https://github.com/ErikBoesen/ocrspace', keywords=['ocr'], classifiers=[], From c9ffd78ddc93ede49af145fe706ee269c4e0a42c Mon Sep 17 00:00:00 2001 From: Negassa Berhanu Date: Sun, 22 May 2022 00:50:27 +0300 Subject: [PATCH 17/17] Updated in accordance with the 6th review LICENSE - remove h1 tag from header README.md - remove duplicates of Engine - remove spaces from the authors' names & urls setup.py - bump version to 2.4.0 - restore author's email main.py - remove unique and parentheses from enum import - replace Enum import by IntEnum - replace double quotes with single quotes - rename url_data to image_url with a better description in the docs --- LICENSE | 4 +--- README.md | 8 ++++---- ocrspace/main.py | 27 +++++++++++---------------- setup.py | 4 ++-- 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/LICENSE b/LICENSE index f764634..008c8a1 100644 --- a/LICENSE +++ b/LICENSE @@ -1,9 +1,7 @@ -# MIT License + MIT License Copyright (c) 2017 Ali Najafi - Copyright (c) 2019 Erik Bøsen - Copyright (c) 2022 Negassa Berhanu Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/README.md b/README.md index 8d47ccc..a68ea23 100644 --- a/README.md +++ b/README.md @@ -26,16 +26,16 @@ api = ocrspace.API(api_key=api_key) api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian) ``` -If you wish to change the OCR engine used, use the enum class Engine from ocrspace and pass the value of ocrspace.Engine.Engine.ENGINE_2 to the api instantiation.
By default it uses ocrspace.Engine.Engine.ENGINE_1: +If you wish to change the OCR engine used, use the enum class Engine from ocrspace and pass the value of ocrspace.Engine.ENGINE_2 to the api instantiation. By default it uses ocrspace.Engine.ENGINE_1: ```python import ocrspace api_key = "apikey retrieved from Free OCR API" -api = ocrspace.API(api_key=api_key, engine=ocrspace.Engine.Engine.ENGINE_2) +api = ocrspace.API(api_key=api_key, engine=ocrspace.Engine.ENGINE_2) # Or if you have a custom API host, API key or desired language, pass those: -api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian, engine=ocrspace.Engine.Engine.ENGINE_2) +api = ocrspace.API(endpoint='https://example.host', api_key=api_key, language=ocrspace.Language.Croatian, engine=ocrspace.Engine.ENGINE_2) ``` To perform recognition on an image hosted at some URL: @@ -56,7 +56,7 @@ That's it! Look at [`example.py`](example.py) for a demonstration. ## Authorship -This package was created by [Ali Najafi]() and is maintained by [Erik Boesen] () and Negassa Berhanu (). +This package was created by [Ali Najafi](https://github.com/a4fr) and is maintained by [Erik Boesen](https://github.com/ErikBoesen) and [Negassa Berhanu](https://github.com/NegassaB). 
## License diff --git a/ocrspace/main.py b/ocrspace/main.py index 069da52..37614c3 100644 --- a/ocrspace/main.py +++ b/ocrspace/main.py @@ -1,5 +1,5 @@ import requests -from enum import (Enum, unique) +from enum import IntEnum class Language: @@ -29,8 +29,7 @@ class Language: Turkish = 'tur' -@unique -class Engine(Enum): +class Engine(IntEnum): """ Enum representing the OCR engine to use """ @@ -55,13 +54,9 @@ def __init__( :param **kwargs: other settings to API """ if not isinstance(engine, Engine): - raise TypeError( - "engine must be an instance of Engine" - ) + raise TypeError('engine must be an instance of Engine') if engine.value != 1 and engine.value != 2: - raise ValueError( - "the value of engine must be either 1 or 2, use ocrspace.Engine" - ) + raise ValueError('the value of engine must be either 1 or 2, use ocrspace.Engine') self.endpoint = endpoint self.api_key = api_key self.payload = { @@ -78,10 +73,10 @@ def _parse(self, raw): raise Exception(raw['ErrorMessage'][0]) return raw['ParsedResults'][0]['ParsedText'] - def query_api(self, url_data=None, image_file=None): + def query_api(self, image_url=None, image_file=None): """ Process the provided parameter. 
- :param url_data: Either an Image url or base64image + :param image_url: An Image url or base64image encoded string :param image_file: A path or file pointer to the image file :return: Result in JSON format :raise: request.exceptions or general Exception @@ -95,15 +90,15 @@ def query_api(self, url_data=None, image_file=None): data=self.payload, timeout=30 ) - elif url_data: + elif image_url: r = requests.post( self.endpoint, headers={'apikey': self.api_key}, - data=url_data, + data=image_url, timeout=30 ) else: - raise TypeError("either image_file or url_data must be provided") + raise TypeError('either image_file or image_url must be provided') r.raise_for_status() return self._parse(r.json()) @@ -124,7 +119,7 @@ def ocr_url(self, url): """ data = self.payload data['url'] = url - return self.query_api(url_data=data) + return self.query_api(image_url=data) def ocr_base64(self, base64image): """ @@ -134,4 +129,4 @@ def ocr_base64(self, base64image): """ data = self.payload data['base64Image'] = base64image - return self.query_api(url_data=data) + return self.query_api(image_url=data) diff --git a/setup.py b/setup.py index 8c537fa..a2d54d9 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,10 @@ name='ocrspace', packages=['ocrspace'], # this must be the same as the name above requires=['requests'], - version='2.3.0', + version='2.4.0', description='Perform OCR through ocr.space API', author=['Ali Najafi', 'Erik Boesen', 'Negassa Berhanu'], - author_email='forpurposes1435@gmail.com', + author_email='me@erikboesen.com', url='https://github.com/ErikBoesen/ocrspace', keywords=['ocr'], classifiers=[],