-
Notifications
You must be signed in to change notification settings - Fork 22
ability to use different ocrengine as enum, update of example.py to include ocrengine, clean up of README.md, include .vscode/ in .gitignore, update setup.py to require requests #13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
84fc93f
404334c
6ade1cd
bf672d3
405f9da
8398ee9
4a9fa29
9743184
abdbfd6
68fda59
77f17fb
7b1f2d3
f093f67
88b2a8a
83799af
8d58ee9
665fccc
96a7654
c8ae64a
593ed79
fadf892
6392996
38f7398
ebb3b56
dbcf91a
52c143b
c9ffd78
649ed1f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -88,3 +88,6 @@ ENV/ | |
| # Rope project settings | ||
| .ropeproject | ||
| MANIFEST | ||
|
|
||
| # vscode | ||
| .vscode/ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,22 +1,34 @@ | ||
| import ocrspace | ||
| from ocrspace import Engine | ||
|
NegassaB marked this conversation as resolved.
|
||
| import requests | ||
|
|
||
|
|
||
| # api with the default ocrengine aka engine 1 | ||
| api = ocrspace.API() | ||
|
|
||
| # api with engine 2 | ||
| api_with_engine_two = ocrspace.API(engine=Engine.ENGINE_2) | ||
| TEST_IMAGE_URL = 'https://images-na.ssl-images-amazon.com/images/I/71ovNJN1URL._SL1244_.jpg' | ||
|
|
||
| print('Testing URL-based OCR:') | ||
| print(api.ocr_url(TEST_IMAGE_URL)) | ||
| print('Testing URL-based OCR using engine_two:') | ||
| print(api_with_engine_two.ocr_url(TEST_IMAGE_URL)) | ||
|
|
||
| print('Testing file-based OCR:') | ||
| # Download image for demo purposes | ||
| TEST_FILENAME = '/tmp/test_image.jpg' | ||
| with open(TEST_FILENAME, 'wb') as f: | ||
| r = requests.get(TEST_IMAGE_URL) | ||
| r.raw.decode_content = True | ||
| f.write(r.content) | ||
|
|
||
| print('Testing file-based OCR:') | ||
| # With file path | ||
| print(api.ocr_file(TEST_FILENAME)) | ||
| # With file pointer | ||
| print(api.ocr_file(open(TEST_FILENAME, 'rb'))) | ||
|
|
||
| print('Testing file-based OCR using engine_two:') | ||
| # With file path | ||
| print(api_with_engine_two.ocr_file(TEST_FILENAME)) | ||
| # With file pointer | ||
| print(api_with_engine_two.ocr_file(open(TEST_FILENAME, 'rb'))) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| import requests | ||
| from enum import IntEnum | ||
|
|
||
|
|
||
| class Language: | ||
|
|
@@ -28,25 +29,40 @@ class Language: | |
| Turkish = 'tur' | ||
|
|
||
|
|
||
| class Engine(IntEnum): | ||
| """ | ||
| Enum representing the OCR engine to use | ||
| """ | ||
| ENGINE_1 = 1 | ||
| ENGINE_2 = 2 | ||
|
|
||
|
|
||
| class API: | ||
| def __init__( | ||
| self, | ||
| endpoint='https://api.ocr.space/parse/image', | ||
| api_key='helloworld', | ||
| language=Language.English, | ||
| engine=Engine.ENGINE_1, | ||
| **kwargs, | ||
| ): | ||
| """ | ||
| :param endpoint: API endpoint to contact | ||
| :param api_key: API key string | ||
| :param language: document language | ||
| :param engine: ocr engine to use | ||
| :param **kwargs: other settings to API | ||
| """ | ||
| if not isinstance(engine, Engine): | ||
| raise TypeError('engine must be an instance of Engine') | ||
| if engine.value != 1 and engine.value != 2: | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think this check is needed; as far as I know, it's impossible to create an IntEnum member with an invalid value. |
||
| raise ValueError('the value of engine must be either 1 or 2, use ocrspace.Engine') | ||
| self.endpoint = endpoint | ||
| self.api_key = api_key | ||
| self.payload = { | ||
| 'isOverlayRequired': True, | ||
| 'apikey': api_key, | ||
| 'language': language, | ||
| 'OCREngine': engine.value, | ||
| **kwargs | ||
| } | ||
|
|
||
|
|
@@ -57,21 +73,44 @@ def _parse(self, raw): | |
| raise Exception(raw['ErrorMessage'][0]) | ||
| return raw['ParsedResults'][0]['ParsedText'] | ||
|
|
||
|
|
||
| def ocr_file(self, fp): | ||
| def query_api(self, image_url=None, image_file=None): | ||
| """ | ||
| Process image from a local path. | ||
| :param fp: A path or pointer to your file | ||
| Process the provided parameter. | ||
| :param image_url: An Image url or base64image encoded string | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You may have forgotten to use this parameter in the function. |
||
| :param image_file: A path or file pointer to the image file | ||
| :return: Result in JSON format | ||
| :raise: request.exceptions or general Exception | ||
| """ | ||
| with (open(fp, 'rb') if type(fp) == str else fp) as f: | ||
|
|
||
| if image_file: | ||
| r = requests.post( | ||
| self.endpoint, | ||
| files={'filename': f}, | ||
| headers={'apikey': self.api_key}, | ||
| files={'filename': image_file}, | ||
| data=self.payload, | ||
| timeout=30 | ||
| ) | ||
| elif image_url: | ||
| r = requests.post( | ||
| self.endpoint, | ||
| headers={'apikey': self.api_key}, | ||
| data=image_url, | ||
| timeout=30 | ||
| ) | ||
|
NegassaB marked this conversation as resolved.
|
||
| else: | ||
| raise TypeError('either image_file or image_url must be provided') | ||
| r.raise_for_status() | ||
| return self._parse(r.json()) | ||
|
|
||
| def ocr_file(self, fp): | ||
| """ | ||
| Process image from a local path. | ||
| :param fp: A path or pointer to your file | ||
| :return: Result in JSON format | ||
| """ | ||
| with (open(fp, 'rb') if type(fp) == str else fp) as f: | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is kind of dense; can we reorganize it to be a bit clearer?
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you clarify this? Do you want to reorganize the code or the documentation, and if so, how? "Dense" doesn't really communicate what you want done.
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The code. As in, split the ternary if/else into a full if/else statement across multiple lines, so that we don't have a lot of logic packed into a single line, which makes it difficult to read and understand at a glance. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The code could be made more readable (or more verbose). |
||
| return self.query_api(image_file=f) | ||
|
|
||
| def ocr_url(self, url): | ||
| """ | ||
| Process an image at a given URL. | ||
|
|
@@ -80,11 +119,7 @@ def ocr_url(self, url): | |
| """ | ||
| data = self.payload | ||
| data['url'] = url | ||
| r = requests.post( | ||
| self.endpoint, | ||
| data=data, | ||
| ) | ||
| return self._parse(r.json()) | ||
| return self.query_api(image_url=data) | ||
|
|
||
| def ocr_base64(self, base64image): | ||
| """ | ||
|
|
@@ -94,8 +129,4 @@ def ocr_base64(self, base64image): | |
| """ | ||
| data = self.payload | ||
| data['base64Image'] = base64image | ||
| r = requests.post( | ||
| self.endpoint, | ||
| data=data, | ||
| ) | ||
| return self._parse(r.json()) | ||
| return self.query_api(image_url=data) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,14 @@ | ||
| from distutils.core import setup | ||
|
|
||
| setup( | ||
| name = 'ocrspace', | ||
| packages = ['ocrspace'], # this must be the same as the name above | ||
| version = '2.3.0', | ||
| description = 'Perform OCR through ocr.space API', | ||
| author = ['Ali Najafi', 'Erik Boesen'], | ||
| author_email = 'me@erikboesen.com', | ||
| url = 'https://github.com/ErikBoesen/ocrspace', | ||
| keywords = ['ocr'], | ||
| classifiers = [], | ||
| name='ocrspace', | ||
| packages=['ocrspace'], # this must be the same as the name above | ||
| requires=['requests'], | ||
| version='2.4.0', | ||
| description='Perform OCR through ocr.space API', | ||
| author=['Ali Najafi', 'Erik Boesen', 'Negassa Berhanu'], | ||
| author_email='me@erikboesen.com', | ||
| url='https://github.com/ErikBoesen/ocrspace', | ||
| keywords=['ocr'], | ||
| classifiers=[], | ||
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need for leading space either