diff --git a/.gitignore b/.gitignore index c55de92..cdce0cc 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ logs/ .idea/**/dynamic.xml .idea/**/uiDesigner.xml .idea/**/dbnavigator.xml +*.info # Gradle .idea/**/gradle.xml @@ -174,6 +175,7 @@ venv.bak/ .mypy_cache/ ### VisualStudioCode ### +.vscode/ .vscode/* !.vscode/settings.json !.vscode/tasks.json diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..024875c --- /dev/null +++ b/.travis.yml @@ -0,0 +1,27 @@ +notifications: + email: + on_success: never + on_failure: always +language: python +python: 3.6 +addons: + apt: + update: true +env: + global: + - GH_USER=Twiddly + - GH_MAIL=pew@pewpew.moe + - REPO=happypandax/plugins + - secure: sogo+NT/2/Xr1LNmksFbMSoGCNCgVUcQynhkBiLqSfLrfT9zDXVYl4FGqIntARJSNK1BTvayV/XD0HOtMzbjeARoR91+NYgPScV3PRDu/Bw+X4yM8jjN7GZjz5+a5+co+A8cfuKdrf93CNVV6N02fxzqdC9lhoyp+HD7JxbWKl2+8YmQetrAD4dGf+KaVcJKHr/pgpBND5Tp17jO4vAEbqD+GT7aXDoC+81Onq8UOzNI0/A9s+2IzMlG4Jhfdr52ynpND1plycspByVI3kRqrFPEcWk4x2U5C8OKU6Hf4Zuj3G1d37c8MP6F9F362m6MWa4cfxOeKCmP6uRSVEf9Oxb/w2OnEg40U5nYEpBvziml1STINSgQQlwI4+iO0o1G4Zl7nRsyICZEAhzJRjoPcF5+IfzjGera0xxFkVd+0A9fz8kAyTp53BK5uAxgiaLcIfVj9WsZhSuJPG4kx0abnaNLPJtrDDTdBX9+A0Xk9sXHBCphFVOhobTUspmpCN4zWO9jH8xcPotRyo4D53I7yapSiia89yDxQNLnoNGDVPuX4KTMWO9w2snczEKo9rSJUDJVqdRU1LXbERFCoyCd33Rfm9EZsR06mtkjkpAL7YWOfbjSlyes87OBS3sRW7FqmQPEgSVbovmygOa564Yfo9B48doSKtg09aj0IBJxxXk= + + +script: + - python3 build.py + - git config --local user.name "$GH_USER" + - git config --local user.email "$GH_MAIL" + - git commit -am "build" + - git push --quiet "https://$GITHUB_TOKEN@github.com/$REPO.git" master > /dev/null 2>&1 + +branches: + only: + - master \ No newline at end of file diff --git a/Example Plugin/main.py b/Example Plugin/main.py index 68a8b62..30ea790 100644 --- a/Example Plugin/main.py +++ b/Example Plugin/main.py @@ -5,12 +5,15 @@ @hpx.subscribe("init") def inited(): + "Called when this plugin is initialised" pass @hpx.subscribe("disable") def 
disabled(): + "Called when this plugin has been disabled" pass @hpx.subscribe("remove") def removed(): + "Called when this plugin is about to be removed" pass \ No newline at end of file diff --git a/Example Plugin/tests.py b/Example Plugin/test.py similarity index 100% rename from Example Plugin/tests.py rename to Example Plugin/test.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0a04128 --- /dev/null +++ b/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. 
+ + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/README.md b/README.md new file mode 100644 index 0000000..d3142f9 --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ + +#### In this repository resides plugins for HappyPanda X. If you wish to write a plugin for HPX head over to [the docs](https://happypandax.github.io/plugin.html#plugins). + +### How to download + +I recommend these tools to download a single directory from this repo: +- https://minhaskamal.github.io/DownGit/ -- *Paste the url to the plugin folder in this repo* +- https://kinolien.github.io/gitzip/ -- *Paste the url to the plugin folder in this repo* +- [Firefox Addon](https://addons.mozilla.org/en-US/firefox/addon/gitzip/) +- [Chrome Extension](https://chrome.google.com/webstore/detail/gitzip-for-github/ffabmkklhbepgcgfonabamgnfafbdlkn) + +### How to install + +Please see [#Installing plugins](https://happypandax.github.io/usage.html#installing-plugins) in the documentation. 
+ +# Be careful about plugins + +Read the relevant section [#Be careful about plugins](https://happypandax.github.io/usage.html#be-careful-about-plugins) in the documentation + +# Plugins + +Name | Version | Description +--- | --- | --- +[**Chaika Downloader**](https://github.com/happypandax/plugins/tree/master/plugins/Chaika%20Downloader) | `1.0.0` | *A plugin that enables downloading manga and doujinshi from panda.chaika.moe* +[**Chaika Metadata**](https://github.com/happypandax/plugins/tree/master/plugins/Chaika%20Metadata) | `1.0.0` | *A plugin that can fetch metadata from Panda.Chaika* +[**EHentai Downloader**](https://github.com/happypandax/plugins/tree/master/plugins/EHentai%20Downloader) | `1.0.0` | *A plugin that enables downloading manga and doujinshi from E-Hentai & ExHentai* +[**EHentai Login**](https://github.com/happypandax/plugins/tree/master/plugins/EHentai%20Login) | `1.1.0` | *A plugin that can login to E-Hentai & ExHentai* +[**EHentai Metadata**](https://github.com/happypandax/plugins/tree/master/plugins/EHentai%20Metadata) | `1.2.1` | *A plugin that can fetch metadata from E-Hentai & ExHentai* +[**File Metadata**](https://github.com/happypandax/plugins/tree/master/plugins/File%20Metadata) | `2.0.2` | *Extracts and applies metadata from a file accompanying a gallery. 
# build.py -- regenerates README.md with a table of every plugin that ships
# an ``hplugin.json`` manifest one directory below ``plugins/``.
import json
import glob
import pathlib
from urllib.parse import quote

# README template; the single ``{}`` placeholder receives the plugin table.
readme = """
#### In this repository resides plugins for HappyPanda X. If you wish to write a plugin for HPX head over to [the docs](https://happypandax.github.io/plugin.html#plugins).

### How to download

I recommend these tools to download a single directory from this repo:
- https://minhaskamal.github.io/DownGit/ -- *Paste the url to the plugin folder in this repo*
- https://kinolien.github.io/gitzip/ -- *Paste the url to the plugin folder in this repo*
- [Firefox Addon](https://addons.mozilla.org/en-US/firefox/addon/gitzip/)
- [Chrome Extension](https://chrome.google.com/webstore/detail/gitzip-for-github/ffabmkklhbepgcgfonabamgnfafbdlkn)

### How to install

Please see [#Installing plugins](https://happypandax.github.io/usage.html#installing-plugins) in the documentation.

# Be careful about plugins

Read the relevant section [#Be careful about plugins](https://happypandax.github.io/usage.html#be-careful-about-plugins) in the documentation

# Plugins

{}

"""

plugins_dir = "plugins"
readme_file = "README.md"
desc_max_length = 200
repo_user = "happypandax"
repo_name = "plugins"


def _escape_cell(text):
    "Escapes characters that would otherwise end a Markdown table cell"
    return text.replace('|', '\\|')


def _plugin_row(manifest_path):
    """
    Builds the README table row for a single ``hplugin.json`` manifest.

    Returns the row (without a trailing newline), or None when the manifest
    lacks a description or a version and should be left out of the table.
    """
    with open(manifest_path, 'r', encoding="utf-8") as f:
        manifest = json.load(f)

    plugin_desc = manifest.get("description")
    plugin_ver = manifest.get("version")
    if not (plugin_desc and plugin_ver):
        return None

    plugin_dir = pathlib.Path(manifest_path).parent
    # GitHub URLs always use forward slashes, also when built on Windows
    url_path = quote(str(plugin_dir).replace('\\', '/'))
    gh_url = f"https://github.com/{repo_user}/{repo_name}/tree/master/{url_path}"

    # only the first line is shown; splitlines() also drops a stray '\r'
    plugin_desc = plugin_desc.splitlines()[0].strip()
    if len(plugin_desc) > desc_max_length:
        plugin_desc = plugin_desc[:desc_max_length] + '…'

    # escape after truncating so an escape sequence is never cut in half
    return (
        f"[**{plugin_dir.name}**]({gh_url}) | "
        f"`{_escape_cell(str(plugin_ver))}` | *{_escape_cell(plugin_desc)}*"
    )


def main():
    """
    Scans ``plugins_dir`` for plugin manifests and rewrites ``readme_file``.

    Paths are resolved relative to the current working directory. Plugins
    are listed in sorted manifest-path order.
    """
    print("Building...")
    rows = ["Name | Version | Description", "--- | --- | ---"]

    # without recursive=True the '**' behaves like '*': one directory level
    for manifest_path in sorted(glob.glob(f"{plugins_dir}/**/hplugin.json")):
        row = _plugin_row(manifest_path)
        if row is not None:
            rows.append(row)

    txt = readme.format("\n".join(rows) + "\n")

    with open(readme_file, 'w', encoding="utf-8") as f:
        f.write(txt)
    print("Done!")


if __name__ == '__main__':
    main()
def website_url_regex_gen(domain, path_regex=None, variable_port=False, variable_tld=False, trailing_slash=True, end=True, trailing_fragment=True, escape_domain=False):
    """
    Generates a regex suitable for a specific domain

    The pattern always starts with an optional ``http://``/``https://`` and
    an optional ``www.``, followed by ``domain`` in its own group.

    params:
        domain: the domain part; inserted as regex source unless
            ``escape_domain`` is set
        path_regex: regex source appended after the domain part
        variable_port: allow an optional ``:port`` (1-5 digits)
        variable_tld: require a ``.tld`` of 2-5 lowercase letters after ``domain``
        trailing_slash: allow an optional ``/`` after the domain part and,
            when ``path_regex`` is given, after the path as well
        end: anchor the pattern with ``$``
        trailing_fragment: allow an optional ``#fragment``
        escape_domain: treat ``domain`` as a literal string (``re.escape``),
            so that e.g. the dots in ``panda.chaika.moe`` only match dots

    returns:
        the regex source as a string
    """
    if escape_domain:
        # local import: this module does not import the stdlib `re` at file level
        import re
        domain = re.escape(domain)

    rgx = r"^(http\:\/\/|https\:\/\/)?(www\.)?({})".format(domain)
    if variable_tld:
        rgx += r"\.[a-z]{2,5}"
    if variable_port:
        rgx += r"(:[0-9]{1,5})?"
    if trailing_slash:
        rgx += r"\/?"
    if path_regex:
        rgx += path_regex
        # the second optional slash belongs to the path; emitting it without a
        # path would make the pattern accept a doubled slash ("domain//")
        if trailing_slash:
            rgx += r"\/?"
    if trailing_fragment:
        rgx += r"(#\S+)?"
    if end:
        rgx += "$"
    return rgx
+ Note that the download system is recursive, so if the URL resource matches a download handler (the same or a different one), + That handler will be called upon with a new :class:`DownloadItem` for that particular URL + (though only once, meaning, no handler will be called upon again with the exact same URL during a single session) + """ + + log.info(f"querying url: {item.url}") + + # prepare request + req_props = hpx.command.RequestProperties( + headers=HEADERS, + ) + + # chaika has a simple url system where every download url is in the form of https://panda.chaika.moe/archive/32870/download/ + # if the url is a gallery url, find and retrieve the archive urls + + url_type, gid = parse_url(item.url) + + download_urls = [] + + if url_type == 'gallery': + log.info(f"url was a gallery url, retrieving archive urls") + req = hpx.command.SingleGETRequest().request(URLS['gallery_api']+str(gid), req_props) + if req.ok: + log.info("request was successful") + + # get all archive urls + a_urls = req.json.get("archives") + if a_urls: + # we also get to set the name of this download item + title = req.json.get('title') + if title: + item.name = title + + for a in a_urls: + download_urls.append(URLS['ch']+a['download']) + else: + download_urls.append(URLS['ch']+f"/archive/{gid}/download/") + + + download_requests = [] + + if download_urls: + log.debug(f"found {len(download_urls)} download urls: {download_urls}") + for durl in download_urls: + download_requests.append(DownloadRequest(downloaditem=item, url=durl)) + + if download_requests: + log.info(f"was able to prepare requests for {len(download_requests)} urls") + return tuple(download_requests) + +@hpx.attach("Download.done", trigger=IDENTIFIER) +def download_done(result): + """ + Called when downloading of all :class:`DownloadRequest` for a specific :class:`DownloadItem` has finished. + The handler should do any post-processing here (archive files, rename files or folders, delete extranous files and etc.). 
def parse_url(url):
    """
    Parses url into a tuple of gallery/archive and id

    returns:
        a ``(type, id)`` tuple where ``type`` is ``'archive'`` when the url
        contains the text ``archive`` and ``'gallery'`` otherwise, and ``id``
        is the first run of digits in the url as an ``int``.
        ``id`` is ``None`` when the url contains no digits; a warning is
        logged in that case and callers are expected to check for it.
    """
    stype = 'archive' if 'archive' in url else 'gallery'

    match = regex.search('([0-9]+)', url)
    if not match:
        # previously fell through to int(None) and raised a TypeError
        log.warning("Error extracting id from url: {}".format(url))
        return stype, None

    return stype, int(match.group())
"name": "Chaika Metadata", + "version": "1.0.0", + "description": "A plugin that can fetch metadata from Panda.Chaika", + "author": "Twiddly", + "update_url": "https://github.com/happypandax/plugins/tree/master/plugins/Chaika%20Metadata", + "website": "https://github.com/happypandax/plugins/tree/master/plugins/Chaika%20Metadata", + "entry": "main.py", + "test": "test.py", + "require": [ + "happypandax >= 0.10.0" + ] +} \ No newline at end of file diff --git a/plugins/Chaika Metadata/main.py b/plugins/Chaika Metadata/main.py new file mode 100644 index 0000000..cbb4de3 --- /dev/null +++ b/plugins/Chaika Metadata/main.py @@ -0,0 +1,473 @@ +# main.py +import __hpx__ as hpx +import regex +import arrow +import datetime +import os +import urllib +import html + +from bs4 import BeautifulSoup +from PIL import Image, ImageChops + +log = hpx.get_logger("main") + +MATCH_URL_PREFIX = r"^(http\:\/\/|https\:\/\/)?(www\.)?" # http:// or https:// + www. +MATCH_URL_END = r"\/?$" + +DEFAULT_DELAY = 1.5 + +URLS_REGEX = { + 'gallery': MATCH_URL_PREFIX + r"(panda\.chaika\.moe\/(archive|gallery)\/[0-9]+)" + MATCH_URL_END, +} + +URLS = { + 'ch': 'https://panda.chaika.moe', + 'gallery': 'https://panda.chaika.moe/gallery/', + 'archive': 'https://panda.chaika.moe/archive/', + 'gallery_api': 'https://panda.chaika.moe/jsearch?gallery=', + 'archive_api': 'https://panda.chaika.moe/jsearch?archive=', + 'hash_api': 'https://panda.chaika.moe/jsearch?sha1=', + 'title_search': "https://panda.chaika.moe/galleries/?title={title}&tags=&category=&provider=&uploader=&rating_from=&rating_to=&filesize_from=&filesize_to=&filecount_from=&filecount_to=&sort=posted&asc_desc=desc&apply=" +} + +HEADERS = {'user-agent':"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"} + +PLUGIN_CONFIG = { + 'filename_search': False, # use the filename/folder-name for searching instead of gallery title + 'remove_namespaces': True, # remove superfluous namespaces like 'artist', 'language' and 'group' because they 
are handled specially in HPX + 'gallery_results_limit': 10, # maximum amount of galleries to return + 'blacklist_tags': [], # tags to ignore when updating tags + 'add_gallery_url': True, # add ehentai url to gallery + 'preferred_language': "english", # preferred gallery langauge (in gallery title) to extract from if multiple galleries were found, set empty string for default +} + +@hpx.subscribe("init") +def inited(): + PLUGIN_CONFIG.update(hpx.get_plugin_config()) + + # set default delay values if not set + delays = hpx.get_setting("network", "delays", {}) + for u in (URLS['ch'],): + if u not in delays: + log.info(f"Setting delay on {u} requests to {DEFAULT_DELAY}") + delays[u] = DEFAULT_DELAY + hpx.update_setting("network", "delays", delays) + +@hpx.subscribe('config_update') +def config_update(cfg): + PLUGIN_CONFIG.update(cfg) + +@hpx.subscribe("disable") +def disabled(): + pass + +@hpx.subscribe("remove") +def removed(): + pass + +@hpx.attach("Metadata.info") +def metadata_info(): + return hpx.command.MetadataInfo( + identifier = "chaika", + name = "Panda.Chaika", + parser = URLS_REGEX['gallery'], + sites = ("https://panda.chaika.moe",), + description = "Fetch metadata from Panda.Chaika", + models = ( + hpx.command.GetDatabaseModel("Gallery"), + ) + ) + +@hpx.attach("Metadata.query", trigger="chaika") +def query(itemtuple): + """ + Called to query for candidates to extract metadata from. + Note that HPX will handle choosing which candidates to extract data from. + The attached handler should just return all the candidates found. 
@hpx.attach("Metadata.apply", trigger="chaika")
def apply(datatuple):
    """
    Called to fetch and apply metadata to the given data items.
    Remember to set the `status` property on the :class:`MetadataResult` object to `True` on a successful fetch.

    Every item in ``datatuple`` yields exactly one :class:`MetadataResult`,
    also when the request for it fails. When the API answers with a
    ``result`` key (which this handler treats as a failure message), that
    value is passed on as the ``reason``.

    should return:
        a tuple of :class:`MetadataResult`, one per item in ``datatuple``
    """
    log.info("Applying metadata from chaika")
    mresult = []

    for mdata in datatuple:
        applied = False
        reason = ""

        # prepare request
        req_props = hpx.command.RequestProperties(
            headers=HEADERS,
        )

        api_url = URLS['archive_api'] if mdata.data['type'] == 'archive' else URLS['gallery_api']
        api_url += str(mdata.data['id'])

        r = hpx.command.SingleGETRequest().request(api_url, req_props)
        # bind `response` on every iteration so a failed request can neither
        # hit an unbound name nor reuse the previous item's response
        response = r.json if r.ok else None

        if response:
            if 'result' in response:
                log.warning(response)
                reason = response['result']
            else:
                filtered_data = format_metadata(
                    response,
                    mdata.item,
                    apply_url=PLUGIN_CONFIG.get('add_gallery_url', True),
                    gallery_url=mdata.data['gallery_url'])
                applied = apply_metadata(filtered_data, mdata.item, mdata.options)

        mresult.append(hpx.command.MetadataResult(data=mdata, status=applied, reason=reason))
        log.info(f"Applied: {applied}")
    return tuple(mresult)
def parse_url(url):
    """
    Parses url into a tuple of gallery/archive and id

    returns:
        a ``(type, id)`` tuple where ``type`` is ``'archive'`` when the url
        contains the text ``archive`` and ``'gallery'`` otherwise, and ``id``
        is the first run of digits in the url as an ``int``.
        ``id`` is ``None`` when the url contains no digits; a warning is
        logged in that case and callers are expected to check for it.
    """
    stype = 'archive' if 'archive' in url else 'gallery'

    match = regex.search('([0-9]+)', url)
    if not match:
        # previously fell through to int(None) and raised a TypeError
        log.warning("Error extracting id from url: {}".format(url))
        return stype, None

    return stype, int(match.group())
+ } + """ + mdata = {} + + mdata['titles'] = [] + + parsed_text = hpx.command.ItemTextParser(gdata['title']) + + parsed_title = parsed_text.extract_title() + if parsed_title: + parsed_title = parsed_title[0] + mdata['titles'].append((parsed_title or gdata['title'], 'english')) + + mdata['titles'].append((gdata['title_jpn'], 'japanese')) + + + mdata['category'] = gdata['category'] + if gdata['posted']: + mdata['pub_date'] = arrow.Arrow.fromtimestamp(gdata['posted']) + + lang = "japanese" # default language + + artists = set() + circles = set() + parodies = set() + + parsed_artists = parsed_text.extract_artist() + parsed_circles = parsed_text.extract_circle() + + extranous_namespaces = ("artist", "parody", "group", "language") + mdata['tags'] = {} + + for nstag in gdata['tags']: + onstag = nstag + nstag = nstag.replace('_', ' ') + blacklist_tags = PLUGIN_CONFIG.get("blacklist_tags") + if blacklist_tags and (nstag in blacklist_tags or onstag in blacklist_tags): + continue + + ns = None + if ':' in nstag: + ns, t = nstag.split(':', 1) + else: + t = nstag + + if ns == 'language' and t != 'translated': + lang = t + elif ns == "artist": + for a in artists: # the artist extracted from the title likely has better capitalization, so choose that instead + if a.lower() == t.lower(): + artists.add(a) + break + else: + artists.add(t) + elif ns == "group": + for c in circles: # the circle extracted from the title likely has better capitalization, so choose that instead + if c.lower() == t.lower(): + circles.add(c) + break + else: + circles.add(t) + elif ns == "parody": + parodies.add(t) + + if not (PLUGIN_CONFIG.get("remove_namespaces") and ns in extranous_namespaces): + mdata['tags'].setdefault(ns, []).append(t) + else: + log.debug(f"removing namespace {ns}") + + log.debug(f"tags: {mdata['tags']}") + + mdata['language'] = lang + + if parodies: + mdata['parodies'] = parodies + + if artists: + a_circles = [] + for a in artists: + a_circles.append((a, tuple(circles))) # assign 
circles to each artist + mdata['artists'] = a_circles + + if apply_url: + if gdata.get('gallery', False): + mdata['urls'] = [URLS['gallery']+f"{gdata['gallery']}/"] + elif gallery_url: + mdata['urls'] = [gallery_url] + + return mdata + +GalleryData = hpx.command.GalleryData +LanguageData = hpx.command.LanguageData +TitleData = hpx.command.TitleData +ArtistData = hpx.command.ArtistData +ArtistNameData = hpx.command.ArtistNameData +ParodyData = hpx.command.ParodyData +ParodyNameData = hpx.command.ParodyNameData +CircleData = hpx.command.CircleData +CategoryData = hpx.command.CategoryData +UrlData = hpx.command.UrlData +NamespaceTagData= hpx.command.NamespaceTagData +TagData= hpx.command.TagData +NamespaceData = hpx.command.NamespaceData + +def apply_metadata(data, gallery, options): + """ + data = { + 'titles': None, # [(title, language),...] + 'artists': None, # [(artist, (circle, circle, ..)),...] + 'parodies': None, # [parody, ...] + 'category': None, + 'tags': None, # [tag, tag, tag, ..] or {ns:[tag, tag, tag, ...]} + 'pub_date': None, # DateTime object or Arrow object + 'language': None, + 'urls': None # [url, ...] 
+ } + """ + + log.debug(f"data: {data}") + + gdata = GalleryData() + + if isinstance(data.get('titles'), (list, tuple, set)): + gtitles = [] + for t, l in data['titles']: + gtitle = None + if t: + t = html.unescape(t) + gtitle = TitleData(name=t) + if t and l: + gtitle.language = LanguageData(name=l) + if gtitle: + gtitles.append(gtitle) + + if gtitles: + gdata.titles = gtitles + log.debug("applied titles") + + if isinstance(data.get('artists'), (list, tuple, set)): + gartists = [] + for a, c in data['artists']: + if a: + gartist = ArtistData(names=[ArtistNameData(name=capitalize_text(a))]) + gartists.append(gartist) + + if c: + gcircles = [] + for circlename in [x for x in c if x]: + gcircles.append(CircleData(name=capitalize_text(circlename))) + gartist.circles = gcircles + + if gartists: + gdata.artists = gartists + log.debug("applied artists") + + if isinstance(data.get('parodies'), (list, tuple, set)): + gparodies = [] + for p in data['parodies']: + if p: + gparody = ParodyData(names=[ParodyNameData(name=capitalize_text(p))]) + gparodies.append(gparody) + + if gparodies: + gdata.parodies = gparodies + log.debug("applied parodies") + + if data.get('category'): + gdata.category = CategoryData(name=data['category']) + log.debug("applied category") + + if data.get('language'): + gdata.language = LanguageData(name=data['language']) + log.debug("applied language") + + if isinstance(data.get('tags'), (dict, list)): + if isinstance(data['tags'], list): + data['tags'] = {None: data['tags']} + gnstags = [] + for ns, tags in data['tags'].items(): + if ns is not None: + ns = ns.strip() + if ns and ns.lower() == 'misc': + ns = None + for t in tags: + t = t.strip() + if t: + kw = {'tag': TagData(name=t)} + if ns: + kw['namespace'] = NamespaceData(name=ns) + gnstags.append(NamespaceTagData(**kw)) + + if gnstags: + gdata.tags = gnstags + log.debug("applied tags") + + if isinstance(data.get('pub_date'), (datetime.datetime, arrow.Arrow)): + pub_date = data['pub_date'] + 
gdata.pub_date = pub_date + log.debug("applied pub_date") + + if isinstance(data.get('urls'), (list, tuple)): + gurls = [] + for u in data['urls']: + if u: + gurls.append(UrlData(name=u)) + if gurls: + gdata.urls = gurls + log.debug("applied urls") + + applied = hpx.command.UpdateItemData(gallery, gdata, options=options) + + log.debug(f"applied: {applied}") + + return applied \ No newline at end of file diff --git a/plugins/Chaika Metadata/readme.md b/plugins/Chaika Metadata/readme.md new file mode 100644 index 0000000..28461ee --- /dev/null +++ b/plugins/Chaika Metadata/readme.md @@ -0,0 +1,41 @@ +Chaika Metadata +---------------------------- + +> This plugin fetches metadata from Panda.Chaika + +## Configuration + +Configure this plugin by adding `chaika-metadata` to the `plugin.config` namespace in your `config.yaml`: +```yaml +plugin: + config: + chaika-metadata: + option1: True + option2: + - item 1 + - item 2 +``` + +#### Available options + +Name | Default | Description +--- | --- | --- +`filename_search` | `false` | use the filename/folder-name for searching instead of gallery title +`remove_namespaces` | `true` | remove superfluous namespaces like 'artist', 'language' and 'group' because they are handled specially in HPX +`gallery_results_limit` | `10` | maximum amount of galleries to return +`blacklist_tags` | `[]` | tags to ignore when updating tags, a list of `namespace:tag` or `tag` strings +`add_gallery_url` | `true` | add chaika url to gallery +`preferred_language` | `english` | preferred gallery langauge (in gallery title) to extract from if multiple galleries were found, set empty string for default + + +## Things yet to be implemented + +- File similarity search + +# Changelog + +- `1.0.0` + - Updated to reflect new changes in HPX v0.10.0 + +- `0.1.0b` + - first version \ No newline at end of file diff --git a/plugins/Chaika Metadata/test.py b/plugins/Chaika Metadata/test.py new file mode 100644 index 0000000..36a317e --- /dev/null +++ 
b/plugins/Chaika Metadata/test.py @@ -0,0 +1 @@ +# test.py \ No newline at end of file diff --git a/plugins/EHentai Downloader/hplugin.json b/plugins/EHentai Downloader/hplugin.json new file mode 100644 index 0000000..21eb484 --- /dev/null +++ b/plugins/EHentai Downloader/hplugin.json @@ -0,0 +1,15 @@ +{ + "id": "efaec768-760c-49d7-8d45-fb70b7db45e5", + "shortname": "ehentai-downloader", + "name": "EHentai Downloader", + "version": "1.0.0", + "description": "A plugin that enables downloading manga and doujinshi from E-Hentai & ExHentai", + "author": "Twiddly", + "update_url": "https://github.com/happypandax/plugins/tree/master/plugins/EHentai%20Downloader", + "website": "https://github.com/happypandax/plugins/tree/master/plugins/EHentai%20Downloader", + "entry": "main.py", + "test": "test.py", + "require": [ + "happypandax >= 0.12.0" + ] +} \ No newline at end of file diff --git a/plugins/EHentai Downloader/main.py b/plugins/EHentai Downloader/main.py new file mode 100644 index 0000000..bfaaa06 --- /dev/null +++ b/plugins/EHentai Downloader/main.py @@ -0,0 +1,251 @@ +# main.py +import __hpx__ as hpx +import regex +import json +from bs4 import BeautifulSoup + +DownloadRequest = hpx.command.DownloadRequest + +log = hpx.get_logger("main") + +EH_IDENTIFIER = "ehentai" +EX_IDENTIFIER = "exhentai" + +HEADERS = {'user-agent':"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"} +DEFAULT_DELAY = 5 + +URLS = { + 'eh': 'https://e-hentai.org', + 'ex': 'https://exhentai.org', + 'e_api': 'https://api.e-hentai.org/api.php', + 'ex_api': 'https://exhentai.org/api.php', + 'e_archiver': 'https://e-hentai.org/archiver.php?gid={gallery_id}&token={gallery_token}&or={archiver_key}', + 'ex_archiver': 'https://exhentai.org/archiver.php?gid={gallery_id}&token={gallery_token}&or={archiver_key}', +} + + +def website_url_regex_gen(domain, path_regex=None, variable_port=False, variable_tld=False, trailing_slash=True, end=True, trailing_fragment=True): + """ + Generates a regex 
suitable for a specific domain + """ + rgx = r"^(http\:\/\/|https\:\/\/)?(www\.)?({})".format(domain) + if variable_tld: + rgx += r"\.[a-z]{2,5}" + if variable_port: + rgx += r"(:[0-9]{1,5})?" + if trailing_slash: + rgx += r"\/?" + if path_regex: + rgx += path_regex + if trailing_slash: + rgx += r"\/?" + if trailing_fragment: + rgx += r"(#\S+)?" + if end: + rgx += "$" + return rgx + +@hpx.subscribe("init") +def inited(): + # set default delay values if not set + delays = hpx.get_setting("network", "delays", {}) + for u in (URLS['ex'], URLS['eh'], "https://api.e-hentai.org", URLS['ex_api']): + if u not in delays: + log.info(f"Setting delay on {u} requests to {DEFAULT_DELAY}") + delays[u] = DEFAULT_DELAY + hpx.update_setting("network", "delays", delays) + +@hpx.attach("Download.info") +def eh_download_info(): + return hpx.command.DownloadInfo( + identifier = EH_IDENTIFIER, + name = "E-Hentai", + parser = website_url_regex_gen("e-hentai.org", path_regex=r"g\/[0-9]{3,10}\/[0-9a-zA-Z]{3,15}", trailing_slash=True, variable_tld=False, trailing_fragment=True, end=True), + sites = ("https://e-hentai.org",), + description = "Download manga and doujinshi from e-hentai.org", + ) + +@hpx.attach("Download.info") +def ex_download_info(): + return hpx.command.DownloadInfo( + identifier = EX_IDENTIFIER, + name = "ExHentai", + parser = website_url_regex_gen("exhentai.org", path_regex=r"g\/[0-9]{3,10}\/[0-9a-zA-Z]{3,15}", trailing_slash=True, variable_tld=False, trailing_fragment=True, end=True), + sites = ("https://exhentai.org",), + description = "Download manga and doujinshi from exhentai.org", + ) + +@hpx.attach("Download.query", trigger=EH_IDENTIFIER) +def eh_download_query(item): + return download_query(item, False) + +@hpx.attach("Download.query", trigger=EX_IDENTIFIER) +def ex_download_query(item): + return download_query(item, True) + +def download_query(item, is_exhentai): + """ + Called to query for resource URLs that should be downloaded. 
+ Note that HPX will handle the actual downloading part. + The attached handler should just return all the URLs that should be downloaded in the form of .:class:`DownloadRequest` objects + + should return: + a tuple of :class:`DownloadRequest` for all the URL resources that should be downloaded. + Note that the download system is recursive, so if the URL resource matches a download handler (the same or a different one), + That handler will be called upon with a new :class:`DownloadItem` for that particular URL + (though only once, meaning, no handler will be called upon again with the exact same URL during a single session) + """ + + gid, gtoken = parse_url(item.url) + download_requests = [] + + #=============================================================================== + # get ehentai login + login_site = URLS['ex'] if is_exhentai else URLS['eh'] + login_status = hpx.command.GetLoginStatus(login_site) + login_session = None + if login_status: + login_session = hpx.command.GetLoginSession(login_site) + if not login_session: + log.warning("unable to get a login sesion for querying gallery data") + return () + log.info("logged in, attempting to download archive") + + #=============================================================================== + # get the gallery metadata which should have the archive key + # https://ehwiki.org/wiki/API#Gallery_Metadata + log.info("getting archiver key") + eh_data = { + 'method': 'gdata', + 'gidlist': [[gid, gtoken]], + } + req_props = hpx.command.RequestProperties( + headers=HEADERS, + json=eh_data, + session=login_session + ) + api_url = URLS['ex_api' if is_exhentai else 'e_api'] + log.info(f"requesting with api url {api_url}") + r = hpx.command.SinglePOSTRequest().request(api_url, req_props) + if not r.ok: + log.warning(f"got invalid metadata page or bad status: {r.status_code}") + log.debug(r.text) + return () + try: + response = r.json + except json.JSONDecodeError: + response = None + log.info("got empty response when 
trying to retrieve archiver key, this usually means that user has no access to exhentai") + return () + if not response or 'error' in response: + log.warning("response has an error of some sort, and so we have no archive key to use") + log.debug(r.text) + return () + + #=============================================================================== + # Read the metadata of the gallery to fill out the download queue item + # + # While in theory we should only ever have a single entry in the response, best keep it more general just in case this gets retrofitted to handle multiple urls at once + # The information we reliably get is the thumbnail url and the title of the gallery + # We seem to also always get an archive key, but the key is not always valid, and so the archive url request can fail + # + # Yes, there is an expunged flag in the metadata, but it is always false when the gallery/archive is not available + # It is also true sometimes and yet the gallery/archive is totally accessable and so is meaningless + for gdata in response['gmetadata']: + archive_req = False + try: + if 'title' in gdata: + item.name = gdata['title'] + if 'thumb' in gdata: + download_requests.append( + DownloadRequest( + downloaditem=item, + url=gdata['thumb'], + is_thumbnail=True, + properties=hpx.command.RequestProperties(method=hpx.Method.GET, headers=HEADERS, session=login_session), # we need to use the same session + )) + if 'archiver_key' in gdata: + log.info(f"found archiver key for gallery {(gid, gtoken)}") + a_key = gdata['archiver_key'] + a_url = URLS['ex_archiver' if is_exhentai else 'e_archiver'].format(gallery_id=gid, gallery_token=gtoken, archiver_key=a_key) + form_data = { + "dltype": "org", #original quality, instead of a resampled version + "dlcheck": "Download Original Archive" + } + req_props = hpx.command.RequestProperties( + headers=HEADERS, + data=form_data, + session=login_session + ) + r = hpx.command.SinglePOSTRequest().request(a_url, req_props) + if r.ok: + if 
"Insufficient funds" in r.text: + log.info("Unable to grab gallery archive due to insufficent funds (GP) on the account") + item.name = "(Insufficient GP) "+item.name + elif "Key missing, or incorrect key provided" not in r.text: + soup = BeautifulSoup(r.text, "html.parser") + dp_url = soup.find("p", id="continue") + if dp_url and dp_url.a: # finally + download_requests.append( + DownloadRequest( + downloaditem=item, + url=dp_url.a['href'] + '?start=1', + properties=hpx.command.RequestProperties(method=hpx.Method.GET, headers=HEADERS, session=login_session), # we need to use the same session + filename=item.name.strip()+'.zip')) + archive_req = True + log.debug(f"adding the archive url {download_requests[-1].url}") + if not archive_req: + log.info("Something went wrong and we did not actually find a URL") + #TODO Actually better handle the various cases of why we do not have a url + else: + log.warning(f"got invalid key page or bad status: {r.status_code}") + if r.status_code == 404 and "This gallery is currently unavailable" in r.text: + #We know that there is a valid key for us to get here, so the gallery existed at some point in the past + #This seems like it is most of the time a copyright takedown, but I have no idea why this is not marked as expunged + item.name = "(Gallery Unavailable) "+item.name + else: + log.warning(f"didn't find archiver key for data: {eh_data}") + item.name = "(Archive Unavailable) "+item.name + except Exception as e: + log.debug(f"got an error, last request content: \n\t {r.text}") + raise + + if not archive_req: + pass + # TODO: download individual images instead + + if download_requests: + log.info(f"was able to prepare {len(download_requests)} requests") + else: + log.info("unable to prepare any URLs to download") + return tuple(download_requests) + +@hpx.attach("Download.done", trigger=[EX_IDENTIFIER, EH_IDENTIFIER]) +def download_done(result): + """ + Called when downloading of all :class:`DownloadRequest` for a specific 
:class:`DownloadItem` has finished. + The handler should do any post-processing here (archive files, rename files or folders, delete extranous files and etc.). + Remember to set the `status` property on the :class:`DownloadResult` object to `False` if the post-processing was a failure. + Note that the handler should *not* import the file into HPX (if it's an item), that part will be taken care of by HPX + + should return: + the same :class:`DownloadResult` that was provided to the handler, potentially modified on the 'path' or `status` and `reason` properties + """ + # there's nothing special to post-process in the case of e(x)hentai downloader, so just return the result as is + log.info(f"download of archive was successful for {result.downloaditem.name}") + #TODO Mark it as a failure if there was only a thumbnail to download + #TODO Archive the individual images together into a cbz or something if we grabbed individual images + return result + +def parse_url(url): + "Parses url into a tuple of gallery id and token" + gallery_id = None + gallery_token = None + + gallery_id_token = regex.search('(?<=g/)([0-9]+)/([a-zA-Z0-9]+)', url) + if gallery_id_token: + gallery_id_token = gallery_id_token.group() + gallery_id, gallery_token = gallery_id_token.split('/') + else: + log.warning("Error extracting g_id and g_token from url: {}".format(url)) + return int(gallery_id), gallery_token diff --git a/plugins/EHentai Downloader/readme.md b/plugins/EHentai Downloader/readme.md new file mode 100644 index 0000000..36198e7 --- /dev/null +++ b/plugins/EHentai Downloader/readme.md @@ -0,0 +1,22 @@ +EHentai Downloader +---------------------------- + +> A plugin that enables downloading manga and doujinshi from E-Hentai & ExHentai + +**IMPORTANT:** To download using GP/credits, the plugin [EHentai Login](https://github.com/happypandax/plugins/tree/master/plugins/EHentai%20Login) is required to be present + +**IMPORTANT:** Only downloading through ehentai's archiver system (which 
costs GP/credits) is supported at this time + +## Configuration + +There's no available config options for this plugin + +## Things yet to be implemented + +- Support scraping the individual images +- Torrents (waiting for HPX to support this) + +# Changelog + +- `1.0.0` + - first version \ No newline at end of file diff --git a/plugins/EHentai Downloader/test.py b/plugins/EHentai Downloader/test.py new file mode 100644 index 0000000..36a317e --- /dev/null +++ b/plugins/EHentai Downloader/test.py @@ -0,0 +1 @@ +# test.py \ No newline at end of file diff --git a/plugins/EHentai Login/hplugin.json b/plugins/EHentai Login/hplugin.json new file mode 100644 index 0000000..f35d421 --- /dev/null +++ b/plugins/EHentai Login/hplugin.json @@ -0,0 +1,16 @@ +{ + "id": "d9b1d111-7250-4083-9efb-356fabbeada7", + "shortname": "ehentai-login", + "name": "EHentai Login", + "version": "1.1.0", + "description": "A plugin that can login to E-Hentai & ExHentai", + "author": "Twiddly", + "site_folder": "site/", + "update_url": "https://github.com/happypandax/plugins/tree/master/plugins/EHentai%20Login", + "website": "https://github.com/happypandax/plugins/tree/master/plugins/EHentai%20Login", + "entry": "main.py", + "test": "test.py", + "require": [ + "happypandax >= 0.10.0" + ] +} diff --git a/plugins/EHentai Login/main.py b/plugins/EHentai Login/main.py new file mode 100644 index 0000000..7563dbe --- /dev/null +++ b/plugins/EHentai Login/main.py @@ -0,0 +1,191 @@ +# main.py +import __hpx__ as hpx +import pickle +import os + +from bs4 import BeautifulSoup + +log = hpx.get_logger("main") + + + +current_user_name = "" +status_text = "" +response = None +user_dict = None + +save_file = os.path.join(hpx.constants.current_dir, '.info') + +default_delay = 8 + +HEADERS = {'user-agent':"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"} + +match_url_prefix = r"^(http\:\/\/|https\:\/\/)?(www\.)?" # http:// or https:// + www. 
+match_url_end = r"\/?$" + +url_regex = match_url_prefix + r"((exhentai|(g\.)?e-hentai)\.org)" + match_url_end + +MAIN_URLS = { + 'eh': "https://e-hentai.org", + 'ex': "https://exhentai.org" +} + +URLS = MAIN_URLS +URLS.update({ + 'login': "https://e-hentai.org/home.php" +}) + +@hpx.subscribe("init") +def inited(): + # set default delay values if not set + delays = hpx.get_setting("network", "delays", {}) + for u in (MAIN_URLS['ex'], MAIN_URLS['eh']): + if u not in delays: + log.info(f"Setting delay on {u} requests to {default_delay}") + delays[u] = default_delay + hpx.update_setting("network", "delays", delays) + + # retrieve saved user info + if os.path.exists(save_file): + with open(save_file, "rb") as f: + user_dict = pickle.load(f) + if user_dict: + login(user_dict, {}) + if response is not None: + log.info("Successfully re-logged in") + +@hpx.subscribe("disable") +def disabled(): + save_user_dict() + +@hpx.subscribe("remove") +def removed(): + pass + +@hpx.attach("Login.info") +def login_info(): + return hpx.command.LoginInfo( + identifier = "ehentai", + name = "EHentai", + parser = url_regex, + sites = ("www.e-hentai.org", "www.exhentai.org"), + description = "Login to E-Hentai & ExHentai", + ) + +@hpx.attach("Login.login", trigger="ehentai") +def login(userpass, options): + global current_user_name + global status_text + global response + global user_dict + + user_dict = userpass + response = None + current_user_name = "" + + ipb_member = userpass.get('ipb_member_id', "") + ipb_pass = userpass.get('ipb_pass_hash', "") + try: + if not ipb_member or not ipb_pass: + raise ValueError("Missing ipb_member_id or ipb_pass_hash") + + cookies = {} + + additional = userpass.get('additional', "") + if additional: + try: + additional = {k.strip():v.strip() for k, v in [x.strip().split('=', 1) for x in additional.split(',')] } + cookies.update(additional) + except: + raise ValueError("Failed to parse additional values") + + cookies.update({ + 'ipb_member_id': ipb_member, 
+ 'ipb_pass_hash': ipb_pass, + }) + + # prepare request + req_props = hpx.command.RequestProperties( + session=True, + cookies=cookies, + headers=HEADERS + ) + + req = hpx.command.SingleGETRequest() + + # check ehentai.org/home.php + r = req.request(URLS['login'], req_props) + + if r.ok: + bad_access, msg = check_access(r) + status_text = msg + if not bad_access: + if userpass.get("exhentai", True): + # check exhentai + req_props.session = r.session + r = req.request(URLS['ex'], req_props) + if r.ok: + bad_access, status_text = check_access(r, ex=True) + else: + status_text = "Could not access ExHentai" + + response = r + + current_user_name = ipb_member + save_user_dict() + + else: + status_text = r.reason + + except ValueError as e: + status_text = str(e) + + return response + +@hpx.attach("Login.status", trigger="ehentai") +def status(options): + return status_text + +@hpx.attach("Login.logged_in", trigger="ehentai") +def logged_in(options): + if response: + return True + return False + +@hpx.attach("Login.response", trigger="ehentai") +def response_(options): + return response + +@hpx.attach("Login.current_user", trigger="ehentai") +def current_user(options): + return current_user_name + +def check_access(r, ex=False): + msg = "" + bad_access = False + content_type = r.headers['content-type'] + text = r.text + if 'image/gif' in content_type: + msg = "No access to ExHentai" + elif 'text/html' and 'Your IP address has been' in text: + msg = text + bad_access = True + + if not bad_access and not ex: + soup = BeautifulSoup(text, "html.parser") + if soup.find("div", class_="homebox"): # we have access to home.php + pass + elif soup.find("form"): # login page + bad_access = True + msg = "Wrong credentials!" 
+ if msg: + log.info(f"MSG: {msg}") + return bad_access, msg + +def save_user_dict(): + global user_dict + + # save user info + if user_dict: + with open(save_file, "wb") as f: + user_dict = pickle.dump(user_dict, f) diff --git a/plugins/EHentai Login/readme.md b/plugins/EHentai Login/readme.md new file mode 100644 index 0000000..7258ebc --- /dev/null +++ b/plugins/EHentai Login/readme.md @@ -0,0 +1,22 @@ +EHentai Login +---------------------------- + +> This plugin can log-in to E-Hentai & ExHentai + +To login, go to this plugin's site through HPX. +The plugin site can be accessed through a webclient, or at `[webserver-host]/plugin/d9b1d111-7250-4083-9efb-356fabbeada7`. +Replace `[webserver-host]` with wherever your HPX webserver is hosted at. + +# Changelog + +- `1.1.0` + - Added a new field for additional cookie values + +- `1.0.0` + - Updated to reflect new changes in HPX v0.10.0 + +- `0.2.0b` + - increase default delay limit on EH requests to `9` from `4` secs, this value can be tweaked in `network.delays` inside your`config.yaml` + +- `0.1.0b` + - first version diff --git a/plugins/EHentai Login/site/index.html b/plugins/EHentai Login/site/index.html new file mode 100644 index 0000000..6d463c0 --- /dev/null +++ b/plugins/EHentai Login/site/index.html @@ -0,0 +1,62 @@ +{% extends __default__ %} + +{% block include_javascript %} + +{% endblock %} + +{% block title %} +EHentai Metadata Plugin +{% endblock %} + +{% block before_content_form %} +

Toggle the ExHentai option before logging in to also check for ExHentai access.

+

The user has access to both E-Hentai and ExHentai if no message is displayed on a successful log-in. Otherwise, the message No access to ExHentai will be displayed.

+
+

To find your IPB Member ID and IPB Pass Hash, follow these steps (should work on all browsers):

+
    +
  1. Navigate to e-hentai.org (you need to be logged in) or exhentai.org
  2. +
  3. Right-click on the page => Inspect element
  4. +
  5. Go to the Console tab
  6. +
  7. Write: document.cookie
  8. +
  9. A line of values should appear that correspond to active cookies
  10. +
  11. Look for the ipb_member_id, ipb_pass_hash and other values
  12. +
+
+{% endblock %} + +{% block outer_content_form %} +
+
+

Logged In

+
+
+

No access to ExHentai

+
+
+

Failed to log in

+
+
+ + +
+
+ + +
+
+ +
+

Comma-separated values: key_1=value_1, key_2=value_2

+
+ +
+ +
+
+ + +
+
+ +
+{% endblock %} diff --git a/plugins/EHentai Login/site/script.js b/plugins/EHentai Login/site/script.js new file mode 100644 index 0000000..83b68b2 --- /dev/null +++ b/plugins/EHentai Login/site/script.js @@ -0,0 +1,124 @@ +const login_identifer = "ehentai" + +function main() { + check_login(true) +} + +function set_err_msg(msg) { + document.querySelector("#error-msg").innerHTML = msg +} + +function form_status(cls, msg) { + switch (cls) { + case 'success': + document.querySelector("form").classList.add("success") + document.querySelector("form").classList.remove("error") + document.querySelector("form").classList.remove("warning") + break + case 'error': + if (!document.querySelector("#error-msg").innerHTML) + set_err_msg("Failed to login: " + msg.toString()) + document.querySelector("form").classList.add("error") + document.querySelector("form").classList.remove("success") + document.querySelector("form").classList.remove("warning") + break; + case 'warning': + document.querySelector("form").classList.toggle("warning") + break; + case 'loading': + document.querySelector("form").classList.add("loading") + break; + case '!loading': + document.querySelector("form").classList.remove("loading") + break; + } +} + +function check_login(first_time) { + form_status("loading") + hpx.call_function( + "get_login_info", + {identifier: login_identifer}, + data => { + let fdata = data.data + form_status("!loading") + if (fdata) { + if (fdata.logged_in) { + form_status("success") + if (fdata.status.toLowerCase().indexOf("exhentai") !== -1) + form_status("warning") + } + else + if (!first_time){ + set_err_msg(fdata.status) + form_status("error") + } + } else { + if (!first_time) { + form_status("error", fdata.status) + } + } + }) +} + +function on_login(e) { + e.preventDefault() + let arr = serialize_form(e.target) + let data = { + exhentai: false + } + for (var i in arr) { + let x = arr[i] + if ( ['ipb_member_id', 'ipb_pass_hash', 'additional'].includes(x.name)) { + 
data[x.name] = x.value + } else if (x.name == 'exhentai') + data[x.name] = (x.value == 'on') ? true : false + } + if (data.ipb_member_id && data.ipb_pass_hash) { + hpx.call_function( + "submit_login", + { + identifier:login_identifer, + credentials: data, + }) + + // submit_login is an async function so delay abit before checking if the login was successful + // a better solution is to actually poll the command and get the result when finished, but ain't nobody got time for that + form_status("loading") + setTimeout(check_login, 4000) + } else { + form_status('error') + } +} + +// in case the document is already rendered +if (document.readyState!='loading') main(); +// modern browsers +else if (document.addEventListener) document.addEventListener('DOMContentLoaded', main); +// IE <= 8 +else document.attachEvent('onreadystatechange', function(){ + if (document.readyState=='complete') main(); +}); + +// Serialize form data into an array +function serialize_form(form) { + var field, l, s = []; + if (typeof form == 'object' && form.nodeName == "FORM") { + var len = form.elements.length; + for (var i=0; i= 0.10.0", + "ehentai-login" + ] +} \ No newline at end of file diff --git a/plugins/EHentai Metadata/main.py b/plugins/EHentai Metadata/main.py new file mode 100644 index 0000000..9f32d40 --- /dev/null +++ b/plugins/EHentai Metadata/main.py @@ -0,0 +1,593 @@ +# main.py +import __hpx__ as hpx +import regex +import arrow +import datetime +import os +import urllib +import html + +from bs4 import BeautifulSoup +from PIL import Image, ImageChops + +log = hpx.get_logger("main") + +MATCH_URL_PREFIX = r"^(http\:\/\/|https\:\/\/)?(www\.)?" # http:// or https:// + www. +MATCH_URL_END = r"\/?$" + +DEFAULT_DELAY = 8 + +URLS_REGEX = { + 'eh_gallery': MATCH_URL_PREFIX + r"((? 
8: # check if exceeds 8 keywords retry with quotes around title + r = title_search(f'"{title}"', ex, session, _times=_times+1) + + return r + +def eh_page_results(eh_page_url, limit=None, session=None): + "Opens eh page, parses for results, and then returns list of (title, url)" + found_urls = [] + if limit is None: + limit = PLUGIN_CONFIG.get("gallery_results_limit") + + # prepare request + req_props = hpx.command.RequestProperties( + headers=HEADERS, + ) + if session: + req_props.session = session + log.debug(f"COOKIES: {session.cookies.keys()}") + r = hpx.command.SingleGETRequest().request(eh_page_url, req_props) + soup = BeautifulSoup(r.text, "html.parser") + list_style = "compact" + dmi_div = soup.find("div", id="dms") + if dmi_div: + list_style = dmi_div.find("option", selected=True).string.lower() + results = [] + if list_style == "compact": + results = soup.findAll("td", class_="gl3c glname", limit=limit) + elif list_style == "minimal": + results = soup.findAll("td", class_="gl3m glname", limit=limit) + elif list_style == "extended": + results = soup.findAll("div", class_="gl4e glname", limit=limit) + elif list_style == "thumbnail": + results = soup.findAll("div", class_="gl4t glname", limit=limit) + # str(x.a.string) + found_urls = [(str(x.a.string), x.a['href']) for x in results] # title, url + + if not found_urls: + log.debug(f"HTML: {r.text}") + return found_urls + +def parse_url(url): + "Parses url into a tuple of gallery id and token" + gallery_id = None + gallery_token = None + + gallery_id_token = regex.search('(?<=g/)([0-9]+)/([a-zA-Z0-9]+)', url) + if gallery_id_token: + gallery_id_token = gallery_id_token.group() + gallery_id, gallery_token = gallery_id_token.split('/') + else: + log.warning("Error extracting g_id and g_token from url: {}".format(url)) + return int(gallery_id), gallery_token + +def apply(datatuple): + """ + Called to fetch and apply metadata to the given data items. 
+ Remember to set the `status` property on the :class:`MetadataResult` object to `True` on a successful fetch. + """ + mresults = [] + applied = False + eh_data = { + 'method': 'gdata', + 'gidlist': [], + 'namespace': 1 + } + + mdata_map = {} # (gid,token):metadatadata and gid:metadatadata + + for d in datatuple: + eh_data['gidlist'].append(d.data['gallery']) + mdata_map[tuple(d.data['gallery'])] = d + mdata_map[d.data['gallery'][0]] = d # used for when token is invalid, assumes that there's no duplicate gid's + + # prepare request + req_props = hpx.command.RequestProperties( + headers=HEADERS, + json=eh_data + ) + r = hpx.command.SinglePOSTRequest().request(URLS['api'], req_props) + if r.ok: + response = r.json + if response and not 'error' in response: + for gdata in response['gmetadata']: + if 'error' in gdata: + mdata = mdata_map[gdata['gid']] + mresults.append(hpx.command.MetadataResult(data=mdata, status=False, reason=gdata['error'])) + else: + mdata = mdata_map[(gdata['gid'], gdata['token'])] + urls_to_apply = [] + if PLUGIN_CONFIG.get('add_gallery_url', True): + urls_to_apply.append(mdata.data['gallery_url']) + fdata = format_metadata(gdata, mdata.metadataitem.item, urls_to_apply=urls_to_apply) + applied = apply_metadata(fdata, mdata.metadataitem.item, mdata.options) + mresults.append(hpx.command.MetadataResult(data=mdata, status=applied, reason="No data was applied" if not applied else "")) + + elif response: + log.warning(response) + for d in datatuple: + mresults.append(hpx.command.MetadataResult(data=d, status=False, reason=response['error'])) + + return tuple(mresults) + +def capitalize_text(text): + """ + better str.capitalize + """ + return " ".join(x.capitalize() for x in text.strip().split()) + +def format_metadata(gdata, item, urls_to_apply=None): + """ + Formats metadata to look like this for apply_metadata: + data = { + 'titles': None, # [(title, language),...] + 'artists': None, # [(artist, (circle, circle, ..)),...] 
+ 'parodies': None, # [parody, ...] + 'category': None, + 'tags': None, # [tag, tag, tag, ..] or {ns:[tag, tag, tag, ...]} + 'pub_date': None, # DateTime object or Arrow object + 'language': None, + 'urls': None # [url, ...] + } + """ + mdata = {} + + mdata['titles'] = [] + + parsed_text = hpx.command.ItemTextParser(gdata['title']) + + parsed_title = parsed_text.extract_title() + if parsed_title: + parsed_title = parsed_title[0] + mdata['titles'].append((parsed_title or gdata['title'], 'english')) + + mdata['titles'].append((gdata['title_jpn'], 'japanese')) + + + mdata['category'] = gdata['category'] + mdata['pub_date'] = arrow.Arrow.fromtimestamp(gdata['posted']) + + lang = "japanese" # default language + + artists = set() + circles = set() + parodies = set() + + parsed_artists = parsed_text.extract_artist() + parsed_circles = parsed_text.extract_circle() + + extranous_namespaces = ("artist", "parody", "group", "language") + mdata['tags'] = {} + for nstag in gdata['tags']: + + blacklist_tags = PLUGIN_CONFIG.get("blacklist_tags") + if blacklist_tags and nstag in blacklist_tags: + continue + + ns = None + if ':' in nstag: + ns, t = nstag.split(':', 1) + else: + t = nstag + + if ns == 'language' and t != 'translated': + lang = t + elif ns == "artist": + for a in parsed_artists: # the artist extracted from the title likely has better capitalization, so choose that instead + if a.lower() == t.lower(): + artists.add(a) + break + else: + artists.add(t) + elif ns == "group": + for c in parsed_circles: # the circle extracted from the title likely has better capitalization, so choose that instead + if c.lower() == t.lower(): + circles.add(c) + break + else: + circles.add(t) + elif ns == "parody": + parodies.add(t) + + if not (PLUGIN_CONFIG.get("remove_namespaces") and ns in extranous_namespaces): + mdata['tags'].setdefault(ns, []).append(t) + else: + log.debug(f"removing namespace {ns}") + + log.debug(f"tags: {mdata['tags']}") + + mdata['language'] = lang + + if parodies: + 
mdata['parodies'] = parodies + + if artists: + a_circles = [] + for a in artists: + a_circles.append((a, tuple(circles))) # assign circles to each artist + mdata['artists'] = a_circles + + if urls_to_apply: + mdata['urls'] = urls_to_apply + + return mdata + +GalleryData = hpx.command.GalleryData +LanguageData = hpx.command.LanguageData +TitleData = hpx.command.TitleData +ArtistData = hpx.command.ArtistData +ArtistNameData = hpx.command.ArtistNameData +ParodyData = hpx.command.ParodyData +ParodyNameData = hpx.command.ParodyNameData +CircleData = hpx.command.CircleData +CategoryData = hpx.command.CategoryData +UrlData = hpx.command.UrlData +NamespaceTagData= hpx.command.NamespaceTagData +TagData= hpx.command.TagData +NamespaceData = hpx.command.NamespaceData + +def apply_metadata(data, gallery, options): + """ + data = { + 'titles': None, # [(title, language),...] + 'artists': None, # [(artist, (circle, circle, ..)),...] + 'parodies': None, # [parody, ...] + 'category': None, + 'tags': None, # [tag, tag, tag, ..] or {ns:[tag, tag, tag, ...]} + 'pub_date': None, # DateTime object or Arrow object + 'language': None, + 'urls': None # [url, ...] 
+ } + """ + + log.debug(f"data: {data}") + + gdata = GalleryData() + + if isinstance(data.get('titles'), (list, tuple, set)): + gtitles = [] + for t, l in data['titles']: + gtitle = None + if t: + t = html.unescape(t) + gtitle = TitleData(name=t) + if t and l: + gtitle.language = LanguageData(name=l) + if gtitle: + gtitles.append(gtitle) + + if gtitles: + gdata.titles = gtitles + log.debug("applied titles") + + if isinstance(data.get('artists'), (list, tuple, set)): + gartists = [] + for a, c in data['artists']: + if a: + gartist = ArtistData(names=[ArtistNameData(name=capitalize_text(a))]) + gartists.append(gartist) + + if c: + gcircles = [] + for circlename in [x for x in c if x]: + gcircles.append(CircleData(name=capitalize_text(circlename))) + gartist.circles = gcircles + + if gartists: + gdata.artists = gartists + log.debug("applied artists") + + if isinstance(data.get('parodies'), (list, tuple, set)): + gparodies = [] + for p in data['parodies']: + if p: + gparody = ParodyData(names=[ParodyNameData(name=capitalize_text(p))]) + gparodies.append(gparody) + + if gparodies: + gdata.parodies = gparodies + log.debug("applied parodies") + + if data.get('category'): + gdata.category = CategoryData(name=data['category']) + log.debug("applied category") + + if data.get('language'): + gdata.language = LanguageData(name=data['language']) + log.debug("applied language") + + if isinstance(data.get('tags'), (dict, list)): + if isinstance(data['tags'], list): + data['tags'] = {None: data['tags']} + gnstags = [] + for ns, tags in data['tags'].items(): + if ns is not None: + ns = ns.strip() + if ns and ns.lower() == 'misc': + ns = None + for t in tags: + t = t.strip() + if t: + kw = {'tag': TagData(name=t)} + if ns: + kw['namespace'] = NamespaceData(name=ns) + gnstags.append(NamespaceTagData(**kw)) + + if gnstags: + gdata.tags = gnstags + log.debug("applied tags") + + if isinstance(data.get('pub_date'), (datetime.datetime, arrow.Arrow)): + pub_date = data['pub_date'] + 
gdata.pub_date = pub_date + log.debug("applied pub_date") + + if isinstance(data.get('urls'), (list, tuple)): + gurls = [] + for u in data['urls']: + if u: + gurls.append(UrlData(name=u)) + if gurls: + gdata.urls = gurls + log.debug("applied urls") + + applied = hpx.command.UpdateItemData(gallery, gdata, options=options) + + log.debug(f"applied: {applied}") + + return applied \ No newline at end of file diff --git a/plugins/EHentai Metadata/readme.md b/plugins/EHentai Metadata/readme.md new file mode 100644 index 0000000..423daac --- /dev/null +++ b/plugins/EHentai Metadata/readme.md @@ -0,0 +1,67 @@ +EHentai Metadata +---------------------------- + +> This plugin fetches metadata from E-Hentai & ExHentai + +**IMPORTANT:** This plugin requires the [EHentai Login](https://github.com/happypandax/plugins/tree/master/plugins/EHentai%20Login) plugin to be present + +## Configuration + +Configure this plugin by adding `ehentai-metadata` to the `plugin.config` namespace in your `config.yaml`: +```yaml +plugin: + config: + ehentai-metadata: + option1: True + option2: + - item 1 + - item 2 +``` + +#### Available options + +Name | Default | Description +--- | --- | --- +`filename_search` | `true` | use the filename/folder-name for searching instead of gallery title +`expunged_galleries` | `false` | enable expunged galleries in results +`remove_namespaces` | `true` | remove superfluous namespaces like 'artist', 'language' and 'group' because they are handled specially in HPX +`gallery_results_limit` | `10` | maximum amount of galleries to return +`blacklist_tags` | `[]` | tags to ignore when updating tags, a list of `namespace:tag` strings +`add_gallery_url` | `true` | add ehentai url to gallery +`preferred_language` | `"english"` | preferred gallery language (in gallery title) to extract from if multiple galleries were found, set empty string for default +`enabled_categories` | `['manga', 'doujinshi', 'non-h', 'artistcg', 'gamecg', 'western', 'imageset', 'cosplay', 
'asianporn', 'misc']` | categories that are enabled for the search +`search_query` | `"{title}"` | the search query, '{title}' will be replaced with the gallery title, use double curly brackets to escape a curly bracket. Tip: if you want to only allow english results, you should modify this to "{title} language:english" +`search_low_power_tags` | `true` | enable search low power tags +`search_torrent_name` | `true` | enable search torrent name +`search_gallery_description` | `false` | enable search gallery description + +## Things yet to be implemented + +- File similarity search + +# Changelog + +- `1.2.1` + - some misc. changes + +- `1.2.0` + - fixed title being quoted unconditionally + - retry the search with quoted title if keyword count exceeds 8 + +- `1.1.0` + - added several new options and fixed some errors + +- `1.0.0` + - updated to reflect new changes in HPX v0.10.0 + +- `0.4.0b` + - updated to work on new EH website design changes + +- `0.3.0b` + - add a default delay on `https://api.e-hentai.org/` requests, this value can be tweaked in `network.delays` inside your `config.yaml` + +- `0.2.0b` + - added `preferred_language` option + +- `0.1.0b` + - first version \ No newline at end of file diff --git a/plugins/EHentai Metadata/test.py b/plugins/EHentai Metadata/test.py new file mode 100644 index 0000000..36a317e --- /dev/null +++ b/plugins/EHentai Metadata/test.py @@ -0,0 +1 @@ +# test.py \ No newline at end of file diff --git a/plugins/File Metadata/extractors/__init__.py b/plugins/File Metadata/extractors/__init__.py new file mode 100644 index 0000000..8951b5c --- /dev/null +++ b/plugins/File Metadata/extractors/__init__.py @@ -0,0 +1,3 @@ +from . import eze +from . import ehentaidownloader +from . 
import hdoujin \ No newline at end of file diff --git a/plugins/File Metadata/extractors/common.py b/plugins/File Metadata/extractors/common.py new file mode 100644 index 0000000..23b79c6 --- /dev/null +++ b/plugins/File Metadata/extractors/common.py @@ -0,0 +1,112 @@ +import __hpx__ as hpx +import enum +import json +import typing + +log = hpx.get_logger(__name__) + +class IncompatibleFile(ValueError): + pass + +class DataType(enum.Flag): + """ + The available extractors. + Add your new extractor here + """ + eze = enum.auto() + hdoujin = enum.auto() + e_hentai_downloader = enum.auto() + +# The filetypes to look for, no duplicates, only add if necessary +filetypes = ('.json', '.txt') +# Which filetype belongs to which extractor, use inclusive OR '|' to combine multiple extractors +filenames = { + "info.json": DataType.eze | DataType.hdoujin, + "info.txt": DataType.hdoujin | DataType.e_hentai_downloader, +} + +common_data = { + 'titles': None, # [(title, language),...] + 'artists': None, # [(artist, (circle, circle, ..)),...] + 'parodies': None, # [parody, ...] + 'category': None, + 'tags': None, # [tag, tag, tag, ..] or {ns:[tag, tag, tag, ...]} + 'pub_date': None, # DateTime object or Arrow object + 'language': None, + 'urls': None # [url, ...] +} + +plugin_config = { + 'characters_namespace': 'character', # hdoujin, which namespace to put the values in the CHARACTERS field in +} + +extractors = {} + +def capitalize_text(text): + """ + better str.capitalize + """ + return " ".join(x.capitalize() for x in text.strip().split()) + +def register_extractor(cls, type): + assert issubclass(cls, Extractor) + assert isinstance(type, DataType) + extractors[type] = cls() + +class Extractor: + """ + Base extractor + """ + + def file_to_dict(self, fs: hpx.command.CoreFS) -> typing.Union[dict, None]: + """ + A subclass can choose to override or extend this method. + Should return a dict with data from the file which will be passed to the extract method. 
+ If the file is not supported or should be skipped, return None. + The parameter fs is the file in question. + + Below is convenience code to read and convert a file into a dict. + Supports json and txt files. + If file is a txt, will try to parse files like this: + Field A: value 1 + Field B: value 2 + -> + { + 'Field A': 'value 1', + 'Field B': 'value 2', + } + otherwise the txt file is not supported and a ValueError will be raised. + NotImplementedError will be raised if file is neither json or txt file. + """ + try: + d = {} + log.debug(f"File ext: {fs.ext}") + kw = {} + if not fs.inside_archive: + kw['encoding'] = 'utf-8' + if fs.ext.lower() == '.json': + with fs.open("r", **kw) as f: + d = json.load(f) + elif fs.ext.lower() == '.txt': + with fs.open("r", **kw) as f: + for line in f.readlines(): + l = line.strip() + if isinstance(l, bytes): + l = l.decode(encoding="utf-8", errors="ignore") + k, v = l.split(':', 1) + if k.strip(): + d[k.strip()] = v.strip() + else: + raise NotImplementedError(f"{fs.ext} filetype not supported yet") + except Exception as e: # Bad, I know, but too lazy + raise IncompatibleFile(e) + return d + + def extract(self, filedata: dict) -> dict: + """ + A subclass must implement this method. + Should populate a dict that looks like common_data (see above) and return it + + filedata parameter is the dict created in the file_to_dict method + """ + raise NotImplementedError diff --git a/plugins/File Metadata/extractors/ehentaidownloader.py b/plugins/File Metadata/extractors/ehentaidownloader.py new file mode 100644 index 0000000..3a02f95 --- /dev/null +++ b/plugins/File Metadata/extractors/ehentaidownloader.py @@ -0,0 +1,80 @@ +import __hpx__ as hpx +from . import common + +log = hpx.get_logger(__name__) + +class EHentaiDownloader(common.Extractor): + + def file_to_dict(self, fs): + """ + A subclass can choose to override or extend this method. + Should return a dict with data from the file which will be passed to the extract method. 
+ If the file is not supported or should be skipped, return None. + The parameter fs is the file in question. + + File is formatted weirdly so we just return {linenumber : line} + """ + d = {} + log.debug(f"File ext: {fs.ext}") + kw = {} + if not fs.inside_archive: + kw['encoding'] = "utf-8" + with fs.open("r", **kw) as f: + for num, line in enumerate(f.readlines(), 1): + if isinstance(line, bytes): + line = line.decode("utf-8") + d[num] = line + + # confirm it's the right file + if d and not "generated by e-hentai downloader" in d[len(d)].lower(): + d = None + return d + + def extract(self, filedata): + """ + A subclass must implement this method. + Should populate a dict that looks like common_data (see common.py) and return it + + filedata parameter is the dict created in the file_to_dict method + """ + d = {} + if filedata: + log.debug("Expecting e-hentai downloader metadata file") + for linenum in sorted(filedata): + line = filedata[linenum].strip() + if not line: + continue + + if line.startswith("Language:"): + line = line.split(':', 1)[1] + d['language'] = common.capitalize_text(line.lower().split()[0]) + continue + + if line.startswith("Category:"): + line = line.split(':', 1)[1] + d['category'] = common.capitalize_text(line.lower()) + continue + + if line.startswith("> "): # tags + line = line[2:] # remove > + ns, tags = line.split(':', 1) + tags = tags.split(",") + d.setdefault("tags", {})[ns.strip()] = [t.strip() for t in tags] + continue + + if linenum in (1, 2, 3): # most likely a title or url, must be last because maybe it wasn't included + # ensure + if not filedata.get(3, "").startswith("http"): + continue + + if linenum == 3: + d.setdefault('urls', []).append(line) + else: + title_lang = "english" if linenum == 1 else "japanese" + nameparser = hpx.command.ItemTextParser(line) + parsed_title = nameparser.extract_title() + d.setdefault("titles", []).append((parsed_title[0] if parsed_title else line, title_lang)) + continue + return d + 
+common.register_extractor(EHentaiDownloader, common.DataType.e_hentai_downloader) diff --git a/plugins/File Metadata/extractors/eze.py b/plugins/File Metadata/extractors/eze.py new file mode 100644 index 0000000..cf4be26 --- /dev/null +++ b/plugins/File Metadata/extractors/eze.py @@ -0,0 +1,70 @@ +import arrow +import __hpx__ as hpx +from . import common + +log = hpx.get_logger(__name__) + +class Eze(common.Extractor): + + def file_to_dict(self, fs): + """ + A subclass can choose to override or extend this method. + Should return a dict with data from the file which will be passed to the extract method. + If the file is not supported or should be skipped, return None. + The parameter fs is the file in question. + """ + d = super().file_to_dict(fs) + k = ('gallery_info',) + if d and not all(map(lambda x: x in d, k)): # make sure all keys are present + d = None + k = ('image_info', 'gallery_info_full') + if d and not any(map(lambda x: x in d, k)): # make sure one of the keys are present + d = None + return d + + def extract(self, filedata): + """ + A subclass must implement this method. 
+ Should populate a dict that looks like common_data (see common.py) and return it + + filedata parameter is the dict created in the file_to_dict method + """ + d = {} + filedata = filedata.get('gallery_info') + if filedata: + log.debug("Expecting eze metadata file") + mtitle = filedata.get('title') + mtitle_jp = filedata.get('title_original') + + mcat = filedata.get("category") + if mcat: + d['category'] = common.capitalize_text(mcat) + + for t, l in ((mtitle, "english"), (mtitle_jp, "japanese")): + if t: + nameparser = hpx.command.ItemTextParser(t) + parsed_title = nameparser.extract_title() + d.setdefault("titles", []).append((parsed_title[0] if parsed_title else t, l)) + + mtags = filedata.get("tags") + + if mtags: + d['tags'] = {} + for ns, t in mtags.items(): + d['tags'].setdefault(common.capitalize_text(ns), t) + + mlang = filedata.get("language") + if mlang: + d['language'] = common.capitalize_text(mlang) + + msource = filedata.get('source') + if msource: + d.setdefault('urls', []).append(f"https://{msource['site']}.org/g/{msource['gid']}/{msource['token']}") + + mupdate = filedata.get("upload_date") + if mupdate: + d['pub_date'] = arrow.get(*mupdate) + + return d + +common.register_extractor(Eze, common.DataType.eze) diff --git a/version/File Metadata/extractors.py b/plugins/File Metadata/extractors/hdoujin.py similarity index 61% rename from version/File Metadata/extractors.py rename to plugins/File Metadata/extractors/hdoujin.py index a2f1f4b..df2fc06 100644 --- a/version/File Metadata/extractors.py +++ b/plugins/File Metadata/extractors/hdoujin.py @@ -1,57 +1,17 @@ import __hpx__ as hpx -import common +from . 
import common log = hpx.get_logger(__name__) -class Eze(common.Extractor): - - def file_to_dict(self, fs): - d = super().file_to_dict(fs) - k = ('gallery_info', 'image_info') - if d and not all(map(lambda x: x in d, k)): # make sure all keys are present - d = None - return d - - def extract(self, filedata): - d = {} - filedata = filedata.get('gallery_info') - if filedata: - log.debug("Expecting eze metadata file") - mtitle = filedata.get('title') - mtitle_jp = filedata.get('title_original') - - mcat = filedata.get("category") - if mcat: - d['category'] = common.capitalize_text(mcat) - - for t, l in ((mtitle, "english"), (mtitle_jp, "japanese")): - if t: - nameparser = hpx.command.NameParser(t) - parsed_title = nameparser.extract_title() - d.setdefault("titles", []).append((parsed_title[0] if parsed_title else t, l)) - - mtags = filedata.get("tags") - - if mtags: - d['tags'] = {} - for ns, t in mtags.items(): - d['tags'].setdefault(common.capitalize_text(ns), t) - - mlang = filedata.get("language") - if mlang: - d['language'] = common.capitalize_text(mlang) - - msource = filedata.get('source') - if msource: - d.setdefault('urls', []).append(f"https://{msource['site']}.org/g/{msource['gid']}/{msource['token']}") - - return d - -common.register_extractor(Eze, common.DataType.eze) - class HDoujin(common.Extractor): def file_to_dict(self, fs): + """ + A subclass can choose to override or extend this method. + Should return a dict with data from the file which will be passed to the extract method. + If the file is not supported or should be skipped, return None. + The parameter fs is the file in question. + """ d = super().file_to_dict(fs) if d: if fs.ext.lower() == '.txt': @@ -73,6 +33,12 @@ def file_to_dict(self, fs): return d def extract(self, filedata): + """ + A subclass must implement this method. 
+ Should populate a dict that looks like common_data (see common.py) and return it + + filedata parameter is the dict created in the file_to_dict method + """ d = {} if filedata: log.debug("Expecting hdoujin metadata file") @@ -81,7 +47,7 @@ def extract(self, filedata): for t, l in ((mtitle, "english"), (mtitle_jp, "japanese")): if t: - nameparser = hpx.command.NameParser(t) + nameparser = hpx.command.ItemTextParser(t) parsed_title = nameparser.extract_title() d.setdefault("titles", []).append((parsed_title[0] if parsed_title else t, l)) @@ -113,7 +79,19 @@ def extract(self, filedata): for ns, t in mtags.items(): d['tags'].setdefault(common.capitalize_text(ns), t) else: - d['tags'] = mtags + d['tags'] = {None: mtags} # None for no namespace + + mcharacters = filedata.get("characters") + if mcharacters: + if isinstance(mcharacters, str): + mcharacters = mcharacters.split(',') + d.setdefault('tags', {})[common.plugin_config.get('characters_namespace') or 'characters'] = mcharacters + + mparody = filedata.get("parody") + if mparody: + if isinstance(mparody, str): + mparody = mparody.split(',') + d['parodies'] = mparody mlang = filedata.get("language") if mlang: @@ -127,4 +105,4 @@ def extract(self, filedata): return d -common.register_extractor(HDoujin, common.DataType.hdoujin) +common.register_extractor(HDoujin, common.DataType.hdoujin) \ No newline at end of file diff --git a/plugins/File Metadata/hplugin.json b/plugins/File Metadata/hplugin.json new file mode 100644 index 0000000..9defcf6 --- /dev/null +++ b/plugins/File Metadata/hplugin.json @@ -0,0 +1,12 @@ +{ + "id": "e38e24e4-8ca8-420e-b52b-c75510097653", + "shortname": "file-metadata", + "name": "File Metadata", + "version": "2.0.2", + "description": "Extracts and applies metadata from a file accompanying a gallery. 
Supports files produced from eze, e-hentai-downloader and hdoujin", + "author": "Twiddly", + "update_url": "https://github.com/happypandax/plugins/tree/master/plugins/File%20Metadata", + "website": "https://github.com/happypandax/plugins/tree/master/plugins/File%20Metadata", + "entry": "main.py", + "require": ["happypandax >= 0.11.0"] +} diff --git a/plugins/File Metadata/main.py b/plugins/File Metadata/main.py new file mode 100644 index 0000000..4e85c0f --- /dev/null +++ b/plugins/File Metadata/main.py @@ -0,0 +1,275 @@ +import __hpx__ as hpx +import os +import arrow +import datetime +import html +import extractors +from extractors import common + +log = hpx.get_logger(__name__) + +options = { +} + +def get_common_data(datatypes, fpath): + assert isinstance(datatypes, common.DataType) + d = {} + fpath = hpx.command.CoreFS(fpath) + + for datatype in common.DataType: + if datatype & datatypes: + log.info(f"Attempting with {datatype}") + md = {} + + ex = common.extractors.get(datatype, None) + if ex: + try: + fdata = ex.file_to_dict(fpath) + except common.IncompatibleFile as e: + log.info(f"Skipping incompatible file for {datatype}: {str(e)}") + continue + if fdata: + log.info(f"{datatype} matched!") + md.update(ex.extract(fdata)) + else: + log.info(f"{datatype} didn't match") + if md: + d.update(md) + break + return d + +SetValue = hpx.command.Set +GalleryData = hpx.command.GalleryData +LanguageData = hpx.command.LanguageData +TitleData = hpx.command.TitleData +ArtistData = hpx.command.ArtistData +ArtistNameData = hpx.command.ArtistNameData +ParodyData = hpx.command.ParodyData +ParodyNameData = hpx.command.ParodyNameData +CircleData = hpx.command.CircleData +CategoryData = hpx.command.CategoryData +UrlData = hpx.command.UrlData +NamespaceTagData = hpx.command.NamespaceTagData +TagData = hpx.command.TagData +NamespaceData = hpx.command.NamespaceData + +def apply_metadata(data, gallery, options={}): + """ + data = { + 'titles': None, # [(title, language),...] 
+ 'artists': None, # [(artist, (circle, circle, ..)),...] + 'parodies': None, # [parody, ...] + 'category': None, + 'tags': None, # [tag, tag, tag, ..] or {ns:[tag, tag, tag, ...]} + 'pub_date': None, # DateTime object or Arrow object + 'language': None, + 'urls': None # [url, ...] + } + """ + + log.debug(f"data: {data}") + + gdata = GalleryData() + + if isinstance(data.get('titles'), (list, tuple, set)): + gtitles = [] + for t, l in data['titles']: + gtitle = None + if t: + t = html.unescape(t) + gtitle = TitleData(name=t) + if t and l: + gtitle.language = LanguageData(name=l) + if gtitle: + gtitles.append(gtitle) + + if gtitles: + gdata.titles = SetValue(gtitles) + log.debug("applied titles") + + if isinstance(data.get('artists'), (list, tuple, set)): + gartists = [] + for a, c in data['artists']: + if a: + gartist = ArtistData(names=[ArtistNameData(name=common.capitalize_text(a))]) + gartists.append(gartist) + + if c: + gcircles = [] + for circlename in [x for x in c if x]: + gcircles.append(CircleData(name=common.capitalize_text(circlename))) + gartist.circles = gcircles + + if gartists: + gdata.artists = SetValue(gartists) + log.debug("applied artists") + + if isinstance(data.get('parodies'), (list, tuple, set)): + gparodies = [] + for p in data['parodies']: + if p: + gparody = ParodyData(names=[ParodyNameData(name=common.capitalize_text(p))]) + gparodies.append(gparody) + + if gparodies: + gdata.parodies = SetValue(gparodies) + log.debug("applied parodies") + + if data.get('category'): + gdata.category = SetValue(CategoryData(name=data['category'])) + log.debug("applied category") + + if data.get('language'): + gdata.language = SetValue(LanguageData(name=data['language'])) + log.debug("applied language") + + if isinstance(data.get('tags'), (dict, list)): + if isinstance(data['tags'], list): + data['tags'] = {None: data['tags']} + gnstags = [] + for ns, tags in data['tags'].items(): + if ns is not None: + ns = ns.strip() + if ns and ns.lower() == 'misc': + ns 
= None + for t in tags: + t = t.strip() + if t: + kw = {'tag': TagData(name=t)} + if ns: + kw['namespace'] = NamespaceData(name=ns) + gnstags.append(NamespaceTagData(**kw)) + + if gnstags: + gdata.tags = SetValue(gnstags) + log.debug("applied tags") + + if isinstance(data.get('pub_date'), (datetime.datetime, arrow.Arrow)): + pub_date = data['pub_date'] + gdata.pub_date = SetValue(pub_date) + log.debug("applied pub_date") + + if isinstance(data.get('urls'), (list, tuple)): + gurls = [] + for u in data['urls']: + if u: + gurls.append(UrlData(name=u)) + if gurls: + gdata.urls = SetValue(gurls) + log.debug("applied urls") + + if data.get('times_read'): + gdata.times_read = SetValue(data['times_read']) + log.debug("applied times_read") + + if data['times_read'] > 0: + gallery_id = gallery.id + page_id = gallery.last_page.id + + GalleryProgress.update_progress(gallery_id, page_id) + + applied = hpx.command.UpdateItemData(gallery, gdata, options=options) + + log.debug(f"applied: {applied}") + + return applied + +@hpx.subscribe("init") +def inited(): + common.plugin_config.update(hpx.get_plugin_config()) + +@hpx.subscribe('config_update') +def config_update(cfg): + common.plugin_config.update(cfg) + +def has_file_metadata(path): + fs = hpx.command.CoreFS(path) + + contents = {x: os.path.split(x)[1].lower() for x in fs.contents(corefs=False) if x.lower().endswith(common.filetypes)} + log.debug(f"Contents for {fs.path}:") + log.debug(f"{tuple(contents.values())}") + + found_files = [] + for fnames, dtypes in common.filenames.items(): + for fpath, fname in contents.items(): + if fname in fnames: + found_files.append((dtypes, fpath)) + break + + return found_files + +def apply_file_metadata(gallery, found_files): + applied = False + cdata = common.common_data.copy() + for dtypes, fpath in found_files: + log.debug(f"path: {fpath}") + d = get_common_data(dtypes, fpath) + if d: + applied = True + cdata.update(d) + + if applied: + apply_metadata(cdata, gallery) + + return applied 
+ +@hpx.attach("GalleryFS.parse_metadata_file") +def parse(path, gallery): + f = has_file_metadata(path) + return apply_file_metadata(gallery, f) + +##### --- + +@hpx.attach("Metadata.info") +def metadata_info(): + return hpx.command.MetadataInfo( + identifier="filemetadata", + name="File Metadata", + description="Extracts and applies metadata from a file accompanying a gallery", + sites=("eze", "E-Hentai-Downloader", "HDoujinDownloader"), + models=( + hpx.command.GetDatabaseModel("Gallery"), + ) + ) + +@hpx.attach("Metadata.query", trigger='filemetadata') +def query(itemtuple): + "Looks up files for matching items" + mdata = [] + + for mitem in itemtuple: + item = mitem.item + options = mitem.options + + found_files = [] + for s in item.get_sources(): + found_files.extend(has_file_metadata(s)) + + log.info(f"found {len(found_files)} metadata files for item: {item}") + + if found_files: + log.debug(f"{found_files}") + + mdata.append(hpx.command.MetadataData( + metadataitem=mitem, + title=item.preferred_title.name if item.preferred_title else '', + data={ + 'found': found_files, + })) + + log.info(f"Returning {len(mdata)} data items") + return tuple(mdata) + +@hpx.attach("Metadata.apply", trigger='filemetadata') +def apply(datatuple): + mresults = [] + applied = False + + for d in datatuple: + applied = apply_file_metadata(d.item, d.data['found']) + if applied: + mresults.append(hpx.command.MetadataResult(data=d, status=True)) + else: + mresults.append(hpx.command.MetadataResult(data=d, status=False, reason="failed to apply data from file")) + + return tuple(mresults) diff --git a/plugins/File Metadata/readme.md b/plugins/File Metadata/readme.md new file mode 100644 index 0000000..a8a7244 --- /dev/null +++ b/plugins/File Metadata/readme.md @@ -0,0 +1,83 @@ +File Metadata +---------------------------- + +> This plugin extracts and applies metadata from a file accompanying a gallery folder or archive. 
+This plugin supports extracting metadata from files produced by: + +- [eze](https://dnsev-h.github.io/eze/) + > - only supports JSON format and file must be named `info.json` +- [HDoujin Downloader](https://doujindownloader.com/) + > - all file versions are supported + > - supports both JSON and TXT formats + > - file must be named `info.json` or `info.txt` +- [E-Hentai-Downloader](https://github.com/ccloli/E-Hentai-Downloader) + > - supports only the file named `info.txt` + +## Configuration + +Configure this plugin by adding `file-metadata` to the `plugin.config` namespace in your `config.yaml`: +```yaml +plugin: + config: + file-metadata: + option1: True + option2: + - item 1 + - item 2 +``` + +#### Available options + +Name | Default | Description +--- | --- | --- +`characters_namespace` | `character` | which namespace to put the values in the CHARACTERS field into (applies to hdoujin) + +# Extending + +Follow these steps to add support for more kinds of files: + +1. Create a new enum member for your extractor in `extractors.common.DataType` +2. Add a new filetype to `extractors.common.filetypes` if necessary +3. Add your new enum member to `extractors.common.filenames` +4. Create a new `.py` file in the `extractors` folder +5. Import the `common` module and create a new `common.Extractor` subclass +6. At the end of the file, register the subclass with `common.register_extractor` +7. 
Import your new `.py` file in `extractors.__init__` + +# Changelog + +- `2.0.2` + - Improved error handling on incompatible detected files + +- `2.0.1` + - Fixed an issue where metadata files in archives would fail to be detected + +- `2.0.0` + - The plugin will also now act as a regular metadata plugin, making it possible to retrieve metadata from files on-demand + +- `1.0.3` + - Updated the eze handler to save uploaded date as published date + +- `1.0.2` + - Fixed a bug where not all metadata would be applied + +- `1.0.1` + - Updated the eze handler to support files produced by https://github.com/dnsev-h/ehentai-archive-info + - Fixed the extractors still using the old api + +- `1.0.0` + - Updated to reflect new changes in HPX v0.10.0 + +- `0.3.0b` + - **HDoujin**: add option `characters_namespace` + - **HDoujin**: parse `PARODY` and `CHARACTERS` fields + +- `0.2.0b` + - require HPX `0.2.0` + - use new api to update gallery data + - add support for E-Hentai-Downloader + - fix bug where `info.txt` in archive files would fail to get parsed + +- `0.1.0b` + - first version diff --git a/plugins/NHentai Downloader/hplugin.json b/plugins/NHentai Downloader/hplugin.json new file mode 100644 index 0000000..8f59237 --- /dev/null +++ b/plugins/NHentai Downloader/hplugin.json @@ -0,0 +1,13 @@ +{ + "id": "d2d70306-db03-4cc0-b9c6-b5b1f95d10fe", + "shortname": "nhentai-downloader", + "name": "NHentai Downloader", + "version": "1.0.1", + "description": "A plugin that enables downloading manga and doujinshi from nhentai.net", + "author": "Twiddly", + "update_url": "https://github.com/happypandax/plugins/tree/master/plugins/NHentai%20Downloader", + "website": "https://github.com/happypandax/plugins/tree/master/plugins/NHentai%20Downloader", + "entry": "main.py", + "test": "test.py", + "require": ["happypandax >= 0.12.0"] +} diff --git a/plugins/NHentai Downloader/main.py b/plugins/NHentai Downloader/main.py new file mode 100644 index 0000000..14bc655 --- /dev/null +++ 
b/plugins/NHentai Downloader/main.py @@ -0,0 +1,152 @@ +# main.py +import __hpx__ as hpx + +from bs4 import BeautifulSoup + +DownloadRequest = hpx.command.DownloadRequest + +log = hpx.get_logger("main") + +IDENTIFIER = "nhentai" +HEADERS = {'user-agent':"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"} +DEFAULT_DELAY = 1.5 + +def website_url_regex_gen(domain, path_regex=None, variable_port=False, variable_tld=False, trailing_slash=True, end=True, trailing_fragment=True): + """ + Generates a regex suitable for a specific domain + """ + rgx = r"^(http\:\/\/|https\:\/\/)?(www\.)?({})".format(domain) + if variable_tld: + rgx += r"\.[a-z]{2,5}" + if variable_port: + rgx += r"(:[0-9]{1,5})?" + if trailing_slash: + rgx += r"\/?" + if path_regex: + rgx += path_regex + if trailing_slash: + rgx += r"\/?" + if trailing_fragment: + rgx += r"(#\S+)?" + if end: + rgx += "$" + return rgx + +@hpx.subscribe("init") +def inited(): + # set default delay if not set + delays = hpx.get_setting("network", "delays", {}) + delay_url = "https://nhentai.net/g/" + if delay_url not in delays: + log.info(f"Setting delay on {delay_url} requests to {DEFAULT_DELAY}") + delays[delay_url] = DEFAULT_DELAY + hpx.update_setting("network", "delays", delays) + +@hpx.attach("Download.info") +def download_info(): + return hpx.command.DownloadInfo( + identifier = IDENTIFIER, + name = "NHentai", + parser = website_url_regex_gen("nhentai.net", path_regex=r"g\/[0-9]{3,10}", trailing_slash=True, variable_tld=False, trailing_fragment=True, end=True), + sites = ("https://nhentai.net",), + description = "Download manga and doujinshi from nhentai.net", + ) + +@hpx.attach("Download.query", trigger=IDENTIFIER) +def download_query(item): + """ + Called to query for resource URLs that should be downloaded. + Note that HPX will handle the actual downloading part. 
+ The attached handler should just return all the URLs that should be downloaded in the form of .:class:`DownloadRequest` objects + + should return: + a tuple of :class:`DownloadRequest` for all the URL resources that should be downloaded. + Note that the download system is recursive, so if the URL resource matches a download handler (the same or a different one), + That handler will be called upon with a new :class:`DownloadItem` for that particular URL + (though only once, meaning, no handler will be called upon again with the exact same URL during a single session) + """ + # prepare request + req_props = hpx.command.RequestProperties( + headers=HEADERS, + ) + req = hpx.command.SingleGETRequest().request(item.url, req_props) + + log.info(f"querying url: {item.url}") + + download_requests = [] + + if req.ok: + log.info("request was successful") + # parse html page + soup = BeautifulSoup(req.text, "html.parser") + + # get gallery information + log.info("parsing gallery info") + info_div = soup.find("div", id="info") + if info_div: + title_el = soup.find("h1", class_="title") + if title_el: + title_name = soup.find("span", class_="pretty") + if title_name: + item.name = str(title_name.string) + log.info(f"found name of gallery: {item.name}") + else: + log.warning("couldn't find gallery info div") + + # get gallery cover url + cover_div = soup.find("div", id="cover") + if cover_div: + cover_img = cover_div.find("img") + if cover_img: + try: + download_requests.append(DownloadRequest(downloaditem=item, url=cover_img['data-src'], is_thumbnail=True)) + except: + log.warning("failed to get cover src") + + # get gallery page urls + thumbs_div = soup.find("div", id="thumbnail-container") + all_links = thumbs_div.findAll("a") + if all_links: + log.info(f"found {len(all_links)} thumbnail links") + for l in all_links: + # collect the urls to the page images + # nhentai has a simple url system where thumbs are stored at + # https://t.nhentai.net/galleries/1498842/2t.jpg + # 
and the real image at + # https://i.nhentai.net/galleries/1498842/2.jpg + url_parts = l.img['data-src'] # img is lazy loaded so src isn't available + if url_parts: + url_parts = url_parts.split('/') + img_id = url_parts[-2] + thumb_number = url_parts[-1] + img_number = thumb_number.replace('t', '') + # construct url for real image + img_url = "https://i.nhentai.net/galleries/{}/{}".format(img_id, img_number) + log.debug(f"final image url parsed to be: {img_url}") + # finally add the url to the list of requests for HPX downloader to take care of the rest + download_requests.append(DownloadRequest(downloaditem=item, url=img_url)) + else: + log.warning("failed to get thumbnail src") + else: + log.warning("couldn't find any thumbnail links") + else: + log.warning("request failed") + + if download_requests: + log.info(f"was able to prepare requests for {len(download_requests)} images") + return tuple(download_requests) + +@hpx.attach("Download.done", trigger=IDENTIFIER) +def download_done(result): + """ + Called when downloading of all :class:`DownloadRequest` for a specific :class:`DownloadItem` has finished. + The handler should do any post-processing here (archive files, rename files or folders, delete extranous files and etc.). + Remember to set the `status` property on the :class:`DownloadResult` object to `False` if the post-processing was a failure. 
+ Note that the handler should *not* import the file into HPX (if it's an item), that part will be taken care of by HPX + + should return: + the same :class:`DownloadResult` that was provided to the handler, potentially modified on the 'path' or `status` and `reason` properties + """ + # there's nothing special to post-process in the case of nhentai downloader, so just return the result as is + log.info(f"download of images was successful for {result.downloaditem.name}") + return result diff --git a/plugins/NHentai Downloader/readme.md b/plugins/NHentai Downloader/readme.md new file mode 100644 index 0000000..f2b1790 --- /dev/null +++ b/plugins/NHentai Downloader/readme.md @@ -0,0 +1,20 @@ +NHentai Downloader +---------------------------- + +> A plugin that enables downloading manga and doujinshi from nhentai.net + +## Configuration + +There's no available config options for this plugin + +## Things yet to be implemented + +- Torrents (waiting for HPX to support this) + +# Changelog + +- `1.0.1` + - fixed an issue where galleries would fail to download because the extracted title was invalid + +- `1.0.0` + - first version \ No newline at end of file diff --git a/plugins/NHentai Downloader/test.py b/plugins/NHentai Downloader/test.py new file mode 100644 index 0000000..36a317e --- /dev/null +++ b/plugins/NHentai Downloader/test.py @@ -0,0 +1 @@ +# test.py \ No newline at end of file diff --git a/plugins/NHentai Metadata/hplugin.json b/plugins/NHentai Metadata/hplugin.json new file mode 100644 index 0000000..78e3ea3 --- /dev/null +++ b/plugins/NHentai Metadata/hplugin.json @@ -0,0 +1,15 @@ +{ + "id": "7d68901f-8cef-4f3c-82b1-6e93f63ba00c", + "shortname": "nhentai-metadata", + "name": "NHentai Metadata", + "version": "1.0.1", + "description": "A plugin that can fetch metadata from nhentai.net", + "author": "Twiddly", + "update_url": "https://github.com/happypandax/plugins/tree/master/plugins/NHentai%20Metadata", + "website": 
"https://github.com/happypandax/plugins/tree/master/plugins/NHentai%20Metadata", + "entry": "main.py", + "test": "test.py", + "require": [ + "happypandax >= 0.10.0" + ] +} \ No newline at end of file diff --git a/plugins/NHentai Metadata/main.py b/plugins/NHentai Metadata/main.py new file mode 100644 index 0000000..74039da --- /dev/null +++ b/plugins/NHentai Metadata/main.py @@ -0,0 +1,497 @@ +# main.py +import __hpx__ as hpx +import regex +import arrow +import datetime +import os +import urllib +import html + +from bs4 import BeautifulSoup +from PIL import Image, ImageChops + +log = hpx.get_logger("main") + +MATCH_URL_PREFIX = r"^(http\:\/\/|https\:\/\/)?(www\.)?" # http:// or https:// + www. +MATCH_URL_END = r"\/?$" + +DEFAULT_DELAY = 1.5 + +IDENTIFIER = "nhentai" + +URLS = { + 'nh': 'https://nhentai.net', + 'title_search': "https://nhentai.net/search/?q={title}" +} + +HEADERS = {'user-agent':"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"} + +PLUGIN_CONFIG = { + 'filename_search': False, # use the filename/folder-name for searching instead of gallery title + 'remove_namespaces': True, # remove superfluous namespaces like 'artists', 'languages' and 'groups' because they are handled specially in HPX + 'gallery_results_limit': 10, # maximum amount of galleries to return + 'blacklist_tags': [], # tags to ignore when updating tags + 'add_gallery_url': True, # add nhentai url to gallery + 'preferred_language': "english", # preferred gallery langauge (in gallery title) to extract from if multiple galleries were found, set empty string for default + 'search_query': "{title}", # the search query, '{title}' will be replaced with the gallery title, use double curly brackets to escape a bracket +} + +@hpx.subscribe("init") +def inited(): + PLUGIN_CONFIG.update(hpx.get_plugin_config()) + + # set default delay values if not set + delays = hpx.get_setting("network", "delays", {}) + for u in (URLS['nh'],): + if u not in delays: + log.info(f"Setting delay on 
{u} requests to {DEFAULT_DELAY}") + delays[u] = DEFAULT_DELAY + hpx.update_setting("network", "delays", delays) + +@hpx.subscribe('config_update') +def config_update(cfg): + PLUGIN_CONFIG.update(cfg) + +@hpx.attach("Metadata.info") +def metadata_info(): + return hpx.command.MetadataInfo( + identifier = IDENTIFIER, + name = "nhentai", + parser = MATCH_URL_PREFIX + r"(nhentai\.net\/g\/[0-9]{3,10})" + MATCH_URL_END, + sites = ("https://nhentai.net",), + description = "Fetch metadata from nhentai.net", + models = ( + hpx.command.GetDatabaseModel("Gallery"), + ) + ) + +@hpx.attach("Metadata.query", trigger=IDENTIFIER) +def query(itemtuple): + """ + Called to query for candidates to extract metadata from. + Note that HPX will handle choosing which candidates to extract data from. + The attached handler should just return all the candidates found. + """ + log.info("Querying nhentai for metadata") + mdata = [] + for mitem in itemtuple: + item = mitem.item + url = mitem.url + gurls = [] # tuple of (title, url) + # url was provided + if url: + log.info(f"url provided: {url} for {item}") + gurls.append((url, url)) + else: # manually search for id + log.info(f"url not provided for {item}") + # search with title + i_title = "" + if PLUGIN_CONFIG.get("filename_search"): + sources = item.get_sources() + if sources: + # get folder/file name + i_title = os.path.split(sources[0])[1] + # remove ext + i_title = os.path.splitext(i_title)[0] + else: + if item.titles: + i_title = item.titles[0].name # make user choice? 
+ if i_title: + gurls = title_search(i_title) + + log.info(f"found {len(gurls)} urls for item: {item}") + + # list is sorted by date added so we reverse it + gurls.reverse() + + log.debug(f"{gurls}") + final_gurls = [] + pref_lang = PLUGIN_CONFIG.get('preferred_language') + if pref_lang: + for t in gurls: + if pref_lang.lower() in t[0].lower(): + final_gurls.insert(0, t) + continue + final_gurls.append(t) + else: + final_gurls = gurls + + for t, u in final_gurls: + g_id = parse_url(u) + if g_id: + mdata.append(hpx.command.MetadataData( + metadataitem = mitem, + title=t, + url=u, + data={ + 'id': g_id, + 'gallery_url': u, + })) + return tuple(mdata) + +@hpx.attach("Metadata.apply", trigger=IDENTIFIER) +def apply(datatuple): + """ + Called to fetch and apply metadata to the given data items. + Remember to set the `status` property on the :class:`MetadataResult` object to `True` on a successful fetch. + """ + log.info("Applying metadata from nhentai") + mresult = [] + + for mdata in datatuple: + applied = False + # prepare request + req_props = hpx.command.RequestProperties( + headers=HEADERS, + ) + + gallery_url = mdata.data['gallery_url'] + + r = hpx.command.SingleGETRequest().request(gallery_url, req_props) + if r.ok: + response = r.text + if response and not '404 – Not Found' in response: + filtered_data = format_metadata(response, mdata.item, apply_url=PLUGIN_CONFIG.get('add_gallery_url', True), gallery_url=gallery_url) + applied = apply_metadata(filtered_data, mdata.item, mdata.options) + elif response: + log.debug(response) + mresult.append(hpx.command.MetadataResult(data=mdata, status=applied)) + log.info(f"Applied: {applied}") + else: + log.warning(f"Request returned bad status: {r.status_code}") + return tuple(mresult) + +def title_search(title, _times=0): + "Searches on nhentai for galleries with given title, returns a list of (title, matching gallery urls)" + search_url = URLS['title_search'] + log.debug(f"searching with title: {title}") + + sq = 
PLUGIN_CONFIG.get("search_query") + try: + sq = sq.format(title=title) + except: + log.warning("Failed to use customized search query") + sq = title + + log.info(f"Final search query: {sq}") + + f_url = search_url.format( + title=urllib.parse.quote_plus(sq) + ) + + log.debug(f"final url: {f_url}") + + r = page_results(f_url) + + if not r and not _times: + title = regex.sub(r"\(.+?\)|\[.+?\]", "", title) + title = " ".join(title.split()) + r = title_search(title, _times=_times+1) + return r + +def page_results(page_url, limit=None): + "Opens nhentai page, parses for results, and then returns list of (title, url)" + found_urls = [] # title, url + if limit is None: + limit = PLUGIN_CONFIG.get("gallery_results_limit") + + # prepare request + req_props = hpx.command.RequestProperties( + headers=HEADERS, + ) + r = hpx.command.SingleGETRequest().request(page_url, req_props) + r.raise_for_status() + soup = BeautifulSoup(r.text, "html.parser") + results = soup.findAll("div", class_="gallery", limit=limit) + for x in results: + # str(x.a.string) + t = "" + cap = x.find("div", class_="caption") + if cap: + t = str(cap.string) + u = URLS['nh'] + x.a['href'] + found_urls.append((t or u, u)) + + if not found_urls: + log.warning(f"No results found on url: {page_url}") + log.debug(f"HTML: {r.text}") + return found_urls + +def parse_url(url): + "Extracts the gallery id from url" + gallery_id = None + + gallery_id_token = regex.search('(?<=g/)([0-9]+)', url) + if gallery_id_token: + gallery_id = gallery_id_token.group() + else: + log.warning("Error extracting gallery id from url: {}".format(url)) + return gallery_id + + +def capitalize_text(text): + """ + better str.capitalize + """ + return " ".join(x.capitalize() for x in text.strip().split()) + +def format_metadata(text, item, apply_url=False, gallery_url=None): + """ + Formats metadata to look like this for apply_metadata: + data = { + 'titles': None, # [(title, language),...] 
+ 'artists': None, # [(artist, (circle, circle, ..)),...] + 'parodies': None, # [parody, ...] + 'category': None, + 'tags': None, # [tag, tag, tag, ..] or {ns:[tag, tag, tag, ...]} + 'pub_date': None, # DateTime object or Arrow object + 'language': None, + 'urls': None # [url, ...] + } + """ + mdata = {} + + soup = BeautifulSoup(text, "html.parser") + info_div = soup.find("div", id="info") + if info_div: + + mdata['titles'] = [] + + parsed_text = None + eng_title = info_div.find("h1") + if eng_title: + eng_title = str(eng_title.text) + parsed_text = hpx.command.ItemTextParser(eng_title) + + parsed_title = parsed_text.extract_title() + if parsed_title: + parsed_title = parsed_title[0] + + mdata['titles'].append((parsed_title or eng_title, 'english')) + + jp_title = info_div.find("h2") + if jp_title: + mdata['titles'].append((str(jp_title.text), 'japanese')) + + parsed_artists = parsed_text.extract_artist() if parsed_text else [] + parsed_circles = parsed_text.extract_circle() if parsed_text else [] + + artists = set() + circles = set() + parodies = set() + + lang = "japanese" # default language + + tags_containers = info_div.find("section", id="tags") + if tags_containers: + extranous_namespaces = ("artists", "categories", "parodies", "groups", "languages") + blacklist_tags = [x.lower() for x in PLUGIN_CONFIG.get("blacklist_tags")] + for tag_container in tags_containers.findAll("div", class_="tag-container"): + ns = list(tag_container.stripped_strings)[0] + if not ns: + continue + ns = ns[:-1] # remove colon + ns = ns.lower() + tags = [list(x.stripped_strings)[0] for x in tag_container.findAll("a", class_="tag")] + + nstag = lambda t: ns + ':' + t + + if ns == "artists": + for t in tags: + if blacklist_tags and nstag(t) in blacklist_tags: + continue + for a in parsed_artists: # the artist extracted from the title likely has better capitalization, so choose that instead + if a.lower() == t.lower(): + artists.add(a) + break + else: + artists.add(t) + elif ns == 
"groups": + for t in tags: + if blacklist_tags and nstag(t) in blacklist_tags: + continue + for a in parsed_circles: # the circle extracted from the title likely has better capitalization, so choose that instead + if a.lower() == t.lower(): + circles.add(a) + break + else: + circles.add(t) + elif ns == "parodies": + for t in tags: + if blacklist_tags and nstag(t) in blacklist_tags: + continue + parodies.add(t) + elif ns == "categories": + t = tags[0] # only supports one + if not (blacklist_tags and nstag(t) in blacklist_tags): + mdata['category'] = capitalize_text(t) + elif ns == "languages": + for t in tags: + if blacklist_tags and nstag(t) in blacklist_tags: + continue + if t in ('translated'): + continue + lang = t # only supports one + + if PLUGIN_CONFIG.get("remove_namespaces") and ns in extranous_namespaces: + if ns == 'languages': # keep other tags + tags = [x for x in tags if x != lang] + else: + continue + + # add rest as tags + if tags: + mdata.setdefault('tags', {}) + for t in tags: + if blacklist_tags and nstag(t) in blacklist_tags: + continue + if ns == 'tags': + mdata['tags'].setdefault(None, []).append(t) + else: + mdata['tags'].setdefault(ns, []).append(t) + + mdata['language'] = lang + + if not artists: + artists.union(set(parsed_artists)) + if not circles: + circles.union(set(parsed_circles)) + + if parodies: + mdata['parodies'] = parodies + + if artists: + a_circles = [] + for a in artists: + a_circles.append((a, tuple(circles))) # assign circles to each artist + mdata['artists'] = a_circles + + if apply_url: + mdata['urls'] = [gallery_url] + + log.debug(f"formatted data: {mdata}") + + return mdata + +GalleryData = hpx.command.GalleryData +LanguageData = hpx.command.LanguageData +TitleData = hpx.command.TitleData +ArtistData = hpx.command.ArtistData +ArtistNameData = hpx.command.ArtistNameData +ParodyData = hpx.command.ParodyData +ParodyNameData = hpx.command.ParodyNameData +CircleData = hpx.command.CircleData +CategoryData = 
hpx.command.CategoryData +UrlData = hpx.command.UrlData +NamespaceTagData= hpx.command.NamespaceTagData +TagData= hpx.command.TagData +NamespaceData = hpx.command.NamespaceData + +def apply_metadata(data, gallery, options): + """ + data = { + 'titles': None, # [(title, language),...] + 'artists': None, # [(artist, (circle, circle, ..)),...] + 'parodies': None, # [parody, ...] + 'category': None, + 'tags': None, # [tag, tag, tag, ..] or {ns:[tag, tag, tag, ...]} + 'pub_date': None, # DateTime object or Arrow object + 'language': None, + 'urls': None # [url, ...] + } + """ + + log.debug(f"data: {data}") + + gdata = GalleryData() + + if isinstance(data.get('titles'), (list, tuple, set)): + gtitles = [] + for t, l in data['titles']: + gtitle = None + if t: + t = html.unescape(t) + gtitle = TitleData(name=t) + if t and l: + gtitle.language = LanguageData(name=l) + if gtitle: + gtitles.append(gtitle) + + if gtitles: + gdata.titles = gtitles + log.debug("applied titles") + + if isinstance(data.get('artists'), (list, tuple, set)): + gartists = [] + for a, c in data['artists']: + if a: + gartist = ArtistData(names=[ArtistNameData(name=capitalize_text(a))]) + gartists.append(gartist) + + if c: + gcircles = [] + for circlename in [x for x in c if x]: + gcircles.append(CircleData(name=capitalize_text(circlename))) + gartist.circles = gcircles + + if gartists: + gdata.artists = gartists + log.debug("applied artists") + + if isinstance(data.get('parodies'), (list, tuple, set)): + gparodies = [] + for p in data['parodies']: + if p: + gparody = ParodyData(names=[ParodyNameData(name=capitalize_text(p))]) + gparodies.append(gparody) + + if gparodies: + gdata.parodies = gparodies + log.debug("applied parodies") + + if data.get('category'): + gdata.category = CategoryData(name=data['category']) + log.debug("applied category") + + if data.get('language'): + gdata.language = LanguageData(name=data['language']) + log.debug("applied language") + + if isinstance(data.get('tags'), (dict, 
list)): + if isinstance(data['tags'], list): + data['tags'] = {None: data['tags']} + gnstags = [] + for ns, tags in data['tags'].items(): + if ns is not None: + ns = ns.strip() + for t in tags: + t = t.strip() + if t: + kw = {'tag': TagData(name=t)} + if ns: + kw['namespace'] = NamespaceData(name=ns) + gnstags.append(NamespaceTagData(**kw)) + + if gnstags: + gdata.tags = gnstags + log.debug("applied tags") + + if isinstance(data.get('pub_date'), (datetime.datetime, arrow.Arrow)): + pub_date = data['pub_date'] + gdata.pub_date = pub_date + log.debug("applied pub_date") + + if isinstance(data.get('urls'), (list, tuple)): + gurls = [] + for u in data['urls']: + if u: + gurls.append(UrlData(name=u)) + if gurls: + gdata.urls = gurls + log.debug("applied urls") + + applied = hpx.command.UpdateItemData(gallery, gdata, options=options) + + log.debug(f"applied: {applied}") + + return applied \ No newline at end of file diff --git a/plugins/NHentai Metadata/readme.md b/plugins/NHentai Metadata/readme.md new file mode 100644 index 0000000..fdf7f71 --- /dev/null +++ b/plugins/NHentai Metadata/readme.md @@ -0,0 +1,38 @@ +NHentai Metadata +---------------------------- + +> This plugin fetches metadata from nhentai.net + +## Configuration + +Configure this plugin by adding `nhentai-metadata` to the `plugin.config` namespace in your `config.yaml`: +```yaml +plugin: + config: + nhentai-metadata: + option1: True + option2: + - item 1 + - item 2 +``` + +#### Available options + +Name | Default | Description +--- | --- | --- +`filename_search` | `false` | use the filename/folder-name for searching instead of gallery title +`remove_namespaces` | `true` | remove superfluous namespaces like 'artists', 'languages' and 'groups' and so on because they are handled specially in HPX +`gallery_results_limit` | `10` | maximum amount of galleries to return +`blacklist_tags` | `[]` | tags to ignore when updating tags, a list of `namespace:tag` strings +`add_gallery_url` | `true` | add nhentai url 
to gallery +`preferred_language` | `"english"` | preferred gallery language (in gallery title) to extract from if multiple galleries were found, set empty string for default +`search_query` | `"{title}"` | the search query, '{title}' will be replaced with the gallery title, use double curly brackets to escape a curly bracket. Tip: if you want to only allow english results, you should modify this to "{title} language:english" + + +# Changelog + +- `1.0.1` + - updated to reflect site changes where titles were not getting extracted + +- `1.0.0` + - first version \ No newline at end of file diff --git a/plugins/NHentai Metadata/test.py b/plugins/NHentai Metadata/test.py new file mode 100644 index 0000000..36a317e --- /dev/null +++ b/plugins/NHentai Metadata/test.py @@ -0,0 +1 @@ +# test.py \ No newline at end of file diff --git a/version/File Metadata/common.py b/version/File Metadata/common.py deleted file mode 100644 index 4a065c9..0000000 --- a/version/File Metadata/common.py +++ /dev/null @@ -1,69 +0,0 @@ -import __hpx__ as hpx -import enum -import json - -log = hpx.get_logger(__name__) - -class DataType(enum.Enum): - eze = 1 - hdoujin = 2 - -filetypes = ('.json', '.txt') -filenames = { - "info.json": (DataType.eze, DataType.hdoujin), - "info.txt": (DataType.hdoujin,) - } - -common_data = { - 'titles': None, # [(title, language),...] - 'artists': None, # [(artist, (circle, circle, ..)),...] - 'category': None, - 'tags': None, # [tag, tag, tag, ..] or {ns:[tag, tag, tag, ...]} - 'pub_date': None, # DateTime object - 'language': None, - 'urls': None # [url, ...] 
-} - -extractors = {} - -def capitalize_text(text): - """ - better str.capitalize - """ - return " ".join(x.capitalize() for x in text.strip().split()) - -def register_extractor(cls, type): - assert issubclass(cls, Extractor) - assert isinstance(type, DataType) - extractors[type] = cls() - -class Extractor: - """ - """ - - def file_to_dict(self, fs: hpx.command.CoreFS) -> dict: - """ - """ - d = {} - log.debug(f"File ext: {fs.ext}") - kw = {} - if not fs.inside_archive: - kw['encoding'] = 'utf-8' - if fs.ext.lower() == '.json': - with fs.open("r", **kw) as f: - d = json.load(f) - elif fs.ext.lower() == '.txt': - with fs.open("r", **kw) as f: - for line in f.readlines(): - l = line.strip() - k, v = l.split(':', 1) - if k.strip(): - d[k.strip()] = v.strip() - else: - raise NotImplementedError(f"{fs.ext} filetype not supported yet") - return d - - def extract(self, filedata: dict) -> dict: - """ - """ - raise NotImplementedError \ No newline at end of file diff --git a/version/File Metadata/hplugin.json b/version/File Metadata/hplugin.json deleted file mode 100644 index 3d9b165..0000000 --- a/version/File Metadata/hplugin.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "id": "e38e24e4-8ca8-420e-b52b-c75510097653", - "shortname": "file-metadata", - "name": "File Metadata", - "version": "0.1.0b", - "description": "Extracts and applies metadata from a file accompanying a gallery. 
Supports files produced from eze and hdoujin", - "author": "Twiddly", - "website": "https://github.com/happypandax/plugins", - "entry": "main.py", - "require": [ - "happypandax >= 0.1.2" - ] -} \ No newline at end of file diff --git a/version/File Metadata/main.py b/version/File Metadata/main.py deleted file mode 100644 index e20214b..0000000 --- a/version/File Metadata/main.py +++ /dev/null @@ -1,127 +0,0 @@ -import __hpx__ as hpx -import os -import arrow -import datetime -import common -import extractors - -log = hpx.get_logger(__name__) - -options = { -} - -def get_common_data(datatypes, fpath): - assert isinstance(datatypes, tuple) - d = {} - fpath = hpx.command.CoreFS(fpath) - - for datatype in datatypes: - md = {} - - ex = common.extractors.get(datatype, None) - if ex: - fdata = ex.file_to_dict(fpath) - if fdata: - md.update(ex.extract(fdata)) - if md: - d.update(md) - break - return d - -language_model = hpx.command.GetModelClass("Language") -title_model = hpx.command.GetModelClass("Title") -artist_model = hpx.command.GetModelClass("Artist") -circle_model = hpx.command.GetModelClass("Circle") -category_model = hpx.command.GetModelClass("Category") -artistname_model = hpx.command.GetModelClass("ArtistName") -url_model = hpx.command.GetModelClass("Url") -namespacetags_model = hpx.command.GetModelClass("NamespaceTags") - -def apply_metadata(data, gallery): - applied = False - - log.debug("data:") - log.debug(f"{data}") - - if isinstance(data['titles'], (list, tuple)): - for t, l in data['titles']: - if t: - gtitle = title_model(name=t) - gallery.titles.append(gtitle) - if t and l: - gtitle.language = language_model.as_unique(name=l) - applied = True - - if isinstance(data['artists'], (list, tuple)): - for a, c in data['artists']: - if a: - gartist = artist_model.as_unique(name=a) - if not gartist in gallery.artists: - gallery.artists.append(gartist) - if a and c: - for circlename in [x for x in c if x]: - gcircle = circle_model.as_unique(name=circlename) - if 
not gcircle in gartist.circles: - gartist.circles.append(gcircle) - applied = True - - if data['category']: - gcat = category_model.as_unique(name=data['category']) - gallery.category = gcat - applied = True - - if data['language']: - glang = language_model.as_unique(name=data['language']) - gallery.language = glang - applied = True - - if isinstance(data['tags'], (dict, list)): - if isinstance(data['tags'], list): - data['tags'] = {None: data['tags']} - ns_tags = [] - for ns, tags in data['tags'].items(): - if ns is not None: - ns = ns.strip() - if ns and ns.lower() == 'misc': - ns = None - for t in tags: - t = t.strip() - ns_tags.append(namespacetags_model.as_unique(ns=ns, tag=t)) - - for nstag in ns_tags: - if not nstag in gallery.tags: - gallery.tags.append(nstag) - applied = True - - if isinstance(data['pub_date'], (datetime.datetime, arrow.Arrow)): - pub_date = data['pub_date'] - if isinstance(pub_date, datetime.datetime): - pub_date = arrow.Arrow.fromdatetime(pub_date) - gallery.pub_date = pub_date - applied = True - - if isinstance(data['urls'], (list, tuple)): - for u in data['urls']: - gallery.urls.append(url_model(name=u)) - applied = True - - return applied - -@hpx.attach("GalleryFS.parse_metadata_file") -def parse(path, gallery): - fs = hpx.command.CoreFS(path) - - contents = {x: os.path.split(x)[1].lower() for x in fs.contents(corefs=False) if x.lower().endswith(common.filetypes)} - log.debug(f"Contents for {fs.path}:") - log.debug(f"{tuple(contents.values())}") - - cdata = common.common_data.copy() - - for fnames, dtypes in common.filenames.items(): - for fpath, fname in contents.items(): - if fname in fnames: - log.debug(f"path: {fpath}") - cdata.update(get_common_data(dtypes, fpath)) - break - - return apply_metadata(cdata, gallery) \ No newline at end of file