diff --git a/README.md b/README.md index 9cf3c50..ce5f5e1 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,29 @@ proxy = FreeProxy(country_id=['US', 'BR'], timeout=0.3, rand=True).get() If there are no working proxies with provided parameters script raises `FreeProxyException` with `There are no working proxies at this time.` message. +- **Async usage** +``` +import asyncio +from fp import FreeProxy + +async def get_with_params(): + proxy = FreeProxy(country_id=['US'], timeout=1.0, rand=True) + working_proxy = await proxy.get() + print(f"Working proxy: {working_proxy}") + +# or + +async def get_with_params(): + working_proxy = await FreeProxy().get() + print(f"Working proxy: {working_proxy}") + + +asyncio.run(get_with_params()) + + +``` + + ## CHANGELOG --- diff --git a/fp/.gitignore b/fp/.gitignore new file mode 100644 index 0000000..3ca93a1 --- /dev/null +++ b/fp/.gitignore @@ -0,0 +1,169 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests +.idea/ +.vscode/ diff --git a/fp/fp.py b/fp/fp.py index 7614665..e3f73b0 100644 --- a/fp/fp.py +++ b/fp/fp.py @@ -1,22 +1,23 @@ #!/usr/bin/env python3 +import asyncio import random +import re +import aiohttp import lxml.html as lh -import requests - from fp.errors import FreeProxyException class FreeProxy: - ''' + """ FreeProxy class scrapes proxies from , , , - and and checks if proxy is working. - There is possibility to filter proxies by country and acceptable timeout. - You can also randomize list of proxies from where script would get first + and and checks if proxy is working. + There is possibility to filter proxies by country and acceptable timeout. + You can also randomize list of proxies from where script would get first working proxy. 
- ''' + """ def __init__(self, country_id=None, timeout=0.5, rand=False, anonym=False, elite=False, google=None, https=False): self.country_id = country_id @@ -27,17 +28,21 @@ def __init__(self, country_id=None, timeout=0.5, rand=False, anonym=False, elite self.google = google self.schema = 'https' if https else 'http' - def get_proxy_list(self, repeat): + async def get_proxy_list(self, repeat): try: - page = requests.get(self.__website(repeat)) - doc = lh.fromstring(page.content) - except requests.exceptions.RequestException as e: - raise FreeProxyException( - f'Request to {self.__website(repeat)} failed') from e + async with aiohttp.ClientSession() as session: + async with session.get(self.__website(repeat)) as response: + response.raise_for_status() + content = await response.text() + doc = lh.fromstring(content) + except aiohttp.ClientError as e: + raise FreeProxyException(f'Request to {self.__website(repeat)} failed') from e + try: tr_elements = doc.xpath('//*[@id="list"]//tr') - return [f'{tr_elements[i][0].text_content()}:{tr_elements[i][1].text_content()}' - for i in range(1, len(tr_elements)) if self.__criteria(tr_elements[i])] + proxies = [f'{tr_elements[i][0].text_content()}:{tr_elements[i][1].text_content()}' + for i in range(1, len(tr_elements)) if self.__criteria(tr_elements[i])] + return proxies except Exception as e: raise FreeProxyException('Failed to get list of proxies') from e @@ -57,7 +62,7 @@ def __criteria(self, row_elements): elite_criteria = True if not self.elite else 'elite' in row_elements[4].text_content( ) anonym_criteria = True if ( - not self.anonym) or self.elite else 'anonymous' == row_elements[4].text_content() + not self.anonym) or self.elite else 'anonymous' == row_elements[4].text_content() switch = {'yes': True, 'no': False} google_criteria = True if self.google is None else self.google == switch.get( row_elements[5].text_content()) @@ -65,31 +70,42 @@ def __criteria(self, row_elements): ).lower() == 'yes' return 
country_criteria and elite_criteria and anonym_criteria and google_criteria and https_criteria - def get(self, repeat=False): - '''Returns a working proxy that matches the specified parameters.''' - proxy_list = self.get_proxy_list(repeat) + async def get(self, repeat=False): + """Returns a working proxy that matches the specified parameters.""" + proxy_list = await self.get_proxy_list(repeat) if self.random: random.shuffle(proxy_list) working_proxy = None - for proxy_address in proxy_list: - proxies = {self.schema: f'http://{proxy_address}'} - try: - working_proxy = self.__check_if_proxy_is_working(proxies) - if working_proxy: - return working_proxy - except requests.exceptions.RequestException: - continue + async with aiohttp.ClientSession() as session: + for proxy_address in proxy_list: + proxies = {self.schema: f'http://{proxy_address}'} + try: + working_proxy = await self.__check_if_proxy_is_working(session, proxies) + if working_proxy: + return working_proxy + except aiohttp.ClientError: + continue if not working_proxy and not repeat: if self.country_id is not None: self.country_id = None - return self.get(repeat=True) - raise FreeProxyException( - 'There are no working proxies at this time.') + return await self.get(repeat=True) + raise FreeProxyException('There are no working proxies at this time.') - def __check_if_proxy_is_working(self, proxies): + async def __check_if_proxy_is_working(self, session, proxies): url = f'{self.schema}://www.google.com' - ip = proxies[self.schema].split(':')[1][2:] - with requests.get(url, proxies=proxies, timeout=self.timeout, stream=True) as r: - if r.raw.connection.sock and r.raw.connection.sock.getpeername()[0] == ip: - return proxies[self.schema] - return + try: + async with session.get(url, proxy=proxies[self.schema], timeout=self.timeout) as response: + if response.status == 200: + if response.connection: + pattern = r"URL\('(.+?)'\)" + match = re.search(pattern, str(response.connection)) + if match: + return 
proxies[self.schema] + else: + pass + except asyncio.TimeoutError: + pass + except aiohttp.ClientError: + pass + + return None diff --git a/requirements.txt b/requirements.txt index e295a70..092d12d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,13 @@ -lxml==4.9.1 -pip-chill==0.1.8 -requests==2.21.0 +aiohttp==3.8.4 +aiosignal==1.3.1 +async-timeout==4.0.2 +attrs==23.1.0 +certifi==2023.5.7 +charset-normalizer==3.1.0 +frozenlist==1.3.3 +idna==3.4 +lxml==4.9.2 +multidict==6.0.4 +soupsieve==2.4.1 +urllib3==2.0.2 +yarl==1.9.2