Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions gnews/gnews.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

class GNews:
def __init__(self, language="en", country="US", max_results=100, period=None, start_date=None, end_date=None,
exclude_websites=None, proxy=None):
exclude_websites=None, proxy=None, des_keep_url=False):
"""
Initialize the GNews client with configuration options.

Expand All @@ -35,6 +35,8 @@ def __init__(self, language="en", country="US", max_results=100, period=None, st
:param end_date: Date before which results must have been published
:param exclude_websites: List of websites to exclude from results
:param proxy: Proxy settings as a dict {protocol: address}
:param des_keep_url: Flag to control whether to preserve additional links in the description.

"""
self.countries = tuple(AVAILABLE_COUNTRIES),
self.languages = tuple(AVAILABLE_LANGUAGES),
Expand All @@ -52,6 +54,7 @@ def __init__(self, language="en", country="US", max_results=100, period=None, st
self.start_date = start_date
self._exclude_websites = exclude_websites if exclude_websites and isinstance(exclude_websites, list) else []
self._proxy = proxy if proxy else None
self._des_keep_url = des_keep_url

def _ceid(self):
time_query = ''
Expand Down Expand Up @@ -184,13 +187,32 @@ def _clean(html):
text = text.replace('\xa0', ' ')
return text

@staticmethod
def _clean_keep_url(html):
    """
    Collect the anchors found in an HTML fragment as title/url pairs.

    :param html: HTML markup to parse
    :return: List with one dict per <a> tag, in the order the parser yields
             them. Each dict has a "title" (the anchor's text with
             surrounding whitespace stripped) and a "url" (the anchor's
             href attribute, or "" when the tag has no href)
    """
    anchors = Soup(html, "html.parser").find_all("a")
    return [
        {"title": anchor.get_text(strip=True), "url": anchor.get("href", "")}
        for anchor in anchors
    ]

def _process(self, item):
url = process_url(item, self._exclude_websites, self._proxy)
if url:

if self._des_keep_url:
des = self._clean(item.get("description", ""))
else:
des = self._clean(item.get("description", ""))

title = item.get("title", "")
item = {
'title': title,
'description': self._clean(item.get("description", "")),
'description': des,
'published date': item.get("published", ""),
'url': url,
'publisher': item.get("source", " ")
Expand Down