From 62c95825097b0ce722e51992a7522b1c88d87f44 Mon Sep 17 00:00:00 2001
From: luminos
Date: Sun, 6 Feb 2022 12:35:56 +0100
Subject: [PATCH 1/4] * Added support for fetching news for multiple
 geolocations

* Added support for fetching news for multiple topics
* Added the ability to sort news by publish date
* Added the ability to get only the news published within the last X hours
---
 README.md                |  15 ++++
 pygooglenews/__init__.py | 151 +++++++++++++++++++++++++++++++++------
 2 files changed, 145 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 34d0c84..7b10f5c 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,13 @@ business = gn.topic_headlines('business')
 
 ```
 
+### **Stories by Multiple Topics**
+
+```python
+business = gn.topic_multiple_headlines(['business', 'world', 'nation'])
+
+```
+
 ### **Geolocation Specific Stories**
 
 ```python
@@ -120,6 +127,14 @@ headquaters = gn.geo_headlines('San Fran')
 
 ```
 
+
+### **Stories by Multiple Geolocations**
+
+```python
+headquarters = gn.geo_multiple_headlines(['Rome', 'Milan', 'Turin'])
+
+```
+
 ### **Stories by a Query Search**
 
 ```python
diff --git a/pygooglenews/__init__.py b/pygooglenews/__init__.py
index 244c927..8f53221 100644
--- a/pygooglenews/__init__.py
+++ b/pygooglenews/__init__.py
@@ -1,13 +1,20 @@
+import time
+
 import feedparser
 from bs4 import BeautifulSoup
 import urllib
 from dateparser import parse as parse_date
 import requests
+from datetime import datetime
+from datetime import timedelta
+from time import mktime
+import types
+from typing import List
 
 
 class GoogleNews:
-    def __init__(self, lang = 'en', country = 'US'):
+    def __init__(self, lang='en', country='US'):
         self.lang = lang.lower()
         self.country = country.upper()
         self.BASE_URL = 'https://news.google.com/rss'
 
@@ -32,7 +39,7 @@ def __top_news_parser(self, text):
 
     def __ceid(self):
         """Compile correct country-lang parameters for Google News RSS URL"""
-        return '?ceid={}:{}&hl={}&gl={}'.format(self.country,self.lang,self.lang,self.country)
+        return '?ceid={}:{}&hl={}&gl={}'.format(self.country, self.lang, self.lang, self.country)
 
     def __add_sub_articles(self, entries):
         for i, val in enumerate(entries):
@@ -54,26 +61,25 @@ def __scaping_bee_request(self, api_key, url):
         if response.status_code == 200:
             return response
         if response.status_code != 200:
-             raise Exception("ScrapingBee status_code: " + str(response.status_code) + " " + response.text)
+            raise Exception("ScrapingBee status_code: " + str(response.status_code) + " " + response.text)
 
-    def __parse_feed(self, feed_url, proxies=None, scraping_bee = None):
+    def __parse_feed(self, feed_url, proxies=None, scraping_bee=None):
 
         if scraping_bee and proxies:
            raise Exception("Pick either ScrapingBee or proxies. Not both!")
Not both!") if proxies: - r = requests.get(feed_url, proxies = proxies) + r = requests.get(feed_url, proxies=proxies) else: r = requests.get(feed_url) if scraping_bee: - r = self.__scaping_bee_request(url = feed_url, api_key = scraping_bee) + r = self.__scaping_bee_request(url=feed_url, api_key=scraping_bee) else: r = requests.get(feed_url) - if 'https://news.google.com/rss/unsupported' in r.url: - raise Exception('This feed is not available') + raise Exception('This feed is not available: ' + r.url) d = feedparser.parse(r.text) @@ -92,40 +98,142 @@ def __from_to_helper(self, validate=None): except: raise Exception('Could not parse your date') - - - def top_news(self, proxies=None, scraping_bee = None): + def top_news(self, proxies=None, scraping_bee=None): """Return a list of all articles from the main page of Google News given a country and a language""" d = self.__parse_feed(self.BASE_URL + self.__ceid(), proxies=proxies, scraping_bee=scraping_bee) d['entries'] = self.__add_sub_articles(d['entries']) return d - def topic_headlines(self, topic: str, proxies=None, scraping_bee=None): + def topic_headlines(self, topic: str, proxies=None, hour_span: int = None, sort_by_publish_date: bool = True, + scraping_bee=None): """Return a list of all articles from the topic page of Google News given a country and a language""" - #topic = topic.upper() - if topic.upper() in ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', 'HEALTH']: - d = self.__parse_feed(self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee) + # topic = topic.upper() + d = {'entries': []} + if topic.upper() in ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', + 'HEALTH']: + t = self.__parse_feed( + self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(), + proxies=proxies, scraping_bee=scraping_bee) + else: + t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies, + scraping_bee=scraping_bee) + d['feed'] = t['feed'] + if hour_span is not None: + d['entries'] += [ta for ta in t['entries'] if + datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp( + mktime(ta['published_parsed']))] + else: + d['entries'] += t['entries'] + d['entries'] = self.__add_sub_articles(d['entries']) + + if sort_by_publish_date: + d['entries'] = sorted(d['entries'], + key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(), + reverse=True) + + if len(d['entries']) > 0: + return d else: - d = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee) + raise Exception('unsupported topic') + + def topic_multiple_headlines(self, topic_list: List[str], hour_span: int = None, sort_by_publish_date: bool = True, + proxies=None, scraping_bee=None): + """Return a list of all articles from the list of topic page of Google News + given a country and a language""" + + d = {'entries': []} + for topic in topic_list: + try: + if topic.upper() in ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', + 'HEALTH']: + t = self.__parse_feed( + self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(), + proxies=proxies, scraping_bee=scraping_bee) + else: + t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies, + scraping_bee=scraping_bee) + d['feed'] = t['feed'] + if 
+                    d['entries'] += [ta for ta in t['entries'] if
+                                     datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                         mktime(ta['published_parsed']))]
+                else:
+                    d['entries'] += t['entries']
+            except Exception as e:
+                pass
         d['entries'] = self.__add_sub_articles(d['entries'])
 
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
+
         if len(d['entries']) > 0:
             return d
         else:
             raise Exception('unsupported topic')
 
-    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None):
+    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, hour_span: int = None,
+                      sort_by_publish_date: bool = True):
         """Return a list of all articles about a specific geolocation
         given a country and a language"""
-        d = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(geo) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee)
+        d = {'entries': []}
+        t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(geo) + self.__ceid(),
+                              proxies=proxies, scraping_bee=scraping_bee)
+        d['feed'] = t['feed']
+        if hour_span is not None:
+            d['entries'] += [ta for ta in t['entries'] if
+                             datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                 mktime(ta['published_parsed']))]
+        else:
+            d['entries'] += t['entries']
+
+        d['entries'] = self.__add_sub_articles(d['entries'])
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
+        return d
+
+    def geo_multiple_headlines(self, geo: List[str], hour_span: int = None, sort_by_publish_date: bool = True,
+                               proxies=None, scraping_bee=None):
+        """Return a list of all articles about multiple geolocations
+        given a country and a language"""
+
+        d = {'entries': []}
+        for n in geo:
+            try:
+                t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(n) + self.__ceid(),
+                                      proxies=proxies, scraping_bee=scraping_bee)
+                d['feed'] = t['feed']
+                if hour_span is not None:
+                    d['entries'] += [ta for ta in t['entries'] if
+                                     datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                         mktime(ta['published_parsed']))]
+                else:
+                    d['entries'] += t['entries']
+            except Exception as e:
+                pass
         d['entries'] = self.__add_sub_articles(d['entries'])
+
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
+
+        # Remove duplicate entries, keeping the first occurrence of each title
+        res = []
+        for da in d['entries']:
+            if da['title'] not in [re['title'] for re in res]:
+                res += [da]
+        d['entries'] = res
         return d
 
-    def search(self, query: str, helper = True, when = None, from_ = None, to_ = None, proxies=None, scraping_bee=None):
+    def search(self, query: str, helper=True, when=None, from_=None, to_=None, proxies=None, scraping_bee=None):
         """
         Return a list of all articles given a full-text search parameter,
         a country and a language
@@ -151,7 +259,8 @@ def search(self, query: str, helper = True, when = None, from_ = None, to_ = Non
         search_ceid = self.__ceid()
         search_ceid = search_ceid.replace('?', '&')
 
-        d = self.__parse_feed(self.BASE_URL + '/search?q={}'.format(query) + search_ceid, proxies = proxies, scraping_bee=scraping_bee)
+        d = self.__parse_feed(self.BASE_URL + '/search?q={}'.format(query) + search_ceid, proxies=proxies,
+                              scraping_bee=scraping_bee)
 
         d['entries'] = self.__add_sub_articles(d['entries'])
-        return d
\ No newline at end of file
+        return d

From b171f3bd42b66b82176e999ba9f163076c7afe70 Mon Sep 17 00:00:00 2001
From: luminos
Date: Sun, 6 Feb 2022 12:36:42 +0100
Subject: [PATCH 2/4] * Added support for fetching news for multiple
 geolocations

* Added support for fetching news for multiple topics
* Added the ability to sort news by publish date
* Added the ability to get only the news published within the last X hours
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7b10f5c..f0724a9 100644
--- a/README.md
+++ b/README.md
@@ -84,6 +84,7 @@ to collect all this informaion in one place.
 
 ## **Installation**
 
+
 ```shell script
 $ pip install pygooglenews --upgrade
 

From ccb56e61ffd1e348f32511f9776919f650fa35ba Mon Sep 17 00:00:00 2001
From: Matteo Sipione <39625454+Sipioteo@users.noreply.github.com>
Date: Sun, 6 Feb 2022 12:39:19 +0100
Subject: [PATCH 3/4] Update __init__.py

General enhancements for this repo. I needed these functions.

* Added support for fetching news for multiple geolocations
* Added support for fetching news for multiple topics
* Added the ability to sort news by publish date
* Added the ability to get only the news published within the last X hours
---
 pygooglenews/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pygooglenews/__init__.py b/pygooglenews/__init__.py
index 8f53221..85f40dd 100644
--- a/pygooglenews/__init__.py
+++ b/pygooglenews/__init__.py
@@ -1,5 +1,4 @@
 import time
-
 import feedparser
 from bs4 import BeautifulSoup
 import urllib

From 9a1081781fa8dffca58b7a14a9fb9153198d2685 Mon Sep 17 00:00:00 2001
From: luminos
Date: Sun, 6 Feb 2022 14:49:42 +0100
Subject: [PATCH 4/4] * Added support for fetching news for multiple
 geolocations

* Added support for fetching news for multiple topics
* Added the ability to sort news by publish date
* Added the ability to get only the news published within the last X hours
---
 README.md                |  1 -
 pygooglenews/__init__.py | 24 ++++++++++++------------
 pyproject.toml           |  2 +-
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index f0724a9..7b10f5c 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,6 @@ to collect all this informaion in one place.
 
 ## **Installation**
 
-
 ```shell script
 $ pip install pygooglenews --upgrade
 
diff --git a/pygooglenews/__init__.py b/pygooglenews/__init__.py
index 8f53221..aa8a2af 100644
--- a/pygooglenews/__init__.py
+++ b/pygooglenews/__init__.py
@@ -105,7 +105,7 @@ def top_news(self, proxies=None, scraping_bee=None):
         d['entries'] = self.__add_sub_articles(d['entries'])
         return d
 
-    def topic_headlines(self, topic: str, proxies=None, hour_span: int = None, sort_by_publish_date: bool = True,
+    def topic_headlines(self, topic: str, proxies=None, time_span: timedelta = None, sort_by_publish_date: bool = True,
                         scraping_bee=None):
         """Return a list of all articles from the topic page of Google News
         given a country and a language"""
@@ -120,9 +120,9 @@ def topic_headlines(self, topic: str, proxies=None, hour_span: int = None, sort_
             t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies,
                                   scraping_bee=scraping_bee)
         d['feed'] = t['feed']
-        if hour_span is not None:
+        if time_span is not None:
             d['entries'] += [ta for ta in t['entries'] if
-                             datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                             datetime.now() - time_span <= datetime.fromtimestamp(
                                 mktime(ta['published_parsed']))]
         else:
             d['entries'] += t['entries']
@@ -139,7 +139,7 @@ def topic_headlines(self, topic: str, proxies=None, hour_span: int = None, sort_
         else:
             raise Exception('unsupported topic')
 
-    def topic_multiple_headlines(self, topic_list: List[str], hour_span: int = None, sort_by_publish_date: bool = True,
+    def topic_multiple_headlines(self, topic_list: List[str], time_span: timedelta = None, sort_by_publish_date: bool = True,
                                  proxies=None, scraping_bee=None):
         """Return a list of all articles from multiple topic pages of Google News
         given a country and a language"""
@@ -156,9 +156,9 @@ def topic_multiple_headlines(self, topic_list: List[str], hour_span: int = None,
                     t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies,
                                           scraping_bee=scraping_bee)
                 d['feed'] = t['feed']
-                if hour_span is not None:
+                if time_span is not None:
                     d['entries'] += [ta for ta in t['entries'] if
-                                     datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                     datetime.now() - time_span <= datetime.fromtimestamp(
                                         mktime(ta['published_parsed']))]
                 else:
                     d['entries'] += t['entries']
@@ -176,7 +176,7 @@ def topic_multiple_headlines(self, topic_list: List[str], hour_span: int = None,
         else:
             raise Exception('unsupported topic')
 
-    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, hour_span: int = None,
+    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, time_span: timedelta = None,
                       sort_by_publish_date: bool = True):
         """Return a list of all articles about a specific geolocation
         given a country and a language"""
@@ -184,9 +184,9 @@ def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, hour_span: in
         t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(geo) + self.__ceid(),
                               proxies=proxies, scraping_bee=scraping_bee)
         d['feed'] = t['feed']
-        if hour_span is not None:
+        if time_span is not None:
             d['entries'] += [ta for ta in t['entries'] if
-                             datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                             datetime.now() - time_span <= datetime.fromtimestamp(
                                 mktime(ta['published_parsed']))]
         else:
             d['entries'] += t['entries']
@@ -198,7 +198,7 @@ def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, hour_span: in
                                   reverse=True)
         return d
 
-    def geo_multiple_headlines(self, geo: List[str], hour_span: int = None, sort_by_publish_date: bool = True,
+    def geo_multiple_headlines(self, geo: List[str], time_span: timedelta = None, sort_by_publish_date: bool = True,
                                proxies=None, scraping_bee=None):
         """Return a list of all articles about multiple geolocations
         given a country and a language"""
@@ -209,9 +209,9 @@ def geo_multiple_headlines(self, geo: List[str], hour_span: int = None, sort_by_
                 t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(n) + self.__ceid(),
                                       proxies=proxies, scraping_bee=scraping_bee)
                 d['feed'] = t['feed']
-                if hour_span is not None:
+                if time_span is not None:
                     d['entries'] += [ta for ta in t['entries'] if
-                                     datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                     datetime.now() - time_span <= datetime.fromtimestamp(
                                         mktime(ta['published_parsed']))]
                 else:
                     d['entries'] += t['entries']
diff --git a/pyproject.toml b/pyproject.toml
index 6cf4d6b..64868ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pygooglenews"
-version = "0.1.2"
+version = "0.1.3"
 description = "If Google News had a Python library"
 authors = ["kotartemiy "]
 license = "MIT"
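
A minimal usage sketch of the API this series ends up with, for reviewers. It assumes all four patches are applied in order (so the recency filter is the `datetime.timedelta` parameter `time_span` introduced in PATCH 4, not the earlier integer `hour_span`); the topic and location lists and the six-hour window are illustrative values, not part of the series:

```python
from datetime import timedelta

from pygooglenews import GoogleNews

gn = GoogleNews(lang='en', country='US')

# Headlines for several topics in one call, keeping only entries published
# within the last six hours and sorting them newest-first (both behaviors
# are added by this series).
recent = gn.topic_multiple_headlines(
    ['business', 'technology'],
    time_span=timedelta(hours=6),
    sort_by_publish_date=True,
)

# Headlines for several locations; geo_multiple_headlines also de-duplicates
# entries by title before returning the usual feedparser-style dict.
italy = gn.geo_multiple_headlines(['Rome', 'Milan', 'Turin'])

for entry in italy['entries'][:5]:
    print(entry['title'])
```

Note that moving from `hour_span: int` to `time_span: timedelta` in PATCH 4 lets callers express any window (minutes, days) without further API changes, at the cost of a slightly more verbose call site.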