From 62c95825097b0ce722e51992a7522b1c88d87f44 Mon Sep 17 00:00:00 2001
From: luminos
Date: Sun, 6 Feb 2022 12:35:56 +0100
Subject: [PATCH 1/4] * Added support for fetching news for multiple
 geolocations

* Added support for fetching news for multiple topics
* Added the ability to sort news by publish date
* Added the ability to get only the news published within the last X hours
---
 README.md                |  15 ++++
 pygooglenews/__init__.py | 151 +++++++++++++++++++++++++++++++++------
 2 files changed, 145 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 34d0c84..7b10f5c 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,13 @@ business = gn.topic_headlines('business')
 
 ```
 
+### **Stories by Multiple Topics**
+
+```python
+business = gn.topic_multiple_headlines(['business', 'world', 'nation'])
+
+```
+
 ### **Geolocation Specific Stories**
 
 ```python
@@ -120,6 +127,14 @@ headquaters = gn.geo_headlines('San Fran')
 
 ```
 
+
+### **Stories by Multiple Geolocations**
+
+```python
+headquarters = gn.geo_multiple_headlines(['Rome', 'Milan', 'Turin'])
+
+```
+
 ### **Stories by a Query Search**
 
 ```python
diff --git a/pygooglenews/__init__.py b/pygooglenews/__init__.py
index 244c927..8f53221 100644
--- a/pygooglenews/__init__.py
+++ b/pygooglenews/__init__.py
@@ -1,13 +1,20 @@
+import time
+
 import feedparser
 from bs4 import BeautifulSoup
 import urllib
 from dateparser import parse as parse_date
 import requests
+from datetime import datetime
+from datetime import timedelta
+from time import mktime
+import types
+from typing import List
 
 
 class GoogleNews:
-    def __init__(self, lang = 'en', country = 'US'):
+    def __init__(self, lang='en', country='US'):
         self.lang = lang.lower()
         self.country = country.upper()
         self.BASE_URL = 'https://news.google.com/rss'
 
@@ -32,7 +39,7 @@ def __top_news_parser(self, text):
 
     def __ceid(self):
         """Compile correct country-lang parameters for Google News RSS URL"""
-        return '?ceid={}:{}&hl={}&gl={}'.format(self.country,self.lang,self.lang,self.country)
+        return '?ceid={}:{}&hl={}&gl={}'.format(self.country, self.lang, self.lang, self.country)
 
     def __add_sub_articles(self, entries):
         for i, val in enumerate(entries):
@@ -54,26 +61,25 @@ def __scaping_bee_request(self, api_key, url):
         if response.status_code == 200:
             return response
         if response.status_code != 200:
-             raise Exception("ScrapingBee status_code: " + str(response.status_code) + " " + response.text)
+            raise Exception("ScrapingBee status_code: " + str(response.status_code) + " " + response.text)
 
-    def __parse_feed(self, feed_url, proxies=None, scraping_bee = None):
+    def __parse_feed(self, feed_url, proxies=None, scraping_bee=None):
 
         if scraping_bee and proxies:
            raise Exception("Pick either ScrapingBee or proxies. Not both!")
Not both!") if proxies: - r = requests.get(feed_url, proxies = proxies) + r = requests.get(feed_url, proxies=proxies) else: r = requests.get(feed_url) if scraping_bee: - r = self.__scaping_bee_request(url = feed_url, api_key = scraping_bee) + r = self.__scaping_bee_request(url=feed_url, api_key=scraping_bee) else: r = requests.get(feed_url) - if 'https://news.google.com/rss/unsupported' in r.url: - raise Exception('This feed is not available') + raise Exception('This feed is not available: ' + r.url) d = feedparser.parse(r.text) @@ -92,40 +98,142 @@ def __from_to_helper(self, validate=None): except: raise Exception('Could not parse your date') - - - def top_news(self, proxies=None, scraping_bee = None): + def top_news(self, proxies=None, scraping_bee=None): """Return a list of all articles from the main page of Google News given a country and a language""" d = self.__parse_feed(self.BASE_URL + self.__ceid(), proxies=proxies, scraping_bee=scraping_bee) d['entries'] = self.__add_sub_articles(d['entries']) return d - def topic_headlines(self, topic: str, proxies=None, scraping_bee=None): + def topic_headlines(self, topic: str, proxies=None, hour_span: int = None, sort_by_publish_date: bool = True, + scraping_bee=None): """Return a list of all articles from the topic page of Google News given a country and a language""" - #topic = topic.upper() - if topic.upper() in ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', 'HEALTH']: - d = self.__parse_feed(self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee) + # topic = topic.upper() + d = {'entries': []} + if topic.upper() in ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', + 'HEALTH']: + t = self.__parse_feed( + self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(), + proxies=proxies, scraping_bee=scraping_bee) + else: + t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies, + scraping_bee=scraping_bee) + d['feed'] = t['feed'] + if hour_span is not None: + d['entries'] += [ta for ta in t['entries'] if + datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp( + mktime(ta['published_parsed']))] + else: + d['entries'] += t['entries'] + d['entries'] = self.__add_sub_articles(d['entries']) + + if sort_by_publish_date: + d['entries'] = sorted(d['entries'], + key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(), + reverse=True) + + if len(d['entries']) > 0: + return d else: - d = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee) + raise Exception('unsupported topic') + + def topic_multiple_headlines(self, topic_list: List[str], hour_span: int = None, sort_by_publish_date: bool = True, + proxies=None, scraping_bee=None): + """Return a list of all articles from the list of topic page of Google News + given a country and a language""" + + d = {'entries': []} + for topic in topic_list: + try: + if topic.upper() in ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', + 'HEALTH']: + t = self.__parse_feed( + self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(), + proxies=proxies, scraping_bee=scraping_bee) + else: + t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies, + scraping_bee=scraping_bee) + d['feed'] = t['feed'] + if 
+                    d['entries'] += [ta for ta in t['entries'] if
+                                     datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                         mktime(ta['published_parsed']))]
+                else:
+                    d['entries'] += t['entries']
+            except Exception as e:
+                pass
         d['entries'] = self.__add_sub_articles(d['entries'])
 
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
+
         if len(d['entries']) > 0:
             return d
         else:
             raise Exception('unsupported topic')
 
-    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None):
+    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, hour_span: int = None,
+                      sort_by_publish_date: bool = True):
         """Return a list of all articles about a specific geolocation
         given a country and a language"""
-        d = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(geo) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee)
+        d = {'entries': []}
+        t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(geo) + self.__ceid(),
+                              proxies=proxies, scraping_bee=scraping_bee)
+        d['feed'] = t['feed']
+        if hour_span is not None:
+            d['entries'] += [ta for ta in t['entries'] if
+                             datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                 mktime(ta['published_parsed']))]
+        else:
+            d['entries'] += t['entries']
+
+        d['entries'] = self.__add_sub_articles(d['entries'])
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
+        return d
+
+    def geo_multiple_headlines(self, geo: List[str], hour_span: int = None, sort_by_publish_date: bool = True,
+                               proxies=None, scraping_bee=None):
+        """Return a list of all articles about multiple geolocations
+        given a country and a language"""
+
+        d = {'entries': []}
+        for n in geo:
+            try:
+                t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(n) + self.__ceid(),
+                                      proxies=proxies, scraping_bee=scraping_bee)
+                d['feed'] = t['feed']
+                if hour_span is not None:
+                    d['entries'] += [ta for ta in t['entries'] if
+                                     datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                         mktime(ta['published_parsed']))]
+                else:
+                    d['entries'] += t['entries']
+            except Exception as e:
+                pass
         d['entries'] = self.__add_sub_articles(d['entries'])
+
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
+
+        # Remove duplicate entries, keeping the first occurrence of each title
+        res = []
+        for da in d['entries']:
+            if da['title'] not in [re['title'] for re in res]:
+                res += [da]
+        d['entries'] = res
         return d
 
-    def search(self, query: str, helper = True, when = None, from_ = None, to_ = None, proxies=None, scraping_bee=None):
+    def search(self, query: str, helper=True, when=None, from_=None, to_=None, proxies=None, scraping_bee=None):
         """
         Return a list of all articles given a full-text search parameter,
         a country and a language
@@ -151,7 +259,8 @@ def search(self, query: str, helper = True, when = None, from_ = None, to_ = Non
         search_ceid = self.__ceid()
         search_ceid = search_ceid.replace('?', '&')
 
-        d = self.__parse_feed(self.BASE_URL + '/search?q={}'.format(query) + search_ceid, proxies = proxies, scraping_bee=scraping_bee)
+        d = self.__parse_feed(self.BASE_URL + '/search?q={}'.format(query) + search_ceid, proxies=proxies,
+                              scraping_bee=scraping_bee)
 
         d['entries'] = self.__add_sub_articles(d['entries'])
-        return d
\ No newline at end of file
+        return d

From b171f3bd42b66b82176e999ba9f163076c7afe70 Mon Sep 17 00:00:00 2001
From: luminos
Date: Sun, 6 Feb 2022 12:36:42 +0100
Subject: [PATCH 2/4] * Added support for fetching news for multiple
 geolocations

* Added support for fetching news for multiple topics
* Added the ability to sort news by publish date
* Added the ability to get only the news published within the last X hours
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7b10f5c..f0724a9 100644
--- a/README.md
+++ b/README.md
@@ -84,6 +84,7 @@ to collect all this informaion in one place.
 
 ## **Installation**
 
+
 ```shell script
 $ pip install pygooglenews --upgrade
 

From ccb56e61ffd1e348f32511f9776919f650fa35ba Mon Sep 17 00:00:00 2001
From: Matteo Sipione <39625454+Sipioteo@users.noreply.github.com>
Date: Sun, 6 Feb 2022 12:39:19 +0100
Subject: [PATCH 3/4] Update __init__.py

General enhancements for this repo. I needed these functions.

* Added support for fetching news for multiple geolocations
* Added support for fetching news for multiple topics
* Added the ability to sort news by publish date
* Added the ability to get only the news published within the last X hours
---
 pygooglenews/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pygooglenews/__init__.py b/pygooglenews/__init__.py
index 8f53221..85f40dd 100644
--- a/pygooglenews/__init__.py
+++ b/pygooglenews/__init__.py
@@ -1,5 +1,4 @@
 import time
-
 import feedparser
 from bs4 import BeautifulSoup
 import urllib

From 9a1081781fa8dffca58b7a14a9fb9153198d2685 Mon Sep 17 00:00:00 2001
From: luminos
Date: Sun, 6 Feb 2022 14:49:42 +0100
Subject: [PATCH 4/4] * Added support for fetching news for multiple
 geolocations

* Added support for fetching news for multiple topics
* Added the ability to sort news by publish date
* Added the ability to get only the news published within the last X hours
---
 README.md                |  1 -
 pygooglenews/__init__.py | 24 ++++++++++++------------
 pyproject.toml           |  2 +-
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index f0724a9..7b10f5c 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,6 @@ to collect all this informaion in one place.
 
 ## **Installation**
 
-
 ```shell script
 $ pip install pygooglenews --upgrade
 
diff --git a/pygooglenews/__init__.py b/pygooglenews/__init__.py
index 8f53221..aa8a2af 100644
--- a/pygooglenews/__init__.py
+++ b/pygooglenews/__init__.py
@@ -105,7 +105,7 @@ def top_news(self, proxies=None, scraping_bee=None):
         d['entries'] = self.__add_sub_articles(d['entries'])
         return d
 
-    def topic_headlines(self, topic: str, proxies=None, hour_span: int = None, sort_by_publish_date: bool = True,
+    def topic_headlines(self, topic: str, proxies=None, time_span: timedelta = None, sort_by_publish_date: bool = True,
                         scraping_bee=None):
         """Return a list of all articles from the topic page of Google News
         given a country and a language"""
@@ -120,9 +120,9 @@ def topic_headlines(self, topic: str, proxies=None, hour_span: int = None, sort_
             t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies,
                                   scraping_bee=scraping_bee)
         d['feed'] = t['feed']
-        if hour_span is not None:
+        if time_span is not None:
             d['entries'] += [ta for ta in t['entries'] if
-                             datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                             datetime.now() - time_span <= datetime.fromtimestamp(
                                 mktime(ta['published_parsed']))]
         else:
             d['entries'] += t['entries']
@@ -139,7 +139,7 @@ def topic_headlines(self, topic: str, proxies=None, hour_span: int = None, sort_
         else:
             raise Exception('unsupported topic')
 
-    def topic_multiple_headlines(self, topic_list: List[str], hour_span: int = None, sort_by_publish_date: bool = True,
+    def topic_multiple_headlines(self, topic_list: List[str], time_span: timedelta = None, sort_by_publish_date: bool = True,
                                  proxies=None, scraping_bee=None):
         """Return a list of all articles from multiple topic pages of Google News
         given a country and a language"""
@@ -156,9 +156,9 @@ def topic_multiple_headlines(self, topic_list: List[str], hour_span: int = None,
                     t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies,
                                           scraping_bee=scraping_bee)
                 d['feed'] = t['feed']
-                if hour_span is not None:
+                if time_span is not None:
                     d['entries'] += [ta for ta in t['entries'] if
-                                     datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                     datetime.now() - time_span <= datetime.fromtimestamp(
                                         mktime(ta['published_parsed']))]
                 else:
                     d['entries'] += t['entries']
@@ -176,7 +176,7 @@ def topic_multiple_headlines(self, topic_list: List[str], hour_span: int = None,
         else:
             raise Exception('unsupported topic')
 
-    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, hour_span: int = None,
+    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, time_span: timedelta = None,
                       sort_by_publish_date: bool = True):
         """Return a list of all articles about a specific geolocation
         given a country and a language"""
@@ -184,9 +184,9 @@ def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, hour_span: in
         t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(geo) + self.__ceid(),
                               proxies=proxies, scraping_bee=scraping_bee)
         d['feed'] = t['feed']
-        if hour_span is not None:
+        if time_span is not None:
             d['entries'] += [ta for ta in t['entries'] if
-                             datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                             datetime.now() - time_span <= datetime.fromtimestamp(
                                 mktime(ta['published_parsed']))]
         else:
             d['entries'] += t['entries']
@@ -198,7 +198,7 @@ def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, hour_span: in
                                   reverse=True)
         return d
 
-    def geo_multiple_headlines(self, geo: List[str], hour_span: int = None, sort_by_publish_date: bool = True,
+    def geo_multiple_headlines(self, geo: List[str], time_span: timedelta = None, sort_by_publish_date: bool = True,
                                proxies=None, scraping_bee=None):
         """Return a list of all articles about multiple geolocations
         given a country and a language"""
@@ -209,9 +209,9 @@ def geo_multiple_headlines(self, geo: List[str], hour_span: int = None, sort_by_
                 t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(n) + self.__ceid(),
                                       proxies=proxies, scraping_bee=scraping_bee)
                 d['feed'] = t['feed']
-                if hour_span is not None:
+                if time_span is not None:
                     d['entries'] += [ta for ta in t['entries'] if
-                                     datetime.now() - timedelta(hours=hour_span) <= datetime.fromtimestamp(
+                                     datetime.now() - time_span <= datetime.fromtimestamp(
                                         mktime(ta['published_parsed']))]
                 else:
                     d['entries'] += t['entries']
diff --git a/pyproject.toml b/pyproject.toml
index 6cf4d6b..64868ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pygooglenews"
-version = "0.1.2"
+version = "0.1.3"
 description = "If Google News had a Python library"
 authors = ["kotartemiy "]
 license = "MIT"
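
A minimal usage sketch of the API this series ends up with, for reviewers. It assumes all four patches are applied in order (so the recency filter is the `datetime.timedelta` parameter `time_span` introduced in PATCH 4, not the earlier integer `hour_span`); the topic and location lists and the six-hour window are illustrative values, not part of the series:

```python
from datetime import timedelta

from pygooglenews import GoogleNews

gn = GoogleNews(lang='en', country='US')

# Headlines for several topics in one call, keeping only entries published
# within the last six hours and sorting them newest-first (both behaviors
# are added by this series).
recent = gn.topic_multiple_headlines(
    ['business', 'technology'],
    time_span=timedelta(hours=6),
    sort_by_publish_date=True,
)

# Headlines for several locations; geo_multiple_headlines also de-duplicates
# entries by title before returning the usual feedparser-style dict.
italy = gn.geo_multiple_headlines(['Rome', 'Milan', 'Turin'])

for entry in italy['entries'][:5]:
    print(entry['title'])
```

Note that moving from `hour_span: int` to `time_span: timedelta` in PATCH 4 lets callers express any window (minutes, days) without further API changes, at the cost of a slightly more verbose call site.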