From e48e3defd28c5647f57f95a792c90861160c0ac9 Mon Sep 17 00:00:00 2001
From: Paul Leclercq
Date: Mon, 27 Nov 2023 16:28:57 +0100
Subject: [PATCH] refactor: description parsing

---
 docker-compose.yml                          |  4 ++--
 quotaclimat/data_ingestion/scrap_sitemap.py | 23 +++++++++++-----
 test/sitemap/test_scrap_html.py             | 30 ++++++++++++++++-----
 test/sitemap/test_scrap_sitemap.py          | 18 ++++---------
 test/sitemap/utils.py                       | 14 ++++++++++
 5 files changed, 60 insertions(+), 29 deletions(-)
 create mode 100644 test/sitemap/utils.py

diff --git a/docker-compose.yml b/docker-compose.yml
index 6998157e7..cd34dd61f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -48,8 +48,8 @@ services:
     #entrypoint: ["sleep", "1200"] # use to debug the container if needed
     entrypoint: ["python", "quotaclimat/data_ingestion/ingest_db/ingest_sitemap_in_db.py"]
     environment:
-      ENV: docker
-      LOGLEVEL: debug # Change me to info (warning, error) to have less log
+      ENV: docker # change me to prod for real cases
+      LOGLEVEL: INFO # change me to debug for more logs (debug, info, warning, error)
       PYTHONPATH: /app
       POSTGRES_USER: user
       POSTGRES_DB: barometre
diff --git a/quotaclimat/data_ingestion/scrap_sitemap.py b/quotaclimat/data_ingestion/scrap_sitemap.py
index d77e07de4..e9e844bf2 100644
--- a/quotaclimat/data_ingestion/scrap_sitemap.py
+++ b/quotaclimat/data_ingestion/scrap_sitemap.py
@@ -10,7 +10,7 @@
 from quotaclimat.data_ingestion.config_sitemap import (SITEMAP_CONFIG, SITEMAP_TEST_CONFIG,
                                                        SITEMAP_DOCKER_CONFIG, MEDIA_CONFIG)
 from postgres.schemas.models import get_sitemap_cols
-from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news
+from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news, agent
 
 import asyncio
 import hashlib
@@ -180,8 +180,9 @@ async def query_one_sitemap_and_transform(media: str, sitemap_conf: Dict, df_fro
     """
     try:
         logging.info("\n\nParsing media %s with %s" % (media, sitemap_conf["sitemap_url"]))
+        logging.info(f"User-agent: { agent['User-Agent'] }")
         #@see https://advertools.readthedocs.io/en/master/advertools.sitemaps.html#news-sitemaps
-
+        adv.sitemaps.headers['User-Agent'] = agent["User-Agent"]
         temp_df = adv.sitemap_to_df(sitemap_conf["sitemap_url"])
 
         temp_df.rename(columns={"loc": "url"}, inplace=True)
@@ -216,14 +217,22 @@ async def query_one_sitemap_and_transform(media: str, sitemap_conf: Dict, df_fro
         #keep only unknown id to not parse every website for new_description
         difference_df = get_diff_from_df(df, df_from_pg)
 
-        # concurrency : https://stackoverflow.com/a/67944888/3535853
-        # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
-        difference_df['news_description'] = await asyncio.gather(*(add_news_meta(row["url"], media, row["news_title"]) for (_, row) in difference_df.iterrows()))
-
+        difference_df['news_description'] = await get_description_article(media, difference_df)
+
         return difference_df
     except Exception as err:
         logging.error(
             "Sitemap query error for %s: %s : %s"
             % (media, sitemap_conf["sitemap_url"], err)
         )
-        return None
\ No newline at end of file
+        return None
+
+# concurrency : https://stackoverflow.com/a/67944888/3535853
+# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+async def get_description_article(media, article_df):
+    article_tasks = []
+    for (_, row) in article_df.iterrows():
+        description = add_news_meta(row["url"], media, row["news_title"])
+        article_tasks.append(description)
+
+    return await asyncio.gather(*article_tasks)
\ No newline at end of file
diff --git a/test/sitemap/test_scrap_html.py b/test/sitemap/test_scrap_html.py
index a60fd3c00..ea47cf394 100644
--- a/test/sitemap/test_scrap_html.py
+++ b/test/sitemap/test_scrap_html.py
@@ -1,14 +1,30 @@
-import logging
 import pytest
-import os
+import pandas as pd
 from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news, get_hat_20minutes, get_url_content
+from quotaclimat.data_ingestion.scrap_sitemap import get_description_article
 from bs4 import BeautifulSoup
+from utils import get_localhost, debug_df
 
-localhost = ""
-if(os.environ.get("ENV") == "docker"):
-    localhost ="http://nginxtest:80"
-else:
-    localhost = "http://localhost:8000"
+localhost = get_localhost()
+
+@pytest.mark.asyncio
+async def test_get_description_article():
+    url_to_parse = f"{localhost}/mediapart_website.html"
+    media = "Le Figaro"
+    df_articles = pd.DataFrame([{
+        "url": url_to_parse,
+        "news_title": media,
+    }])
+
+    expected_result = pd.DataFrame([{
+        "url": url_to_parse,
+        "news_title": media,
+        "news_description": "description could be parsed with success"
+    }])
+
+    df_articles["news_description"] = await get_description_article(media, df_articles)
+    debug_df(df_articles)
+    pd.testing.assert_frame_equal(df_articles.reset_index(drop=True), expected_result.reset_index(drop=True))
 
 @pytest.mark.asyncio
 async def test_get_meta_news():
diff --git a/test/sitemap/test_scrap_sitemap.py b/test/sitemap/test_scrap_sitemap.py
index 329aca92c..93c0e2746 100644
--- a/test/sitemap/test_scrap_sitemap.py
+++ b/test/sitemap/test_scrap_sitemap.py
@@ -1,20 +1,12 @@
-import logging
-
-import numpy as np
 import pandas as pd
 import pytest
-import os
 from quotaclimat.data_ingestion.scrap_sitemap import (filter_on_date, find_sections, get_consistent_hash,
                                                       get_diff_from_df, query_one_sitemap_and_transform, get_sections_from_url, normalize_section)
 from quotaclimat.data_ingestion.config_sitemap import (SITEMAP_CONFIG)
 from datetime import datetime, timedelta
-
+from utils import get_localhost
 from quotaclimat.data_ingestion.ingest_db.ingest_sitemap_in_db import get_sitemap_list
 
-url_to_parse = ""
-if(os.environ.get("ENV") == "docker"):
-    url_to_parse ="http://nginxtest:80/"
-else:
-    url_to_parse = "http://localhost:8000/"
+url_to_parse = get_localhost()
 
 def test_normalize_section():
     assert normalize_section(["test", "pizza"]) == ["test", "pizza"]
@@ -32,7 +24,7 @@ def test_get_sitemap_list():
     sitemap = list(get_sitemap_list())[0] # locally we test only a few items
     sitemap_url = sitemap
-    sitemap_url == "http://nginxtest:80/sitemap_news_figaro_3.xml"
+    assert sitemap_url == f"{url_to_parse}/sitemap_news_figaro_3.xml"
 
 
 @pytest.mark.asyncio
 async def test_query_one_sitemap_and_transform():
@@ -48,7 +40,7 @@ async def test_query_one_sitemap_and_transform():
     output = await query_one_sitemap_and_transform(media, sitemap_config[media], pg_df)
     title = "EN DIRECT - Conflit Hamas-Israël : l’armée israélienne dit avoir frappé Gaza avec 4000 tonnes d’explosifs depuis samedi"
     expected_result = pd.DataFrame([{
-        "url" : f"{url_to_parse}mediapart_website.html",
+        "url" : f"{url_to_parse}/mediapart_website.html",
         "lastmod" :pd.Timestamp("2023-10-12 15:34:28"),
         "publication_name" :"Le Figaro",
         "publication_language" :"fr",
@@ -96,7 +88,7 @@ async def test_query_one_sitemap_and_transform_hat_parsing():
     title = "Grève du 13 octobre : SNCF, RATP, aérien, médecins… Retrouvez le détail des perturbations à prévoir"
     publication_name = "Le Figaro"
     expected_result = pd.DataFrame([{
-        "url" : f"{url_to_parse}20minutes_website.html",
+        "url" : f"{url_to_parse}/20minutes_website.html",
         "lastmod" :pd.Timestamp("2023-10-12 15:34:21"),
         "publication_name" :"Le Figaro",
         "publication_language" :"fr",
diff --git a/test/sitemap/utils.py b/test/sitemap/utils.py
new file mode 100644
index 000000000..401b4146e
--- /dev/null
+++ b/test/sitemap/utils.py
@@ -0,0 +1,14 @@
+import logging
+import os
+
+def get_localhost():
+    localhost = ""
+    if os.environ.get("ENV") == "docker":
+        localhost = "http://nginxtest:80"
+    else:
+        localhost = "http://localhost:8000"
+    return localhost
+
+def debug_df(df):
+    logging.warning("--------------------DEBUG DF-------------------")
+    logging.warning(df.head(1).to_string())
\ No newline at end of file
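
A note on the extracted helper: get_description_article builds one add_news_meta
coroutine per row and awaits them all through a single asyncio.gather, so every
URL in the sitemap diff is fetched concurrently with no upper bound. If a large
sitemap ever needs throttling, each coroutine can be wrapped in a semaphore. A
minimal sketch, assuming the same coroutine-per-row fan-out; gather_with_limit
and its default limit are illustrative names, not part of this patch:

    import asyncio

    async def gather_with_limit(coros, limit=10):
        # At most `limit` coroutines are awaited at any one time.
        semaphore = asyncio.Semaphore(limit)

        async def bounded(coro):
            async with semaphore:
                return await coro

        return await asyncio.gather(*(bounded(c) for c in coros))

The last line of get_description_article would then become
return await gather_with_limit(article_tasks) instead of awaiting
asyncio.gather directly; coroutine objects are lazy, so building them all
up front stays cheap.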
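Error propagation is also worth noting: asyncio.gather surfaces the first
add_news_meta failure, which lands in the except block of
query_one_sitemap_and_transform, so one bad article makes the whole media
batch return None. If partial results are preferable, gather can return the
exceptions as values instead. A minimal sketch, assuming it lives in
scrap_sitemap.py next to add_news_meta; the function name and the
empty-string fallback are illustrative choices, not part of this patch:

    import asyncio

    async def get_description_article_tolerant(media, article_df):
        # return_exceptions=True yields exception objects in place of
        # results, so one failed fetch no longer discards the whole batch.
        results = await asyncio.gather(
            *(add_news_meta(row["url"], media, row["news_title"])
              for _, row in article_df.iterrows()),
            return_exceptions=True,
        )
        return ["" if isinstance(r, BaseException) else r for r in results]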