refactor: parsing description
polomarcus committed Nov 27, 2023
1 parent cd84292 commit e48e3de
Showing 5 changed files with 60 additions and 29 deletions.
4 changes: 2 additions & 2 deletions docker-compose.yml
@@ -48,8 +48,8 @@ services:
     #entrypoint: ["sleep", "1200"] # use to debug the container if needed
     entrypoint: ["python", "quotaclimat/data_ingestion/ingest_db/ingest_sitemap_in_db.py"]
     environment:
-      ENV: docker
-      LOGLEVEL: debug # Change me to info (warning, error) to have less log
+      ENV: docker # change me to prod for real cases
+      LOGLEVEL: INFO # change me (debug, info, warning, error) to adjust log verbosity
       PYTHONPATH: /app
       POSTGRES_USER: user
       POSTGRES_DB: barometre
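Note: a LOGLEVEL value such as INFO or debug only takes effect if the ingestion entrypoint maps it onto Python's logging module. A minimal sketch of that wiring, as an assumption for illustration rather than code from this repository:

    import logging
    import os

    # Read LOGLEVEL from the environment and apply it, defaulting to INFO.
    loglevel = os.environ.get("LOGLEVEL", "INFO").upper()
    logging.basicConfig(level=getattr(logging, loglevel, logging.INFO))
    logging.getLogger(__name__).info("log level set to %s", loglevel)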
23 changes: 16 additions & 7 deletions quotaclimat/data_ingestion/scrap_sitemap.py
@@ -10,7 +10,7 @@

 from quotaclimat.data_ingestion.config_sitemap import (SITEMAP_CONFIG, SITEMAP_TEST_CONFIG, SITEMAP_DOCKER_CONFIG, MEDIA_CONFIG)
 from postgres.schemas.models import get_sitemap_cols
-from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news
+from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news, agent
 import asyncio
 import hashlib

@@ -180,8 +180,9 @@ async def query_one_sitemap_and_transform(media: str, sitemap_conf: Dict, df_fro
"""
try:
logging.info("\n\nParsing media %s with %s" % (media, sitemap_conf["sitemap_url"]))
logging.info(f"User-agent: { agent['User-Agent'] }")
#@see https://advertools.readthedocs.io/en/master/advertools.sitemaps.html#news-sitemaps

adv.sitemaps.headers['User-Agent'] = agent["User-Agent"]
temp_df = adv.sitemap_to_df(sitemap_conf["sitemap_url"])

temp_df.rename(columns={"loc": "url"}, inplace=True)
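Note: the shared agent imported above is used as a plain request-headers mapping, both for logging and for overriding the headers advertools sends. A hypothetical shape for illustration only; the real value lives in scrap_description_article.py:

    # Hypothetical example; the actual User-Agent string is defined in
    # quotaclimat/data_ingestion/scrap_html/scrap_description_article.py.
    agent = {"User-Agent": "Mozilla/5.0 (compatible; quotaclimat-crawler)"}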
@@ -216,14 +217,22 @@ async def query_one_sitemap_and_transform(media: str, sitemap_conf: Dict, df_fro
         #keep only unknown id to not parse every website for new_description
         difference_df = get_diff_from_df(df, df_from_pg)

-        # concurrency : https://stackoverflow.com/a/67944888/3535853
-        # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
-        difference_df['news_description'] = await asyncio.gather(*(add_news_meta(row["url"], media, row["news_title"]) for (_, row) in difference_df.iterrows()))
+        difference_df['news_description'] = await get_description_article(media, difference_df)

         return difference_df
     except Exception as err:
         logging.error(
             "Sitemap query error for %s: %s : %s"
             % (media, sitemap_conf["sitemap_url"], err)
         )
-            return None
+        return None
+
+# concurrency : https://stackoverflow.com/a/67944888/3535853
+# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+async def get_description_article(media, article_df):
+    article_tasks = []
+    for (_, row) in article_df.iterrows():
+        description = add_news_meta(row["url"], media, row["news_title"])
+        article_tasks.append(description)
+
+    return await asyncio.gather(*article_tasks)
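Note: the refactor swaps the inline generator for a helper that first collects one coroutine per row and then awaits them together. asyncio.gather preserves argument order, so the resulting list lines up with the DataFrame's rows and can be assigned directly to a column. A self-contained sketch of the same pattern, where fetch_description is a stand-in for the repo's add_news_meta:

    import asyncio
    import pandas as pd

    async def fetch_description(url: str) -> str:
        # Stand-in for an HTTP call such as add_news_meta(url, media, title).
        await asyncio.sleep(0.1)
        return f"description for {url}"

    async def main():
        df = pd.DataFrame({"url": ["https://a.example", "https://b.example"]})
        # Build the coroutines first, then run them concurrently; gather returns
        # results in the same order the tasks were passed in.
        tasks = [fetch_description(row["url"]) for _, row in df.iterrows()]
        df["news_description"] = await asyncio.gather(*tasks)
        print(df)

    asyncio.run(main())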
30 changes: 23 additions & 7 deletions test/sitemap/test_scrap_html.py
@@ -1,14 +1,30 @@
 import logging
 import pytest
-import os
+import pandas as pd
 from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news, get_hat_20minutes, get_url_content
+from quotaclimat.data_ingestion.scrap_sitemap import get_description_article
 from bs4 import BeautifulSoup
+from utils import get_localhost, debug_df

-localhost = ""
-if(os.environ.get("ENV") == "docker"):
-    localhost = "http://nginxtest:80"
-else:
-    localhost = "http://localhost:8000"
+localhost = get_localhost()

+@pytest.mark.asyncio
+async def test_get_description_article():
+    url_to_parse = f"{localhost}/mediapart_website.html"
+    media = "Le Figaro"
+    df_articles = pd.DataFrame([{
+        "url": url_to_parse,
+        "news_title": media,
+    }])
+
+    expected_result = pd.DataFrame([{
+        "url": url_to_parse,
+        "news_title": media,
+        "news_description": "description could be parsed with success"
+    }])
+
+    df_articles["news_description"] = await get_description_article(media, df_articles)
+    debug_df(df_articles)
+    pd.testing.assert_frame_equal(df_articles.reset_index(drop=True), expected_result.reset_index(drop=True))

 @pytest.mark.asyncio
 async def test_get_meta_news():
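Note: these coroutine tests only run if something provides the asyncio marker, presumably the pytest-asyncio plugin, although this page does not show the project's pytest configuration. The minimal shape such a test takes:

    import asyncio
    import pytest

    @pytest.mark.asyncio  # marker provided by pytest-asyncio; bare pytest cannot await async tests
    async def test_minimal():
        await asyncio.sleep(0)  # any awaitable; the real tests await get_description_article(...)
        assert True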
18 changes: 5 additions & 13 deletions test/sitemap/test_scrap_sitemap.py
@@ -1,20 +1,12 @@
 import logging

-import numpy as np
 import pandas as pd
 import pytest
-import os
 from quotaclimat.data_ingestion.scrap_sitemap import (filter_on_date, find_sections, get_consistent_hash, get_diff_from_df, query_one_sitemap_and_transform, get_sections_from_url, normalize_section)
 from quotaclimat.data_ingestion.config_sitemap import (SITEMAP_CONFIG)
 from datetime import datetime, timedelta

+from utils import get_localhost
 from quotaclimat.data_ingestion.ingest_db.ingest_sitemap_in_db import get_sitemap_list

-url_to_parse = ""
-if(os.environ.get("ENV") == "docker"):
-    url_to_parse = "http://nginxtest:80/"
-else:
-    url_to_parse = "http://localhost:8000/"
+url_to_parse = get_localhost()

def test_normalize_section():
assert normalize_section(["test", "pizza"]) == ["test", "pizza"]
@@ -32,7 +24,7 @@ def test_get_sitemap_list():
     sitemap = list(get_sitemap_list())[0]
     # locally we test only a few items
     sitemap_url = sitemap
-    sitemap_url == "http://nginxtest:80/sitemap_news_figaro_3.xml"
+    assert sitemap_url == f"{url_to_parse}/sitemap_news_figaro_3.xml"

@pytest.mark.asyncio
async def test_query_one_sitemap_and_transform():
@@ -48,7 +40,7 @@ async def test_query_one_sitemap_and_transform():
     output = await query_one_sitemap_and_transform(media, sitemap_config[media], pg_df)
     title = "EN DIRECT - Conflit Hamas-Israël : l’armée israélienne dit avoir frappé Gaza avec 4000 tonnes d’explosifs depuis samedi"
     expected_result = pd.DataFrame([{
-        "url": f"{url_to_parse}mediapart_website.html",
+        "url": f"{url_to_parse}/mediapart_website.html",
         "lastmod": pd.Timestamp("2023-10-12 15:34:28"),
         "publication_name": "Le Figaro",
         "publication_language": "fr",
@@ -96,7 +88,7 @@ async def test_query_one_sitemap_and_transform_hat_parsing():
title = "Grève du 13 octobre : SNCF, RATP, aérien, médecins… Retrouvez le détail des perturbations à prévoir"
publication_name = "Le Figaro"
expected_result = pd.DataFrame([{
"url" : f"{url_to_parse}20minutes_website.html",
"url" : f"{url_to_parse}/20minutes_website.html",
"lastmod" :pd.Timestamp("2023-10-12 15:34:21"),
"publication_name" :"Le Figaro",
"publication_language" :"fr",
14 changes: 14 additions & 0 deletions test/sitemap/utils.py
@@ -0,0 +1,14 @@
+import logging
+import os
+
+def get_localhost():
+    localhost = ""
+    if os.environ.get("ENV") == "docker":
+        localhost = "http://nginxtest:80"
+    else:
+        localhost = "http://localhost:8000"
+    return localhost
+
+def debug_df(df):
+    logging.warning("--------------------DEBUG DF-------------------")
+    logging.warning(df.head(1).to_string())
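Note: this new helper centralizes the docker-vs-local base URL that the two test modules previously duplicated. Usage mirrors the tests above, assuming utils.py sits importable beside them in test/sitemap:

    import pandas as pd
    from utils import get_localhost, debug_df

    localhost = get_localhost()  # nginxtest:80 under ENV=docker, localhost:8000 otherwise
    df = pd.DataFrame([{"url": f"{localhost}/mediapart_website.html"}])
    debug_df(df)  # logs the first row at WARNING level for quick inspection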
