feat: number_of_keywords climate/biodiv/ressources - 20 secondes window (#236)

* feat: number_of_keywords climate/biodiv/ressources - 20 secondes window * doc: alembic * db: alembic
dataforgoodfr · Sep 13, 2024 · 4e0dfa0 · 4e0dfa0 · github-actions · Sep 13, 2024
1 parent 79ce925
commit 4e0dfa0
Show file tree

Hide file tree

Showing 10 changed files with 180 additions and 111 deletions.
diff --git a/README.md b/README.md
@@ -314,10 +314,6 @@ We can adjust batch update with these env variables (as in the docker-compose.ym
 BATCH_SIZE: 50000 # number of records to update in one batch
 ```
 
-### Comparison between 15/20/30/40 window
-Set `COMPARE_DURATION` to true such as in the docker-compose.yml to calculate number_of_keywords_20/30/40 in addition of 15.
-The goal is to compare different durations to select one, it should be desactivated to have more effective program.
-
 ### Batch program data
 `UPDATE_PROGRAM_ONLY` to true will only update program metadata, otherwise, it will update program metadata and all theme/keywords calculations.
 
@@ -336,12 +332,15 @@ We can use [a Github actions to start multiple update operations with different
 Using [Alembic](https://alembic.sqlalchemy.org/en/latest/autogenerate.html) auto-generated migrations, we can add a new column inside `models.py` and Alembic will automatically generate the schema evolution:
 
 ```
-# If changes have already been applied and you want to recreate your alembic file:
-# 1. change to you main branch
+# If changes have already been applied (on your feature branch) and you have to recreate your alembic file, do the following:
+# 1. change to your main branch 
+git switch main
 # 2. start test container and run "pytest -vv -k api" to rebuild the state of the DB (or drop the table you want)
 # 3. switch back to your WIP branch
+git switch -
 # 4. connect to the test container : docker compose up test -d / docker compose exec test bash
-# 5. reapply the latest saved state : poetry run alembic upgrade head
+# 5. reapply the latest saved state : 
+poetry run alembic stamp head
 # 6. Save the new columns
 poetry run alembic revision --autogenerate -m "Add new column test for table keywords"
 # this should generate a file to commit inside "alembic/versions"

diff --git a/alembic/versions/a5c39db3c8e9_add_new_column_test_for_table_keywords.py b/alembic/versions/a5c39db3c8e9_add_new_column_test_for_table_keywords.py
@@ -0,0 +1,30 @@
+"""Add new column test for table keywords
+
+Revision ID: a5c39db3c8e9
+Revises: 5ccd746ee292
+Create Date: 2024-09-12 14:10:26.305593
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'a5c39db3c8e9'
+down_revision: Union[str, None] = '5ccd746ee292'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
diff --git a/alembic/versions/af956a85658f_add_new_column_number_of_keywords_.py b/alembic/versions/af956a85658f_add_new_column_number_of_keywords_.py
@@ -0,0 +1,34 @@
+"""Add new column number_of_keywords climat/biod/r
+
+Revision ID: af956a85658f
+Revises: a5c39db3c8e9
+Create Date: 2024-09-12 14:15:12.049367
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'af956a85658f'
+down_revision: Union[str, None] = 'a5c39db3c8e9'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('keywords', sa.Column('number_of_keywords_climat', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_keywords_biodiversite', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_keywords_ressources', sa.Integer(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('keywords', 'number_of_keywords_ressources')
+    op.drop_column('keywords', 'number_of_keywords_biodiversite')
+    op.drop_column('keywords', 'number_of_keywords_climat')
+    # ### end Alembic commands ###
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,4 +1,3 @@
-version: '3.8'
 services:
   test:
     build:
@@ -41,7 +40,7 @@ services:
     environment:
       ENV: docker
       # CHANNEL: "fr3-idf"
-      LOGLEVEL: INFO
+      LOGLEVEL: DEBUG
       PYTHONPATH: /app
       POSTGRES_USER: user
       POSTGRES_DB: barometre

diff --git a/postgres/schemas/models.py b/postgres/schemas/models.py
@@ -81,9 +81,12 @@ class Keywords(Base):
     number_of_biodiversite_causes_directes= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_causes_directes integer;
     number_of_biodiversite_consequences= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_consequences integer;
     number_of_biodiversite_solutions_directes= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_solutions_directes integer;
-    number_of_keywords_20 = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords_20 integer;
-    number_of_keywords_30 = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords_30 integer;
-    number_of_keywords_40 = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords_40 integer;
+    number_of_keywords_20 = Column(Integer) # NOT USED ANYMORE -- ALTER TABLE keywords ADD number_of_keywords_20 integer;
+    number_of_keywords_30 = Column(Integer) # NOT USED ANYMORE -- ALTER TABLE keywords ADD number_of_keywords_30 integer;
+    number_of_keywords_40 = Column(Integer) # NOT USED ANYMORE -- ALTER TABLE keywords ADD number_of_keywords_40 integer;
+    number_of_keywords_climat = Column(Integer) # sum of all climatique counters without duplicate (like number_of_keywords)
+    number_of_keywords_biodiversite = Column(Integer) # sum of all biodiversite counters without duplicate
+    number_of_keywords_ressources = Column(Integer) # sum of all ressources counters without duplicate
 
 class Channel_Metadata(Base):
     __tablename__ = channel_metadata_table

diff --git a/quotaclimat/data_processing/mediatree/detect_keywords.py b/quotaclimat/data_processing/mediatree/detect_keywords.py
@@ -117,6 +117,7 @@ def remove_stopwords(plaintext: str) -> str:
 def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
     keywords_with_timestamp = []
     number_of_elements_in_array = 17
+    default_window_in_seconds = 20
     plaitext_without_stopwords = remove_stopwords(plaintext)
     logging.debug(f"display datetime start {start}")
 
@@ -137,40 +138,46 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
             keywords_with_timestamp.extend(keywords_to_add)
 
     if len(keywords_with_timestamp) > 0:
-        # count false positive near of 15" of positive keywords
-        keywords_with_timestamp_15 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=15)
-        filtered_keywords_with_timestamp = filter_indirect_words(keywords_with_timestamp_15)
+        # count false positives within default_window_in_seconds of positive keywords
+        keywords_with_timestamp_default = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=default_window_in_seconds)
+        filtered_keywords_with_timestamp = filter_indirect_words(keywords_with_timestamp_default)
 
-        theme= get_themes(keywords_with_timestamp_15)
-        keywords_with_timestamp= clean_metadata(keywords_with_timestamp_15)
+        theme= get_themes(keywords_with_timestamp_default)
+        keywords_with_timestamp= clean_metadata(keywords_with_timestamp_default)
         number_of_keywords= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start)
+
+        themes_climat = ["changement_climatique_constat",
+                        "changement_climatique_causes",
+                        "changement_climatique_consequences",
+                        "attenuation_climatique_solutions",
+                        "adaptation_climatique_solutions"
+        ]
+        number_of_keywords_climat= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start, theme=themes_climat)
+        themes_biodiversite = [
+            "biodiversite_concepts_generaux",
+            "biodiversite_causes",
+            "biodiversite_consequences",
+            "biodiversite_solutions",
+        ]
+        number_of_keywords_biodiversite= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start, themes_biodiversite)
+
+        themes_ressources = ["ressources",
+                "ressources_solutions",
+        ]
+        number_of_keywords_ressources= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start, themes_ressources)
+
+        number_of_changement_climatique_constat = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_constat"])
+        number_of_changement_climatique_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_causes"])
+        number_of_changement_climatique_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_consequences"])
+        number_of_attenuation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["attenuation_climatique_solutions"])
+        number_of_adaptation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["adaptation_climatique_solutions"])
+        number_of_ressources = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources"])
+        number_of_ressources_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources_solutions"])
+        number_of_biodiversite_concepts_generaux = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_concepts_generaux"])
+        number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"])
+        number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"])
+        number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"])
 
-        number_of_changement_climatique_constat = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_constat")
-        number_of_changement_climatique_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_causes")
-        number_of_changement_climatique_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_consequences")
-        number_of_attenuation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="attenuation_climatique_solutions")
-        number_of_adaptation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="adaptation_climatique_solutions")
-        number_of_ressources = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources")
-        number_of_ressources_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources_solutions")
-        number_of_biodiversite_concepts_generaux = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_concepts_generaux")
-        number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_causes")
-        number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_consequences")
-        number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_solutions")
-
-        if(os.environ.get("COMPARE_DURATION") == "true"):
-            logging.debug(f"Comparaison between 15/20/30/40 is activated")
-            keywords_with_timestamp_20 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=20)
-            keywords_with_timestamp_30 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=30)
-            keywords_with_timestamp_40 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=40)
-            number_of_keywords_20= count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_20), start)
-            number_of_keywords_30= count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_30), start)
-            number_of_keywords_40= count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_40), start)
-        else:
-            logging.debug(f"No comparaison between 15/20/30/40 is activated")
-            number_of_keywords_20=None
-            number_of_keywords_30=None
-            number_of_keywords_40=None
-        # TODO refacto this return array and else
         return [
             theme
             ,keywords_with_timestamp 
@@ -186,10 +193,9 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
             ,number_of_biodiversite_causes
             ,number_of_biodiversite_consequences
             ,number_of_biodiversite_solutions
-            # number_of_keywords with special duration to compare duration
-            ,number_of_keywords_20
-            ,number_of_keywords_30
-            ,number_of_keywords_40
+            ,number_of_keywords_climat
+            ,number_of_keywords_biodiversite
+            ,number_of_keywords_ressources
         ]
     else:
         return [None] * number_of_elements_in_array
@@ -244,9 +250,9 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
                  'number_of_biodiversite_causes_directes',
                  'number_of_biodiversite_consequences',
                  'number_of_biodiversite_solutions_directes'
-                 ,'number_of_keywords_20'
-                 ,'number_of_keywords_30'
-                 ,'number_of_keywords_40'
+                 ,"number_of_keywords_climat"
+                 ,"number_of_keywords_biodiversite"
+                 ,"number_of_keywords_ressources"
                 ]
             ] = df[['plaintext','srt', 'start']]\
                 .swifter.apply(\
@@ -275,14 +281,14 @@ def add_primary_key(row):
 def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
     return list(filter(lambda kw: indirectes not in kw['theme'], keywords_with_timestamp))
 
-def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: str = None) -> int:
+def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None) -> int:
     total_keywords = len(keywords_with_timestamp)
     if(total_keywords) == 0:
         return 0
     else:
         if theme is not None:
             logging.debug(f"filter theme {theme}")
-            keywords_with_timestamp = list(filter(lambda kw: kw['theme'] == theme, keywords_with_timestamp))
+            keywords_with_timestamp = list(filter(lambda kw: kw['theme'] in theme, keywords_with_timestamp))
 
         length_filtered_items = len(keywords_with_timestamp)
 

diff --git a/quotaclimat/data_processing/mediatree/update_pg_keywords.py b/quotaclimat/data_processing/mediatree/update_pg_keywords.py
@@ -47,9 +47,9 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
                     ,number_of_biodiversite_causes_directes \
                     ,number_of_biodiversite_consequences \
                     ,number_of_biodiversite_solutions_directes \
-                    ,new_number_of_keywords_20 \
-                    ,new_number_of_keywords_30 \
-                    ,new_number_of_keywords_40 = get_themes_keywords_duration(plaintext, srt, start)
+                    ,new_number_of_keywords_climat \
+                    ,new_number_of_keywords_biodiversite \
+                    ,new_number_of_keywords_ressources = get_themes_keywords_duration(plaintext, srt, start)
                 except Exception as err:
                         logging.error(f"continuing loop but met error : {err}")
                         continue
@@ -83,9 +83,9 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
                 ,number_of_biodiversite_consequences
                 ,number_of_biodiversite_solutions_directes
                 ,channel_title=channel_title
-                ,number_of_keywords_20=new_number_of_keywords_20
-                ,number_of_keywords_30=new_number_of_keywords_30
-                ,number_of_keywords_40=new_number_of_keywords_40
+                ,number_of_keywords_climat=new_number_of_keywords_climat
+                ,number_of_keywords_biodiversite=new_number_of_keywords_biodiversite
+                ,number_of_keywords_ressources=new_number_of_keywords_ressources
                 )
             else:
                 program_name, program_name_type = get_a_program_with_start_timestamp(df_programs, pd.Timestamp(start).tz_convert('Europe/Paris'), channel_name)
@@ -145,9 +145,9 @@ def update_keyword_row(session: Session,
                         number_of_biodiversite_consequences: int,
                         number_of_biodiversite_solutions_directes: int,
                         channel_title: str
-                        ,number_of_keywords_20: int
-                        ,number_of_keywords_30: int
-                        ,number_of_keywords_40: int
+                        ,number_of_keywords_climat: int
+                        ,number_of_keywords_biodiversite: int
+                        ,number_of_keywords_ressources: int
     ):
     if matching_themes is not None:
         session.query(Keywords).filter(Keywords.id == keyword_id).update(
@@ -167,9 +167,9 @@ def update_keyword_row(session: Session,
                 Keywords.number_of_biodiversite_consequences:number_of_biodiversite_consequences ,
                 Keywords.number_of_biodiversite_solutions_directes:number_of_biodiversite_solutions_directes,
                 Keywords.channel_title: channel_title
-                ,Keywords.number_of_keywords_20: number_of_keywords_20
-                ,Keywords.number_of_keywords_30: number_of_keywords_30
-                ,Keywords.number_of_keywords_40: number_of_keywords_40
+                ,Keywords.number_of_keywords_climat: number_of_keywords_climat
+                ,Keywords.number_of_keywords_biodiversite: number_of_keywords_biodiversite
+                ,Keywords.number_of_keywords_ressources: number_of_keywords_ressources
             },
             synchronize_session=False
         )
File	Stmts	Miss	Cover	Missing
postgres
insert_data.py	43	7	84%	36–38, 56–58, 63
insert_existing_data_example.py	19	3	84%	25–27
postgres/schemas
models.py	150	10	93%	124–131, 143–144, 202–203, 217–218
quotaclimat/data_ingestion
scrap_sitemap.py	134	17	87%	27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
ingest_sitemap_in_db.py	55	37	33%	21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
scrap_description_article.py	36	3	92%	19–20, 32
quotaclimat/data_processing/mediatree
api_import.py	211	132	37%	44–48, 53–69, 73–76, 82, 85–126, 132–147, 151–152, 165–177, 181–187, 200–212, 215–219, 225, 261–262, 265–301, 304–306
channel_program.py	157	56	64%	28–30, 41–43, 60–61, 64–66, 93, 105, 114, 154–195
config.py	15	2	87%	7, 16
detect_keywords.py	209	8	96%	222, 272–279
update_pg_keywords.py	54	39	28%	14–100, 125–129, 152–178, 184
utils.py	69	22	68%	27–51, 54, 63, 84–85
quotaclimat/utils
healthcheck_config.py	29	14	52%	22–24, 27–38
logger.py	24	11	54%	22–24, 28–37
sentry.py	11	2	82%	22–23
TOTAL	1242	363	71%