Skip to content

Commit

Permalink
feat: number_of_keywords climate/biodiv/ressources - 20 secondes wind…
Browse files Browse the repository at this point in the history
…ow (#236)

* feat: number_of_keywords climate/biodiv/ressources - 20 secondes window

* doc: alembic

* db: alembic
  • Loading branch information
polomarcus authored Sep 13, 2024
1 parent 79ce925 commit 4e0dfa0
Show file tree
Hide file tree
Showing 10 changed files with 180 additions and 111 deletions.
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -314,10 +314,6 @@ We can adjust batch update with these env variables (as in the docker-compose.ym
BATCH_SIZE: 50000 # number of records to update in one batch
```

### Comparison between 15/20/30/40 window
Set `COMPARE_DURATION` to true, as in the docker-compose.yml, to calculate number_of_keywords_20/30/40 in addition to the default 15-second window.
The goal is to compare different durations in order to select one; it should be deactivated to make the program more efficient.

### Batch program data
Setting `UPDATE_PROGRAM_ONLY` to true will only update program metadata; otherwise, it will update program metadata and all theme/keywords calculations.

Expand All @@ -336,12 +332,15 @@ We can use [a Github actions to start multiple update operations with different
Using [Alembic](https://alembic.sqlalchemy.org/en/latest/autogenerate.html) auto-generated migrations, we can add a new column inside `models.py` and Alembic will automatically generate the corresponding schema migration:

```
# If changes have already been applied and you want to recreate your alembic file:
# 1. change to you main branch
# If changes have already been applied (on your feature branch) and you have to recreate your alembic file, do the following:
# 1. change to your main branch
git switch main
# 2. start test container and run "pytest -vv -k api" to rebuild the state of the DB (or drop the table you want)
# 3. rechange to your WIP branch
git switch -
# 4. connect to the test container : docker compose up test -d / docker compose exec test bash
# 5. reapply the latest saved state : poetry run alembic upgrade head
# 5. reapply the latest saved state :
poetry run alembic stamp head
# 6. Save the new columns
poetry run alembic revision --autogenerate -m "Add new column test for table keywords"
# this should generate a file to commit inside "alembic/versions"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Add new column test for table keywords
Revision ID: a5c39db3c8e9
Revises: 5ccd746ee292
Create Date: 2024-09-12 14:10:26.305593
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# Revision identifiers, used by Alembic.
# `revision` is this migration's own id; `down_revision` is the id of the
# migration it revises (both match the "Revision ID" / "Revises" lines of the
# module docstring).
revision: str = 'a5c39db3c8e9'
down_revision: Union[str, None] = '5ccd746ee292'
# No branch label and no extra dependency are declared for this revision.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Apply this revision; it performs no schema operations."""
    # ### commands auto generated by Alembic - please adjust! ###
    # (no operations are listed for this revision)
    # ### end Alembic commands ###


def downgrade() -> None:
    """Revert this revision; it performs no schema operations."""
    # ### commands auto generated by Alembic - please adjust! ###
    # (no operations are listed for this revision)
    # ### end Alembic commands ###
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Add new column number_of_keywords climat/biod/r
Revision ID: af956a85658f
Revises: a5c39db3c8e9
Create Date: 2024-09-12 14:15:12.049367
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# Revision identifiers, used by Alembic.
# `revision` is this migration's own id; `down_revision` is the id of the
# migration it revises (both match the "Revision ID" / "Revises" lines of the
# module docstring).
revision: str = 'af956a85658f'
down_revision: Union[str, None] = 'a5c39db3c8e9'
# No branch label and no extra dependency are declared for this revision.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the three nullable integer keyword counters to the ``keywords`` table.

    Columns added, in order: ``number_of_keywords_climat``,
    ``number_of_keywords_biodiversite``, ``number_of_keywords_ressources``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    for column_name in (
        'number_of_keywords_climat',
        'number_of_keywords_biodiversite',
        'number_of_keywords_ressources',
    ):
        new_column = sa.Column(column_name, sa.Integer(), nullable=True)
        op.add_column('keywords', new_column)
    # ### end Alembic commands ###


def downgrade() -> None:
    """Drop the three keyword counter columns from the ``keywords`` table.

    Columns are dropped in this order: ``number_of_keywords_ressources``,
    ``number_of_keywords_biodiversite``, ``number_of_keywords_climat``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    for column_name in (
        'number_of_keywords_ressources',
        'number_of_keywords_biodiversite',
        'number_of_keywords_climat',
    ):
        op.drop_column('keywords', column_name)
    # ### end Alembic commands ###
3 changes: 1 addition & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
version: '3.8'
services:
test:
build:
Expand Down Expand Up @@ -41,7 +40,7 @@ services:
environment:
ENV: docker
# CHANNEL: "fr3-idf"
LOGLEVEL: INFO
LOGLEVEL: DEBUG
PYTHONPATH: /app
POSTGRES_USER: user
POSTGRES_DB: barometre
Expand Down
9 changes: 6 additions & 3 deletions postgres/schemas/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,12 @@ class Keywords(Base):
number_of_biodiversite_causes_directes= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_causes_directes integer;
number_of_biodiversite_consequences= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_consequences integer;
number_of_biodiversite_solutions_directes= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_solutions_directes integer;
number_of_keywords_20 = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords_20 integer;
number_of_keywords_30 = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords_30 integer;
number_of_keywords_40 = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords_40 integer;
number_of_keywords_20 = Column(Integer) # NOT USED ANYMORE -- ALTER TABLE keywords ADD number_of_keywords_20 integer;
number_of_keywords_30 = Column(Integer) # NOT USED ANYMORE -- ALTER TABLE keywords ADD number_of_keywords_30 integer;
number_of_keywords_40 = Column(Integer) # NOT USED ANYMORE -- ALTER TABLE keywords ADD number_of_keywords_40 integer;
number_of_keywords_climat = Column(Integer) # sum of all climatique counters without duplicate (like number_of_keywords)
number_of_keywords_biodiversite = Column(Integer) # sum of all biodiversite counters without duplicate
number_of_keywords_ressources = Column(Integer) # sum of all ressources counters without duplicate

class Channel_Metadata(Base):
__tablename__ = channel_metadata_table
Expand Down
86 changes: 46 additions & 40 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def remove_stopwords(plaintext: str) -> str:
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
keywords_with_timestamp = []
number_of_elements_in_array = 17
default_window_in_seconds = 20
plaitext_without_stopwords = remove_stopwords(plaintext)
logging.debug(f"display datetime start {start}")

Expand All @@ -137,40 +138,46 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
keywords_with_timestamp.extend(keywords_to_add)

if len(keywords_with_timestamp) > 0:
# count false positive near of 15" of positive keywords
keywords_with_timestamp_15 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=15)
filtered_keywords_with_timestamp = filter_indirect_words(keywords_with_timestamp_15)
# count false positive near of default_window_in_seconds of positive keywords
keywords_with_timestamp_default = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=default_window_in_seconds)
filtered_keywords_with_timestamp = filter_indirect_words(keywords_with_timestamp_default)

theme= get_themes(keywords_with_timestamp_15)
keywords_with_timestamp= clean_metadata(keywords_with_timestamp_15)
theme= get_themes(keywords_with_timestamp_default)
keywords_with_timestamp= clean_metadata(keywords_with_timestamp_default)
number_of_keywords= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start)

themes_climat = ["changement_climatique_constat",
"changement_climatique_causes",
"changement_climatique_consequences",
"attenuation_climatique_solutions",
"adaptation_climatique_solutions"
]
number_of_keywords_climat= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start, theme=themes_climat)
themes_biodiversite = [
"biodiversite_concepts_generaux",
"biodiversite_causes",
"biodiversite_consequences",
"biodiversite_solutions",
]
number_of_keywords_biodiversite= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start, themes_biodiversite)

themes_ressources = ["ressources",
"ressources_solutions",
]
number_of_keywords_ressources= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start, themes_ressources)

number_of_changement_climatique_constat = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_constat"])
number_of_changement_climatique_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_causes"])
number_of_changement_climatique_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_consequences"])
number_of_attenuation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["attenuation_climatique_solutions"])
number_of_adaptation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["adaptation_climatique_solutions"])
number_of_ressources = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources"])
number_of_ressources_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources_solutions"])
number_of_biodiversite_concepts_generaux = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_concepts_generaux"])
number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"])
number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"])
number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"])

number_of_changement_climatique_constat = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_constat")
number_of_changement_climatique_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_causes")
number_of_changement_climatique_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_consequences")
number_of_attenuation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="attenuation_climatique_solutions")
number_of_adaptation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="adaptation_climatique_solutions")
number_of_ressources = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources")
number_of_ressources_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources_solutions")
number_of_biodiversite_concepts_generaux = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_concepts_generaux")
number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_causes")
number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_consequences")
number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_solutions")

if(os.environ.get("COMPARE_DURATION") == "true"):
logging.debug(f"Comparaison between 15/20/30/40 is activated")
keywords_with_timestamp_20 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=20)
keywords_with_timestamp_30 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=30)
keywords_with_timestamp_40 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=40)
number_of_keywords_20= count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_20), start)
number_of_keywords_30= count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_30), start)
number_of_keywords_40= count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_40), start)
else:
logging.debug(f"No comparaison between 15/20/30/40 is activated")
number_of_keywords_20=None
number_of_keywords_30=None
number_of_keywords_40=None
# TODO refacto this return array and else
return [
theme
,keywords_with_timestamp
Expand All @@ -186,10 +193,9 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
,number_of_biodiversite_causes
,number_of_biodiversite_consequences
,number_of_biodiversite_solutions
# number_of_keywords with special duration to compare duration
,number_of_keywords_20
,number_of_keywords_30
,number_of_keywords_40
,number_of_keywords_climat
,number_of_keywords_biodiversite
,number_of_keywords_ressources
]
else:
return [None] * number_of_elements_in_array
Expand Down Expand Up @@ -244,9 +250,9 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
'number_of_biodiversite_causes_directes',
'number_of_biodiversite_consequences',
'number_of_biodiversite_solutions_directes'
,'number_of_keywords_20'
,'number_of_keywords_30'
,'number_of_keywords_40'
,"number_of_keywords_climat"
,"number_of_keywords_biodiversite"
,"number_of_keywords_ressources"
]
] = df[['plaintext','srt', 'start']]\
.swifter.apply(\
Expand Down Expand Up @@ -275,14 +281,14 @@ def add_primary_key(row):
def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Return the keyword entries whose ``'theme'`` value does not contain ``indirectes``.

    Args:
        keywords_with_timestamp: keyword dictionaries, each with a ``'theme'`` key.

    Returns:
        A new list holding the retained entries in their original order.
    """
    return [
        keyword
        for keyword in keywords_with_timestamp
        if indirectes not in keyword['theme']
    ]

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: str = None) -> int:
def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None) -> int:
total_keywords = len(keywords_with_timestamp)
if(total_keywords) == 0:
return 0
else:
if theme is not None:
logging.debug(f"filter theme {theme}")
keywords_with_timestamp = list(filter(lambda kw: kw['theme'] == theme, keywords_with_timestamp))
keywords_with_timestamp = list(filter(lambda kw: kw['theme'] in theme, keywords_with_timestamp))

length_filtered_items = len(keywords_with_timestamp)

Expand Down
24 changes: 12 additions & 12 deletions quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
,number_of_biodiversite_causes_directes \
,number_of_biodiversite_consequences \
,number_of_biodiversite_solutions_directes \
,new_number_of_keywords_20 \
,new_number_of_keywords_30 \
,new_number_of_keywords_40 = get_themes_keywords_duration(plaintext, srt, start)
,new_number_of_keywords_climat \
,new_number_of_keywords_biodiversite \
,new_number_of_keywords_ressources = get_themes_keywords_duration(plaintext, srt, start)
except Exception as err:
logging.error(f"continuing loop but met error : {err}")
continue
Expand Down Expand Up @@ -83,9 +83,9 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
,number_of_biodiversite_consequences
,number_of_biodiversite_solutions_directes
,channel_title=channel_title
,number_of_keywords_20=new_number_of_keywords_20
,number_of_keywords_30=new_number_of_keywords_30
,number_of_keywords_40=new_number_of_keywords_40
,number_of_keywords_climat=new_number_of_keywords_climat
,number_of_keywords_biodiversite=new_number_of_keywords_biodiversite
,number_of_keywords_ressources=new_number_of_keywords_ressources
)
else:
program_name, program_name_type = get_a_program_with_start_timestamp(df_programs, pd.Timestamp(start).tz_convert('Europe/Paris'), channel_name)
Expand Down Expand Up @@ -145,9 +145,9 @@ def update_keyword_row(session: Session,
number_of_biodiversite_consequences: int,
number_of_biodiversite_solutions_directes: int,
channel_title: str
,number_of_keywords_20: int
,number_of_keywords_30: int
,number_of_keywords_40: int
,number_of_keywords_climat: int
,number_of_keywords_biodiversite: int
,number_of_keywords_ressources: int
):
if matching_themes is not None:
session.query(Keywords).filter(Keywords.id == keyword_id).update(
Expand All @@ -167,9 +167,9 @@ def update_keyword_row(session: Session,
Keywords.number_of_biodiversite_consequences:number_of_biodiversite_consequences ,
Keywords.number_of_biodiversite_solutions_directes:number_of_biodiversite_solutions_directes,
Keywords.channel_title: channel_title
,Keywords.number_of_keywords_20: number_of_keywords_20
,Keywords.number_of_keywords_30: number_of_keywords_30
,Keywords.number_of_keywords_40: number_of_keywords_40
,Keywords.number_of_keywords_climat: number_of_keywords_climat
,Keywords.number_of_keywords_biodiversite: number_of_keywords_biodiversite
,Keywords.number_of_keywords_ressources: number_of_keywords_ressources
},
synchronize_session=False
)
Expand Down
Loading

1 comment on commit 4e0dfa0

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
postgres
   insert_data.py43784%36–38, 56–58, 63
   insert_existing_data_example.py19384%25–27
postgres/schemas
   models.py1501093%124–131, 143–144, 202–203, 217–218
quotaclimat/data_ingestion
   scrap_sitemap.py1341787%27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
   ingest_sitemap_in_db.py553733%21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
   scrap_description_article.py36392%19–20, 32
quotaclimat/data_processing/mediatree
   api_import.py21113237%44–48, 53–69, 73–76, 82, 85–126, 132–147, 151–152, 165–177, 181–187, 200–212, 215–219, 225, 261–262, 265–301, 304–306
   channel_program.py1575664%28–30, 41–43, 60–61, 64–66, 93, 105, 114, 154–195
   config.py15287%7, 16
   detect_keywords.py209896%222, 272–279
   update_pg_keywords.py543928%14–100, 125–129, 152–178, 184
   utils.py692268%27–51, 54, 63, 84–85
quotaclimat/utils
   healthcheck_config.py291452%22–24, 27–38
   logger.py241154%22–24, 28–37
   sentry.py11282%22–23
TOTAL124236371% 

Tests Skipped Failures Errors Time
87 0 💤 0 ❌ 0 🔥 1m 36s ⏱️

Please sign in to comment.