Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: number_of_keywords climate/biodiv/ressources - 20 secondes window #236

Merged
merged 3 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -300,10 +300,6 @@ We can adjust batch update with these env variables (as in the docker-compose.ym
BATCH_SIZE: 50000 # number of records to update in one batch
```

### Comparison between 15/20/30/40 window
Set `COMPARE_DURATION` to true such as in the docker-compose.yml to calculate number_of_keywords_20/30/40 in addition to 15.
The goal is to compare the different durations in order to select one; it should be deactivated afterwards to make the program more efficient.

### Batch program data
`UPDATE_PROGRAM_ONLY` to true will only update program metadata, otherwise, it will update program metadata and all theme/keywords calculations.

Expand All @@ -322,12 +318,15 @@ We can use [a Github actions to start multiple update operations with different
Using [Alembic](https://alembic.sqlalchemy.org/en/latest/autogenerate.html) Auto Generating Migrations¶ we can add a new column inside `models.py` and it will automatically make the schema evolution :

```
# If changes have already been applied and you want to recreate your alembic file:
# 1. change to you main branch
# If changes have already been applied (on your feature branch) and you need to recreate your alembic file, do the following:
# 1. change to your main branch
git switch main
# 2. start test container and run "pytest -vv -k api" to rebuild the state of the DB (or drop table the table you want)
# 3. rechange to your WIP branch
git switch -
# 4. connect to the test container : docker compose up test -d / docker compose exec test bash
# 5. reapply the latest saved state : poetry run alembic upgrade head
# 5. reapply the latest saved state :
poetry run alembic stamp head
# 6. Save the new columns
poetry run alembic revision --autogenerate -m "Add new column test for table keywords"
# this should generate a file to commit inside "alembic/versions"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Add new column test for table keywords

Revision ID: a5c39db3c8e9
Revises: 5ccd746ee292
Create Date: 2024-09-12 14:10:26.305593

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'a5c39db3c8e9'
down_revision: Union[str, None] = '5ccd746ee292'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Intentionally empty upgrade step.

    NOTE(review): this revision adds no schema changes — presumably the
    columns were already applied out-of-band and this file only anchors
    Alembic's revision history (cf. the `alembic stamp head` workflow);
    confirm before deleting.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    pass
    # ### end Alembic commands ###


def downgrade() -> None:
    """Intentionally empty downgrade step.

    Mirrors the empty upgrade: there is nothing to revert for this
    revision, so downgrading is a no-op.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    pass
    # ### end Alembic commands ###
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Add new column number_of_keywords climat/biod/r

Revision ID: af956a85658f
Revises: a5c39db3c8e9
Create Date: 2024-09-12 14:15:12.049367

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'af956a85658f'
down_revision: Union[str, None] = 'a5c39db3c8e9'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the per-category keyword counters to the `keywords` table.

    Creates three nullable Integer columns:
    number_of_keywords_climat, number_of_keywords_biodiversite,
    number_of_keywords_ressources.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # Same column shape for each category, so add them in one loop.
    for category in ("climat", "biodiversite", "ressources"):
        op.add_column(
            "keywords",
            sa.Column(f"number_of_keywords_{category}", sa.Integer(), nullable=True),
        )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Remove the per-category keyword counters from the `keywords` table.

    Drops the three columns added by upgrade(), in reverse order of
    their creation.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    for category in ("ressources", "biodiversite", "climat"):
        op.drop_column("keywords", f"number_of_keywords_{category}")
    # ### end Alembic commands ###
3 changes: 1 addition & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
version: '3.8'
services:
test:
build:
Expand Down Expand Up @@ -41,7 +40,7 @@ services:
environment:
ENV: docker
# CHANNEL: "fr3-idf"
LOGLEVEL: INFO
LOGLEVEL: DEBUG
PYTHONPATH: /app
POSTGRES_USER: user
POSTGRES_DB: barometre
Expand Down
9 changes: 6 additions & 3 deletions postgres/schemas/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,12 @@ class Keywords(Base):
number_of_biodiversite_causes_directes= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_causes_directes integer;
number_of_biodiversite_consequences= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_consequences integer;
number_of_biodiversite_solutions_directes= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_solutions_directes integer;
number_of_keywords_20 = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords_20 integer;
number_of_keywords_30 = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords_30 integer;
number_of_keywords_40 = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords_40 integer;
number_of_keywords_20 = Column(Integer) # NOT USED ANYMORE -- ALTER TABLE keywords ADD number_of_keywords_20 integer;
number_of_keywords_30 = Column(Integer) # NOT USED ANYMORE -- ALTER TABLE keywords ADD number_of_keywords_30 integer;
number_of_keywords_40 = Column(Integer) # NOT USED ANYMORE -- ALTER TABLE keywords ADD number_of_keywords_40 integer;
number_of_keywords_climat = Column(Integer) # sum of all climatique counters without duplicate (like number_of_keywords)
number_of_keywords_biodiversite = Column(Integer) # sum of all biodiversite counters without duplicate
number_of_keywords_ressources = Column(Integer) # sum of all ressources counters without duplicate

class Channel_Metadata(Base):
__tablename__ = channel_metadata_table
Expand Down
86 changes: 46 additions & 40 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def remove_stopwords(plaintext: str) -> str:
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
keywords_with_timestamp = []
number_of_elements_in_array = 17
default_window_in_seconds = 20
plaitext_without_stopwords = remove_stopwords(plaintext)
logging.debug(f"display datetime start {start}")

Expand All @@ -137,40 +138,46 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
keywords_with_timestamp.extend(keywords_to_add)

if len(keywords_with_timestamp) > 0:
# count false positive near of 15" of positive keywords
keywords_with_timestamp_15 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=15)
filtered_keywords_with_timestamp = filter_indirect_words(keywords_with_timestamp_15)
# count false positive near of default_window_in_seconds of positive keywords
keywords_with_timestamp_default = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=default_window_in_seconds)
filtered_keywords_with_timestamp = filter_indirect_words(keywords_with_timestamp_default)

theme= get_themes(keywords_with_timestamp_15)
keywords_with_timestamp= clean_metadata(keywords_with_timestamp_15)
theme= get_themes(keywords_with_timestamp_default)
keywords_with_timestamp= clean_metadata(keywords_with_timestamp_default)
number_of_keywords= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start)

themes_climat = ["changement_climatique_constat",
"changement_climatique_causes",
"changement_climatique_consequences",
"attenuation_climatique_solutions",
"adaptation_climatique_solutions"
]
number_of_keywords_climat= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start, theme=themes_climat)
themes_biodiversite = [
"biodiversite_concepts_generaux",
"biodiversite_causes",
"biodiversite_consequences",
"biodiversite_solutions",
]
number_of_keywords_biodiversite= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start, themes_biodiversite)

themes_ressources = ["ressources",
"ressources_solutions",
]
number_of_keywords_ressources= count_keywords_duration_overlap(filtered_keywords_with_timestamp, start, themes_ressources)

number_of_changement_climatique_constat = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_constat"])
number_of_changement_climatique_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_causes"])
number_of_changement_climatique_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_consequences"])
number_of_attenuation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["attenuation_climatique_solutions"])
number_of_adaptation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["adaptation_climatique_solutions"])
number_of_ressources = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources"])
number_of_ressources_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources_solutions"])
number_of_biodiversite_concepts_generaux = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_concepts_generaux"])
number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"])
number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"])
number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"])

number_of_changement_climatique_constat = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_constat")
number_of_changement_climatique_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_causes")
number_of_changement_climatique_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_consequences")
number_of_attenuation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="attenuation_climatique_solutions")
number_of_adaptation_climatique_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="adaptation_climatique_solutions")
number_of_ressources = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources")
number_of_ressources_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources_solutions")
number_of_biodiversite_concepts_generaux = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_concepts_generaux")
number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_causes")
number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_consequences")
number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_solutions")

if(os.environ.get("COMPARE_DURATION") == "true"):
logging.debug(f"Comparaison between 15/20/30/40 is activated")
keywords_with_timestamp_20 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=20)
keywords_with_timestamp_30 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=30)
keywords_with_timestamp_40 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=40)
number_of_keywords_20= count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_20), start)
number_of_keywords_30= count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_30), start)
number_of_keywords_40= count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_40), start)
else:
logging.debug(f"No comparaison between 15/20/30/40 is activated")
number_of_keywords_20=None
number_of_keywords_30=None
number_of_keywords_40=None
# TODO refacto this return array and else
return [
theme
,keywords_with_timestamp
Expand All @@ -186,10 +193,9 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
,number_of_biodiversite_causes
,number_of_biodiversite_consequences
,number_of_biodiversite_solutions
# number_of_keywords with special duration to compare duration
,number_of_keywords_20
,number_of_keywords_30
,number_of_keywords_40
,number_of_keywords_climat
,number_of_keywords_biodiversite
,number_of_keywords_ressources
]
else:
return [None] * number_of_elements_in_array
Expand Down Expand Up @@ -244,9 +250,9 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
'number_of_biodiversite_causes_directes',
'number_of_biodiversite_consequences',
'number_of_biodiversite_solutions_directes'
,'number_of_keywords_20'
,'number_of_keywords_30'
,'number_of_keywords_40'
,"number_of_keywords_climat"
,"number_of_keywords_biodiversite"
,"number_of_keywords_ressources"
]
] = df[['plaintext','srt', 'start']]\
.swifter.apply(\
Expand Down Expand Up @@ -275,14 +281,14 @@ def add_primary_key(row):
def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Keep only the keyword entries whose theme does not contain `indirectes`.

    `indirectes` is a module-level marker; entries whose 'theme' value
    contains it are excluded from downstream duration counting.
    """
    return [kw for kw in keywords_with_timestamp if indirectes not in kw['theme']]

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: str = None) -> int:
def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None) -> int:
total_keywords = len(keywords_with_timestamp)
if(total_keywords) == 0:
return 0
else:
if theme is not None:
logging.debug(f"filter theme {theme}")
keywords_with_timestamp = list(filter(lambda kw: kw['theme'] == theme, keywords_with_timestamp))
keywords_with_timestamp = list(filter(lambda kw: kw['theme'] in theme, keywords_with_timestamp))

length_filtered_items = len(keywords_with_timestamp)

Expand Down
24 changes: 12 additions & 12 deletions quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
,number_of_biodiversite_causes_directes \
,number_of_biodiversite_consequences \
,number_of_biodiversite_solutions_directes \
,new_number_of_keywords_20 \
,new_number_of_keywords_30 \
,new_number_of_keywords_40 = get_themes_keywords_duration(plaintext, srt, start)
,new_number_of_keywords_climat \
,new_number_of_keywords_biodiversite \
,new_number_of_keywords_ressources = get_themes_keywords_duration(plaintext, srt, start)
except Exception as err:
logging.error(f"continuing loop but met error : {err}")
continue
Expand Down Expand Up @@ -83,9 +83,9 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
,number_of_biodiversite_consequences
,number_of_biodiversite_solutions_directes
,channel_title=channel_title
,number_of_keywords_20=new_number_of_keywords_20
,number_of_keywords_30=new_number_of_keywords_30
,number_of_keywords_40=new_number_of_keywords_40
,number_of_keywords_climat=new_number_of_keywords_climat
,number_of_keywords_biodiversite=new_number_of_keywords_biodiversite
,number_of_keywords_ressources=new_number_of_keywords_ressources
)
else:
program_name, program_name_type = get_a_program_with_start_timestamp(df_programs, pd.Timestamp(start).tz_convert('Europe/Paris'), channel_name)
Expand Down Expand Up @@ -145,9 +145,9 @@ def update_keyword_row(session: Session,
number_of_biodiversite_consequences: int,
number_of_biodiversite_solutions_directes: int,
channel_title: str
,number_of_keywords_20: int
,number_of_keywords_30: int
,number_of_keywords_40: int
,number_of_keywords_climat: int
,number_of_keywords_biodiversite: int
,number_of_keywords_ressources: int
):
if matching_themes is not None:
session.query(Keywords).filter(Keywords.id == keyword_id).update(
Expand All @@ -167,9 +167,9 @@ def update_keyword_row(session: Session,
Keywords.number_of_biodiversite_consequences:number_of_biodiversite_consequences ,
Keywords.number_of_biodiversite_solutions_directes:number_of_biodiversite_solutions_directes,
Keywords.channel_title: channel_title
,Keywords.number_of_keywords_20: number_of_keywords_20
,Keywords.number_of_keywords_30: number_of_keywords_30
,Keywords.number_of_keywords_40: number_of_keywords_40
,Keywords.number_of_keywords_climat: number_of_keywords_climat
,Keywords.number_of_keywords_biodiversite: number_of_keywords_biodiversite
,Keywords.number_of_keywords_ressources: number_of_keywords_ressources
},
synchronize_session=False
)
Expand Down
Loading
Loading