Skip to content

Commit

Permalink
wip: only transform false positive when same subject
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus committed Oct 9, 2024
1 parent e2e1022 commit da0a281
Show file tree
Hide file tree
Showing 2 changed files with 160 additions and 7 deletions.
22 changes: 19 additions & 3 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,11 +307,26 @@ def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start:
# Count how many distinct 'window_number' values appear among the given keywords.
# Items without a 'window_number' key are ignored; `start` is not read in this body.
def count_different_window_number(keywords_with_timestamp: List[dict], start: datetime) -> int:
window_numbers = [item['window_number'] for item in keywords_with_timestamp if 'window_number' in item]
# a set collapses keywords sharing the same window, so each window is counted once
final_count = len(set(window_numbers))
# NOTE(review): this diff view shows both the removed hard-coded "15 second" log line and
# its DEFAULT_WINDOW_DURATION replacement — presumably only the latter remains in the file
logging.debug(f"Count with 15 second logic: {final_count} keywords")
logging.debug(f"Count with {DEFAULT_WINDOW_DURATION} second logic: {final_count} keywords")

return final_count

def contains_direct_keywords(keywords_with_timestamp: List[dict]) -> bool:
def get_subject_from_theme(theme: str) -> str:
    """Return the subject a theme name belongs to.

    The theme is searched for a known marker substring; the first marker
    found (in the order listed below) decides the subject.

    Args:
        theme: theme name, e.g. 'changement_climatique_constat'.

    Returns:
        'climat', 'biodiversite' or 'ressources', or 'unknown' when no
        marker is contained in the theme.
    """
    # Ordered (marker, subject) pairs: order matters when a theme contains several markers
    subject_by_marker = (
        ('climatique', 'climat'),
        ('biodiversite', 'biodiversite'),
        ('ressources', 'ressources'),
    )
    return next(
        (subject for marker, subject in subject_by_marker if marker in theme),
        'unknown',
    )

# only keywords of the same subject (climat/biodiversite/ressources) are taken into account
def contains_direct_keywords_same_suject(keywords_with_timestamp: List[dict], theme: str) -> bool:
    """Tell whether any keyword sharing the subject of `theme` has a theme without `indirectes`.

    Args:
        keywords_with_timestamp: keyword dicts; each one is read through its 'theme' key.
        theme: theme whose subject (as given by get_subject_from_theme) selects
            which keywords are inspected.

    Returns:
        True if at least one keyword of the same subject has a theme that does
        not contain `indirectes`; False otherwise (including for an empty list).
    """
    wanted_subject = get_subject_from_theme(theme)
    logging.debug(f"subject {wanted_subject}")
    # collect the themes of every keyword belonging to the same subject first,
    # so all keywords are inspected before the direct/indirect check runs
    same_subject_themes = [
        kw['theme']
        for kw in keywords_with_timestamp
        if get_subject_from_theme(kw['theme']) == wanted_subject
    ]
    # NOTE(review): `indirectes` is defined elsewhere in the module — presumably the
    # substring marking an indirect theme; confirm against its definition
    for kw_theme in same_subject_themes:
        if indirectes not in kw_theme:
            return True
    return False

# we want to count false positives that are within 15" of a positive keyword
Expand All @@ -326,7 +341,8 @@ def transform_false_positive_keywords_to_positive(keywords_with_timestamp: List[
, keywords_with_timestamp)
)

if( contains_direct_keywords(neighbour_keywords) ) :
if( contains_direct_keywords_same_suject(neighbour_keywords, keyword_info['theme']) ) :
logging.debug(f"Transforming false positive to positive { keyword_info['keyword']} { keyword_info['theme']}")
keyword_info['theme'] = remove_indirect(keyword_info['theme'])

return keywords_with_timestamp
Expand Down
145 changes: 141 additions & 4 deletions test/sitemap/test_detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -1328,19 +1328,19 @@ def test_different_steps_transform_false_positive_keywords_to_positive():
},
{'keyword': 'agroforesterie',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 2 + 150,
'theme': 'attenuation_climatique_solutions_indirectes' # should be stayed to indirect
'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
},
{'keyword': 'alternative durable',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 3 + 150,
'theme': 'attenuation_climatique_solutions_indirectes' # should be stayed to indirect
'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
},
{'keyword': 'planification écologique',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 4 + 150,
'theme': 'attenuation_climatique_solutions_indirectes' # should be stayed to indirect
'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
},
{'keyword': 'nucléaire',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 6 + 150,
'theme': 'attenuation_climatique_solutions_indirectes' # should be stayed to indirect
'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
}
]

Expand Down Expand Up @@ -1379,6 +1379,143 @@ def test_different_steps_transform_false_positive_keywords_to_positive():

assert transform_false_positive_keywords_to_positive(tag_wanted_duration_second_window_number(keywords_with_timestamp,start, duration_seconds=15), start) == expected_output

def test_transform_false_positive_keywords_to_positive_different_and_same_subject():
    """An indirect climate keyword is expected to become direct; indirect biodiversity ones are not."""
    biodiversity_indirect = 'biodiversite_concepts_generaux_indirectes'

    def timestamp_at(window: int, offset: int):
        # timestamp located `offset` ms after the beginning of the given 15-second window
        return original_timestamp + get_keyword_time_separation_ms(15) * window + offset

    # (keyword, window number, offset in ms, theme before, expected theme after)
    cases = [
        ('climatique', 0, 150, 'changement_climatique_constat', 'changement_climatique_constat'),
        ("activisme climatique", 1, 151,
         'attenuation_climatique_solutions_indirectes',
         'attenuation_climatique_solutions'),  # should be transformed to direct
        ('industrie verte', 1, 150, biodiversity_indirect, biodiversity_indirect),  # should stay indirect
        ('agroforesterie', 2, 150, biodiversity_indirect, biodiversity_indirect),  # should stay indirect
        ('alternative durable', 3, 150, biodiversity_indirect, biodiversity_indirect),  # should stay indirect
        ('planification écologique', 4, 150, biodiversity_indirect, biodiversity_indirect),  # should stay indirect
        ('nucléaire', 6, 150, biodiversity_indirect, biodiversity_indirect),  # should stay indirect
    ]

    keywords_with_timestamp = [
        {'keyword': keyword, 'timestamp': timestamp_at(window, offset), 'theme': theme_before}
        for keyword, window, offset, theme_before, _ in cases
    ]

    expected_output = [
        {
            'keyword': keyword,
            'timestamp': timestamp_at(window, offset),
            'window_number': window,
            'theme': theme_after,
        }
        for keyword, window, offset, _, theme_after in cases
    ]

    tagged_keywords = tag_wanted_duration_second_window_number(keywords_with_timestamp, start, duration_seconds=15)
    assert transform_false_positive_keywords_to_positive(tagged_keywords, start) == expected_output



def test_transform_false_positive_keywords_to_positive_different_subject():
    """Indirect biodiversity keywords are expected to stay indirect next to a direct climate keyword."""
    biodiversity_indirect = 'biodiversite_concepts_generaux_indirectes'

    def timestamp_at(window: int, offset: int):
        # timestamp located `offset` ms after the beginning of the given 15-second window
        return original_timestamp + get_keyword_time_separation_ms(15) * window + offset

    # (keyword, window number, theme) — the theme is expected to be unchanged for every entry
    cases = [
        ('climatique', 0, 'changement_climatique_constat'),
        ('industrie verte', 1, biodiversity_indirect),  # should stay indirect
        ('agroforesterie', 2, biodiversity_indirect),  # should stay indirect
        ('alternative durable', 3, biodiversity_indirect),  # should stay indirect
        ('planification écologique', 4, biodiversity_indirect),  # should stay indirect
        ('nucléaire', 6, biodiversity_indirect),  # should stay indirect
    ]

    keywords_with_timestamp = [
        {'keyword': keyword, 'timestamp': timestamp_at(window, 150), 'theme': theme}
        for keyword, window, theme in cases
    ]

    expected_output = [
        {
            'keyword': keyword,
            'timestamp': timestamp_at(window, 150),
            'window_number': window,
            'theme': theme,
        }
        for keyword, window, theme in cases
    ]

    tagged_keywords = tag_wanted_duration_second_window_number(keywords_with_timestamp, start, duration_seconds=15)
    assert transform_false_positive_keywords_to_positive(tagged_keywords, start) == expected_output


def test_count_different_window_number():
keywords_with_timestamp = [
Expand Down

0 comments on commit da0a281

Please sign in to comment.