Skip to content

Commit

Permalink
wip (#272)
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus authored Oct 17, 2024
1 parent 85c3ddb commit 033357a
Show file tree
Hide file tree
Showing 3 changed files with 231 additions and 44 deletions.
13 changes: 13 additions & 0 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,25 @@ def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict])-> Li

return keywords_with_timestamp

def replace_word_with_context(text: str, word: str = "groupe verlaine", length_to_remove: int = 50) -> str:
    """Remove every occurrence of *word* together with its surrounding context.

    Each match spans up to ``length_to_remove`` characters before the word,
    the word itself, and up to ``length_to_remove`` characters after it. The
    whole span is replaced by an empty string. The context is matched with
    ``.``, which does not cross newlines, so the removed window stays on the
    line that contains the word.

    Args:
        text: The text to clean.
        word: Literal phrase to look for (regex metacharacters are escaped).
        length_to_remove: Maximum number of characters removed on each side
            of the phrase.

    Returns:
        ``text`` with every matched window removed; unchanged when the
        phrase does not occur or when ``word`` is empty.

    Raises:
        ValueError: If ``length_to_remove`` is negative.
    """
    if length_to_remove < 0:
        # A negative bound would make "{0,-1}" a literal string in the regex
        # instead of a quantifier, silently matching nothing useful.
        raise ValueError("length_to_remove must be >= 0")
    if not word:
        # An empty phrase would let the context wildcards swallow the text.
        return text

    context = f".{{0,{length_to_remove}}}"
    pattern = f"{context}{re.escape(word)}{context}"

    # Replace the matched word along with its surrounding context
    return re.sub(pattern, "", text)
def remove_stopwords(plaintext: str) -> str:
    """Return *plaintext* with every entry of ``STOP_WORDS`` removed.

    Each stop word is deleted with a plain substring replacement, in the
    iteration order of ``STOP_WORDS``. If the phrase "groupe verlaine" is
    still present afterwards, the text is additionally passed through
    ``replace_word_with_context`` and that result is returned.
    """
    logging.debug(f"Removing stopwords {plaintext}")

    cleaned = plaintext
    for stopword in STOP_WORDS:
        cleaned = cleaned.replace(stopword, '')

    # Nothing more to do unless the special phrase survived the stop-word pass.
    if "groupe verlaine" not in cleaned:
        return cleaned

    logging.debug("special groupe verlaine case")
    return replace_word_with_context(cleaned)

@sentry_sdk.trace
Expand Down
Loading

1 comment on commit 033357a

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
File | Stmts | Miss | Cover | Missing
postgres | | | |
   insert_data.py | 43 | 7 | 84% | 36–38, 56–58, 63
   insert_existing_data_example.py | 19 | 3 | 84% | 25–27
postgres/schemas | | | |
   models.py | 157 | 11 | 93% | 126–133, 146, 148–149, 214–215, 229–230
quotaclimat/data_ingestion | | | |
   scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db | | | |
   ingest_sitemap_in_db.py | 55 | 37 | 33% | 21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html | | | |
   scrap_description_article.py | 36 | 3 | 92% | 19–20, 32
quotaclimat/data_processing/mediatree | | | |
   api_import.py | 213 | 133 | 38% | 44–48, 53–74, 78–81, 87, 90–132, 138–153, 158, 171–183, 187–193, 206–218, 221–225, 231, 269–270, 273–304, 307–309
   channel_program.py | 162 | 57 | 65% | 21–23, 34–36, 53–54, 57–59, 98–99, 108, 124, 175–216
   config.py | 15 | 2 | 87% | 7, 16
   detect_keywords.py | 232 | 16 | 93% | 111–118, 126–127, 235, 293–300, 336
   update_pg_keywords.py | 67 | 49 | 27% | 15–108, 132, 135, 142–157, 180–206, 213
   utils.py | 79 | 25 | 68% | 29–53, 56, 65, 86–87, 117–120
quotaclimat/utils | | | |
   healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38
   logger.py | 24 | 11 | 54% | 22–24, 28–37
   sentry.py | 11 | 2 | 82% | 22–23
TOTAL | 1303 | 387 | 70% |

Tests Skipped Failures Errors Time
97 0 💤 0 ❌ 0 🔥 8m 10s ⏱️

Please sign in to comment.