Skip to content

Commit

Permalink
fix: use apply for id
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus committed Jul 30, 2024
1 parent f9fe82d commit fec078b
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 9 deletions.
2 changes: 1 addition & 1 deletion quotaclimat/data_processing/mediatree/api_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def extract_api_sub(

if(df is not None):
df = filter_and_tag_by_theme(df)
df["id"] = add_primary_key(df)
df["id"] = df.apply(lambda x: add_primary_key(x), axis=1)
return df
else:
None
Expand Down
13 changes: 6 additions & 7 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,15 +262,14 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :

return df

def add_primary_key(df):
logging.info("Adding primary key to save to PG and have idempotent result")
def add_primary_key(row):
logging.info(f"Adding primary key to save to PG and have idempotent results")
try:
return (
df["start"].astype(str) + df["channel_name"]
).apply(get_consistent_hash)
return get_consistent_hash(str(row["start"].timestamp()) + row["channel_name"])

except (Exception) as error:
logging.error(f"{error} with df {df.head()}")
return get_consistent_hash("empty") # TODO improve - should be a None ?
logging.error(f"{error} with df {row}")
raise Exception

def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Return the keyword entries whose 'theme' does not contain `indirectes`.

    The entries are returned in their original order as a new list; the
    input list itself is left untouched.
    """
    # `indirectes` is a free name that is not defined in this excerpt.
    direct_only = []
    for entry in keywords_with_timestamp:
        if indirectes in entry['theme']:
            continue
        direct_only.append(entry)
    return direct_only
Expand Down
2 changes: 1 addition & 1 deletion test/sitemap/test_main_import_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_main_api_import():
start_time = t.time()
df = parse_reponse_subtitle(json_response)
df = filter_and_tag_by_theme(df)
df["id"] = add_primary_key(df)
df["id"] = df.apply(lambda x: add_primary_key(x), axis=1)
end_time = t.time()
logging.info(f"Elapsed time for api import {end_time - start_time}")
# must df._to_pandas() because to_sql does not handle modin dataframe
Expand Down

0 comments on commit fec078b

Please sign in to comment.