Commit

trends processing more structured (best, worst)
josancamon19 committed Sep 17, 2024
1 parent 16deb1a commit f72d3c4
Showing 4 changed files with 238 additions and 32 deletions.
33 changes: 20 additions & 13 deletions backend/database/trends.py
@@ -14,17 +14,20 @@ def get_trends_data() -> List[Dict]:
    trends_docs = [doc for doc in trends_ref.stream(retry=Retry())]
    trends_data = []
    for category in trends_docs:
        category_data = category.to_dict()

        category_topics_ref = trends_ref.document(category_data['id']).collection('topics')
        topics_docs = [topic.to_dict() for topic in category_topics_ref.stream(retry=Retry())]
        topics = sorted(topics_docs, key=lambda e: len(e['memory_ids']), reverse=True)
        for topic in topics:
            topic['memories_count'] = len(topic['memory_ids'])
            del topic['memory_ids']

        category_data['topics'] = topics
        trends_data.append(category_data)
        try:
            category_data = category.to_dict()

            category_topics_ref = trends_ref.document(category_data['id']).collection('topics')
            topics_docs = [topic.to_dict() for topic in category_topics_ref.stream(retry=Retry())]
            topics = sorted(topics_docs, key=lambda e: len(e['memory_ids']), reverse=True)
            for topic in topics:
                topic['memories_count'] = len(topic['memory_ids'])
                del topic['memory_ids']

            category_data['topics'] = topics
            trends_data.append(category_data)
        except Exception as e:
            continue
    return trends_data


@@ -34,10 +34,14 @@ def save_trends(memory: Memory, trends: List[Trend]):
    for trend in trends:
        category = trend.category.value
        topics = trend.topics
        category_id = document_id_from_seed(category)
        trend_type = trend.type.value
        category_id = document_id_from_seed(category + trend_type)
        category_doc_ref = trends_coll_ref.document(category_id)

        category_doc_ref.set({"id": category_id, "category": category, "created_at": datetime.utcnow()}, merge=True)
        category_doc_ref.set(
            {"id": category_id, "category": category, "type": trend_type, "created_at": datetime.utcnow()},
            merge=True
        )

        topics_coll_ref = category_doc_ref.collection('topics')

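The key change in `save_trends` is that the Firestore category document ID is now seeded with both the category and the trend type, so "best" and "worst" trends of the same category land in separate documents instead of overwriting each other. A minimal sketch of the idea, assuming `document_id_from_seed` derives a deterministic ID from its seed string (the real helper in `backend/utils` may use a different scheme):

```python
import hashlib


def document_id_from_seed(seed: str) -> str:
    # Assumed behaviour: a deterministic id derived from the seed string.
    # Illustrative only; the repository's helper may differ.
    return hashlib.md5(seed.encode()).hexdigest()


# Before this commit, both sentiments of a category mapped to the same document:
old_id = document_id_from_seed("company")

# After it, each (category, type) pair gets its own category document:
best_id = document_id_from_seed("company" + "best")
worst_id = document_id_from_seed("company" + "worst")
assert best_id != worst_id
```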
213 changes: 203 additions & 10 deletions backend/models/trend.py
@@ -5,20 +5,213 @@


class TrendEnum(str, Enum):
    acquisition = "acquisition"
    ceo = "ceo"
    company = "company"
    event = "event"
    founder = "founder"
    industry = "industry"
    innovation = "innovation"
    investment = "investment"
    partnership = "partnership"
    product = "product"
    research = "research"
    tool = "tool"
    software_product = "software_product"
    hardware_product = "hardware_product"
    ai_product = "ai_product"


class TrendType(str, Enum):
    best = "best"
    worst = "worst"


class Trend(BaseModel):
    category: TrendEnum = Field(description="The category identified")
    type: TrendType = Field(description="The type of trend identified")
    topics: List[str] = Field(description="The specific topics corresponding to the category")


ceo_options = [
"Elon Musk",
"Sundar Pichai",
"Satya Nadella",
"Jensen Huang",
"Andy Jassy",
"Ryan Breslow",
"Henrique Dubugras",
"Alexandr Wang",
"Tim Cook",
"Marc Benioff",
"Dylan Field",
"Parag Agrawal",
"Brian Chesky",
"Patrick Collison",
"Andrew Wilson",
"Lisa Su",
"Austin Russell",
"Sam Altman",
"Darius Adamczyk",
"Shantanu Narayen",
"Bob Chapek",
"Mark Zuckerberg",
"David Zaslav",
"Mary Barra",
"Howard Schultz",
"Raj Subramaniam",
"Arvind Krishna",
"Adam Neumann",
"Vlad Tenev",
"Dara Khosrowshahi",
"Fran Horowitz",
"Yuanqing Yang",
"Frank Slootman",
"William McDermott",
"Anthony Wood",
"Roland Busch",
"Christian Klein",
"Kazuhiro Tsuga",
"Stéphane Bancel"
]

company_options = [
"Microsoft",
"Nvidia",
"Amazon",
"Apple",
"Tesla",
"Salesforce",
"Shopify",
"Google/Alphabet",
"SpaceX",
"OpenAI",
"Brex",
"Stripe",
"Adobe",
"Zoom",
"Figma",
"Databricks",
"GitHub",
"Luminar",
"Airbnb",
"Square",
"Meta",
"Warner Bros. Discovery",
"Disney",
"X (formerly Twitter)",
"BP",
"Robinhood",
"Peloton",
"Boeing",
"WeWork",
"FedEx",
"AT&T",
"IBM",
"Frontier Airlines",
"Uber",
"Juul",
"TikTok",
"Snapchat",
"Nestlé",
"Facebook",
"GameStop"
]

software_product_options = [
"Microsoft Copilot",
"OpenAI GPT-4",
"Slack",
"Google Workspace",
"Zoom",
"Salesforce CRM",
"Adobe Photoshop",
"Figma",
"Datadog",
"ServiceNow",
"HubSpot",
"Notion",
"Tableau",
"Monday.com",
"GitHub Copilot",
"Asana",
"Trello",
"Snowflake",
"Atlassian Jira",
"ZoomInfo",
"Meta Horizon Worlds",
"Robinhood",
"Oracle",
"Evernote",
"Google Stadia",
"Facebook Workplace",
"SAP S/4HANA",
"IBM Watson",
"Quibi",
"Kaspersky",
"Palantir",
"Clubhouse",
"Slack Threads",
"TikTok’s Creator Tools",
"Samsung Bixby",
"Salesforce Marketing Cloud",
"Microsoft Teams",
"Intel AI Suite",
"Uber Driver App"
]

hardware_product_options = [
"Tesla Cybertruck",
"iPhone 16",
"MacBook Pro",
"Nvidia RTX 5090",
"SpaceX Starship",
"Amazon Echo",
"Sony PlayStation 6",
"Microsoft Surface Pro 10",
"Dyson V16",
"Luminar LiDAR",
"DJI Mavic 3",
"Apple Vision Pro",
"Google Pixel 9",
"Framework Laptop",
"Oculus Quest 4",
"Logitech G Pro X",
"Samsung Galaxy Fold 4",
"Fitbit Charge 6",
"Apple Watch 9",
"Lumix S5 II"
]

ai_product_options = [
"OpenAI GPT-5",
"Google DeepMind",
"Nvidia Omniverse",
"Microsoft Copilot",
"Tesla FSD",
"Amazon Alexa",
"Salesforce Einstein",
"Palantir Foundry",
"Scale AI",
"Grammarly AI",
"MidJourney",
"Hugging Face",
"Runway Gen-2",
"Anthropic Claude 2",
"Cohere",
"Databricks AI",
"Hugging Face Transformers",
"Notion AI",
"Synthesia",
"Jasper AI",
"Meta AI",
"IBM Watson",
"Clearview AI",
"Facebook AI",
"Google Duplex",
"Samsung Bixby",
"Twitter AI moderation",
"Microsoft Tay",
"Replika",
"Clear AI",
"TikTok AI",
"Robinhood AI Trading",
"Meta Horizon AI",
"Ring AI",
"Uber AI",
"Watson Health",
"ChatGPT clones",
"Tinder AI",
"Zoom AI transcription",
"Salesforce AI"
]
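With the new `TrendType` enum and the `type` field on `Trend`, an extracted topic now carries its sentiment alongside its category. A quick illustration of constructing the updated model, using the field names shown in the diff above (the serialization call depends on the pydantic version in use):

```python
from models.trend import Trend, TrendEnum, TrendType

# A positively perceived mention of a company, as the extractor now emits it.
trend = Trend(
    category=TrendEnum.company,
    type=TrendType.best,
    topics=["Tesla"],
)

print(trend.dict())  # .model_dump() on pydantic v2
```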
22 changes: 14 additions & 8 deletions backend/utils/llm.py
@@ -12,7 +12,8 @@
from models.memory import Structured, MemoryPhoto, CategoryEnum, Memory
from models.plugin import Plugin
from models.transcript_segment import TranscriptSegment
from models.trend import TrendEnum
from models.trend import TrendEnum, ceo_options, company_options, software_product_options, hardware_product_options, \
ai_product_options, TrendType
from utils.memories.facts import get_prompt_facts

llm_mini = ChatOpenAI(model='gpt-4o-mini')
@@ -472,6 +473,7 @@ def new_facts_extractor(uid: str, segments: List[TranscriptSegment]) -> List[Fac

class Item(BaseModel):
    category: TrendEnum = Field(description="The category identified")
    type: TrendType = Field(description="The sentiment identified")
    topic: str = Field(description="The specific topic corresponding to the category")


@@ -487,16 +489,20 @@ def trends_extractor(memory: Memory) -> List[Item]:
    prompt = f'''
You will be given a finished conversation transcript.
You are responsible for extracting the topics of the conversation and classifying each one within one of the following categories: {str([e.value for e in TrendEnum]).strip("[]")}.
You must identify if the perception is positive or negative, and classify it as "best" or "worst".
Each topic must be a person, company, event, technology, product, research, innovation, acquisition, partnership, investment, founder, CEO, industry, or any other relevant topic.
It can't be a non-specific topic like "the weather" or "the economy".
For the specific topics, here are the options available; you must classify each topic as one of these options:
- ceo_options: {", ".join(ceo_options)}
- company_options: {", ".join(company_options)}
- software_product_options: {", ".join(software_product_options)}
- hardware_product_options: {", ".join(hardware_product_options)}
- ai_product_options: {", ".join(ai_product_options)}
For example,
If you identify the topic "Tesla", you should classify it as "company".
If you identify the topic "Elon Musk", you should classify it as "ceo".
If you identify the topic "Dreamforce", you should classify it as "event".
If you identify the topic "GPT O1", you should classify it as "tool".
If you identify the topic "Tesla stock has been going up incredibly", you should output:
- Category: company
- Type: best
- Topic: Tesla
Conversation:
{transcript}
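The body of `trends_extractor` is truncated in this diff, but the `Item` model above indicates the prompt is parsed into structured output. A hypothetical sketch of how the call could look with LangChain's `with_structured_output` — the `ExpectedOutput` wrapper and the prompt wiring are assumptions for illustration, not the repository's actual code:

```python
from typing import List

from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field


class ExpectedOutput(BaseModel):
    # Hypothetical wrapper around the Item model defined above in utils/llm.py.
    items: List[Item] = Field(default_factory=list, description="Topics with category and sentiment")


def extract_trends_from_transcript(transcript: str) -> List[Item]:
    # Sketch only: build the prompt shown above with the transcript interpolated,
    # then ask the model for output matching ExpectedOutput.
    prompt = f"...prompt from trends_extractor...\nConversation:\n{transcript}"
    llm = ChatOpenAI(model='gpt-4o-mini')
    response: ExpectedOutput = llm.with_structured_output(ExpectedOutput).invoke(prompt)
    return response.items
```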
2 changes: 1 addition & 1 deletion backend/utils/memories/process_memory.py
@@ -128,7 +128,7 @@ def _extract_facts(uid: str, memory: Memory):

def _extract_trends(memory: Memory):
    extracted_items = trends_extractor(memory)
    parsed = [Trend(category=item.category, topics=[item.topic]) for item in extracted_items]
    parsed = [Trend(category=item.category, topics=[item.topic], type=item.type) for item in extracted_items]
    trends_db.save_trends(memory, parsed)


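Downstream, `get_trends_data` (first file above) now returns one entry per (category, type) pair, with the new `type` field on each category document. An illustrative shape — the field values and the extra topic fields are made up for the example:

```python
# Hypothetical example of what get_trends_data() could return after this commit.
trends_data = [
    {
        "id": "<document_id_from_seed('company' + 'best')>",
        "category": "company",
        "type": "best",
        "created_at": "2024-09-17T00:00:00Z",
        "topics": [
            # each topic document keeps its fields, minus 'memory_ids',
            # plus the derived 'memories_count'
            {"topic": "Tesla", "memories_count": 3},
        ],
    },
]
```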
