UTMIST · kevin-xu36 · Dec 1, 2024
diff --git a/app/Ingestion_Pipeline.py b/app/Ingestion_Pipeline.py
@@ -22,24 +22,27 @@
 llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")
 
 # Define the directory containing the text files
-directory = '/Users/dingshengliu/Desktop/ChatbotAI/AI Chatbot Version 2/app/data'
+desktop = os.path.join(os.path.expanduser("~"), "Desktop")  # "~\Desktop" has an invalid escape and only expands on Windows
+directory = os.path.join(desktop, "Chatbot Code", "app", "data")  # separate components so the separator matches the OS
 
 # Function to read text files and create Document objects
 def read_files(directory):
     documents = []
     for filename in os.listdir(directory):
         if filename.endswith(".txt"):
             file_path = os.path.join(directory, filename)
-            with open(file_path, 'r') as file:
+            with open(file_path, encoding = "utf-8") as file:  # explicit UTF-8 so decoding does not depend on the platform's default encoding
                 content = file.read()
                 documents.append(Document(text=content))
     return documents
 
+# Qdrant settings are read from QDRANT_URL / QDRANT_API_KEY so no secret has to be committed.
+
 from qdrant_client import QdrantClient
 
 qdrant_client = QdrantClient(
-    url="https://5f8102de-7129-4a0a-8bb2-166dd7c92682.us-east4-0.gcp.cloud.qdrant.io:6333", 
-    api_key="TDT673LkRUY2e-CfBa4H1m9U8pufQjk3h_BCoXbpfIp6KfcjS1XRog",
+    url=os.environ.get("QDRANT_URL", "http://localhost:6333"),  # falls back to a local instance on Qdrant's default REST port
+    api_key=os.environ.get("QDRANT_API_KEY"),  # None when the variable is unset (api_key is optional)
 )
 
 vector_store = QdrantVectorStore(client=qdrant_client, collection_name = "test") 

diff --git a/app/instagram_scraping.py b/app/instagram_scraping.py
@@ -1,15 +1,18 @@
 import instaloader
 import logging
+import os
+import re
 
 logging.basicConfig(level=logging.INFO)
 
 INSTALOADER_INSTANCE = instaloader.Instaloader()
-
-# NOTE: Logging in with instagram credentials may not be necessary depending on the machine you're on
-
-# May want to replace with environment variables
-INSTAGRAM_USERNAME = "Username"
-INSTAGRAM_PASSWORD = "Password"
+desktop = os.path.join(os.path.expanduser("~"), "Desktop")  # portable; "~\Desktop" only expands on Windows
+instafilepath = os.path.join(desktop, "instauserpass.txt")
+with open(instafilepath, "r") as file:
+    lines = [line.rstrip("\r\n") for line in file]  # line 1 = username, line 2 = password; also drops "\r" from CRLF files
+if len(lines) < 2:
+    raise ValueError(f"{instafilepath} must hold the username on line 1 and the password on line 2")
+INSTAGRAM_USERNAME, INSTAGRAM_PASSWORD = lines[:2]
 
 INSTALOADER_INSTANCE.login(INSTAGRAM_USERNAME, INSTAGRAM_PASSWORD)
 
@@ -21,9 +24,44 @@ def get_captions() -> list:
     profile = instaloader.Profile.from_username(INSTALOADER_INSTANCE.context, "uoft_utmist")
 
     # Iterating through all posts
+    count = 0
     for i, post in enumerate(profile.get_posts()):
+        if (count == 30):
+            break
         captions.append(post.caption)
+        count +=1 
     logging.info(f"Got {len(captions)} captions.")
 
     return captions
 
+def remove_emojis(data):
+    """Return ``data`` with emoji, pictographs and emoji joiners removed.
+
+    ``None`` or an empty caption is returned as an empty string. The former
+    catch-all range U+24C2-U+1F251 also erased CJK, Hangul and styled "math"
+    letters, so it is replaced by the emoji-bearing blocks it was meant to cover.
+    """
+    if not data:
+        return ''
+    emoj = re.compile("["
+        "\u200d\ufe0f\u24c2\u3030\u303d\u3297\u3299"  # ZWJ, variation selector-16, stand-alone emoji symbols
+        "\u25a0-\u25ff\u2600-\u27bf\u2934\u2935\u2b00-\u2bff"  # shapes, misc symbols, dingbats, arrows
+        "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, regional-indicator flags
+        "\U0001F300-\U0001F5FF"  # symbols & pictographs
+        "\U0001F600-\U0001F64F"  # emoticons
+        "\U0001F680-\U0001F6FF"  # transport & map symbols
+        "\U0001f926-\U0001f937"  # gesture / person pictographs carried over from the old list
+        "]+")
+    return emoj.sub('', data)
+
+unfiltered_instagram_data = get_captions()
+# post.caption can be None for posts without text; skip those so the cleanup and the write cannot fail.
+instagram_data = [remove_emojis(caption) for caption in unfiltered_instagram_data if caption is not None]
+
+# Separate path components keep this portable; "data\instagram_data.txt" only resolves on Windows.
+file_path = os.path.join(os.getcwd(), "data", "instagram_data.txt")
+
+with open(file_path, 'w', encoding="utf-8") as file:
+    # Unchanged output format: every caption is followed by a single space.
+    file.write("".join(caption + ' ' for caption in instagram_data))
+