Merge pull request #1031 from kga245/feature/unified-logs-handler
Feature: unified logs handler
ElishaKay authored Dec 21, 2024
2 parents d37418a + e1535bf commit 9c55fce
Showing 21 changed files with 785 additions and 153 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -40,4 +40,8 @@ docs/build
 package-lock.json
 
 #Vim swp files
-*.swp
+*.swp
+
+# Log files
+logs/
+*.orig
16 changes: 16 additions & 0 deletions backend/server/app.py
@@ -0,0 +1,16 @@
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+import logging
+
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
+
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, replace with your frontend domain
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
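The CORS setup above ships with `allow_origins=["*"]`, which the inline comment flags as development-only. A minimal sketch of a production-leaning variant, assuming a single known frontend origin (the env-var name and default URL below are illustrative, not part of this commit):

```python
import os

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()

# Hypothetical: read the trusted origin from the environment.
frontend_origin = os.getenv("FRONTEND_ORIGIN", "http://localhost:3000")

app.add_middleware(
    CORSMiddleware,
    allow_origins=[frontend_origin],  # one trusted origin instead of "*"
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
```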
83 changes: 83 additions & 0 deletions backend/server/logging_config.py
@@ -0,0 +1,83 @@
+import logging
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+
+class JSONResearchHandler:
+    def __init__(self, json_file):
+        self.json_file = json_file
+        self.research_data = {
+            "timestamp": datetime.now().isoformat(),
+            "events": [],
+            "content": {
+                "query": "",
+                "sources": [],
+                "context": [],
+                "report": "",
+                "costs": 0.0
+            }
+        }
+
+    def log_event(self, event_type: str, data: dict):
+        self.research_data["events"].append({
+            "timestamp": datetime.now().isoformat(),
+            "type": event_type,
+            "data": data
+        })
+        self._save_json()
+
+    def update_content(self, key: str, value):
+        self.research_data["content"][key] = value
+        self._save_json()
+
+    def _save_json(self):
+        with open(self.json_file, 'w') as f:
+            json.dump(self.research_data, f, indent=2)
+
+def setup_research_logging():
+    # Create logs directory if it doesn't exist
+    logs_dir = Path("logs")
+    logs_dir.mkdir(exist_ok=True)
+
+    # Generate timestamp for log files
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    # Create log file paths
+    log_file = logs_dir / f"research_{timestamp}.log"
+    json_file = logs_dir / f"research_{timestamp}.json"
+
+    # Configure file handler for research logs
+    file_handler = logging.FileHandler(log_file)
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+
+    # Get research logger and configure it
+    research_logger = logging.getLogger('research')
+    research_logger.setLevel(logging.INFO)
+
+    # Remove any existing handlers to avoid duplicates
+    research_logger.handlers.clear()
+
+    # Add file handler
+    research_logger.addHandler(file_handler)
+
+    # Add stream handler for console output
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+    research_logger.addHandler(console_handler)
+
+    # Prevent propagation to root logger to avoid duplicate logs
+    research_logger.propagate = False
+
+    # Create JSON handler
+    json_handler = JSONResearchHandler(json_file)
+
+    return str(log_file), str(json_file), research_logger, json_handler
+
+# Create a function to get the logger and JSON handler
+def get_research_logger():
+    return logging.getLogger('research')
+
+def get_json_handler():
+    return getattr(logging.getLogger('research'), 'json_handler', None)
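Taken together, `setup_research_logging` wires a `research` logger to a timestamped `.log` file plus the console, and returns a separate `JSONResearchHandler` for structured output. A short usage sketch, assuming the repo root is on `sys.path` so the import resolves; attaching the handler to the logger mirrors what `server.py` does at startup below:

```python
from backend.server.logging_config import (
    setup_research_logging,
    get_research_logger,
    get_json_handler,
)

log_file, json_file, research_logger, json_handler = setup_research_logging()
research_logger.json_handler = json_handler  # expose it for get_json_handler()

research_logger.info("research started")               # -> .log file and console
json_handler.log_event("status", {"step": "started"})  # -> events list in the .json file
json_handler.update_content("query", "example query")  # -> content section of the .json file

assert get_research_logger() is research_logger
assert get_json_handler() is json_handler
```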
26 changes: 26 additions & 0 deletions backend/server/server.py
@@ -15,6 +15,26 @@
     execute_multi_agents, handle_websocket_communication
 )
 
+from gpt_researcher.utils.logging_config import setup_research_logging
+
+import logging
+
+# Get logger instance
+logger = logging.getLogger(__name__)
+
+# Don't override parent logger settings
+logger.propagate = True
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    handlers=[
+        logging.FileHandler("server_log.txt"),  # Log to file
+        logging.StreamHandler()  # Also print to console
+    ]
+)
+
+
 # Models
 
 
@@ -73,6 +93,12 @@ def startup_event():
     os.makedirs("outputs", exist_ok=True)
     app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs")
     os.makedirs(DOC_PATH, exist_ok=True)
+
+    # Setup research logging
+    log_file, json_file, research_logger, json_handler = setup_research_logging()  # Unpack all 4 values
+    research_logger.json_handler = json_handler  # Store the JSON handler on the logger
+    research_logger.info(f"Research log file: {log_file}")
+    research_logger.info(f"Research JSON file: {json_file}")
 
 # Routes
 
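Because startup stores the JSON handler as an attribute of the shared `research` logger, downstream code can reach both sinks without threading handles through every call site. A sketch of that pattern (the `record_cost` helper is hypothetical, and it assumes the `gpt_researcher.utils.logging_config` module imported above, which is not among the rendered files, exposes the same `get_json_handler` shown in `backend/server/logging_config.py`):

```python
import logging

from gpt_researcher.utils.logging_config import get_json_handler  # assumed helper

research_logger = logging.getLogger("research")

def record_cost(cost: float) -> None:
    """Hypothetical helper: record a cost update in both log sinks."""
    research_logger.info(f"research costs: {cost}")
    json_handler = get_json_handler()
    if json_handler is not None:  # None until setup_research_logging() has run
        json_handler.update_content("costs", cost)
```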
127 changes: 123 additions & 4 deletions backend/server/server_utils.py
@@ -4,14 +4,115 @@
 import time
 import shutil
 from typing import Dict, List, Any
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, FileResponse
 from gpt_researcher.document.document import DocumentLoader
+# Add this import
 from backend.utils import write_md_to_pdf, write_md_to_word, write_text_to_md
+from pathlib import Path
+from datetime import datetime
+from fastapi import HTTPException
+import logging
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
+class CustomLogsHandler:
+    """Custom handler to capture streaming logs from the research process"""
+    def __init__(self, websocket, task: str):
+        self.logs = []
+        self.websocket = websocket
+        sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}")
+        self.log_file = os.path.join("outputs", f"{sanitized_filename}.json")
+        self.timestamp = datetime.now().isoformat()
+        # Initialize log file with metadata
+        os.makedirs("outputs", exist_ok=True)
+        with open(self.log_file, 'w') as f:
+            json.dump({
+                "timestamp": self.timestamp,
+                "events": [],
+                "content": {
+                    "query": "",
+                    "sources": [],
+                    "context": [],
+                    "report": "",
+                    "costs": 0.0
+                }
+            }, f, indent=2)
+
+    async def send_json(self, data: Dict[str, Any]) -> None:
+        """Store log data and send to websocket"""
+        # Send to websocket for real-time display
+        if self.websocket:
+            await self.websocket.send_json(data)
+
+        # Read current log file
+        with open(self.log_file, 'r') as f:
+            log_data = json.load(f)
+
+        # Update appropriate section based on data type
+        if data.get('type') == 'logs':
+            log_data['events'].append({
+                "timestamp": datetime.now().isoformat(),
+                "type": "event",
+                "data": data
+            })
+        else:
+            # Update content section for other types of data
+            log_data['content'].update(data)
+
+        # Save updated log file
+        with open(self.log_file, 'w') as f:
+            json.dump(log_data, f, indent=2)
+        logger.debug(f"Log entry written to: {self.log_file}")
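`CustomLogsHandler` intentionally exposes the same `send_json` coroutine as a FastAPI websocket, so the researcher can stream to it as if it were the socket while every payload is also persisted under `outputs/`. A minimal sketch of driving it directly, with no real websocket attached (passing `None` skips the send and only writes the file):

```python
import asyncio

async def demo() -> None:
    handler = CustomLogsHandler(websocket=None, task="demo query")
    await handler.send_json({"type": "logs", "output": "fetching sources..."})  # appended to events
    await handler.send_json({"report": "# Findings\n..."})  # merged into content
    print(f"log written to {handler.log_file}")

asyncio.run(demo())
```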


+class Researcher:
+    def __init__(self, query: str, report_type: str = "research_report"):
+        self.query = query
+        self.report_type = report_type
+        # Generate unique ID for this research task
+        self.research_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{hash(query)}"
+        # Initialize logs handler with research ID
+        self.logs_handler = CustomLogsHandler(self.research_id)
+        self.researcher = GPTResearcher(
+            query=query,
+            report_type=report_type,
+            websocket=self.logs_handler
+        )
+
+    async def research(self) -> dict:
+        """Conduct research and return paths to generated files"""
+        await self.researcher.conduct_research()
+        report = await self.researcher.write_report()
+
+        # Generate the files
+        sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{self.query}")
+        file_paths = await generate_report_files(report, sanitized_filename)
+
+        # Get the JSON log path that was created by CustomLogsHandler
+        json_relative_path = os.path.relpath(self.logs_handler.log_file)
+
+        return {
+            "output": {
+                **file_paths,  # Include PDF, DOCX, and MD paths
+                "json": json_relative_path
+            }
+        }
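Note that as committed, `Researcher.__init__` builds its handler with `CustomLogsHandler(self.research_id)`, a single positional argument, while the handler's signature is `(websocket, task)`; calling it this way would raise a `TypeError`. A sketch of the apparent intent, under the assumption that no websocket is wanted and the query serves as the task label:

```python
import asyncio

# Hypothetical fix inside Researcher.__init__ (not what the commit does):
#     self.logs_handler = CustomLogsHandler(websocket=None, task=query)

async def run_example() -> None:
    researcher = Researcher("impact of unified research logging")
    result = await researcher.research()
    print(result["output"]["json"])  # relative path to the JSON log file

asyncio.run(run_example())
```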

 def sanitize_filename(filename: str) -> str:
-    return re.sub(r"[^\w\s-]", "", filename).strip()
+    # Split into components
+    prefix, timestamp, *task_parts = filename.split('_')
+    task = '_'.join(task_parts)
+
+    # Calculate max length for task portion
+    # 255 - len("outputs/") - len("task_") - len(timestamp) - len("_.json") - safety_margin
+    max_task_length = 255 - 8 - 5 - 10 - 6 - 10  # ~216 chars for task
+
+    # Truncate task if needed
+    truncated_task = task[:max_task_length] if len(task) > max_task_length else task
+
+    # Reassemble and clean the filename
+    sanitized = f"{prefix}_{timestamp}_{truncated_task}"
+    return re.sub(r"[^\w\s-]", "", sanitized).strip()
 
 
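The rewritten `sanitize_filename` assumes the `task_<timestamp>_<task>` shape its callers produce and budgets the task portion so the final path under `outputs/` stays within a typical 255-character filename limit. A worked example (values illustrative):

```python
name = sanitize_filename(f"task_{1703123456}_how do unified logs work?")
print(name)  # -> 'task_1703123456_how do unified logs work'
# The trailing '?' is stripped by the final re.sub; a task longer than
# 216 characters would be truncated before reassembly.
```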
@@ -23,13 +124,31 @@ async def handle_start_command(websocket, data: str, manager):
         print("Error: Missing task or report_type")
         return
 
+    # Create logs handler with websocket and task
+    logs_handler = CustomLogsHandler(websocket, task)
+    # Initialize log content with query
+    await logs_handler.send_json({
+        "query": task,
+        "sources": [],
+        "context": [],
+        "report": ""
+    })
+
     sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{task}")
 
     report = await manager.start_streaming(
-        task, report_type, report_source, source_urls, tone, websocket, headers
+        task,
+        report_type,
+        report_source,
+        source_urls,
+        tone,
+        logs_handler,
+        headers
     )
     report = str(report)
     file_paths = await generate_report_files(report, sanitized_filename)
+    # Add JSON log path to file_paths
+    file_paths["json"] = os.path.relpath(logs_handler.log_file)
     await send_file_paths(websocket, file_paths)
 
 
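After streaming finishes, the client receives the generated artifact paths over the websocket, now including the JSON log. A sketch of the payload shape `send_file_paths` delivers, with key names inferred from the `write_md_to_pdf` / `write_md_to_word` / `write_text_to_md` imports plus the `json` entry added here (an assumption, since `generate_report_files` is not shown in this diff):

```python
file_paths = {
    "pdf": "outputs/task_1703123456_demo query.pdf",
    "docx": "outputs/task_1703123456_demo query.docx",
    "md": "outputs/task_1703123456_demo query.md",
    "json": "outputs/task_1703123456_demo query.json",  # added by this commit
}
```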
12 changes: 5 additions & 7 deletions frontend/index.html
@@ -143,13 +143,11 @@ <h2>Research Report</h2>
       <div id="reportContainer"></div>
       <div id="reportActions">
         <div class="alert alert-info" role="alert" id="status"></div>
-        <a id="copyToClipboard" onclick="GPTResearcher.copyToClipboard()" class="btn btn-secondary mt-3"
-          style="margin-right: 10px;">Copy to clipboard (markdown)</a>
-        <a id="downloadLinkMd" href="#" class="btn btn-secondary mt-3" style="margin-right: 10px;"
-          target="_blank">Download as Markdown</a>
-        <a id="downloadLink" href="#" class="btn btn-secondary mt-3" style="margin-right: 10px;"
-          target="_blank">Download as PDF</a>
-        <a id="downloadLinkWord" href="#" class="btn btn-secondary mt-3" target="_blank">Download as Docx</a>
+        <a id="copyToClipboard" onclick="GPTResearcher.copyToClipboard()" class="btn btn-secondary mt-3" style="margin-right: 10px;">Copy to clipboard (markdown)</a>
+        <a id="downloadLinkMd" href="#" class="btn btn-secondary mt-3" style="margin-right: 10px;" target="_blank" rel="noopener noreferrer">Download as Markdown</a>
+        <a id="downloadLink" href="#" class="btn btn-secondary mt-3" style="margin-right: 10px;" target="_blank" rel="noopener noreferrer">Download as PDF</a>
+        <a id="downloadLinkWord" href="#" class="btn btn-secondary mt-3" style="margin-right: 10px;" target="_blank" rel="noopener noreferrer">Download as Docx</a>
+        <a id="downloadLinkJson" href="#" class="btn btn-secondary mt-3" style="margin-right: 10px;" target="_blank" rel="noopener noreferrer">Download Log</a>
       </div>
     </div>
   </main>
1 change: 1 addition & 0 deletions frontend/nextjs/app/page.tsx
@@ -257,6 +257,7 @@ export default function Home() {
             orderedData={orderedData}
             answer={answer}
             allLogs={allLogs}
+            chatBoxSettings={chatBoxSettings}
             handleClickSuggestion={handleClickSuggestion}
           />
         </div>
[Diffs for the remaining 14 changed files are not rendered here.]
