Full Text Q&A + Patent Search #15

Open · wants to merge 30 commits into main
Commits (30)
16cd7e6
feat: Restructure tools by condensing BDA
Sep 8, 2023
4486c65
feat: Paper retrieval to all non-agent modes
Sep 15, 2023
3ec4c00
feat: Bidara can now generate images with DallE
Sep 15, 2023
fe85da5
docs: Some comments to fxn-calling
Sep 18, 2023
ee381e0
feat: Ability to add custom OpenAI API key
Sep 21, 2023
9587abd
feat: Delete msg with user's API key
Sep 21, 2023
8a70bc8
chore: Delete __pycache__ directory
WorldsEndDunce Sep 21, 2023
c706f34
chore: Delete .idea directory
WorldsEndDunce Sep 21, 2023
c110515
feat: Add API key link
WorldsEndDunce Sep 22, 2023
da784a9
feat: Llama set and query research space
WorldsEndDunce Oct 3, 2023
1c09d6a
feat: Q&A on full text
WorldsEndDunce Oct 10, 2023
d872f71
feat: Delete previous research space files upon startup
WorldsEndDunce Oct 12, 2023
06de69f
feat: List research space papers
WorldsEndDunce Oct 14, 2023
8677483
feat: Basic convo sliding window
WorldsEndDunce Oct 17, 2023
f79d09a
fix: API keys unique to user
WorldsEndDunce Oct 17, 2023
10f7654
fix: Remove curkey command
WorldsEndDunce Oct 17, 2023
7f47b1c
feat: Update setRS and qRS function instructions
WorldsEndDunce Oct 25, 2023
10ba73a
feat: Patent search functionality
WorldsEndDunce Oct 25, 2023
03649fd
feat: Bidara lists functions on greeting
WorldsEndDunce Oct 26, 2023
ec7ae5c
refactor: Move functions to functions.py
WorldsEndDunce Oct 26, 2023
f8f055e
refactor: Move function_descriptions to functions.py
WorldsEndDunce Oct 26, 2023
5e91406
feat: Manage and delete research space
WorldsEndDunce Oct 28, 2023
d1db7e3
feat: Clear RS pdfs on clearconv
WorldsEndDunce Oct 30, 2023
31ff1ed
feat: Set API key pop-up
WorldsEndDunce Nov 8, 2023
89d5a23
feat: Only one user per API button
WorldsEndDunce Nov 8, 2023
68832b9
docs: Update README to main
WorldsEndDunce Nov 8, 2023
c940a4a
Merge branch 'main' into functions
WorldsEndDunce Nov 8, 2023
8bc37b6
refactor: Move Button and modal classes
WorldsEndDunce Nov 8, 2023
a891a23
Merge branch 'functions' of https://github.com/nasa-petal/discord_bot…
WorldsEndDunce Nov 8, 2023
b8f199c
fix: Remove agent init
WorldsEndDunce Nov 8, 2023
8 changes: 8 additions & 0 deletions .gitignore
@@ -1 +1,9 @@
.env
sandbox.py
ScholarQA.py


.idea/
__pycache__/
citation*
pdfs/
278 changes: 278 additions & 0 deletions SSReader.py
@@ -0,0 +1,278 @@
import ast
import logging
import os
import re
from typing import List

import openai
import pandas as pd
import requests
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

def get_unique_docs(docs):
    """Deduplicate documents by their Semantic Scholar paperId."""
    unique_docs_id = []
    unique_docs = []
    for doc in docs:
        if doc.extra_info['paperId'] not in unique_docs_id:
            unique_docs_id.append(doc.extra_info['paperId'])
            unique_docs.append(doc)
    return unique_docs


def get_questions(response_text):
    """Parse the model response into a list of search queries, one per line."""
    data = response_text.split("\n")
    data = [ast.literal_eval(item)[0] for item in data]
    return data
def generate_search_queries_prompt(question):
    """Generates the search queries prompt for the given question.

    Args: question (str): The question to generate the search queries prompt for
    Returns: str: The search queries prompt for the given question
    """

    return (
        f'Please generate four related search queries that align with the initial query: "{question}". '
        f'Each variation should be presented as a list of strings, following this format: ["query 1", "query 2", "query 3", "query 4"]'
    )

def get_related_questions(query):
research_template = """You are a search engine expert"""

messages = [{
"role": "system",
"content": research_template
}, {
"role": "user",
"content": generate_search_queries_prompt(query),
}]

response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=messages,
temperature=0.5,
max_tokens=256
)
related_questions = get_questions(response.choices[0].message.content)
related_questions.append(query)
return related_questions
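
# Note (illustrative, not part of the original code): get_questions assumes the model
# returns one bracketed list per line, for example
#   ["gecko-inspired adhesives"]
#   ["bio-inspired attachment mechanisms"]
# Each line is parsed with ast.literal_eval and its first element is kept. If the model
# instead returns a single four-element list on one line, only the first query survives.
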
class SemanticScholarReader(BaseReader):
"""
A class to read and process data from Semantic Scholar API
...

Methods
-------
__init__():
Instantiate the SemanticScholar object

    load_data(query: str, limit: int = 10, full_text: bool = False, returned_fields: list = ["title", "abstract", "venue", "year", "paperId", "citationCount", "openAccessPdf", "authors", "externalIds"]) -> list:
        Loads data from Semantic Scholar based on the query and returned_fields, optionally downloading open-access PDFs for full text

"""

def __init__(self, timeout=10, api_key=None, base_dir="pdfs"):
"""
Instantiate the SemanticScholar object
"""
from semanticscholar import SemanticScholar
import arxiv

self.arxiv = arxiv
self.base_dir = base_dir
self.s2 = SemanticScholar(timeout=timeout)
# check for base dir
if not os.path.exists(self.base_dir):
os.makedirs(self.base_dir)

    def _clear_cache(self):
        """
        Delete the .citation* folders created by the semanticscholar client.
        """
        import glob
        import shutil

        # shutil.rmtree does not expand glob patterns, so resolve them first
        for path in glob.glob("./.citation*"):
            shutil.rmtree(path, ignore_errors=True)

def _download_pdf(self, paper_id, url: str, base_dir="pdfs"):
logger = logging.getLogger()
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
# Making a GET request
response = requests.get(url, headers=headers, stream=True)
content_type = response.headers["Content-Type"]

# As long as the content-type is application/pdf, this will download the file
if "application/pdf" in content_type:
os.makedirs(base_dir, exist_ok=True)
file_path = os.path.join(base_dir, f"{paper_id}.pdf")
# check if the file already exists
if os.path.exists(file_path):
logger.info(f"{file_path} already exists")
return file_path
with open(file_path, "wb") as file:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
file.write(chunk)
logger.info(f"Downloaded pdf from {url}")
return file_path
else:
logger.warning(f"{url} was not downloaded: protected")
return None

    def _get_full_text_docs(self, documents: List[Document]) -> List[Document]:

"""
Gets the full text of the documents from Semantic Scholar

Parameters
----------
documents: list
The list of Document object that contains the search results

Returns
-------
list
The list of Document object that contains the search results with full text

Raises
------
Exception
If there is an error while getting the full text

"""
full_text_docs = []
for paper in documents:
metadata = paper.extra_info
url = metadata["openAccessPdf"]
externalIds = metadata["externalIds"]
paper_id = metadata["paperId"]
            persist_path = os.path.join(self.base_dir, f"{paper_id}.pdf")
            # Reuse a previously downloaded PDF if it is already on disk
            file_path = persist_path if os.path.exists(persist_path) else None
            if url and file_path is None:
                # Download the open-access PDF into the base directory
                file_path = self._download_pdf(paper_id, url, self.base_dir)

            if (
                not url
                and externalIds
                and "ArXiv" in externalIds
                and file_path is None
            ):
                # Fall back to downloading the PDF from arXiv
                file_path = self._download_pdf_from_arxiv(
                    paper_id, externalIds["ArXiv"]
                )

# Then, check if it's a valid PDF. If it's not, skip to the next document.
if file_path:
try:
pdf = PdfReader(open(file_path, "rb"))
except Exception as e:
logging.error(
f"Failed to read pdf with exception: {e}. Skipping document..."
)
continue

text = ""
for page in pdf.pages:
text += page.extract_text()
full_text_docs.append(Document(text=text, extra_info=metadata))

return full_text_docs

def _download_pdf_from_arxiv(self, paper_id, arxiv_id):
paper = next(self.arxiv.Search(id_list=[arxiv_id], max_results=1).results())
paper.download_pdf(dirpath=self.base_dir, filename=paper_id + ".pdf")
return os.path.join(self.base_dir, f"{paper_id}.pdf")

    def load_data(
        self,
        query,
        limit=10,
        full_text=False,
        returned_fields=[
            "title",
            "abstract",
            "venue",
            "year",
            "paperId",
            "citationCount",
            "openAccessPdf",
            "authors",
            "externalIds",
        ],
    ) -> List[Document]:
"""
Loads data from Semantic Scholar based on the entered query and returned_fields

Parameters
----------
query: str
The search query for the paper
        limit: int, optional
            The maximum number of results returned per search query (default is 10)
        full_text: bool, optional
            Whether to also download open-access PDFs and return documents with their full text (default is False)
        returned_fields: list, optional
            The list of fields to be returned from the search

Returns
-------
list
The list of Document object that contains the search results

Raises
------
Exception
If there is an error while performing the search

"""
        results = []
        # Expand the query into related questions and search for each of them
        questions = get_related_questions(query)
        try:
            for question in questions:
                logging.info(f"Searching for {question}")
                _results = self.s2.search_paper(question, limit=limit, fields=returned_fields)
                results.extend(_results[:limit])
except (requests.HTTPError, requests.ConnectionError, requests.Timeout) as e:
logging.error(
"Failed to fetch data from Semantic Scholar with exception: %s", e
)
raise
except Exception as e:
logging.error("An unexpected error occurred: %s", e)
raise

documents = []

        for item in results[:limit * len(questions)]:
openAccessPdf = getattr(item, "openAccessPdf", None)
abstract = getattr(item, "abstract", None)
title = getattr(item, "title", None)
text = None
# concat title and abstract
if abstract and title:
text = title + " " + abstract
elif not abstract:
text = title

metadata = {
"title": title,
"venue": getattr(item, "venue", None),
"year": getattr(item, "year", None),
"paperId": getattr(item, "paperId", None),
"citationCount": getattr(item, "citationCount", None),
"openAccessPdf": openAccessPdf.get("url") if openAccessPdf else None,
"authors": [author["name"] for author in getattr(item, "authors", [])],
"externalIds": getattr(item, "externalIds", None),
}
documents.append(Document(text=text, extra_info=metadata))

        # Drop duplicate papers returned by the related-question searches
        documents = get_unique_docs(documents)

        if full_text:
            logging.info("Getting full text documents...")
            # Return only the full-text versions of the documents (edited from returning all documents)
            return self._get_full_text_docs(documents)
        return documents
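
For context, a minimal usage sketch of the reader this PR adds (hedged: the query string and index usage are illustrative, and the import path assumes the same pre-0.10 llama_index namespace used in SSReader.py):

# Illustrative usage of SemanticScholarReader, not part of the diff.
from llama_index import VectorStoreIndex
from SSReader import SemanticScholarReader

reader = SemanticScholarReader(timeout=10, base_dir="pdfs")
# Expands the query into related questions, searches Semantic Scholar, and
# (with full_text=True) downloads open-access PDFs to build full-text documents.
docs = reader.load_data("biomimetic adhesion", limit=5, full_text=True)

# Hand the documents to a vector index for Q&A over the research space.
index = VectorStoreIndex.from_documents(docs)
print(index.as_query_engine().query("What adhesion mechanisms are discussed?"))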