Full Text Q&A + Patent Search #15

Open · wants to merge 30 commits into main
Commits (30)
16cd7e6
feat: Restructure tools by condensing BDA
Sep 8, 2023
4486c65
feat: Paper retrieval to all non-agent modes
Sep 15, 2023
3ec4c00
feat: Bidara can now generate images with DallE
Sep 15, 2023
fe85da5
docs: Some comments to fxn-calling
Sep 18, 2023
ee381e0
feat: Ability to add custom OpenAI API key
Sep 21, 2023
9587abd
feat: Delete msg with user's API key
Sep 21, 2023
8a70bc8
chore: Delete __pycache__ directory
WorldsEndDunce Sep 21, 2023
c706f34
chore: Delete .idea directory
WorldsEndDunce Sep 21, 2023
c110515
feat: Add API key link
WorldsEndDunce Sep 22, 2023
da784a9
feat: Llama set and query research space
WorldsEndDunce Oct 3, 2023
1c09d6a
feat: Q&A on full text
WorldsEndDunce Oct 10, 2023
d872f71
feat: Delete previous research space files upon startup
WorldsEndDunce Oct 12, 2023
06de69f
feat: List research space papers
WorldsEndDunce Oct 14, 2023
8677483
feat: Basic convo sliding window
WorldsEndDunce Oct 17, 2023
f79d09a
fix: API keys unique to user
WorldsEndDunce Oct 17, 2023
10f7654
fix: Remove curkey command
WorldsEndDunce Oct 17, 2023
7f47b1c
feat: Update setRS and qRS function instructions
WorldsEndDunce Oct 25, 2023
10ba73a
feat: Patent search functionality
WorldsEndDunce Oct 25, 2023
03649fd
feat: Bidara lists functions on greeting
WorldsEndDunce Oct 26, 2023
ec7ae5c
refactor: Move functions to functions.py
WorldsEndDunce Oct 26, 2023
f8f055e
refactor: Move function_descriptions to functions.py
WorldsEndDunce Oct 26, 2023
5e91406
feat: Manage and delete research space
WorldsEndDunce Oct 28, 2023
d1db7e3
feat: Clear RS pdfs on clearconv
WorldsEndDunce Oct 30, 2023
31ff1ed
feat: Set API key pop-up
WorldsEndDunce Nov 8, 2023
89d5a23
feat: Only one user per API button
WorldsEndDunce Nov 8, 2023
68832b9
docs: Update README to main
WorldsEndDunce Nov 8, 2023
c940a4a
Merge branch 'main' into functions
WorldsEndDunce Nov 8, 2023
8bc37b6
refactor: Move Button and modal classes
WorldsEndDunce Nov 8, 2023
a891a23
Merge branch 'functions' of https://github.com/nasa-petal/discord_bot…
WorldsEndDunce Nov 8, 2023
b8f199c
fix: Remove agent init
WorldsEndDunce Nov 8, 2023
8 changes: 8 additions & 0 deletions .gitignore
@@ -1 +1,9 @@
.env
sandbox.py
ScholarQA.py


.idea/
__pycache__/
citation*
pdfs/
278 changes: 278 additions & 0 deletions SSReader.py
@@ -0,0 +1,278 @@
import ast
import logging
import os
import re
from typing import List

import openai
import pandas as pd
import requests
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

def get_unique_docs(docs):
    """Deduplicate documents by their Semantic Scholar paperId."""
    unique_docs_id = []
    unique_docs = []
    for doc in docs:
        if doc.extra_info['paperId'] not in unique_docs_id:
            unique_docs_id.append(doc.extra_info['paperId'])
            unique_docs.append(doc)
    return unique_docs


def get_questions(response_text):
    """Parse the model response into a list of search queries, one per line."""
    data = response_text.split("\n")
    data = [ast.literal_eval(item)[0] for item in data]
    return data
def generate_search_queries_prompt(question):
    """Generates the search queries prompt for the given question.

    Args: question (str): The question to generate the search queries prompt for
    Returns: str: The search queries prompt for the given question
    """

    return (
        f'Please generate four related search queries that align with the initial query: "{question}". '
        f'Each variation should be presented as a list of strings, following this format: ["query 1", "query 2", "query 3", "query 4"]'
    )

def get_related_questions(query):
research_template = """You are a search engine expert"""

messages = [{
"role": "system",
"content": research_template
}, {
"role": "user",
"content": generate_search_queries_prompt(query),
}]

response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=messages,
temperature=0.5,
max_tokens=256
)
related_questions = get_questions(response.choices[0].message.content)
related_questions.append(query)
return related_questions
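
# Note (illustrative, not part of the original code): get_questions assumes the model
# returns one bracketed list per line, for example
#   ["gecko-inspired adhesives"]
#   ["bio-inspired attachment mechanisms"]
# Each line is parsed with ast.literal_eval and its first element is kept. If the model
# instead returns a single four-element list on one line, only the first query survives.
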
class SemanticScholarReader(BaseReader):
"""
A class to read and process data from Semantic Scholar API
...

Methods
-------
__init__():
Instantiate the SemanticScholar object

    load_data(query: str, limit: int = 10, full_text: bool = False, returned_fields: list = ["title", "abstract", "venue", "year", "paperId", "citationCount", "openAccessPdf", "authors", "externalIds"]) -> list:
        Loads data from Semantic Scholar based on the query and returned_fields, optionally downloading open-access PDFs for full text

"""

def __init__(self, timeout=10, api_key=None, base_dir="pdfs"):
"""
Instantiate the SemanticScholar object
"""
from semanticscholar import SemanticScholar
import arxiv

self.arxiv = arxiv
self.base_dir = base_dir
self.s2 = SemanticScholar(timeout=timeout)
# check for base dir
if not os.path.exists(self.base_dir):
os.makedirs(self.base_dir)

    def _clear_cache(self):
        """
        Delete the .citation* folders created by the semanticscholar client.
        """
        import glob
        import shutil

        # shutil.rmtree does not expand glob patterns, so resolve them first
        for path in glob.glob("./.citation*"):
            shutil.rmtree(path, ignore_errors=True)

def _download_pdf(self, paper_id, url: str, base_dir="pdfs"):
logger = logging.getLogger()
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
# Making a GET request
response = requests.get(url, headers=headers, stream=True)
content_type = response.headers["Content-Type"]

# As long as the content-type is application/pdf, this will download the file
if "application/pdf" in content_type:
os.makedirs(base_dir, exist_ok=True)
file_path = os.path.join(base_dir, f"{paper_id}.pdf")
# check if the file already exists
if os.path.exists(file_path):
logger.info(f"{file_path} already exists")
return file_path
with open(file_path, "wb") as file:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
file.write(chunk)
logger.info(f"Downloaded pdf from {url}")
return file_path
else:
logger.warning(f"{url} was not downloaded: protected")
return None

    def _get_full_text_docs(self, documents: List[Document]) -> List[Document]:

"""
Gets the full text of the documents from Semantic Scholar

Parameters
----------
documents: list
The list of Document object that contains the search results

Returns
-------
list
The list of Document object that contains the search results with full text

Raises
------
Exception
If there is an error while getting the full text

"""
full_text_docs = []
for paper in documents:
metadata = paper.extra_info
url = metadata["openAccessPdf"]
externalIds = metadata["externalIds"]
paper_id = metadata["paperId"]
            persist_path = os.path.join(self.base_dir, f"{paper_id}.pdf")
            # Reuse a previously downloaded PDF if it is already on disk
            file_path = persist_path if os.path.exists(persist_path) else None
            if url and file_path is None:
                # Download the open-access PDF into the base directory
                file_path = self._download_pdf(paper_id, url, self.base_dir)

            if (
                not url
                and externalIds
                and "ArXiv" in externalIds
                and file_path is None
            ):
                # Fall back to downloading the PDF from arXiv
                file_path = self._download_pdf_from_arxiv(
                    paper_id, externalIds["ArXiv"]
                )

# Then, check if it's a valid PDF. If it's not, skip to the next document.
if file_path:
try:
pdf = PdfReader(open(file_path, "rb"))
except Exception as e:
logging.error(
f"Failed to read pdf with exception: {e}. Skipping document..."
)
continue

text = ""
for page in pdf.pages:
text += page.extract_text()
full_text_docs.append(Document(text=text, extra_info=metadata))

return full_text_docs

def _download_pdf_from_arxiv(self, paper_id, arxiv_id):
paper = next(self.arxiv.Search(id_list=[arxiv_id], max_results=1).results())
paper.download_pdf(dirpath=self.base_dir, filename=paper_id + ".pdf")
return os.path.join(self.base_dir, f"{paper_id}.pdf")

    def load_data(
        self,
        query,
        limit=10,
        full_text=False,
        returned_fields=[
            "title",
            "abstract",
            "venue",
            "year",
            "paperId",
            "citationCount",
            "openAccessPdf",
            "authors",
            "externalIds",
        ],
    ) -> List[Document]:
"""
Loads data from Semantic Scholar based on the entered query and returned_fields

Parameters
----------
query: str
The search query for the paper
        limit: int, optional
            The maximum number of results returned per search query (default is 10)
        full_text: bool, optional
            Whether to also download open-access PDFs and return documents with their full text (default is False)
        returned_fields: list, optional
            The list of fields to be returned from the search

Returns
-------
list
The list of Document object that contains the search results

Raises
------
Exception
If there is an error while performing the search

"""
        results = []
        # Expand the query into related questions and search for each of them
        questions = get_related_questions(query)
        try:
            for question in questions:
                logging.info(f"Searching for {question}")
                _results = self.s2.search_paper(question, limit=limit, fields=returned_fields)
                results.extend(_results[:limit])
except (requests.HTTPError, requests.ConnectionError, requests.Timeout) as e:
logging.error(
"Failed to fetch data from Semantic Scholar with exception: %s", e
)
raise
except Exception as e:
logging.error("An unexpected error occurred: %s", e)
raise

documents = []

        for item in results[:limit * len(questions)]:
openAccessPdf = getattr(item, "openAccessPdf", None)
abstract = getattr(item, "abstract", None)
title = getattr(item, "title", None)
text = None
# concat title and abstract
if abstract and title:
text = title + " " + abstract
elif not abstract:
text = title

metadata = {
"title": title,
"venue": getattr(item, "venue", None),
"year": getattr(item, "year", None),
"paperId": getattr(item, "paperId", None),
"citationCount": getattr(item, "citationCount", None),
"openAccessPdf": openAccessPdf.get("url") if openAccessPdf else None,
"authors": [author["name"] for author in getattr(item, "authors", [])],
"externalIds": getattr(item, "externalIds", None),
}
documents.append(Document(text=text, extra_info=metadata))

        # Drop duplicate papers returned by the related-question searches
        documents = get_unique_docs(documents)

        if full_text:
            logging.info("Getting full text documents...")
            # Return only the full-text versions of the documents (edited from returning all documents)
            return self._get_full_text_docs(documents)
        return documents
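
For context, a minimal usage sketch of the reader this PR adds (hedged: the query string and index usage are illustrative, and the import path assumes the same pre-0.10 llama_index namespace used in SSReader.py):

# Illustrative usage of SemanticScholarReader, not part of the diff.
from llama_index import VectorStoreIndex
from SSReader import SemanticScholarReader

reader = SemanticScholarReader(timeout=10, base_dir="pdfs")
# Expands the query into related questions, searches Semantic Scholar, and
# (with full_text=True) downloads open-access PDFs to build full-text documents.
docs = reader.load_data("biomimetic adhesion", limit=5, full_text=True)

# Hand the documents to a vector index for Q&A over the research space.
index = VectorStoreIndex.from_documents(docs)
print(index.as_query_engine().query("What adhesion mechanisms are discussed?"))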