Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Firecrawl API Components for Langflow #2127

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@ youtube-transcript-api = "^0.6.2"
markdown = "^3.6"
langchain-chroma = "^0.1.1"
upstash-vector = "^0.4.0"
firecrawl-py = "^0.0.14"
unstructured = {extras = ["docx", "md", "pptx"], version = "^0.14.4"}


[tool.poetry.group.dev.dependencies]
types-redis = "^4.6.0.5"
ipykernel = "^6.29.0"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from typing import Optional
from firecrawl.firecrawl import FirecrawlApp
from langflow.custom import CustomComponent
from langflow.schema.schema import Record
from langchain_community.agent_toolkits.json.toolkit import JsonToolkit
import json
import uuid


class FirecrawlCrawlApi(CustomComponent):
    """Langflow custom component that crawls a site through the Firecrawl API.

    ``build`` calls ``FirecrawlApp.crawl_url`` and wraps every item of the
    crawl result in a ``Record``.
    """

    display_name: str = "FirecrawlCrawlApi"
    description: str = "Firecrawl Crawl API."
    output_types: list[str] = ["Document"]
    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl"
    # UI metadata for each ``build`` parameter, keyed by parameter name.
    field_config = {
        "api_key": {
            "display_name": "API Key",
            "field_type": "str",
            "required": True,
            "password": True,
            "info": "The API key to use Firecrawl API.",
        },
        "url": {
            "display_name": "URL",
            "field_type": "str",
            "required": True,
            "info": "The base URL to start crawling from.",
        },
        "crawlerOptions": {
            "display_name": "Crawler Options",
            "info": "Options for the crawler behavior.",
        },
        "pageOptions": {
            "display_name": "Page Options",
            "info": "The page options to send with the request.",
        },
        "idempotency_key": {
            "display_name": "Idempotency Key",
            "field_type": "str",
            "info": "Optional idempotency key to ensure unique requests.",
        },
    }

    def build(
        self,
        api_key: str,
        url: str,
        crawlerOptions: Optional[JsonToolkit] = None,
        pageOptions: Optional[JsonToolkit] = None,
        idempotency_key: Optional[str] = None,
    ) -> list[Record]:
        """Crawl ``url`` with Firecrawl and return one ``Record`` per result item.

        Args:
            api_key: Key passed to ``FirecrawlApp``.
            url: URL handed to ``crawl_url`` as the crawl starting point.
            crawlerOptions: Optional toolkit whose ``spec.dict_`` is sent as
                the ``"crawlerOptions"`` request parameter; ``{}`` when omitted.
            pageOptions: Optional toolkit whose ``spec.dict_`` is sent as the
                ``"pageOptions"`` request parameter; ``{}`` when omitted.
            idempotency_key: Key forwarded to ``crawl_url``. A random UUID4
                string is generated when it is missing or empty.

        Returns:
            A list with one ``Record`` per item of the crawl result; an empty
            list when the crawl result is empty or ``None``.
        """
        crawler_options_dict = crawlerOptions.spec.dict_ if crawlerOptions else {}
        page_options_dict = pageOptions.spec.dict_ if pageOptions else {}

        if not idempotency_key:
            idempotency_key = str(uuid.uuid4())

        app = FirecrawlApp(api_key=api_key)
        crawl_result = app.crawl_url(
            url,
            {
                # The JSON round trip sends a plain, JSON-compatible copy of
                # the options and fails early if they are not serializable.
                "crawlerOptions": json.loads(json.dumps(crawler_options_dict)),
                "pageOptions": json.loads(json.dumps(page_options_dict)),
            },
            # NOTE(review): presumably "wait until done" and a polling
            # interval/timeout in seconds -- confirm against the installed
            # firecrawl-py ``crawl_url`` signature. Kept positional so the
            # call does not depend on the parameter names.
            True,
            2,
            idempotency_key,
        )

        # Guard against a ``None`` result so the comprehension cannot raise.
        return [Record(data=item) for item in crawl_result or []]
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Optional

from firecrawl.firecrawl import FirecrawlApp
from langflow.custom import CustomComponent
from langflow.schema.schema import Record
from langchain_community.agent_toolkits.json.toolkit import JsonToolkit
import json


class FirecrawlScrapeApi(CustomComponent):
    """Langflow custom component that scrapes a single URL through Firecrawl.

    ``build`` calls ``FirecrawlApp.scrape_url`` and wraps the result in a
    ``Record``.
    """

    display_name: str = "FirecrawlScrapeApi"
    description: str = "Firecrawl Scrape API."
    output_types: list[str] = ["Document"]
    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/scrape"
    # UI metadata for each ``build`` parameter, keyed by parameter name.
    field_config = {
        "api_key": {
            "display_name": "API Key",
            "field_type": "str",
            "required": True,
            "password": True,
            "info": "The API key to use Firecrawl API.",
        },
        "url": {
            "display_name": "URL",
            "field_type": "str",
            "required": True,
            "info": "The URL to scrape.",
        },
        "timeout": {
            "display_name": "Timeout",
            "info": "Timeout in milliseconds for the request.",
            "field_type": "int",
            "default_value": 10000,
        },
        "pageOptions": {
            "display_name": "Page Options",
            "info": "The page options to send with the request.",
        },
        "extractorOptions": {
            "display_name": "Extractor Options",
            "info": "The extractor options to send with the request.",
        },
    }

    def build(
        self,
        api_key: str,
        url: str,
        timeout: Optional[int] = 10000,
        pageOptions: Optional[JsonToolkit] = None,
        extractorOptions: Optional[JsonToolkit] = None,
    ) -> Record:
        """Scrape ``url`` with Firecrawl and return the result as a ``Record``.

        Args:
            api_key: Key passed to ``FirecrawlApp``.
            url: URL handed to ``scrape_url``.
            timeout: Value sent as the ``"timeout"`` request parameter.
            pageOptions: Optional toolkit whose ``spec.dict_`` is sent as the
                ``"pageOptions"`` request parameter; ``{}`` when omitted.
            extractorOptions: Optional toolkit whose ``spec.dict_`` is sent as
                the ``"extractorOptions"`` request parameter; ``{}`` when
                omitted.

        Returns:
            A ``Record`` whose ``data`` is the value returned by ``scrape_url``.
        """
        extractor_spec = extractorOptions.spec.dict_ if extractorOptions else {}
        page_spec = pageOptions.spec.dict_ if pageOptions else {}

        client = FirecrawlApp(api_key=api_key)
        # The JSON round trip yields a plain, JSON-compatible copy of the options.
        request_params = {
            "timeout": timeout,
            "extractorOptions": json.loads(json.dumps(extractor_spec)),
            "pageOptions": json.loads(json.dumps(page_spec)),
        }
        scraped = client.scrape_url(url, request_params)

        return Record(data=scraped)
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from typing import Optional, List
from firecrawl.firecrawl import FirecrawlApp
from langflow.custom import CustomComponent
from langflow.schema.schema import Record
from langchain_community.agent_toolkits.json.toolkit import JsonToolkit
import json


class FirecrawlSearchApi(CustomComponent):
    """Langflow custom component that runs a search through the Firecrawl API.

    ``build`` calls ``FirecrawlApp.search`` and wraps every item of the result
    in a ``Record``.
    """

    display_name: str = "FirecrawlSearchApi"
    description: str = "Firecrawl Search API."
    output_types: list[str] = ["Document"]
    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/search"
    # UI metadata for each ``build`` parameter, keyed by parameter name.
    field_config = {
        "api_key": {
            "display_name": "API Key",
            "field_type": "str",
            "required": True,
            "password": True,
            "info": "The API key to use Firecrawl API.",
        },
        "query": {
            "display_name": "Query",
            "field_type": "str",
            "required": True,
            "info": "The query string to search for.",
        },
        "searchOptions": {
            "display_name": "Search Options",
            "info": "Options to refine search results.",
        },
        "pageOptions": {
            "display_name": "Page Options",
            "info": "Options to control the page content returned.",
        },
    }

    def build(
        self,
        api_key: str,
        query: str,
        searchOptions: Optional[JsonToolkit] = None,
        pageOptions: Optional[JsonToolkit] = None,
    ) -> List[Record]:
        """Search for ``query`` with Firecrawl and return one ``Record`` per item.

        Args:
            api_key: Key passed to ``FirecrawlApp``.
            query: Query string handed to ``search``.
            searchOptions: Optional toolkit whose ``spec.dict_`` is sent as the
                ``"searchOptions"`` request parameter; ``{}`` when omitted.
            pageOptions: Optional toolkit whose ``spec.dict_`` is sent as the
                ``"pageOptions"`` request parameter; ``{}`` when omitted.

        Returns:
            A list with one ``Record`` per item returned by ``search``.
        """
        search_spec = searchOptions.spec.dict_ if searchOptions else {}
        page_spec = pageOptions.spec.dict_ if pageOptions else {}

        client = FirecrawlApp(api_key=api_key)
        # The JSON round trip yields a plain, JSON-compatible copy of the options.
        request_params = {
            "searchOptions": json.loads(json.dumps(search_spec)),
            "pageOptions": json.loads(json.dumps(page_spec)),
        }
        hits = client.search(query, request_params)

        return [Record(data=hit) for hit in hits]
6 changes: 6 additions & 0 deletions src/backend/base/langflow/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,12 @@ tools:
utilities:
BingSearchAPIWrapper:
documentation: ""
FirecrawlCrawlApi:
documentation: "https://docs.firecrawl.dev/api-reference/endpoint/crawl"
FirecrawlScrapeApi:
documentation: "https://docs.firecrawl.dev/api-reference/endpoint/scrape"
FirecrawlSearchApi:
documentation: "https://docs.firecrawl.dev/api-reference/endpoint/search"
GoogleSearchAPIWrapper:
documentation: ""
GoogleSerperAPIWrapper:
Expand Down
16 changes: 15 additions & 1 deletion src/backend/base/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/backend/base/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ cryptography = "^42.0.5"
asyncer = "^0.0.5"
pyperclip = "^1.8.2"
uncurl = "^0.0.11"

firecrawl-py = "^0.0.14"

[tool.poetry.extras]
deploy = ["celery", "redis", "flower"]
Expand Down
Loading
Loading