Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Firecrawl integration #2359

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
c0624c4
chore: update linting workflows to include dev branch in merge_group
ogabrielluiz Jun 24, 2024
71a9ee5
Update README.md
rodrigosnader Jun 24, 2024
7337301
Update README.md
rodrigosnader Jun 24, 2024
546f856
chore: update package versions in pyproject.toml files
ogabrielluiz Jun 24, 2024
bc334ca
Merge branch 'main' into dev
ogabrielluiz Jun 24, 2024
77e23d9
Refactor "created_at" column type for consistency and fix cancel midd…
ogabrielluiz Jun 24, 2024
6b67c49
chore: Update AstraDB.py imports and method signature for search_docu…
ogabrielluiz Jun 24, 2024
75fa498
chore: Update package versions in pyproject.toml files
ogabrielluiz Jun 24, 2024
c6ca57c
chore: Update run-name in release.yml for Langflow Release
ogabrielluiz Jun 24, 2024
f97c5be
Merge branch 'main' into dev
ogabrielluiz Jun 24, 2024
b02462f
fix: add call to _add_documents_to_vector_store in AstraDB component
ogabrielluiz Jun 24, 2024
fcdaeb3
chore: Fix missing parentheses in RequestCancelledMiddleware
ogabrielluiz Jun 24, 2024
c4d1427
chore: Update pydantic-settings and tenacity versions
ogabrielluiz Jun 24, 2024
7a6c957
Update README.md
rodrigosnader Jun 24, 2024
5d8abaf
Merge remote-tracking branch 'origin/main' into dev
ogabrielluiz Jun 24, 2024
dfd419e
fix fetch data to work even with autologin true
anovazzi1 Jun 24, 2024
853f548
format code
anovazzi1 Jun 24, 2024
8eae0c5
deactivate stop button until we have a better solution (#2337)
ogabrielluiz Jun 25, 2024
d4c18d3
added firecrawl integration
rafaelsideguide Jun 25, 2024
9373749
Merge branch 'main' into feat/firecrawl-integration
rafaelsideguide Jun 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ unstructured = {extras = ["docx", "md", "pptx"], version = "^0.14.4"}
langchain-aws = "^0.1.6"
langchain-mongodb = "^0.1.6"
kubernetes = "^30.1.0"
firecrawl-py = "^0.0.16"


[tool.poetry.group.dev.dependencies]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from typing import Optional
from firecrawl.firecrawl import FirecrawlApp
from langflow.custom import CustomComponent
from langflow.schema import Data
import uuid

class FirecrawlCrawlApi(CustomComponent):
    """Langflow custom component that starts a Firecrawl crawl job for a URL.

    The component builds a ``FirecrawlApp`` client from the supplied API key,
    calls ``crawl_url`` and wraps whatever the client returns in a ``Data``
    object under the ``"results"`` key.
    """

    display_name: str = "FirecrawlCrawlApi"
    description: str = "Firecrawl Crawl API."
    output_types: list[str] = ["Document"]
    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl"
    # UI metadata for each ``build`` argument; keys match the parameter names.
    field_config = {
        "api_key": {
            "display_name": "API Key",
            "field_type": "str",
            "required": True,
            "password": True,
            "info": "The API key to use Firecrawl API.",
        },
        "url": {
            "display_name": "URL",
            "field_type": "str",
            "required": True,
            "info": "The base URL to start crawling from.",
        },
        "timeout": {
            "display_name": "Timeout",
            "field_type": "int",
            "info": "The timeout in milliseconds.",
        },
        "crawlerOptions": {
            "display_name": "Crawler Options",
            "info": "Options for the crawler behavior.",
        },
        "pageOptions": {
            "display_name": "Page Options",
            "info": "The page options to send with the request.",
        },
        "idempotency_key": {
            "display_name": "Idempotency Key",
            "field_type": "str",
            "info": "Optional idempotency key to ensure unique requests.",
        },
    }

    def build(
        self,
        api_key: str,
        url: str,
        timeout: Optional[int] = 30000,
        crawlerOptions: Optional[Data] = None,
        pageOptions: Optional[Data] = None,
        idempotency_key: Optional[str] = None,
    ) -> Data:
        """Run a crawl and return its result wrapped in ``Data``.

        Args:
            api_key: Firecrawl API key used to construct the client.
            url: URL handed to ``crawl_url`` as the crawl starting point.
            timeout: Value in milliseconds; it is converted to whole seconds
                before being passed to the client. ``None`` falls back to the
                default of 30000.
            crawlerOptions: Optional ``Data`` whose ``data["text"]`` entry is
                forwarded as ``crawlerOptions``; an empty dict is sent when
                omitted.
            pageOptions: Optional ``Data`` whose ``data["text"]`` entry is
                forwarded as ``pageOptions``; an empty dict is sent when
                omitted.
            idempotency_key: Key forwarded to the client. A random UUID4
                string is generated when none is given.

        Returns:
            ``Data`` with the client's return value stored under ``"results"``.

        Raises:
            KeyError: If a supplied options object has no ``"text"`` entry.
        """
        # ``timeout`` is Optional: without this guard a ``None`` value would
        # make the millisecond -> second division below raise TypeError.
        if timeout is None:
            timeout = 30000

        # Read the payload through the public ``data`` attribute rather than
        # reaching into ``__dict__``.
        crawler_options_dict = crawlerOptions.data["text"] if crawlerOptions else {}
        page_options_dict = pageOptions.data["text"] if pageOptions else {}

        if not idempotency_key:
            idempotency_key = str(uuid.uuid4())

        app = FirecrawlApp(api_key=api_key)
        # Arguments are kept positional exactly as before.
        # NOTE(review): presumably (url, params, wait_until_done, timeout in
        # seconds, idempotency_key) -- confirm against the pinned firecrawl-py.
        crawl_result = app.crawl_url(
            url,
            {
                "crawlerOptions": crawler_options_dict,
                "pageOptions": page_options_dict,
            },
            True,
            int(timeout / 1000),
            idempotency_key,
        )

        return Data(data={"results": crawl_result})
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from typing import Optional
from firecrawl.firecrawl import FirecrawlApp
from langflow.custom import CustomComponent
from langflow.schema import Data
from langflow.services.database.models.base import orjson_dumps

Check failure on line 5 in src/backend/base/langflow/components/langchain_utilities/FirecrawlScrapeApi.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (F401)

src/backend/base/langflow/components/langchain_utilities/FirecrawlScrapeApi.py:5:52: F401 `langflow.services.database.models.base.orjson_dumps` imported but unused
import json

Check failure on line 6 in src/backend/base/langflow/components/langchain_utilities/FirecrawlScrapeApi.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (F401)

src/backend/base/langflow/components/langchain_utilities/FirecrawlScrapeApi.py:6:8: F401 `json` imported but unused

class FirecrawlScrapeApi(CustomComponent):
    """Langflow custom component that scrapes a single URL through Firecrawl.

    The component builds a ``FirecrawlApp`` client from the supplied API key,
    calls ``scrape_url`` and wraps the client's return value in ``Data``.
    """

    display_name: str = "FirecrawlScrapeApi"
    description: str = "Firecrawl Scrape API."
    output_types: list[str] = ["Document"]
    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/scrape"
    # UI metadata for each ``build`` argument; keys match the parameter names.
    field_config = {
        "api_key": {
            "display_name": "API Key",
            "field_type": "str",
            "required": True,
            "password": True,
            "info": "The API key to use Firecrawl API.",
        },
        "url": {
            "display_name": "URL",
            "field_type": "str",
            "required": True,
            "info": "The URL to scrape.",
        },
        "timeout": {
            "display_name": "Timeout",
            "info": "Timeout in milliseconds for the request.",
            "field_type": "int",
            "default_value": 10000,
        },
        "pageOptions": {
            "display_name": "Page Options",
            "info": "The page options to send with the request.",
        },
        "extractorOptions": {
            "display_name": "Extractor Options",
            "info": "The extractor options to send with the request.",
        },
    }

    def build(
        self,
        api_key: str,
        url: str,
        timeout: Optional[int] = 10000,
        pageOptions: Optional[Data] = None,
        extractorOptions: Optional[Data] = None,
    ) -> Data:
        """Scrape ``url`` and return the result wrapped in ``Data``.

        Args:
            api_key: Firecrawl API key used to construct the client.
            url: URL handed to ``scrape_url``.
            timeout: Value in milliseconds, sent in the request parameters as
                a string. ``None`` falls back to the default of 10000.
            pageOptions: Optional ``Data`` whose ``data["text"]`` entry is
                forwarded as ``pageOptions``; an empty dict is sent when
                omitted.
            extractorOptions: Optional ``Data`` whose ``data["text"]`` entry
                is forwarded as ``extractorOptions``; an empty dict is sent
                when omitted.

        Returns:
            ``Data`` built directly from the client's return value.

        Raises:
            KeyError: If a supplied options object has no ``"text"`` entry.
        """
        # ``timeout`` is Optional: without this guard ``str(None)`` would send
        # the literal text "None" as the request timeout.
        if timeout is None:
            timeout = 10000

        # Read the payload through the public ``data`` attribute rather than
        # reaching into ``__dict__``.
        extractor_options_dict = extractorOptions.data["text"] if extractorOptions else {}
        page_options_dict = pageOptions.data["text"] if pageOptions else {}

        app = FirecrawlApp(api_key=api_key)
        results = app.scrape_url(
            url,
            {
                # NOTE(review): kept as a string to preserve the existing wire
                # format; the scrape API reference appears to describe an
                # integer here -- confirm before changing.
                "timeout": str(timeout),
                "extractorOptions": extractor_options_dict,
                "pageOptions": page_options_dict,
            },
        )

        return Data(data=results)
16 changes: 15 additions & 1 deletion src/backend/base/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/backend/base/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ pyperclip = "^1.8.2"
uncurl = "^0.0.11"
sentry-sdk = {extras = ["fastapi", "loguru"], version = "^2.5.1"}
chardet = "^5.2.0"
firecrawl-py = "^0.0.16"


[tool.poetry.extras]
Expand Down
61 changes: 61 additions & 0 deletions src/frontend/src/icons/Firecrawl/FirecrawlLogo.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
const SvgFirecrawlLogo = (props) => (
<svg
viewBox="-33 0 255 255"
width="24"
height="24"
xmlns="http://www.w3.org/2000/svg"
xmlnsXlink="http://www.w3.org/1999/xlink"
preserveAspectRatio="xMidYMid"
>
<defs>
<style>
{`
.cls-3 {
fill: url(#linear-gradient-1);
}

.cls-4 {
fill: #fc9502;
}

.cls-5 {
fill: #fce202;
}
`}
</style>

<linearGradient
id="linear-gradient-1"
gradientUnits="userSpaceOnUse"
x1="94.141"
y1="255"
x2="94.141"
y2="0.188"
>
<stop offset="0" stopColor="#ff4c0d" />
<stop offset="1" stopColor="#fc9502" />
</linearGradient>
</defs>
<g id="fire">
<path
d="M187.899,164.809 C185.803,214.868 144.574,254.812 94.000,254.812 C42.085,254.812 -0.000,211.312 -0.000,160.812 C-0.000,154.062 -0.121,140.572 10.000,117.812 C16.057,104.191 19.856,95.634 22.000,87.812 C23.178,83.513 25.469,76.683 32.000,87.812 C35.851,94.374 36.000,103.812 36.000,103.812 C36.000,103.812 50.328,92.817 60.000,71.812 C74.179,41.019 62.866,22.612 59.000,9.812 C57.662,5.384 56.822,-2.574 66.000,0.812 C75.352,4.263 100.076,21.570 113.000,39.812 C131.445,65.847 138.000,90.812 138.000,90.812 C138.000,90.812 143.906,83.482 146.000,75.812 C148.365,67.151 148.400,58.573 155.999,67.813 C163.226,76.600 173.959,93.113 180.000,108.812 C190.969,137.321 187.899,164.809 187.899,164.809 Z"
id="path-1"
className="cls-3"
fillRule="evenodd"
/>
<path
d="M94.000,254.812 C58.101,254.812 29.000,225.711 29.000,189.812 C29.000,168.151 37.729,155.000 55.896,137.166 C67.528,125.747 78.415,111.722 83.042,102.172 C83.953,100.292 86.026,90.495 94.019,101.966 C98.212,107.982 104.785,118.681 109.000,127.812 C116.266,143.555 118.000,158.812 118.000,158.812 C118.000,158.812 125.121,154.616 130.000,143.812 C131.573,140.330 134.753,127.148 143.643,140.328 C150.166,150.000 159.127,167.390 159.000,189.812 C159.000,225.711 129.898,254.812 94.000,254.812 Z"
id="path-2"
className="cls-4"
fillRule="evenodd"
/>
<path
d="M95.000,183.812 C104.250,183.812 104.250,200.941 116.000,223.812 C123.824,239.041 112.121,254.812 95.000,254.812 C77.879,254.812 69.000,240.933 69.000,223.812 C69.000,206.692 85.750,183.812 95.000,183.812 Z"
id="path-3"
className="cls-5"
fillRule="evenodd"
/>
</g>
</svg>
);
export default SvgFirecrawlLogo;
28 changes: 28 additions & 0 deletions src/frontend/src/icons/Firecrawl/firecraw-logo.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 9 additions & 0 deletions src/frontend/src/icons/Firecrawl/index.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import React, { forwardRef } from "react";
import SvgFirecrawlLogo from "./FirecrawlLogo";

// Ref-forwarding wrapper around the Firecrawl SVG logo component.
// The received ref and every incoming prop are handed straight to it.
export const FirecrawlIcon = forwardRef<
  SVGSVGElement,
  React.PropsWithChildren<{}>
>((props, ref) => <SvgFirecrawlLogo ref={ref} {...props} />);
3 changes: 3 additions & 0 deletions src/frontend/src/utils/styleUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ import { CouchbaseIcon } from "../icons/Couchbase";
import { ElasticsearchIcon } from "../icons/ElasticsearchStore";
import { EvernoteIcon } from "../icons/Evernote";
import { FBIcon } from "../icons/FacebookMessenger";
import { FirecrawlIcon } from "../icons/Firecrawl";
import { GitBookIcon } from "../icons/GitBook";
import { GoogleIcon } from "../icons/Google";
import { GoogleGenerativeAIIcon } from "../icons/GoogleGenerativeAI";
Expand Down Expand Up @@ -363,6 +364,8 @@ export const nodeIconsLucide: iconsType = {
CohereEmbeddings: CohereIcon,
EverNoteLoader: EvernoteIcon,
FacebookChatLoader: FBIcon,
FirecrawlCrawlApi: FirecrawlIcon,
FirecrawlScrapeApi: FirecrawlIcon,
GitbookLoader: GitBookIcon,
GoogleSearchAPIWrapper: GoogleIcon,
GoogleSearchResults: GoogleIcon,
Expand Down
Loading