community, genai, vertexai[major]: release 2.0 (#489)
* update deps

* delete check_pydantic script

* to_pydantic_2

* model_before_rewrite

* model_after_rewrite

* Self

* format

* clean up

* model_before_rewrite

* change VertexAI.validate_environment to pre

* lint

* update chat and embeddings validation to pre

* update some features to pydantic 2

* remove unused type ignores

* fix validate_environment in llm and chat models

* more validation updates

* change maas model garden validation to post

* add protected namespaces to embeddings

* fix embeddings init

* update docstrings

* delete check_pydantic script

* update dependencies

* to_pydantic_2

* model_after_rewrite

* Self

* revert change to _genai_extension

* format

* remove check_pydantic from Makefile

* remove unused imports

* fix

* upgrade mypy

* fix type hints

* update serialization test

* add test_watch to makefile

* merge

* add snapshots

* schema -> model_json_schema

* update _format_json_schema_to_gapic

* support v1 function

* add test for union types

* add integration test workflow

* Revert "add integration test workflow"

This reverts commit 2589fd6.

* lock

* infra: remove pydantic compatibility checks

* delete check_pydantic script

* extra_migrate

* to_pydantic_2

* model_before_rewrite

* model_after_rewrite

* Self

* format

* update deps

* change some post validators to pre

* set private attributes with PrivateAttr instead of config

* resolve lint errors

* increment version to 2.0.0.dev1

* increment version to 2.0.0.dev1

* add snapshots for serialization standard test

* bump core dep

* json_schema_extra in test

* protected namespaces

* fix warnings

* fix warnings

* fix mistral dependency

* fix warning

* increment version to 2.0.0.dev1

* fix mistral dep and lock

* update docstrings

* todo: figure out if we need google-cloud-core in package deps

* increment version to get around test.pypi

* assign missed default

* catch pydantic v2 schemas in dict (#488)

* catch pydantic v2 schemas in dict

* lock

* update test

* fix VertexFSVectorStore

* fix equality check in community integration tests for pydantic 2

* update genai + lock

* update vertexai + lock

* update snapshots

* update genai snapshots

* update community + lock

---------

Co-authored-by: Leonid Kuligin <[email protected]>
Co-authored-by: Bagatur <[email protected]>
3 people committed Sep 13, 2024
1 parent 72d1b5a commit b4b3c1c
Showing 71 changed files with 1,962 additions and 1,719 deletions.
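Nearly every file in this commit repeats the same pydantic v1 -> v2 mechanics: class Config becomes model_config = ConfigDict(...), root_validator becomes model_validator, and validators mutate and return the model instance instead of a values dict. The following condensed before/after sketch shows that pattern; the class and field names are illustrative only (not taken from this repo), and it assumes langchain-core and pydantic v2 are installed.

# pydantic v1 style, as removed throughout this commit (illustrative class):
from langchain_core.pydantic_v1 import BaseModel, root_validator


class StoreV1(BaseModel):
    name: str

    class Config:
        arbitrary_types_allowed = True

    @root_validator(pre=False, skip_on_failure=True)
    def check(cls, values: dict) -> dict:
        # v1 root validators receive and return a plain dict of field values.
        values["name"] = values["name"].strip()
        return values


# pydantic v2 style, as added throughout this commit:
from pydantic import BaseModel, ConfigDict, model_validator
from typing_extensions import Self


class StoreV2(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str

    @model_validator(mode="after")
    def check(self) -> Self:
        # v2 "after" validators receive the constructed instance and return it.
        self.name = self.name.strip()
        return self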
7 changes: 0 additions & 7 deletions .github/workflows/_all_ci.yml
@@ -41,13 +41,6 @@ jobs:
       working-directory: ${{ inputs.working-directory }}
     secrets: inherit
 
-  dependencies:
-    name: "-"
-    uses: ./.github/workflows/_dependencies.yml
-    with:
-      working-directory: ${{ inputs.working-directory }}
-    secrets: inherit
-
   test:
     name: "-"
     uses: ./.github/workflows/_test.yml
103 changes: 0 additions & 103 deletions .github/workflows/_dependencies.yml

This file was deleted.

1 change: 0 additions & 1 deletion libs/community/Makefile
@@ -33,7 +33,6 @@ lint_tests: PYTHON_FILES=tests
 lint_tests: MYPY_CACHE=.mypy_cache_test
 
 lint lint_diff lint_package lint_tests:
-	./scripts/check_pydantic.sh .
 	./scripts/lint_imports.sh
 	poetry run ruff .
 	poetry run ruff format $(PYTHON_FILES) --diff
libs/community/langchain_google_community/bq_storage_vectorstores/_base.py
@@ -13,8 +13,9 @@
 from langchain_community.vectorstores.utils import maximal_marginal_relevance
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
-from langchain_core.pydantic_v1 import BaseModel, ConfigDict, root_validator
 from langchain_core.vectorstores import VectorStore
+from pydantic import BaseModel, ConfigDict, model_validator
+from typing_extensions import Self
 
 from langchain_google_community._utils import get_client_info
 from langchain_google_community.bq_storage_vectorstores.utils import (
@@ -75,8 +76,9 @@ class BaseBigQueryVectorStore(VectorStore, BaseModel, ABC):
     _logger: Any = None
     _full_table_id: Optional[str] = None
 
-    class Config:
-        arbitrary_types_allowed = True
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+    )
 
     @abstractmethod
     def sync_data(self) -> None:
@@ -113,8 +115,8 @@ def _similarity_search_by_vectors_with_scores_and_embeddings(
     ) -> list[list[list[Any]]]:
         ...
 
-    @root_validator(pre=False, skip_on_failure=True)
-    def validate_vals(cls, values: dict) -> dict:
+    @model_validator(mode="after")
+    def validate_vals(self) -> Self:
         try:
             import pandas  # noqa: F401
             from google.cloud import bigquery  # type: ignore[attr-defined]
@@ -127,41 +129,37 @@ def validate_vals(cls, values: dict) -> dict:
                 "Please, install feature store dependency group: "
                 "`pip install langchain-google-community[featurestore]`"
             )
-        values["_logger"] = base.Logger(__name__)
-        values["_bq_client"] = bigquery.Client(
-            project=values["project_id"],
-            location=values["location"],
-            credentials=values["credentials"],
+        self._logger = base.Logger(__name__)
+        self._bq_client = bigquery.Client(
+            project=self.project_id,
+            location=self.location,
+            credentials=self.credentials,
             client_info=get_client_info(module="bigquery-vector-search"),
         )
-        if values["embedding_dimension"] is None:
-            values["embedding_dimension"] = len(values["embedding"].embed_query("test"))
-        full_table_id = (
-            f"{values['project_id']}.{values['dataset_name']}.{values['table_name']}"
-        )
-        values["_full_table_id"] = full_table_id
-        temp_dataset_id = f"{values['dataset_name']}_temp"
+        if self.embedding_dimension is None:
+            self.embedding_dimension = len(self.embedding.embed_query("test"))
+        full_table_id = f"{self.project_id}.{self.dataset_name}.{self.table_name}"
+        self._full_table_id = full_table_id
+        temp_dataset_id = f"{self.dataset_name}_temp"
         if not check_bq_dataset_exists(
-            client=values["_bq_client"], dataset_id=values["dataset_name"]
+            client=self._bq_client, dataset_id=self.dataset_name
         ):
-            values["_bq_client"].create_dataset(
-                dataset=values["dataset_name"], exists_ok=True
-            )
+            self._bq_client.create_dataset(dataset=self.dataset_name, exists_ok=True)
         if not check_bq_dataset_exists(
-            client=values["_bq_client"], dataset_id=temp_dataset_id
+            client=self._bq_client, dataset_id=temp_dataset_id
         ):
-            values["_bq_client"].create_dataset(dataset=temp_dataset_id, exists_ok=True)
+            self._bq_client.create_dataset(dataset=temp_dataset_id, exists_ok=True)
         table_ref = bigquery.TableReference.from_string(full_table_id)
-        values["_bq_client"].create_table(table_ref, exists_ok=True)
-        values["_logger"].info(
+        self._bq_client.create_table(table_ref, exists_ok=True)
+        self._logger.info(
             f"BigQuery table {full_table_id} "
             f"initialized/validated as persistent storage. "
             f"Access via BigQuery console:\n "
-            f"https://console.cloud.google.com/bigquery?project={values['project_id']}"
-            f"&ws=!1m5!1m4!4m3!1s{values['project_id']}!2s{values['dataset_name']}!3s"
-            f"{values['table_name']}"
+            f"https://console.cloud.google.com/bigquery?project={self.project_id}"
+            f"&ws=!1m5!1m4!4m3!1s{self.project_id}!2s{self.dataset_name}!3s"
+            f"{self.table_name}"
         )
-        return values
+        return self
 
     @property
     def embeddings(self) -> Optional[Embeddings]:
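The validator above shows the after-mode pattern this commit standardizes on: the validator receives the constructed instance, fills in private runtime state (client, logger), and returns the instance. Per the commit notes, private attributes are now declared with PrivateAttr rather than via config. A minimal sketch of that combination follows; all names are hypothetical, not from this repo.

from typing import Any, Optional

from pydantic import BaseModel, ConfigDict, PrivateAttr, model_validator
from typing_extensions import Self


class ExampleStore(BaseModel):
    # Allow non-pydantic types (e.g. API clients) on the model.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    project_id: str
    dataset_name: str
    # Private runtime state, excluded from validation and serialization.
    _client: Optional[Any] = PrivateAttr(default=None)

    @model_validator(mode="after")
    def init_client(self) -> Self:
        # v2 "after" validators work on the instance, so derived state is
        # assigned to self and the instance itself is returned (the v1
        # root_validator operated on a plain values dict instead).
        if self._client is None:
            self._client = object()  # stand-in for a real client object
        return self


# Usage: construction triggers the validator automatically.
store = ExampleStore(project_id="my-project", dataset_name="my_dataset")
assert store._client is not None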
@@ -6,11 +6,13 @@
 from google.api_core.exceptions import ClientError
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
-from langchain_core.pydantic_v1 import root_validator
+from pydantic import model_validator
 
 if TYPE_CHECKING:
     from google.cloud.bigquery.table import Table
 
+from typing_extensions import Self
+
 from langchain_google_community.bq_storage_vectorstores._base import (
     BaseBigQueryVectorStore,
 )
@@ -114,67 +116,67 @@ def get_documents(
             docs.append(doc)
         return docs
 
-    @root_validator(pre=False, skip_on_failure=True)
-    def initialize_bq_vector_index(cls, values: dict) -> dict:
+    @model_validator(mode="after")
+    def initialize_bq_vector_index(self) -> Self:
         """
         A vector index in BigQuery table enables efficient
         approximate vector search.
         """
         from google.cloud import bigquery  # type: ignore[attr-defined]
 
-        values["_creating_index"] = values.get("_creating_index", False)
-        values["_have_index"] = values.get("_have_index", False)
-        values["_last_index_check"] = values.get("_last_index_check", datetime.min)
+        self._creating_index = self._creating_index
+        self._have_index = self._have_index
+        self._last_index_check = self._last_index_check
 
-        if values.get("_have_index") or values.get("_creating_index"):
-            return values
+        if self._have_index or self._creating_index:
+            return self
 
-        table = values["_bq_client"].get_table(values["_full_table_id"])  # type: ignore[union-attr]
+        table = self._bq_client.get_table(self._full_table_id)  # type: ignore[union-attr]
 
         # Update existing table schema
         schema = table.schema.copy()
         if schema:  ## Check if table has a schema
-            values["table_schema"] = {field.name: field.field_type for field in schema}
+            self.table_schema = {field.name: field.field_type for field in schema}
 
         if (table.num_rows or 0) < MIN_INDEX_ROWS:
-            values["_logger"].debug("Not enough rows to create a vector index.")
-            return values
+            self._logger.debug("Not enough rows to create a vector index.")
+            return self
 
-        if datetime.utcnow() - values["_last_index_check"] < INDEX_CHECK_INTERVAL:
-            return values
+        if datetime.utcnow() - self._last_index_check < INDEX_CHECK_INTERVAL:
+            return self
 
         with _vector_table_lock:
-            values["_last_index_check"] = datetime.utcnow()
+            self._last_index_check = datetime.utcnow()
             # Check if index exists, create if necessary
             check_query = (
-                f"SELECT 1 FROM `{values['project_id']}."
-                f"{values['dataset_name']}"
+                f"SELECT 1 FROM `{self.project_id}."
+                f"{self.dataset_name}"
                 ".INFORMATION_SCHEMA.VECTOR_INDEXES` WHERE"
-                f" table_name = '{values['table_name']}'"
+                f" table_name = '{self.table_name}'"
             )
-            job = values["_bq_client"].query(  # type: ignore[union-attr]
+            job = self._bq_client.query(  # type: ignore[union-attr]
                 check_query, api_method=bigquery.enums.QueryApiMethod.QUERY
             )
             if job.result().total_rows == 0:
                 # Need to create an index. Make it in a separate thread.
-                values["_logger"].debug("Trying to create a vector index.")
+                self._logger.debug("Trying to create a vector index.")
                 Thread(
                     target=_create_bq_index,
                     kwargs={
-                        "bq_client": values["_bq_client"],
-                        "table_name": values["table_name"],
-                        "full_table_id": values["_full_table_id"],
-                        "embedding_field": values["embedding_field"],
-                        "distance_type": values["distance_type"],
-                        "logger": values["_logger"],
+                        "bq_client": self._bq_client,
+                        "table_name": self.table_name,
+                        "full_table_id": self._full_table_id,
+                        "embedding_field": self.embedding_field,
+                        "distance_type": self.distance_type,
+                        "logger": self._logger,
                     },
                     daemon=True,
                 ).start()
 
             else:
-                values["_logger"].debug("Vector index already exists.")
-                values["_have_index"] = True
-        return values
+                self._logger.debug("Vector index already exists.")
+                self._have_index = True
+        return self
 
     def _similarity_search_by_vectors_with_scores_and_embeddings(
         self,
@@ -565,7 +567,9 @@ def to_vertex_fs_vector_store(self, **kwargs: Any) -> Any:
             VertexFSVectorStore,
         )
 
-        base_params = self.dict(include=BaseBigQueryVectorStore.__fields__.keys())
+        base_params = self.model_dump(
+            include=set(BaseBigQueryVectorStore.model_fields.keys())
+        )
         base_params["embedding"] = self.embedding
         all_params = {**base_params, **kwargs}
         fs_obj = VertexFSVectorStore(**all_params)
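The final hunk swaps the deprecated v1 serialization API for its v2 equivalent: .dict() becomes .model_dump() and __fields__ becomes model_fields. A small sketch of the replacement call, using made-up model names rather than the repo's classes:

from pydantic import BaseModel


class BaseParams(BaseModel):
    project_id: str = "my-project"
    dataset_name: str = "my_dataset"


class ChildParams(BaseParams):
    table_name: str = "my_table"


child = ChildParams()
# Dump only the fields declared on the parent model, mirroring the
# model_dump(include=set(...model_fields.keys())) call in the diff above.
base_params = child.model_dump(include=set(BaseParams.model_fields.keys()))
print(base_params)  # {'project_id': 'my-project', 'dataset_name': 'my_dataset'}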
