diff --git a/packages/bigframes/bigframes/bigquery/_operations/ai.py b/packages/bigframes/bigframes/bigquery/_operations/ai.py index 7a509d4f95ff..c6115c6d6a5e 100644 --- a/packages/bigframes/bigframes/bigquery/_operations/ai.py +++ b/packages/bigframes/bigframes/bigquery/_operations/ai.py @@ -705,6 +705,113 @@ def generate_table( return session.read_gbq_query(query) +@log_adapter.method_logger(custom_base_name="bigquery_ai") +def embed( + content: str | series.Series | pd.Series, + *, + endpoint: str | None = None, + model: str | None = None, + task_type: ( + Literal[ + "retrieval_query", + "retrieval_document", + "semantic_similarity", + "classification", + "clustering", + "question_answering", + "fact_verification", + "code_retrieval_query", + ] + | None + ) = None, + title: str | None = None, + model_params: Mapping[Any, Any] | None = None, + connection_id: str | None = None, +) -> series.Series: + """ + Creates embeddings from text or image data in BigQuery. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bbq.ai.embed("dog", endpoint="text-embedding-005") # doctest: +SKIP + 0 {'result': array([ 1.78243860e-03, -1.10658340... + + >>> s = bpd.Series(['dog']) # doctest: +SKIP + >>> bbq.ai.embed(s, endpoint='text-embedding-005') # doctest: +SKIP + 0 {'result': array([ 1.78243860e-03, -1.10658340... + + Args: + content (str | Series): + A string literal or a Series (either BigFrames series or pandas Series) that provides the text or image to embed. + endpoint (str, optional): + A string value that specifies a supported Vertex AI embedding model endpoint to use. + The endpoint value that you specify must include the model version, for example, + `"text-embedding-005"`. If you specify this parameter, you can't specify the + `model` parameter. + model (str, optional): + A string value that specifies a built-in embedding model. The only supported value is + `"embeddinggemma-300m"`. 
If you specify this parameter, you can't specify the `endpoint`, + `title`, `model_params`, or `connection_id` parameters. + task_type (str, optional): + A string literal that specifies the intended downstream application to help the model + produce better quality embeddings. Accepts `"retrieval_query"`, `"retrieval_document"`, + `"semantic_similarity"`, `"classification"`, `"clustering"`, `"question_answering"`, + `"fact_verification"`, `"code_retrieval_query"`. + title (str, optional): + A string value that specifies the document title, which the model uses to improve + embedding quality. You can only use this parameter if you specify `"retrieval_document"` + for the `task_type` value. + model_params (Mapping[Any, Any], optional): + A JSON literal that provides additional parameters to the model. For example, + `{"outputDimensionality": 768}` lets you specify the number of dimensions to use when + generating embeddings. + connection_id (str, optional): + A STRING value specifying the connection to use to communicate with the model, in the + format `PROJECT_ID.LOCATION.CONNECTION_ID`. For example, `myproject.us.myconnection`. + If not provided, the query uses your end-user credential. + + Returns: + bigframes.series.Series: A new struct Series with the result data. The struct contains these fields: + * "result": an ARRAY value containing the generated embeddings. + * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful. + """ + + if model is not None: + if any(x is not None for x in [endpoint, title, model_params, connection_id]): + raise ValueError( + "You cannot specify endpoint, title, model_params, or connection_id when the model is set." + ) + elif endpoint is None: + raise ValueError( + "You must specify exactly one of 'endpoint' or 'model' argument." 
+ ) + + if title is not None and task_type != "retrieval_document": + raise ValueError( + "You can only use 'title' parameter if you specify retrieval_document for the task_type value." + ) + + operator = ai_ops.AIEmbed( + endpoint=endpoint, + model=model, + task_type=task_type, + title=title, + model_params=json.dumps(model_params) if model_params else None, + connection_id=connection_id, + ) + + if isinstance(content, str): + return series.Series([content])._apply_unary_op(operator) + elif isinstance(content, pd.Series): + return series.Series(content)._apply_unary_op(operator) + elif isinstance(content, series.Series): + return content._apply_unary_op(operator) + else: + raise ValueError(f"Unsupported 'content' parameter type: {type(content)}") + + @log_adapter.method_logger(custom_base_name="bigquery_ai") def if_( prompt: PROMPT_TYPE, diff --git a/packages/bigframes/bigframes/bigquery/ai.py b/packages/bigframes/bigframes/bigquery/ai.py index 25a7df778127..5f161632d98c 100644 --- a/packages/bigframes/bigframes/bigquery/ai.py +++ b/packages/bigframes/bigframes/bigquery/ai.py @@ -58,6 +58,7 @@ from bigframes.bigquery._operations.ai import ( classify, + embed, forecast, generate, generate_bool, @@ -72,6 +73,7 @@ __all__ = [ "classify", + "embed", "forecast", "generate", "generate_bool", diff --git a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 26ba0d8cb4b4..75bb134f1df3 100644 --- a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1965,6 +1965,19 @@ def ai_generate_double( ).to_expr() +@scalar_op_compiler.register_unary_op(ops.AIEmbed, pass_op=True) +def ai_embed(value: ibis_types.Value, op: ops.AIEmbed) -> ibis_types.StructValue: + return ai_ops.AIEmbed( + value, # type: ignore + connection_id=op.connection_id, # type: ignore + 
endpoint=op.endpoint, # type: ignore + model=op.model, # type: ignore + task_type=op.task_type.upper() if op.task_type is not None else None, # type: ignore + title=op.title, # type: ignore + model_params=op.model_params, # type: ignore + ).to_expr() + + @scalar_op_compiler.register_nary_op(ops.AIIf, pass_op=True) def ai_if(*values: ibis_types.Value, op: ops.AIIf) -> ibis_types.StructValue: return ai_ops.AIIf( diff --git a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/ai_ops.py b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/ai_ops.py index eaeeb0b3a56d..8dbb298a1bab 100644 --- a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/ai_ops.py +++ b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/ai_ops.py @@ -15,6 +15,7 @@ from __future__ import annotations from dataclasses import asdict +from typing import Any import bigframes_vendored.sqlglot.expressions as sge @@ -23,6 +24,7 @@ from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr register_nary_op = expression_compiler.expression_compiler.register_nary_op +register_unary_op = expression_compiler.expression_compiler.register_unary_op @register_nary_op(ops.AIGenerate, pass_op=True) @@ -53,6 +55,13 @@ def _(*exprs: TypedExpr, op: ops.AIGenerateDouble) -> sge.Expression: return sge.func("AI.GENERATE_DOUBLE", *args) +@register_unary_op(ops.AIEmbed, pass_op=True) +def _(expr: TypedExpr, op: ops.AIEmbed) -> sge.Expression: + args: list[Any] = [expr.expr] + _construct_named_args(op) + + return sge.func("AI.EMBED", *args) + + @register_nary_op(ops.AIIf, pass_op=True) def _(*exprs: TypedExpr, op: ops.AIIf) -> sge.Expression: args = [_construct_prompt(exprs, op.prompt_context)] + _construct_named_args(op) @@ -94,7 +103,7 @@ def _construct_prompt( return sge.Kwarg(this=param_name, expression=sge.Tuple(expressions=prompt)) -def _construct_named_args(op: ops.NaryOp) -> list[sge.Kwarg]: +def _construct_named_args(op: ops.ScalarOp) -> list[sge.Kwarg]: 
args = [] op_args = asdict(op) diff --git a/packages/bigframes/bigframes/operations/__init__.py b/packages/bigframes/bigframes/operations/__init__.py index f35a07271bd7..f4c138a5527a 100644 --- a/packages/bigframes/bigframes/operations/__init__.py +++ b/packages/bigframes/bigframes/operations/__init__.py @@ -16,6 +16,7 @@ from bigframes.operations.ai_ops import ( AIClassify, + AIEmbed, AIGenerate, AIGenerateBool, AIGenerateDouble, @@ -434,6 +435,7 @@ "AIGenerateBool", "AIGenerateDouble", "AIGenerateInt", + "AIEmbed", "AIIf", "AIScore", # Numpy ops mapping diff --git a/packages/bigframes/bigframes/operations/ai_ops.py b/packages/bigframes/bigframes/operations/ai_ops.py index b20314fe2321..2878b6584447 100644 --- a/packages/bigframes/bigframes/operations/ai_ops.py +++ b/packages/bigframes/bigframes/operations/ai_ops.py @@ -118,6 +118,28 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) +@dataclasses.dataclass(frozen=True) +class AIEmbed(base_ops.UnaryOp): + name: ClassVar[str] = "ai_embed" + + endpoint: str | None + model: str | None + task_type: str | None + title: str | None + model_params: str | None + connection_id: str | None + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return pd.ArrowDtype( + pa.struct( + ( + pa.field("result", pa.list_(pa.float64())), + pa.field("status", pa.string()), + ) + ) + ) + + @dataclasses.dataclass(frozen=True) class AIIf(base_ops.NaryOp): name: ClassVar[str] = "ai_if" diff --git a/packages/bigframes/tests/system/small/bigquery/test_ai.py b/packages/bigframes/tests/system/small/bigquery/test_ai.py index 16e9cca9f136..e64acdb64c0d 100644 --- a/packages/bigframes/tests/system/small/bigquery/test_ai.py +++ b/packages/bigframes/tests/system/small/bigquery/test_ai.py @@ -255,6 +255,69 @@ def test_ai_generate_double_multi_model(session): ) +def test_ai_embed_series_content(session): + content = bpd.Series(["dog"], session=session) + + result = 
bbq.ai.embed(content, endpoint="text-embedding-005") + + assert _contains_no_nulls(result) + assert result.dtype == pd.ArrowDtype( + pa.struct( + ( + pa.field("result", pa.list_(pa.float64())), + pa.field("status", pa.string()), + ) + ) + ) + + +def test_ai_embed_string_content(session): + with mock.patch( + "bigframes.core.global_session.get_global_session" + ) as mock_get_session: + mock_get_session.return_value = session + + result = bbq.ai.embed("dog", endpoint="text-embedding-005") + + assert _contains_no_nulls(result) + assert result.dtype == pd.ArrowDtype( + pa.struct( + ( + pa.field("result", pa.list_(pa.float64())), + pa.field("status", pa.string()), + ) + ) + ) + + +def test_ai_embed_no_endpoint_or_model_raises_error(session): + content = bpd.Series(["dog"], session=session) + + with pytest.raises(ValueError): + bbq.ai.embed(content) + + +def test_ai_embed_both_model_and_endpoint_are_set_raises_error(session): + content = bpd.Series(["dog"], session=session) + + with pytest.raises(ValueError): + bbq.ai.embed( + content, endpoint="text-embedding-005", model="embeddinggemma-300m" + ) + + +def test_ai_embed_title_and_task_type_mismatch_raises_error(session): + content = bpd.Series(["dog"], session=session) + + with pytest.raises(ValueError): + bbq.ai.embed( + content, + endpoint="text-embedding-005", + title="my title", + task_type="semantic_similarity", + ) + + def test_ai_if(session): s1 = bpd.Series(["apple", "bear"], session=session) s2 = bpd.Series(["fruit", "tree"], session=session) diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed/out.sql new file mode 100644 index 000000000000..9c18a7cd532f --- /dev/null +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed/out.sql @@ -0,0 +1,3 @@ +SELECT + AI.EMBED(`string_col`, endpoint => 
'text-embedding-005') AS `result` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_connection_id/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_connection_id/out.sql new file mode 100644 index 000000000000..0968a101b22a --- /dev/null +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_connection_id/out.sql @@ -0,0 +1,7 @@ +SELECT + AI.EMBED( + `string_col`, + endpoint => 'text-embedding-005', + connection_id => 'bigframes-dev.us.bigframes-default-connection' + ) AS `result` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_model/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_model/out.sql new file mode 100644 index 000000000000..4c3c76f87b61 --- /dev/null +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_model/out.sql @@ -0,0 +1,3 @@ +SELECT + AI.EMBED(`string_col`, model => 'embeddinggemma-300m') AS `result` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_model_param_and_title/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_model_param_and_title/out.sql new file mode 100644 index 000000000000..873db838682a --- /dev/null +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_model_param_and_title/out.sql @@ -0,0 +1,9 @@ +SELECT + 
AI.EMBED( + `string_col`, + endpoint => 'text-embedding-005', + task_type => 'retrieval_document', + title => 'My Document', + model_params => JSON '{"outputDimensionality": 256}' + ) AS `result` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_task_type_and_title/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_task_type_and_title/out.sql new file mode 100644 index 000000000000..873db838682a --- /dev/null +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_embed_with_task_type_and_title/out.sql @@ -0,0 +1,9 @@ +SELECT + AI.EMBED( + `string_col`, + endpoint => 'text-embedding-005', + task_type => 'retrieval_document', + title => 'My Document', + model_params => JSON '{"outputDimensionality": 256}' + ) AS `result` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py index 64a5a94c9e7f..a7aacb4799ba 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py @@ -281,6 +281,76 @@ def test_ai_generate_double_with_model_param( snapshot.assert_match(sql, "out.sql") +def test_ai_embed(scalar_types_df: dataframe.DataFrame, snapshot): + col_name = "string_col" + + op = ops.AIEmbed( + endpoint="text-embedding-005", + model=None, + task_type=None, + title=None, + model_params=None, + connection_id=None, + ) + + sql = utils._apply_ops_to_sql(scalar_types_df, [op.as_expr(col_name)], ["result"]) + + snapshot.assert_match(sql, "out.sql") + + +def test_ai_embed_with_connection_id(scalar_types_df: 
dataframe.DataFrame, snapshot): + col_name = "string_col" + + op = ops.AIEmbed( + endpoint="text-embedding-005", + model=None, + task_type=None, + title=None, + model_params=None, + connection_id=CONNECTION_ID, + ) + + sql = utils._apply_ops_to_sql(scalar_types_df, [op.as_expr(col_name)], ["result"]) + + snapshot.assert_match(sql, "out.sql") + + +def test_ai_embed_with_model(scalar_types_df: dataframe.DataFrame, snapshot): + col_name = "string_col" + + op = ops.AIEmbed( + endpoint=None, + model="embeddinggemma-300m", + task_type=None, + title=None, + model_params=None, + connection_id=None, + ) + + sql = utils._apply_ops_to_sql(scalar_types_df, [op.as_expr(col_name)], ["result"]) + + snapshot.assert_match(sql, "out.sql") + + +def test_ai_embed_with_task_type_and_title( + scalar_types_df: dataframe.DataFrame, snapshot +): + col_name = "string_col" + + op = ops.AIEmbed( + endpoint="text-embedding-005", + model=None, + task_type="retrieval_document", + title="My Document", + model_params=json.dumps({"outputDimensionality": 256}), + connection_id=None, + ) + + sql = utils._apply_ops_to_sql(scalar_types_df, [op.as_expr(col_name)], ["result"]) + + snapshot.assert_match(sql, "out.sql") + + @pytest.mark.parametrize("connection_id", [None, CONNECTION_ID]) def test_ai_if(scalar_types_df: dataframe.DataFrame, snapshot, connection_id): col_name = "string_col" diff --git a/packages/bigframes/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/packages/bigframes/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 4879f576f0e1..9a6906f4e5cc 100644 --- a/packages/bigframes/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/packages/bigframes/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -1134,6 +1134,9 @@ def visit_AIGenerateInt(self, op, **kwargs): def visit_AIGenerateDouble(self, op, **kwargs): return sge.func("AI.GENERATE_DOUBLE", 
*self._compile_ai_args(**kwargs)) + def visit_AIEmbed(self, op, **kwargs): + return sge.func("AI.EMBED", *self._compile_ai_args(**kwargs)) + def visit_AIIf(self, op, **kwargs): return sge.func("AI.IF", *self._compile_ai_args(**kwargs)) diff --git a/packages/bigframes/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py b/packages/bigframes/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py index e50f6fe893ab..642460da5291 100644 --- a/packages/bigframes/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py +++ b/packages/bigframes/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py @@ -108,6 +108,30 @@ def dtype(self) -> dt.Struct: ) +@public +class AIEmbed(Value): + """Create embeddings from text or image data.""" + + content: Value + connection_id: Optional[Value[dt.String]] + endpoint: Optional[Value[dt.String]] + model: Optional[Value[dt.String]] + task_type: Optional[Value[dt.String]] + title: Optional[Value[dt.String]] + model_params: Optional[Value[dt.String]] + + shape = rlz.shape_like("content") + + @attribute + def dtype(self) -> dt.Struct: + return dt.Struct.from_tuples( + ( + ("result", dt.Array(dt.float64)), + ("status", dt.string), + ) + ) + + @public class AIIf(Value): """Generate True/False based on the prompt"""