diff --git a/.github/labeler.yml b/.github/labeler.yml index 1b50ea599a..3a606a5e21 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -243,6 +243,11 @@ integration:stackit: - any-glob-to-any-file: "integrations/stackit/**/*" - any-glob-to-any-file: ".github/workflows/stackit.yml" +integration:supabase: + - changed-files: + - any-glob-to-any-file: "integrations/supabase/**/*" + - any-glob-to-any-file: ".github/workflows/supabase.yml" + integration:tavily: - changed-files: - any-glob-to-any-file: "integrations/tavily/**/*" diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index 54f12679e5..763d02316b 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -50,6 +50,7 @@ on: - "Test / snowflake" - "Test / sqlalchemy" - "Test / stackit" + - "Test / supabase" - "Test / tavily" - "Test / togetherai" - "Test / unstructured" diff --git a/.github/workflows/supabase.yml b/.github/workflows/supabase.yml new file mode 100644 index 0000000000..53a8c0a94d --- /dev/null +++ b/.github/workflows/supabase.yml @@ -0,0 +1,159 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / supabase + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/supabase/**" + - "!integrations/supabase/*.md" + - ".github/workflows/supabase.yml" + push: + branches: + - main + paths: + - "integrations/supabase/**" + - "!integrations/supabase/*.md" + - ".github/workflows/supabase.yml" + +defaults: + run: + working-directory: integrations/supabase + +concurrency: + group: supabase-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + TEST_MATRIX_OS: '["ubuntu-latest"]' + TEST_MATRIX_PYTHON: '["3.10", "3.14"]' + +jobs: + compute-test-matrix: + runs-on: ubuntu-slim + 
defaults: + run: + working-directory: . + outputs: + os: ${{ steps.set.outputs.os }} + python-version: ${{ steps.set.outputs.python-version }} + steps: + - id: set + run: | + echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT" + echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT" + + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + needs: compute-test-matrix + permissions: + contents: write + pull-requests: write + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }} + python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }} + + services: + pgvector: + image: pgvector/pgvector:pg17 + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + ports: + - 5432:5432 + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Create pgvector extension + run: > + docker run --rm --network host + -e PGPASSWORD=postgres + pgvector/pgvector:pg17 + psql -h localhost -U postgres -d postgres -c "CREATE EXTENSION IF NOT EXISTS vector;" + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install hatch + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run unit tests + run: hatch run test:unit-cov-retry + + # On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). 
On push to main: stores coverage baseline on data branch. + - name: Store unit tests coverage + id: coverage_comment + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule' + uses: py-cov-action/python-coverage-comment-action@63f52f4fbbffada6e8dee8ec432de7e01df9ba79 # v3.41 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/supabase + SUBPROJECT_ID: supabase + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Upload coverage comment to be posted + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true' + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: coverage-comment-supabase + path: python-coverage-comment-action-supabase.txt + + - name: Run integration tests + if: runner.os == 'Linux' + env: + SUPABASE_DB_URL: "postgresql://postgres:postgres@localhost:5432/postgres" + run: hatch run test:integration-cov-append-retry + + - name: Store combined coverage + if: github.event_name == 'push' + uses: py-cov-action/python-coverage-comment-action@63f52f4fbbffada6e8dee8ec432de7e01df9ba79 # v3.41 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/supabase + SUBPROJECT_ID: supabase-combined + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Run unit tests with lowest direct dependencies + if: github.event_name != 'push' + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + + notify-slack-on-failure: + needs: run + if: 
failure() && github.event_name == 'schedule' + runs-on: ubuntu-slim + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index a9dcd67bfe..8f3de61ed7 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [snowflake-haystack](integrations/snowflake/) | Retriever | [![PyPI - Version](https://img.shields.io/pypi/v/snowflake-haystack.svg)](https://pypi.org/project/snowflake-haystack) | [![Test / snowflake](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/snowflake.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/snowflake.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-snowflake/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-snowflake/htmlcov/index.html) | | | [sqlalchemy-haystack](integrations/sqlalchemy/) | Retriever | [![PyPI - Version](https://img.shields.io/pypi/v/sqlalchemy-haystack.svg)](https://pypi.org/project/sqlalchemy-haystack) | [![Test / sqlalchemy](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/sqlalchemy.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/sqlalchemy.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-sqlalchemy/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-sqlalchemy/htmlcov/index.html) | | | 
[stackit-haystack](integrations/stackit/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/stackit-haystack.svg)](https://pypi.org/project/stackit-haystack) | [![Test / stackit](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/stackit.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/stackit.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-stackit/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-stackit/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-stackit-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-stackit-combined/htmlcov/index.html) | +| [supabase-haystack](integrations/supabase/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/supabase-haystack.svg)](https://pypi.org/project/supabase-haystack) | [![Test / supabase](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/supabase.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/supabase.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-supabase/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-supabase/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-supabase-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-supabase-combined/htmlcov/index.html) | | [tavily-haystack](integrations/tavily/) | Websearch | [![PyPI - Version](https://img.shields.io/pypi/v/tavily-haystack.svg)](https://pypi.org/project/tavily-haystack) | [![Test / tavily](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/tavily.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/tavily.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-tavily/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-tavily/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-tavily-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-tavily-combined/htmlcov/index.html) | | [togetherai-haystack](integrations/togetherai/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/togetherai-haystack.svg)](https://pypi.org/project/togetherai-haystack) | [![Test / togetherai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/togetherai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/togetherai.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-togetherai/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-togetherai/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-togetherai-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-togetherai-combined/htmlcov/index.html) | | [unstructured-fileconverter-haystack](integrations/unstructured/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-unstructured/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-unstructured/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-unstructured-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-unstructured-combined/htmlcov/index.html) | diff --git a/integrations/supabase/LICENSE.txt 
b/integrations/supabase/LICENSE.txt new file mode 100644 index 0000000000..6134ab324f --- /dev/null +++ b/integrations/supabase/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/supabase/README.md b/integrations/supabase/README.md new file mode 100644 index 0000000000..dd23751b18 --- /dev/null +++ b/integrations/supabase/README.md @@ -0,0 +1,12 @@ +# supabase-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/supabase-haystack.svg)](https://pypi.org/project/supabase-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/supabase-haystack.svg)](https://pypi.org/project/supabase-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/supabase/CHANGELOG.md) + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). 
diff --git a/integrations/supabase/docker-compose.yml b/integrations/supabase/docker-compose.yml new file mode 100644 index 0000000000..9b5c301d00 --- /dev/null +++ b/integrations/supabase/docker-compose.yml @@ -0,0 +1,24 @@ +services: + postgres: + image: pgvector/pgvector:pg17 + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + ports: + - "5432:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 10s + timeout: 5s + retries: 5 + + setup: + image: pgvector/pgvector:pg17 + depends_on: + postgres: + condition: service_healthy + environment: + PGPASSWORD: postgres + command: psql -h postgres -U postgres -d postgres -c "CREATE EXTENSION IF NOT EXISTS vector;" + restart: "no" diff --git a/integrations/supabase/pydoc/config_docusaurus.yml b/integrations/supabase/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..2586c79ef7 --- /dev/null +++ b/integrations/supabase/pydoc/config_docusaurus.yml @@ -0,0 +1,15 @@ +loaders: + - modules: + - haystack_integrations.document_stores.supabase.document_store + - haystack_integrations.components.retrievers.supabase.embedding_retriever + - haystack_integrations.components.retrievers.supabase.keyword_retriever + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: Supabase integration for Haystack + id: integrations-supabase + filename: supabase.md + title: Supabase diff --git a/integrations/supabase/pyproject.toml b/integrations/supabase/pyproject.toml new file mode 100644 index 0000000000..02cff19cb6 --- /dev/null +++ b/integrations/supabase/pyproject.toml @@ -0,0 +1,161 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "supabase-haystack" +dynamic = ["version"] +description = "Haystack integration for supabase" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = [] +authors 
= [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = ["haystack-ai>=2.26.1", "pgvector-haystack>=6.3.0"] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/supabase-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." 
+git_describe_command = 'git describe --tags --match="integrations/supabase-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +unit-cov-retry = 'pytest --cov=haystack_integrations.document_stores.supabase --cov=haystack_integrations.components.retrievers.supabase --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' +integration-cov-append-retry = 'pytest --cov=haystack_integrations.document_stores.supabase --cov=haystack_integrations.components.retrievers.supabase --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}' +types = "mypy -p haystack_integrations.document_stores.supabase -p haystack_integrations.components.retrievers.supabase {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D205", # 1 blank line required between summary line and description + "D209", # Closing triple quotes go to new line + "D213", # summary lines must be positioned on the second physical line of the docstring + "D417", # Missing argument descriptions in the docstring + "D419", # Docstring is empty + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + 
"S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var) + "B008", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Allow `Any` type - used legitimately for dynamic types and SDK boundaries + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, relative imports, and don't need type annotations +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] + +[tool.coverage.run] +source = ["haystack_integrations.document_stores.supabase", "haystack_integrations.components.retrievers.supabase"] +branch = true +parallel = false +relative_files = true + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/py.typed b/integrations/supabase/src/haystack_integrations/components/retrievers/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py new file mode 100644 index 0000000000..fdc5a89c23 --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: 2023-present 
deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .embedding_retriever import SupabasePgvectorEmbeddingRetriever +from .keyword_retriever import SupabasePgvectorKeywordRetriever + +__all__ = ["SupabasePgvectorEmbeddingRetriever", "SupabasePgvectorKeywordRetriever"] diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/embedding_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/embedding_retriever.py new file mode 100644 index 0000000000..8e373e476f --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/embedding_retriever.py @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import Any, Literal + +from haystack import component, default_from_dict, default_to_dict +from haystack.document_stores.types import FilterPolicy + +from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + + +@component +class SupabasePgvectorEmbeddingRetriever(PgvectorEmbeddingRetriever): + """ + Retrieves documents from the `SupabasePgvectorDocumentStore`, based on their dense embeddings. + + This is a thin wrapper around `PgvectorEmbeddingRetriever`, adapted for use with + `SupabasePgvectorDocumentStore`. + + Example usage: + + # Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. 
+ ```bash + export SUPABASE_DB_URL=postgresql://postgres:postgres@localhost:5432/postgres + ``` + + + ```python + from haystack import Document, Pipeline + from haystack.document_stores.types.policy import DuplicatePolicy + from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder + + from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + from haystack_integrations.components.retrievers.supabase import SupabasePgvectorEmbeddingRetriever + + document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, + ) + + documents = [Document(content="There are over 7,000 languages spoken around the world today."), + Document(content="Elephants have been observed to behave in a way that indicates..."), + Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")] + + document_embedder = SentenceTransformersDocumentEmbedder() + document_embedder.warm_up() + documents_with_embeddings = document_embedder.run(documents) + document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE) + + query_pipeline = Pipeline() + query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) + query_pipeline.add_component("retriever", SupabasePgvectorEmbeddingRetriever(document_store=document_store)) + query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + + query = "How many languages are there?" + + res = query_pipeline.run({"text_embedder": {"text": query}}) + print(res['retriever']['documents'][0].content) + # >> "There are over 7,000 languages spoken around the world today." 
+ ``` + """ + + def __init__( + self, + *, + document_store: SupabasePgvectorDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] | None = None, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE, + ) -> None: + """ + Initialize the SupabasePgvectorEmbeddingRetriever. + + :param document_store: An instance of `SupabasePgvectorDocumentStore`. + :param filters: Filters applied to the retrieved Documents. + :param top_k: Maximum number of Documents to return. + :param vector_function: The similarity function to use when searching for similar embeddings. + Defaults to the one set in the `document_store` instance. + `"cosine_similarity"` and `"inner_product"` are similarity functions and + higher scores indicate greater similarity between the documents. + `"l2_distance"` returns the straight-line distance between vectors, + and the most similar documents are the ones with the smallest score. + **Important**: if the document store is using the `"hnsw"` search strategy, the vector function + should match the one utilized during index creation to take advantage of the index. + :param filter_policy: Policy to determine how filters are applied. + :raises ValueError: If `document_store` is not an instance of `SupabasePgvectorDocumentStore` or if + `vector_function` is not one of the valid options. + """ + if not isinstance(document_store, SupabasePgvectorDocumentStore): + msg = "document_store must be an instance of SupabasePgvectorDocumentStore" + raise ValueError(msg) + + super(SupabasePgvectorEmbeddingRetriever, self).__init__( # noqa: UP008 + document_store=document_store, + filters=filters, + top_k=top_k, + vector_function=vector_function, + filter_policy=filter_policy, + ) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. 
+ """ + return default_to_dict( + self, + filters=self.filters, + top_k=self.top_k, + vector_function=self.vector_function, + filter_policy=self.filter_policy.value, + document_store=self.document_store.to_dict(), + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SupabasePgvectorEmbeddingRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + data = copy.deepcopy(data) + doc_store_params = data["init_parameters"]["document_store"] + data["init_parameters"]["document_store"] = SupabasePgvectorDocumentStore.from_dict(doc_store_params) + if filter_policy := data["init_parameters"].get("filter_policy"): + data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) + return default_from_dict(cls, data) diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/keyword_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/keyword_retriever.py new file mode 100644 index 0000000000..0b0ec3dcd5 --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/keyword_retriever.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import Any + +from haystack import component, default_from_dict, default_to_dict +from haystack.document_stores.types import FilterPolicy + +from haystack_integrations.components.retrievers.pgvector import PgvectorKeywordRetriever +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + + +@component +class SupabasePgvectorKeywordRetriever(PgvectorKeywordRetriever): + """ + Retrieves documents from the `SupabasePgvectorDocumentStore`, based on keywords. + + This is a thin wrapper around `PgvectorKeywordRetriever`, adapted for use with + `SupabasePgvectorDocumentStore`. 
+ + To rank the documents, the `ts_rank_cd` function of PostgreSQL is used. + It considers how often the query terms appear in the document, how close together the terms are in the document, + and how important is the part of the document where they occur. + + Example usage: + + # Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. + ```bash + export SUPABASE_DB_URL=postgresql://postgres:postgres@localhost:5432/postgres + ``` + + + ```python + from haystack import Document, Pipeline + from haystack.document_stores.types.policy import DuplicatePolicy + + from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + from haystack_integrations.components.retrievers.supabase import SupabasePgvectorKeywordRetriever + + document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + recreate_table=True, + ) + + documents = [Document(content="There are over 7,000 languages spoken around the world today."), + Document(content="Elephants have been observed to behave in a way that indicates..."), + Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")] + + document_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) + retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) + result = retriever.run(query="languages") + + print(result['documents'][0].content) + # >> "There are over 7,000 languages spoken around the world today." + ``` + """ + + def __init__( + self, + *, + document_store: SupabasePgvectorDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE, + ) -> None: + """ + Initialize the SupabasePgvectorKeywordRetriever. + + :param document_store: An instance of `SupabasePgvectorDocumentStore`. + :param filters: Filters applied to the retrieved Documents. + :param top_k: Maximum number of Documents to return. 
+ :param filter_policy: Policy to determine how filters are applied. + :raises ValueError: If `document_store` is not an instance of `SupabasePgvectorDocumentStore`. + """ + if not isinstance(document_store, SupabasePgvectorDocumentStore): + msg = "document_store must be an instance of SupabasePgvectorDocumentStore" + raise ValueError(msg) + + super(SupabasePgvectorKeywordRetriever, self).__init__( # noqa: UP008 + document_store=document_store, + filters=filters, + top_k=top_k, + filter_policy=filter_policy, + ) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + filters=self.filters, + top_k=self.top_k, + filter_policy=self.filter_policy.value, + document_store=self.document_store.to_dict(), + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SupabasePgvectorKeywordRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. 
+ """ + data = copy.deepcopy(data) + doc_store_params = data["init_parameters"]["document_store"] + data["init_parameters"]["document_store"] = SupabasePgvectorDocumentStore.from_dict(doc_store_params) + if filter_policy := data["init_parameters"].get("filter_policy"): + data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) + return default_from_dict(cls, data) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/py.typed b/integrations/supabase/src/haystack_integrations/document_stores/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py new file mode 100644 index 0000000000..7512a97b75 --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from .document_store import SupabasePgvectorDocumentStore + +__all__ = ["SupabasePgvectorDocumentStore"] diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/document_store.py new file mode 100644 index 0000000000..2bf1a33099 --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/document_store.py @@ -0,0 +1,140 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Literal + +from haystack import default_from_dict, default_to_dict +from haystack.utils.auth import Secret, deserialize_secrets_inplace + +from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore + + +class SupabasePgvectorDocumentStore(PgvectorDocumentStore): + """ + A Document Store for Supabase, using PostgreSQL 
with the pgvector extension. + + It should be used with Supabase installed. + + This is a thin wrapper around `PgvectorDocumentStore` with Supabase-specific defaults: + - Reads the connection string from the `SUPABASE_DB_URL` environment variable. + - Defaults `create_extension` to `False` since pgvector is pre-installed on Supabase. + + **Connection notes:** Supabase offers two pooler ports — transaction mode (6543) and session mode (5432). + For best compatibility with pgvector operations, use session mode (port 5432) or a direct connection. + + Example usage: + + # Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. + ```bash + export SUPABASE_DB_URL=postgresql://postgres:postgres@localhost:5432/postgres + ``` + + ```python + from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + + document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, + ) + ``` + """ + + def __init__( + self, + *, + connection_string: Secret = Secret.from_env_var("SUPABASE_DB_URL"), + create_extension: bool = False, + schema_name: str = "public", + table_name: str = "haystack_documents", + language: str = "english", + embedding_dimension: int = 768, + vector_type: Literal["vector", "halfvec"] = "vector", + vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] = "cosine_similarity", + recreate_table: bool = False, + search_strategy: Literal["exact_nearest_neighbor", "hnsw"] = "exact_nearest_neighbor", + hnsw_recreate_index_if_exists: bool = False, + hnsw_index_creation_kwargs: dict[str, int] | None = None, + hnsw_index_name: str = "haystack_hnsw_index", + hnsw_ef_search: int | None = None, + keyword_index_name: str = "haystack_keyword_index", + ) -> None: + """ + Creates a new SupabasePgvectorDocumentStore instance. 
+ + :param connection_string: The connection string for the Supabase PostgreSQL database, defined as an + environment variable. Default: `SUPABASE_DB_URL`. Format: + `postgresql://postgres.[project-ref]:[password]@aws-0-[region].pooler.supabase.com:5432/postgres` + :param create_extension: Whether to create the pgvector extension if it doesn't exist. + Defaults to `False` since Supabase has pgvector pre-installed. + :param schema_name: The name of the schema the table is created in. + :param table_name: The name of the table to use to store Haystack documents. + :param language: The language to be used to parse query and document content in keyword retrieval. + :param embedding_dimension: The dimension of the embedding. + :param vector_type: The type of vector used for embedding storage. `"vector"` or `"halfvec"`. + :param vector_function: The similarity function to use when searching for similar embeddings. + :param recreate_table: Whether to recreate the table if it already exists. + :param search_strategy: The search strategy to use: `"exact_nearest_neighbor"` or `"hnsw"`. + :param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists. + :param hnsw_index_creation_kwargs: Additional keyword arguments for HNSW index creation. + :param hnsw_index_name: Index name for the HNSW index. + :param hnsw_ef_search: The `ef_search` parameter to use at query time for HNSW. + :param keyword_index_name: Index name for the Keyword index. 
+ """ + super().__init__( + connection_string=connection_string, + create_extension=create_extension, + schema_name=schema_name, + table_name=table_name, + language=language, + embedding_dimension=embedding_dimension, + vector_type=vector_type, + vector_function=vector_function, + recreate_table=recreate_table, + search_strategy=search_strategy, + hnsw_recreate_index_if_exists=hnsw_recreate_index_if_exists, + hnsw_index_creation_kwargs=hnsw_index_creation_kwargs, + hnsw_index_name=hnsw_index_name, + hnsw_ef_search=hnsw_ef_search, + keyword_index_name=keyword_index_name, + ) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + connection_string=self.connection_string.to_dict(), + create_extension=self.create_extension, + schema_name=self.schema_name, + table_name=self.table_name, + embedding_dimension=self.embedding_dimension, + vector_type=self.vector_type, + vector_function=self.vector_function, + recreate_table=self.recreate_table, + search_strategy=self.search_strategy, + hnsw_recreate_index_if_exists=self.hnsw_recreate_index_if_exists, + hnsw_index_creation_kwargs=self.hnsw_index_creation_kwargs, + hnsw_index_name=self.hnsw_index_name, + hnsw_ef_search=self.hnsw_ef_search, + keyword_index_name=self.keyword_index_name, + language=self.language, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SupabasePgvectorDocumentStore": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. 
+ """ + deserialize_secrets_inplace(data["init_parameters"], ["connection_string"]) + return default_from_dict(cls, data) diff --git a/integrations/supabase/tests/__init__.py b/integrations/supabase/tests/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/integrations/supabase/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/supabase/tests/conftest.py b/integrations/supabase/tests/conftest.py new file mode 100644 index 0000000000..4f83fe6e02 --- /dev/null +++ b/integrations/supabase/tests/conftest.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os +from unittest.mock import patch + +import pytest + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + +SUPABASE_DB_URL = os.environ.get("SUPABASE_DB_URL", "postgresql://postgres:postgres@localhost:5432/postgres") + + +@pytest.fixture +def document_store(request, monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", SUPABASE_DB_URL) + table_name = f"haystack_{request.node.name}" + embedding_dimension = 768 + vector_function = "cosine_similarity" + recreate_table = True + search_strategy = "exact_nearest_neighbor" + + store = SupabasePgvectorDocumentStore( + table_name=table_name, + embedding_dimension=embedding_dimension, + vector_function=vector_function, + recreate_table=recreate_table, + search_strategy=search_strategy, + ) + + yield store + + store._ensure_db_setup() + store.delete_table() + + +@pytest.fixture +def document_store_w_hnsw_index(request, monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", SUPABASE_DB_URL) + table_name = f"haystack_hnsw_{request.node.name}" + embedding_dimension = 768 + vector_function = "cosine_similarity" + recreate_table = True + search_strategy = "hnsw" + + store = SupabasePgvectorDocumentStore( + table_name=table_name, + 
embedding_dimension=embedding_dimension, + vector_function=vector_function, + recreate_table=recreate_table, + search_strategy=search_strategy, + ) + yield store + + store._ensure_db_setup() + store.delete_table() + + +@pytest.fixture +def patches_for_unit_tests(): + with ( + patch("haystack_integrations.document_stores.pgvector.document_store.register_vector") as mock_register, + patch( + "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore.delete_table" + ) as mock_delete, + patch( + "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore._handle_hnsw" + ) as mock_hnsw, + ): + yield mock_register, mock_delete, mock_hnsw + + +@pytest.fixture +def mock_store(patches_for_unit_tests, monkeypatch): # noqa: ARG001 patches are not explicitly called but necessary + monkeypatch.setenv("SUPABASE_DB_URL", "some-connection-string") + table_name = "haystack" + embedding_dimension = 768 + vector_function = "cosine_similarity" + recreate_table = True + search_strategy = "exact_nearest_neighbor" + + store = SupabasePgvectorDocumentStore( + table_name=table_name, + embedding_dimension=embedding_dimension, + vector_function=vector_function, + recreate_table=recreate_table, + search_strategy=search_strategy, + ) + + yield store diff --git a/integrations/supabase/tests/test_document_store.py b/integrations/supabase/tests/test_document_store.py new file mode 100644 index 0000000000..78d86d9115 --- /dev/null +++ b/integrations/supabase/tests/test_document_store.py @@ -0,0 +1,214 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from haystack.dataclasses.document import ByteStream, Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy +from haystack.testing.document_store import ( + CountDocumentsByFilterTest, + CountDocumentsTest, + CountUniqueMetadataByFilterTest, + 
DeleteAllTest, + DeleteByFilterTest, + DeleteDocumentsTest, + FilterableDocsFixtureMixin, + GetMetadataFieldMinMaxTest, + GetMetadataFieldsInfoTest, + GetMetadataFieldUniqueValuesTest, + UpdateByFilterTest, + WriteDocumentsTest, +) + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + + +@pytest.mark.integration +class TestDocumentStore( + CountDocumentsTest, + DeleteAllTest, + DeleteByFilterTest, + DeleteDocumentsTest, + FilterableDocsFixtureMixin, + UpdateByFilterTest, + WriteDocumentsTest, + CountDocumentsByFilterTest, + CountUniqueMetadataByFilterTest, + GetMetadataFieldsInfoTest, + GetMetadataFieldMinMaxTest, + GetMetadataFieldUniqueValuesTest, +): + def test_get_metadata_fields_info_empty_collection(self, document_store: SupabasePgvectorDocumentStore): + """SupabasePgvectorDocumentStore always includes 'content' in fields info, even for empty stores.""" + assert document_store.count_documents() == 0 + + fields_info = document_store.get_metadata_fields_info() + assert fields_info == {"content": {"type": "text"}} + + def test_get_metadata_field_min_max_empty_collection(self, document_store: SupabasePgvectorDocumentStore): + """SupabasePgvectorDocumentStore raises ValueError when the field doesn't exist in the store.""" + assert document_store.count_documents() == 0 + + with pytest.raises(ValueError, match="not found in document store"): + document_store.get_metadata_field_min_max("priority") + + def test_write_documents(self, document_store: SupabasePgvectorDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + with pytest.raises(DuplicateDocumentError): + document_store.write_documents(docs, DuplicatePolicy.FAIL) + + def test_write_blob(self, document_store: SupabasePgvectorDocumentStore): + bytestream = ByteStream(b"test", meta={"meta_key": "meta_value"}, mime_type="mime_type") + docs = [Document(id="1", blob=bytestream)] + document_store.write_documents(docs) + + retrieved_docs 
= document_store.filter_documents() + assert retrieved_docs == docs + + +@pytest.mark.integration +def test_delete_table_first_call(document_store): + """ + Test that delete_table can be executed as the initial operation on the Document Store + without triggering errors due to an uninitialized state. + """ + document_store.delete_table() + + +@pytest.mark.usefixtures("patches_for_unit_tests") +def test_init(monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", "some_connection_string") + + document_store = SupabasePgvectorDocumentStore( + create_extension=True, + schema_name="my_schema", + table_name="my_table", + language="spanish", + embedding_dimension=512, + vector_type="halfvec", + vector_function="l2_distance", + recreate_table=True, + search_strategy="hnsw", + hnsw_recreate_index_if_exists=True, + hnsw_index_creation_kwargs={"m": 32, "ef_construction": 128}, + hnsw_index_name="my_hnsw_index", + hnsw_ef_search=50, + keyword_index_name="my_keyword_index", + ) + + assert document_store.create_extension + assert document_store.schema_name == "my_schema" + assert document_store.table_name == "my_table" + assert document_store.language == "spanish" + assert document_store.embedding_dimension == 512 + assert document_store.vector_type == "halfvec" + assert document_store.vector_function == "l2_distance" + assert document_store.recreate_table + assert document_store.search_strategy == "hnsw" + assert document_store.hnsw_recreate_index_if_exists + assert document_store.hnsw_index_creation_kwargs == {"m": 32, "ef_construction": 128} + assert document_store.hnsw_index_name == "my_hnsw_index" + assert document_store.hnsw_ef_search == 50 + assert document_store.keyword_index_name == "my_keyword_index" + + +@pytest.mark.usefixtures("patches_for_unit_tests") +def test_init_defaults(monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", "some_connection_string") + + document_store = SupabasePgvectorDocumentStore() + + assert not document_store.create_extension + assert 
document_store.schema_name == "public" + assert document_store.table_name == "haystack_documents" + assert document_store.language == "english" + assert document_store.embedding_dimension == 768 + assert document_store.vector_type == "vector" + assert document_store.vector_function == "cosine_similarity" + assert not document_store.recreate_table + assert document_store.search_strategy == "exact_nearest_neighbor" + + +@pytest.mark.usefixtures("patches_for_unit_tests") +def test_to_dict(monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", "some_connection_string") + + document_store = SupabasePgvectorDocumentStore( + table_name="my_table", + embedding_dimension=512, + vector_type="halfvec", + vector_function="l2_distance", + recreate_table=True, + search_strategy="hnsw", + hnsw_recreate_index_if_exists=True, + hnsw_index_creation_kwargs={"m": 32, "ef_construction": 128}, + hnsw_index_name="my_hnsw_index", + hnsw_ef_search=50, + keyword_index_name="my_keyword_index", + ) + + assert document_store.to_dict() == { + "type": "haystack_integrations.document_stores.supabase.document_store.SupabasePgvectorDocumentStore", + "init_parameters": { + "connection_string": {"env_vars": ["SUPABASE_DB_URL"], "strict": True, "type": "env_var"}, + "create_extension": False, + "table_name": "my_table", + "schema_name": "public", + "embedding_dimension": 512, + "vector_type": "halfvec", + "vector_function": "l2_distance", + "recreate_table": True, + "search_strategy": "hnsw", + "hnsw_recreate_index_if_exists": True, + "language": "english", + "hnsw_index_creation_kwargs": {"m": 32, "ef_construction": 128}, + "hnsw_index_name": "my_hnsw_index", + "hnsw_ef_search": 50, + "keyword_index_name": "my_keyword_index", + }, + } + + +@pytest.mark.usefixtures("patches_for_unit_tests") +def test_from_dict(monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", "some_connection_string") + + data = { + "type": 
"haystack_integrations.document_stores.supabase.document_store.SupabasePgvectorDocumentStore", + "init_parameters": { + "connection_string": {"env_vars": ["SUPABASE_DB_URL"], "strict": True, "type": "env_var"}, + "create_extension": False, + "table_name": "my_table", + "schema_name": "public", + "embedding_dimension": 512, + "vector_type": "halfvec", + "vector_function": "l2_distance", + "recreate_table": True, + "search_strategy": "hnsw", + "hnsw_recreate_index_if_exists": True, + "language": "english", + "hnsw_index_creation_kwargs": {"m": 32, "ef_construction": 128}, + "hnsw_index_name": "my_hnsw_index", + "hnsw_ef_search": 50, + "keyword_index_name": "my_keyword_index", + }, + } + + document_store = SupabasePgvectorDocumentStore.from_dict(data) + + assert isinstance(document_store, SupabasePgvectorDocumentStore) + assert not document_store.create_extension + assert document_store.table_name == "my_table" + assert document_store.schema_name == "public" + assert document_store.embedding_dimension == 512 + assert document_store.vector_type == "halfvec" + assert document_store.vector_function == "l2_distance" + assert document_store.recreate_table + assert document_store.search_strategy == "hnsw" + assert document_store.hnsw_recreate_index_if_exists + assert document_store.hnsw_index_creation_kwargs == {"m": 32, "ef_construction": 128} + assert document_store.hnsw_index_name == "my_hnsw_index" + assert document_store.hnsw_ef_search == 50 + assert document_store.keyword_index_name == "my_keyword_index" diff --git a/integrations/supabase/tests/test_embedding_retriever.py b/integrations/supabase/tests/test_embedding_retriever.py new file mode 100644 index 0000000000..268fcf0b07 --- /dev/null +++ b/integrations/supabase/tests/test_embedding_retriever.py @@ -0,0 +1,375 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import Mock + +import pytest +from haystack.dataclasses import Document +from 
haystack.document_stores.types import FilterPolicy +from haystack.utils.auth import EnvVarSecret +from numpy.random import rand + +from haystack_integrations.components.retrievers.supabase import SupabasePgvectorEmbeddingRetriever +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + + +def test_init_default(mock_store): + retriever = SupabasePgvectorEmbeddingRetriever(document_store=mock_store) + assert retriever.document_store == mock_store + assert retriever.filters == {} + assert retriever.top_k == 10 + assert retriever.filter_policy == FilterPolicy.REPLACE + assert retriever.vector_function == mock_store.vector_function + + +def test_init_filter_policy_string(mock_store): + retriever = SupabasePgvectorEmbeddingRetriever(document_store=mock_store, filter_policy="merge") + assert retriever.filter_policy == FilterPolicy.MERGE + + +def test_init_invalid_filter_policy(mock_store): + with pytest.raises(ValueError): + SupabasePgvectorEmbeddingRetriever(document_store=mock_store, filter_policy="invalid") + + +def test_init(mock_store): + retriever = SupabasePgvectorEmbeddingRetriever( + document_store=mock_store, filters={"field": "value"}, top_k=5, vector_function="l2_distance" + ) + assert retriever.document_store == mock_store + assert retriever.filters == {"field": "value"} + assert retriever.top_k == 5 + assert retriever.filter_policy == FilterPolicy.REPLACE + assert retriever.vector_function == "l2_distance" + + +def test_init_invalid_document_store(): + with pytest.raises(ValueError, match="must be an instance of SupabasePgvectorDocumentStore"): + SupabasePgvectorEmbeddingRetriever(document_store="not a store") + + +def test_to_dict(mock_store): + retriever = SupabasePgvectorEmbeddingRetriever( + document_store=mock_store, filters={"field": "value"}, top_k=5, vector_function="l2_distance" + ) + res = retriever.to_dict() + t = 
"haystack_integrations.components.retrievers.supabase.embedding_retriever.SupabasePgvectorEmbeddingRetriever" + assert res == { + "type": t, + "init_parameters": { + "document_store": { + "type": ("haystack_integrations.document_stores.supabase.document_store.SupabasePgvectorDocumentStore"), + "init_parameters": { + "connection_string": {"env_vars": ["SUPABASE_DB_URL"], "strict": True, "type": "env_var"}, + "create_extension": False, + "schema_name": "public", + "table_name": "haystack", + "embedding_dimension": 768, + "vector_type": "vector", + "vector_function": "cosine_similarity", + "recreate_table": True, + "search_strategy": "exact_nearest_neighbor", + "hnsw_recreate_index_if_exists": False, + "language": "english", + "hnsw_index_creation_kwargs": {}, + "hnsw_index_name": "haystack_hnsw_index", + "hnsw_ef_search": None, + "keyword_index_name": "haystack_keyword_index", + }, + }, + "filters": {"field": "value"}, + "top_k": 5, + "vector_function": "l2_distance", + "filter_policy": "replace", + }, + } + + +@pytest.mark.usefixtures("patches_for_unit_tests") +def test_from_dict(monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", "some-connection-string") + t = "haystack_integrations.components.retrievers.supabase.embedding_retriever.SupabasePgvectorEmbeddingRetriever" + data = { + "type": t, + "init_parameters": { + "document_store": { + "type": ("haystack_integrations.document_stores.supabase.document_store.SupabasePgvectorDocumentStore"), + "init_parameters": { + "connection_string": {"env_vars": ["SUPABASE_DB_URL"], "strict": True, "type": "env_var"}, + "create_extension": False, + "table_name": "haystack_test_to_dict", + "embedding_dimension": 768, + "vector_function": "cosine_similarity", + "recreate_table": True, + "search_strategy": "exact_nearest_neighbor", + "hnsw_recreate_index_if_exists": False, + "hnsw_index_creation_kwargs": {}, + "hnsw_index_name": "haystack_hnsw_index", + "hnsw_ef_search": None, + "keyword_index_name": "haystack_keyword_index", + 
}, + }, + "filters": {"field": "value"}, + "top_k": 5, + "vector_function": "l2_distance", + "filter_policy": "replace", + }, + } + + retriever = SupabasePgvectorEmbeddingRetriever.from_dict(data) + document_store = retriever.document_store + + assert isinstance(document_store, SupabasePgvectorDocumentStore) + assert isinstance(document_store.connection_string, EnvVarSecret) + assert not document_store.create_extension + assert document_store.table_name == "haystack_test_to_dict" + assert document_store.embedding_dimension == 768 + assert document_store.vector_function == "cosine_similarity" + assert document_store.recreate_table + assert document_store.search_strategy == "exact_nearest_neighbor" + assert not document_store.hnsw_recreate_index_if_exists + assert document_store.hnsw_index_creation_kwargs == {} + assert document_store.hnsw_index_name == "haystack_hnsw_index" + assert document_store.hnsw_ef_search is None + assert document_store.keyword_index_name == "haystack_keyword_index" + + assert retriever.filters == {"field": "value"} + assert retriever.top_k == 5 + assert retriever.filter_policy == FilterPolicy.REPLACE + assert retriever.vector_function == "l2_distance" + + +@pytest.mark.usefixtures("patches_for_unit_tests") +def test_from_dict_without_filter_policy(monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", "some-connection-string") + t = "haystack_integrations.components.retrievers.supabase.embedding_retriever.SupabasePgvectorEmbeddingRetriever" + data = { + "type": t, + "init_parameters": { + "document_store": { + "type": ("haystack_integrations.document_stores.supabase.document_store.SupabasePgvectorDocumentStore"), + "init_parameters": { + "connection_string": {"env_vars": ["SUPABASE_DB_URL"], "strict": True, "type": "env_var"}, + "create_extension": False, + "table_name": "haystack_test_to_dict", + "embedding_dimension": 768, + "vector_function": "cosine_similarity", + "recreate_table": True, + "search_strategy": "exact_nearest_neighbor", + 
"hnsw_recreate_index_if_exists": False, + "hnsw_index_creation_kwargs": {}, + "hnsw_index_name": "haystack_hnsw_index", + "hnsw_ef_search": None, + "keyword_index_name": "haystack_keyword_index", + }, + }, + "filters": {"field": "value"}, + "top_k": 5, + "vector_function": "l2_distance", + }, + } + + retriever = SupabasePgvectorEmbeddingRetriever.from_dict(data) + document_store = retriever.document_store + + assert isinstance(document_store, SupabasePgvectorDocumentStore) + assert isinstance(document_store.connection_string, EnvVarSecret) + assert document_store.table_name == "haystack_test_to_dict" + assert document_store.embedding_dimension == 768 + assert document_store.vector_function == "cosine_similarity" + assert document_store.recreate_table + assert document_store.search_strategy == "exact_nearest_neighbor" + assert not document_store.hnsw_recreate_index_if_exists + assert document_store.hnsw_index_creation_kwargs == {} + assert document_store.hnsw_index_name == "haystack_hnsw_index" + assert document_store.hnsw_ef_search is None + assert document_store.keyword_index_name == "haystack_keyword_index" + + assert retriever.filters == {"field": "value"} + assert retriever.top_k == 5 + assert retriever.filter_policy == FilterPolicy.REPLACE # defaults to REPLACE + assert retriever.vector_function == "l2_distance" + + +def test_run(): + mock_store = Mock(spec=SupabasePgvectorDocumentStore) + mock_store.vector_function = "cosine_similarity" + doc = Document(content="Test doc", embedding=[0.1, 0.2]) + mock_store._embedding_retrieval.return_value = [doc] + + retriever = SupabasePgvectorEmbeddingRetriever(document_store=mock_store, vector_function="l2_distance") + res = retriever.run(query_embedding=[0.3, 0.5]) + + mock_store._embedding_retrieval.assert_called_once_with( + query_embedding=[0.3, 0.5], filters={}, top_k=10, vector_function="l2_distance" + ) + assert res == {"documents": [doc]} + + +@pytest.mark.asyncio +async def test_run_async(): + mock_store = 
Mock(spec=SupabasePgvectorDocumentStore) + mock_store.vector_function = "cosine_similarity" + doc = Document(content="Test doc", embedding=[0.1, 0.2]) + mock_store._embedding_retrieval_async.return_value = [doc] + + retriever = SupabasePgvectorEmbeddingRetriever(document_store=mock_store, vector_function="l2_distance") + res = await retriever.run_async(query_embedding=[0.3, 0.5]) + + mock_store._embedding_retrieval_async.assert_called_once_with( + query_embedding=[0.3, 0.5], filters={}, top_k=10, vector_function="l2_distance" + ) + assert res == {"documents": [doc]} + + +def test_run_with_filters(): + mock_store = Mock(spec=SupabasePgvectorDocumentStore) + mock_store.vector_function = "cosine_similarity" + doc = Document(content="Test doc", embedding=[0.1, 0.2]) + mock_store._embedding_retrieval.return_value = [doc] + + init_filter = {"field": "meta.category", "operator": "==", "value": "news"} + runtime_filter = {"field": "meta.score", "operator": ">", "value": 0.5} + merged_filter = {"operator": "AND", "conditions": [init_filter, runtime_filter]} + + retriever = SupabasePgvectorEmbeddingRetriever( + document_store=mock_store, filter_policy=FilterPolicy.MERGE, filters=init_filter, vector_function="l2_distance" + ) + res = retriever.run(query_embedding=[0.3, 0.5], filters=runtime_filter) + + mock_store._embedding_retrieval.assert_called_once_with( + query_embedding=[0.3, 0.5], filters=merged_filter, top_k=10, vector_function="l2_distance" + ) + assert res == {"documents": [doc]} + + +@pytest.mark.asyncio +async def test_run_async_with_filters(): + mock_store = Mock(spec=SupabasePgvectorDocumentStore) + mock_store.vector_function = "cosine_similarity" + doc = Document(content="Test doc", embedding=[0.1, 0.2]) + mock_store._embedding_retrieval_async.return_value = [doc] + + init_filter = {"field": "meta.category", "operator": "==", "value": "news"} + runtime_filter = {"field": "meta.score", "operator": ">", "value": 0.5} + merged_filter = {"operator": "AND", 
"conditions": [init_filter, runtime_filter]} + + retriever = SupabasePgvectorEmbeddingRetriever( + document_store=mock_store, filter_policy=FilterPolicy.MERGE, filters=init_filter, vector_function="l2_distance" + ) + res = await retriever.run_async(query_embedding=[0.3, 0.5], filters=runtime_filter) + + mock_store._embedding_retrieval_async.assert_called_once_with( + query_embedding=[0.3, 0.5], filters=merged_filter, top_k=10, vector_function="l2_distance" + ) + assert res == {"documents": [doc]} + + +@pytest.mark.integration +@pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True) +def test_embedding_retrieval_cosine_similarity(document_store: SupabasePgvectorDocumentStore): + query_embedding = [0.1] * 768 + most_similar_embedding = [0.8] * 768 + second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65 + another_embedding = rand(768).tolist() + + docs = [ + Document(content="Most similar document (cosine sim)", embedding=most_similar_embedding), + Document(content="2nd best document (cosine sim)", embedding=second_best_embedding), + Document(content="Not very similar document (cosine sim)", embedding=another_embedding), + ] + + document_store.write_documents(docs) + + results = document_store._embedding_retrieval( + query_embedding=query_embedding, top_k=2, filters={}, vector_function="cosine_similarity" + ) + assert len(results) == 2 + assert results[0].content == "Most similar document (cosine sim)" + assert results[1].content == "2nd best document (cosine sim)" + assert results[0].score > results[1].score + + +@pytest.mark.integration +@pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True) +def test_embedding_retrieval_inner_product(document_store: SupabasePgvectorDocumentStore): + query_embedding = [0.1] * 768 + most_similar_embedding = [0.8] * 768 + second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65 + another_embedding = rand(768).tolist() 
+ + docs = [ + Document(content="Most similar document (inner product)", embedding=most_similar_embedding), + Document(content="2nd best document (inner product)", embedding=second_best_embedding), + Document(content="Not very similar document (inner product)", embedding=another_embedding), + ] + + document_store.write_documents(docs) + + results = document_store._embedding_retrieval( + query_embedding=query_embedding, top_k=2, filters={}, vector_function="inner_product" + ) + assert len(results) == 2 + assert results[0].content == "Most similar document (inner product)" + assert results[1].content == "2nd best document (inner product)" + assert results[0].score > results[1].score + + +@pytest.mark.integration +@pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True) +def test_embedding_retrieval_l2_distance(document_store: SupabasePgvectorDocumentStore): + query_embedding = [0.1] * 768 + most_similar_embedding = [0.1] * 765 + [0.15] * 3 + second_best_embedding = [0.1] * 700 + [0.1] * 3 + [0.2] * 65 + another_embedding = rand(768).tolist() + + docs = [ + Document(content="Most similar document (l2 dist)", embedding=most_similar_embedding), + Document(content="2nd best document (l2 dist)", embedding=second_best_embedding), + Document(content="Not very similar document (l2 dist)", embedding=another_embedding), + ] + + document_store.write_documents(docs) + + results = document_store._embedding_retrieval( + query_embedding=query_embedding, top_k=2, filters={}, vector_function="l2_distance" + ) + assert len(results) == 2 + assert results[0].content == "Most similar document (l2 dist)" + assert results[1].content == "2nd best document (l2 dist)" + assert results[0].score < results[1].score + + +@pytest.mark.integration +@pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True) +def test_embedding_retrieval_with_filters(document_store: SupabasePgvectorDocumentStore): + 
docs = [Document(content=f"Document {i}", embedding=rand(768).tolist()) for i in range(10)] + + for i in range(10): + docs[i].meta["meta_field"] = "custom_value" if i % 2 == 0 else "other_value" + + document_store.write_documents(docs) + + query_embedding = [0.1] * 768 + filters = {"field": "meta.meta_field", "operator": "==", "value": "custom_value"} + + results = document_store._embedding_retrieval(query_embedding=query_embedding, top_k=3, filters=filters) + assert len(results) == 3 + for result in results: + assert result.meta["meta_field"] == "custom_value" + assert results[0].score > results[1].score > results[2].score + + +@pytest.mark.integration +def test_empty_query_embedding(document_store: SupabasePgvectorDocumentStore): + query_embedding: list[float] = [] + with pytest.raises(ValueError): + document_store._embedding_retrieval(query_embedding=query_embedding) + + +@pytest.mark.integration +def test_query_embedding_wrong_dimension(document_store: SupabasePgvectorDocumentStore): + query_embedding = [0.1] * 4 + with pytest.raises(ValueError): + document_store._embedding_retrieval(query_embedding=query_embedding) diff --git a/integrations/supabase/tests/test_filters.py b/integrations/supabase/tests/test_filters.py new file mode 100644 index 0000000000..471aa5d239 --- /dev/null +++ b/integrations/supabase/tests/test_filters.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import replace + +import pytest +from haystack.dataclasses.document import Document +from haystack.testing.document_store import FilterDocumentsTest + + +@pytest.mark.integration +class TestFilters(FilterDocumentsTest): + def assert_documents_are_equal(self, received: list[Document], expected: list[Document]): + """ + Override to handle floating-point precision differences when embeddings are retrieved from PostgreSQL. 
+ """ + assert len(received) == len(expected) + received.sort(key=lambda x: x.id) + expected.sort(key=lambda x: x.id) + for received_doc, expected_doc in zip(received, expected, strict=True): + if received_doc.embedding is None: + assert expected_doc.embedding is None + else: + assert received_doc.embedding == pytest.approx(expected_doc.embedding) + + assert replace(received_doc, embedding=None) == replace(expected_doc, embedding=None) + + @pytest.mark.skip(reason="NOT operator is not supported in PgvectorDocumentStore") + def test_not_operator(self, document_store, filterable_docs): ... diff --git a/integrations/supabase/tests/test_keyword_retriever.py b/integrations/supabase/tests/test_keyword_retriever.py new file mode 100644 index 0000000000..ab87744fde --- /dev/null +++ b/integrations/supabase/tests/test_keyword_retriever.py @@ -0,0 +1,300 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import Mock + +import pytest +from haystack.dataclasses import Document +from haystack.document_stores.types import FilterPolicy +from haystack.utils.auth import EnvVarSecret + +from haystack_integrations.components.retrievers.supabase import SupabasePgvectorKeywordRetriever +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + + +def test_init_default(mock_store): + retriever = SupabasePgvectorKeywordRetriever(document_store=mock_store) + assert retriever.document_store == mock_store + assert retriever.filters == {} + assert retriever.top_k == 10 + assert retriever.filter_policy == FilterPolicy.REPLACE + + +def test_init_filter_policy_string(mock_store): + retriever = SupabasePgvectorKeywordRetriever(document_store=mock_store, filter_policy="merge") + assert retriever.filter_policy == FilterPolicy.MERGE + + +def test_init_invalid_filter_policy(mock_store): + with pytest.raises(ValueError): + SupabasePgvectorKeywordRetriever(document_store=mock_store, 
filter_policy="invalid") + + +def test_init(mock_store): + retriever = SupabasePgvectorKeywordRetriever(document_store=mock_store, filters={"field": "value"}, top_k=5) + assert retriever.document_store == mock_store + assert retriever.filters == {"field": "value"} + assert retriever.top_k == 5 + + +def test_init_with_filter_policy(mock_store): + retriever = SupabasePgvectorKeywordRetriever( + document_store=mock_store, filters={"field": "value"}, top_k=5, filter_policy=FilterPolicy.MERGE + ) + assert retriever.document_store == mock_store + assert retriever.filters == {"field": "value"} + assert retriever.top_k == 5 + assert retriever.filter_policy == FilterPolicy.MERGE + + +def test_init_invalid_document_store(): + with pytest.raises(ValueError, match="must be an instance of SupabasePgvectorDocumentStore"): + SupabasePgvectorKeywordRetriever(document_store="not a store") + + +def test_to_dict(mock_store): + retriever = SupabasePgvectorKeywordRetriever(document_store=mock_store, filters={"field": "value"}, top_k=5) + res = retriever.to_dict() + t = "haystack_integrations.components.retrievers.supabase.keyword_retriever.SupabasePgvectorKeywordRetriever" + assert res == { + "type": t, + "init_parameters": { + "document_store": { + "type": ("haystack_integrations.document_stores.supabase.document_store.SupabasePgvectorDocumentStore"), + "init_parameters": { + "connection_string": {"env_vars": ["SUPABASE_DB_URL"], "strict": True, "type": "env_var"}, + "create_extension": False, + "schema_name": "public", + "table_name": "haystack", + "embedding_dimension": 768, + "vector_type": "vector", + "vector_function": "cosine_similarity", + "recreate_table": True, + "search_strategy": "exact_nearest_neighbor", + "hnsw_recreate_index_if_exists": False, + "language": "english", + "hnsw_index_creation_kwargs": {}, + "hnsw_index_name": "haystack_hnsw_index", + "hnsw_ef_search": None, + "keyword_index_name": "haystack_keyword_index", + }, + }, + "filters": {"field": "value"}, + 
"top_k": 5, + "filter_policy": "replace", + }, + } + + +@pytest.mark.usefixtures("patches_for_unit_tests") +def test_from_dict(monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", "some-connection-string") + t = "haystack_integrations.components.retrievers.supabase.keyword_retriever.SupabasePgvectorKeywordRetriever" + data = { + "type": t, + "init_parameters": { + "document_store": { + "type": ("haystack_integrations.document_stores.supabase.document_store.SupabasePgvectorDocumentStore"), + "init_parameters": { + "connection_string": {"env_vars": ["SUPABASE_DB_URL"], "strict": True, "type": "env_var"}, + "create_extension": False, + "table_name": "haystack_test_to_dict", + "embedding_dimension": 768, + "vector_function": "cosine_similarity", + "recreate_table": True, + "search_strategy": "exact_nearest_neighbor", + "hnsw_recreate_index_if_exists": False, + "hnsw_index_creation_kwargs": {}, + "hnsw_index_name": "haystack_hnsw_index", + "hnsw_ef_search": None, + "keyword_index_name": "haystack_keyword_index", + }, + }, + "filters": {"field": "value"}, + "top_k": 5, + "filter_policy": "replace", + }, + } + + retriever = SupabasePgvectorKeywordRetriever.from_dict(data) + document_store = retriever.document_store + + assert isinstance(document_store, SupabasePgvectorDocumentStore) + assert isinstance(document_store.connection_string, EnvVarSecret) + assert not document_store.create_extension + assert document_store.table_name == "haystack_test_to_dict" + assert document_store.embedding_dimension == 768 + assert document_store.vector_function == "cosine_similarity" + assert document_store.recreate_table + assert document_store.search_strategy == "exact_nearest_neighbor" + assert not document_store.hnsw_recreate_index_if_exists + assert document_store.hnsw_index_creation_kwargs == {} + assert document_store.hnsw_index_name == "haystack_hnsw_index" + assert document_store.hnsw_ef_search is None + assert document_store.keyword_index_name == "haystack_keyword_index" + + 
assert retriever.filters == {"field": "value"} + assert retriever.top_k == 5 + assert retriever.filter_policy == FilterPolicy.REPLACE + + +@pytest.mark.usefixtures("patches_for_unit_tests") +def test_from_dict_without_filter_policy(monkeypatch): + monkeypatch.setenv("SUPABASE_DB_URL", "some-connection-string") + t = "haystack_integrations.components.retrievers.supabase.keyword_retriever.SupabasePgvectorKeywordRetriever" + data = { + "type": t, + "init_parameters": { + "document_store": { + "type": ("haystack_integrations.document_stores.supabase.document_store.SupabasePgvectorDocumentStore"), + "init_parameters": { + "connection_string": {"env_vars": ["SUPABASE_DB_URL"], "strict": True, "type": "env_var"}, + "table_name": "haystack_test_to_dict", + "embedding_dimension": 768, + "vector_function": "cosine_similarity", + "recreate_table": True, + "search_strategy": "exact_nearest_neighbor", + "hnsw_recreate_index_if_exists": False, + "hnsw_index_creation_kwargs": {}, + "hnsw_index_name": "haystack_hnsw_index", + "hnsw_ef_search": None, + "keyword_index_name": "haystack_keyword_index", + }, + }, + "filters": {"field": "value"}, + "top_k": 5, + }, + } + + retriever = SupabasePgvectorKeywordRetriever.from_dict(data) + document_store = retriever.document_store + + assert isinstance(document_store, SupabasePgvectorDocumentStore) + assert isinstance(document_store.connection_string, EnvVarSecret) + assert document_store.table_name == "haystack_test_to_dict" + assert document_store.embedding_dimension == 768 + assert document_store.vector_function == "cosine_similarity" + assert document_store.recreate_table + assert document_store.search_strategy == "exact_nearest_neighbor" + assert not document_store.hnsw_recreate_index_if_exists + assert document_store.hnsw_index_creation_kwargs == {} + assert document_store.hnsw_index_name == "haystack_hnsw_index" + assert document_store.hnsw_ef_search is None + assert document_store.keyword_index_name == "haystack_keyword_index" + + 
assert retriever.filters == {"field": "value"} + assert retriever.filter_policy == FilterPolicy.REPLACE # defaults to REPLACE + assert retriever.top_k == 5 + + +def test_run(): + mock_store = Mock(spec=SupabasePgvectorDocumentStore) + doc = Document(content="Test doc", embedding=[0.1, 0.2]) + mock_store._keyword_retrieval.return_value = [doc] + + retriever = SupabasePgvectorKeywordRetriever(document_store=mock_store) + res = retriever.run(query="test query") + + mock_store._keyword_retrieval.assert_called_once_with(query="test query", filters={}, top_k=10) + assert res == {"documents": [doc]} + + +@pytest.mark.asyncio +async def test_run_async(): + mock_store = Mock(spec=SupabasePgvectorDocumentStore) + doc = Document(content="Test doc", embedding=[0.1, 0.2]) + mock_store._keyword_retrieval_async.return_value = [doc] + + retriever = SupabasePgvectorKeywordRetriever(document_store=mock_store) + res = await retriever.run_async(query="test query") + + mock_store._keyword_retrieval_async.assert_called_once_with(query="test query", filters={}, top_k=10) + assert res == {"documents": [doc]} + + +def test_run_with_filters(): + mock_store = Mock(spec=SupabasePgvectorDocumentStore) + doc = Document(content="Test doc", embedding=[0.1, 0.2]) + mock_store._keyword_retrieval.return_value = [doc] + + init_filter = {"field": "meta.category", "operator": "==", "value": "news"} + runtime_filter = {"field": "meta.score", "operator": ">", "value": 0.5} + merged_filter = {"operator": "AND", "conditions": [init_filter, runtime_filter]} + + retriever = SupabasePgvectorKeywordRetriever( + document_store=mock_store, filter_policy=FilterPolicy.MERGE, filters=init_filter + ) + res = retriever.run(query="test query", filters=runtime_filter) + + mock_store._keyword_retrieval.assert_called_once_with(query="test query", filters=merged_filter, top_k=10) + assert res == {"documents": [doc]} + + +@pytest.mark.asyncio +async def test_run_async_with_filters(): + mock_store = 
Mock(spec=SupabasePgvectorDocumentStore) + doc = Document(content="Test doc", embedding=[0.1, 0.2]) + mock_store._keyword_retrieval_async.return_value = [doc] + + init_filter = {"field": "meta.category", "operator": "==", "value": "news"} + runtime_filter = {"field": "meta.score", "operator": ">", "value": 0.5} + merged_filter = {"operator": "AND", "conditions": [init_filter, runtime_filter]} + + retriever = SupabasePgvectorKeywordRetriever( + document_store=mock_store, filter_policy=FilterPolicy.MERGE, filters=init_filter + ) + res = await retriever.run_async(query="test query", filters=runtime_filter) + + mock_store._keyword_retrieval_async.assert_called_once_with(query="test query", filters=merged_filter, top_k=10) + assert res == {"documents": [doc]} + + +@pytest.mark.integration +def test_keyword_retrieval(document_store: SupabasePgvectorDocumentStore): + docs = [ + Document(content="The quick brown fox chased the dog", embedding=[0.1] * 768), + Document(content="The fox was brown", embedding=[0.1] * 768), + Document(content="The lazy dog", embedding=[0.1] * 768), + Document(content="fox fox fox", embedding=[0.1] * 768), + ] + + document_store.write_documents(docs) + + results = document_store._keyword_retrieval(query="fox", top_k=2) + + assert len(results) == 2 + for doc in results: + assert "fox" in doc.content + assert results[0].id == docs[-1].id + assert results[0].score > results[1].score + + +@pytest.mark.integration +def test_keyword_retrieval_with_filters(document_store: SupabasePgvectorDocumentStore): + docs = [ + Document( + content="The quick brown fox chased the dog", + embedding=[0.1] * 768, + meta={"meta_field": "right_value"}, + ), + Document(content="The fox was brown", embedding=[0.1] * 768, meta={"meta_field": "right_value"}), + Document(content="The lazy dog", embedding=[0.1] * 768, meta={"meta_field": "right_value"}), + Document(content="fox fox fox", embedding=[0.1] * 768, meta={"meta_field": "wrong_value"}), + ] + + 
document_store.write_documents(docs) + + filters = {"field": "meta.meta_field", "operator": "==", "value": "right_value"} + + results = document_store._keyword_retrieval(query="fox", top_k=3, filters=filters) + assert len(results) == 2 + for doc in results: + assert "fox" in doc.content + assert doc.meta["meta_field"] == "right_value" + + +@pytest.mark.integration +def test_empty_query(document_store: SupabasePgvectorDocumentStore): + with pytest.raises(ValueError): + document_store._keyword_retrieval(query="")