diff --git a/.github/labeler.yml b/.github/labeler.yml index e16d1c6ad3..23cc5069c2 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -64,6 +64,11 @@ integration:docling: - any-glob-to-any-file: "integrations/docling/**/*" - any-glob-to-any-file: ".github/workflows/docling.yml" +integration:docling-serve: + - changed-files: + - any-glob-to-any-file: "integrations/docling_serve/**/*" + - any-glob-to-any-file: ".github/workflows/docling_serve.yml" + integration:elasticsearch: - changed-files: - any-glob-to-any-file: "integrations/elasticsearch/**/*" diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index f4b83385a5..828f3dff9a 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -15,6 +15,7 @@ on: - "Test / cohere" - "Test / cometapi" - "Test / deepeval" + - "Test / docling_serve" - "Test / dspy" - "Test / elasticsearch" - "Test / faiss" diff --git a/.github/workflows/docling_serve.yml b/.github/workflows/docling_serve.yml new file mode 100644 index 0000000000..f56f04c5a9 --- /dev/null +++ b/.github/workflows/docling_serve.yml @@ -0,0 +1,90 @@ +name: Test / docling_serve + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/docling_serve/**" + - "!integrations/docling_serve/*.md" + - ".github/workflows/docling_serve.yml" + +defaults: + run: + working-directory: integrations/docling_serve + +concurrency: + group: docling_serve-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10", "3.14"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: 
actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install hatch + + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run unit tests + run: hatch run test:unit-cov-retry + + - name: Store unit tests coverage + id: coverage_comment + if: matrix.python-version == '3.10' && runner.os == 'Linux' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/docling_serve + SUBPROJECT_ID: docling_serve + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Upload coverage comment to be posted + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true' + uses: actions/upload-artifact@v4 + with: + name: coverage-comment-docling_serve + path: python-coverage-comment-action-docling_serve.txt + + - name: Run unit tests with lowest direct dependencies + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-latest + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index 146ccb334d..fb3148e9f2 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ 
Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [cometapi-haystack](integrations/cometapi/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cometapi-haystack.svg)](https://pypi.org/project/cometapi-haystack) | [![Test / cometapi](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi-combined/htmlcov/index.html) | | [deepeval-haystack](integrations/deepeval/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/deepeval-haystack.svg)](https://pypi.org/project/deepeval-haystack) | [![Test / deepeval](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-deepeval/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-deepeval/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-deepeval-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-deepeval-combined/htmlcov/index.html) | | [docling-haystack](integrations/docling/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/docling-haystack.svg)](https://pypi.org/project/docling-haystack) | [![Test / docling](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling-combined/htmlcov/index.html) | +| [docling-serve-haystack](integrations/docling_serve/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/docling-serve-haystack.svg)](https://pypi.org/project/docling-serve-haystack) | [![Test / docling_serve](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling_serve.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling_serve.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling_serve/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling_serve/htmlcov/index.html) | N/A | | [dspy-haystack](integrations/dspy/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/dspy-haystack.svg)](https://pypi.org/project/dspy-haystack) | [![Test / dspy](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/dspy.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/dspy.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-dspy/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-dspy/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-dspy-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-dspy-combined/htmlcov/index.html) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-elasticsearch/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-elasticsearch/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-elasticsearch-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-elasticsearch-combined/htmlcov/index.html) | | [faiss-haystack](integrations/faiss/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/faiss-haystack.svg)](https://pypi.org/project/faiss-haystack) | [![Test / faiss](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-faiss/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-faiss/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-faiss-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-faiss-combined/htmlcov/index.html) | diff --git a/integrations/docling_serve/README.md b/integrations/docling_serve/README.md new file mode 100644 index 0000000000..6fda66948e --- /dev/null +++ 
b/integrations/docling_serve/README.md @@ -0,0 +1,10 @@ +# docling-serve-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/docling-serve-haystack.svg)](https://pypi.org/project/docling-serve-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling-serve-haystack.svg)](https://pypi.org/project/docling-serve-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/docling_serve/CHANGELOG.md) + +--- + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). diff --git a/integrations/docling_serve/pydoc/config_docusaurus.yml b/integrations/docling_serve/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..321b431fdd --- /dev/null +++ b/integrations/docling_serve/pydoc/config_docusaurus.yml @@ -0,0 +1,13 @@ +loaders: + - modules: + - haystack_integrations.components.converters.docling_serve.converter + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: DoclingServe integration for Haystack + id: integrations-docling-serve + filename: docling_serve.md + title: DoclingServe diff --git a/integrations/docling_serve/pyproject.toml b/integrations/docling_serve/pyproject.toml new file mode 100644 index 0000000000..640f651566 --- /dev/null +++ b/integrations/docling_serve/pyproject.toml @@ -0,0 +1,162 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "docling-serve-haystack" +dynamic = ["version"] +description = "Haystack converter component for DoclingServe — document conversion via HTTP" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = ["Haystack", "Docling", "DoclingServe", "document conversion", "PDF", "OCR"] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: 
Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai>=2.9.0", + "httpx>=0.27.0", +] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling_serve#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling_serve" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/docling_serve-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.."
+git_describe_command = 'git describe --tags --match="integrations/docling_serve-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' +types = "mypy -p haystack_integrations.components.converters.docling_serve {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[[tool.mypy.overrides]] +module = ["httpx"] +ignore_missing_imports = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", + "D103", + "D205", + "D209", + "D213", + "D417", + "D419", + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + "B027", + "B008", + "S105", + "S106", + "S107", + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +relative_files = true +parallel = false + +[tool.coverage.report] +omit = ["*/tests/*", 
"*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/__init__.py b/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/__init__.py new file mode 100644 index 0000000000..34d499eada --- /dev/null +++ b/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.components.converters.docling_serve.converter import DoclingServeConverter, ExportType + +__all__ = ["DoclingServeConverter", "ExportType"] diff --git a/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/converter.py b/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/converter.py new file mode 100644 index 0000000000..6fd68624a6 --- /dev/null +++ b/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/converter.py @@ -0,0 +1,254 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import base64 +import json +from enum import Enum +from pathlib import Path +from typing import Any + +import httpx +from haystack import Document, component, default_from_dict, default_to_dict, logging +from haystack.components.converters.utils import normalize_metadata +from haystack.dataclasses import ByteStream +from haystack.utils import Secret + +logger = logging.getLogger(__name__) + + +class ExportType(str, Enum): + """ + Enumeration of export formats supported by DoclingServe. 
+ + - `MARKDOWN`: Converts documents to Markdown format. + - `TEXT`: Extracts plain text. + - `JSON`: Returns the full Docling document as a JSON string. + """ + + MARKDOWN = "markdown" + TEXT = "text" + JSON = "json" + + +@component +class DoclingServeConverter: + """ + Converts documents to Haystack Documents using a DoclingServe server. + + See [DoclingServe](https://github.com/docling-project/docling-serve). + + DoclingServe hosts Docling in a scalable HTTP server, supporting PDFs, Office documents, HTML, and many other + formats. Unlike the local `DoclingConverter`, this component has no heavy ML dependencies — all processing + happens on the remote server. + + Supports both synchronous (`run`) and asynchronous (`arun`) execution. + + ### Usage example + + ```python + from haystack_integrations.components.converters.docling_serve import DoclingServeConverter + + converter = DoclingServeConverter(base_url="http://localhost:5001") + result = converter.run(sources=["https://arxiv.org/pdf/2206.01062"]) + print(result["documents"][0].content[:200]) + ``` + """ + + def __init__( + self, + *, + base_url: str = "http://localhost:5001", + export_type: ExportType = ExportType.MARKDOWN, + convert_options: dict[str, Any] | None = None, + timeout: float = 120.0, + api_key: Secret | None = None, + ) -> None: + """ + Initializes the DoclingServeConverter. + + :param base_url: + Base URL of the DoclingServe instance. Defaults to `"http://localhost:5001"`. + :param export_type: + The output format for converted documents. One of `ExportType.MARKDOWN` (default), + `ExportType.TEXT`, or `ExportType.JSON`. + :param convert_options: + Optional dictionary of conversion options passed directly to the DoclingServe API + (e.g. `{"do_ocr": True, "ocr_engine": "tesseract"}`). + See [DoclingServe options](https://github.com/docling-project/docling-serve/blob/main/docs/usage.md). + :param timeout: + HTTP request timeout in seconds. Defaults to `120.0`. 
+ :param api_key: + Optional API key for authenticating with a secured DoclingServe instance. + """ + self.base_url = base_url.rstrip("/") + self.export_type = ExportType(export_type) + self.convert_options = convert_options or {} + self.timeout = timeout + self.api_key = api_key + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + A dictionary representation of the component. + """ + return default_to_dict( + self, + base_url=self.base_url, + export_type=self.export_type.value, + convert_options=self.convert_options, + timeout=self.timeout, + api_key=self.api_key.to_dict() if self.api_key else None, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "DoclingServeConverter": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary representation of the component. + :returns: + A new `DoclingServeConverter` instance. + """ + if api_key := data.get("init_parameters", {}).get("api_key"): + data["init_parameters"]["api_key"] = Secret.from_dict(api_key) + return default_from_dict(cls, data) + + def _headers(self) -> dict[str, str]: + headers: dict[str, str] = {} + if self.api_key: + headers["Authorization"] = f"Bearer {self.api_key.resolve_value()}" + return headers + + def _to_format(self) -> str: + return {"markdown": "md", "text": "text", "json": "json"}[self.export_type.value] + + def _build_payload(self, source_entry: dict[str, Any]) -> dict[str, Any]: + options = {**self.convert_options, "to_formats": [self._to_format()]} + return {"options": options, "sources": [source_entry]} + + def _source_entry(self, source: str | Path | ByteStream) -> tuple[dict[str, Any], dict[str, Any]]: + """ + Convert a source to a DoclingServe API source entry. + + :returns: + A tuple of (source_entry dict, extra_meta dict from ByteStream if applicable). 
+ """ + if isinstance(source, str) and source.startswith(("http://", "https://")): + return {"kind": "http", "url": source}, {} + if isinstance(source, ByteStream): + b64 = base64.b64encode(source.data).decode() + filename = (source.meta or {}).get("file_name", "document") + return {"kind": "file", "base64_string": b64, "filename": filename}, source.meta or {} + path = Path(source) + b64 = base64.b64encode(path.read_bytes()).decode() + return {"kind": "file", "base64_string": b64, "filename": path.name}, {} + + def _extract_content(self, data: dict[str, Any]) -> str | None: + doc = data.get("document", {}) + if self.export_type == ExportType.MARKDOWN: + return doc.get("md_content") + if self.export_type == ExportType.TEXT: + return doc.get("text_content") + if self.export_type == ExportType.JSON: + content = doc.get("json_content") + return json.dumps(content) if content is not None else None + return None + + @component.output_types(documents=list[Document]) + def run( + self, + sources: list[str | Path | ByteStream], + meta: dict[str, Any] | list[dict[str, Any]] | None = None, + ) -> dict[str, list[Document]]: + """ + Converts documents by sending them to DoclingServe and returns Haystack Documents. + + :param sources: + List of sources to convert. Each item can be a URL string, a local file path, or a + `ByteStream`. + :param meta: + Optional metadata to attach to the output Documents. Can be a single dict applied to + all documents, or a list of dicts with one entry per source. + :returns: + A dictionary with key `"documents"` containing the converted Haystack Documents. 
+ """ + meta_list = normalize_metadata(meta=meta, sources_count=len(sources)) + documents: list[Document] = [] + headers = self._headers() + + with httpx.Client(timeout=self.timeout) as client: + for source, source_meta in zip(sources, meta_list, strict=True): + try: + source_entry, bytestream_meta = self._source_entry(source) + payload = self._build_payload(source_entry) + response = client.post( + f"{self.base_url}/v1/convert/source", + json=payload, + headers=headers, + ) + response.raise_for_status() + content = self._extract_content(response.json()) + if content is not None: + documents.append(Document(content=content, meta={**bytestream_meta, **source_meta})) + else: + logger.warning("No content returned for source {source}.", source=source) + except Exception as e: + logger.warning( + "Could not convert source {source}. Skipping it. Error: {error}", + source=source, + error=e, + ) + + return {"documents": documents} + + async def arun( + self, + sources: list[str | Path | ByteStream], + meta: dict[str, Any] | list[dict[str, Any]] | None = None, + ) -> dict[str, list[Document]]: + """ + Asynchronously converts documents by sending them to DoclingServe. + + This is the async equivalent of `run()`, useful when DoclingServe requests should not + block the event loop. + + :param sources: + List of sources to convert. Each item can be a URL string, a local file path, or a + `ByteStream`. + :param meta: + Optional metadata to attach to the output Documents. + :returns: + A dictionary with key `"documents"` containing the converted Haystack Documents. 
+ """ + meta_list = normalize_metadata(meta=meta, sources_count=len(sources)) + documents: list[Document] = [] + headers = self._headers() + + async with httpx.AsyncClient(timeout=self.timeout) as client: + for source, source_meta in zip(sources, meta_list, strict=True): + try: + source_entry, bytestream_meta = self._source_entry(source) + payload = self._build_payload(source_entry) + response = await client.post( + f"{self.base_url}/v1/convert/source", + json=payload, + headers=headers, + ) + response.raise_for_status() + content = self._extract_content(response.json()) + if content is not None: + documents.append(Document(content=content, meta={**bytestream_meta, **source_meta})) + else: + logger.warning("No content returned for source {source}.", source=source) + except Exception as e: + logger.warning( + "Could not convert source {source}. Skipping it. Error: {error}", + source=source, + error=e, + ) + + return {"documents": documents} diff --git a/integrations/docling_serve/src/haystack_integrations/components/converters/py.typed b/integrations/docling_serve/src/haystack_integrations/components/converters/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/docling_serve/tests/test_converter.py b/integrations/docling_serve/tests/test_converter.py new file mode 100644 index 0000000000..97779f7c4a --- /dev/null +++ b/integrations/docling_serve/tests/test_converter.py @@ -0,0 +1,401 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import json +import logging +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from haystack.dataclasses import ByteStream +from haystack.utils import Secret + +from haystack_integrations.components.converters.docling_serve import DoclingServeConverter, ExportType + + +def _mock_response(content: str, export_type: ExportType = ExportType.MARKDOWN) -> MagicMock: + content_key = {"markdown": "md_content", "text": "text_content", 
"json": "json_content"}[export_type.value] + mock = MagicMock() + mock.json.return_value = {"document": {content_key: content}, "status": "success"} + mock.raise_for_status = MagicMock() + return mock + + +class TestDoclingServeConverterInit: + def test_defaults(self): + converter = DoclingServeConverter() + assert converter.base_url == "http://localhost:5001" + assert converter.export_type == ExportType.MARKDOWN + assert converter.convert_options == {} + assert converter.timeout == 120.0 + assert converter.api_key is None + + def test_custom_params(self): + converter = DoclingServeConverter( + base_url="http://myserver:8080/", + export_type=ExportType.TEXT, + convert_options={"do_ocr": True}, + timeout=60.0, + ) + assert converter.base_url == "http://myserver:8080" # trailing slash stripped + assert converter.export_type == ExportType.TEXT + assert converter.convert_options == {"do_ocr": True} + assert converter.timeout == 60.0 + + def test_trailing_slash_stripped(self): + converter = DoclingServeConverter(base_url="http://localhost:5001/") + assert converter.base_url == "http://localhost:5001" + + +class TestDoclingServeConverterSerialization: + def test_to_dict(self): + converter = DoclingServeConverter( + base_url="http://localhost:5001", + export_type=ExportType.TEXT, + convert_options={"do_ocr": False}, + timeout=30.0, + ) + data = converter.to_dict() + assert data["type"].endswith("DoclingServeConverter") + params = data["init_parameters"] + assert params["base_url"] == "http://localhost:5001" + assert params["export_type"] == "text" + assert params["convert_options"] == {"do_ocr": False} + assert params["timeout"] == 30.0 + assert params["api_key"] is None + + def test_from_dict(self): + data = { + "type": "haystack_integrations.components.converters.docling_serve.converter.DoclingServeConverter", + "init_parameters": { + "base_url": "http://myserver:9000", + "export_type": "json", + "convert_options": {}, + "timeout": 60.0, + "api_key": None, + }, + } + 
converter = DoclingServeConverter.from_dict(data) + assert converter.base_url == "http://myserver:9000" + assert converter.export_type == ExportType.JSON + assert converter.timeout == 60.0 + + def test_to_dict_with_api_key(self, monkeypatch): + monkeypatch.setenv("DOCLING_API_KEY", "test-key") + converter = DoclingServeConverter(api_key=Secret.from_env_var("DOCLING_API_KEY")) + data = converter.to_dict() + assert data["init_parameters"]["api_key"] is not None + assert data["init_parameters"]["api_key"]["type"] == "env_var" + + def test_roundtrip(self): + converter = DoclingServeConverter( + base_url="http://remote:5001", + export_type=ExportType.MARKDOWN, + convert_options={"table_mode": "fast"}, + timeout=45.0, + ) + data = converter.to_dict() + restored = DoclingServeConverter.from_dict(data) + assert restored.base_url == converter.base_url + assert restored.export_type == converter.export_type + assert restored.convert_options == converter.convert_options + assert restored.timeout == converter.timeout + + +class TestDoclingServeConverterPayload: + def test_payload_markdown(self): + converter = DoclingServeConverter(export_type=ExportType.MARKDOWN) + entry = {"kind": "http", "url": "https://example.com/doc.pdf"} + payload = converter._build_payload(entry) + assert payload["options"]["to_formats"] == ["md"] + assert payload["sources"] == [entry] + + def test_payload_text(self): + converter = DoclingServeConverter(export_type=ExportType.TEXT) + entry = {"kind": "http", "url": "https://example.com/doc.pdf"} + payload = converter._build_payload(entry) + assert payload["options"]["to_formats"] == ["text"] + + def test_payload_merges_convert_options(self): + converter = DoclingServeConverter(convert_options={"do_ocr": True, "table_mode": "accurate"}) + entry = {"kind": "http", "url": "https://example.com/doc.pdf"} + payload = converter._build_payload(entry) + assert payload["options"]["do_ocr"] is True + assert payload["options"]["table_mode"] == "accurate" + assert 
payload["options"]["to_formats"] == ["md"] + + def test_source_entry_http_url(self): + converter = DoclingServeConverter() + entry, extra = converter._source_entry("https://example.com/file.pdf") + assert entry["kind"] == "http" + assert entry["url"] == "https://example.com/file.pdf" + assert extra == {} + + def test_source_entry_file_path(self, tmp_path): + pdf = tmp_path / "test.pdf" + pdf.write_bytes(b"%PDF-test") + converter = DoclingServeConverter() + entry, extra = converter._source_entry(pdf) + assert entry["kind"] == "file" + assert entry["filename"] == "test.pdf" + assert "base64_string" in entry + assert extra == {} + + def test_source_entry_bytestream(self): + converter = DoclingServeConverter() + bs = ByteStream(data=b"some bytes", meta={"file_name": "report.pdf", "source": "s3"}) + entry, extra = converter._source_entry(bs) + assert entry["kind"] == "file" + assert entry["filename"] == "report.pdf" + assert "base64_string" in entry + assert extra == {"file_name": "report.pdf", "source": "s3"} + + def test_source_entry_bytestream_no_meta(self): + converter = DoclingServeConverter() + bs = ByteStream(data=b"bytes") + entry, _ = converter._source_entry(bs) + assert entry["filename"] == "document" + + +class TestDoclingServeConverterRun: + def test_run_url_source(self): + converter = DoclingServeConverter() + mock_resp = _mock_response("# Hello World") + + with patch("httpx.Client") as mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.return_value = mock_resp + mock_client_cls.return_value = mock_client + + result = converter.run(sources=["https://example.com/doc.pdf"]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "# Hello World" + + def test_run_multiple_sources(self): + converter = DoclingServeConverter() + mock_resp = _mock_response("content") + + with patch("httpx.Client") as 
mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.return_value = mock_resp + mock_client_cls.return_value = mock_client + + result = converter.run(sources=["https://a.com/1.pdf", "https://b.com/2.pdf"]) + + assert len(result["documents"]) == 2 + + def test_run_with_meta(self): + converter = DoclingServeConverter() + mock_resp = _mock_response("text") + + with patch("httpx.Client") as mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.return_value = mock_resp + mock_client_cls.return_value = mock_client + + result = converter.run(sources=["https://example.com/doc.pdf"], meta={"author": "Alice"}) + + assert result["documents"][0].meta["author"] == "Alice" + + def test_run_bytestream_meta_merged(self): + converter = DoclingServeConverter() + mock_resp = _mock_response("text") + bs = ByteStream(data=b"bytes", meta={"file_path": "doc.pdf"}) + + with patch("httpx.Client") as mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.return_value = mock_resp + mock_client_cls.return_value = mock_client + + result = converter.run(sources=[bs], meta={"page": 1}) + + doc = result["documents"][0] + assert doc.meta["file_path"] == "doc.pdf" + assert doc.meta["page"] == 1 + + def test_run_skips_on_http_error(self, caplog): + converter = DoclingServeConverter() + + with patch("httpx.Client") as mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.side_effect = Exception("Connection refused") + mock_client_cls.return_value = mock_client + + with caplog.at_level(logging.WARNING): 
+ result = converter.run(sources=["https://example.com/doc.pdf"]) + + assert result["documents"] == [] + assert "Could not convert source" in caplog.text + + def test_run_skips_when_no_content(self, caplog): + converter = DoclingServeConverter() + mock_resp = MagicMock() + mock_resp.json.return_value = {"document": {"md_content": None}, "status": "success"} + mock_resp.raise_for_status = MagicMock() + + with patch("httpx.Client") as mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.return_value = mock_resp + mock_client_cls.return_value = mock_client + + with caplog.at_level(logging.WARNING): + result = converter.run(sources=["https://example.com/doc.pdf"]) + + assert result["documents"] == [] + assert "No content returned" in caplog.text + + def test_run_text_export(self): + converter = DoclingServeConverter(export_type=ExportType.TEXT) + mock_resp = _mock_response("plain text content", ExportType.TEXT) + + with patch("httpx.Client") as mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.return_value = mock_resp + mock_client_cls.return_value = mock_client + + result = converter.run(sources=["https://example.com/doc.pdf"]) + + assert result["documents"][0].content == "plain text content" + + def test_run_json_export(self): + converter = DoclingServeConverter(export_type=ExportType.JSON) + json_doc = {"schema_name": "DoclingDocument", "pages": []} + mock_resp = _mock_response(json_doc, ExportType.JSON) # type: ignore[arg-type] + + with patch("httpx.Client") as mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.return_value = mock_resp + mock_client_cls.return_value = mock_client + + 
result = converter.run(sources=["https://example.com/doc.pdf"]) + + content = json.loads(result["documents"][0].content) + assert content["schema_name"] == "DoclingDocument" + + def test_run_sends_api_key_header(self): + converter = DoclingServeConverter(api_key=Secret.from_token("my-secret-token")) + mock_resp = _mock_response("content") + + with patch("httpx.Client") as mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.return_value = mock_resp + mock_client_cls.return_value = mock_client + + converter.run(sources=["https://example.com/doc.pdf"]) + + _, kwargs = mock_client.post.call_args + assert kwargs["headers"]["Authorization"] == "Bearer my-secret-token" + + def test_run_file_path(self, tmp_path): + pdf = tmp_path / "test.pdf" + pdf.write_bytes(b"%PDF-test") + converter = DoclingServeConverter() + mock_resp = _mock_response("# PDF content") + + with patch("httpx.Client") as mock_client_cls: + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client.post.return_value = mock_resp + mock_client_cls.return_value = mock_client + + result = converter.run(sources=[pdf]) + + payload = mock_client.post.call_args[1]["json"] + assert payload["sources"][0]["kind"] == "file" + assert payload["sources"][0]["filename"] == "test.pdf" + assert result["documents"][0].content == "# PDF content" + + +class TestDoclingServeConverterArun: + @pytest.mark.asyncio + async def test_arun_url_source(self): + converter = DoclingServeConverter() + mock_resp = MagicMock() + mock_resp.json.return_value = {"document": {"md_content": "# Async content"}, "status": "success"} + mock_resp.raise_for_status = MagicMock() + + with patch("httpx.AsyncClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + 
mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client.post = AsyncMock(return_value=mock_resp) + mock_client_cls.return_value = mock_client + + result = await converter.arun(sources=["https://example.com/doc.pdf"]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "# Async content" + + @pytest.mark.asyncio + async def test_arun_skips_on_error(self, caplog): + converter = DoclingServeConverter() + + with patch("httpx.AsyncClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client.post = AsyncMock(side_effect=Exception("timeout")) + mock_client_cls.return_value = mock_client + + with caplog.at_level(logging.WARNING): + result = await converter.arun(sources=["https://example.com/doc.pdf"]) + + assert result["documents"] == [] + assert "Could not convert source" in caplog.text + + @pytest.mark.asyncio + async def test_arun_multiple_sources(self): + converter = DoclingServeConverter() + mock_resp = MagicMock() + mock_resp.json.return_value = {"document": {"md_content": "content"}, "status": "success"} + mock_resp.raise_for_status = MagicMock() + + with patch("httpx.AsyncClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client.post = AsyncMock(return_value=mock_resp) + mock_client_cls.return_value = mock_client + + result = await converter.arun(sources=["https://a.com/1.pdf", "https://b.com/2.pdf"]) + + assert len(result["documents"]) == 2 + + +class TestDoclingServeConverterIntegration: + @pytest.mark.integration + def test_run_integration(self): + """Requires a running DoclingServe instance at http://localhost:5001.""" + converter = DoclingServeConverter() + result = converter.run(sources=["https://arxiv.org/pdf/2206.01062"]) + assert len(result["documents"]) > 0 
+ assert result["documents"][0].content + + @pytest.mark.integration + @pytest.mark.asyncio + async def test_arun_integration(self): + """Requires a running DoclingServe instance at http://localhost:5001.""" + converter = DoclingServeConverter() + result = await converter.arun(sources=["https://arxiv.org/pdf/2206.01062"]) + assert len(result["documents"]) > 0 + assert result["documents"][0].content