diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml new file mode 100644 index 000000000..74dd68dd9 --- /dev/null +++ b/.github/workflows/publish-docs.yml @@ -0,0 +1,47 @@ +# This workflow builds and publishes the latest docs to +# the `gh-pages` branch. +# For more details: https://github.com/marketplace/actions/deploy-to-github-pages +name: Publish docs + +on: + release: + types: [created] + workflow_dispatch: + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + permissions: + contents: write + pages: write + defaults: + run: + shell: bash -l {0} + steps: + - uses: actions/checkout@v2 + with: + # fetch all tags so `versioneer` can properly determine current version + fetch-depth: 0 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.11' + - name: Install pandoc + uses: pandoc/actions/setup@v1 + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install -r requirements-ml.txt + pip install -r requirements-reports.txt + pip install -r requirements-docs.txt + pip install -e . + + - name: Build + run: | + cd _docs/docs + python update_documentation.py + - name: Publish + uses: JamesIves/github-pages-deploy-action@v4 + with: + branch: gh-pages + folder: _docs/docs/LATEST/html diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml new file mode 100644 index 000000000..9a230cd27 --- /dev/null +++ b/.github/workflows/publish-package.yml @@ -0,0 +1,52 @@ +# This workflow publishes the package to pypi. 
+# For more details: +# https://docs.github.com/en/actions/guides/building-and-testing-python#publishing-to-package-registries +name: Publish to PyPi + +on: + release: + types: [created] + workflow_dispatch: + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + # fetch all tags so `versioneer` can properly determine current version + with: + fetch-depth: 0 + - name: Check if current commit is tagged + # fails and cancels release if the current commit is not tagged + run: | + git describe --exact-match --tags + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + if [ -f requirements-ml.txt ]; then pip install -r requirements-ml.txt; fi + if [ -f requirements-reports.txt ]; then pip install -r requirements-reports.txt; fi + pip install setuptools wheel twine + - name: Build + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + TWINE_REPOSITORY: pypi + run: | + python setup.py sdist bdist_wheel + - name: Test build + # fails and cancels release if the built package fails to import + run: | + pip install dist/*.whl + python -c 'import dataprofiler; print(dataprofiler.__version__)' + - name: Publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + TWINE_REPOSITORY: pypi + run: | + twine upload dist/* diff --git a/.github/workflows/publish-python-package.yml b/.github/workflows/publish-python-package.yml deleted file mode 100644 index 4ed9e1bf3..000000000 --- a/.github/workflows/publish-python-package.yml +++ /dev/null @@ -1,38 +0,0 @@ - -# This workflow will upload a Python Package using Twine when a release is created -# For more information see: 
https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -name: Publish Python Package - -on: - release: - types: [created] - branches: - - 'release/*' - -jobs: - deploy: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - if [ -f requirements-ml.txt ]; then pip install -r requirements-ml.txt; fi - if [ -f requirements-reports.txt ]; then pip install -r requirements-reports.txt; fi - pip install setuptools wheel twine - - name: Build and publish - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - TWINE_REPOSITORY: pypi - run: | - python setup.py sdist bdist_wheel - twine upload dist/* diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-package.yml similarity index 88% rename from .github/workflows/test-python-package.yml rename to .github/workflows/test-package.yml index 3c88e7211..47416a938 100644 --- a/.github/workflows/test-python-package.yml +++ b/.github/workflows/test-package.yml @@ -7,8 +7,6 @@ on: pull_request: branches: - 'main' - - 'feature/**' - - 'dev' jobs: build: @@ -16,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, "3.10", "3.11"] + python-version: ["3.10", "3.11"] steps: - uses: actions/checkout@v4 @@ -38,4 +36,4 @@ jobs: pre-commit run --all-files - name: Test with pytest run: | - DATAPROFILER_SEED=0 pytest --forked --cov=dataprofiler --cov-fail-under=80 + DATAPROFILER_SEED=0 pytest --cov=dataprofiler --cov-fail-under=80 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 092cc5a48..07b1d865a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,6 +5,7 @@ repos: rev: 24.10.0 
hooks: - id: black + exclude: (versioneer.py|dataprofiler/_version.py|_docs/) types: [file, python] language_version: python3 # Isort: sort import statements @@ -15,6 +16,7 @@ repos: rev: 5.12.0 hooks: - id: isort + exclude: _docs/ language_version: python3 # Flake8: complexity and style checking # https://flake8.pycqa.org/en/latest/user/using-hooks.html @@ -23,7 +25,7 @@ repos: hooks: - id: flake8 additional_dependencies: [flake8-docstrings] - exclude: (^docs/|^dataprofiler/tests/|^.*/__init__.py) + exclude: (^docs/|^dataprofiler/tests/|^.*/__init__.py|_docs/) language_version: python3 # General fixers: format files for white spaces and trailing new lines, warn on debug statements # https://github.com/pre-commit/pre-commit-hooks#hooks-available @@ -31,17 +33,17 @@ repos: rev: v4.0.1 hooks: - id: trailing-whitespace - exclude: (^dataprofiler/tests/data/|^dataprofiler/tests/speed_tests/data/) + exclude: (^dataprofiler/tests/data/|^dataprofiler/tests/speed_tests/data/|_docs/) - id: debug-statements - id: end-of-file-fixer - exclude: (^dataprofiler/tests/data/) + exclude: (^dataprofiler/tests/data/|_docs/) # Mypy: Optional static type checking # https://github.com/pre-commit/mirrors-mypy - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.11.2 hooks: - id: mypy - exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/) + exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/|versioneer.py|dataprofiler/_version.py|_docs/) language_version: python3 additional_dependencies: # Keep up-to-date with the respective requirement files [ @@ -93,7 +95,6 @@ repos: # requirements-test.txt coverage>=5.0.1, - dask>=2.29.0, fsspec>=0.3.3, pytest>=6.0.1, pytest-cov>=2.8.1, @@ -108,24 +109,23 @@ repos: rev: "0.48" hooks: - id: check-manifest - additional_dependencies: - [ - 'matplotlib', 'h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas', - 'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro', - 'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 
'requests', - 'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3', - ] + additional_dependencies: ['h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas', + 'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro', + 'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests', + 'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3'] # Pyupgrade - standardize and modernize Python syntax for newer versions of the language - repo: https://github.com/asottile/pyupgrade rev: v3.3.0 hooks: - id: pyupgrade - args: ["--py39-plus"] + args: ["--py310-plus"] + exclude: (versioneer.py|dataprofiler/_version.py|_docs/) # Autoflake - cleanup unused variables and imports - repo: https://github.com/PyCQA/autoflake rev: v2.0.0 hooks: - id: autoflake + exclude: _docs/ args: - "--in-place" - "--ignore-pass-statements" diff --git a/.whitesource b/.whitesource new file mode 100644 index 000000000..37dfa8e25 --- /dev/null +++ b/.whitesource @@ -0,0 +1,3 @@ +{ + "settingsInheritedFrom": "capitalone/whitesource-config" +} diff --git a/MANIFEST.in b/MANIFEST.in index 9a62e405e..3f426b7bb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -17,4 +17,17 @@ recursive-include resources *.json recursive-include resources *.pb recursive-include resources *.py -recursive-include dataprofiler/labelers/embeddings/*.txt +recursive-include dataprofiler/labelers/embeddings *.txt +include versioneer.py +include dataprofiler/_version.py +include .whitesource + +recursive-exclude _docs *.html +recursive-exclude _docs *.cfg +exclude _docs/LICENSE +recursive-exclude _docs *.md +recursive-exclude _docs *.nojekyll +recursive-exclude _docs *.png +recursive-exclude _docs *.py +recursive-exclude _docs *.rst +recursive-exclude _docs Makefile diff --git a/_docs/README.md b/_docs/README.md new file mode 100644 index 000000000..0f925ac58 --- /dev/null +++ b/_docs/README.md @@ -0,0 +1,59 @@ +Visit our [documentation page.](https://capitalone.github.io/DataProfiler) + +### How to properly write
documentation: + +#### Packages +In any package directory, overall package comments can be made in the +\_\_init\_\_.py of the directory. At the top of the \_\_init\_\_.py, +include your comments in between triple quotations. + +#### Classes +In any class file, include overall class comments at the top of the file +in between triple quotes and/or in the init function. + +#### Functions +reStructuredText Docstring Format is the standard. Here is an example: + + def format_data(self, predictions, verbose=False): + """ + Formats word level labeling of the Unstructured Data Labeler as you want + + :param predictions: A 2D list of word level predictions/labeling + :type predictions: Dict + :param verbose: A flag to determine verbosity + :type verbose: Bool + :return: JSON structure containing specified formatted output + :rtype: JSON + + :Example: + Look at this test. Don't forget the double colons to make a code block:: + This is a codeblock + Type example code here + """ + +### How to update the documentation: + + +1. Set up your local environment +```bash +# install sphinx requirements +# install the requirements from the feature branch +pip install pandoc && +pip install -r requirements.txt && +pip install -r requirements-ml.txt && +pip install -r requirements-reports.txt && +pip install -r requirements-docs.txt && +pip install -e . + +``` +2. And finally, from the root of `DataProfiler`, run the following commands to generate the sphinx documentation: +```bash +cd _docs/docs +python update_documentation.py + +``` + +3. View new docs +```bash +open index.html +``` diff --git a/_docs/docs/Makefile b/_docs/docs/Makefile new file mode 100644 index 000000000..81ca02cf5 --- /dev/null +++ b/_docs/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. 
+SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = buildcode + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/_docs/docs/make.bat b/_docs/docs/make.bat new file mode 100644 index 000000000..6247f7e23 --- /dev/null +++ b/_docs/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/_docs/docs/source/API.rst b/_docs/docs/source/API.rst new file mode 100644 index 000000000..fdbf2242b --- /dev/null +++ b/_docs/docs/source/API.rst @@ -0,0 +1,16 @@ +.. _API: + +API +*** + +The API is split into 4 main components: Profilers, Labelers, Data Readers, and +Validators. + +.. 
toctree:: + :maxdepth: 1 + :caption: Contents: + + dataprofiler.data_readers + dataprofiler.profilers + dataprofiler.labelers + dataprofiler.validators \ No newline at end of file diff --git a/_docs/docs/source/DL-Flowchart.png b/_docs/docs/source/DL-Flowchart.png new file mode 100644 index 000000000..696eeb5dc Binary files /dev/null and b/_docs/docs/source/DL-Flowchart.png differ diff --git a/_docs/docs/source/_static/custom.css b/_docs/docs/source/_static/custom.css new file mode 100644 index 000000000..8a7c7cb54 --- /dev/null +++ b/_docs/docs/source/_static/custom.css @@ -0,0 +1,50 @@ +/* + the ipython3 code blocks coming from the notebooks + were not getting the dark theme styles applied, so + manually overriding them +*/ +@media (prefers-color-scheme: dark) { + .highlight-ipython3 { + border: none !important; + border-radius: 2px !important; + background: #202020 !important; + color: #d0d0d0 !important; + } +} + +@media (prefers-color-scheme: dark) { + tr:nth-child(odd) { + background-color: #202020 !important; + } +} + +@media (prefers-color-scheme: dark) { + .dataframe { + color: white !important; + } +} + +.hidden { + display: none; +} + +.version { + text-align: right; + font-size: 24px; + margin-top: -47px; + margin-right: 3px; +} + +.sidebar-brand { + margin-bottom: -10px; + margin-top: 10px; +} + +/* unknown warning was showing, manually hiding */ +#Visualizing-Logged-Dataframes .admonition.warning { + display: none; +} + +div.output_area.stderr { + display: none; +} diff --git a/_docs/docs/source/_static/images/DataProfilerDarkLogoLong.png b/_docs/docs/source/_static/images/DataProfilerDarkLogoLong.png new file mode 100644 index 000000000..a339e0f6a Binary files /dev/null and b/_docs/docs/source/_static/images/DataProfilerDarkLogoLong.png differ diff --git a/_docs/docs/source/_static/images/DataProfilerLogoLightTheme.png b/_docs/docs/source/_static/images/DataProfilerLogoLightTheme.png new file mode 100644 index 000000000..35e59c349 Binary files 
/dev/null and b/_docs/docs/source/_static/images/DataProfilerLogoLightTheme.png differ diff --git a/_docs/docs/source/_static/images/DataProfilerLogoLightThemeLong.png b/_docs/docs/source/_static/images/DataProfilerLogoLightThemeLong.png new file mode 100644 index 000000000..ca86fe167 Binary files /dev/null and b/_docs/docs/source/_static/images/DataProfilerLogoLightThemeLong.png differ diff --git a/_docs/docs/source/_static/images/branching_workflow_diagram.png b/_docs/docs/source/_static/images/branching_workflow_diagram.png new file mode 100644 index 000000000..60a9515d0 Binary files /dev/null and b/_docs/docs/source/_static/images/branching_workflow_diagram.png differ diff --git a/_docs/docs/source/_static/images/histogram_example_0.png b/_docs/docs/source/_static/images/histogram_example_0.png new file mode 100644 index 000000000..9b8301363 Binary files /dev/null and b/_docs/docs/source/_static/images/histogram_example_0.png differ diff --git a/_docs/docs/source/_static/images/histogram_example_1.png b/_docs/docs/source/_static/images/histogram_example_1.png new file mode 100644 index 000000000..062dfdbb9 Binary files /dev/null and b/_docs/docs/source/_static/images/histogram_example_1.png differ diff --git a/_docs/docs/source/_static/images/histogram_example_2.png b/_docs/docs/source/_static/images/histogram_example_2.png new file mode 100644 index 000000000..1aedf7549 Binary files /dev/null and b/_docs/docs/source/_static/images/histogram_example_2.png differ diff --git a/_docs/docs/source/_static/images/missing_value_barchart_example_0.png b/_docs/docs/source/_static/images/missing_value_barchart_example_0.png new file mode 100644 index 000000000..33cb7afd2 Binary files /dev/null and b/_docs/docs/source/_static/images/missing_value_barchart_example_0.png differ diff --git a/_docs/docs/source/_static/images/missing_value_matrix_example_0.png b/_docs/docs/source/_static/images/missing_value_matrix_example_0.png new file mode 100644 index 000000000..21799cddf 
Binary files /dev/null and b/_docs/docs/source/_static/images/missing_value_matrix_example_0.png differ diff --git a/_docs/docs/source/add_new_model_to_data_labeler.nblink b/_docs/docs/source/add_new_model_to_data_labeler.nblink new file mode 100644 index 000000000..4c5fe646a --- /dev/null +++ b/_docs/docs/source/add_new_model_to_data_labeler.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/add_new_model_to_data_labeler.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/architecture.rst b/_docs/docs/source/architecture.rst new file mode 100644 index 000000000..469308993 --- /dev/null +++ b/_docs/docs/source/architecture.rst @@ -0,0 +1,48 @@ +.. _architecture: + +Architecture & Design Overview +****************************** + +This section describes the design rationale, algorithmic choices, assumptions, testing strategy, and contribution process used in the DataProfiler library. + +Overview +-------- + +DataProfiler computes numeric statistics (e.g., mean, variance, skewness, kurtosis) using **streaming algorithms** that allow efficient, incremental updates without recomputing from raw data. Approximate quantile metrics like the median are calculated using histogram-based estimation, making the system scalable for large or streaming datasets. + +Additionally, DataProfiler uses a **Convolutional Neural Network (CNN)** to detect and label entities (e.g., names, emails, credit cards) in unstructured text. This supports critical tasks such as **PII detection**, **schema inference**, and **data quality analysis** across structured and unstructured data. + +Algorithm Rationale +------------------- + +The algorithms used are designed for **speed, scalability, and flexibility**: + +- **Streaming numeric methods** (e.g., Welford's algorithm, moment-based metrics, histogram binning) efficiently summarize data without full recomputation. 
+- **CNNs for entity detection** are fast, high-throughput, and well-suited for sequence labeling tasks in production environments. + +These choices align with the tool's goal of delivering fast, accurate data profiling with minimal configuration. + +Assumptions & Limitations +------------------------- + +- **Consistent formatting** of sensitive entities is assumed (e.g., standardized credit card or SSN formats). +- **Overlapping entity types** (e.g., phone vs. SSN) may lead to misclassification without context. +- **Synthetic training data** may not fully capture real-world diversity, reducing model accuracy on natural or unstructured text. +- **Quantile estimation** (e.g., median) is approximate and based on binning rather than exact sorting. + +Testing & Validation +-------------------- + +- Comprehensive **unit testing** is performed across Python 3.10 and 3.11. +- Tests are executed on every pull request targeting the `main` branch. +- All pull requests require **two code reviewer approvals** before merging. +- Testing includes correctness, performance, and compatibility checks to ensure production readiness. + +Versioning & Contributions +-------------------------- + +- Versioning and development are managed via **GitHub**. +- Future changes must follow the guidelines in `CONTRIBUTING.md`, including: + - Forking the repo and branching from `dev` or an active feature branch. + - Ensuring **80%+ unit test coverage** for all new functionality. + - Opening a PR and securing **two approvals** prior to merging.
diff --git a/_docs/docs/source/column_name_labeler_example.nblink b/_docs/docs/source/column_name_labeler_example.nblink new file mode 100644 index 000000000..c39e674fb --- /dev/null +++ b/_docs/docs/source/column_name_labeler_example.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/column_name_labeler.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/conf.py b/_docs/docs/source/conf.py new file mode 100644 index 000000000..80168effd --- /dev/null +++ b/_docs/docs/source/conf.py @@ -0,0 +1,84 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +import re + +sys.path.insert(0, os.path.abspath(f'../../../')) + +# -- Project information ----------------------------------------------------- + +project = 'Data Profiler' +copyright = '2024, Jeremy Goodsitt, Austin Walters, Anh Truong, Grant Eden, and Chris Wallace' +author = 'Jeremy Goodsitt, Austin Walters, Anh Truong, Grant Eden, and Chris Wallace' + +# The full version, including alpha/beta/rc tags +# release = '21.01.20' +from dataprofiler import __version__ as version # noqa F401 + + +version_clip = re.search(r'\s*([\d.]+)', version).group(1) +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.intersphinx', + 'nbsphinx', + 'nbsphinx_link', +] + +# Don't execute the notebook cells when generating the documentation +# This can be configured on a per notebook basis as well +# See: https://nbsphinx.readthedocs.io/en/0.2.15/never-execute.html#Explicitly-Dis-/ +nbsphinx_execute = "never" +nbsphinx_prolog = """ +`View this notebook on GitHub `_ +""" + +autoclass_content = 'both' +autodoc_default_options = { + 'members': True, + 'member-order': 'bysource', + 'undoc-members': True, + 'exclude-members': '__weakref__', + 'inherited-members': True, +} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "furo" +html_title = f"
v{version_clip}
" +html_static_path = ["_static"] +html_css_files = ["custom.css"] +html_favicon = "_static/images/DataProfilerLogoLightTheme.png" +html_theme_options = { + "light_logo": "images/DataProfilerLogoLightThemeLong.png", + "dark_logo": "images/DataProfilerDarkLogoLong.png", +} diff --git a/_docs/docs/source/data_labeling.rst b/_docs/docs/source/data_labeling.rst new file mode 100644 index 000000000..db76fe791 --- /dev/null +++ b/_docs/docs/source/data_labeling.rst @@ -0,0 +1,365 @@ +.. _data_labeling: + +Labeler (Sensitive Data) +************************ + +In this library, the term *data labeling* refers to entity recognition. + +Builtin to the data profiler is a classifier which evaluates the complex data types of the dataset. +For structured data, it determines the complex data type of each column. When +running the data profile, it uses the default data labeling model builtin to the +library. However, the data labeler allows users to train their own data labeler +as well. + +*Data Labels* are determined per cell for structured data (column/row when +the *profiler* is used) or at the character level for unstructured data. This +is a list of the default labels. + +* UNKNOWN +* ADDRESS +* BAN (bank account number, 10-18 digits) +* CREDIT_CARD +* EMAIL_ADDRESS +* UUID +* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.) +* IPV4 +* IPV6 +* MAC_ADDRESS +* PERSON +* PHONE_NUMBER +* SSN +* URL +* US_STATE +* DRIVERS_LICENSE +* DATE +* TIME +* DATETIME +* INTEGER +* FLOAT +* QUANTITY +* ORDINAL + + +Identify Entities in Structured Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Makes predictions and identifying labels: + +.. 
code-block:: python + + import dataprofiler as dp + + # load data and data labeler + data = dp.Data("your_data.csv") + data_labeler = dp.DataLabeler(labeler_type='structured') + + # make predictions and get labels per cell + predictions = data_labeler.predict(data) + +Identify Entities in Unstructured Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Predict which class characters belong to in unstructured text: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler(labeler_type='unstructured') + + # Example sample string, must be in an array (multiple arrays can be passed) + sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234." + "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"] + + # Prediction what class each character belongs to + model_predictions = data_labeler.predict( + sample, predict_options=dict(show_confidences=True)) + + # Predictions / confidences are at the character level + final_results = model_predictions["pred"] + final_confidences = model_predictions["conf"] + +It's also possible to change output formats, output similar to a **SpaCy** format: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True) + + # Example sample string, must be in an array (multiple arrays can be passed) + sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234." + "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"] + + # Set the output to the NER format (start position, end position, label) + data_labeler.set_params( + { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } } + ) + + results = data_labeler.predict(sample) + + print(results) + +Train a New Data Labeler +~~~~~~~~~~~~~~~~~~~~~~~~ + +Mechanism for training your own data labeler on their own set of structured data +(tabular): + +.. 
code-block:: python + + import dataprofiler as dp + + # Will need one column with a default label of UNKNOWN + data = dp.Data("your_file.csv") + + data_labeler = dp.train_structured_labeler( + data=data, + save_dirpath="/path/to/save/labeler", + epochs=2 + ) + + data_labeler.save_to_disk("my/save/path") # Saves the data labeler for reuse + +Load an Existing Data Labeler +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Mechanism for loading an existing data_labeler: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler( + labeler_type='structured', dirpath="/path/to/my/labeler") + + # get information about the parameters/inputs/output formats for the DataLabeler + data_labeler.help() + +Extending a Data Labeler with Transfer Learning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Extending or changing labels of a data labeler w/ transfer learning: +Note: By default, **a labeler loaded will not be trainable**. In order to load a +trainable DataLabeler, the user must set `trainable=True` or load a labeler +using the `TrainableDataLabeler` class. + +The following illustrates how to change the labels: + +.. code-block:: python + + import dataprofiler as dp + + labels = ['label1', 'label2', ...] # new label set can also be an encoding dict + data = dp.Data("your_file.csv") # contains data with new labels + + # load default structured Data Labeler w/ trainable set to True + data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True) + + # this will use transfer learning to retrain the data labeler on your new + # dataset and labels. + # NOTE: data must be in an acceptable format for the preprocessor to interpret. + # please refer to the preprocessor/model for the expected data format. + # Currently, the DataLabeler cannot take in Tabular data, but requires + # data to be ingested with two columns [X, y] where X is the samples and + # y is the labels. 
+ model_results = data_labeler.fit(x=data['samples'], y=data['labels'], + validation_split=0.2, epochs=2, labels=labels) + + # final_results, final_confidences are a list of results for each epoch + epoch_id = 0 + final_results = model_results[epoch_id]["pred"] + final_confidences = model_results[epoch_id]["conf"] + +The following illustrates how to extend the labels: + +.. code-block:: python + + import dataprofiler as dp + + new_labels = ['label1', 'label2', ...] + data = dp.Data("your_file.csv") # contains data with new labels + + # load default structured Data Labeler w/ trainable set to True + data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True) + + # this will maintain current labels and model weights, but extend the model's + # labels + for label in new_labels: + data_labeler.add_label(label) + + # NOTE: a user can also add a label which maps to the same index as an existing + # label + # data_labeler.add_label(label, same_as='') + + # For a trainable model, the user must then train the model to be able to + # continue using the labeler since the model's graph has likely changed + # NOTE: data must be in an acceptable format for the preprocessor to interpret. + # please refer to the preprocessor/model for the expected data format. + # Currently, the DataLabeler cannot take in Tabular data, but requires + # data to be ingested with two columns [X, y] where X is the samples and + # y is the labels. + model_results = data_labeler.fit(x=data['samples'], y=data['labels'], + validation_split=0.2, epochs=2) + + # final_results, final_confidences are a list of results for each epoch + epoch_id = 0 + final_results = model_results[epoch_id]["pred"] + final_confidences = model_results[epoch_id]["conf"] + + +Changing pipeline parameters: + +.. 
code-block:: python + + import dataprofiler as dp + + # load default Data Labeler + data_labeler = dp.DataLabeler(labeler_type='structured') + + # change parameters of specific component + data_labeler.preprocessor.set_params({'param1': 'value1'}) + + # change multiple simultaneously. + data_labeler.set_params({ + 'preprocessor': {'param1': 'value1'}, + 'model': {'param2': 'value2'}, + 'postprocessor': {'param3': 'value3'} + }) + + +Build Your Own Data Labeler +=========================== + +The DataLabeler has 3 main components: preprocessor, model, and postprocessor. +To create your own DataLabeler, each one would have to be created or an +existing component can be reused. + +Given a set of the 3 components, you can construct your own DataLabeler: + +.. code-block:: python + from dataprofiler.labelers.base_data_labeler import BaseDataLabeler, \ + TrainableDataLabeler + from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel + from dataprofiler.labelers.data_processing import \ + StructCharPreprocessor, StructCharPostprocessor + + # load a non-trainable data labeler + model = CharacterLevelCnnModel(...) + preprocessor = StructCharPreprocessor(...) + postprocessor = StructCharPostprocessor(...) + + data_labeler = BaseDataLabeler.load_with_components( + preprocessor=preprocessor, model=model, postprocessor=postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + + + # load trainable data labeler + data_labeler = TrainableDataLabeler.load_with_components( + preprocessor=preprocessor, model=model, postprocessor=postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + +Option for swapping out specific components of an existing labeler. + +.. 
code-block:: python + + import dataprofiler as dp + from dataprofiler.labelers.character_level_cnn_model import \ + CharacterLevelCnnModel + from dataprofiler.labelers.data_processing import \ + StructCharPreprocessor, StructCharPostprocessor + + model = CharacterLevelCnnModel(...) + preprocessor = StructCharPreprocessor(...) + postprocessor = StructCharPostprocessor(...) + + data_labeler = dp.DataLabeler(labeler_type='structured') + data_labeler.set_preprocessor(preprocessor) + data_labeler.set_model(model) + data_labeler.set_postprocessor(postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + + +Model Component +~~~~~~~~~~~~~~~ + +In order to create your own model component for data labeling, you can utilize +the `BaseModel` class from `dataprofiler.labelers.base_model` and +override the abstract class methods. + +Reviewing `CharacterLevelCnnModel` from +`dataprofiler.labelers.character_level_cnn_model` illustrates the functions +which need an override. + +#. `__init__`: specifying default parameters and calling base `__init__` +#. `_validate_parameters`: validating parameters given by user during setting +#. `_need_to_reconstruct_model`: flag for when to reconstruct a model (i.e. + parameters change or labels change require a model reconstruction) +#. `_construct_model`: initial construction of the model given the parameters +#. `_reconstruct_model`: updates model architecture for new label set while + maintaining current model weights +#. `fit`: mechanism for the model to learn given training data +#. `predict`: mechanism for model to make predictions on data +#. `details`: prints a summary of the model construction +#. `save_to_disk`: saves model and model parameters to disk +#. 
`load_from_disk`: loads model given a path on disk + + +Preprocessor Component +~~~~~~~~~~~~~~~~~~~~~~ + +In order to create your own preprocessor component for data labeling, you can +utilize the `BaseDataPreprocessor` class +from `dataprofiler.labelers.data_processing` and override the abstract class +methods. + +Reviewing `StructCharPreprocessor` from +`dataprofiler.labelers.data_processing` illustrates the functions which +need an override. + +#. `__init__`: passing parameters to the base class and executing any + extraneous calculations to be saved as parameters +#. `_validate_parameters`: validating parameters given by user during + setting +#. `process`: takes in the user data and converts it into a digestible, + iterable format for the model +#. `set_params` (optional): if a parameter requires processing before setting, + a user can override this function to assist with setting the parameter +#. `_save_processor` (optional): if a parameter is not JSON serializable, a + user can override this function to assist in saving the processor and its + parameters +#. `load_from_disk` (optional): if a parameter(s) is not JSON serializable, a + user can override this function to assist in loading the processor + +Postprocessor Component +~~~~~~~~~~~~~~~~~~~~~~~ + +The postprocessor is nearly identical to the preprocessor except it handles +the output of the model for processing. In order to create your own +postprocessor component for data labeling, you can utilize the +`BaseDataPostprocessor` class from `dataprofiler.labelers.data_processing` +and override the abstract class methods. + +Reviewing `StructCharPostprocessor` from +`dataprofiler.labelers.data_processing` illustrates the functions which +need an override. + +#. `__init__`: passing parameters to the base class and executing any + extraneous calculations to be saved as parameters +#. `_validate_parameters`: validating parameters given by user during + setting +#. 
`process`: takes in the output of the model and processes for output to + the user +#. `set_params` (optional): if a parameter requires processing before setting, + a user can override this function to assist with setting the parameter +#. `_save_processor` (optional): if a parameter is not JSON serializable, a + user can override this function to assist in saving the processor and its + parameters +#. `load_from_disk` (optional): if a parameter(s) is not JSON serializable, a + user can override this function to assist in loading the processor diff --git a/_docs/docs/source/data_reader.nblink b/_docs/docs/source/data_reader.nblink new file mode 100644 index 000000000..8d7215f46 --- /dev/null +++ b/_docs/docs/source/data_reader.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/data_readers.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/data_readers.rst b/_docs/docs/source/data_readers.rst new file mode 100644 index 000000000..877ea56dd --- /dev/null +++ b/_docs/docs/source/data_readers.rst @@ -0,0 +1,184 @@ +.. _data_readers: + +Data Readers +************ + +The `Data` class itself will identify then output one of the following `Data` class types. +Using the data reader is easy, just pass it through the Data object. + +.. code-block:: python + + import dataprofiler as dp + data = dp.Data("your_file.csv") + +The supported file types are: + +* CSV file (or any delimited file) +* JSON object +* Avro file +* Parquet file +* Graph data file +* Text file +* Pandas DataFrame +* A URL that points to one of the supported file types above + +It's also possible to specifically call one of the data classes such as the following command: + +.. code-block:: python + + from dataprofiler.data_readers.csv_data import CSVData + data = CSVData("your_file.csv", options={"delimiter": ","}) + +Additionally any of the data classes can be loaded using a URL: + +.. 
code-block:: python + + import dataprofiler as dp + data = dp.Data("https://you_website.com/your_file.file", options={"verify_ssl": "True"}) + +Below are descriptions of the various `Data` classes and the available options. + +CSVData +======= + +Data class for loading datasets of type CSV. Can be specified by passing +in memory data or via a file path. Options pertaining the CSV may also +be specified using the options dict parameter. + +`CSVData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* delimiter - Must be a string, for example `"delimiter": ","` +* data_format - Must be a string, possible choices: "dataframe", "records" +* selected_columns - Columns being selected from the entire dataset, must be a + list `["column 1", "ssn"]` +* sample_nrows - Reservoir sampling to sample `"n"` rows out of a total of `"M"` rows. + Specified for how many rows to sample, default None. +* header - Define the header, for example + + * `"header": 'auto'` for auto detection + * `"header": None` for no header + * `"header": ` to specify the header row (0 based index) + +JSONData +======== + +Data class for loading datasets of type JSON. Can be specified by +passing in memory data or via a file path. Options pertaining the JSON +may also be specified using the options dict parameter. JSON data can be +accessed via the "data" property, the "metadata" property, and the +"data_and_metadata" property. + +`JSONData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "json", "flattened_dataframe" + + * "flattened_dataframe" is best used for JSON structure typically found in data streams that contain + nested lists of dictionaries and a payload. 
For example: `{"data": [ columns ], "response": 200}` +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` +* payload_keys - The dictionary keys for the payload of the JSON, typically called "data" + or "payload". Defaults to ["data", "payload", "response"]. + + +AVROData +======== + +Data class for loading datasets of type AVRO. Can be specified by +passing in memory data or via a file path. Options pertaining the AVRO +may also be specified using the options dict parameter. + +`AVROData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "avro", "json", "flattened_dataframe" + + * "flattened_dataframe" is best used for AVROs with a JSON structure typically found in data streams that contain + nested lists of dictionaries and a payload. For example: `{"data": [ columns ], "response": 200}` +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` + +ParquetData +=========== + +Data class for loading datasets of type PARQUET. Can be specified by +passing in memory data or via a file path. Options pertaining the +PARQUET may also be specified using the options dict parameter. + +`ParquetData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "json" +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` +* sample_nrows - Random sampling to sample `"n"` rows out of a total of `"M"` rows. + Specified for how many rows to sample, default None. + +GraphData +========= + +Data Class for loading datasets of graph data. Currently takes CSV format, +further type formats will be supported. Can be specified by passing +in memory data (NetworkX Graph) or via a file path. Options pertaining the CSV file may also +be specified using the options dict parameter. 
Loads data from CSV into memory +as a NetworkX Graph. + +`GraphData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* delimiter - must be a string, for example `"delimiter": ","` +* data_format - must be a string, possible choices: "graph", "dataframe", "records" +* header - Define the header, for example + + * `"header": 'auto'` for auto detection + * `"header": None` for no header + * `"header": ` to specify the header row (0 based index) + +TextData +======== + +Data class for loading datasets of type TEXT. Can be specified by +passing in memory data or via a file path. Options pertaining the TEXT +may also be specified using the options dict parameter. + +`TextData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format: user selected format in which to return data. Currently only supports "text". +* samples_per_line - chunks by which to read in the specified dataset + + +Data Using a URL +================ + +Data class for loading datasets of any type using a URL. Specified by passing in +any valid URL that points to one of the valid data types. Options pertaining the +URL may also be specified using the options dict parameter. + +`Data(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* verify_ssl: must be a boolean string, choices: "True", "False". Set to "True" by default. + +Data Using an AWS S3 URI +======================== + +Data class for loading datasets from AWS S3 URI. Specified by passing in +any valid bucket path that points to one of the valid data types. + +`Data('s3a://my-bucket/file_name.txt')` + +Possible `options`: + +* `storage_options`: must be a dictionary where the keys for boto3 initialization are set + If `storage_options` is provided in `options`, the below variables are retrieved from the dictionary provided. Otherwise, will retrieve from `environment variables `_. 
+ + * `AWS_ACCESS_KEY_ID` + * `AWS_SECRET_ACCESS_KEY` + * `AWS_SESSION_TOKEN` + * `AWS_REGION` (default `us-east-1`) diff --git a/_docs/docs/source/examples.rst b/_docs/docs/source/examples.rst new file mode 100644 index 000000000..3637da6ac --- /dev/null +++ b/_docs/docs/source/examples.rst @@ -0,0 +1,24 @@ +.. _examples: + +Examples +******** + +These examples provide a more in-depth look into the details of the ``Data Profiler`` library. + +Basics +------ + +.. toctree:: + :maxdepth: 0 + + Overview of Data Profiler + Data Reader + Structured Profiler + Unstructured Profiler + Graph Profiler + Labeler + Adding Models to a Labeler Pipeline + Creating a Regex Labeler + Creating a ColumnName Labeler + Merge Profile List + Dataloader with Popmon Reports diff --git a/_docs/docs/source/graph_data_demo.nblink b/_docs/docs/source/graph_data_demo.nblink new file mode 100644 index 000000000..40408c3ae --- /dev/null +++ b/_docs/docs/source/graph_data_demo.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/graph_data_demo.ipynb" +} diff --git a/_docs/docs/source/graphs.rst b/_docs/docs/source/graphs.rst new file mode 100644 index 000000000..23c2d316b --- /dev/null +++ b/_docs/docs/source/graphs.rst @@ -0,0 +1,196 @@ +.. _reports: + +Graphs +****** + +Graph Your Data +=============== + +We can plot some of our data as seaborn histogram plots. Below will demonstrate how to do so and provide examples. + +The following plots are currently available to work directly with your profilers: + + * histogram (numeric columns only) + * missing values matrix + +Below shows how to do so with examples. + +What we need to import +~~~~~~~~~~~~~~~~~~~~~~ +.. code-block:: python + + from dataprofiler.reports import graphs + +The main functions that is used to plot histograms are in graphs. **You will also need the `dataprofiler[reports]` requirement to be installed**: + +.. 
code-block:: console + + pip install 'dataprofiler[reports]' + +Plotting from a StructuredProfiler class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With a StructuredProfiler class variable, we can specify what columns we want to be plotted, and plot them into histograms. + +.. code-block:: python + + graphs.plot_histograms(profiler, column_names, column_inds) + +These are what the variables mean: + + * **profiler** - StructuredProfiler class variable that contains the data we want + * **columns** - (Optional) The list of IntColumn or FloatColumn *names* we want to specifically plot. If specified, `column_inds` cannot be specified. + * **column_inds** - (Optional) The list of IntColumn or FloatColumn *indexes* we want to specifically plot. If specified, `column_names` cannot be specified. + + +Additionally, we can also plot the missing values matrix for a StructuredProfiler: + +.. code-block:: python + + graphs.plot_missing_values_matrix(profiler, ax, title) + +These are what the variables mean: + + * **profiler** - StructuredProfiler class variable that contains the data we want + * **ax** - (Optional) MatPlotLib Axes to plot the matrix within. + * **title** - (Optional) The title of the axes we want to define. + + +Plotting an individual IntColumn or FloatColumn +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With a column's Int or Float profile, we can plot their respective histograms. + +.. code-block:: python + + graphs.plot_col_histogram(column, axes, title) + +These are what the variables mean: + + * **column** - The IntColumn or FloatColumn we want to plot + * **axes** - (Optional) The MatPlotLib Axes to plot the histogram within. + * **title** - (Optional) The title of the axes we want to define. + + +Additionally, we can also plot the missing values bargraph for any column profile: + +.. 
code-block:: python + + graphs.plot_col_missing_values(profiler, ax, title) + +These are what the variables mean: + + * **profiler** - The StructuredColProfiler we want to plot + * **ax** - (Optional) MatPlotLib Axes to plot the matrix within. + * **title** - (Optional) The title of the axes we want to define. + +Examples +~~~~~~~~ + +Histograms +---------- + +1. This example demonstrates how we can take a StructuredProfiler class and plot histograms of the specified columns. + +.. code-block:: python + + import dataprofiler as dp + from dataprofiler.reports import graphs + + + data = [[1, 'a', 1.0], + [2, 'b', 2.2], + [3, 'c', 3.5], + [None, 'd', 10.0]] + profiler = dp.StructuredProfiler(data) + + # This will plot all IntColumn and FloatColumn as histograms (The first and last column). + fig = graphs.plot_histograms(profiler) + fig.show() + + # This will only plot the specified column, 0. + columns_names = [0] + fig = graphs.plot_histograms(profiler, columns_names) + fig.show() + +.. image:: _static/images/histogram_example_0.png + :alt: First Histogram Example Image + +.. image:: _static/images/histogram_example_1.png + :alt: Second Histogram Example Image + +2. This example demonstrates how we can plot a low level profiler. + +.. code-block:: python + + import pandas as pd + + from dataprofiler.profilers import IntColumn + from dataprofiler.reports import graphs + + + data = pd.Series([1, 2, 3], dtype=str) + profiler = IntColumn('example') + profiler.update(data) + + # Plot the axes + ax = graphs.plot_col_histogram(profiler) + + # get and show the figure of the plotted histogram + fig = ax.get_figure() + fig.show() + +.. image:: _static/images/histogram_example_2.png + :alt: Histogram Column Only Example Image + + +Missing Values Matrix +--------------------- + +1. This example demonstrates how we can take a StructuredProfiler class and plot a missing values matrix. + +.. 
code-block:: python + + import dataprofiler as dp + from dataprofiler.reports import graphs + + + data = pd.DataFrame( + [[None, '', 1.0, '1/2/2021'], + [3, None, 3.5, ''], + [1, None, 1.0, '2/5/2020'], + [None, 1, 10.0, '3/5/2020']], + columns=['integer', 'str', 'float', 'datetime'], + dtype=object + ) + profiler = dp.StructuredProfiler(data) + + # This will plot the missing values matrix for all columns. + fig = graphs.plot_missing_values_matrix(profiler) + fig.show() + +.. image:: _static/images/missing_value_matrix_example_0.png + :alt: Missing Values Matrix Example Image + +2. This example demonstrates how we can plot barchart of a column's missing values. + +.. code-block:: python + + import pandas as pd + + from dataprofiler.profilers.profile_builder import StructuredColProfiler + from dataprofiler.reports import graphs + + + data = pd.Series([1, 2, 3, None, None, 4], name='example', dtype=str) + profiler = StructuredColProfiler(data) + + # Plot the axes, can be a list of multiple columns + ax = graphs.plot_col_missing_values([profiler]) + + # get and show the figure of the plotted histogram + fig = ax.get_figure() + fig.show() + +.. image:: _static/images/missing_value_barchart_example_0.png + :alt: Missing Values Column Only Example Image \ No newline at end of file diff --git a/_docs/docs/source/index.rst b/_docs/docs/source/index.rst new file mode 100644 index 000000000..8225be28f --- /dev/null +++ b/_docs/docs/source/index.rst @@ -0,0 +1,479 @@ +.. _Data Profiler: + +==================================== +Data Profiler | What's in your data? +==================================== + +Purpose +======= + +The DataProfiler is a Python library designed to make data analysis, monitoring and **sensitive data detection** easy. + +Loading **Data** with a single command, the library automatically formats & loads files into a DataFrame. **Profiling** the Data, the library identifies the schema, statistics, entities and more. 
Data Profiles can then be used in downstream applications or reports. + +The Data Profiler comes with a cutting edge pre-trained deep learning model, used to efficiently identify **sensitive data** (or **PII**). If customization is needed, it's easy to add new entities to the existing pre-trained model or insert a new pipeline for entity recognition. + +The best part? Getting started only takes a few lines of code (`Example CSV`_): + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text + print(data.data.head(5)) # Access data directly via a compatible Pandas DataFrame + + profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc + readable_report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(readable_report, indent=4)) + + +To install the full package from pypi: + +.. code-block:: console + + pip install DataProfiler[ml] + +If the ML requirements are too strict (say, you don't want to install tensorflow), you can install a slimmer package. The slimmer package disables the default sensitive data detection / entity recognition (labler) + +Install from pypi: + +.. code-block:: console + + pip install DataProfiler + +If you have suggestions or find a bug, please open an `issue`_. + +Visit the :ref:`API` to explore Data Profiler's terminology. + + +What is a Data Profile? +======================= + +In the case of this library, a data profile is a dictionary containing statistics and predictions about the underlying dataset. There are "global statistics" or `global_stats`, which contain dataset level data and there are "column/row level statistics" or `data_stats` (each column is a new key-value entry). + +The format for a structured profile is below: + +.. 
code-block:: python + + "global_stats": { + "samples_used": int, + "column_count": int, + "row_count": int, + "row_has_null_ratio": float, + "row_is_null_ratio": float, + "unique_row_ratio": float, + "duplicate_row_count": int, + "file_type": string, + "encoding": string, + "correlation_matrix": list[list[int]], (*) + "chi2_matrix": list[list[float]], + "profile_schema": dict[string, list[int]] + }, + "data_stats": [ + { + "column_name": string, + "data_type": string, + "data_label": string, + "categorical": bool, + "order": string, + "samples": list[str], + "statistics": { + "sample_size": int, + "null_count": int, + "null_types": list[string], + "null_types_index": dict[string, list[int]], + "data_type_representation": dict[string, list[string]], + "min": [null, float], + "max": [null, float], + "sum": float, + "mode": list[float], + "median": float, + "median_absolute_deviation": float, + "mean": float, + "variance": float, + "stddev": float, + "skewness": float, + "kurtosis": float, + "num_zeros": int, + "num_negatives": int, + "histogram": { + "bin_counts": list[int], + "bin_edges": list[float], + }, + "quantiles": { + int: float + }, + "vocab": list[char], + "avg_predictions": dict[string, float], + "data_label_representation": dict[string, float], + "categories": list[str], + "unique_count": int, + "unique_ratio": float, + "categorical_count": dict[string, int], + "gini_impurity": float, + "unalikeability": float, + "precision": { + 'min': int, + 'max': int, + 'mean': float, + 'var': float, + 'std': float, + 'sample_size': int, + 'margin_of_error': float, + 'confidence_level': float + }, + "times": dict[string, float], + "format": string + }, + "null_replication_metrics": { + "class_prior": list[int], + "class_sum": list[list[int]], + "class_mean": list[list[int]] + } + } + ] + +(*) Currently the correlation matrix update is toggled off. It will be reset in a later update. Users can still use it as desired with the is_enable option set to True. 
+ +The format for an unstructured profile is below: + +.. code-block:: python + + "global_stats": { + "samples_used": int, + "empty_line_count": int, + "file_type": string, + "encoding": string, + "memory_size": float, # in MB + }, + "data_stats": { + "data_label": { + "entity_counts": { + "word_level": dict[string, int], + "true_char_level": dict[string, int], + "postprocess_char_level": dict[string, int] + }, + "entity_percentages": { + "word_level": dict[string, float], + "true_char_level": dict[string, float], + "postprocess_char_level": dict[string, float] + }, + "times": dict[string, float] + }, + "statistics": { + "vocab": list[char], + "vocab_count": dict[string, int], + "words": list[string], + "word_count": dict[string, int], + "times": dict[string, float] + } + } + +The format for a graph profile is below: + +.. code-block:: python + + "num_nodes": int, + "num_edges": int, + "categorical_attributes": list[string], + "continuous_attributes": list[string], + "avg_node_degree": float, + "global_max_component_size": int, + "continuous_distribution": { + "": { + "name": string, + "scale": float, + "properties": list[float, np.array] + }, + "": None, + }, + "categorical_distribution": { + "": None, + "": { + "bin_counts": list[int], + "bin_edges": list[float] + }, + }, + "times": dict[string, float] + +Supported Data Formats +~~~~~~~~~~~~~~~~~~~~~~ + +* Any delimited file (CSV, TSV, etc.) +* JSON object +* Avro file +* Parquet file +* Text file +* Pandas DataFrame +* A URL that points to one of the supported file types above + + +Data Labels +~~~~~~~~~~~ + +*Data Labels* are determined per cell for structured data (column/row when the *profiler* is used) or at the character level for unstructured data. + +* UNKNOWN +* ADDRESS +* BAN (bank account number, 10-18 digits) +* CREDIT_CARD +* EMAIL_ADDRESS +* UUID +* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.) 
+* IPV4 +* IPV6 +* MAC_ADDRESS +* PERSON +* PHONE_NUMBER +* SSN +* URL +* US_STATE +* DRIVERS_LICENSE +* DATE +* TIME +* DATETIME +* INTEGER +* FLOAT +* QUANTITY +* ORDINAL + + +Get Started +=========== + +Load a File +~~~~~~~~~~~ + +The profiler should automatically identify the file type and load the data into a `Data Class`. + +Along with other attributes the `Data class` enables structured data to be accessed via a valid Pandas DataFrame. + +.. code-block:: python + + # Load a csv file, return a CSVData object + csv_data = Data('your_file.csv') + + # Print the first 10 rows of the csv file + print(csv_data.data.head(10)) + + # Load a parquet file, return a ParquetData object + parquet_data = Data('your_file.parquet') + + # Sort the data by the name column + parquet_data.data.sort_values(by='name', inplace=True) + + # Print the sorted first 10 rows of the parquet data + print(parquet_data.data.head(10)) + + +If the file type is not automatically identified (rare), you can specify them +specifically, see section Data Readers. + +Profile a File +~~~~~~~~~~~~~~ + +This example uses a CSV file, but CSV, JSON, Avro, Parquet or Text should also work. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load file (CSV should be automatically identified) + data = Data("your_file.csv") + + # Profile the dataset + profile = Profiler(data) + + # Generate a report and use json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + + # Print the report + print(json.dumps(report, indent=4)) + +Updating Profiles +~~~~~~~~~~~~~~~~~ + +Currently, the data profiler is equipped to update its profile in batches. + +.. 
code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load and profile a CSV file + data = Data("your_file.csv") + profile = Profiler(data) + + # Update the profile with new data: + new_data = Data("new_data.csv") + profile.update_profile(new_data) + + # Print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + + +Merging Profiles +~~~~~~~~~~~~~~~~ + +If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator. + +This also enables profiles to be determined in a distributed manner. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file with a schema + data1 = Data("file_a.csv") + profile1 = Profiler(data1) + + # Load another CSV file with the same schema + data2 = Data("file_b.csv") + profile2 = Profiler(data2) + + profile3 = profile1 + profile2 + + # Print the report using json to prettify. + report = profile3.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + +Profile a Pandas DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import pandas as pd + import dataprofiler as dp + import json + + my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]]) + profile = dp.Profiler(my_dataframe) + + # print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + + # read a specified column, in this case it is labeled 0: + print(json.dumps(report["data stats"][0], indent=4)) + + +Unstructured Profiler +~~~~~~~~~~~~~~~~~~~~~ + +In addition to the structured profiler, the Data Profiler provides unstructured +profiling for the TextData object or string. 
Unstructured profiling also works +with list(string), pd.Series(string) or pd.DataFrame(string) given profiler_type +option specified as `unstructured`. Below is an example of unstructured profile +with a text file. + +.. code-block:: python + + import dataprofiler as dp + import json + my_text = dp.Data('text_file.txt') + profile = dp.Profiler(my_text) + + # print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + +Another example of unstructured profile with pd.Series of string is given as below + +.. code-block:: python + + import dataprofiler as dp + import pandas as pd + import json + + text_data = pd.Series(['first string', 'second string']) + profile = dp.Profiler(text_data, profiler_type="unstructured") + + # print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + + +Graph Profiler +~~~~~~~~~~~~~~ + +DataProfiler also provides the ability to profile graph data from a csv file. Below is an example of the graph profiler with a graph data csv file: + +.. code-block:: python + + import dataprofiler as dp + import pprint + + my_graph = dp.Data('graph_file.csv') + profile = dp.Profiler(my_graph) + + # print the report using pretty print (json dump does not work on numpy array values inside dict) + report = profile.report() + printer = pprint.PrettyPrinter(sort_dicts=False, compact=True) + printer.pprint(report) + + +Specifying a Filetype or Delimiter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example of specifying a CSV data type, with a `,` delimiter. +In addition, it utilizes only the first 10,000 rows. + +.. 
code-block:: python + + import json + import os + from dataprofiler import Data, Profiler + from dataprofiler.data_readers.csv_data import CSVData + + # Load a CSV file, with "," as the delimiter + data = CSVData("your_file.csv", options={"delimiter": ","}) + + # Split the data, such that only the first 10,000 rows are used + data = data.data[0:10000] + + # Read in profile and print results + profile = Profiler(data) + print(json.dumps(profile.report(report_options={"output_format":"pretty"}), indent=4)) + + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: Getting Started: + + Intro + install.rst + data_readers.rst + profiler.rst + data_labeling.rst + graphs.rst + architecture.rst + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: User Guide: + + examples.rst + API.rst + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: Community: + + roadmap.rst + Changelog + Feedback + GitHub + Contributing + +.. _Example CSV: https://raw.githubusercontent.com/capitalone/DataProfiler/main/dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv +.. _issue: https://github.com/capitalone/DataProfiler/issues/new/choose + + + diff --git a/_docs/docs/source/install.rst b/_docs/docs/source/install.rst new file mode 100644 index 000000000..bdf4c3bb4 --- /dev/null +++ b/_docs/docs/source/install.rst @@ -0,0 +1,145 @@ +.. _install: + +Install +******* + +To install the full package from pypi: + +.. code-block:: console + + pip install DataProfiler[ml] + +If the ML requirements are too strict (say, you don't want to install +tensorflow), you can install a slimmer package. The slimmer package disables +the default sensitive data detection / entity recognition (labler) + +Install from pypi: + +.. code-block:: console + + pip install DataProfiler + +Snappy Installation +=================== + +This is required to profile parquet/avro datasets + +MacOS (intel chip) with homebrew: + +.. 
code-block:: console + + brew install snappy && CPPFLAGS="-I/usr/local/include -L/usr/local/lib" pip install python-snappy + + +MacOS (apple chip) with homebrew: + +.. code-block:: console + + brew install snappy && CPPFLAGS="-I/opt/homebrew/include -L/opt/homebrew/lib" pip install python-snappy + + +Linux install: + +.. code-block:: console + + sudo apt-get -y install libsnappy-dev + + +Build From Scratch +================== + +NOTE: Installation for python3 + +virtualenv install: + +.. code-block:: console + + python3 -m pip install virtualenv + + +Setup virtual env: + +.. code-block:: console + + python3 -m virtualenv --python=python3 venv3 + source venv3/bin/activate + + +Install requirements: + +.. code-block:: console + + pip3 install -r requirements.txt + +Install labeler dependencies: + +.. code-block:: console + + pip3 install -r requirements-ml.txt + + +Install via the repo -- Build setup.py and install locally: + +.. code-block:: console + + python3 setup.py sdist bdist bdist_wheel + pip3 install dist/DataProfiler*-py3-none-any.whl + + +If you see: + +.. code-block:: console + + ERROR: Double requirement given:dataprofiler==X.Y.Z from dataprofiler/dist/DataProfiler-X.Y.Z-py3-none-any.whl (already in dataprofiler==X2.Y2.Z2 from dataprofiler/dist/DataProfiler-X2.Y2.Z2-py3-none-any.whl, name='dataprofiler') + +This means that you have multiple versions of the DataProfiler distribution +in the dist folder. +To resolve, either remove the older one or delete the folder and rerun the steps +above. + +Install via github: + +.. code-block:: console + + pip3 install git+https://github.com/capitalone/dataprofiler.git#egg=dataprofiler + + + +Testing +======= + +For testing, install test requirements: + +.. code-block:: console + + pip3 install -r requirements-test.txt + + +To run all unit tests, use: + +.. code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py" + + +To run file of unit tests, use form: + +.. 
code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest discover -p test_profile_builder.py + + +To run a file with Pytest use: + +.. code-block:: console + + DATAPROFILER_SEED=0 pytest dataprofiler/tests/data_readers/test_csv_data.py -v + + +To run individual of unit test, use form: + +.. code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest dataprofiler.tests.profilers.test_profile_builder.TestProfiler + + diff --git a/_docs/docs/source/labeler.nblink b/_docs/docs/source/labeler.nblink new file mode 100644 index 000000000..f862443fd --- /dev/null +++ b/_docs/docs/source/labeler.nblink @@ -0,0 +1,6 @@ +{ + "path": "../../../examples/labeler.ipynb", + "extra-media": [ + "../../../examples/DL-Flowchart.png" + ] +} \ No newline at end of file diff --git a/_docs/docs/source/merge_profile_list.nblink b/_docs/docs/source/merge_profile_list.nblink new file mode 100644 index 000000000..39102658b --- /dev/null +++ b/_docs/docs/source/merge_profile_list.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/merge_profile_list.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/modules.rst b/_docs/docs/source/modules.rst new file mode 100644 index 000000000..0593459df --- /dev/null +++ b/_docs/docs/source/modules.rst @@ -0,0 +1,7 @@ +dataprofiler +============ + +.. 
toctree:: + :maxdepth: 4 + + dataprofiler diff --git a/_docs/docs/source/overview.nblink b/_docs/docs/source/overview.nblink new file mode 100644 index 000000000..4c118878e --- /dev/null +++ b/_docs/docs/source/overview.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/intro_data_profiler.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/popmon_dp_loader_example.nblink b/_docs/docs/source/popmon_dp_loader_example.nblink new file mode 100644 index 000000000..1a288a318 --- /dev/null +++ b/_docs/docs/source/popmon_dp_loader_example.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/popmon_dp_loader_example.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/profiler.rst b/_docs/docs/source/profiler.rst new file mode 100644 index 000000000..56d16a274 --- /dev/null +++ b/_docs/docs/source/profiler.rst @@ -0,0 +1,965 @@ +.. _profiler: + +Profiler +******** + +Profile Your Data +================= + +Profiling your data is easy. Just use the data reader, send the data to the +profiler, and print out the report. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text + + profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc + + readable_report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(readable_report, indent=4)) + +If the data is structured, the profile will return global statistics as well as +column by column statistics. The vast amount of statistics are listed on the +intro page. + +Load a File +~~~~~~~~~~~ + +The profiler should automatically identify the file type and load the data into a `Data Class`. + +Along with other attributtes the `Data class` enables structured data to be accessed via a valid Pandas DataFrame. + +.. 
code-block:: python + + # Load a csv file, return a CSVData object + csv_data = Data('your_file.csv') + + # Print the first 10 rows of the csv file + print(csv_data.data.head(10)) + + # Load a parquet file, return a ParquetData object + parquet_data = Data('your_file.parquet') + + # Sort the data by the name column + parquet_data.data.sort_values(by='name', inplace=True) + + # Print the sorted first 10 rows of the parquet data + print(parquet_data.data.head(10)) + + +If the file type is not automatically identified (rare), you can specify them +specifically, see section Data Readers. + +Profile a File +~~~~~~~~~~~~~~ + +Example uses a CSV file for example, but CSV, JSON, Avro or Parquet should also work. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load file (CSV should be automatically identified) + data = Data("your_file.csv") + + # Profile the dataset + profile = Profiler(data) + + # Generate a report and use json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + + # Print the report + print(json.dumps(report, indent=4)) + +Updating Profiles +~~~~~~~~~~~~~~~~~ + +Currently, the data profiler is equipped to update its profile in batches. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load and profile a CSV file + data = Data("your_file.csv") + profile = Profiler(data) + + # Update the profile with new data: + new_data = Data("new_data.csv") + profile.update_profile(new_data) + + # Print the report using json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + +Merging Profiles +~~~~~~~~~~~~~~~~ + +If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator. + +This also enables profiles to be determined in a distributed manner. + +.. 
code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file with a schema + data1 = Data("file_a.csv") + profile1 = Profiler(data1) + + # Load another CSV file with the same schema + data2 = Data("file_b.csv") + profile2 = Profiler(data2) + + profile3 = profile1 + profile2 + + # Print the report using json to prettify. + report = profile3.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + +Profile Differences +~~~~~~~~~~~~~~~~~~~ + +Profile differences take two profiles and find the differences +between them. Create the difference report like this: + +.. code-block:: python + + from dataprofiler import Data, Profiler + + # Load a CSV file + data1 = Data("file_a.csv") + profile1 = Profiler(data1) + + # Load another CSV file + data2 = Data("file_b.csv") + profile2 = Profiler(data2) + + diff_report = profile1.diff(profile2) + print(diff_report) + +The `.diff()` operation is available between two profiles, although there are different +outputs depending on the type of profile being differenced. For example, for numerical +column profiles (e.g. integers and floats), three valuable calculations that +`.diff()` returns are `t-test`, `chi2-test`, and `psi` (Population Stability Index) +for understanding distributional changes. + +The difference report contains a dictionary that mirrors the profile report. +Each data type has its own difference: + +* **Int/Float** - One profile subtracts the value from the other. 
+ +* **String** - The strings will be shown in a list: + + - [profile1 str, profile2 str] +* **List** - A list of 3 will be returned showing the unique values of + each profile and the shared values: + + - [profile 1 unique values, shared values, profile 2 unique values] +* **Dict** - Some dictionaries with varied keys will also return a list + of three in the format: + + - [profile 1 unique key-values, shared key differences, profile 2 unique key-values] + +Otherwise, when no differences occur: + +* **Any Type No Differences** - A string will report: "unchanged". + +Below is the structured difference report: + +.. code-block:: python + + { + 'global_stats': { + 'file_type': [str, str], + 'encoding': [str, str], + 'samples_used': int, + 'column_count': int, + 'row_count': int, + 'row_has_null_ratio': float, + 'row_is_null_ratio': float, + 'unique_row_ratio': float, + 'duplicate_row_count': int, + 'correlation_matrix': list[list[float]], + 'chi2_matrix': list[list[float]], + 'profile_schema': list[dict[str, int]] + }, + 'data_stats': [{ + 'column_name': str, + 'data_type': [str, str], + 'data_label': [list[str], list[str], list[str]], + 'categorical': [str, str], + 'order': [str, str], + 'statistics': { + 'min': float, + 'max': float, + 'sum': float, + 'mean': float, + 'median': float, + 'mode': [list[float], list[float], list[float]], + 'median_absolute_deviation': float, + 'variance': float, + 'stddev': float, + 't-test': { + 't-statistic': float, + 'conservative': {'deg_of_free': int, + 'p-value': float}, + 'welch': {'deg_of_free': float, + 'p-value': float}}, + 'psi': float, + "chi2-test": { + "chi2-statistic": float, + "deg_of_free": int, + "p-value": float + }, + 'unique_count': int, + 'unique_ratio': float, + 'categories': [list[str], list[str], list[str]], + 'gini_impurity': float, + 'unalikeability': float, + 'categorical_count': [dict[str, int], dict[str, int], dict[str, int]], + 'avg_predictions': [dict[str, float]], + 'label_representation': [dict[str, 
float]], + 'sample_size': int, + 'null_count': int, + 'null_types': [list[str], list[str], list[str]], + 'null_types_index': [dict[str, int], dict[str, int], dict[str, int]], + 'data_type_representation': [dict[str, float]] + }, + "null_replication_metrics": { + "class_prior": list[int], + "class_sum": list[list[int]], + "class_mean": list[list[int]] + } + } + +Below is the unstructured difference report: + +.. code-block:: python + + { + 'global_stats': { + 'file_type': [str, str], + 'encoding': [str, str], + 'samples_used': int, + 'empty_line_count': int, + 'memory_size': float + }, + 'data_stats': { + 'data_label': { + 'entity_counts': { + 'word_level': dict[str, int], + 'true_char_level': dict[str, int], + 'postprocess_char_level': dict[str, int] + }, + 'entity_percentages': { + 'word_level': dict[str, float], + 'true_char_level': dict[str, float], + 'postprocess_char_level': dict[str, float] + } + }, + 'statistics': { + 'vocab': [list[str], list[str], list[str]], + 'vocab_count': [dict[str, int], dict[str, int], dict[str, int]], + 'words': [list[str], list[str], list[str]], + 'word_count': [dict[str, int], dict[str, int], dict[str, int]] + } + } + } + + +Saving and Loading a Profile +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The profiles can easily be saved and loaded as shown below: + +**NOTE: Json saving and loading only supports Structured Profiles currently.** + +There are two save/load methods: + +* **Pickle save/load** + + * Save a profile as a `.pkl` file. + * Load a `.pkl` file as a profile object. + +.. 
code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file, with "," as the delimiter + data = Data("your_file.csv") + + # Read data into profile + profile = Profiler(data) + + # save structured profile to pkl file + profile.save(filepath="my_profile.pkl") + + # load pkl file to structured profile + loaded_pkl_profile = Profiler.load(filepath="my_profile.pkl") + + print(json.dumps(loaded_pkl_profile.report(report_options={"output_format": "compact"}), + indent=4)) + +* **Json save/load** + + * Save a profile as a human-readable `.json` file. + * Load a `.json` file as a profile object. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file, with "," as the delimiter + data = Data("your_file.csv") + + # Read data into profile + profile = Profiler(data) + + # save structured profile to json file + profile.save(filepath="my_profile.json", save_method="json") + + # load json file to structured profile + loaded_json_profile = Profiler.load(filepath="my_profile.json", load_method="json") + + print(json.dumps(loaded_json_profile.report(report_options={"output_format": "compact"}), + indent=4)) + + +Structured vs Unstructured Profiles +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using the profiler, the data profiler will automatically infer whether to +create the structured profile or the unstructured profile. However, you can be +explicit as shown below: + +.. 
code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Creating a structured profile + data1 = Data("normal_csv_file.csv") + structured_profile = Profiler(data1, profiler_type="structured") + + structured_report = structured_profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(structured_report, indent=4)) + + # Creating an unstructured profile + data2 = Data("normal_text_file.txt") + unstructured_profile = Profiler(data2, profiler_type="unstructured") + + unstructured_report = unstructured_profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(unstructured_report, indent=4)) + + +Setting the Sample Size +~~~~~~~~~~~~~~~~~~~~~~~ + +There are two ways to set sample size in a profile: samples_per_update and +min_true_samples. Samples_per_update takes an integer as the exact amount that +will be sampled. Min_true_samples will set the minimum amount of samples that +are not null. For example: + +.. code-block:: python + + from dataprofiler import Profiler + + sample_array = [1.0, NULL, 2.0] + profile = Profiler(sample_array, samples_per_update=2) + +The first two samples (1.0 and NULL) are used for the statistical analysis. + +In contrast, if we also set min_true_samples to 2 then the Data Reader will +continue to read until the minimum true samples were found for the given column. +For example: + +.. code-block:: python + + from dataprofiler import Profiler + + sample_array = [1.0, NULL, 2.0] + profile = Profiler(sample_array, samples_per_update=2, min_true_samples=2) + +This will use all samples in the statistical analysis until the number of "true" +(non-NULL) values is reached. Both min_true_samples and +samples_per_update conditions must be met. In this case, the profile will grab +the first two samples (1.0 and NULL) to satisfy the samples_per_update, and then +it will grab the first two VALID samples (1.0 and 2.0) to satisfy the +min_true_samples. 
+ +Profile a Pandas DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import pandas as pd + import dataprofiler as dp + import json + + my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]]) + profile = dp.Profiler(my_dataframe) + + # print the report using json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + # read a specified column, in this case it is labeled 0: + print(json.dumps(report["data stats"][0], indent=4)) + + +Specifying a Filetype or Delimiter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example of specifying a CSV data type, with a `,` delimiter. +In addition, it utilizes only the first 10,000 rows. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + from dataprofiler.data_readers.csv_data import CSVData + + # Load a CSV file, with "," as the delimiter + data = CSVData("your_file.csv", options={"delimiter": ","}) + + # Split the data, such that only the first 10,000 rows are used + data = data.data[0:10000] + + # Read in profile and print results + profile = Profiler(data) + print(json.dumps(profile.report(report_options={"output_format": "pretty"}), indent=4)) + +Setting Profiler Seed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example of specifying a seed for reproducibility. + +.. 
code-block:: python + + import dataprofiler as dp + + # Set seed to non-negative integer value or None + dp.set_seed(0) + + +Profile Statistic Descriptions +============================== + +Structured Profile +~~~~~~~~~~~~~~~~~~ + +**global_stats**: + +* samples_used - number of input data samples used to generate this profile +* column_count - the number of columns contained in the input dataset +* row_count - the number of rows contained in the input dataset +* row_has_null_ratio - the proportion of rows that contain at least one null value to the total number of rows +* row_is_null_ratio - the proportion of rows that are fully comprised of null values (null rows) to the total number of rows +* unique_row_ratio - the proportion of distinct rows in the input dataset to the total number of rows +* duplicate_row_count - the number of rows that occur more than once in the input dataset +* file_type - the format of the file containing the input dataset (ex: .csv) +* encoding - the encoding of the file containing the input dataset (ex: UTF-8) +* correlation_matrix - matrix of shape `column_count` x `column_count` containing the correlation coefficients between each column in the dataset +* chi2_matrix - matrix of shape `column_count` x `column_count` containing the chi-square statistics between each column in the dataset +* profile_schema - a description of the format of the input dataset labeling each column and its index in the dataset + * string - the label of the column in question and its index in the profile schema +* times - the duration of time it took to generate the global statistics for this dataset in milliseconds + +**data_stats**: + +* column_name - the label/title of this column in the input dataset +* data_type - the primitive python data type that is contained within this column +* data_label - the label/entity of the data in this column as determined by the Labeler component +* categorical - 'true' if this column contains categorical data +* order - 
the way in which the data in this column is ordered, if any, otherwise “random” +* samples - a small subset of data entries from this column +* statistics - statistical information on the column + * sample_size - number of input data samples used to generate this profile + * null_count - the number of null entries in the sample + * null_types - a list of the different null types present within this sample + * null_types_index - a dict containing each null type and a respective list of the indices that it is present within this sample + * data_type_representation - the percentage of samples used identifying as each data_type + * min - minimum value in the sample + * max - maximum value in the sample + * mode - mode of the entries in the sample + * median - median of the entries in the sample + * median_absolute_deviation - the median absolute deviation of the entries in the sample + * sum - the total of all sampled values from the column + * mean - the average of all entries in the sample + * variance - the variance of all entries in the sample + * stddev - the standard deviation of all entries in the sample + * skewness - the statistical skewness of all entries in the sample + * kurtosis - the statistical kurtosis of all entries in the sample + * num_zeros - the number of entries in this sample that have the value 0 + * num_negatives - the number of entries in this sample that have a value less than 0 + * histogram - contains histogram relevant information + * bin_counts - the number of entries within each bin + * bin_edges - the thresholds of each bin + * quantiles - the value at each percentile in the order they are listed based on the entries in the sample + * vocab - a list of the characters used within the entries in this sample + * avg_predictions - average of the data label prediction confidences across all data points sampled + * categories - a list of each distinct category within the sample if `categorical` = 'true' + * unique_count - the number of 
distinct entries in the sample + * unique_ratio - the proportion of the number of distinct entries in the sample to the total number of entries in the sample + * categorical_count - number of entries sampled for each category if `categorical` = 'true' + * gini_impurity - measure of how often a randomly chosen element from the set would be incorrectly labeled if it was randomly labeled according to the distribution of labels in the subset + * unalikeability - a value denoting how frequently entries differ from one another within the sample + * precision - a dict of statistics with respect to the number of digits in a number for each sample + * times - the duration of time it took to generate this sample's statistics in milliseconds + * format - list of possible datetime formats +* null_replication_metrics - statistics of data partitioned based on whether column value is null (index 1 of lists referenced by dict keys) or not (index 0) + * class_prior - a list containing probability of a column value being null and not null + * class_sum - a list containing sum of all other rows based on whether column value is null or not + * class_mean - a list containing mean of all other rows based on whether column value is null or not + +Unstructured Profile +~~~~~~~~~~~~~~~~~~~~ + +**global_stats**: + +* samples_used - number of input data samples used to generate this profile +* empty_line_count - the number of empty lines in the input data +* file_type - the file type of the input data (ex: .txt) +* encoding - file encoding of the input data file (ex: UTF-8) +* memory_size - size of the input data in MB +* times - duration of time it took to generate this profile in milliseconds + +**data_stats**: + +* data_label - labels and statistics on the labels of the input data + * entity_counts - the number of times a specific label or entity appears inside the input data + * word_level - the number of words counted within each label or entity + * true_char_level - the number of 
characters counted within each label or entity as determined by the model + * postprocess_char_level - the number of characters counted within each label or entity as determined by the postprocessor + * entity_percentages - the percentages of each label or entity within the input data + * word_level - the percentage of words in the input data that are contained within each label or entity + * true_char_level - the percentage of characters in the input data that are contained within each label or entity as determined by the model + * postprocess_char_level - the percentage of characters in the input data that are contained within each label or entity as determined by the postprocessor + * times - the duration of time it took for the data labeler to predict on the data +* statistics - statistics of the input data + * vocab - a list of each character in the input data + * vocab_count - the number of occurrences of each distinct character in the input data + * words - a list of each word in the input data + * word_count - the number of occurrences of each distinct word in the input data + * times - the duration of time it took to generate the vocab and words statistics in milliseconds + +Graph Profile +~~~~~~~~~~~~~~~~~~ + +* num_nodes - number of nodes in the graph +* num_edges - number of edges in the graph +* categorical_attributes - list of categorical edge attributes +* continuous_attributes - list of continuous edge attributes +* avg_node_degree - average degree of nodes in the graph +* global_max_component_size: size of the global max component + +**continuous_distribution**: + +* : name of N-th edge attribute in list of attributes + * name - name of distribution for attribute + * scale - negative log likelihood used to scale and compare distributions + * properties - list of statistical properties describing the distribution + * [shape (optional), loc, scale, mean, variance, skew, kurtosis] + +**categorical_distribution**: + +* : name of N-th edge attribute in 
list of attributes + * bin_counts: counts in each bin of the distribution histogram + * bin_edges: edges of each bin of the distribution histogram + +* times - duration of time it took to generate this profile in milliseconds + +Profile Options +=============== + +The data profiler accepts several options to toggle on and off +features. The 8 columns (int options, float options, datetime options, +text options, order options, category options, data labeler options) can be +enabled or disabled. By default, all options are toggled on. Below is an example +of how to alter these options. Options shared by structured and unstructured options +must be specified as structured or unstructured when setting (ie. datalabeler options). + + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler, ProfilerOptions + + # Load and profile a CSV file + data = Data("your_file.csv") + profile_options = ProfilerOptions() + + #All of these are different examples of adjusting the profile options + + # Options can be toggled directly like this: + profile_options.structured_options.text.is_enabled = False + profile_options.structured_options.text.vocab.is_enabled = True + profile_options.structured_options.int.variance.is_enabled = True + profile_options.structured_options.data_labeler.data_labeler_dirpath = \ + "Wheres/My/Datalabeler" + profile_options.structured_options.data_labeler.is_enabled = False + + # A dictionary can be sent in to set the properties for all the options + profile_options.set({"structured_options.data_labeler.is_enabled": False, "min.is_enabled": False}) + + # Specific columns can be set/disabled/enabled in the same way + profile_options.structured_options.text.set({"max.is_enabled":True, + "variance.is_enabled": True}) + + # numeric stats can be turned off/on entirely + profile_options.set({"is_numeric_stats_enabled": False}) + profile_options.set({"int.is_numeric_stats_enabled": False}) + + profile = Profiler(data, 
options=profile_options) + + # Print the report using json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + +Below is an breakdown of all the options. + +* **ProfilerOptions** - The top-level options class that contains options for the Profiler class + + * **presets** - A pre-configured mapping of a string name to group of options: + + * **default is None** + + * **"complete"** + + .. code-block:: python + + options = ProfilerOptions(presets="complete") + + * **"data_types"** + + .. code-block:: python + + options = ProfilerOptions(presets="data_types") + + * **"numeric_stats_disabled"** + + .. code-block:: python + + options = ProfilerOptions(presets="numeric_stats_disabled") + + * **"lower_memory_sketching"** + + .. code-block:: python + + options = ProfilerOptions(presets="lower_memory_sketching") + + * **structured_options** - Options responsible for all structured data + + * **multiprocess** - Option to enable multiprocessing. If on, multiprocessing is toggled on if the dataset contains more than 750,000 rows or more than 20 columns. + Automatically selects the optimal number of pooling processes to utilize based on system constraints when toggled on. + + * is_enabled - (Boolean) Enables or disables multiprocessing + + * **sampling_ratio** - A percentage, as a decimal, ranging from greater than 0 to less than or equal to 1 indicating how much input data to sample. Default value set to 0.2. + + * **int** - Options for the integer columns + + * is_enabled - (Boolean) Enables or disables the integer operations + + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the number of modes to return if multiple exist. 
Default returns max 5 modes. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * median_abs_deviation - Finds median absolute deviation of all values in a column + + * is_enabled - (Boolean) Enables or disables median absolute deviation + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. + methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * num_quantiles - (Int) Number of quantiles to bin the data. + Default value is set to 1,000 quantiles. 
+ * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **float** - Options for the float columns + + * is_enabled - (Boolean) Enables or disables the float operations + * precision - Finds the precision (significant figures) within the column + + * is_enabled - (Boolean) Enables or disables precision + * sample_ratio - (Float) The ratio of 0 to 1 how much data (identified as floats) to utilize as samples in determining precision + + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * median_abs_deviation - Finds median absolute deviation of all values in a column + + * is_enabled - (Boolean) Enables or disables median absolute deviation + * is_numeric_stats_enabled - (Boolean) enable or disable all numeric stats + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or 
disables bias correction + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. + methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * num_quantiles - (Int) Number of quantiles to bin the data. + Default value is set to 1,000 quantiles. + * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **text** - Options for the text columns + + * is_enabled - (Boolean) Enables or disables the text operations + * vocab - Finds all the unique characters used in a column + + * is_enabled - (Boolean) Enables or disables vocab + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes. 
+ * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * median_abs_deviation - Finds median absolute deviation of all values in a column + + * is_enabled - (Boolean) Enables or disables median absolute deviation + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * is_numeric_stats_enabled - (Boolean) enable or disable all numeric stats + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. + methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * num_quantiles - (Int) Number of quantiles to bin the data. + Default value is set to 1,000 quantiles. 
+ * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **datetime** - Options for the datetime columns + + * is_enabled - (Boolean) Enables or disables the datetime operations + * **order** - Options for the order columns + + * is_enabled - (Boolean) Enables or disables the order operations + * **category** - Options for the category columns + + * is_enabled - (Boolean) Enables or disables the category operations + * top_k_categories - (int) Number of categories to be displayed when reporting + * max_sample_size_to_check_stop_condition - (int) The maximum sample size before categorical stop conditions are checked + * stop_condition_unique_value_ratio - (float) The highest ratio of unique values to dataset size that is to be considered a categorical type + * cms - (Boolean) Enables or Disables the use of count min sketch / heavy hitters for approximate frequency counts + * cms_confidence - (float) Defines the number of hashes used in CMS, default 0.95 + * cms_relative_error - (float) Defines the number of buckets used in CMS, default 0.01 + * cms_max_num_heavy_hitters - (int) The value used to define the threshold for minimum frequency required by a category to be counted + * **data_labeler** - Options for the data labeler columns + + * is_enabled - (Boolean) Enables or disables the data labeler operations + * data_labeler_dirpath - (String) Directory path to data labeler + * data_labeler_object - (BaseDataLabeler) Datalabeler to replace + the default labeler + * max_sample_size - (Int) The max number of samples for the data + labeler + * **correlation** - Option set for correlation profiling + * is_enabled - (Boolean) Enables or disables performing correlation profiling + * columns - Columns considered to calculate correlation + * **row_statistics** - (Boolean) Option to enable/disable row statistics calculations + + * unique_count - (UniqueCountOptions) Option to enable/disable unique row count calculations + + * is_enabled - (Bool) Enables or 
disables options for unique row count + * hashing_method - (String) Property to specify row hashing method ("full" | "hll") + * hll - (HyperLogLogOptions) Options for alternative method of estimating unique row count (activated when `hll` is the selected hashing_method) + + * seed - (Int) Used to set HLL hashing function seed + * register_count - (Int) Number of registers is equal to 2^register_count + + * null_count - (Boolean) Option to enable/disable functionalities for row_has_null_ratio and row_is_null_ratio + * **chi2_homogeneity** - Options for the chi-squared test matrix + + * is_enabled - (Boolean) Enables or disables performing chi-squared tests for homogeneity between the categorical columns of the dataset. + * **null_replication_metrics** - Options for calculating null replication metrics + + * is_enabled - (Boolean) Enables or disables calculation of null replication metrics + * **unstructured_options** - Options responsible for all unstructured data + + * **text** - Options for the text profile + + * is_case_sensitive - (Boolean) Specify whether the profile is case sensitive + * stop_words - (List of Strings) List of stop words to be removed when profiling + * top_k_chars - (Int) Number of top characters to be retrieved when profiling + * top_k_words - (Int) Number of top words to be retrieved when profiling + * vocab - Options for vocab count + + * is_enabled - (Boolean) Enables or disables the vocab stats + * words - Options for word count + + * is_enabled - (Boolean) Enables or disables the word stats + * **data_labeler** - Options for the data labeler + + * is_enabled - (Boolean) Enables or disables the data labeler operations + * data_labeler_dirpath - (String) Directory path to data labeler + * data_labeler_object - (BaseDataLabeler) Datalabeler to replace + the default labeler + * max_sample_size - (Int) The max number of samples for the data + labeler + + + +Statistical Dependency on Order of Updates +========================================== 
+ +Some profile features/statistics are dependent on the order in which the profiler +is updated with new data. + +Order Profile +~~~~~~~~~~~~~ + +The order profiler utilizes the last value in the previous data batch to ensure +the subsequent dataset is above/below/equal to that value when predicting +non-random order. + +For instance, a dataset to be predicted as ascending would require the following +batch data update to be ascending and its first value `>=` than that of the +previous batch of data. + +Ex. of ascending: + +.. code-block:: python + + batch_1 = [0, 1, 2] + batch_2 = [3, 4, 5] + +Ex. of random: + +.. code-block:: python + + batch_1 = [0, 1, 2] + batch_2 = [1, 2, 3] # notice how the first value is less than the last value in the previous batch + + +Reporting Structure +=================== + +For every profile, we can provide a report and customize it with a couple optional parameters: + +* output_format (string) + + * This will allow the user to decide the output format for report. + + * Options are one of [pretty, compact, serializable, flat]: + + * Pretty: floats are rounded to four decimal places, and lists are shortened. + * Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc. + * Serializable: Output is json serializable and not prettified + * Flat: Nested output is returned as a flattened dictionary +* num_quantile_groups (int) + + * You can sample your data as you like! With a minimum of one and a maximum of 1000, you can decide the number of quantile groups! + +.. 
code-block:: python + + report = profile.report(report_options={"output_format": "pretty"}) + report = profile.report(report_options={"output_format": "compact"}) + report = profile.report(report_options={"output_format": "serializable"}) + report = profile.report(report_options={"output_format": "flat"}) diff --git a/_docs/docs/source/profiler_example.nblink b/_docs/docs/source/profiler_example.nblink new file mode 100644 index 000000000..142ebd97f --- /dev/null +++ b/_docs/docs/source/profiler_example.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/structured_profilers.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/regex_labeler_from_scratch.nblink b/_docs/docs/source/regex_labeler_from_scratch.nblink new file mode 100644 index 000000000..3d98c5f1e --- /dev/null +++ b/_docs/docs/source/regex_labeler_from_scratch.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/regex_labeler_from_scratch/DataLabeler_from_scratch.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/roadmap.rst b/_docs/docs/source/roadmap.rst new file mode 100644 index 000000000..93886690b --- /dev/null +++ b/_docs/docs/source/roadmap.rst @@ -0,0 +1,58 @@ +.. _roadmap: + +Roadmap +******* + +For more detailed tasks, check out the repo's github issues page here: +`Github Issues `_. + + +Data Reader Updates +=================== +- Read data from S3 bucket + - All in the current `dp.Data()` API paradigm, we want to enable passing an S3 bucket file path to read in data from AWS s3. +- Pass list of data file paths to data reader +- Pass in list of data frames to data reader + +New Model +========= +- Transformer model for sensitive data detection + +Historical Profiles +=================== +- Some questions about Historical Profiles / need to step back and rethink design to start: + - Meta profile on top? + - Stored windowed info inside? Etc... 
+- Branch with current state of Historical Profiles +- Two example notebooks of current state: + - Notebook example `one `_. + - Notebook example `two `_. + + +Conditional Report Metric +========================= +- Based on what is populated on other metrics in the report, have "secondary" / "derivatives" of that number (or that number in conjunction with another number) populate in this report as well. +- For example, if null_count is not None, then populate a null_percent key with a value of the quotient of (null_count / sample_count). + +Space / Time Testing +==================== +- Automatic comparison testing for space and time analysis on PRs + - Standardize a report for space time analysis for future comparisons (create baseline numbers) + - Include those in integration tests that will automatically run on code when it is changed in PRs +- Could be an optional test, if the user thinks there is concern around the change driving an issue in the library performance + +Testing Suite Upgrades +====================== +- Add mocking to unit tests where mocking is not utilized +- Integration testing separated out from the unit testing suite. 
Determine how to only run remotely during PRs +- Backward compatibility testing along with informative warnings and errors when a user is utilizing incompatible versions of the library and saved profile object + +Historical Versions +=================== +- Legacy version upgrades to enable patches to prior versions of the Data Profiler + +Miscellaneous +============== +- Refactor Pandas to Polars DataFrames +- Spearman correlation calculation +- Workflow Profiles diff --git a/_docs/docs/source/unstructured_profiler_example.nblink b/_docs/docs/source/unstructured_profiler_example.nblink new file mode 100644 index 000000000..5b6829754 --- /dev/null +++ b/_docs/docs/source/unstructured_profiler_example.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/unstructured_profilers.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/update_documentation.py b/_docs/docs/update_documentation.py new file mode 100644 index 000000000..7be79612d --- /dev/null +++ b/_docs/docs/update_documentation.py @@ -0,0 +1,87 @@ +#!/usr/bin/python +"""Script which auto updates the github pages documentation.""" +import os +import subprocess +import sys + +sys.path.insert(0, os.path.abspath(f'../../')) + +from dataprofiler import __version__ as version # noqa F401 + +# Make the rst files from the current repo +subprocess.run( + [ + "sphinx-apidoc", + "--templatedir=./source/_templates/", + "-f", + "-e", + "-o", + "../docs/source", + f"../../dataprofiler", + f"../../dataprofiler/tests/", + ] +) + +update_index_rst = True + +if not version: + Exception("There must be a valid version argument.") + +# Check if the source index file has already been updated +source_index = open("source/index.rst", "r+") +source_index_lines = source_index.readlines() +source_index.close() +for sentence in source_index_lines: + if sentence.startswith("* `" + version): + update_index_rst = False + +# Update the index file if needed +version_reference = "" +if update_index_rst: + buffer = 0 + source_index 
= open("source/index.rst", "w") + for sentence in source_index_lines: + if sentence.startswith("Documentation for"): + doc_version = "Documentation for " + version + "\n" + source_index.write(doc_version) + elif sentence.startswith("Versions"): + source_index.write("Versions\n") + source_index.write("========\n") + version_tag = "* `" + version + "`_\n" + source_index.write(version_tag) + version_reference = ( + ".. _" + version + ": ../../" + version + "/html/index.html\n\n" + ) + buffer = 1 + else: + if buffer == 0: + source_index.write(sentence) + else: + buffer = buffer - 1 + source_index.write(version_reference) +source_index.close() + +# Make the html files + +build_directory = "BUILDDIR= LATEST" +subprocess.run(["make", "html", build_directory]) + +# update the index file to redirect to the most current version of documentation +index_file = open("../index.html", "w") +redirect_link = ( + '' +) +index_file.write(redirect_link) +index_file.close() + +# update the profiler_options.html file to redirect to detailed options docs +index_file = open("../profiler_options.html", "w") +redirect_link = ( + '' +) +index_file.write(redirect_link) +index_file.close() diff --git a/_docs/index.html b/_docs/index.html new file mode 100644 index 000000000..fb51eaca9 --- /dev/null +++ b/_docs/index.html @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/_docs/profiler_options.html b/_docs/profiler_options.html new file mode 100644 index 000000000..831f653ff --- /dev/null +++ b/_docs/profiler_options.html @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/_docs/setup.cfg b/_docs/setup.cfg new file mode 100644 index 000000000..c9c21e52f --- /dev/null +++ b/_docs/setup.cfg @@ -0,0 +1,7 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203 + +[isort] +multi_line_output=3 +profile=black diff --git a/dataprofiler/__init__.py b/dataprofiler/__init__.py index f23cf6494..0c7919242 100644 --- a/dataprofiler/__init__.py +++ b/dataprofiler/__init__.py @@ -1,6 +1,7 @@ 
"""Package for dataprofiler.""" from . import settings +from ._version import get_versions from .data_readers.data import Data from .dp_logging import get_logger, set_verbosity from .labelers.data_labelers import ( @@ -19,8 +20,8 @@ from .profilers.profiler_options import ProfilerOptions from .reports import graphs from .validators.base_validators import Validator -from .version import __version__ +__version__ = get_versions()["version"] def set_seed(seed=None): # also check it's an integer diff --git a/dataprofiler/_version.py b/dataprofiler/_version.py new file mode 100644 index 000000000..669959883 --- /dev/null +++ b/dataprofiler/_version.py @@ -0,0 +1,524 @@ +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "" + cfg.parentdir_prefix = "dataprofiler-" + cfg.versionfile_source = "dataprofiler/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} # type: ignore +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print(f"unable to find command, tried {commands}") + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, 
p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs) + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r'\d', r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/dataprofiler/tests/profilers/test_histogram_utils.py b/dataprofiler/tests/profilers/test_histogram_utils.py index 3be8cdcae..10c88b344 100644 --- a/dataprofiler/tests/profilers/test_histogram_utils.py +++ b/dataprofiler/tests/profilers/test_histogram_utils.py @@ -32,7 +32,7 @@ def mock_sqrt_return_nan(profile): return float("nan") -class TestColumn(NumericStatsMixin): +class MockColumn(NumericStatsMixin): def __init__(self): NumericStatsMixin.__init__(self) self.times = defaultdict(float) @@ -75,7 +75,7 @@ def test_ptp(self): def test_calc_doane_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() with mock.patch( "dataprofiler.profilers.NumericStatsMixin.stddev", new_callable=mock_stddev @@ -177,7 +177,7 @@ def test_calc_doane_bin_width_from_profile(self): def test_calc_rice_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() # Case 1: min, max, and match_count are set expected_dataset_size = profile.match_count @@ -230,7 +230,7 @@ def test_calc_rice_bin_width_from_profile(self): def test_calc_sturges_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() # Case 1: min, max, and match_count are set expected_dataset_size = profile.match_count @@ -283,7 +283,7 @@ 
def test_calc_sturges_bin_width_from_profile(self): def test_calc_sqrt_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() # Case 1: min, max, and match_count are set expected_dataset_size = profile.match_count @@ -336,7 +336,7 @@ def test_calc_sqrt_bin_width_from_profile(self): def test_calc_fd_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() with mock.patch( "dataprofiler.profilers.NumericStatsMixin._get_percentile", @@ -359,7 +359,7 @@ def test_calc_fd_bin_width_from_profile(self): def test_calc_auto_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() with mock.patch( "dataprofiler.profilers.histogram_utils._calc_fd_bin_width_from_profile" @@ -396,7 +396,7 @@ def test_calc_auto_bin_width_from_profile(self): def test_calc_scott_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() with mock.patch( "dataprofiler.profilers.NumericStatsMixin.stddev", new_callable=mock_stddev @@ -418,7 +418,7 @@ def test_calc_scott_bin_width_from_profile(self): def test_calculate_bins_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() # Case 1: bin method not in set of valid bin methods with self.assertRaises(ValueError): @@ -457,7 +457,7 @@ def test_calculate_bins_from_profile(self): dataprofiler.profilers.histogram_utils._hist_bin_width_selectors_for_profile, {"sqrt": mock_sqrt_return_none}, ): - profile = TestColumn() + profile = MockColumn() actual = histogram_utils._calculate_bins_from_profile(profile, "sqrt") self.assertEqual(1, actual) @@ -466,6 +466,6 @@ def test_calculate_bins_from_profile(self): dataprofiler.profilers.histogram_utils._hist_bin_width_selectors_for_profile, {"sqrt": mock_sqrt_return_nan}, ): - profile = TestColumn() + profile = MockColumn() actual = 
histogram_utils._calculate_bins_from_profile(profile, "sqrt") self.assertEqual(1, actual) diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index e112781ab..cac04bfc9 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -18,7 +18,7 @@ test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -class TestColumn(NumericStatsMixin): +class MockColumn(NumericStatsMixin): def __init__(self): NumericStatsMixin.__init__(self) self.match_count = 0 @@ -31,7 +31,7 @@ def _filter_properties_w_options(self, calculations, options): pass -class TestColumnWProps(TestColumn): +class MockColumnWProps(MockColumn): # overrides the property func median = None mode = None @@ -117,9 +117,9 @@ def test_check_int(self): def test_hist_loss_on_merge(self): # Initial setup of profiles - profile3 = TestColumn() - profile1 = TestColumn() - profile2 = TestColumn() + profile3 = MockColumn() + profile1 = MockColumn() + profile2 = MockColumn() mock_histogram1 = { "bin_counts": np.array([1, 1, 1, 1]), "bin_edges": np.array([2, 4, 6, 8, 10]), @@ -161,7 +161,7 @@ def test_update_variance(self): Checks update variance :return: """ - num_profiler = TestColumn() + num_profiler = MockColumn() # test update variance data1 = [-3.0, 2.0, 11.0] @@ -209,7 +209,7 @@ def test_update_variance_with_varying_data_length(self): data1 = [] mean1, var1, count1 = 0, np.nan, 0 - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler._biased_variance = num_profiler._update_variance( mean1, var1, count1 ) @@ -221,7 +221,7 @@ def test_update_variance_with_varying_data_length(self): data2 = [5.0] mean2, var2, count2 = 5.0, 0, 1 - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler._biased_variance = num_profiler._update_variance( mean2, var2, count2 ) @@ -239,7 +239,7 @@ def 
test_update_variance_with_varying_data_length(self): + (-11.0 - mean3) ** 2 ) / 3 - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler._biased_variance = num_profiler._update_variance( mean3, var3 * 3 / 4, count3 ) @@ -252,7 +252,7 @@ def test_update_variance_with_empty_data(self): Checks update variance :return: """ - num_profiler = TestColumn() + num_profiler = MockColumn() data1 = [-3.0, 2.0, 11.0] mean1 = (-3.0 + 2.0 + 11.0) / 3 @@ -284,7 +284,7 @@ def test_timeit_merge(self): Checks profiles have been merged and timed :return: """ - num_profiler, other1, other2 = TestColumn(), TestColumn(), TestColumn() + num_profiler, other1, other2 = MockColumn(), MockColumn(), MockColumn() mock_histogram = { "bin_counts": np.array([1, 1, 1, 1]), "bin_edges": np.array([2.0, 5.25, 8.5, 11.75, 15.0]), @@ -331,7 +331,7 @@ def test_timeit(self): Checks stat properties have been timed :return: """ - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data to make min call prev_dependent_properties = { @@ -402,8 +402,8 @@ def test_from_dict_helper(self): fake_profile_name = "Fake profile name" # Build expected CategoricalColumn - actual_profile = TestColumn() - expected_profile = TestColumn() + actual_profile = MockColumn() + expected_profile = MockColumn() mock_saved_profile = dict( { "quantiles": None, @@ -429,7 +429,7 @@ def test_from_dict_helper(self): test_utils.assert_profiles_equal(expected_profile, actual_profile) def test_histogram_bin_error(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data for calculating bin error num_profiler._stored_histogram = { @@ -475,7 +475,7 @@ def test_histogram_bin_error(self): assert sum_error == np.inf def test_get_best_histogram_profile(self): - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler._histogram_for_profile = mock.MagicMock( side_effect=[("hist_1", 3), ("hist_2", 2), ("hist_3", 1)] @@ -509,7 +509,7 @@ def test_get_best_histogram_profile(self): 
assert best_histogram == "hist_3" def test_get_best_histogram_profile_infinite_loss(self): - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler._histogram_for_profile = mock.MagicMock(return_value=("hist_1", 3)) @@ -529,7 +529,7 @@ def test_get_best_histogram_profile_infinite_loss(self): assert best_histogram == "hist_1" def test_get_percentile_median(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data for calculating bin error num_profiler._stored_histogram = { "histogram": { @@ -541,7 +541,7 @@ def test_get_percentile_median(self): self.assertListEqual([10, 10], median) def test_num_zeros(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data to make num_zeros call prev_dependent_properties = {"mean": 0} @@ -568,7 +568,7 @@ def test_num_zeros(self): self.assertEqual(subset_properties["num_zeros"], 4) def test_num_negatives(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data to make num_negatives call prev_dependent_properties = {"mean": 0} @@ -595,7 +595,7 @@ def test_num_negatives(self): self.assertEqual(subset_properties["num_negatives"], 4) def test_fold_histogram(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # the break point is at the mid point of a bin bin_counts = np.array([1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 6]) @@ -670,7 +670,7 @@ def test_timeit_num_zeros_and_negatives(self): Checks num_zeros and num_negatives have been timed :return: """ - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data to make min call prev_dependent_properties = {"mean": 0} @@ -702,14 +702,14 @@ def test_merge_num_zeros_and_negatives(self): Checks num_zeros and num_negatives can be merged :return: """ - num_profiler, other1, other2 = TestColumn(), TestColumn(), TestColumn() + num_profiler, other1, other2 = MockColumn(), MockColumn(), MockColumn() other1.num_zeros, other1.num_negatives = 3, 1 other2.num_zeros, other2.num_negatives = 7, 1 
num_profiler._add_helper(other1, other2) self.assertEqual(num_profiler.num_zeros, 10) self.assertEqual(num_profiler.num_negatives, 2) - num_profiler, other1, other2 = TestColumn(), TestColumn(), TestColumn() + num_profiler, other1, other2 = MockColumn(), MockColumn(), MockColumn() other1.num_zeros, other1.num_negatives = 0, 0 other2.num_zeros, other2.num_negatives = 0, 0 num_profiler._add_helper(other1, other2) @@ -717,7 +717,7 @@ def test_merge_num_zeros_and_negatives(self): self.assertEqual(num_profiler.num_negatives, 0) def test_profile(self): - num_profiler = TestColumn() + num_profiler = MockColumn() mock_profile = dict( min=1.0, @@ -815,7 +815,7 @@ def test_report(self): self.assertIn(disabled_key, report_keys) def test_report_no_numerical_options(self): - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler.match_count = 0 num_profiler.times = defaultdict(float) @@ -833,7 +833,7 @@ def test_diff(self): Checks _diff_helper() works appropriately. """ - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = 1 @@ -881,7 +881,7 @@ def test_diff(self): self.assertDictEqual(expected_diff, difference) # Invalid statistics - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = np.nan # NaN variance @@ -931,7 +931,7 @@ def test_diff(self): self.assertTrue(np.isnan([expected_var, var, expected_stddev, stddev]).all()) # Insufficient match count - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = 1 @@ -980,7 +980,7 @@ def test_diff(self): self.assertTrue(np.isnan([expected_var, var, expected_stddev, stddev]).all()) # Constant values - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = 
MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = 0 # constant value has 0 variance @@ -1028,7 +1028,7 @@ def test_diff(self): self.assertDictEqual(expected_diff, difference) # Small p-value - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = 1 @@ -1075,11 +1075,11 @@ def test_diff(self): other1.diff("Inproper input") self.assertEqual( str(exc.exception), - "Unsupported operand type(s) for diff: 'TestColumnWProps' and" " 'str'", + "Unsupported operand type(s) for diff: 'MockColumnWProps' and" " 'str'", ) # PSI same distribution test - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.match_count = 55 other1._stored_histogram = { "total_loss": 0, @@ -1112,7 +1112,7 @@ def test_diff(self): self.assertEqual(expected_psi_value, psi_value) # PSI min_min_edge == max_max_edge - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.match_count = 10 other1._stored_histogram = { "total_loss": 0, @@ -1139,7 +1139,7 @@ def test_diff(self): self.assertEqual(expected_psi_value, psi_value) # PSI regen other / not self - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.match_count = 55 other1._stored_histogram = { "total_loss": 0, diff --git a/dataprofiler/tests/test_data_profiler.py b/dataprofiler/tests/test_data_profiler.py index 54a5f2d82..2c8e9583a 100644 --- a/dataprofiler/tests/test_data_profiler.py +++ b/dataprofiler/tests/test_data_profiler.py @@ -31,8 +31,6 @@ def setUpClass(cls): def test_set_seed(self): import dataprofiler as dp - self.assertEqual(dp.settings._seed, None) - dp.set_seed(5) self.assertEqual(dp.settings._seed, 5) diff --git a/dataprofiler/tests/test_dp_logging.py 
b/dataprofiler/tests/test_dp_logging.py index 7f78903ee..99496e314 100644 --- a/dataprofiler/tests/test_dp_logging.py +++ b/dataprofiler/tests/test_dp_logging.py @@ -22,12 +22,6 @@ def tearDownClass(cls): root_logger.removeHandler(dp_logging.get_logger()) dp_logging._dp_logger = None - def test_default_verbosity(self, mock_stdout): - # Ensure that default effective level is INFO - self.assertEqual( - logging.INFO, logging.getLogger("DataProfiler").getEffectiveLevel() - ) - def test_set_verbosity(self, mock_stdout): from dataprofiler import dp_logging diff --git a/dataprofiler/version.py b/dataprofiler/version.py deleted file mode 100644 index b4e4c2b9c..000000000 --- a/dataprofiler/version.py +++ /dev/null @@ -1,13 +0,0 @@ -"""File contains the version number for the package.""" - -MAJOR = 0 -MINOR = 13 -MICRO = 1 -POST = None # otherwise None - -VERSION = "%d.%d.%d" % (MAJOR, MINOR, MICRO) - -_post_str = "" -if POST: - _post_str = f".post{POST}" -__version__ = VERSION + _post_str diff --git a/requirements-docs.txt b/requirements-docs.txt new file mode 100644 index 000000000..36517b45f --- /dev/null +++ b/requirements-docs.txt @@ -0,0 +1,7 @@ +Sphinx>=5.0.0 +sphinx-rtd-theme +nbsphinx +furo +nbsphinx-link +pre-commit +tornado diff --git a/requirements.txt b/requirements.txt index 3ccc4c6f5..e32f32851 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,11 +11,12 @@ python-snappy>=0.7.1 charset-normalizer>=1.3.6 psutil>=4.0.0 scipy>=1.10.0 -requests==2.32.* +requests>=2.32.4 networkx>=2.5.1 typing-extensions>=3.10.0.2 HLL>=2.0.3 datasketches>=4.1.0 packaging>=23.0 -boto3>=1.28.61 -# adding comment to trigger mend check +boto3>=1.37.15 +urllib3>=2.5.0 +versioneer diff --git a/setup.cfg b/setup.cfg index 6c2be03be..5930fdfa3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,10 +1,19 @@ +[versioneer] +vcs = git +style = pep440 +versionfile_source = dataprofiler/_version.py +versionfile_build = dataprofiler/_version.py +tag_prefix = "" +parentdir_prefix = 
dataprofiler- [flake8] max-line-length = 88 extend-ignore = E203 +exclude = versioneer.py, dataprofiler/_version.py [isort] + multi_line_output=3 -skip=dataprofiler/tests/data/,venv/ +skip=dataprofiler/tests/data/,venv/, versioneer.py, dataprofiler/_version.py profile=black include_trailing_comma=True force_grid_wrap=0 diff --git a/setup.py b/setup.py index eeca6629b..f1f799446 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ from setuptools import find_packages, setup # Load package version -from dataprofiler.version import __version__ +import versioneer here = path.abspath(path.dirname(__file__)) @@ -53,8 +53,9 @@ setup( name="DataProfiler", - version=__version__, - python_requires=">=3.9", + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), + python_requires=">=3.10", description=DESCRIPTION, long_description=LONG_DESCRIPTION, long_description_content_type="text/markdown", diff --git a/tox.ini b/tox.ini index 4ee6081bd..caf70d437 100644 --- a/tox.ini +++ b/tox.ini @@ -16,7 +16,7 @@ deps = -rrequirements-reports.txt -rrequirements-test.txt commands = - python3 -m pytest dataprofiler/tests/ --cov=dataprofiler --cov-fail-under=80 --cov-report=xml:coverage.xml --forked + python3 -m pytest dataprofiler/tests/ --cov=dataprofiler --cov-fail-under=80 --cov-report=xml:coverage.xml # add "docs" to `envlist` to run the docs build #[testenv:docs] diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 000000000..fcbc15bd1 --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1741 @@ +# Version: 0.19 + +"""The Versioneer - like a rocketeer, but for versions. +The Versioneer +============== +* like a rocketeer, but for versions! 
+* https://github.com/python-versioneer/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible with: Python 3.6, 3.7, 3.8, 3.9 and pypy3 +* [![Latest Version][pypi-image]][pypi-url] +* [![Build Status][travis-image]][travis-url] +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. +## Quick Install +* `pip install versioneer` to somewhere in your $PATH +* add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md)) +* run `versioneer install` in your source tree, commit the results +* Verify version information with `python setup.py version` +## Version Identifiers +Source trees come from a variety of places: +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. 
For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes). +The version identifier is used for multiple purposes: +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball +## Theory of Operation +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. +## Installation +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. +## Version-String Flavors +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. 
From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. +Both functions return a dictionary with different flavors of version +information: +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. 
+The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions +## Styles +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. +## Debugging +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). +## Known Limitations +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/python-versioneer/python-versioneer/issues). +### Subprojects +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). 
The are +two common reasons why `setup.py` might not be in the root: +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other languages) in subdirectories. +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). +`pip install --editable .` should work correctly. `setup.py install` might +work too. +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. +[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. +### Editable installs with setuptools <= 18.5 +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. 
+"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. +[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. +## Updating Versioneer +To upgrade your project to a new release of Versioneer, do the following: +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files +## Future Directions +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . 
Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. +## Similar projects +* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time + dependency +* [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of + versioneer +## License +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . +[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg +[pypi-url]: https://pypi.python.org/pypi/versioneer/ +[travis-image]: +https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg +[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer +""" + +import configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ( + "Versioneer was unable to run the project root directory. 
" + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." + ) + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. + me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) + ) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . 
+ setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.ConfigParser() + with open(setup_cfg) as f: + parser.read_file(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print(f"unable to find command, tried {commands}") + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode 
!= 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY[ + "git" +] = r''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. +# This file is released into the public domain. Generated by +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) +"""Git implementation of _version.py.""" +import errno +import os +import re +import subprocess +import sys +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" +LONG_VERSION_PY = {} +HANDLERS = {} +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" 
%% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + # now we have TAG-NUM-gHEX or HEX + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + # commit: short hex revision ID + pieces["short"] = mo.group(3) + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + return pieces +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%%d" %% pieces["distance"] + return rendered +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + The ".dev0" means dirty. + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + Like 'git describe --tags --dirty --always'. + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + if not style or style == "default": + style = "pep440" # the default + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + cfg = get_config() + verbose = cfg.verbose + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs) + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. 
+ date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r"\d", r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. 
"2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix) :] + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved 
later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( + full_tag, + tag_prefix, + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except OSError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.19) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. 
+import json +version_json = ''' +%s +''' # END VERSION_JSON +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except OSError: + raise NotThisMethod("unable to read _version.py") + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) + if not mo: + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set {} to '{}'".format(filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + The ".dev0" means dirty. + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + Like 'git describe --tags --dirty --always'. + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + Returns dict with two keys: 'version' and 'full'. 
+ """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. + + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print(f"got version from file {versionfile_abs} {ver}") + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + print("unable 
+ to compute version") + + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(cmdclass=None): + """Get the custom setuptools/distutils subclasses used by Versioneer. + If the package uses a different cmdclass (e.g. one from numpy), it + should be provided as an argument. + """ + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to its pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. 
+ # Also see https://github.com/python-versioneer/python-versioneer/issues/52 + + cmds = {} if cmdclass is None else cmdclass.copy() + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? + # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? 
+ + # we override different "build_py" commands for both environments + if "build_py" in cmds: + _build_py = cmds["build_py"] + elif "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + cmds["build_py"] = cmd_build_py + + if "setuptools" in sys.modules: + from setuptools.command.build_ext import build_ext as _build_ext + else: + from distutils.command.build_ext import build_ext as _build_ext + + class cmd_build_ext(_build_ext): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_ext.run(self) + if self.inplace: + # build_ext --inplace will only build extensions in + # build/lib<..> dir with no _version.py to write to. + # As in place builds will already have a _version.py + # in the module dir, we do not need to write one. + return + # now locate _version.py in the new build/ directory and replace + # it with an updated value + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + cmds["build_ext"] = cmd_build_ext + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. 
+ # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if "py2exe" in sys.modules: # py2exe enabled? + from py2exe.distutils_buildexe import py2exe as _py2exe + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "sdist" in cmds: + _sdist = cmds["sdist"] + elif "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command 
will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- +You will also need to edit your setup.py to use the results: + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
+[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Do main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy) as f: + old = f.read() + except OSError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. 
+ manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in) as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except OSError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. + do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). 
Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1)