diff --git a/.adr-dir b/.adr-dir new file mode 100644 index 00000000..548f973c --- /dev/null +++ b/.adr-dir @@ -0,0 +1 @@ +docs/contributor-guide/adr/ diff --git a/.flake8 b/.flake8 index 60415bed..9e31c973 100644 --- a/.flake8 +++ b/.flake8 @@ -2,12 +2,15 @@ # https://ljvmiranda921.github.io/notebook/2018/06/21/precommits-using-black-and-flake8/ [flake8] -ignore = E203, E266, E501, W503, F403, F401 +ignore = E203, E266, E501, W503, F403 max-line-length = 88 max-complexity = 18 select = B,C,E,F,I,W,T4,B9 -exclude = build,.venv,*.egg-info +exclude = build,venv,.venv,*.egg-info +per-file-ignores = + tamr_client/__init__.py:E402,F401,I100,I202 + tamr_client/*/__init__.py:F401 # flake8-import-order plugin import-order-style = google -application-import-names = tamr_unify_client +application-import-names = tamr_client, tamr_unify_client, tests diff --git a/.github/ISSUE_TEMPLATE/BUG_REPORT.md b/.github/ISSUE_TEMPLATE/BUG_REPORT.md index 139d795c..18fe189f 100644 --- a/.github/ISSUE_TEMPLATE/BUG_REPORT.md +++ b/.github/ISSUE_TEMPLATE/BUG_REPORT.md @@ -44,6 +44,6 @@ Search open/closed issues before submitting since someone might have asked the s | Software | Version(s) | | ----------------- | ---------- | | tamr-unify-client | -| Tamr Unify server | +| Tamr server | | Python | | Operating System | diff --git a/.github/ISSUE_TEMPLATE/QUESTION.md b/.github/ISSUE_TEMPLATE/QUESTION.md index 026166d4..c222c533 100644 --- a/.github/ISSUE_TEMPLATE/QUESTION.md +++ b/.github/ISSUE_TEMPLATE/QUESTION.md @@ -30,6 +30,6 @@ Search open/closed issues before submitting since someone might have asked the s | Software | Version(s) | | ----------------- | ---------- | | tamr-unify-client | -| Tamr Unify server | +| Tamr server | | Python | | Operating System | diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 459ee8f9..e62bc81d 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,9 +1,3 @@ - - # ↪️ Pull Request + ## 
✔️ PR Todo -- [ ] Added/updated testing for this change -- [ ] Included links to related issues/PRs -- [ ] Update relevant [docs](https://github.com/Datatamer/unify-client-python/tree/master/docs) + docstrings -- [ ] Update the [CHANGELOG](https://github.com/Datatamer/unify-client-python/blob/master/CHANGELOG.md) under the current `-dev` version: - - Add changelog entries under any that apply: **BREAKING CHANGES**, **NEW FEATURES**, **BUG FIXES**. - - Changelog entry format: `[#]() ` +- [ ] Testing for this change +- [ ] Links to related issues/PRs +- [ ] Update relevant [docs](https://github.com/Datatamer/tamr-client/tree/master/docs) + docstrings diff --git a/.github/PULL_REQUEST_TEMPLATE/BUG_FIX.md b/.github/PULL_REQUEST_TEMPLATE/BUG_FIX.md deleted file mode 100644 index 2aa99d02..00000000 --- a/.github/PULL_REQUEST_TEMPLATE/BUG_FIX.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -name: 🐛 Bug Fix -about: Did you fix something that did not work as expected? ---- - - - -# 🐛 bug fix - - - -## 🤔 Tell Us What Goes Wrong - - - -## 💁 Your Solution - - - -## 🚨 Test instructions - - - -## 🌍 Your Environment - - - -| Software | Version(s) | -| ----------------- | ---------- | -| Python | -| tamr-unify-client | -| Operating System | - -## ✔️ PR Todo - -- [ ] Added/updated unit tests for this change -- [ ] Filled out test instructions (In case there aren't any unit tests) -- [ ] Included links to related issues/PRs -- [ ] Update relevant docs + docstrings -- [ ] Add a "Bug Fixes" entry for the current development version in the changelog diff --git a/.github/PULL_REQUEST_TEMPLATE/NEW_FEATURE.md b/.github/PULL_REQUEST_TEMPLATE/NEW_FEATURE.md deleted file mode 100644 index b854a3a0..00000000 --- a/.github/PULL_REQUEST_TEMPLATE/NEW_FEATURE.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -name: 🙋 New Feature -about: Do you want to add something to tamr-unify-client? 
---- - - - -# ✨ New Feature - - - -## 🔦 Context - - - - - -## 💻 Examples - - - -## 🚨 Test instructions - - - -## ✔️ PR Todo - -- [ ] Added/updated unit tests for this change -- [ ] Filled out test instructions (In case there aren't any unit tests) -- [ ] Included links to related issues/PRs -- [ ] Update relevant docs + docstrings -- [ ] Add an "NEW FEATURES" entry for the current development version in the changelog diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..591371e7 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,131 @@ +name: CI + +on: + pull_request: + branches: + - master + push: + branches: + - master + +jobs: + Lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install python + uses: actions/setup-python@v2 + with: + python-version: 3.6 + - name: Install poetry + run: pip install poetry==1.0.5 + - name: Install nox + run: pip install nox==2020.5.24 + - name: Run flake8 + run: nox -s lint + + Format: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install python + uses: actions/setup-python@v2 + with: + python-version: 3.6 + - name: Install poetry + run: pip install poetry==1.0.5 + - name: Install nox + run: pip install nox==2020.5.24 + - name: Run black + run: nox -s format + + Typecheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install python + uses: actions/setup-python@v2 + with: + python-version: 3.6 + - name: Install poetry + run: pip install poetry==1.0.5 + - name: Install nox + run: pip install nox==2020.5.24 + - name: Run mypy + run: nox -s typecheck + + Test: + strategy: + matrix: + python_version: [3.6, 3.7, 3.8] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python_version }} + - name: Install poetry + run: pip install poetry==1.0.5 + - name: Install nox + run: pip install 
nox==2020.5.24 + - name: Run pytest + run: nox -s test-${{ matrix.python_version }} + + Docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install python + uses: actions/setup-python@v2 + with: + python-version: 3.6 + - name: Install nox + run: pip install nox==2020.5.24 + - name: Run sphinx-build + run: nox -s docs + + Commitlint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - name: Install npm + uses: actions/setup-node@v2 + with: + node-version: "14" + - name: Install commitlint + run: npm install -g @commitlint/cli @commitlint/config-conventional + - name: Run commitlint + run: commitlint --from=origin/master + + Release: + if: github.event_name == 'push' && github.ref == 'refs/heads/master' + runs-on: ubuntu-latest + needs: [Lint, Format, Typecheck, Test, Docs, Commitlint] + steps: + - uses: actions/checkout@v2 + - name: Install Python + uses: actions/setup-python@v2 + with: + python-version: 3.6 + - name: Install poetry + uses: snok/install-poetry@v1.1.1 + with: + version: 1.1.4 + - name: Install toml-cli + run: | + pip install --upgrade pip + pip install toml-cli==0.1.3 + - name: Install npm + uses: actions/setup-node@v2 + with: + node-version: "14" + - name: Install semantic-release + run: npm install -g semantic-release@17 @semantic-release/exec@5 + - name: Run semantic-release + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_API_TOKEN }} + run: semantic-release diff --git a/.gitignore b/.gitignore index f7cb931e..b761bef2 100644 --- a/.gitignore +++ b/.gitignore @@ -86,9 +86,6 @@ target/ profile_default/ ipython_config.py -# pyenv -.python-version - # celery beat schedule file celerybeat-schedule diff --git a/.python-version b/.python-version new file mode 100644 index 00000000..58689652 --- /dev/null +++ b/.python-version @@ -0,0 +1,3 @@ +3.6.10 +3.7.7 +3.8.2 \ No newline at end of file diff --git a/.releaserc.yaml b/.releaserc.yaml new 
file mode 100644 index 00000000..a7ca97e8 --- /dev/null +++ b/.releaserc.yaml @@ -0,0 +1,16 @@ +repositoryUrl: https://github.com/Datatamer/tamr-client +branches: + - master +plugins: + - "@semantic-release/commit-analyzer" + - "@semantic-release/release-notes-generator" + - [ + "@semantic-release/exec", + { + # Set the project version according to semantic-release (depends on `toml-cli`) + prepareCmd: "toml set --toml-path pyproject.toml tool.poetry.version ${nextRelease.version}", + # Publish the project to PyPI (depends on `python` and `poetry`) + publishCmd: "poetry install --no-dev && poetry publish --build", + }, + ] + - "@semantic-release/github" diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9b34a743..00000000 --- a/.travis.yml +++ /dev/null @@ -1,22 +0,0 @@ -dist: xenial -language: python -python: -- 3.6 -before_install: -- pip install poetry -install: -- poetry install -script: -- poetry run black --check . -- poetry run flake8 . -- poetry run pytest tests -before_deploy: -- poetry build -- poetry config http-basic.pypi $PYPI_USERNAME $PYPI_PASSWORD -deploy: - provider: script - script: poetry publish - skip_cleanup: true - on: - tags: true - condition: "$TRAVIS_TAG =~ ^[0-9]+\\.[0-9]+\\.[0-9]+$" diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..254d37ee --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.linting.pylintEnabled": false, + "python.linting.flake8Enabled": true, + "python.linting.enabled": true, + "python.formatting.provider": "black", + "editor.formatOnSave": true +} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 88a97f63..00000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,92 +0,0 @@ -## 0.6.0-dev - -## 0.5.0 - **NEW FEATURES** - - [#94](https://github.com/Datatamer/unify-client-python/issues/94) Add access to Attributes of a Dataset - - [#103](https://github.com/Datatamer/unify-client-python/issues/103) 
Dataset `update_records` now returns the JSON response body for the underlying `POST datasets/{id}:updateRecords` call - - [#98](https://github.com/Datatamer/unify-client-python/issues/98) Add `__geo_interface__` to Dataset - - [#100](https://github.com/Datatamer/unify-client-python/issues/100) Add `from_geo_features` to Dataset - - [#116](https://github.com/Datatamer/unify-client-python/issues/116) Add support for associating a dataset with a project - - [#109](https://github.com/Datatamer/unify-client-python/issues/109) Add support for profiling datasets - - [#86](https://github.com/Datatamer/unify-client-python/issues/86) Add support for creating projects - - [#114](https://github.com/Datatamer/unify-client-python/issues/114) Add support for generating pairs estimate - - [#106](https://github.com/Datatamer/unify-client-python/issues/106) Add support for initializing a source dataset - - [#107](https://github.com/Datatamer/unify-client-python/issues/107) Add support for creating a dataset attribute - - **BUG FIXES** - - [#118](https://github.com/Datatamer/unify-client-python/issues/118) Fix JSON sent for Dataset.update_records - -## 0.4.0 - **BREAKING CHANGES** - - [#61](https://github.com/Datatamer/unify-client-python/issues/61) `data` field renamed to `_data` (private). - - [#78](https://github.com/Datatamer/unify-client-python/issues/78) Property accessors return `None` rather than raise `KeyError` - - **NEW FEATURES** - - Record Clusters API endpoint to finish working mastering workflow. 
- - [#78](https://github.com/Datatamer/unify-client-python/issues/78) Improved repr for objects through the library - - [#42](https://github.com/Datatamer/unify-client-python/issues/42) Optional `session` argument to `Client` to use a specific `requests.Session` instance - - **BUG FIXES** - - Mastering workflow example was missing the generate clusters step, which has been rectified using proper endpoint - - [#30](https://github.com/Datatamer/unify-client-python/issues/30) Better docs for how to call directly call APIs - - [#61](https://github.com/Datatamer/unify-client-python/issues/61) `data` field renamed to `_data` (private). - -## 0.3.0 -*released on 2019-3-1* - - **NEW FEATURES** - - Versioning example in FAQ - - Offline installation docs - - `by_external_id` methods for `Dataset` and `Project` - - `DatasetStatus` resource (subresource of `Dataset`) - - `Client.request` accepts absolute paths as relative to origin - - **BUG FIXES** - - `requests` version specified changed to `>=2.20.0` for Airflow compatibility - - `setup.py` reads `VERSION.txt` and `README.md` with explicit `utf-8` encodings - -## 0.2.0 -*released on 2019-1-17* - - **NEW FEATURES** - - [Docs via readthedocs](https://tamr-unify-python-client.readthedocs.io/en/stable/) - - [CI testing via TravisCI](https://travis-ci.org/Datatamer/unify-client-python) ([details](https://github.com/Datatamer/unify-client-python/commit/ae381ce29593a70ed992f88a3e3ef3eb170a5cd4)) - - Release process documented in [RELEASE.md](https://github.com/Datatamer/unify-client-python/blob/master/RELEASE.md) ([details](https://github.com/Datatamer/unify-client-python/commit/fe717bbddca96b82bc1e447a93ae5c8817481675)) - - README Badges - - Version, Python version, License, Codestyle ([details](https://github.com/Datatamer/unify-client-python/pull/1)) - - Docs ([details](https://github.com/Datatamer/unify-client-python/pull/14)) - - CI build/test ([details](https://github.com/Datatamer/unify-client-python/pull/19)) - - HTTP 
errors raised as exceptions. More helpful than always getting `JSONDecodeError`s. ([details](https://github.com/Datatamer/unify-client-python/pull/7)) - - Stream records from a dataset ([details](https://github.com/Datatamer/unify-client-python/pull/13)) - - Migrate all Python Client docs from docs.tamr.com to Sphinx docs ([details](https://github.com/Datatamer/unify-client-python/pull/21)) - - **BUG FIXES** - - PyPI metadata - - `-` not `_` in project name ([details](https://github.com/Datatamer/unify-client-python/commit/5e25c45ec9bff0d0f9f40f52e81aacecdccb3e1b)) - - correct github repo URL ([details](https://github.com/Datatamer/unify-client-python/commit/767cf537f247d20293aa3a81b7830534aa6f84ec)) - - "Apache 2.0" as license value ([details](https://github.com/Datatamer/unify-client-python/pull/2)) - - README now parsed/rendered as Markdown ([details](https://github.com/Datatamer/unify-client-python/pull/4)) - - Change Log for 0.1.0 release ([details](https://github.com/Datatamer/unify-client-python/commit/852d6f0fd11f8ea33d2ea49d60a406f4e7267143)) - - readthedocs compatibility ([details](https://github.com/Datatamer/unify-client-python/pull/12)) - -## 0.1.0 -*released on 2019-1-10* - - Initial public release - - **BREAKING CHANGES** - - Protobuf-related dependencies ([details](https://github.com/pcattori/unify-client-python/commit/5f25bcf41ba64fce67c2cfc1bba81d382bc70efe)) - - **NEW FEATURES** - - Repo Documentation ([details](https://github.com/pcattori/unify-client-python/commit/5f25bcf41ba64fce67c2cfc1bba81d382bc70efe)) - - [CHANGELOG.md](https://github.com/Datatamer/unify-client-python/blob/master/CHANGELOG.md) - - [CODE_OF_CONDUCT.md](https://github.com/Datatamer/unify-client-python/blob/master/CODE_OF_CONDUCT.md) - - [LICENSE](https://github.com/Datatamer/unify-client-python/blob/master/LICENSE) - - [README.md](https://github.com/Datatamer/unify-client-python/blob/master/README.md) - - Version in [VERSION.txt](VERSION.txt) 
([details](https://github.com/pcattori/unify-client-python/commit/41e93d4dba03bc7445f1935345bfd76cf45b877c)) - - **BUG FIXES** - - Reference documentation - - Autodoc should show inherited members ([details](https://github.com/pcattori/unify-client-python/commit/8356eb3d8ea995227e808a07d71de1bf3d7453c7)) - - Autodoc warning about `**` in `param` docstrings ([details](https://github.com/pcattori/unify-client-python/commit/2a204b294a41e4b9eea5cc383569f6303d3a5206)) - - Shortened Sphinx references with `~` ([details](https://github.com/pcattori/unify-client-python/commit/9827e98dd7dab4eaeaef5e60197e280649de3737)) \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index 06e3f5f1..00000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,76 +0,0 @@ -# Contributor Covenant Code of Conduct - -## Our Pledge - -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to making participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation. 
- -## Our Standards - -Examples of behavior that contributes to creating a positive environment -include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or - advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic - address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Our Responsibilities - -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. - -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -## Scope - -This Code of Conduct applies both within project spaces and in public spaces -when an individual is representing the project or its community. Examples of -representing a project or community include using an official project e-mail -address, posting via an official social media account, or acting as an appointed -representative at an online or offline event. Representation of a project may be -further defined and clarified by project maintainers. 
- -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at support@tamr.com . All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately. - -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq diff --git a/LICENSE b/LICENSE index a4f8764a..bc5bfdd8 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2019 Tamr + Copyright 2020 Tamr Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/README.md b/README.md index f8bc8ae9..5bbeb50d 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # Python Client -Programmatically 💻 interact with Tamr Unify using Python 🐍 +Programmatically 💻 interact with Tamr using Python 🐍 [![Version](https://img.shields.io/pypi/v/tamr-unify-client.svg?style=flat-square)](https://pypi.org/project/tamr-unify-client/) -[![Documentation Status](https://readthedocs.org/projects/tamr-unify-python-client/badge/?version=stable&style=flat-square)](https://tamr-unify-python-client.readthedocs.io/en/stable/?badge=stable) -[![Build Status](https://img.shields.io/travis/Datatamer/unify-client-python.svg?style=flat-square)](https://travis-ci.org/Datatamer/unify-client-python) +[![Documentation Status](https://readthedocs.org/projects/tamr-client/badge/?version=stable&style=flat-square)](https://tamr-client.readthedocs.io/en/stable/?badge=stable) +[![Build Status](https://img.shields.io/github/workflow/status/Datatamer/tamr-client/CI?&style=flat-square)](https://github.com/Datatamer/tamr-client/actions?query=workflow%3ACI) ![Supported Python Versions](https://img.shields.io/pypi/pyversions/tamr-unify-client.svg?style=flat-square) [![License](https://img.shields.io/pypi/l/tamr-unify-client.svg?style=flat-square)](LICENSE) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) @@ -11,11 +11,9 @@ Programmatically 💻 interact with Tamr Unify using Python 🐍 --- *Quick links:* -**[Docs](https://tamr-unify-python-client.readthedocs.io/en/stable/)** | -**[Contributing](https://tamr-unify-python-client.readthedocs.io/en/stable/contributor-guide.html)** | -**[Code of Conduct](https://github.com/Datatamer/unify-client-python/blob/master/CODE_OF_CONDUCT.md)** | -**[Change Log](https://github.com/Datatamer/unify-client-python/blob/master/CHANGELOG.md)** | -**[License](https://github.com/Datatamer/unify-client-python/blob/master/LICENSE)** 
+**[Docs](https://tamr-client.readthedocs.io/en/stable/)** | +**[Contributing](https://tamr-client.readthedocs.io/en/stable/contributor-guide.html)** | +**[License](https://github.com/Datatamer/tamr-client/blob/master/LICENSE)** --- @@ -32,16 +30,13 @@ pip install tamr-unify-client - Continuous Categorization - 🚀 Kick-off synchronous/asynchronous operations - Refresh datasets in your pipeline - - Train Tamr Unify's machine learning models + - Train Tamr's machine learning models - Generate predictions from trained models -- 🔒 Authenticate with Tamr Unify -- 📥 Fetch resources (e.g projects) by resource ID (e.g. `"1"`) -- 📝 Read resource metadata -- 🔁 Iterate over collections -- ⚠️ Advanced - - Logging for API requests/responses - - Call custom/arbitrary API endpoints +- 🔒 Authenticate with Tamr + +For more see the [official docs](https://tamr-client.readthedocs.io/en/stable/). ## Maintainers - [Pedro Cattori](https://github.com/pcattori) +- [Samuel Kalish](https://github.com/skalish) diff --git a/RELEASE.md b/RELEASE.md deleted file mode 100644 index 0046f821..00000000 --- a/RELEASE.md +++ /dev/null @@ -1,66 +0,0 @@ -# Release process for `tamr-unify-client` - -During the following steps, we'll consider releasing the `0.3.0` version as an example. - -For our example, that means `master` branch is currently on version `0.3.0-dev` (you can check the actual version in `pyproject.toml`). - -Be sure to substitute `0.3.0` appropriately with the actual version being released. - -NOTE: You should make sure the commit you plan to use for the release branch is passing CI tests. - -# 1. Version bump on `master` - -Create a PR with the following changes: -- `pyproject.toml`: bump the version to the next one, keeping the `-dev` suffix e.g. `0.3.0-dev` -> `0.4.0-dev`. -- `CHANGELOG.md`: - - Create a new section for the new development version e.g. Add `# 0.4.0-dev` to the top of the changelog (with an empty line between it and the next version header). 
- - Remove the `-dev` suffix from the version being released e.g. `# 0.3.0-dev` -> `# 0.3.0`. - -Ensure CI tests pass for your PR and merge your changes into `master`. - -# 2. Cut a release branch - -On the [Datatamer/unify-client-python](https://github.com/Datatamer/unify-client-python) Github repo, click on [Commits](https://github.com/Datatamer/unify-client-python/commits/master). Navigate to the commit just before the version bump commit from Step 1. Click the `<>` icon to browse the repo at that commit. - -Then, create a branch on Github within the [Datatamer/unify-client-python](https://github.com/Datatamer/unify-client-python) repo titled `release-` e.g. `release-0.3.0`. - -NOTE: This release branch should *not* contain the version bump changes from Step 1. - -# 3. Remove `-dev` suffix on release branch - -Create a PR *to the release branch* with the following changes: -- `pyproject.toml`: Remove `-dev` suffix from version e.g. `0.3.0-dev` -> `0.3.0`. -- `CHANGELOG.md`: Remove the `-dev` suffix from the version being released e.g. `# 0.3.0-dev` -> `# 0.3.0`. - -Ensure CI tests pass for your PR and merge your changes into the release branch e.g. `release-0.3.0`. - -# 4. Create a Github release - -On the [Datatamer/unify-client-python](https://github.com/Datatamer/unify-client-python) Github repo, click on [Releases](https://github.com/Datatamer/unify-client-python/releases). Click "Draft a new release". - -Title the release with the release version. Do not include anything else in the release title e.g. -- Correct: `0.3.0` -- Incorrect: `v0.3.0` -- Incorrect: `Release 0.3.0` - -Select the corresponding release branch in the `Target` branch dropdown. - -Copy/paste the `CHANGELOG.md` entries for this release into the description for the release (only the entries, not the header since the version number is already encoded as the title for this release). - -Create the release. 
This should also implicitly create a tag for the release under [Tags](https://github.com/Datatamer/unify-client-python/tags). - -# 5. Check on published artifacts - -We use Travis CI as our Continuous Integration (CI) solution. - -CI is wired to ["deploy"](https://github.com/Datatamer/unify-client-python/blob/master/.travis.yml#L14) (a.k.a. publish) releases to PyPI for any tags that look like a semantic version number e.g. `0.3.0`. So CI should handle publishing for you. - -Check that CI tests passed. -Check that CI successfully published the release version to [PyPI](https://pypi.org/project/tamr-unify-client/#history). - -On the `master` branch, add release date for this release in the `CHANGELOG.md`. ---- - -If everything went correctly `pip install -U tamr-unify-client` should install the new release of the Python Client. - -If testing/publishing failed on the release branch, make additional PRs to fix any issues and get CI tests to pass. Be sure to merge those fixes into `master` too! 
diff --git a/commitlint.config.js b/commitlint.config.js new file mode 100644 index 00000000..858aaa8c --- /dev/null +++ b/commitlint.config.js @@ -0,0 +1 @@ +module.exports = { extends: ["@commitlint/config-conventional"] } diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css new file mode 100644 index 00000000..621bc108 --- /dev/null +++ b/docs/_static/css/custom.css @@ -0,0 +1,24 @@ +@import url('https://fonts.googleapis.com/css2?family=Lexend+Deca&display=swap'); + +h1, +h2, +.rst-content .toctree-wrapper p.caption, +h3, +h4, +h5, +h6, +legend { + font-family: 'Lexend Deca', sans-serif; +} + +h1 { + font-size: 225%; +} + +.wy-side-nav-search { + background-color: #0859C6; +} + +.wy-side-nav-search input[type="text"] { + border-color: unset; +} \ No newline at end of file diff --git a/docs/_static/favicon.png b/docs/_static/favicon.png new file mode 100644 index 00000000..460b38d8 Binary files /dev/null and b/docs/_static/favicon.png differ diff --git a/docs/_static/tamr.png b/docs/_static/tamr.png index 6f972073..390233ef 100644 Binary files a/docs/_static/tamr.png and b/docs/_static/tamr.png differ diff --git a/docs/_templates/footer.html b/docs/_templates/footer.html new file mode 100644 index 00000000..527cb84b --- /dev/null +++ b/docs/_templates/footer.html @@ -0,0 +1,8 @@ +{% extends '!footer.html' %} +{% block extrafooter %} +
+ © 2020, Tamr, Inc. All rights reserved. Terms of Use | Privacy Policy +
+{% endblock %} \ No newline at end of file diff --git a/docs/beta.md b/docs/beta.md new file mode 100644 index 00000000..40d3c0cd --- /dev/null +++ b/docs/beta.md @@ -0,0 +1,27 @@ +# BETA + + **WARNING**: Do not rely on BETA features in production workflows. + Support from Tamr may be limited. + +## Tutorials + * [Get Tamr version](beta/tutorial/get_version) + * [Continuous Mastering](beta/tutorial/continuous_mastering) + +## Reference + + * [Attribute](beta/attribute) + * [Auth](beta/auth) + * [Backup](beta/backup) + * [Categorization](beta/categorization) + * [Dataset](beta/dataset) + * [Golden Records](beta/golden_records) + * [Instance](beta/instance) + * [Mastering](beta/mastering) + * [Operation](beta/operation) + * [Primary Key](beta/primary_key) + * [Project](beta/project) + * [Restore](beta/restore) + * [Schema Mapping](beta/schema_mapping) + * [Transformations](beta/transformations) + * [Response](beta/response) + * [Session](beta/session) diff --git a/docs/beta/attribute.md b/docs/beta/attribute.md new file mode 100644 index 00000000..169722f0 --- /dev/null +++ b/docs/beta/attribute.md @@ -0,0 +1,5 @@ +# Attribute + + * [Attribute](/beta/attribute/attribute) + * [Attribute Type](/beta/attribute/type) + * [SubAttribute](/beta/attribute/sub) diff --git a/docs/beta/attribute/attribute.rst b/docs/beta/attribute/attribute.rst new file mode 100644 index 00000000..9500568b --- /dev/null +++ b/docs/beta/attribute/attribute.rst @@ -0,0 +1,22 @@ +Attribute +========= + +.. autoclass:: tamr_client.Attribute + +.. autofunction:: tamr_client.attribute.by_resource_id +.. autofunction:: tamr_client.attribute.to_json +.. autofunction:: tamr_client.attribute.create +.. autofunction:: tamr_client.attribute.update +.. autofunction:: tamr_client.attribute.delete + +Exceptions +---------- + +.. autoclass:: tamr_client.attribute.AlreadyExists + :no-inherited-members: + +.. autoclass:: tamr_client.attribute.NotFound + :no-inherited-members: + +.. 
autoclass:: tamr_client.attribute.ReservedName + :no-inherited-members: diff --git a/docs/beta/attribute/sub.rst b/docs/beta/attribute/sub.rst new file mode 100644 index 00000000..38ddecd7 --- /dev/null +++ b/docs/beta/attribute/sub.rst @@ -0,0 +1,21 @@ +SubAttribute +============ + +.. NOTE(pcattori): + `SubAttribute` has a recursive dependency on `AttributeType`. + `sphinx_autodoc_typehints` cannot handle recursive dependencies, + so reference docs are written manually + +.. class:: tamr_client.SubAttribute(name, type, is_nullable, description=None) + + :param name: + :type name: :class:`str` + :param type: + :type type: :class:`~tamr_client.AttributeType` + :param is_nullable: + :type is_nullable: :class:`bool` + :param description: + :type description: :class:`~typing.Optional` [:class:`str`] + +.. autofunction:: tamr_client.attribute.sub.from_json +.. autofunction:: tamr_client.attribute.sub.to_json diff --git a/docs/beta/attribute/type.rst b/docs/beta/attribute/type.rst new file mode 100644 index 00000000..a157e62d --- /dev/null +++ b/docs/beta/attribute/type.rst @@ -0,0 +1,40 @@ +AttributeType +============= + +See https://docs.tamr.com/reference#attribute-types + +.. autodata:: tamr_client.attribute.type.BOOLEAN +.. autodata:: tamr_client.attribute.type.DOUBLE +.. autodata:: tamr_client.attribute.type.INT +.. autodata:: tamr_client.attribute.type.LONG +.. autodata:: tamr_client.attribute.type.STRING + +.. autodata:: tamr_client.attribute.type.DEFAULT +.. autodata:: tamr_client.attribute.type.GEOSPATIAL + + +.. NOTE(pcattori): + `Array` has a recursive dependency on `AttributeType`. + `sphinx_autodoc_typehints` cannot handle recursive dependencies, + so reference docs are written manually + +.. class:: tamr_client.attribute.type.Array(inner_type) + + :param inner_type: + :type inner_type: :class:`~tamr_client.AttributeType` + +.. NOTE(pcattori): + `Map` has a recursive dependency on `AttributeType`.
+ `sphinx_autodoc_typehints` cannot handle recursive dependencies, + so reference docs are written manually + +.. class:: tamr_client.attribute.type.Map(inner_type) + + :param inner_type: + :type inner_type: :class:`~tamr_client.AttributeType` + + +.. autoclass:: tamr_client.attribute.type.Record + +.. autofunction:: tamr_client.attribute.type.from_json +.. autofunction:: tamr_client.attribute.type.to_json diff --git a/docs/beta/auth.rst b/docs/beta/auth.rst new file mode 100644 index 00000000..0ac54cfc --- /dev/null +++ b/docs/beta/auth.rst @@ -0,0 +1,4 @@ +Auth +==== + +.. autoclass:: tamr_client.UsernamePasswordAuth diff --git a/docs/beta/backup.rst b/docs/beta/backup.rst new file mode 100644 index 00000000..9c849592 --- /dev/null +++ b/docs/beta/backup.rst @@ -0,0 +1,18 @@ +Backup +====== + +.. autoclass:: tamr_client.Backup + +.. autofunction:: tamr_client.backup.get_all +.. autofunction:: tamr_client.backup.by_resource_id +.. autofunction:: tamr_client.backup.initiate +.. autofunction:: tamr_client.backup.cancel +.. autofunction:: tamr_client.backup.poll + +Exceptions +---------- + +.. autoclass:: tamr_client.backup.NotFound + :no-inherited-members: +.. autoclass:: tamr_client.backup.InvalidOperation + :no-inherited-members: diff --git a/docs/beta/categorization.md b/docs/beta/categorization.md new file mode 100644 index 00000000..3b884bb5 --- /dev/null +++ b/docs/beta/categorization.md @@ -0,0 +1,4 @@ +# Categorization + + * [Categorization](/beta/categorization/categorization) + * [Project](/beta/categorization/project) diff --git a/docs/beta/categorization/categorization.rst b/docs/beta/categorization/categorization.rst new file mode 100644 index 00000000..c2e4fe46 --- /dev/null +++ b/docs/beta/categorization/categorization.rst @@ -0,0 +1,7 @@ +Categorization +============== + +.. autofunction:: tamr_client.categorization.update_unified_dataset +.. autofunction:: tamr_client.categorization.apply_feedback +.. 
autofunction:: tamr_client.categorization.update_results +.. autofunction:: tamr_client.categorization.manual_labels \ No newline at end of file diff --git a/docs/beta/categorization/project.rst b/docs/beta/categorization/project.rst new file mode 100644 index 00000000..18a142a4 --- /dev/null +++ b/docs/beta/categorization/project.rst @@ -0,0 +1,6 @@ +Categorization Project +====================== + +.. autoclass:: tamr_client.CategorizationProject + +.. autofunction:: tamr_client.categorization.project.create \ No newline at end of file diff --git a/docs/beta/dataset.md b/docs/beta/dataset.md new file mode 100644 index 00000000..e95a0681 --- /dev/null +++ b/docs/beta/dataset.md @@ -0,0 +1,6 @@ +# Dataset + + * [Dataset](/beta/dataset/dataset) + * [Record](/beta/dataset/record) + * [Dataframe](/beta/dataset/dataframe) + * [Unified](/beta/dataset/unified) diff --git a/docs/beta/dataset/dataframe.rst b/docs/beta/dataset/dataframe.rst new file mode 100644 index 00000000..f63964f8 --- /dev/null +++ b/docs/beta/dataset/dataframe.rst @@ -0,0 +1,5 @@ +Dataframe +========= + +.. autofunction:: tamr_client.dataframe.upsert +.. autofunction:: tamr_client.dataframe.create diff --git a/docs/beta/dataset/dataset.rst b/docs/beta/dataset/dataset.rst new file mode 100644 index 00000000..4a932a16 --- /dev/null +++ b/docs/beta/dataset/dataset.rst @@ -0,0 +1,24 @@ +Dataset +======= + +.. autoclass:: tamr_client.Dataset + +.. autofunction:: tamr_client.dataset.by_resource_id +.. autofunction:: tamr_client.dataset.by_name +.. autofunction:: tamr_client.dataset.attributes +.. autofunction:: tamr_client.dataset.materialize +.. autofunction:: tamr_client.dataset.delete +.. autofunction:: tamr_client.dataset.get_all +.. autofunction:: tamr_client.dataset.create + +Exceptions +---------- + +.. autoclass:: tamr_client.dataset.NotFound + :no-inherited-members: + +.. autoclass:: tamr_client.dataset.Ambiguous + :no-inherited-members: + +.. 
autoclass:: tamr_client.dataset.AlreadyExists + :no-inherited-members: diff --git a/docs/beta/dataset/record.rst b/docs/beta/dataset/record.rst new file mode 100644 index 00000000..cce6072a --- /dev/null +++ b/docs/beta/dataset/record.rst @@ -0,0 +1,11 @@ +Record +========= + +.. automodule:: tamr_client.record + :no-members: + +.. autofunction:: tamr_client.record.upsert +.. autofunction:: tamr_client.record.delete +.. autofunction:: tamr_client.record._update +.. autofunction:: tamr_client.record.stream +.. autofunction:: tamr_client.record.delete_all \ No newline at end of file diff --git a/docs/beta/dataset/unified.rst b/docs/beta/dataset/unified.rst new file mode 100644 index 00000000..78054d33 --- /dev/null +++ b/docs/beta/dataset/unified.rst @@ -0,0 +1,13 @@ +Unified +======= + +.. autoclass:: tamr_client.dataset.unified.UnifiedDataset + +.. autofunction:: tamr_client.dataset.unified.from_project +.. autofunction:: tamr_client.dataset.unified.apply_changes + +Exceptions +---------- + +.. autoclass:: tamr_client.dataset.unified.NotFound + :no-inherited-members: diff --git a/docs/beta/golden_records.md b/docs/beta/golden_records.md new file mode 100644 index 00000000..af265d67 --- /dev/null +++ b/docs/beta/golden_records.md @@ -0,0 +1,4 @@ +# Golden Records + + * [Golden Records](/beta/golden_records/golden_records) + * [Project](/beta/golden_records/project) diff --git a/docs/beta/golden_records/golden_records.rst b/docs/beta/golden_records/golden_records.rst new file mode 100644 index 00000000..6736ed3b --- /dev/null +++ b/docs/beta/golden_records/golden_records.rst @@ -0,0 +1,5 @@ +Golden Records +============== + +.. autofunction:: tamr_client.golden_records.update +.. 
autofunction:: tamr_client.golden_records.publish diff --git a/docs/beta/golden_records/project.rst b/docs/beta/golden_records/project.rst new file mode 100644 index 00000000..a1267890 --- /dev/null +++ b/docs/beta/golden_records/project.rst @@ -0,0 +1,4 @@ +Golden Records Project +====================== + +.. autoclass:: tamr_client.GoldenRecordsProject \ No newline at end of file diff --git a/docs/beta/instance.rst b/docs/beta/instance.rst new file mode 100644 index 00000000..1892274a --- /dev/null +++ b/docs/beta/instance.rst @@ -0,0 +1,8 @@ +Instance +======== + +.. autoclass:: tamr_client.Instance + +.. autofunction:: tamr_client.instance.origin + +.. autofunction:: tamr_client.instance.version \ No newline at end of file diff --git a/docs/beta/mastering.md b/docs/beta/mastering.md new file mode 100644 index 00000000..c833f55d --- /dev/null +++ b/docs/beta/mastering.md @@ -0,0 +1,4 @@ +# Mastering + + * [Mastering](/beta/mastering/mastering) + * [Project](/beta/mastering/project) diff --git a/docs/beta/mastering/mastering.rst b/docs/beta/mastering/mastering.rst new file mode 100644 index 00000000..274bb07d --- /dev/null +++ b/docs/beta/mastering/mastering.rst @@ -0,0 +1,11 @@ +Mastering +========= + +.. autofunction:: tamr_client.mastering.update_unified_dataset +.. autofunction:: tamr_client.mastering.estimate_pairs +.. autofunction:: tamr_client.mastering.generate_pairs +.. autofunction:: tamr_client.mastering.apply_feedback +.. autofunction:: tamr_client.mastering.update_pair_results +.. autofunction:: tamr_client.mastering.update_high_impact_pairs +.. autofunction:: tamr_client.mastering.update_cluster_results +.. autofunction:: tamr_client.mastering.publish_clusters diff --git a/docs/beta/mastering/project.rst b/docs/beta/mastering/project.rst new file mode 100644 index 00000000..e699d61d --- /dev/null +++ b/docs/beta/mastering/project.rst @@ -0,0 +1,6 @@ +Mastering Project +================= + +.. autoclass:: tamr_client.MasteringProject + +.. 
autofunction:: tamr_client.mastering.project.create \ No newline at end of file diff --git a/docs/beta/operation.rst b/docs/beta/operation.rst new file mode 100644 index 00000000..09574708 --- /dev/null +++ b/docs/beta/operation.rst @@ -0,0 +1,19 @@ +Operation +========= + +.. autoclass:: tamr_client.Operation + +.. autofunction:: tamr_client.operation.check +.. autofunction:: tamr_client.operation.poll +.. autofunction:: tamr_client.operation.wait +.. autofunction:: tamr_client.operation.succeeded +.. autofunction:: tamr_client.operation.by_resource_id + +Exceptions +---------- + +.. autoclass:: tamr_client.operation.Failed + :no-inherited-members: + +.. autoclass:: tamr_client.operation.NotFound + :no-inherited-members: \ No newline at end of file diff --git a/docs/beta/primary_key.rst b/docs/beta/primary_key.rst new file mode 100644 index 00000000..bd4462a9 --- /dev/null +++ b/docs/beta/primary_key.rst @@ -0,0 +1,11 @@ +Primary Key +=========== + +Exceptions +---------- + +.. autoclass:: tamr_client.primary_key.Ambiguous + :no-inherited-members: + +.. autoclass:: tamr_client.primary_key.NotFound + :no-inherited-members: diff --git a/docs/beta/project.rst b/docs/beta/project.rst new file mode 100644 index 00000000..ecd0c3bc --- /dev/null +++ b/docs/beta/project.rst @@ -0,0 +1,17 @@ +Project +======= + +.. autoclass:: tamr_client.UnknownProject + +.. autofunction:: tamr_client.project.by_resource_id +.. autofunction:: tamr_client.project.by_name +.. autofunction:: tamr_client.project.get_all + +Exceptions +---------- + +.. autoclass:: tamr_client.project.NotFound + :no-inherited-members: + +.. autoclass:: tamr_client.project.Ambiguous + :no-inherited-members: \ No newline at end of file diff --git a/docs/beta/response.rst b/docs/beta/response.rst new file mode 100644 index 00000000..807b2bb5 --- /dev/null +++ b/docs/beta/response.rst @@ -0,0 +1,7 @@ +Response +======== + +Utilities for working with :class:`requests.Response` . + +.. 
autofunction:: tamr_client.response.successful +.. autofunction:: tamr_client.response.ndjson diff --git a/docs/beta/restore.rst b/docs/beta/restore.rst new file mode 100644 index 00000000..6ea05b18 --- /dev/null +++ b/docs/beta/restore.rst @@ -0,0 +1,16 @@ +Restore +======= + +.. autoclass:: tamr_client.Restore + +.. autofunction:: tamr_client.restore.get +.. autofunction:: tamr_client.restore.initiate +.. autofunction:: tamr_client.restore.cancel + +Exceptions +---------- + +.. autoclass:: tamr_client.restore.NotFound + :no-inherited-members: +.. autoclass:: tamr_client.restore.InvalidOperation + :no-inherited-members: diff --git a/docs/beta/schema_mapping.md b/docs/beta/schema_mapping.md new file mode 100644 index 00000000..27f103a5 --- /dev/null +++ b/docs/beta/schema_mapping.md @@ -0,0 +1,4 @@ +# Schema Mapping + + * [Schema Mapping](/beta/schema_mapping/schema_mapping) + * [Project](/beta/schema_mapping/project) diff --git a/docs/beta/schema_mapping/project.rst b/docs/beta/schema_mapping/project.rst new file mode 100644 index 00000000..2491745a --- /dev/null +++ b/docs/beta/schema_mapping/project.rst @@ -0,0 +1,6 @@ +Schema Mapping Project +====================== + +.. autoclass:: tamr_client.SchemaMappingProject + +.. autofunction:: tamr_client.schema_mapping.project.create \ No newline at end of file diff --git a/docs/beta/schema_mapping/schema_mapping.rst b/docs/beta/schema_mapping/schema_mapping.rst new file mode 100644 index 00000000..3ace7dfe --- /dev/null +++ b/docs/beta/schema_mapping/schema_mapping.rst @@ -0,0 +1,4 @@ +Schema Mapping +============== + +.. autofunction:: tamr_client.schema_mapping.update_unified_dataset diff --git a/docs/beta/session.rst b/docs/beta/session.rst new file mode 100644 index 00000000..e53e4ac2 --- /dev/null +++ b/docs/beta/session.rst @@ -0,0 +1,8 @@ +Session +======= + +The :class:`~tamr_client.Session` type is an alias for :class:`requests.Session`. 
+ +For more information, see the official :class:`requests.Session` docs. + +.. autofunction:: tamr_client.session.from_auth diff --git a/docs/beta/transformations.rst b/docs/beta/transformations.rst new file mode 100644 index 00000000..7d433ab2 --- /dev/null +++ b/docs/beta/transformations.rst @@ -0,0 +1,5 @@ +Transformations +=============== + +.. autofunction:: tamr_client.transformations.get_all +.. autofunction:: tamr_client.transformations.replace_all diff --git a/docs/beta/tutorial/continuous_mastering.md b/docs/beta/tutorial/continuous_mastering.md new file mode 100644 index 00000000..2ca7eedd --- /dev/null +++ b/docs/beta/tutorial/continuous_mastering.md @@ -0,0 +1,123 @@ +# Tutorial: Continuous Mastering +This tutorial will cover using the Python client to keep a Mastering project up-to-date. This includes carrying new data through to the end of the project and using any new labels to update the machine-learning model. + +While this is intended to propagate changes such as pair labeling that may be applied in the Tamr user interface, at no point during this tutorial is it necessary to interact with the user interface in any way. + +## Prerequisites +To complete this tutorial you will need: +- `tamr-unify-client` [installed](../../user-guide/installation) +- access to a Tamr instance, specifically: + - a username and password that allow you to log in to Tamr + - the socket address of the instance +- an existing Mastering project in the following state + - the schema mapping between the attributes of the source datasets and the unified dataset has been defined + - the blocking model has been defined + - labels have been applied to pairs + +It is recommended that you first complete the tutorial [here](https://docs.tamr.com/tamr-tutorials/docs/overview-mastering). Alternatively, a different Mastering project can be used as long as the above conditions are met. + +## Steps +### 1. 
Configure the Session and Instance +- Use your username and password to create an instance of `tamr_client.UsernamePasswordAuth`. +- Use the function `tamr_client.session.from_auth` to create a `Session`. +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 1-9 +``` +- Create an `Instance` using the `protocol`, `host`, and `port` of your Tamr instance. Replace these with the corresponding values for your Tamr instance. +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 11-15 +``` + +### 2. Get the Tamr Mastering project to be updated +Use the function `tc.project.by_name` to retrieve the project information from the server by its name. +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 17 +``` +Ensure that the retrieved project is a Mastering project by checking its type: +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 19-20 +``` + +### 3. Update the unified dataset +To update the unified dataset, use the function `tc.mastering.update_unified_dataset`. This function: +- Applies the [attribute mapping configuration](https://docs.tamr.com/tamr-tutorials/docs/define-project-schema-mastering) +- Applies any transformations +- Updates the unified dataset with updated source data +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 22-23 +``` +This function and all others in this tutorial are *synchronous*, meaning that they will not return until the job in Tamr has resolved, either successfully or unsuccessfully. The function `tc.operation.check` will raise an exception and halt the script if the job started in Tamr fails for any reason. + +### 4. 
Generate pairs +To generate pairs according to the [configured pair filter rules](https://docs.tamr.com/tamr-tutorials/docs/setup-how-pairs-are-found), use the function `tc.mastering.generate_pairs`. +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 25-26 +``` + +### 5. Train the model with new Labels +Running all of the functions in this section and in the "Apply the model" section that follows is equivalent to initiating "Apply feedback and update results" in the Tamr user interface. + +To [update the machine-learning model](https://docs.tamr.com/tamr-tutorials/docs/help-tamr-learn-about-your-data) with newly-applied labels use the function `tc.mastering.apply_feedback`. +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 28-29 +``` + +### 6. Apply the model +Running all of the functions in the previous "Train the model with new labels" section and in this section is equivalent to initiating "Apply feedback and update results" in the Tamr user interface. + +Running the functions in this section alone is equivalent to initiating "Update results only" in the Tamr user interface. + +Applying the trained machine-learning model requires three functions. +- To update the pair prediction results, use the function `tc.mastering.update_pair_results`. +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 31-32 +``` +- To update the list of [high-impact pairs](https://docs.tamr.com/tamr-tutorials/docs/help-tamr-learn-about-your-data#4-filter-for-high-impact-pairs), use the function `tc.mastering.update_high_impact_pairs`. +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 34-35 +``` +- To update the clustering results, use the function `tc.mastering.update_cluster_results`. +```eval_rst +.. 
literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 37-38 +``` + +### 7. Publish the clusters +To publish the record clusters, use the function `tc.mastering.publish_clusters`. +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python + :lines: 40-41 +``` + + +All of the above steps can be combined into the following script `continuous_mastering.py`: +```eval_rst +.. literalinclude:: ../../../examples/continuous_mastering.py + :language: python +``` +To run the script via command line: +```bash +TAMR_CLIENT_BETA=1 python continuous_mastering.py +``` + +To continue learning, see other tutorials and examples. \ No newline at end of file diff --git a/docs/beta/tutorial/get_version.md b/docs/beta/tutorial/get_version.md new file mode 100644 index 00000000..a3a23039 --- /dev/null +++ b/docs/beta/tutorial/get_version.md @@ -0,0 +1,68 @@ +# Tutorial: Get Tamr version +This tutorial will cover basic Python client usage by guiding you through: +1. Configuring the connection to a Tamr instance +2. Retrieving the version of that instance + +## Prerequisites +To complete this tutorial you will need: +- `tamr-unify-client` [installed](../../user-guide/installation) +- access to a Tamr instance, specifically: + - a username and password that allow you to log in to Tamr + - the socket address of the instance + +The socket address is composed of +1. The protocol, such as `"https"` or `"http"` +2. The host, which may be `"localhost"` if the instance is deployed from the same machine from which your Python code will be run +3. The port at which you access the Tamr user interface, typically `9100` + +When you view the Tamr user interface in a browser, the URL is `protocol://host:port`. If the port is missing, the URL is simply `protocol://host`. + +## Steps +### The Session +The Tamr Python client uses a `Session` to persist the user's authentication details across requests made to the server where Tamr is hosted. 
+ +A `Session` carries authentication credentials derived from a username and password, and is not explicitly tied to any single Tamr instance. For more details, see the documentation for the [Requests library](https://requests.readthedocs.io/en/master/user/advanced/#session-objects). + + - Use your username and password to create an instance of `tamr_client.UsernamePasswordAuth`. + - Use the function `tamr_client.session.from_auth` to create a `Session`. +```eval_rst +.. literalinclude:: ../../../examples/get_tamr_version.py + :language: python + :lines: 1-9 +``` +### The Instance +An `Instance` models the installation or instance of Tamr with which a user interacts via the Python client. + +- Create an `Instance` using the `protocol`, `host`, and `port` of your Tamr instance. +```eval_rst +.. literalinclude:: ../../../examples/get_tamr_version.py + :language: python + :lines: 11-15 +``` +### Getting the version of Tamr +With the `Session` and `Instance` defined, you can now interact with the API of the Tamr instance. One simple example is fetching the version of the Tamr software running on the server. + +- Use the function `tc.instance.version` and print the returned value. + +```eval_rst +.. literalinclude:: ../../../examples/get_tamr_version.py + :language: python + :lines: 17 +``` + +All of the above steps can be combined into the following script `get_tamr_version.py`: + +```eval_rst +.. literalinclude:: ../../../examples/get_tamr_version.py + :language: python +``` +To run the script via command line: +```bash +TAMR_CLIENT_BETA=1 python get_tamr_version.py +``` + +If successful, the printed result should be similar to `v2020.016.0`. + +Congratulations! This is just the start of what can be done with the Tamr Python client. + +To continue learning, see other tutorials and examples. 
\ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 129ad8e3..bc440042 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,6 +15,7 @@ import os import sys +from recommonmark.transform import AutoStructify import toml sys.path.insert(0, os.path.abspath("..")) @@ -22,8 +23,7 @@ # -- Project information ----------------------------------------------------- -project = "Tamr Unify Python Client" -copyright = "2018, Tamr" +project = "Tamr - Python Client" author = "Tamr" @@ -46,12 +46,21 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.viewcode"] -autodoc_default_flags = ["inherited-members", "members"] +extensions = [ + "recommonmark", + "sphinx.ext.napoleon", + "sphinx_autodoc_typehints", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", + "sphinx.ext.autosectionlabel", +] +autodoc_default_options = {"inherited-members": True, "members": True} autodoc_member_order = "bysource" +autosectionlabel_prefix_document = True intersphinx_mapping = { "https://docs.python.org/": None, - "requests": ("http://docs.python-requests.org/en/master/", None), + "requests": ("https://requests.readthedocs.io/en/master/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None), } # Add any paths that contain templates here, relative to this directory. @@ -61,7 +70,6 @@ # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = ".rst" # The master toctree document. master_doc = "index" @@ -88,6 +96,10 @@ # a list of builtin themes. # html_theme = "sphinx_rtd_theme" +html_favicon = "_static/favicon.png" +html_show_copyright = False # custom copyright in _templates/footer.html +html_show_sphinx = False + # Theme options are theme-specific and customize the look and feel of a theme # further. 
For a list of options available for each theme, see the @@ -100,6 +112,7 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +html_css_files = ["css/custom.css"] html_logo = "_static/tamr.png" html_sidebars = {"**": ["localtoc.html", "relations.html", "searchbox.html"]} @@ -117,7 +130,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = "TamrUnifyPythonClientdoc" +htmlhelp_basename = "TamrPythonClientdoc" # -- Options for LaTeX output ------------------------------------------------ @@ -143,8 +156,8 @@ latex_documents = [ ( master_doc, - "TamrUnifyPythonClient.tex", - "Tamr Unify Python Client Documentation", + "TamrPythonClient.tex", + "Tamr - Python Client Documentation", "Tamr", "manual", ) @@ -156,13 +169,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ( - master_doc, - "tamrunifypythonclient", - "Tamr Unify Python Client Documentation", - [author], - 1, - ) + (master_doc, "tamrpythonclient", "Tamr - Python Client Documentation", [author], 1) ] @@ -174,10 +181,10 @@ texinfo_documents = [ ( master_doc, - "TamrUnifyPythonClient", - "Tamr Unify Python Client Documentation", + "TamrPythonClient", + "Tamr - Python Client Documentation", author, - "TamrUnifyPythonClient", + "TamrPythonClient", "One line description of project.", "Miscellaneous", ) @@ -200,3 +207,15 @@ # A list of files that should not be packed into the epub file. 
epub_exclude_files = ["search.html"] + + +def setup(app): + """ + https://recommonmark.readthedocs.io/en/latest/auto_structify.html#configuring-autostructify + """ + app.add_config_value( + "recommonmark_config", + {"enable_auto_toc_tree": True, "auto_toc_maxdepth": 2}, + True, + ) + app.add_transform(AutoStructify) diff --git a/docs/contributor-guide.md b/docs/contributor-guide.md new file mode 100644 index 00000000..f20b55c8 --- /dev/null +++ b/docs/contributor-guide.md @@ -0,0 +1,47 @@ +# Contributor guide + +Thank you for learning how to contribute to Tamr's Python Client! +Your contribution will help you and many others in the Tamr community. +Before you begin, make sure you are viewing the [latest version of Contributor Guide](https://tamr-client.readthedocs.io/en/latest/contributor-guide.html). + +## Feedback + +Before submitting a new issue, [you can search existing issues](https://github.com/Datatamer/tamr-client/issues?q=is%3Aissue). +If the bug/feature has been submitted already, leave a like 👍 on the description of the Github Issue. +Maintainers will consider number of likes when prioritizing issues. + +### Bug reports +Submit bug reports as [Github issues](https://github.com/Datatamer/tamr-client/issues/new/choose). + +### Feature requests +Submit feature requests as [Github issues](https://github.com/Datatamer/tamr-client/issues/new/choose). + +## Documentation + +* [How to write user-facing documentation](contributor-guide/how-to-write-docs) + +## Code +* [Install the codebase](contributor-guide/install) +* [Run dev tasks](contributor-guide/dev-tasks) +* [Configure your text editor](contributor-guide/text-editor) +* [Read the ADRs](contributor-guide/adrs) +* [How to write tests](contributor-guide/how-to-write-tests) +* [Submit a pull request](contributor-guide/pull-request) + +## Release process +Releases are automated by [semantic-release](https://semantic-release.gitbook.io/semantic-release/). 
+ +## Maintainers + +Maintainer responsibilities: +- Triage issues +- Review + merge pull requests +- Discuss RFCs +- Publish new releases + +Current maintainers: +- [pcattori](https://github.com/pcattori) +- [skalish](https://github.com/skalish) + +Want to become a maintainer? +Open a pull request that adds your name to the list of current maintainers! \ No newline at end of file diff --git a/docs/contributor-guide.rst b/docs/contributor-guide.rst deleted file mode 100644 index e326c8cb..00000000 --- a/docs/contributor-guide.rst +++ /dev/null @@ -1,137 +0,0 @@ -Contributor Guide -================= - -Code of Conduct ---------------- - -See `CODE_OF_CONDUCT.md `_ - -.. _bug-reports-feature-requests: - -🐛 Bug Reports / 🙋 Feature Requests ------------------------------------- - -Please leave bug reports and feature requests as `Github issues `_ . - ----- - -Be sure to check through existing issues (open and closed) to confirm that the -bug hasn’t been reported before. - -Duplicate bug reports are a huge drain on the time of other contributors, and -should be avoided as much as possible. - -↪️ Pull Requests ----------------- - -For larger, new features: - - `Open an RFC issue `_ . - Discuss the feature with project maintainers to be sure that your change fits with the project - vision and that you won't be wasting effort going in the wrong direction. - - Once you get the green light 🚦 from maintainers, you can proceed with the PR. - -Contributions / PRs should follow the -`Forking Workflow `_ : - - 1. Fork it: https://github.com/[your-github-username]/unify-client-python/fork - 2. Create your feature branch:: - - git checkout -b my-new-feature - - 3. Commit your changes:: - - git commit -am 'Add some feature' - - 4. Push to the branch:: - - git push origin my-new-feature - - 5. 
Create a new Pull Request - ----- - -We optimize for PR readability, so please squash commits before and during the PR -review process if you think it will help reviewers and onlookers navigate your changes. - -Don't be afraid to ``push -f`` on your PRs when it helps our eyes read your code. - -Install -------- - -This project uses ``poetry`` as its package manager. For details on ``poetry``, -see the `official documentation `_ . - - 1. Install `pyenv `_:: - - curl https://pyenv.run | bash - - 2. Clone your fork and ``cd`` into the project:: - - git clone https://github.com//unify-client-python - cd unify-client-python - - 3. Use ``pyenv`` to install a compatible Python version (``3.6`` or newer; e.g. ``3.7.3``):: - - pyenv install 3.7.3 - - 4. Set that Python version to be your version for this project(e.g. ``3.7.3``):: - - pyenv local 3.7.3 - - 5. Check that your Python version matches the version specified in ``.python-version``:: - - cat .python-version - python --version - - 6. Install ``poetry`` as `described here `_:: - - curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python - - 7. Install dependencies via ``poetry``:: - - poetry install - -Run tests ---------- - -To run all tests:: - - poetry run pytest . - -To run specific tests, see `these pytest docs `_ . - -Run style checks ----------------- - -To run linter:: - - poetry run flake8 . - -To run formatter:: - - poetry run black --check . - -Run the formatter without the `--check` flag to fix formatting in-place. 
- -Build docs ----------- - -To build the docs:: - - cd docs/ - poetry run make html - -After docs are build, view them by:: - - cd docs/ # unless you are there already - open -a 'Google Chrome' _build/html/index.html # open in your favorite browser - -Editor config -------------- - -`Atom `_ : - -- `python-black `_ -- `linter-flake8 `_ diff --git a/docs/contributor-guide/adr/0001-record-architecture-decisions.md b/docs/contributor-guide/adr/0001-record-architecture-decisions.md new file mode 100644 index 00000000..8833a49e --- /dev/null +++ b/docs/contributor-guide/adr/0001-record-architecture-decisions.md @@ -0,0 +1,19 @@ +# 1. Record architecture decisions + +Date: 2020-08-14 + +## Status + +Accepted + +## Context + +We need to record the architectural decisions made on this project. + +## Decision + +We will use Architecture Decision Records, as [described by Michael Nygard](http://thinkrelevance.com/blog/2011/11/15/documenting-architecture-decisions). + +## Consequences + +See Michael Nygard's article, linked above. For a lightweight ADR toolset, see Nat Pryce's [adr-tools](https://github.com/npryce/adr-tools). diff --git a/docs/contributor-guide/adr/0002-linting-and-formatting.md b/docs/contributor-guide/adr/0002-linting-and-formatting.md new file mode 100644 index 00000000..d6f744b4 --- /dev/null +++ b/docs/contributor-guide/adr/0002-linting-and-formatting.md @@ -0,0 +1,31 @@ +# 2. Linting and formatting + +Date: 2019-01-14 + +## Status + +Accepted + +## Context + +Inconsistent code formatting slows down development and the review process. + +Code should be linted for things like: +- unused imports and variables +- consistent import order + +Code formatting should be done automatically or programmatically, taking the burden off of reviewers. + +## Decision + +For linting, use [flake8](https://flake8.pycqa.org/en/latest/) and [flake8-import-order](https://github.com/PyCQA/flake8-import-order). + +For formatting, use [black](https://github.com/psf/black). 
+ +## Consequences + +All linting and formatting are enforced programmatically. + +Most linting and formatting errors can be autofixed. + +Text editors and IDEs are able to integrate with our linting and formatting tools to automatically fix (most) errors on save. \ No newline at end of file diff --git a/docs/contributor-guide/adr/0003-reproducibility.md b/docs/contributor-guide/adr/0003-reproducibility.md new file mode 100644 index 00000000..be3245ff --- /dev/null +++ b/docs/contributor-guide/adr/0003-reproducibility.md @@ -0,0 +1,31 @@ +# 3. Reproducibility + +Date: 2019-06-05 + +## Status + +Accepted + +## Context + +Reproducing results from a program is challenging when operating systems, language versions, and dependency versions can vary. + +For this codebase, we will focus on consistent Python versions and dependency versions. + +## Decision + +Manage multiple Python versions via [pyenv](https://github.com/pyenv/pyenv). + +Manage dependencies via [poetry](https://python-poetry.org/). + +Define tests via [nox](https://nox.thea.codes/en/stable/). + +Run tests in automation/CI via [Github Actions](https://github.com/features/actions). + +## Consequences + +This solution lets us: +- keep track of [abstract *and* concrete versions](https://caremad.io/posts/2013/07/setup-vs-requirement/) for dependencies (think `.lock` file) +- locally test against multiple Python versions +- run the same tests locally as we do in [Continuous Integration](https://en.wikipedia.org/wiki/Continuous_integration) (CI) +- easily view CI test results within the review context diff --git a/docs/contributor-guide/adr/0004-documentation-and-docstrings.md b/docs/contributor-guide/adr/0004-documentation-and-docstrings.md new file mode 100644 index 00000000..202d43a3 --- /dev/null +++ b/docs/contributor-guide/adr/0004-documentation-and-docstrings.md @@ -0,0 +1,39 @@ +# 4. 
Documentation and docstrings + +Date: 2019-10-03 + +## Status + +Accepted + +## Context + +Documentation can take four forms: +1. Explanation +2. Tutorial +3. How-to +4. Reference + +We need a way to author and host prosey documentation and generate reference docs based on source code. + +## Decision + +Doc compilation will be done via [sphinx](https://www.sphinx-doc.org/en/master/). + +Prosey documentation (1-3) via [recommonmark](https://github.com/readthedocs/recommonmark). + +Reference documentation (4) will be generated based on type annotations and docstrings via: +- Automatic docs based on docstrings via [sphinx-autodoc](https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html), [sphinx-autodoc-typehints](https://github.com/agronholm/sphinx-autodoc-typehints) +- Google-style docstrings via [napoleon](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html) +- Hosting on [ReadTheDocs](https://readthedocs.org/) (RTD) +- Build docs in CI and fail on errors or warnings. + +## Consequences + +Prosey documentation can be written in Markdown (.md), which is more familiar to our contributors than .rst format. + +Reference doc generation makes docs more maintainable and consistent with actual code. + +Google-style docstrings are easier to read than sphinx-style docstrings. + +RTD natively compiles documentation using sphinx and simultaneously hosts docs at each version. \ No newline at end of file diff --git a/docs/contributor-guide/adr/0005-composable-functions.md b/docs/contributor-guide/adr/0005-composable-functions.md new file mode 100644 index 00000000..3fc005f9 --- /dev/null +++ b/docs/contributor-guide/adr/0005-composable-functions.md @@ -0,0 +1,69 @@ +# 5. Composable functions + +Date: 2019-11-01 + +## Status + +Accepted + +## Context + +We need a reasonable tradeoff between ease-of-use and maintainability. + +Specifically, we need composable, combinable units that can be improved independently. 
+ +### Approach 1: Classes + Methods + +One approach is to embrace Object-Oriented Programming (OOP) with fluent interfaces (i.e. method chaining): + +```python +project + .create(...) + .update(...) + .delete(...) +``` + +Characteristics: +- Ease-of-use is maximized, but this requires each method to `return self`. +- Also, this approach implies that if a function can be called with X different object types, +each of those object types should have a corresponding method that applies that functionality and then `return self`. + +How to enforce these characteristics? + +Any solution will be a tax on maintainability, as code that adheres to these characteristics will include many non-semantic lines simply going through the motions of `return self` and copying function usage into dedicated methods for each class. + +### Approach 2: Types + Functions + +Another approach is to embrace a functional programming style: simple types and functions (no methods). + +Usage is not as terse as for OOP: + +```python +p = tc.project.create(...) +u = tc.project.update(p, ...) +d = tc.project.delete(p, ...) +``` + +Characteristics: +- Ease-of-use is not optimized, but still reasonable. + - With tab-completion, ease-of-use is comparable to OOP. +- Each type can be made immutable +- Each function can be made pure +- Functionality can be shared by calling the same function in user-land, not copying function calls in contributor-land. + +## Decision + +Use `@dataclass(frozen=True)` to model types and plain Python modules and functions to capture business logic. + +## Consequences + +Immutable types and pure functions make the code much easier to reason about, +drastically cutting down the time to ramp up and debug. + +Functions are easily composable without accumulating undesired side-effects, unlike methods. + +Note that not all types and functions *have* to be immutable and pure, +but immutable types and pure functions should be the default. 
+ +If there are good reasons to make exceptions, we can do so, but we should include comments to explain why that exception was made. + diff --git a/docs/contributor-guide/adr/0006-type-checking.md b/docs/contributor-guide/adr/0006-type-checking.md new file mode 100644 index 00000000..72089494 --- /dev/null +++ b/docs/contributor-guide/adr/0006-type-checking.md @@ -0,0 +1,22 @@ +# 6. Type-checking + +Date: 2020-01-29 + +## Status + +Accepted + +## Context + +Static type-checking is available for Python, making us of the type annotations already in the codebase. + +## Decision + +Type-check via [mypy](http://mypy-lang.org/). + +## Consequences + +Testing is still important, but type checking helps to eliminate bugs via static checking, +even for parts of the code not exercised during tests. + +Additionally, type-checking relies on our type annotations, ensuring that the annotations are correct and complete. diff --git a/docs/contributor-guide/adr/0007-tamr-client-package.md b/docs/contributor-guide/adr/0007-tamr-client-package.md new file mode 100644 index 00000000..15b91c21 --- /dev/null +++ b/docs/contributor-guide/adr/0007-tamr-client-package.md @@ -0,0 +1,33 @@ +# 7. tamr_client package + +Date: 2020-04-03 + +## Status + +Accepted + +## Context + +We have an existing userbase that relies on `tamr_unify_client` and cannot painlessly make backwards-incompatible changes. + +But, we want to rearchitect this codebase as a [library of composable functions](/contributor-guide/adr/0005-composable-functions). + +## Decision + +Implement rearchitected design as a new package named `tamr_client`. + +Require the `TAMR_CLIENT_BETA=1` feature flag for `tamr_client` package usage. + +Warn users who attempt to use `tamr_client` package to opt-in if they want to beta test the new design. 
+ +## Consequences + +Continue to support `tamr_unify_client`, but any new functionality: +- must be included in `tamr_client` +- may be included in `tamr_unify_client` + +Users are required to explicitly opt-in to new features, +preserving backward compatiblitiy for current users. + +Once we reach feature parity with `tamr_unify_client`, +we can undergo a deprecation cycle and subsequently remove `tamr_unify_client. diff --git a/docs/contributor-guide/adr/0008-standardized-imports.md b/docs/contributor-guide/adr/0008-standardized-imports.md new file mode 100644 index 00000000..5757a5fd --- /dev/null +++ b/docs/contributor-guide/adr/0008-standardized-imports.md @@ -0,0 +1,47 @@ +# 8. Standardized imports + +Date: 2020-06-01 + +## Status + +Accepted + +## Context + +Python has many ways of importing: + +```python +# option 1: import module + +# option 1.a +import foo.bar.bazaar as baz +baz.do_the_thing() + +# option 1.b +from foo.bar import bazaar as baz +baz.do_the_thing() + +# option 2: import value +from foo.bar.bazaar import do_the_thing +do_the_thing() +``` + +Not to mention that each of these styles may be done with relative imports (replacing `foo.bar` with `.bar` if the `bar` package is a sibling). + +Confusingly, Option 1.a and Option 1.b are _conceptually_ the same, but mechanically there are [subtle differences](https://stackoverflow.com/questions/24807434/imports-in-init-py-and-import-as-statement/24968941#24968941). + + +## Decision + +Imports within `tamr_client`: +- Must import statements for modules, classes, and exceptions +- Must `from foo import bar` instead of `import foo.bar as bar` +- Must not import functions directly. Instead import the containing module and use `module.function(...)` +- Must not use relative imports. Use absolute imports instead. + +## Consequences + +Standardized import style helps linter correctly order imports. + +Choosing import styles is a syntactic choice without semantic meaning. 
+Removing this choice should speed up development and review. \ No newline at end of file diff --git a/docs/contributor-guide/adr/0009-separate-types-and-functions.md b/docs/contributor-guide/adr/0009-separate-types-and-functions.md new file mode 100644 index 00000000..d7be6611 --- /dev/null +++ b/docs/contributor-guide/adr/0009-separate-types-and-functions.md @@ -0,0 +1,36 @@ +# 9. Separate types and functions + +Date: 2020-06-29 + +## Status + +Accepted + +## Context + +Code must be organized to be compatible with: +- Static type-checking via [mypy](https://github.com/python/mypy) +- Runtime execution during normal usage and running tests via [pytest](https://docs.pytest.org/en/stable/) +- Static doc generation via [sphinx-autodoc-typehints](https://github.com/agronholm/sphinx-autodoc-typehints) + +Additionally: +- Functions should be able to refer to any type +- Most types depend on other types non-recursively, but some types (e.g. `SubAttribute` and `AttributeType`) do depend on each other recursively / cyclically. + +## Decision + +Put types (`@dataclass(frozen=True)`) into the `_types` module +and have all function modules depend on the `_types` module to define their inputs and outputs. + +## Consequences + +Separating types into a `_types` module (e.g. `tc.Project` is an alias for `tc._types.project.Project`) +and functions into namespaced modules (e.g. `tc.project` is a module containing project-specific utilities) +allows all of our tooling to run successfully. + +Also, splitting up types and functions means that we can author a function like `tc.dataset.attributes` in the `tc.dataset` module +while still having the `tc.attribute` module depend on `tc.Dataset` type. 
Finally, for the rare cases where cyclical dependencies for types are unavoidable, +we can use [typing.TYPE_CHECKING](https://docs.python.org/3/library/typing.html#typing.TYPE_CHECKING) since `mypy` and Python are smart enough to resolve these cyclical dependencies correctly via [forward references](https://www.python.org/dev/peps/pep-0484/#forward-references).
+ +For more on ADRs, see: +- [Why write ADRs](https://github.blog/2020-08-13-why-write-adrs/) +- [Earn future maintainers esteem by writing simple ADRs](https://understandlegacycode.com/blog/earn-maintainers-esteem-with-adrs/) + +To author new ADRs, we recommend [adr-tools](https://github.com/npryce/adr-tools). + +## ADRs + +* [Record architecture decisions](/contributor-guide/adr/0001-record-architecture-decisions) +* [Linting and formatting](/contributor-guide/adr/0002-linting-and-formatting) +* [Reproducibility](/contributor-guide/adr/0003-reproducibility) +* [Documentation and docstrings](/contributor-guide/adr/0004-documentation-and-docstrings) +* [Composable functions](/contributor-guide/adr/0005-composable-functions) +* [Type checking](/contributor-guide/adr/0006-type-checking) +* [tamr_client package](/contributor-guide/adr/0007-tamr-client-package) +* [Standardized imports](/contributor-guide/adr/0008-standardized-imports) +* [Separate types and functions](/contributor-guide/adr/0009-separate-types-and-functions) +* [Confirm performance issues before optimizing](/contributor-guide/adr/0010-confirm-performance-issues-before-optimizing) \ No newline at end of file diff --git a/docs/contributor-guide/dev-tasks.md b/docs/contributor-guide/dev-tasks.md new file mode 100644 index 00000000..6fb2c492 --- /dev/null +++ b/docs/contributor-guide/dev-tasks.md @@ -0,0 +1,96 @@ +# Run dev tasks + +This project uses [nox](https://nox.thea.codes/en/stable/). 
+ +Since `nox` will be running inside of a `poetry` environment (to guarantee you are running the same version of `nox` as everyone else), we recommend adding the following alias to your `.bashrc` / `.zshrc` to save you some keystrokes: + +```sh +alias prn='poetry run nox' +``` + +To run all checks: + +```sh +prn # with alias +poetry run nox # without alias +``` + +## Linting + +To run linter: + +```sh +prn -s lint # with alias +poetry run nox -s lint # without alias +``` + +## Formatting + +To run formatter: + +```sh +prn -s format # with alias +poetry run nox -s format # without alias +``` + +Run the formatter with the `--fix` flag to autofix formatting: + +```sh +prn -s format -- --fix # with alias +poetry run nox -s format -- --fix # without alias +``` + +## Typechecks + +To run typechecks: + +```sh +prn -s typecheck # with alias +poetry run nox -s typecheck # without alias +``` + +## Tests + +To run all tests: + +```sh +prn -s test # with alias +poetry run nox -s test # without alias +``` + +--- + +To run tests for a specific Python version e.g. 3.6: + +```sh +prn -s test-3.6 # with alias +poetry run nox -s test-3.6 # without alias +``` + +See [`nox --list`](https://nox.thea.codes/en/stable/tutorial.html#selecting-which-sessions-to-run) for more details. 
+ +--- + +To run specific tests, see [these pytest docs](https://docs.pytest.org/en/latest/usage.html#specifying-tests-selecting-tests) and pass `pytest` args after `--` e.g.: + +```sh +prn -s test -- tests/unit/test_attribute.py # with alias +poetry run nox -s test -- tests/unit/test_attribute.py # without alias +``` + + +## Docs + +To build the docs: + +```sh +prn -s docs # with alias +poetry run nox -s docs # without alias +``` + +After docs are build, view them by: + +```sh +open -a 'firefox' docs/_build/index.html # open in Firefox +open -a 'Google Chrome' docs/_build/index.html # open in Chrome +``` diff --git a/docs/contributor-guide/how-to-write-docs.md b/docs/contributor-guide/how-to-write-docs.md new file mode 100644 index 00000000..d001149a --- /dev/null +++ b/docs/contributor-guide/how-to-write-docs.md @@ -0,0 +1,63 @@ +# How to write docs + +Before you begin to add content, decide which of the three types of content you want to add: +1. Tutorial +2. How-To guide +3. Explanation + +``` note:: + There is fourth type of content, known as Reference. + + For the Tamr Client, you don't need to add reference topics manually because reference documentation for the Tamr Client is generated automatically based on the source code. + + For more details, see Reference description below. +``` + +For more information about each type of content, see the following descriptions. +Also see [Divio's documentation system manual](https://documentation.divio.com/). + +### Tutorial + +Tutorials are learning-oriented and ... + +- Must include an end-to-end walkthrough for a specific use case, such as "Tutorial: Deduplicating buildings in Cambridge". +- Must have a clearly stated goal and allow the users to achieve it after they complete the steps in the tutorial. +- Must provide the sample data and input configuration that are necessary for the user to complete the tutorial. Include this information upfront, at the start of your tutorial. 
+- Must be self-contained, but can include links to procedures described elsewhere in this documentation. + +Tutorials are useful if the use case is both simple and in high demand. +Not every use case deserves a tutorial. +Before writing a tutorial, think first of a use case that has a high learning value, and then prepare the assets needed to complete your tutorial, such as a sample dataset and sample configuration. + +Tutorials are in high demand. +If you write a good one, many users will reference it and thank you for your work! + +### How-To + +How-Tos are task-oriented and ... + +- Must include a list of numbered steps, known as a task, or a procedure, to help users complete a specific, domain-agnostic task, such as running a request, copying a file, installing, exporting, or other. For example, you can create a task titled "How to stream datasets out of Tamr". +- Must include a context paragraph, such as "It is often useful to stream datasets from Tamr, to load them into business analytics applications, such as Tableau, for analysis." Context may also include checks needed to be in place before users start the task, and links to related concepts. Context must provide information needed to begin the task, such as, it can list the host and port URL at which the endpoint for the service is served. +- Must include a stem sentence, such as: "To stream a dataset out of Tamr:" The stem sentence is followed by numbered steps. +- Must include a numbered list of steps where each step must begin with an imperative verb, such as: "Run the following curl request.", or "Save the file". For more examples see [Use Imperatives in Procedures](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/cmt-40/kantoo/vol40/doc/kce/styleguide/imperatives.html). + +### Explanation + +Explanations are understanding-oriented and ... + +- Must explain a single concept of the Tamr Python client. If you'd like to write another concept, create it separately. 
+- Must [keep sentences short](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/cmt-40/kantoo/vol40/doc/kce/styleguide/shortsentences.html). +- May include examples of code or text examples. + +### Reference + +Reference is information-oriented. + +It is something that users cannot remember and want to be able to refer to, often. +Reference provides details, such as configuration parameters for a particular method or call. +It never contains tasks, or concepts. +Reference is often automatically-generated from code, to ensure it is up-to-date and accurate at all times. + +``` note:: + Our reference documentation is automatically generated by [autodoc](https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html) based on type annotations and docstrings in the source code. +``` \ No newline at end of file diff --git a/docs/contributor-guide/how-to-write-tests.md b/docs/contributor-guide/how-to-write-tests.md new file mode 100644 index 00000000..6aa543a5 --- /dev/null +++ b/docs/contributor-guide/how-to-write-tests.md @@ -0,0 +1,78 @@ +# How to write tests + +Our test suite uses `pytest`. + +See the [pytest docs](https://docs.pytest.org/en/stable/) for: +- how to run specific tests +- how to capture `print` output for debugging tests +- etc... + +Note that you will need to pass any `pytest` arguments after `--` so that `nox` passes the arguments correctly to `pytest`: + +```sh +prn -s test-3.6 -- -s tests/tamr_client/test_project.py::test_from_resource_id_mastering +``` + +## Unit tests + +Each unit test: +- must be in a Python file whose name starts with `test_` +- must be a function whose name starts with `test_` +- should test *one* specific feature. 
+- should use `tests.tamr_client.fake` utility to fake resources and Tamr server responses as necessary + +For example, testing a simple feature that does not require communication with a Tamr server could look like: + +```python +# test_my_feature.py +import tamr_client as tc +from tests.tamr_client import fake + +def test_my_feature_works(): + # prerequisites + p = fake.project() + d = fake.dataset() + + # test my feature + result = tc.my_feature(p, d) + assert result.is_correct() +``` + +After using the `fake` utilities to set up your prerequisites, +the rest of the test code should be as representative of real user code as possible. + +Test code that exercises the feature should not contain any test-specific logic. + +### Faking responses + +If the tested feature requires communication with a Tamr server, +you will need to fake Tamr server responses. + +In general, any feature that takes a session argument will need faked responses. + +You can fake responses via the `@fake.json` decorator: + +```python +# test_my_feature.py +import tamr_client as tc +from tests.tamr_client import fake + +@fake.json +def test_my_feature(): + # prerequisites + s = fake.session() + p = fake.project() + + # test my feature + result = tc.my_feature(s, p) + assert result.is_correct() +``` + +`@fake.json` will look for a corresponding fake JSON file within `tests/tamr_client/fake_json`, +specifically `tests/tamr_client/fake_json//`. + +In the example, that would be `tests/tamr_client/fake_json/test_my_feature/test_my_feature_works.json`. + +The fake JSON file should be formatted as a list of request/response pairs in order of execution. + +For a real examples, see existing fake JSON files within `tests/tamr_client/fake_json`. 
\ No newline at end of file diff --git a/docs/contributor-guide/install.md b/docs/contributor-guide/install.md new file mode 100644 index 00000000..a54c11bf --- /dev/null +++ b/docs/contributor-guide/install.md @@ -0,0 +1,31 @@ +# Installation + +### Prerequisites + +1. Install [build dependencies for pyenv](https://github.com/pyenv/pyenv/wiki#suggested-build-environment) +2. Install [pyenv](https://github.com/pyenv/pyenv#installation) +3. Install [poetry](https://python-poetry.org/docs/#installation) + +### Clone + install + +1. Clone your fork and `cd` into the project: + + ```sh + git clone https://github.com//tamr-client + cd tamr-client + ``` + +2. Install all Python versions in [.python-version](https://github.com/Datatamer/tamr-client/blob/master/.python-version): + + [Dev tasks](dev-tasks) will use these Python versions. + + ```sh + # run `pyenv install` for each line in `.python-version` + cat .python-version | xargs -L 1 pyenv install + ``` + +3. Install project dependencies via `poetry`: + + ```sh + poetry install + ``` diff --git a/docs/contributor-guide/pull-request.md b/docs/contributor-guide/pull-request.md new file mode 100644 index 00000000..263e5eb9 --- /dev/null +++ b/docs/contributor-guide/pull-request.md @@ -0,0 +1,52 @@ +# Contributing pull requests + +### ️RFCs +If the proposed changes require design input, [open a Request For Comment issue](https://github.com/Datatamer/tamr-client/issues/new/choose). + +Discuss the feature with project maintainers to be sure that your change fits with the project vision and that you won't be wasting effort going in the wrong direction. + +Once you get the green light 🟢 from maintainers, you can proceed with the PR. + +### Pull requests + +Contributions / PRs should follow the +[Forking Workflow](https://www.atlassian.com/git/tutorials/comparing-workflows/forking-workflow). In short: + + 1. Fork it: `https://github.com/[your-github-username]/tamr-client/fork` + 2. 
Create your feature branch: + + ```sh + git checkout -b my-new-feature + ``` + + 3. Commit your changes: + + ```sh + git commit -am 'Add some feature' + ``` + + 4. Push to the branch: + + ```sh + git push origin my-new-feature + ``` + + 5. Create a new Pull Request + +### Commits + +Split and squash commits as necessary to create a clean `git` history. Once you ask for review, only add new commits (do not change existing commits) for reviewer convenience. You may change commits in your PR only if reviewers are ok with it. + +Commit messages **must** follow the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/). +CI for pull requests will enforce this and fail if commit messages are not formatted correctly. + +We recommend the [Commitzen CLI](https://github.com/commitizen/cz-cli) to make writing Conventional Commits easy, but you may write commit messages manually or use any other tools. + +Also, your commit messages should [explain any things that are not obvious](https://chris.beams.io/posts/git-commit/#why-not-how) from reading your code! + +### CI checks + +Continuous integration (CI) checks are run automatically for all pull requests. +CI runs the same [dev tasks](dev-tasks) that you can run locally. + +You should run dev tasks locally _before_ submitting your PR to cut down on subsequent commits to fix the CI checks. 
diff --git a/docs/contributor-guide/text-editor.md b/docs/contributor-guide/text-editor.md new file mode 100644 index 00000000..68c05bf5 --- /dev/null +++ b/docs/contributor-guide/text-editor.md @@ -0,0 +1,11 @@ +# Configure your Text Editor + +### Atom +- [python-black](https://atom.io/packages/python-black) +- [linter-flake8](https://atom.io/packages/linter-flake8) + +### VS Code +- [Python](https://marketplace.visualstudio.com/items?itemName=ms-python.python) + +### IntelliJ +- [Black](https://black.readthedocs.io/en/stable/editor_integration.html#pycharm-intellij-idea) diff --git a/docs/developer-interface.rst b/docs/developer-interface.rst deleted file mode 100644 index 4dcf4d6e..00000000 --- a/docs/developer-interface.rst +++ /dev/null @@ -1,93 +0,0 @@ -Developer Interface -=================== -.. _authentication: - -Authentication --------------- - -.. autoclass:: tamr_unify_client.auth.UsernamePasswordAuth - -Client ------- - -.. autoclass:: tamr_unify_client.Client - :members: - -Dataset -------- - -.. autoclass:: tamr_unify_client.models.dataset.resource.Dataset - :members: - -Dataset Profile ---------------- - -.. autoclass:: tamr_unify_client.models.dataset_profile.DatasetProfile - :members: - -Dataset Status --------------- - -.. autoclass:: tamr_unify_client.models.dataset_status.DatasetStatus - :members: - -Datasets --------- - -.. autoclass:: tamr_unify_client.models.dataset.collection.DatasetCollection - :members: - -Attribute ---------- - -.. autoclass:: tamr_unify_client.models.attribute.resource.Attribute - -Attribute Type --------------- - -.. autoclass:: tamr_unify_client.models.attribute.type.AttributeType - -Attributes ----------- - -.. autoclass:: tamr_unify_client.models.attribute.collection.AttributeCollection - -Machine Learning Models ------------------------ - -.. autoclass:: tamr_unify_client.models.machine_learning_model.MachineLearningModel - :members: - -Operations ----------- - -.. 
autoclass:: tamr_unify_client.models.operation.Operation - :members: - - -Project -------- - -.. autoclass:: tamr_unify_client.models.project.resource.Project - :members: - ----- - -.. autoclass:: tamr_unify_client.models.project.categorization.CategorizationProject - :members: - ----- - -.. autoclass:: tamr_unify_client.models.project.mastering.MasteringProject - :members: - ----- - -.. autoclass:: tamr_unify_client.models.project.estimated_pair_counts.EstimatedPairCounts - :members: - -Projects --------- - -.. autoclass:: tamr_unify_client.models.project.collection.ProjectCollection - :members: diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..ec3261b0 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,51 @@ +# Tamr - Python Client + +[View on Github](https://github.com/Datatamer/tamr-client) + +## Example + +```python +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +import os + +# grab credentials from environment variables +username = os.environ['TAMR_USERNAME'] +password = os.environ['TAMR_PASSWORD'] +auth = UsernamePasswordAuth(username, password) + +host = 'localhost' # replace with your Tamr host +tamr = Client(auth, host=host) + +# programmatically interact with Tamr! +# e.g. 
refresh your project's Unified Dataset +project = tamr.projects.by_resource_id('3') +ud = project.unified_dataset() +op = ud.refresh() +assert op.succeeded() +``` + +## User Guide + + * [FAQ](user-guide/faq) + * [Install](user-guide/installation) + * [Quickstart](user-guide/quickstart) + * [Secure credentials](user-guide/secure-credentials) + * [Workflows](user-guide/workflows) + * [Create and update resources](user-guide/spec) + * [Logging](user-guide/logging) + * [Geospatial data](user-guide/geo) + * [Pandas usage](user-guide/pandas) + * [Advanced usage](user-guide/advanced-usage) + +## Reference + + * [Reference](reference) + +## Contributor Guide + + * [Contributor guide](contributor-guide) + +## BETA + + * [BETA](beta) diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 6c58de38..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,58 +0,0 @@ -Tamr Unify - Python Client -========================== - -Version: |release| | `View on Github `_ - -Example -------- - -:: - - from tamr_unify_client import Client - from tamr_unify_client.auth import UsernamePasswordAuth - import os - - # grab credentials from environment variables - username = os.environ['UNIFY_USERNAME'] - password = os.environ['UNIFY_PASSWORD'] - auth = UsernamePasswordAuth(username, password) - - host = 'localhost' # replace with your Tamr Unify host - unify = Client(auth, host=host) - - # programmatically interace with Tamr Unify! - # e.g. refresh your project's Unified Dataset - project = unify.projects.by_resource_id('3') - ud = project.unified_dataset() - op = ud.refresh() - assert op.succeeded() - -User Guide ----------- - -.. toctree:: - :maxdepth: 2 - - user-guide/faq - user-guide/installation - user-guide/quickstart - user-guide/secure-credentials - user-guide/workflows - user-guide/geo - user-guide/advanced-usage - -Contributor Guide ------------------ - -.. toctree:: - :maxdepth: 2 - - contributor-guide - -Developer Interface -------------------- - -.. 
toctree:: - :maxdepth: 2 - - developer-interface diff --git a/docs/reference.md b/docs/reference.md new file mode 100644 index 00000000..66181985 --- /dev/null +++ b/docs/reference.md @@ -0,0 +1,11 @@ +# Reference + + * [Attributes](reference/attribute) + * [Auth](reference/auth) + * [Categorization](reference/categorization) + * [Client](reference/client) + * [Datasets](reference/dataset) + * [Machine Learning Model](reference/machine_learning_model) + * [Mastering](reference/mastering) + * [Operations](reference/operation) + * [Projects](reference/project) diff --git a/docs/reference/attribute.rst b/docs/reference/attribute.rst new file mode 100644 index 00000000..863d2b0f --- /dev/null +++ b/docs/reference/attribute.rst @@ -0,0 +1,37 @@ +Attributes +========== + +Attribute +--------- + +.. autoclass:: tamr_unify_client.attribute.resource.Attribute + :members: + +Attribute Spec +-------------- + +.. autoclass:: tamr_unify_client.attribute.resource.AttributeSpec + :members: + +Attribute Collection +-------------------- + +.. autoclass:: tamr_unify_client.attribute.collection.AttributeCollection + :members: + +Attribute Type +-------------- + +.. autoclass:: tamr_unify_client.attribute.type.AttributeType + :members: + +Attribute Type Spec +------------------- + +.. autoclass:: tamr_unify_client.attribute.type.AttributeTypeSpec + :members: + +SubAttribute +------------ +.. autoclass:: tamr_unify_client.attribute.subattribute.SubAttribute + :members: diff --git a/docs/reference/auth.rst b/docs/reference/auth.rst new file mode 100644 index 00000000..7ada7014 --- /dev/null +++ b/docs/reference/auth.rst @@ -0,0 +1,4 @@ +Auth +==== + +.. 
autoclass:: tamr_unify_client.auth.UsernamePasswordAuth diff --git a/docs/reference/categorization.rst b/docs/reference/categorization.rst new file mode 100644 index 00000000..bd70abfd --- /dev/null +++ b/docs/reference/categorization.rst @@ -0,0 +1,35 @@ +Categorization +============== + +Categorization Project +---------------------- + +.. autoclass:: tamr_unify_client.categorization.project.CategorizationProject + :members: + +Categories +---------- + +Category +^^^^^^^^ + +.. autoclass:: tamr_unify_client.categorization.category.resource.Category + :members: + +Category Spec +^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.categorization.category.resource.CategorySpec + :members: + +Category Collection +^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.categorization.category.collection.CategoryCollection + :members: + +Taxonomy +-------- + +.. autoclass:: tamr_unify_client.categorization.taxonomy.Taxonomy + :members: diff --git a/docs/reference/client.rst b/docs/reference/client.rst new file mode 100644 index 00000000..9fb01c3b --- /dev/null +++ b/docs/reference/client.rst @@ -0,0 +1,5 @@ +Client +====== + +.. autoclass:: tamr_unify_client.Client + :members: diff --git a/docs/reference/dataset.rst b/docs/reference/dataset.rst new file mode 100644 index 00000000..8c0f90e2 --- /dev/null +++ b/docs/reference/dataset.rst @@ -0,0 +1,52 @@ +Datasets +======== + +Dataset +------- + +.. autoclass:: tamr_unify_client.dataset.resource.Dataset + :members: + +Dataset Spec +------------ + +.. autoclass:: tamr_unify_client.dataset.resource.DatasetSpec + :members: + +Dataset Collection +------------------ + +.. autoclass:: tamr_unify_client.dataset.collection.DatasetCollection + :members: +.. autoclass:: tamr_unify_client.dataset.collection.CreationError + :members: + +Dataset Profile +--------------- + +.. autoclass:: tamr_unify_client.dataset.profile.DatasetProfile + :members: + +Dataset Status +-------------- + +.. 
autoclass:: tamr_unify_client.dataset.status.DatasetStatus + :members: + +Dataset URI +----------- + +.. autoclass:: tamr_unify_client.dataset.uri.DatasetURI + :members: + +Dataset Usage +------------- + +.. autoclass:: tamr_unify_client.dataset.usage.DatasetUsage + :members: + +Dataset Use +----------- + +.. autoclass:: tamr_unify_client.dataset.use.DatasetUse + :members: diff --git a/docs/reference/machine_learning_model.rst b/docs/reference/machine_learning_model.rst new file mode 100644 index 00000000..f42bd999 --- /dev/null +++ b/docs/reference/machine_learning_model.rst @@ -0,0 +1,5 @@ +Machine Learning Model +---------------------- + +.. autoclass:: tamr_unify_client.base_model.MachineLearningModel + :members: diff --git a/docs/reference/mastering.rst b/docs/reference/mastering.rst new file mode 100644 index 00000000..2c868263 --- /dev/null +++ b/docs/reference/mastering.rst @@ -0,0 +1,58 @@ +Mastering +========= + +Binning Model +------------- + +.. autoclass:: tamr_unify_client.mastering.binning_model.BinningModel + :members: + +Estimated Pair Counts +--------------------- + +.. autoclass:: tamr_unify_client.mastering.estimated_pair_counts.EstimatedPairCounts + :members: + +Mastering Project +----------------- + +.. autoclass:: tamr_unify_client.mastering.project.MasteringProject + :members: + +Published Clusters +------------------ + +Metric +^^^^^^ +.. autoclass:: tamr_unify_client.mastering.published_cluster.metric.Metric + :members: + + +Published Cluster +^^^^^^^^^^^^^^^^^ +.. autoclass:: tamr_unify_client.mastering.published_cluster.resource.PublishedCluster + :members: + +Published Cluster Configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.mastering.published_cluster.configuration.PublishedClustersConfiguration + :members: + +Published Cluster Version +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
autoclass:: tamr_unify_client.mastering.published_cluster.version.PublishedClusterVersion + :members: + +Record Published Cluster +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.mastering.published_cluster.record.RecordPublishedCluster + :members: + +Record Published Cluster Version +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.mastering.published_cluster.record_version.RecordPublishedClusterVersion + :members: diff --git a/docs/reference/operation.rst b/docs/reference/operation.rst new file mode 100644 index 00000000..51849ace --- /dev/null +++ b/docs/reference/operation.rst @@ -0,0 +1,5 @@ +Operation +========= + +.. autoclass:: tamr_unify_client.operation.Operation + :members: diff --git a/docs/reference/project.rst b/docs/reference/project.rst new file mode 100644 index 00000000..f44531f7 --- /dev/null +++ b/docs/reference/project.rst @@ -0,0 +1,69 @@ +Projects +======== + +Attribute Configurations +------------------------ + +Attribute Configuration +^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.project.attribute_configuration.resource.AttributeConfiguration + :members: + +Attribute Configuration Spec +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec + :members: + +Attribute Configuration Collection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.project.attribute_configuration.collection.AttributeConfigurationCollection + :members: + + +Attribute Mappings +------------------ + +Attribute Mapping +^^^^^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.project.attribute_mapping.resource.AttributeMapping + :members: + +Attribute Mapping Spec +^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec + :members: + +Attribute Mapping Collection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
autoclass:: tamr_unify_client.project.attribute_mapping.collection.AttributeMappingCollection + :members: + +Project +------- + +.. autoclass:: tamr_unify_client.project.resource.Project + :members: + +Project Spec +------------ + +.. autoclass:: tamr_unify_client.project.resource.ProjectSpec + :members: + +Project Collection +------------------ + +.. autoclass:: tamr_unify_client.project.collection.ProjectCollection + :members: + +Project Step +------------ + +.. autoclass:: tamr_unify_client.project.step.ProjectStep + :members: diff --git a/docs/requirements.txt b/docs/requirements.txt index 7d42fd2e..2b0c0ae9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,8 @@ -# TODO(pcattori) Delete this file once RTD fully supports poetry -Sphinx -sphinx_rtd_theme -toml +# Doc dependencies tracked separately here for interoperability with readthedocs.org +pandas==1.0.5 +recommonmark==0.6.0 +sphinx_rtd_theme==0.5.0 +sphinx-autodoc-typehints==1.11.0 +Sphinx==3.1.1 +toml==0.10.0 . diff --git a/docs/user-guide/advanced-usage.md b/docs/user-guide/advanced-usage.md new file mode 100644 index 00000000..3dee4e98 --- /dev/null +++ b/docs/user-guide/advanced-usage.md @@ -0,0 +1,96 @@ +# Advanced Usage + +## Asynchronous Operations + +You can opt-in to an asynchronous interface via the asynchronous keyword argument for methods that kick-off Tamr operations. + +E.g.: + +```python +op = project.unified_dataset().refresh(asynchronous=True) +# do asynchronous stuff here while operation is running +op = op.wait() # hangs until operation finishes +assert op.succeeded() +``` + +## Raw HTTP requests and Unversioned API Access + +We encourage you to use the high-level, object-oriented interface offered by the Python Client. If you aren't sure whether you need to send low-level HTTP requests, you probably don't. 
+ +But sometimes it's useful to directly send HTTP requests to Tamr; for example, Tamr has many APIs that are not covered by the higher-level interface (most of which are neither versioned nor supported). You can still call these endpoints using the Python Client, but you'll need to work with raw `Response` objects. + +### Custom endpoint + +The client exposes a `request` method with the same interface as +`requests.request`: + +```python +# import Python Client library and configure your client + +tamr = Client(auth) +# do stuff with the `tamr` client + +# now I NEED to send a request to a specific endpoint +response = tamr.request('GET', 'relative/path/to/resource') +``` + +This will send a request relative to the base_path registered with the client. If you provide an absolute path to the resource, the base_path will be ignored when composing the request: + +```python +# import Python Client library and configure your client + +tamr = Client(auth) + +# request a resource outside the configured base_path +response = tamr.request('GET', '/absolute/path/to/resource') +``` + +You can also use the `get`, `post`, `put`, `delete` convenience +methods: + +```python +# e.g. `get` convenience method +response = tamr.get('relative/path/to/resource') +``` + +### Custom Host / Port / Base API path + +If you need to repeatedly send requests to another port or base API path (i.e. not `/api/versioned/v1/`), you can simply instantiate a different client. + +Then just call `request` as described above: + +```python +# import Python Client library and configure your client + +tamr = api.Client(auth) +# do stuff with the `tamr` client + +# now I NEED to send requests to a different host/port/base API path etc.. +# NOTE: in this example, we reuse `auth` from the first client, but we could +# have made a new Authentication provider if this client needs it. 
+custom_client = api.Client( + auth, + host="10.10.0.1", + port=9090, + base_path="/api/some_service/", +) +response = custom_client.get('relative/path/to/resource') +``` + +### One-off authenticated request + +All of the Python Client Authentication providers adhere to the `requests.auth.AuthBase` interface. + +This means that you can pass in an Authentication provider directly to the `requests` library: + +```python +from tamr_unify_client.auth import UsernamePasswordAuth +import os +import requests + +username = os.environ['TAMR_USERNAME'] +password = os.environ['TAMR_PASSWORD'] +auth = UsernamePasswordAuth(username, password) + +response = requests.request('GET', 'some/specific/endpoint', auth=auth) +``` diff --git a/docs/user-guide/advanced-usage.rst b/docs/user-guide/advanced-usage.rst deleted file mode 100644 index e1c7c1c6..00000000 --- a/docs/user-guide/advanced-usage.rst +++ /dev/null @@ -1,154 +0,0 @@ -Advanced Usage -============== - -Asynchronous Operations ------------------------ - -You can opt-in to an asynchronous interface via the asynchronous keyword argument -for methods that kick-off Unify operations. - -E.g.:: - - operation = project.unified_dataset().refresh(asynchronous=True) - # do asynchronous stuff while operation is running - operation.wait() # hangs until operation finishes - assert op.succeeded() - -Logging API calls ------------------ - -It can be useful (e.g. for debugging) to log the API calls made on your behalf -by the Python Client. - -You can set up HTTP-API-call logging on any client via -standard `Python logging mechanisms `_ :: - - from tamr_unify_client import Client - from unify_api_v1.auth import UsernamePasswordAuth - import logging - - auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) - - # Reload the `logging` library since other libraries (like `requests`) already - # configure logging differently. 
See: https://stackoverflow.com/a/53553516/1490091 - import imp - imp.reload(logging) - - logging.basicConfig( - level=logging.INFO, format="%(message)s", filename=log_path, filemode="w" - ) - unify.logger = logging.getLogger(name) - -By default, when logging is set up, the client will log ``{method} {url} : -{response_status}`` for each API call. - -You can customize this by passing in a value for ``log_entry``:: - - def log_entry(method, url, response): - # custom logging function - # use the method, url, and response to construct the logged `str` - # e.g. for logging out machine-readable JSON: - import json - return json.dumps({ - "request": f"{method} {url}", - "status": response.status_code, - "json": response.json(), - }) - - # after configuring `unify.logger` - unify.log_entry = log_entry - -.. _custom-http-requests-and-unversioned-api-access: - -Custom HTTP requests and Unversioned API Access ------------------------------------------------ - -We encourage you to use the high-level, object-oriented interface offered by -the Python Client. If you aren't sure whether you need to send low-level HTTP -requests, you probably don't. - -But sometimes it's useful to directly send HTTP requests to Unify; for example, -Unify has many APIs that are not covered by the higher-level interface (most of -which are neither versioned nor supported). You can still call these endpoints -using the Python Client, but you'll need to work with raw ``Response`` objects. - -Custom endpoint -^^^^^^^^^^^^^^^ - -The client exposes a ``request`` method with the same interface as -``requests.request``:: - - # import Python Client library and configure your client - - unify = Client(auth) - # do stuff with the `unify` client - - # now I NEED to send a request to a specific endpoint - response = unify.request('GET', 'relative/path/to/resource') - -This will send a request relative to the base_path registered with the client. 
-If you provide an absolute path to the resource, the base_path will be ignored -when composing the request:: - - # import Python Client library and configure your client - - unify = Client(auth) - - # request a resource outside the configured base_path - response = unify.request('GET', '/absolute/path/to/resource') - -You can also use the ``get``, ``post``, ``put``, ``delete`` convenience -methods:: - - # e.g. `get` convenience method - response = unify.get('relative/path/to/reosurce') - -Custom Host / Port / Base API path -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If you need to repeatedly send requests to another port or base API path -(i.e. not ``api/versioned/v1/``), you can simply instantiate a different client. - -Then just call ``request`` as described above:: - - # import Python Client library and configure your client - - unify = api.Client(auth) - # do stuff with the `unify` client - - # now I NEED to send requests to a different host/port/base API path etc.. - # NOTE: in this example, we reuse `auth` from the first client, but we could - # have made a new Authentication provider if this client needs it. - custom_client = api.Client( - auth, - host="10.10.0.1", - port=9090, - base_path="api/some_service/", - ) - response = custom_client.get('relative/path/to/resource') - -Note that any component of the base_path after the final slash will be ignored; -see the documentation on `urljoin -`_ -for details. - -One-off authenticated request -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -All of the Python Client Authentication providers adhere to the -``requests.auth.BaseAuth`` interface. 
- -This means that you can pass in an -Authentication provider directly to the ``requests`` library:: - - from tamr_unify_client.auth import UsernamePasswordAuth - import os - import requests - - username = os.environ['UNIFY_USERNAME'] - password = os.environ['UNIFY_PASSWORD'] - auth = UsernamePasswordAuth(username, password) - - response = requests.request('GET', 'some/specific/endpoint', auth=auth) - diff --git a/docs/user-guide/faq.md b/docs/user-guide/faq.md new file mode 100644 index 00000000..70455152 --- /dev/null +++ b/docs/user-guide/faq.md @@ -0,0 +1,24 @@ +# FAQ + +## What version of the Python Client should I use? + +The Python Client just cares about features, and will try everything it knows to implement those features correctly, independent of the API version. + +If you are starting a new project or your existing project does not yet use the Python Client, we encourage you to use the **latest stable version** of the Python Client. + +Otherwise, check the [Releases](https://github.com/Datatamer/tamr-client/releases) to see: + +* what new features and bug fixes are available in newer versions +* which breaking changes (if any) will require changes in your code to get those new features and bug fixes + +Note: You do not need to reason about the Tamr API version nor the Tamr app/server version. + +## How do I call custom endpoints, e.g. endpoints outside the Tamr API? + +To call a custom endpoint *within* the Tamr API, use the `client.request()` method, and provide an endpoint described by a path relative to `base_path`. + +For example, if `base_path` is `/api/versioned/v1/` (the default), and you want to get `/api/versioned/v1/projects/1`, you only need to provide `projects/1` (the relative ID provided by the project) as the endpoint, and the Client will resolve that into `/api/versioned/v1/projects/1`. + +There are various APIs outside the `/api/versioned/v1/` prefix that are often useful or necessary to call - e.g. 
`/api/service/health`, or other un-versioned / unsupported APIs. To call a custom endpoint *outside* the Tamr API, use the `client.request()` method, and provide an endpoint described by an *absolute* path (a path starting with `/`). For example, to get `/api/service/health` (no matter what `base_path` is), call `client.request()` with `/api/service/health` as the endpoint. The Client will ignore `base_path` and send the request directly against the absolute path provided. + +For additional detail, see [Raw HTTP requests and Unversioned API Access]() diff --git a/docs/user-guide/faq.rst b/docs/user-guide/faq.rst deleted file mode 100644 index 708c7903..00000000 --- a/docs/user-guide/faq.rst +++ /dev/null @@ -1,91 +0,0 @@ -FAQ -=== - -What version of the Python Client should I use? ------------------------------------------------ - -If you are starting a new project or your existing project does not yet use the -Python Client, we encourage you to use the **latest stable version** of the Python -Client. - ----- - -If you are already using the Python Client, you have 3 options: - - -1. **"I like my project's code the way it is."** - - Keep using the version you are on. - -2. **"I want some new features released in versions with the same major version that I'm currently using."** - - Upgrade to the latest stable version *with the same major version* as what - you currently use. - -3. **"I want all new features and I'm willing to modify my code to get those features!"** - - Upgrade to the latest stable version *even* if it has a different major - version from what you currently use. - -Note that you do not need to reason about the Unify API version nor the the Unify version. - ----- - -**How does this the Python Client accomplish this?** - -The short answer is that the Python Client just cares about features, and will -try everything it knows to implement those features correctly, independent of -the API version. - -We'll illustrate with an example. 
- -Let's say you want to get a dataset by name in your Python code. - -**1.** If no such feature exists, you can file a Feature Request. Note that the Python -Client is limited by what the Unify API enables. So you should check if the Unify -API docs to see if the feature you want is even possible. - -**2.** If this feature already exists, you can try it out! - -E.g. ``unify.datasets.by_name(some_dataset_name)`` - - **2.a** It works! 🎉 - - **2.b** If it fails with an HTTP error, it could be for 2 reasons: - - **2.a.i** It might be impossible to support that feature in the Python Client - because your Unify API version does not have the necessary endpoints to - support it. - - **2.a.ii** Your Unify API version *does* support this feature with some endpoints, - but the Python Client know how to correctly implement this feature for this - version of the API. In this case, you should submit a Feature Request. - - **2.c** If it fails with any other error, you should submit a Bug Report. 🐛 - - -.. note:: - To see how to submit Bug Reports / Feature Requests, see :ref:`bug-reports-feature-requests`. - - To check what endpoints your version of the Unify API supports, see `docs.tamr.com/reference `_ - (be sure to select the correct version in the top left!). - - -How do I call custom endpoints, e.g. endpoints outside the Unify API? ---------------------------------------------------------------------- - -To call a custom endpoint *within* the Unify API, use the ``client.request()`` method, and -provide an endpoint described by a path relative to ``base_path``. For example, if ``base_path`` -is ``/api/versioned/v1/`` (the default), and you want to get ``/api/versioned/v1/projects/1``, -you only need to provide ``projects/1`` (the relative ID provided by the project) as the endpoint, -and the Client will resolve that into ``/api/versioned/v1/projects/1``. - -There are various APIs outside the ``/api/versioned/v1/`` prefix that are often useful or necessary -to call - e.g. 
``/api/service/health``, or other un-versioned / unsupported APIs. To call a custom -endpoint *outside* the Unify API, use the ``client.request()`` method, and provide an endpoint -described by an *absolute* path (a path starting with ``/``). For example, to get -``/api/service/health`` (no matter what ``base_path`` is), call ``client.request()`` with -``/api/service/health`` as the endpoint. The Client will ignore ``base_path`` and send the -request directly against the absolute path provided. - -For additional detail, see :ref:`custom-http-requests-and-unversioned-api-access`. diff --git a/docs/user-guide/geo.md b/docs/user-guide/geo.md new file mode 100644 index 00000000..3808736d --- /dev/null +++ b/docs/user-guide/geo.md @@ -0,0 +1,110 @@ +# Geospatial Data +## What geospatial data is supported? +In general, the Python Geo Interface is supported; see <https://gist.github.com/sgillies/2217756>. + +There are three layers of information, modeled after GeoJSON (see <https://tools.ietf.org/html/rfc7946>): + +* The outermost layer is a FeatureCollection +* Within a FeatureCollection are Features, each of which represents one "thing", like a building or a river. Each feature has: + * type (string; required) + * id (object; required) + * geometry (Geometry, see below; optional) + * bbox ("bounding box", 4 doubles; optional) + * properties (map[string, object]; optional) +* Within a Feature is a Geometry, which represents a shape, like a point or a polygon. Each geometry has: + * type (one of "Point", "MultiPoint", "LineString", "MultiLineString", "Polygon", "MultiPolygon"; required) + * coordinates (doubles; exactly how these are structured depends on the type of the geometry) + +Although the Python Geo Interface is non-prescriptive when it comes to the data types of the id and properties, Tamr has a more restricted set of supported types. See <https://docs.tamr.com/reference#attribute-types>. + +The `Dataset` class supports the `__geo_interface__` property. This will produce one `FeatureCollection` for the entire dataset. 
+ +There is a companion iterator `itergeofeatures()` that returns a generator that allows you to +stream the records in the dataset as Geospatial features. + +To produce a GeoJSON representation of a dataset: +```python +dataset = client.datasets.by_name("my_dataset") +with open("my_dataset.json", "w") as f: + json.dump(dataset.__geo_interface__, f) +``` + +By default, `itergeofeatures()` will use the first dataset attribute with geometry type to fill in the feature geometry. You can override this by specifying the geometry attribute to use in the `geo_attr` parameter to `itergeofeatures`. + +`Dataset` can also be updated from a feature collection that supports the Python Geo Interface: +```python +import geopandas +geodataframe = geopandas.GeoDataFrame(...) +dataset = client.datasets.by_name("my_dataset") +dataset.from_geo_features(geodataframe) +``` +Note that there are currently some limitations to GeoPandas' implementation of the Geo Interface. See below for more details. + +By default the features' geometries will be placed into the first dataset attribute with geometry +type. You can override this by specifying the geometry attribute to use in the `geo_attr` +parameter to `from_geo_features`. + +## Rules for converting from Tamr records to Geospatial Features +The record's primary key will be used as the feature's `id`. If the primary key is a single attribute, then the value of that attribute will be the value of `id`. If the primary key is composed of multiple attributes, then the value of the `id` will be an array with the values of the key attributes in order. + +Tamr allows any number of geometry attributes per record; the Python Geo Interface is limited to one. When converting Tamr records to Python Geo Features, the first geometry attribute in the schema will be used as the geometry; all other geometry attributes will appear as properties with no type conversion. 
In the future, additional control over the handling of multiple geometries may be provided; the current set of capabilities is intended primarily to support the use case of working with FeatureCollections within Tamr, and FeatureCollection has only one geometry per feature. + +An attribute is considered to have geometry type if it has type `RECORD` and contains an attribute named `point`, `multiPoint`, `lineString`, `multiLineString`, `polygon`, or `multiPolygon`. + +If an attribute named `bbox` is available, it will be used as `bbox`. No conversion is done on the value of `bbox`. In the future, additional control over the handling of `bbox` attributes may be provided. + +All other attributes will be placed in `properties`, with no type conversion. This includes all geometry attributes other than the first. + +## Rules for converting from Geospatial Features to Tamr records +The Feature's `id` will be converted into the primary key for the record. If the record uses a simple key, no value translation will be done. If the record uses a composite key, then the value of the Feature's `id` must be an array of values, one per attribute in the key. + +If the Feature contains keys in `properties` that conflict with the record keys, `bbox`, or geometry, those keys are ignored (omitted). + +If the Feature contains a `bbox`, it is copied to the record's `bbox`. + +All other keys in the Feature's `properties` are propagated to the same-name attribute on the record, with no type conversion. + +## Streaming data access +The `Dataset` method `itergeofeatures()` returns a generator that allows you to stream the records in the dataset as Geospatial features: +```python +my_dataset = client.datasets.by_name("my_dataset") +for feature in my_dataset.itergeofeatures(): + do_something(feature) +``` + +Note that many packages that consume the Python Geo Interface will be able to consume this +iterator directly. 
For example: +```python +from geopandas import GeoDataFrame +df = GeoDataFrame.from_features(my_dataset.itergeofeatures()) +``` +This allows construction of a GeoDataFrame directly from the stream of records, without materializing the intermediate dataset. + +## Note on GeoPandas data access +There is a current limitation in [GeoPandas](https://github.com/geopandas/geopandas/issues/1208) that causes the feature's ID field to be ignored in certain scenarios. The Tamr primary key is stored in this field. +The result is that when loading data and updating records through the `dataset.from_geo_features()` method, records will not be overwritten as anticipated. + +This issue can be circumvented by loading features into GeoPandas by re-inserting the id field into the data. + +```python +my_dataset = client.datasets.by_name("my_dataset") +for feature in my_dataset.itergeofeatures(): + primary_key = feature['id'] + df = gpd.GeoDataFrame.from_features([feature]) + do_something(df) + df.index = [primary_key] + my_dataset.from_geo_features(df) +``` + +Alternatively, it is possible to load the full dataset as follows: +```python +my_dataset = client.datasets.by_name("my_dataset") +def geopandas_dataset(dataset): + for feature in dataset.itergeofeatures(): + feature['properties']['primary_key'] = feature['id'] + yield feature +df = gpd.GeoDataFrame.from_features(geopandas_dataset(my_dataset)) +df = df.set_index('primary_key') +do_something(df) +my_dataset.from_geo_features(df) +``` diff --git a/docs/user-guide/geo.rst b/docs/user-guide/geo.rst deleted file mode 100644 index e28339a0..00000000 --- a/docs/user-guide/geo.rst +++ /dev/null @@ -1,108 +0,0 @@ -Geospatial Data -=============== - -What geospatial data is supported? 
----------------------------------- - -In general, the Python Geo Interface is supported; see https://gist.github.com/sgillies/2217756 - -There are three layers of information, modeled after GeoJSON; see https://tools.ietf.org/html/rfc7946 : - -- The outermost layer is a FeatureCollection -- Within a FeatureCollection are Features, each of which represents one "thing", like a building - or a river. Each feature has: - - - type (string; required) - - id (object; required) - - geometry (Geometry, see below; optional) - - bbox ("bounding box", 4 doubles; optional) - - properties (map[string, object]; optional) - -- Within a Feature is a Geometry, which represents a shape, like a point or a polygon. Each - geometry has: - - - type (one of "Point", "MultiPoint", "LineString", "MultiLineString", "Polygon", "MultiPolygon"; - required) - - coordinates (doubles; exactly how these are structured depends on the type of the geometry) - -Although the Python Geo Interface is non-prescriptive when it comes to the data types of the id and -properties, Unify has a more restricted set of supported types. See https://docs.tamr.com/reference#attribute-types - -The :class:`~tamr_unify_client.models.dataset.resource.Dataset` class supports the -``__geo_interface__`` property. This will produce one ``FeatureCollection`` for the entire dataset. - -There is a companion iterator ``itergeofeatures()`` that returns a generator that allows you to -stream the records in the dataset as Geospatial features. - -To produce a GeoJSON representation of a dataset:: - - dataset = client.datasets.by_name("my_dataset") - with open("my_dataset.json", "w") as f: - json.dump(dataset.__geo_interface__, f) - -``Dataset`` can also be updated from a feature collection that supports the Python Geo Interface:: - - import geopandas - geodataframe = geopandas.GeoDataFrame(...) 
- dataset = client.dataset.by_name("my_dataset") - dataset.from_geo_features(geodataframe) - -Rules for converting from Unify records to Geospatial Features ------------------------------------------------------------------- - -The record's primary key will be used as the feature's ``id``. If the primary key is a single -attribute, then the value of that attribute will be the value of ``id``. If the primary key is -composed of multiple attributes, then the value of the ``id`` will be an array with the values -of the key attributes in order. - -Unify allows any number of geometry attributes per record; the Python Geo Interface is limited to -one. When converting Unify records to Python Geo Features, the first geometry attribute in the schema -will be used as the geometry; all other geometry attributes will appear as properties with no type -conversion. In the future, additional control over the handling of multiple geometries may be -provided; the current set of capabilities is intended primarily to support the use case of working -with FeatureCollections within Unify, and FeatureCollection has only one geometry per feature. - -An attribute is considered to have geometry type if it has type ``RECORD`` and contains an attribute -named ``point``, ``multiPoint``, ``lineString``, ``multiLineString``, ``polygon``, or -``multiPolygon``. - -If an attribute named ``bbox`` is available, it will be used as ``bbox``. No conversion is done -on the value of ``bbox``. In the future, additional control over the handling of ``bbox`` attributes -may be provided. - -All other attributes will be placed in ``properties``, with no type conversion. This includes -all geometry attributes other than the first. - -Rules for converting from Geospatial Features to Unify records --------------------------------------------------------------- - -The Feature's ``id`` will be converted into the primary key for the record. If the record uses -a simple key, no value translation will be done. 
If the record uses a composite key, then the -value of the Feature's ``id`` must be an array of values, one per attribute in the key. - -If the Feature contains keys in ``properties`` that conflict with the record keys, ``bbox``, -or geometry, those keys are ignored (omitted). - -If the Feature contains a ``bbox``, it is copied to the record's ``bbox``. - -All other keys in the Feature's ``properties`` are propagated to the same-name attribute on the -record, with no type conversion. - -Streaming data access ---------------------- - -The ``Dataset`` method ``itergeofeatures()`` returns a generator that allows you to -stream the records in the dataset as Geospatial features:: - - my_dataset = client.datasets.by_name("my_dataset") - for feature in my_dataset.itergeofeatures(): - do_something(feature) - -Note that many packages that consume the Python Geo Interface will be able to consume this -iterator directly. For example:: - - from geopandas import GeoDataFrame - df = GeoDataFrame.from_features(my_dataset.itergeofeatures()) - -This allows construction of a GeoDataFrame directly from the stream of records, without -materializing the intermediate dataset. diff --git a/docs/user-guide/installation.md b/docs/user-guide/installation.md new file mode 100644 index 00000000..78f484ed --- /dev/null +++ b/docs/user-guide/installation.md @@ -0,0 +1,56 @@ +# Installation + +`tamr-unify-client` is compatible with Python 3.6 or newer. + +## Stable releases +Installation is as simple as: + +`pip install tamr-unify-client` + +Or: + +`poetry add tamr-unify-client` + +``` note:: + If you don't use `poetry `_, we recommend you use a virtual environment for your project and install the Python Client into that virtual environment. + + You can create a virtual environment with Python 3 via: + + ``python3 -m venv my-venv`` + + For more, see `The Hitchhiker's Guide to Python `_. 
+``` +## Latest (unstable) +``` note:: + This project uses the new ``pyproject.toml`` file, not a ``setup.py`` file, so make sure you have the latest version of ``pip`` installed: ``pip install -U pip``. +``` +To install the bleeding edge: +```bash +git clone https://github.com/Datatamer/tamr-client +cd tamr-client +pip install . +``` + +## Offline installs + +First, download `tamr-unify-client` and its dependencies on a machine with online access to PyPI: + +```bash +pip download tamr-unify-client -d tamr-unify-client-requirements +zip -r tamr-unify-client-requirements.zip tamr-unify-client-requirements +``` + +Then, ship the `.zip` file to the target machine where you want `tamr-unify-client` installed. You can do this via email, cloud drives, `scp` or any other mechanism. + +Finally, install `tamr-unify-client` from the saved dependencies: + +```bash +unzip tamr-unify-client-requirements.zip +pip install --no-index --find-links=tamr-unify-client-requirements tamr-unify-client +``` + +If you are not using a virtual environment, you may need to specify the `--user` flag if you get permissions errors: + +```bash +pip install --user --no-index --find-links=tamr-unify-client-requirements tamr-unify-client +``` diff --git a/docs/user-guide/installation.rst b/docs/user-guide/installation.rst deleted file mode 100644 index 8a01b6aa..00000000 --- a/docs/user-guide/installation.rst +++ /dev/null @@ -1,59 +0,0 @@ -Installation -============ - -``tamr-unify-client`` is compatible with Python 3.6 or newer. - -Stable releases ---------------- - -Installation is as simple as:: - - pip install tamr-unify-client - -Or:: - - poetry add tamr-unify-client - -.. note:: - If you don't use `poetry `_, we recommend you use a virtual environment for - your project and install the Python Client into that virtual environment. - - You can create a virtual environment with Python 3 via:: - - python3 -m venv my-venv - - For more, see `The Hitchhiker's Guide to Python `_ . 
- -Latest (unstable) ------------------ - -.. note:: - This project uses the new ``pyproject.toml`` file, not a ``setup.py`` file, so - make sure you have the latest version of ``pip`` installed: ``pip install -U pip``. - -To install the bleeding edge:: - - git clone https://github.com/Datatamer/unify-client-python - cd unify-client-python - pip install . - -Offline installs ----------------- - -First, download ``tamr-unify-client`` and its dependencies on a machine with online access to PyPI:: - - pip download tamr-unify-client -d tamr-unify-client-requirements - zip -r tamr-unify-client-requirements.zip tamr-unify-client-requirements - -Then, ship the ``.zip`` file to the target machine where you want ``tamr-unify-client`` installed. -You can do this via email, cloud drives, ``scp`` or any other mechanism. - -Finally, install ``tamr-unify-client`` from the saved dependencies:: - - unzip tamr-unify-client-requirements.zip - pip install --no-index --find-links=tamr-unify-client-requirements tamr-unify-client - -If you are not using a virtual environment, you may need to specify the ``--user`` flag -if you get permissions errors:: - - pip install --user --no-index --find-links=tamr-unify-client-requirements tamr-unify-client diff --git a/docs/user-guide/logging.md b/docs/user-guide/logging.md new file mode 100644 index 00000000..506de943 --- /dev/null +++ b/docs/user-guide/logging.md @@ -0,0 +1,52 @@ +# Logging + +**IMPORTANT** Make sure to configure logging BEFORE `import`ing from 3rd party +libraries. Logging will use the first configuration it finds, and if a library +configures logging before you, your configuration will be ignored. + +--- + +To configure logging, simply follow the [official Python logging HOWTO](https://docs.python.org/3/howto/logging.html#logging-howto). 
+ +For example: +```python +# script.py +import logging + +logging.basicConfig(filename="script.log", level=logging.INFO) + +# configure logging before other imports + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + +auth = UsernamePasswordAuth("my username", "my password") +tamr = Client(auth, host="myhost") + +for p in tamr.projects: + print(p) + +for d in tamr.datasets: + print(d) + +# should cause an HTTP error +tamr.get("/invalid/api/path").successful() +``` + +This will log all API requests made and print the response bodies for any +requests with HTTP error codes. + +If you want to **only** configure logging for the Tamr Client: +```python +import logging +logger = logging.getLogger('tamr_unify_client') +logger.setLevel(logging.INFO) +logger.addHandler(logging.FileHandler('tamr-client.log')) + +# configure logging before other imports + +from tamr_unify_client import Client +from tamr_unify_client import UsernamePasswordAuth + +# rest of script goes here +``` diff --git a/docs/user-guide/pandas.md b/docs/user-guide/pandas.md new file mode 100644 index 00000000..2bc1844d --- /dev/null +++ b/docs/user-guide/pandas.md @@ -0,0 +1,189 @@ +# Pandas Workflow + +## Connecting To Tamr + +Connecting to a Tamr instance: + +```python +import os +import pandas as pd +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + +username = os.environ['TAMR_USERNAME'] +password = os.environ['TAMR_PASSWORD'] + +auth = UsernamePasswordAuth(username, password) +tamr = Client(auth) +``` +## Load dataset as Dataframe + +### Loading: In Memory + +Loading a `dataset` as a pandas `dataframe` is possible via the `from_records()` method that pandas provides. 
+An example is shown below: + +```python +my_dataset = tamr.datasets.by_name("my_tamr_dataset") +df = pd.DataFrame.from_records(my_dataset.records()) +``` + +This will construct a pandas dataframe based on the records that are streamed in, and stored in the pandas dataframe. +Once all records have been loaded, you will be able to interact with the dataframe normally. + +Note that as values are typically represented inside `arrays` within Tamr, the values will be encapsulated `lists` +inside the dataframe. You can use traditional methods in pandas to deal with this; for example by calling `.explode()`, +or extracting specific elements. + +### Loading: Streaming +When working with large `datasets` it is sometimes better not to work in memory, but to iterate through a dataset, rather +than load the entire dataset at once. +Since `dataset.records()` is a generator, this can easily be done as follows: +```python +output = [] +for record in dataset.records(): + single_record_df = pd.DataFrame.from_records(record) + output.append(do_something(single_record_df)) +``` + +### Custom Generators +In order to customise the data loaded into the pandas dataframe, it is possible to customise the generator object +`dataset.records()` by wrapping it in a different generator. + +For example, it is possible to automatically flatten all lists with a length of one, and apply this to the `dataset.records()` +generator as follows: + +```python +def unlist(lst): + """ + If object is a list of length one, return first element. + Otherwise, return original object. + """ + if isinstance(lst, list) and len(lst) == 1: + return lst[0] + else: + return lst + +def dataset_to_pandas(dataset): + """ + Incorporates basic unlisting for easy transfer between Tamr and Pandas. 
+ """ + for record in dataset.records(): + for key in record: + record[key] = unlist(record[key]) + yield record + +df = pd.DataFrame.from_records(dataset_to_pandas(my_dataset)) +``` + +Similarly, it is possible to filter to extract only certain attributes, by specifying this in the generator: + +```python +def filter_dataset_to_pandas(dataset, colnames): + """ + Filter the dataset to only the primary key and the columns specified as a list in colnames. + """ + assert isinstance(colnames, list) + colnames = dataset.key_attribute_names + colnames if dataset.key_attribute_names[0] not in colnames else colnames + for record in dataset.records(): + yield {k: unlist(v) for k, v in record.items() if k in colnames} + +df = pd.DataFrame.from_records(filter_dataset_to_pandas(my_dataset, ['City', 'new_attr'])) +``` + +Note that upserting these records back to the original Tamr Dataset would overwrite the existing records and attributes, and cause loss of the data +stored in the removed attributes. + +## Upload Dataframe as Dataset + +### Create New Dataset +To create a new dataset and upload data, the convenience function `datasets.create_from_dataframe()` can be used. +Note that Tamr will throw an error if columns aren't generally formatted as strings. (The exception being geospatial +columns. For that, see the geospatial examples.) + +To format values as strings while preserving null information, specify `dtype=object` when creating a dataframe from a csv file. +```python +df = pd.read_csv("my_file.csv", dtype=object) +``` + +Creating the dataset is as easy as calling: +```python +tamr.datasets.create_from_dataframe(df, 'primaryKey', 'my_new_dataset') +``` + +For an already-existing dataframe, the columns can be converted to strings using: +```python +df = df.astype(str) +``` +Note, however, that converting this way will cause any `NaN` or `None` values to become strings like `'nan'` +that will persist into the created Tamr dataset. 
+ +### Changing Values + +#### Making Changes: In Memory +When making changes to a dataset that was loaded as a dataframe, changes can be pushed back to Tamr using the +`dataset.upsert_from_dataframe()` method as follows: + +```python +df = pd.DataFrame.from_records(my_dataset.records()) +df['column'] = 'new_value' +my_dataset.upsert_from_dataframe(df, primary_key_name='primary_key') +``` + +#### Making Changes: Streaming +For larger datasets it might be better to stream the data and apply changes while iterating through the dataset. +This way the full dataset does not need to be loaded into memory. +```python +for record in dataset.records(): + single_record_df = pd.DataFrame.from_records(record) + single_record_df['column_to_change'] = 'new_value' + dataset.upsert_from_dataframe(single_record_df, primary_key_name='primary_key') +``` +### Adding Attributes +When making changes to dataframes, new dataframe columns are not automatically created as attributes when upserting +records to Tamr. In order for these changes to be recorded, these attributes first need to be created. + +One way of creating these for source datasets automatically would be as follows: + +```python +def add_missing_attributes(dataset, df): + """ + Detects any attributes in the dataframe that aren't in the dataset and attempts to add them (as strings). + """ + existing_attributes = [att.name for att in dataset.attributes] + new_attributes = [att for att in df.columns.to_list() if att not in existing_attributes] + + if not new_attributes: + return + + for new_attribute in new_attributes: + attr_spec = {"name": new_attribute, + "type": {"baseType": "ARRAY", "innerType": {"baseType": "STRING"}}, + } + dataset.attributes.create(attr_spec) + +add_missing_attributes(my_dataset, df) +``` + +## Troubleshooting + +When running into errors upon loading `dataset.records()` into a pandas dataframe, it is good to consider the following +steps. 
To extract a single record, the following code can be used to provide a minimal reproducible example: +```python +record = next(dataset.records()) +print(record) +``` + +### Parsing +Tamr allows for more variety in attribute names and contents than pandas does. In most cases pandas can load data +correctly, but it is possible to modify the parsing using a custom generator as shown above. An example below changes +an attribute name, and extracts only the first element: +```python +def custom_parser(dataset): + for record in dataset.records(): + record['pandas_column_name'] = record.pop('dataset_attribute_name') + record['first_element_of_column'] = record['multi_value_column'][0] + yield record + +df = pd.DataFrame.from_records(custom_parser(dataset)) +``` diff --git a/docs/user-guide/quickstart.md b/docs/user-guide/quickstart.md new file mode 100644 index 00000000..22962d75 --- /dev/null +++ b/docs/user-guide/quickstart.md @@ -0,0 +1,80 @@ +# Quickstart +## Client configuration + +Start by importing the Python Client and authentication provider: +```python +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +``` +Next, create an authentication provider and use that to create an authenticated client: + +```python +import os + +username = os.environ['TAMR_USERNAME'] +password = os.environ['TAMR_PASSWORD'] + +auth = UsernamePasswordAuth(username, password) +tamr = Client(auth) +``` + +``` warning:: For security, it's best to read your credentials in from environment variables or secure files instead of hardcoding them directly into your code. + + For more, see `User Guide > Secure Credentials `_. +``` +By default, the client tries to find the Tamr instance on `localhost`. To point to a different host, set the host argument when instantiating the Client. 
+ +For example, to connect to `10.20.0.1`: +```python +tamr = Client(auth, host='10.20.0.1') +``` + +## Top-level collections +The Python Client exposes 2 top-level collections: Projects and Datasets. + +You can access these collections through the client and loop over their members +with simple `for`-loops. + +E.g.: +```python +for project in tamr.projects: + print(project.name) + +for dataset in tamr.datasets: + print(dataset.name) +``` + +## Fetch a specific resource +If you know the identifier for a specific resource, you can ask for it directly via the `by_resource_id` methods exposed by collections. + +E.g. To fetch the project with ID `'1'`: +```python +project = tamr.projects.by_resource_id('1') +``` +Similarly, if you know the name of a specific resource, you can ask for it directly via the `by_name` methods exposed by collections. + +E.g. To fetch the project with name `'Number 1'`: +```python +project = tamr.projects.by_name('Number 1') +``` +``` note:: + If working with projects across Tamr instances for migrations or promotions, use external IDs (via ``by_external_id``) instead of name (via ``by_name``). +``` + +## Resource relationships +Related resources (like a project and its unified dataset) can be accessed through specific methods. + +E.g. To access the Unified Dataset for a particular project: +```python +ud = project.unified_dataset() +``` + +## Kick-off Tamr Operations +Some methods on Model objects can kick-off long-running Tamr operations. + +Here, kick-off a "Unified Dataset refresh" operation: +```python +operation = project.unified_dataset().refresh() +assert operation.succeeded() +``` +By default, the API Clients expose a synchronous interface for Tamr operations. 
diff --git a/docs/user-guide/quickstart.rst b/docs/user-guide/quickstart.rst deleted file mode 100644 index 10eebacd..00000000 --- a/docs/user-guide/quickstart.rst +++ /dev/null @@ -1,81 +0,0 @@ -Quickstart -========== - -Client configuration --------------------- - -Start by importing the Python Client and authentication provider:: - - from tamr_unify_client import Client - from tamr_unify_client.auth import UsernamePasswordAuth - -Next, create an authentication provider and use that to create an authenticated client:: - - import os - - username = os.environ['UNIFY_USERNAME'] - password = os.environ['UNIFY_PASSWORD'] - - auth = UsernamePasswordAuth(username, password) - unify = Client(auth) - -.. warning:: - For security, it's best to read your credentials in from environment variables - or secure files instead of hardcoding them directly into your code. - - For more, see `User Guide > Secure Credentials `_ . - -By default, the client tries to find the Unify instance on ``localhost``. -To point to a different host, set the host argument when instantiating the Client. - -For example, to connect to ``10.20.0.1``:: - - unify = Client(auth, host='10.20.0.1') - -Top-level collections ---------------------- - -The Python Client exposes 2 top-level collections: Projects and Datasets. - -You can access these collections through the client and loop over their members -with simple ``for``-loops. - -E.g.:: - - for project in unify.projects: - print(project.name) - - for dataset in unify.datasets: - print(dataset.name) - -Fetch a specific resource -------------------------- - -If you know the identifier for a specific resource, you can ask for it directly -via the ``by_resource_id`` methods exposed by collections. - -E.g. To fetch the project with ID ``'1'``:: - - project = unify.projects.by_resoure_id('1') - -Resource relationships ----------------------- - -Related resources (like a project and its unified dataset) can be accessed -through specific methods. - -E.g. 
To access the Unified Dataset for a particular project:: - - ud = project.unified_dataset() - -Kick-off Unify Operations -------------------------- - -Some methods on Model objects can kick-off long-running Unify operations. - -Here, kick-off a "Unified Dataset refresh" operation:: - - operation = project.unified_dataset().refresh() - assert op.succeeded() - -By default, the API Clients expose a synchronous interface for Unify operations. diff --git a/docs/user-guide/secure-credentials.md b/docs/user-guide/secure-credentials.md new file mode 100644 index 00000000..72578d21 --- /dev/null +++ b/docs/user-guide/secure-credentials.md @@ -0,0 +1,52 @@ +# Secure Credentials +This section discusses ways to pass credentials securely to +`UsernamePasswordAuth`. Specifically, you **should not** hardcode your password(s) in your source code. Instead, you should use environment variables or secure files to store your credentials and simple Python code to read your credentials. + +## Environment variables +You can use `os.environ` to read in your credentials from environment variables: +```python +# my_script.py +import os + +from tamr_unify_client.auth import UsernamePasswordAuth + +username = os.environ['TAMR_USERNAME'] # replace with your username environment variable name +password = os.environ['TAMR_PASSWORD'] # replace with your password environment variable name + +auth = UsernamePasswordAuth(username, password) +``` + +You can pass in the environment variables from the terminal by including them before your command: +```bash +TAMR_USERNAME="my Tamr username" TAMR_PASSWORD="my Tamr password" python my_script.py +``` + +You can also create an `.sh` file to store your environment variables and +simply `source` that file before running your script. 
+ +## Config files +You can also store your credentials in a secure credentials file: +```yaml +# credentials.yaml +--- +username: "my tamr username" +password: "my tamr password" +``` + +Then `pip install pyyaml` and read the credentials in your Python code: +```python +# my_script.py +from tamr_unify_client.auth import UsernamePasswordAuth +import yaml + +with open("path/to/credentials.yaml") as f: # replace with your credentials.yaml path + creds = yaml.safe_load(f) + +auth = UsernamePasswordAuth(creds['username'], creds['password']) +``` + +As in this example, we recommend you use YAML as your format since YAML has support for comments and is more human-readable than JSON. + +``` important:: + You **should not** check these credentials files into your version control system (e.g. ``git``). Do not share this file with anyone who should not have access to the password stored in it. +``` diff --git a/docs/user-guide/secure-credentials.rst b/docs/user-guide/secure-credentials.rst deleted file mode 100644 index 8e01295e..00000000 --- a/docs/user-guide/secure-credentials.rst +++ /dev/null @@ -1,61 +0,0 @@ -Secure Credentials -================== - -This section discusses ways to pass credentials securely to -:class:`~tamr_unify_client.auth.UsernamePasswordAuth`. Specifically, you **should -not** hardcode your password(s) in your source code. Instead, you should use -environment variables or secure files to store your credentials and -simple Python code to read your credentials. 
- -Environment variables ---------------------- - -You can use ``os.environ`` to read in your credentials from environment variables:: - - # my_script.py - import os - - from tamr_unify_client.auth import UsernamePasswordAuth - - username = os.environ['UNIFY_USERNAME'] # replace with your username environment variable name - password = os.environ['UNIFY_PASSWORD'] # replace with your password environment variable name - - auth = UsernamePasswordAuth(username, password) - - -You can pass in the environment variables from the terminal by including them -before your command:: - - UNIFY_USERNAME="my Unify username" UNIFY_PASSWORD="my Unify password" python my_script.py - -You can also create an ``.sh`` file to store your environment variables and -simply ``source`` that file before running your script. - - -Config files ------------- - -You can also store your credentials in a secure credentials file:: - - # credentials.yaml - --- - username: "my unify username" - password: "my unify password" - -Then ``pip install pyyaml`` read the credentials in your Python code:: - - # my_script.py - from tamr_unify_client.auth import UsernamePasswordAuth - import yaml - - creds = yaml.load("path/to/credentials.yaml") # replace with your credentials.yaml path - - auth = UsernamePasswordAuth(creds.username, creds.password) - -As in this example, we recommend you use YAML as your format since YAML has -support for comments and is more human-readable than JSON. - -.. important:: - You **should not** check these credentials files into your version - control system (e.g. ``git``). Do not share this file with anyone who should - not have access to the password stored in it. 
diff --git a/docs/user-guide/spec.md b/docs/user-guide/spec.md new file mode 100644 index 00000000..96482787 --- /dev/null +++ b/docs/user-guide/spec.md @@ -0,0 +1,82 @@ +# Creating and Modifying Resources +## Creating resources +Resources, such as projects, datasets, and attribute configurations, can be created through their respective collections. Each `create` function takes in a dictionary that conforms to the +[Tamr Public Docs](https://docs.tamr.com/reference) for creating that resource type: +```python +spec = { + "name": "project", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "project_unified_dataset" +} +project = tamr.projects.create(spec) +``` + +## Using specs +These dictionaries can also be created using spec classes. + +Each `Resource` has a corresponding `ResourceSpec` which can be used to build an instance of that resource by specifying the value for each property. + +The spec can then be converted to a dictionary that can be passed to `create`. + +For instance, to create a project: +```python +spec = ( + ProjectSpec.new() + .with_name("Project") + .with_type("DEDUP") + .with_description("Mastering Project") + .with_unified_dataset_name("Project_unified_dataset") + .with_external_id("tamrProject1") +) +project = tamr.projects.create(spec.to_dict()) +``` + +Calling `with_*` on a spec creates a new spec with the same properties besides the modified one. The original spec is unaltered, so it could be used multiple times: +```python +base_spec = ( + ProjectSpec.new() + .with_type("DEDUP") + .with_description("Mastering Project") +) + +specs = [] +for name in project_names: + spec = ( + base_spec.with_name(name) + .with_unified_dataset_name(name + "_unified_dataset") + ) + specs.append(spec) + +projects = [tamr.projects.create(spec.to_dict()) for spec in specs] +``` + +## Creating a dataset +Datasets can be created as described above, but the dataset's schema and records must then be handled separately. 
+ +To combine all of these steps into one, `DatasetCollection` has a convenience function `create_from_dataframe` that takes a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html). +This makes it easy to create a Tamr dataset from a CSV: +```python +import pandas as pd + +df = pd.read_csv("my_data.csv", dtype=str) # string is the recommended data type +dataset = tamr.datasets.create_from_dataframe(df, primary_key_name="primary key name", dataset_name="My Data") +``` + +This will create a dataset called "My Data" with the specified primary key, an attribute +for each column of the `DataFrame`, and the `DataFrame`'s rows as records. + +## Modifying a resource +Certain resources can also be modified using specs. + +After getting a spec corresponding to a resource and modifying some properties, +the updated resource can be committed to Tamr with the `put` function: +```python +updated_dataset = ( + dataset.spec() + .with_description("Modified description") + .put() +) +``` +Each spec class has many properties that can be changed, but refer to the +[Public Docs](https://docs.tamr.com/reference) for which properties will actually be updated in Tamr. If an immutable property is changed in the update request, the new value will simply be ignored. 
diff --git a/docs/user-guide/workflows.md b/docs/user-guide/workflows.md new file mode 100644 index 00000000..517dd12e --- /dev/null +++ b/docs/user-guide/workflows.md @@ -0,0 +1,66 @@ +# Workflows +## Continuous Categorization +```python +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +import os + +username = os.environ['TAMR_USERNAME'] +password = os.environ['TAMR_PASSWORD'] +auth = UsernamePasswordAuth(username, password) + +host = 'localhost' # replace with your host +tamr = Client(auth, host=host) + +project_id = "1" # replace with your project ID +project = tamr.projects.by_resource_id(project_id) +project = project.as_categorization() + +unified_dataset = project.unified_dataset() +op = unified_dataset.refresh() +assert op.succeeded() + +model = project.model() +op = model.train() +assert op.succeeded() + +op = model.predict() +assert op.succeeded() +``` +## Continuous Mastering +```python +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +import os + +username = os.environ['TAMR_USERNAME'] +password = os.environ['TAMR_PASSWORD'] +auth = UsernamePasswordAuth(username, password) + +host = 'localhost' # replace with your host +tamr = Client(auth, host=host) + +project_id = "1" # replace with your project ID +project = tamr.projects.by_resource_id(project_id) +project = project.as_mastering() + +unified_dataset = project.unified_dataset() +op = unified_dataset.refresh() +assert op.succeeded() + +op = project.pairs().refresh() +assert op.succeeded() + +model = project.pair_matching_model() +op = model.train() +assert op.succeeded() + +op = model.predict() +assert op.succeeded() + +op = project.record_clusters().refresh() +assert op.succeeded() + +op = project.published_clusters().refresh() +assert op.succeeded() +``` diff --git a/docs/user-guide/workflows.rst b/docs/user-guide/workflows.rst deleted file mode 100644 index de5f79bc..00000000 --- a/docs/user-guide/workflows.rst +++ /dev/null @@ 
-1,73 +0,0 @@ -Workflows -========= - -Continuous Categorization -------------------------- - -:: - - from tamr_unify_client import Client - from tamr_unify_client.auth import UsernamePasswordAuth - import os - - username = os.environ['UNIFY_USERNAME'] - password = os.environ['UNIFY_PASSWORD'] - auth = UsernamePasswordAuth(username, password) - - host = 'localhost' # replace with your host - unify = Client(auth) - - project_id = "1" # replace with your project ID - project = unify.projects.by_resource_id(project_id) - project = project.as_categorization() - - unified_dataset = project.unified_dataset() - op = unified_dataset.refresh() - assert op.succeeded() - - model = project.model() - op = model.train() - assert op.succeeded() - - op = model.predict() - assert op.succeeded() - -Continuous Mastering --------------------- - -:: - - from tamr_unify_client import Client - from tamr_unify_client.auth import UsernamePasswordAuth - import os - - username = os.environ['UNIFY_USERNAME'] - password = os.environ['UNIFY_PASSWORD'] - auth = UsernamePasswordAuth(username, password) - - host = 'localhost' # replace with your host - unify = Client(auth) - - project_id = "1" # replace with your project ID - project = unify.projects.by_resource_id(project_id) - project = project.as_mastering() - - unified_dataset = project.unified_dataset() - op = unified_dataset.refresh() - assert op.succeeded() - - op = project.pairs().refresh() - assert op.succeeded() - - model = project.pair_matching_model() - op = model.train() - assert op.succeeded() - - op = model.predict() - assert op.succeeded() - - op = project.record_clusters().refresh() - assert op.succeeded() - - op = project.published_clusters().refresh() - assert op.succeeded() diff --git a/examples/continuous_mastering.py b/examples/continuous_mastering.py new file mode 100644 index 00000000..b6b42597 --- /dev/null +++ b/examples/continuous_mastering.py @@ -0,0 +1,41 @@ +from getpass import getpass + +import tamr_client as tc + 
+username = input("Tamr Username:") +password = getpass("Tamr Password:") + +auth = tc.UsernamePasswordAuth(username, password) +session = tc.session.from_auth(auth) + +protocol = "http" +host = "localhost" +port = 9100 + +instance = tc.Instance(protocol=protocol, host=host, port=port) + +project = tc.project.by_name(session, instance, "MasteringTutorial") + +if not isinstance(project, tc.MasteringProject): + raise RuntimeError(f"{project.name} is not a mastering project.") + +operation_1 = tc.mastering.update_unified_dataset(session, project) +tc.operation.check(session, operation_1) + +operation_2 = tc.mastering.generate_pairs(session, project) +tc.operation.check(session, operation_2) + +operation_3 = tc.mastering.apply_feedback(session, project) +tc.operation.check(session, operation_3) + +operation_4 = tc.mastering.update_pair_results(session, project) +tc.operation.check(session, operation_4) + +operation_5 = tc.mastering.update_high_impact_pairs(session, project) +tc.operation.check(session, operation_5) + +operation_6 = tc.mastering.update_cluster_results(session, project) +tc.operation.check(session, operation_6) + +operation_7 = tc.mastering.publish_clusters(session, project) +tc.operation.check(session, operation_7) diff --git a/examples/get_tamr_version.py b/examples/get_tamr_version.py new file mode 100644 index 00000000..e5c6c1b8 --- /dev/null +++ b/examples/get_tamr_version.py @@ -0,0 +1,17 @@ +from getpass import getpass + +import tamr_client as tc + +username = input("Tamr Username:") +password = getpass("Tamr Password:") + +auth = tc.UsernamePasswordAuth(username, password) +session = tc.session.from_auth(auth) + +protocol = "http" +host = "localhost" +port = 9100 + +instance = tc.Instance(protocol=protocol, host=host, port=port) + +print(tc.instance.version(session, instance)) diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..49382092 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,9 @@ +[mypy] +mypy_path = ./stubs +check_untyped_defs 
= True +ignore_errors = False +namespace_packages = True +strict_optional = True +warn_unused_ignores = True +warn_redundant_casts = True +warn_unused_configs = True diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 00000000..3be648bc --- /dev/null +++ b/noxfile.py @@ -0,0 +1,65 @@ +from pathlib import Path + +import nox + +nox.options.reuse_existing_virtualenvs = True + + +def _find_packages(path: Path): + for pkg in path.iterdir(): + if pkg.is_dir() and len(list(pkg.glob("**/*.py"))) >= 1: + yield pkg + + +@nox.session(python="3.6") +def lint(session): + session.run("poetry", "install", external=True) + session.run("flake8", "--extend-exclude=.nox", ".") + + +@nox.session(python="3.6") +def format(session): + session.run("poetry", "install", external=True) + if "--fix" in session.posargs: + session.run("black", ".") + elif "--diff" in session.posargs: + session.run("black", ".", "--diff") + else: + session.run("black", ".", "--check") + + +@nox.session(python="3.6") +def typecheck(session): + session.run("poetry", "install", external=True) + repo = Path(".") + + tc = repo / "tamr_client" + session.run("mypy", "--package", str(tc)) + + tc_examples = [str(x) for x in (repo / "examples").glob("**/*.py")] + session.run("mypy", *tc_examples) + + tc_tests = [str(x) for x in (repo / "tests" / "tamr_client").glob("**/*.py")] + session.run("mypy", *tc_tests) + + +@nox.session(python=["3.6", "3.7", "3.8"]) +def test(session): + session.run("poetry", "install", external=True) + session.run("pytest", *session.posargs, env={"TAMR_CLIENT_BETA": "1"}) + + +@nox.session(python="3.6") +def docs(session): + # RTD uses pip for managing dependencies, so we mirror that approach + session.install(".") + session.install("-r", "docs/requirements.txt") + session.run( + "sphinx-build", + "-b", + "html", + "docs", + "docs/_build", + "-W", + env={"TAMR_CLIENT_BETA": "1", "TAMR_CLIENT_DOCS": "1"}, + ) diff --git a/poetry.lock b/poetry.lock index 1b5ac478..f9b8bcf0 100644 --- 
a/poetry.lock +++ b/poetry.lock @@ -1,22 +1,31 @@ [[package]] category = "dev" -description = "A configurable sidebar-enabled Sphinx theme" -name = "alabaster" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +name = "appdirs" optional = false python-versions = "*" -version = "0.7.12" +version = "1.4.3" [[package]] category = "dev" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -name = "appdirs" +description = "Bash tab completion for argparse" +name = "argcomplete" optional = false python-versions = "*" -version = "1.4.3" +version = "1.11.1" + +[package.dependencies] +[package.dependencies.importlib-metadata] +python = ">=3.6,<3.7" +version = ">=0.23,<2" + +[package.extras] +test = ["coverage", "flake8", "pexpect", "wheel"] [[package]] category = "dev" description = "Atomic file writes." +marker = "sys_platform == \"win32\"" name = "atomicwrites" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" @@ -30,16 +39,10 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" version = "19.1.0" -[[package]] -category = "dev" -description = "Internationalization utilities" -name = "babel" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "2.7.0" - -[package.dependencies] -pytz = ">=2015.7" +[package.extras] +dev = ["coverage", "hypothesis", "pympler", "pytest", "six", "zope.interface", "sphinx", "pre-commit"] +docs = ["sphinx", "zope.interface"] +tests = ["coverage", "hypothesis", "pympler", "pytest", "six", "zope.interface"] [[package]] category = "dev" @@ -47,13 +50,19 @@ description = "The uncompromising code formatter." 
name = "black" optional = false python-versions = ">=3.6" -version = "19.3b0" +version = "19.10b0" [package.dependencies] appdirs = "*" attrs = ">=18.1.0" click = ">=6.5" +pathspec = ">=0.6,<1" +regex = "*" toml = ">=0.9.4" +typed-ast = ">=1.4.0" + +[package.extras] +d = ["aiohttp (>=3.3.2)", "aiohttp-cors"] [[package]] category = "main" @@ -76,8 +85,8 @@ category = "dev" description = "Composable command line interface toolkit" name = "click" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "7.0" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "7.1.2" [[package]] category = "dev" @@ -90,33 +99,55 @@ version = "0.4.1" [[package]] category = "dev" -description = "Docutils -- Python Documentation Utilities" -name = "docutils" +description = "Log formatting with colors!" +name = "colorlog" optional = false python-versions = "*" -version = "0.14" +version = "4.1.0" + +[package.dependencies] +colorama = "*" + +[[package]] +category = "main" +description = "A backport of the dataclasses module for Python 3.6" +name = "dataclasses" +optional = false +python-versions = "*" +version = "0.6" [[package]] category = "dev" -description = "Discover and load entry points from installed packages." -name = "entrypoints" +description = "Distribution utilities" +name = "distlib" optional = false -python-versions = ">=2.7" -version = "0.3" +python-versions = "*" +version = "0.3.0" + +[[package]] +category = "dev" +description = "A platform independent file lock." 
+name = "filelock" +optional = false +python-versions = "*" +version = "3.0.12" [[package]] category = "dev" -description = "the modular source code checker: pep8, pyflakes and co" +description = "the modular source code checker: pep8 pyflakes and co" name = "flake8" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "3.7.7" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +version = "3.8.2" [package.dependencies] -entrypoints = ">=0.3.0,<0.4.0" mccabe = ">=0.6.0,<0.7.0" -pycodestyle = ">=2.5.0,<2.6.0" -pyflakes = ">=2.1.0,<2.2.0" +pycodestyle = ">=2.6.0a1,<2.7.0" +pyflakes = ">=2.2.0,<2.3.0" + +[package.dependencies.importlib-metadata] +python = "<3.8" +version = "*" [[package]] category = "dev" @@ -138,43 +169,42 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" version = "2.8" -[[package]] -category = "dev" -description = "Getting image size from png/jpeg/jpeg2000/gif file" -name = "imagesize" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "1.1.0" - [[package]] category = "dev" description = "Read metadata from Python packages" +marker = "python_version < \"3.8\"" name = "importlib-metadata" optional = false -python-versions = ">=2.7,!=3.0,!=3.1,!=3.2,!=3.3" -version = "0.17" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +version = "1.3.0" [package.dependencies] zipp = ">=0.5" +[package.extras] +docs = ["sphinx", "rst.linker"] +testing = ["packaging", "importlib-resources"] + [[package]] category = "dev" -description = "A small but fast and easy to use stand-alone template engine written in pure python." 
-name = "jinja2" +description = "Read resources from Python packages" +marker = "python_version < \"3.7\"" +name = "importlib-resources" optional = false -python-versions = "*" -version = "2.10.1" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +version = "1.5.0" [package.dependencies] -MarkupSafe = ">=0.23" +[package.dependencies.importlib-metadata] +python = "<3.8" +version = "*" -[[package]] -category = "dev" -description = "Safely add untrusted strings to HTML/XML markup." -name = "markupsafe" -optional = false -python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*" -version = "1.1.1" +[package.dependencies.zipp] +python = "<3.8" +version = ">=0.4" + +[package.extras] +docs = ["sphinx", "rst.linker", "jaraco.packaging"] [[package]] category = "dev" @@ -187,11 +217,63 @@ version = "0.6.1" [[package]] category = "dev" description = "More routines for operating on iterables, beyond itertools" -marker = "python_version > \"2.7\"" name = "more-itertools" optional = false -python-versions = ">=3.4" -version = "7.0.0" +python-versions = ">=3.5" +version = "8.0.2" + +[[package]] +category = "dev" +description = "Optional static typing for Python" +name = "mypy" +optional = false +python-versions = ">=3.5" +version = "0.782" + +[package.dependencies] +mypy-extensions = ">=0.4.3,<0.5.0" +typed-ast = ">=1.4.0,<1.5.0" +typing-extensions = ">=3.7.4" + +[package.extras] +dmypy = ["psutil (>=4.0)"] + +[[package]] +category = "dev" +description = "Experimental type system extensions for programs checked with the mypy typechecker." +name = "mypy-extensions" +optional = false +python-versions = "*" +version = "0.4.3" + +[[package]] +category = "dev" +description = "Flexible test automation." 
+name = "nox" +optional = false +python-versions = ">=3.5" +version = "2020.5.24" + +[package.dependencies] +argcomplete = ">=1.9.4,<2.0" +colorlog = ">=2.6.1,<5.0.0" +py = ">=1.4.0,<2.0.0" +virtualenv = ">=14.0.0" + +[package.dependencies.importlib-metadata] +python = "<3.8" +version = "*" + +[package.extras] +tox_to_nox = ["jinja2", "tox"] + +[[package]] +category = "dev" +description = "NumPy is the fundamental package for array computing with Python." +name = "numpy" +optional = false +python-versions = ">=3.6" +version = "1.19.0" [[package]] category = "dev" @@ -205,16 +287,45 @@ version = "19.0" pyparsing = ">=2.0.2" six = "*" +[[package]] +category = "dev" +description = "Powerful data structures for data analysis, time series, and statistics" +name = "pandas" +optional = false +python-versions = ">=3.6.1" +version = "1.0.5" + +[package.dependencies] +numpy = ">=1.13.3" +python-dateutil = ">=2.6.1" +pytz = ">=2017.2" + +[package.extras] +test = ["pytest (>=4.0.2)", "pytest-xdist", "hypothesis (>=3.58)"] + +[[package]] +category = "dev" +description = "Utility library for gitignore style pattern matching of file paths." 
+name = "pathspec" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "0.8.0" + [[package]] category = "dev" description = "plugin and hook calling mechanisms for python" name = "pluggy" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "0.12.0" +version = "0.13.1" [package.dependencies] -importlib-metadata = ">=0.12" +[package.dependencies.importlib-metadata] +python = "<3.8" +version = ">=0.12" + +[package.extras] +dev = ["pre-commit", "tox"] [[package]] category = "dev" @@ -230,7 +341,7 @@ description = "Python style guide checker" name = "pycodestyle" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "2.5.0" +version = "2.6.0" [[package]] category = "dev" @@ -238,15 +349,7 @@ description = "passive checker of Python programs" name = "pyflakes" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "2.1.1" - -[[package]] -category = "dev" -description = "Pygments is a syntax highlighting package written in Python." 
-name = "pygments" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -version = "2.4.2" +version = "2.2.0" [[package]] category = "dev" @@ -261,23 +364,37 @@ category = "dev" description = "pytest: simple powerful testing with Python" name = "pytest" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "4.6.2" +python-versions = ">=3.5" +version = "5.3.5" [package.dependencies] atomicwrites = ">=1.0" attrs = ">=17.4.0" colorama = "*" -importlib-metadata = ">=0.12" +more-itertools = ">=4.0.0" packaging = "*" pluggy = ">=0.12,<1.0" py = ">=1.5.0" -six = ">=1.10.0" wcwidth = "*" -[package.dependencies.more-itertools] -python = ">=2.8" -version = ">=4.0.0" +[package.dependencies.importlib-metadata] +python = "<3.8" +version = ">=0.12" + +[package.extras] +checkqa-mypy = ["mypy (v0.761)"] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + +[[package]] +category = "dev" +description = "Extensions to the standard Python datetime module" +name = "python-dateutil" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +version = "2.8.1" + +[package.dependencies] +six = ">=1.5" [[package]] category = "dev" @@ -287,6 +404,14 @@ optional = false python-versions = "*" version = "2019.1" +[[package]] +category = "dev" +description = "Alternative regular expression module, to replace re." +name = "regex" +optional = false +python-versions = "*" +version = "2020.6.8" + [[package]] category = "main" description = "Python HTTP for Humans." @@ -301,6 +426,10 @@ chardet = ">=3.0.2,<3.1.0" idna = ">=2.5,<2.9" urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26" +[package.extras] +security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"] +socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"] + [[package]] category = "dev" description = "A utility library for mocking out the `requests` Python library." 
@@ -313,6 +442,9 @@ version = "0.10.6" requests = ">=2.0" six = "*" +[package.extras] +tests = ["pytest", "coverage (>=3.7.1,<5.0.0)", "pytest-cov", "pytest-localserver", "flake8"] + [[package]] category = "dev" description = "Python 2 and 3 compatibility utilities" @@ -323,113 +455,66 @@ version = "1.12.0" [[package]] category = "dev" -description = "This package provides 16 stemmer algorithms (15 + Poerter English stemmer) generated from Snowball algorithms." -name = "snowballstemmer" +description = "Python Library for Tom's Obvious, Minimal Language" +name = "toml" optional = false python-versions = "*" -version = "1.2.1" - -[[package]] -category = "dev" -description = "Python documentation generator" -name = "sphinx" -optional = false -python-versions = ">=3.5" -version = "2.1.0" - -[package.dependencies] -Jinja2 = ">=2.3" -Pygments = ">=2.0" -alabaster = ">=0.7,<0.8" -babel = ">=1.3,<2.0 || >2.0" -colorama = ">=0.3.5" -docutils = ">=0.12" -imagesize = "*" -packaging = "*" -requests = ">=2.5.0" -setuptools = "*" -snowballstemmer = ">=1.1" -sphinxcontrib-applehelp = "*" -sphinxcontrib-devhelp = "*" -sphinxcontrib-htmlhelp = "*" -sphinxcontrib-jsmath = "*" -sphinxcontrib-qthelp = "*" -sphinxcontrib-serializinghtml = "*" +version = "0.10.1" [[package]] category = "dev" -description = "Read the Docs theme for Sphinx" -name = "sphinx-rtd-theme" +description = "a fork of Python 2 and 3 ast modules with type comment support" +name = "typed-ast" optional = false python-versions = "*" -version = "0.4.3" - -[package.dependencies] -sphinx = "*" +version = "1.4.1" [[package]] category = "dev" -description = "" -name = "sphinxcontrib-applehelp" +description = "Backported and Experimental Type Hints for Python 3.5+" +name = "typing-extensions" optional = false python-versions = "*" -version = "1.0.1" +version = "3.7.4.2" [[package]] -category = "dev" -description = "" -name = "sphinxcontrib-devhelp" +category = "main" +description = "HTTP library with thread-safe connection 
pooling, file post, and more." +name = "urllib3" optional = false -python-versions = "*" -version = "1.0.1" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" +version = "1.25.3" -[[package]] -category = "dev" -description = "" -name = "sphinxcontrib-htmlhelp" -optional = false -python-versions = "*" -version = "1.0.2" +[package.extras] +brotli = ["brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"] [[package]] category = "dev" -description = "A sphinx extension which renders display math in HTML via JavaScript" -name = "sphinxcontrib-jsmath" +description = "Virtual Python Environment builder" +name = "virtualenv" optional = false -python-versions = ">=3.5" -version = "1.0.1" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +version = "20.0.21" -[[package]] -category = "dev" -description = "" -name = "sphinxcontrib-qthelp" -optional = false -python-versions = "*" -version = "1.0.2" +[package.dependencies] +appdirs = ">=1.4.3,<2" +distlib = ">=0.3.0,<1" +filelock = ">=3.0.0,<4" +six = ">=1.9.0,<2" -[[package]] -category = "dev" -description = "" -name = "sphinxcontrib-serializinghtml" -optional = false -python-versions = "*" -version = "1.1.3" +[package.dependencies.importlib-metadata] +python = "<3.8" +version = ">=0.12,<2" -[[package]] -category = "dev" -description = "Python Library for Tom's Obvious, Minimal Language" -name = "toml" -optional = false -python-versions = "*" -version = "0.10.0" +[package.dependencies.importlib-resources] +python = "<3.7" +version = ">=1.0,<2" -[[package]] -category = "main" -description = "HTTP library with thread-safe connection pooling, file post, and more." 
-name = "urllib3" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" -version = "1.25.3" +[package.extras] +docs = ["sphinx (>=3)", "sphinx-argparse (>=0.2.5)", "sphinx-rtd-theme (>=0.4.3)", "towncrier (>=19.9.0rc1)", "proselint (>=0.10.2)"] +testing = ["pytest (>=4)", "coverage (>=5)", "coverage-enable-subprocess (>=1)", "pytest-xdist (>=1.31.0)", "pytest-mock (>=2)", "pytest-env (>=0.6.2)", "pytest-randomly (>=1)", "pytest-timeout", "packaging (>=20.0)", "xonsh (>=0.9.16)"] [[package]] category = "dev" @@ -442,59 +527,293 @@ version = "0.1.7" [[package]] category = "dev" description = "Backport of pathlib-compatible object wrapper for zip files" +marker = "python_version < \"3.8\"" name = "zipp" optional = false python-versions = ">=2.7" -version = "0.5.1" +version = "0.6.0" + +[package.dependencies] +more-itertools = "*" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] +testing = ["pathlib2", "contextlib2", "unittest2"] [metadata] -content-hash = "86be94bbec36a35672dc8144eb0963abbe6824caaabf777eb9d5eece92b047ee" -python-versions = "^3.6" - -[metadata.hashes] -alabaster = ["446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", "a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"] -appdirs = ["9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92", "d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e"] -atomicwrites = ["03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", "75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"] -attrs = ["69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", "f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"] -babel = ["af92e6106cb7c55286b25b38ad7695f8b4efb36a90ba483d7f7a6628c46158ab", "e86135ae101e31e2c8ec20a4e0c5220f4eed12487d5cf3f78be7e98d3a57fc28"] -black = ["09a9dcb7c46ed496a9850b76e4e825d6049ecd38b611f1224857a79bd985a8cf", 
"68950ffd4d9169716bcb8719a56c07a2f4485354fec061cdd5910aa07369731c"] -certifi = ["59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5", "b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae"] -chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"] -click = ["2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", "5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"] -colorama = ["05eed71e2e327246ad6b38c540c4a3117230b19679b875190486ddd2d721422d", "f8ac84de7840f5b9c4e3347b3c1eaa50f7e49c2b07596221daec5edaabbd7c48"] -docutils = ["02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", "51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274", "7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6"] -entrypoints = ["589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", "c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"] -flake8 = ["859996073f341f2670741b51ec1e67a01da142831aa1fdc6242dbf88dffbe661", "a796a115208f5c03b18f332f7c11729812c8c3ded6c46319c59b53efd3819da8"] -flake8-import-order = ["90a80e46886259b9c396b578d75c749801a41ee969a235e163cfe1be7afd2543", "a28dc39545ea4606c1ac3c24e9d05c849c6e5444a50fb7e9cdd430fc94de6e92"] -idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"] -imagesize = ["3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8", "f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5"] -importlib-metadata = ["a9f185022cfa69e9ca5f7eabfd5a58b689894cb78a11e3c8c89398a8ccbb8e7f", "df1403cd3aebeb2b1dcd3515ca062eecb5bd3ea7611f18cba81130c68707e879"] -jinja2 = ["065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013", "14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b"] -markupsafe = 
["00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", "09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", "09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", "1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", "24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", "29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", "43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", "46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", "500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", "535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", "62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", "6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", "717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", "79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", "7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", "88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", "8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", "98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", "9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", "9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", "ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", "b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", "b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", "b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", "ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", "c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", "cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", "e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"] -mccabe = ["ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", 
"dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"] -more-itertools = ["2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7", "c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a"] -packaging = ["0c98a5d0be38ed775798ece1b9727178c4469d9c3b4ada66e8e6b7849f8732af", "9e1cbf8c12b1f1ce0bb5344b8d7ecf66a6f8a6e91bcb0c84593ed6d3ab5c4ab3"] -pluggy = ["0825a152ac059776623854c1543d65a4ad408eb3d33ee114dff91e57ec6ae6fc", "b9817417e95936bf75d85d3f8767f7df6cdde751fc40aed3bb3074cbcb77757c"] -py = ["64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", "dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"] -pycodestyle = ["95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", "e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c"] -pyflakes = ["17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0", "d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2"] -pygments = ["71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127", "881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297"] -pyparsing = ["1873c03321fc118f4e9746baf201ff990ceb915f433f23b395f5580d1840cb2a", "9b6323ef4ab914af344ba97510e966d64ba91055d6b9afa6b30799340e89cc03"] -pytest = ["6032845e68a17a96e8da3088037f899b56357769a724122056265ca2ea1890ee", "bea27a646a3d74cbbcf8d3d4a06b2dfc336baf3dc2cc85cf70ad0157e73e8322"] -pytz = ["303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", "d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141"] -requests = ["11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", "9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"] -responses = ["502d9c0c8008439cfcdef7e251f507fcfdd503b56e8c0c87c3c3e3393953f790", "97193c0183d63fba8cd3a041c75464e4b09ea0aff6328800d1546598567dde0b"] -six = ["3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", "d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"] 
-snowballstemmer = ["919f26a68b2c17a7634da993d91339e288964f93c274f1343e3bbbe2096e1128", "9f3bcd3c401c3e862ec0ebe6d2c069ebc012ce142cce209c098ccb5b09136e89"] -sphinx = ["2c5becc0fd6706dc0aeb4703f9f1f8a1d1eecacf02e9ac5943cbae48b11e5e42", "7a359a91fb04054ec77d68ff97cb8728f8cc322e25f22dc94299d67e0e6a7123"] -sphinx-rtd-theme = ["00cf895504a7895ee433807c62094cf1e95f065843bf3acd17037c3e9a2becd4", "728607e34d60456d736cc7991fd236afb828b21b82f956c5ea75f94c8414040a"] -sphinxcontrib-applehelp = ["edaa0ab2b2bc74403149cb0209d6775c96de797dfd5b5e2a71981309efab3897", "fb8dee85af95e5c30c91f10e7eb3c8967308518e0f7488a2828ef7bc191d0d5d"] -sphinxcontrib-devhelp = ["6c64b077937330a9128a4da74586e8c2130262f014689b4b89e2d08ee7294a34", "9512ecb00a2b0821a146736b39f7aeb90759834b07e81e8cc23a9c70bacb9981"] -sphinxcontrib-htmlhelp = ["4670f99f8951bd78cd4ad2ab962f798f5618b17675c35c5ac3b2132a14ea8422", "d4fd39a65a625c9df86d7fa8a2d9f3cd8299a3a4b15db63b50aac9e161d8eff7"] -sphinxcontrib-jsmath = ["2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", "a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"] -sphinxcontrib-qthelp = ["513049b93031beb1f57d4daea74068a4feb77aa5630f856fcff2e50de14e9a20", "79465ce11ae5694ff165becda529a600c754f4bc459778778c7017374d4d406f"] -sphinxcontrib-serializinghtml = ["c0efb33f8052c04fd7a26c0a07f1678e8512e0faec19f4aa8f2473a8b81d5227", "db6615af393650bf1151a6cd39120c29abaf93cc60db8c48eb2dddbfdc3a9768"] -toml = ["229f81c57791a41d65e399fc06bf0848bab550a9dfd5ed66df18ce5f05e73d5c", "235682dd292d5899d361a811df37e04a8828a5b1da3115886b73cf81ebc9100e", "f1db651f9657708513243e61e6cc67d101a39bad662eaa9b5546f789338e07a3"] -urllib3 = ["b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", "dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232"] -wcwidth = ["3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", "f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"] -zipp = 
["8c1019c6aad13642199fbe458275ad6a84907634cc9f0989877ccc4a2840139d", "ca943a7e809cc12257001ccfb99e3563da9af99d52f261725e96dfe0f9275bc3"] +content-hash = "891a70e0fce285518c0ac4b762d18e79ccc02d48163ad526d9553cc0895e26e8" +python-versions = "^3.6.1" + +[metadata.files] +appdirs = [ + {file = "appdirs-1.4.3-py2.py3-none-any.whl", hash = "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e"}, + {file = "appdirs-1.4.3.tar.gz", hash = "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92"}, +] +argcomplete = [ + {file = "argcomplete-1.11.1-py2.py3-none-any.whl", hash = "sha256:890bdd1fcbb973ed73db241763e78b6d958580e588c2910b508c770a59ef37d7"}, + {file = "argcomplete-1.11.1.tar.gz", hash = "sha256:5ae7b601be17bf38a749ec06aa07fb04e7b6b5fc17906948dc1866e7facf3740"}, +] +atomicwrites = [ + {file = "atomicwrites-1.3.0-py2.py3-none-any.whl", hash = "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4"}, + {file = "atomicwrites-1.3.0.tar.gz", hash = "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"}, +] +attrs = [ + {file = "attrs-19.1.0-py2.py3-none-any.whl", hash = "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79"}, + {file = "attrs-19.1.0.tar.gz", hash = "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"}, +] +black = [ + {file = "black-19.10b0-py36-none-any.whl", hash = "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b"}, + {file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"}, +] +certifi = [ + {file = "certifi-2019.3.9-py2.py3-none-any.whl", hash = "sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5"}, + {file = "certifi-2019.3.9.tar.gz", hash = "sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae"}, +] +chardet = [ + {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = 
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"}, + {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"}, +] +click = [ + {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"}, + {file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"}, +] +colorama = [ + {file = "colorama-0.4.1-py2.py3-none-any.whl", hash = "sha256:f8ac84de7840f5b9c4e3347b3c1eaa50f7e49c2b07596221daec5edaabbd7c48"}, + {file = "colorama-0.4.1.tar.gz", hash = "sha256:05eed71e2e327246ad6b38c540c4a3117230b19679b875190486ddd2d721422d"}, +] +colorlog = [ + {file = "colorlog-4.1.0-py2.py3-none-any.whl", hash = "sha256:732c191ebbe9a353ec160d043d02c64ddef9028de8caae4cfa8bd49b6afed53e"}, + {file = "colorlog-4.1.0.tar.gz", hash = "sha256:30aaef5ab2a1873dec5da38fd6ba568fa761c9fa10b40241027fa3edea47f3d2"}, +] +dataclasses = [ + {file = "dataclasses-0.6-py3-none-any.whl", hash = "sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f"}, + {file = "dataclasses-0.6.tar.gz", hash = "sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84"}, +] +distlib = [ + {file = "distlib-0.3.0.zip", hash = "sha256:2e166e231a26b36d6dfe35a48c4464346620f8645ed0ace01ee31822b288de21"}, +] +filelock = [ + {file = "filelock-3.0.12-py3-none-any.whl", hash = "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"}, + {file = "filelock-3.0.12.tar.gz", hash = "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59"}, +] +flake8 = [ + {file = "flake8-3.8.2-py2.py3-none-any.whl", hash = "sha256:ccaa799ef9893cebe69fdfefed76865aeaefbb94cb8545617b2298786a4de9a5"}, + {file = "flake8-3.8.2.tar.gz", hash = "sha256:c69ac1668e434d37a2d2880b3ca9aafd54b3a10a3ac1ab101d22f29e29cf8634"}, +] +flake8-import-order = [ + {file = "flake8-import-order-0.18.1.tar.gz", hash = 
"sha256:a28dc39545ea4606c1ac3c24e9d05c849c6e5444a50fb7e9cdd430fc94de6e92"}, + {file = "flake8_import_order-0.18.1-py2.py3-none-any.whl", hash = "sha256:90a80e46886259b9c396b578d75c749801a41ee969a235e163cfe1be7afd2543"}, +] +idna = [ + {file = "idna-2.8-py2.py3-none-any.whl", hash = "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"}, + {file = "idna-2.8.tar.gz", hash = "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407"}, +] +importlib-metadata = [ + {file = "importlib_metadata-1.3.0-py2.py3-none-any.whl", hash = "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f"}, + {file = "importlib_metadata-1.3.0.tar.gz", hash = "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45"}, +] +importlib-resources = [ + {file = "importlib_resources-1.5.0-py2.py3-none-any.whl", hash = "sha256:85dc0b9b325ff78c8bef2e4ff42616094e16b98ebd5e3b50fe7e2f0bbcdcde49"}, + {file = "importlib_resources-1.5.0.tar.gz", hash = "sha256:6f87df66833e1942667108628ec48900e02a4ab4ad850e25fbf07cb17cf734ca"}, +] +mccabe = [ + {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, + {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, +] +more-itertools = [ + {file = "more-itertools-8.0.2.tar.gz", hash = "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d"}, + {file = "more_itertools-8.0.2-py3-none-any.whl", hash = "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564"}, +] +mypy = [ + {file = "mypy-0.782-cp35-cp35m-macosx_10_6_x86_64.whl", hash = "sha256:2c6cde8aa3426c1682d35190b59b71f661237d74b053822ea3d748e2c9578a7c"}, + {file = "mypy-0.782-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9c7a9a7ceb2871ba4bac1cf7217a7dd9ccd44c27c2950edbc6dc08530f32ad4e"}, + {file = "mypy-0.782-cp35-cp35m-win_amd64.whl", hash = 
"sha256:c05b9e4fb1d8a41d41dec8786c94f3b95d3c5f528298d769eb8e73d293abc48d"}, + {file = "mypy-0.782-cp36-cp36m-macosx_10_6_x86_64.whl", hash = "sha256:6731603dfe0ce4352c555c6284c6db0dc935b685e9ce2e4cf220abe1e14386fd"}, + {file = "mypy-0.782-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:f05644db6779387ccdb468cc47a44b4356fc2ffa9287135d05b70a98dc83b89a"}, + {file = "mypy-0.782-cp36-cp36m-win_amd64.whl", hash = "sha256:b7fbfabdbcc78c4f6fc4712544b9b0d6bf171069c6e0e3cb82440dd10ced3406"}, + {file = "mypy-0.782-cp37-cp37m-macosx_10_6_x86_64.whl", hash = "sha256:3fdda71c067d3ddfb21da4b80e2686b71e9e5c72cca65fa216d207a358827f86"}, + {file = "mypy-0.782-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:d7df6eddb6054d21ca4d3c6249cae5578cb4602951fd2b6ee2f5510ffb098707"}, + {file = "mypy-0.782-cp37-cp37m-win_amd64.whl", hash = "sha256:a4a2cbcfc4cbf45cd126f531dedda8485671545b43107ded25ce952aac6fb308"}, + {file = "mypy-0.782-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6bb93479caa6619d21d6e7160c552c1193f6952f0668cdda2f851156e85186fc"}, + {file = "mypy-0.782-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:81c7908b94239c4010e16642c9102bfc958ab14e36048fa77d0be3289dda76ea"}, + {file = "mypy-0.782-cp38-cp38-win_amd64.whl", hash = "sha256:5dd13ff1f2a97f94540fd37a49e5d255950ebcdf446fb597463a40d0df3fac8b"}, + {file = "mypy-0.782-py3-none-any.whl", hash = "sha256:e0b61738ab504e656d1fe4ff0c0601387a5489ca122d55390ade31f9ca0e252d"}, + {file = "mypy-0.782.tar.gz", hash = "sha256:eff7d4a85e9eea55afa34888dfeaccde99e7520b51f867ac28a48492c0b1130c"}, +] +mypy-extensions = [ + {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, + {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, +] +nox = [ + {file = "nox-2020.5.24-py3-none-any.whl", hash = "sha256:c4509621fead99473a1401870e680b0aadadce5c88440f0532863595176d64c1"}, + {file = 
"nox-2020.5.24.tar.gz", hash = "sha256:61a55705736a1a73efbd18d5b262a43d55a1176546e0eb28b29064cfcffe26c0"}, +] +numpy = [ + {file = "numpy-1.19.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:63d971bb211ad3ca37b2adecdd5365f40f3b741a455beecba70fd0dde8b2a4cb"}, + {file = "numpy-1.19.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:b6aaeadf1e4866ca0fdf7bb4eed25e521ae21a7947c59f78154b24fc7abbe1dd"}, + {file = "numpy-1.19.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:13af0184177469192d80db9bd02619f6fa8b922f9f327e077d6f2a6acb1ce1c0"}, + {file = "numpy-1.19.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:356f96c9fbec59974a592452ab6a036cd6f180822a60b529a975c9467fcd5f23"}, + {file = "numpy-1.19.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:fa1fe75b4a9e18b66ae7f0b122543c42debcf800aaafa0212aaff3ad273c2596"}, + {file = "numpy-1.19.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:cbe326f6d364375a8e5a8ccb7e9cd73f4b2f6dc3b2ed205633a0db8243e2a96a"}, + {file = "numpy-1.19.0-cp36-cp36m-win32.whl", hash = "sha256:a2e3a39f43f0ce95204beb8fe0831199542ccab1e0c6e486a0b4947256215632"}, + {file = "numpy-1.19.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7b852817800eb02e109ae4a9cef2beda8dd50d98b76b6cfb7b5c0099d27b52d4"}, + {file = "numpy-1.19.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d97a86937cf9970453c3b62abb55a6475f173347b4cde7f8dcdb48c8e1b9952d"}, + {file = "numpy-1.19.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:a86c962e211f37edd61d6e11bb4df7eddc4a519a38a856e20a6498c319efa6b0"}, + {file = "numpy-1.19.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:d34fbb98ad0d6b563b95de852a284074514331e6b9da0a9fc894fb1cdae7a79e"}, + {file = "numpy-1.19.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:658624a11f6e1c252b2cd170d94bf28c8f9410acab9f2fd4369e11e1cd4e1aaf"}, + {file = "numpy-1.19.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:4d054f013a1983551254e2379385e359884e5af105e3efe00418977d02f634a7"}, + {file = 
"numpy-1.19.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:26a45798ca2a4e168d00de75d4a524abf5907949231512f372b217ede3429e98"}, + {file = "numpy-1.19.0-cp37-cp37m-win32.whl", hash = "sha256:3c40c827d36c6d1c3cf413694d7dc843d50997ebffbc7c87d888a203ed6403a7"}, + {file = "numpy-1.19.0-cp37-cp37m-win_amd64.whl", hash = "sha256:be62aeff8f2f054eff7725f502f6228298891fd648dc2630e03e44bf63e8cee0"}, + {file = "numpy-1.19.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:dd53d7c4a69e766e4900f29db5872f5824a06827d594427cf1a4aa542818b796"}, + {file = "numpy-1.19.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:30a59fb41bb6b8c465ab50d60a1b298d1cd7b85274e71f38af5a75d6c475d2d2"}, + {file = "numpy-1.19.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:df1889701e2dfd8ba4dc9b1a010f0a60950077fb5242bb92c8b5c7f1a6f2668a"}, + {file = "numpy-1.19.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:33c623ef9ca5e19e05991f127c1be5aeb1ab5cdf30cb1c5cf3960752e58b599b"}, + {file = "numpy-1.19.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:26f509450db547e4dfa3ec739419b31edad646d21fb8d0ed0734188b35ff6b27"}, + {file = "numpy-1.19.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:7b57f26e5e6ee2f14f960db46bd58ffdca25ca06dd997729b1b179fddd35f5a3"}, + {file = "numpy-1.19.0-cp38-cp38-win32.whl", hash = "sha256:a8705c5073fe3fcc297fb8e0b31aa794e05af6a329e81b7ca4ffecab7f2b95ef"}, + {file = "numpy-1.19.0-cp38-cp38-win_amd64.whl", hash = "sha256:c2edbb783c841e36ca0fa159f0ae97a88ce8137fb3a6cd82eae77349ba4b607b"}, + {file = "numpy-1.19.0-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:8cde829f14bd38f6da7b2954be0f2837043e8b8d7a9110ec5e318ae6bf706610"}, + {file = "numpy-1.19.0.zip", hash = "sha256:76766cc80d6128750075378d3bb7812cf146415bd29b588616f72c943c00d598"}, +] +packaging = [ + {file = "packaging-19.0-py2.py3-none-any.whl", hash = "sha256:9e1cbf8c12b1f1ce0bb5344b8d7ecf66a6f8a6e91bcb0c84593ed6d3ab5c4ab3"}, + {file = "packaging-19.0.tar.gz", hash = 
"sha256:0c98a5d0be38ed775798ece1b9727178c4469d9c3b4ada66e8e6b7849f8732af"}, +] +pandas = [ + {file = "pandas-1.0.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:faa42a78d1350b02a7d2f0dbe3c80791cf785663d6997891549d0f86dc49125e"}, + {file = "pandas-1.0.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:9c31d52f1a7dd2bb4681d9f62646c7aa554f19e8e9addc17e8b1b20011d7522d"}, + {file = "pandas-1.0.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:8778a5cc5a8437a561e3276b85367412e10ae9fff07db1eed986e427d9a674f8"}, + {file = "pandas-1.0.5-cp36-cp36m-win32.whl", hash = "sha256:9871ef5ee17f388f1cb35f76dc6106d40cb8165c562d573470672f4cdefa59ef"}, + {file = "pandas-1.0.5-cp36-cp36m-win_amd64.whl", hash = "sha256:35b670b0abcfed7cad76f2834041dcf7ae47fd9b22b63622d67cdc933d79f453"}, + {file = "pandas-1.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c9410ce8a3dee77653bc0684cfa1535a7f9c291663bd7ad79e39f5ab58f67ab3"}, + {file = "pandas-1.0.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:02f1e8f71cd994ed7fcb9a35b6ddddeb4314822a0e09a9c5b2d278f8cb5d4096"}, + {file = "pandas-1.0.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:b3c4f93fcb6e97d993bf87cdd917883b7dab7d20c627699f360a8fb49e9e0b91"}, + {file = "pandas-1.0.5-cp37-cp37m-win32.whl", hash = "sha256:5759edf0b686b6f25a5d4a447ea588983a33afc8a0081a0954184a4a87fd0dd7"}, + {file = "pandas-1.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:ab8173a8efe5418bbe50e43f321994ac6673afc5c7c4839014cf6401bbdd0705"}, + {file = "pandas-1.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:13f75fb18486759da3ff40f5345d9dd20e7d78f2a39c5884d013456cec9876f0"}, + {file = "pandas-1.0.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:5a7cf6044467c1356b2b49ef69e50bf4d231e773c3ca0558807cdba56b76820b"}, + {file = "pandas-1.0.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:ae961f1f0e270f1e4e2273f6a539b2ea33248e0e3a11ffb479d757918a5e03a9"}, + {file = "pandas-1.0.5-cp38-cp38-win32.whl", hash = 
"sha256:f69e0f7b7c09f1f612b1f8f59e2df72faa8a6b41c5a436dde5b615aaf948f107"}, + {file = "pandas-1.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:4c73f373b0800eb3062ffd13d4a7a2a6d522792fa6eb204d67a4fad0a40f03dc"}, + {file = "pandas-1.0.5.tar.gz", hash = "sha256:69c5d920a0b2a9838e677f78f4dde506b95ea8e4d30da25859db6469ded84fa8"}, +] +pathspec = [ + {file = "pathspec-0.8.0-py2.py3-none-any.whl", hash = "sha256:7d91249d21749788d07a2d0f94147accd8f845507400749ea19c1ec9054a12b0"}, + {file = "pathspec-0.8.0.tar.gz", hash = "sha256:da45173eb3a6f2a5a487efba21f050af2b41948be6ab52b6a1e3ff22bb8b7061"}, +] +pluggy = [ + {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, + {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, +] +py = [ + {file = "py-1.8.0-py2.py3-none-any.whl", hash = "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa"}, + {file = "py-1.8.0.tar.gz", hash = "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"}, +] +pycodestyle = [ + {file = "pycodestyle-2.6.0-py2.py3-none-any.whl", hash = "sha256:2295e7b2f6b5bd100585ebcb1f616591b652db8a741695b3d8f5d28bdc934367"}, + {file = "pycodestyle-2.6.0.tar.gz", hash = "sha256:c58a7d2815e0e8d7972bf1803331fb0152f867bd89adf8a01dfd55085434192e"}, +] +pyflakes = [ + {file = "pyflakes-2.2.0-py2.py3-none-any.whl", hash = "sha256:0d94e0e05a19e57a99444b6ddcf9a6eb2e5c68d3ca1e98e90707af8152c90a92"}, + {file = "pyflakes-2.2.0.tar.gz", hash = "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"}, +] +pyparsing = [ + {file = "pyparsing-2.4.0-py2.py3-none-any.whl", hash = "sha256:9b6323ef4ab914af344ba97510e966d64ba91055d6b9afa6b30799340e89cc03"}, + {file = "pyparsing-2.4.0.tar.gz", hash = "sha256:1873c03321fc118f4e9746baf201ff990ceb915f433f23b395f5580d1840cb2a"}, +] +pytest = [ + {file = "pytest-5.3.5-py3-none-any.whl", hash = 
"sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6"}, + {file = "pytest-5.3.5.tar.gz", hash = "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"}, + {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"}, +] +pytz = [ + {file = "pytz-2019.1-py2.py3-none-any.whl", hash = "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda"}, + {file = "pytz-2019.1.tar.gz", hash = "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141"}, +] +regex = [ + {file = "regex-2020.6.8-cp27-cp27m-win32.whl", hash = "sha256:fbff901c54c22425a5b809b914a3bfaf4b9570eee0e5ce8186ac71eb2025191c"}, + {file = "regex-2020.6.8-cp27-cp27m-win_amd64.whl", hash = "sha256:112e34adf95e45158c597feea65d06a8124898bdeac975c9087fe71b572bd938"}, + {file = "regex-2020.6.8-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:92d8a043a4241a710c1cf7593f5577fbb832cf6c3a00ff3fc1ff2052aff5dd89"}, + {file = "regex-2020.6.8-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:bae83f2a56ab30d5353b47f9b2a33e4aac4de9401fb582b55c42b132a8ac3868"}, + {file = "regex-2020.6.8-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:b2ba0f78b3ef375114856cbdaa30559914d081c416b431f2437f83ce4f8b7f2f"}, + {file = "regex-2020.6.8-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:95fa7726d073c87141f7bbfb04c284901f8328e2d430eeb71b8ffdd5742a5ded"}, + {file = "regex-2020.6.8-cp36-cp36m-win32.whl", hash = "sha256:e3cdc9423808f7e1bb9c2e0bdb1c9dc37b0607b30d646ff6faf0d4e41ee8fee3"}, + {file = "regex-2020.6.8-cp36-cp36m-win_amd64.whl", hash = "sha256:c78e66a922de1c95a208e4ec02e2e5cf0bb83a36ceececc10a72841e53fbf2bd"}, + {file = "regex-2020.6.8-cp37-cp37m-manylinux1_i686.whl", hash = 
"sha256:08997a37b221a3e27d68ffb601e45abfb0093d39ee770e4257bd2f5115e8cb0a"}, + {file = "regex-2020.6.8-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:2f6f211633ee8d3f7706953e9d3edc7ce63a1d6aad0be5dcee1ece127eea13ae"}, + {file = "regex-2020.6.8-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:55b4c25cbb3b29f8d5e63aeed27b49fa0f8476b0d4e1b3171d85db891938cc3a"}, + {file = "regex-2020.6.8-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:89cda1a5d3e33ec9e231ece7307afc101b5217523d55ef4dc7fb2abd6de71ba3"}, + {file = "regex-2020.6.8-cp37-cp37m-win32.whl", hash = "sha256:690f858d9a94d903cf5cada62ce069b5d93b313d7d05456dbcd99420856562d9"}, + {file = "regex-2020.6.8-cp37-cp37m-win_amd64.whl", hash = "sha256:1700419d8a18c26ff396b3b06ace315b5f2a6e780dad387e4c48717a12a22c29"}, + {file = "regex-2020.6.8-cp38-cp38-manylinux1_i686.whl", hash = "sha256:654cb773b2792e50151f0e22be0f2b6e1c3a04c5328ff1d9d59c0398d37ef610"}, + {file = "regex-2020.6.8-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:52e1b4bef02f4040b2fd547357a170fc1146e60ab310cdbdd098db86e929b387"}, + {file = "regex-2020.6.8-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:cf59bbf282b627130f5ba68b7fa3abdb96372b24b66bdf72a4920e8153fc7910"}, + {file = "regex-2020.6.8-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:5aaa5928b039ae440d775acea11d01e42ff26e1561c0ffcd3d805750973c6baf"}, + {file = "regex-2020.6.8-cp38-cp38-win32.whl", hash = "sha256:97712e0d0af05febd8ab63d2ef0ab2d0cd9deddf4476f7aa153f76feef4b2754"}, + {file = "regex-2020.6.8-cp38-cp38-win_amd64.whl", hash = "sha256:6ad8663c17db4c5ef438141f99e291c4d4edfeaacc0ce28b5bba2b0bf273d9b5"}, + {file = "regex-2020.6.8.tar.gz", hash = "sha256:e9b64e609d37438f7d6e68c2546d2cb8062f3adb27e6336bc129b51be20773ac"}, +] +requests = [ + {file = "requests-2.22.0-py2.py3-none-any.whl", hash = "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"}, + {file = "requests-2.22.0.tar.gz", hash = 
"sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4"}, +] +responses = [ + {file = "responses-0.10.6-py2.py3-none-any.whl", hash = "sha256:97193c0183d63fba8cd3a041c75464e4b09ea0aff6328800d1546598567dde0b"}, + {file = "responses-0.10.6.tar.gz", hash = "sha256:502d9c0c8008439cfcdef7e251f507fcfdd503b56e8c0c87c3c3e3393953f790"}, +] +six = [ + {file = "six-1.12.0-py2.py3-none-any.whl", hash = "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c"}, + {file = "six-1.12.0.tar.gz", hash = "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"}, +] +toml = [ + {file = "toml-0.10.1-py2.py3-none-any.whl", hash = "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"}, + {file = "toml-0.10.1.tar.gz", hash = "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f"}, +] +typed-ast = [ + {file = "typed_ast-1.4.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3"}, + {file = "typed_ast-1.4.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb"}, + {file = "typed_ast-1.4.1-cp35-cp35m-win32.whl", hash = "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919"}, + {file = "typed_ast-1.4.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01"}, + {file = "typed_ast-1.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75"}, + {file = "typed_ast-1.4.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652"}, + {file = "typed_ast-1.4.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"}, + {file = "typed_ast-1.4.1-cp36-cp36m-win32.whl", hash = 
"sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1"}, + {file = "typed_ast-1.4.1-cp36-cp36m-win_amd64.whl", hash = "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa"}, + {file = "typed_ast-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614"}, + {file = "typed_ast-1.4.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41"}, + {file = "typed_ast-1.4.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b"}, + {file = "typed_ast-1.4.1-cp37-cp37m-win32.whl", hash = "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe"}, + {file = "typed_ast-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355"}, + {file = "typed_ast-1.4.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6"}, + {file = "typed_ast-1.4.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907"}, + {file = "typed_ast-1.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d"}, + {file = "typed_ast-1.4.1-cp38-cp38-win32.whl", hash = "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c"}, + {file = "typed_ast-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4"}, + {file = "typed_ast-1.4.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34"}, + {file = "typed_ast-1.4.1.tar.gz", hash = "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b"}, +] +typing-extensions = [ + {file = "typing_extensions-3.7.4.2-py2-none-any.whl", hash = 
"sha256:f8d2bd89d25bc39dabe7d23df520442fa1d8969b82544370e03d88b5a591c392"}, + {file = "typing_extensions-3.7.4.2-py3-none-any.whl", hash = "sha256:6e95524d8a547a91e08f404ae485bbb71962de46967e1b71a0cb89af24e761c5"}, + {file = "typing_extensions-3.7.4.2.tar.gz", hash = "sha256:79ee589a3caca649a9bfd2a8de4709837400dfa00b6cc81962a1e6a1815969ae"}, +] +urllib3 = [ + {file = "urllib3-1.25.3-py2.py3-none-any.whl", hash = "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1"}, + {file = "urllib3-1.25.3.tar.gz", hash = "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232"}, +] +virtualenv = [ + {file = "virtualenv-20.0.21-py2.py3-none-any.whl", hash = "sha256:a730548b27366c5e6cbdf6f97406d861cccece2e22275e8e1a757aeff5e00c70"}, + {file = "virtualenv-20.0.21.tar.gz", hash = "sha256:a116629d4e7f4d03433b8afa27f43deba09d48bc48f5ecefa4f015a178efb6cf"}, +] +wcwidth = [ + {file = "wcwidth-0.1.7-py2.py3-none-any.whl", hash = "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"}, + {file = "wcwidth-0.1.7.tar.gz", hash = "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e"}, +] +zipp = [ + {file = "zipp-0.6.0-py2.py3-none-any.whl", hash = "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335"}, + {file = "zipp-0.6.0.tar.gz", hash = "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e"}, +] diff --git a/pyproject.toml b/pyproject.toml index 4faf3f58..74879166 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] name = "tamr-unify-client" -version = "0.6.0-dev" -description = "Python Client for the Tamr Unify API" +version = "0.0.0" +description = "Python Client for the Tamr API" license = "Apache-2.0" authors = ["Pedro Cattori "] readme = "README.md" -homepage = "tamr-unify-python-client.rtfd.io" -repository = "https://github.com/Datatamer/unify-client-python" -keywords = ["tamr", "unify"] +homepage = 
"https://tamr-client.readthedocs.io/en/stable/" +repository = "https://github.com/Datatamer/tamr-client" +keywords = ["tamr"] classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -17,23 +17,30 @@ classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7" + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8" ] +packages = [ + { include = "tamr_client" }, + { include = "tamr_unify_client" }, +] +include = ["tamr_client/py.typed"] [tool.poetry.dependencies] -python = "^3.6" +python = "^3.6.1" requests = "^2.22" +dataclasses = "^0.6.0" [tool.poetry.dev-dependencies] -Sphinx = "^2.1" responses = "^0.10.6" flake8-import-order = "^0.18.1" -pytest = "^4.6" -black = {version = "^19.3b0",allows-prereleases = true} flake8 = "^3.7" -toml = "^0.10.0" -sphinx_rtd_theme = "^0.4.3" +pytest = "^5.3.2" +nox = "^2020.5.24" +pandas = "^1.0.5" +black = {version = "^19.10b0", allow-prereleases = true} +mypy = "^0.782" [build-system] -requires = ["poetry>=0.12"] +requires = ["poetry>=1.0"] build-backend = "poetry.masonry.api" diff --git a/stubs/pandas.pyi b/stubs/pandas.pyi new file mode 100644 index 00000000..94bffaca --- /dev/null +++ b/stubs/pandas.pyi @@ -0,0 +1,19 @@ +from typing import Any, Dict, Iterator, List, Tuple + +JsonDict = Dict[str, Any] + +class DataFrame: + index: Index + columns: Index + def __init__(self, data: List[JsonDict] = None, index: List[int] = None): ... + def drop(self, labels: str, axis: int, inplace: bool): ... + def insert(self, loc: int, column: str, value: Index): ... + def iterrows(self) -> Iterator[Tuple[int, Series]]: ... + def set_index(self, keys: str) -> DataFrame: ... + +class Series: + def to_json(self) -> str: ... + +class Index: + name: str + def __iter__(self) -> Iterator[str]: ... 
# Type variable standing for "whatever callable was decorated".
T = TypeVar("T")


# Stub for the `responses.activate` decorator.
# The decorated object is accepted as `func` and returned with the very same
# static type, so type checkers keep the signature of decorated tests.
# (Previously the parameter was *named* `T` and left unannotated, which meant
# the TypeVar only appeared in the return position and could not be solved.)
def activate(func: T) -> T: ...
def check():
    """Abort the process unless the user has opted in to BETA features.

    Reads the ``TAMR_CLIENT_BETA`` environment variable. When its value is
    exactly ``"1"`` the function returns ``None`` and does nothing else.
    Otherwise it writes an explanatory message to standard error and
    terminates the interpreter with exit status 1.

    Raises:
        SystemExit: With code ``1`` when the opt-in variable is not ``"1"``.
    """
    env_var = "TAMR_CLIENT_BETA"

    # Only the literal string "1" opts in; unset or any other value is refused.
    if os.environ.get(env_var) == "1":
        return

    msg = (
        f"ERROR: 'tamr_client' package is in BETA, but you do not have the '{env_var}' environment variable set to '1'."
        "\n\nHINT: For non-BETA features, use only the 'tamr_unify_client' package."
        f"\nHINT: To opt-in to BETA features, set environment variable: '{env_var}=1'."
        "\n\nWARNING: Do not rely on BETA features in production workflows."
        " Support from Tamr may be limited."
    )
    # This is a fatal diagnostic, so it goes to stderr: stdout stays clean for
    # callers that pipe or parse the program's regular output.
    print(msg, file=sys.stderr)
    sys.exit(1)
# frozen=True: fields cannot be reassigned after construction.
@dataclass(frozen=True)
class SubAttribute:
    """An attribute which is itself a property of another attribute.

    See https://docs.tamr.com/reference#attribute-types

    NOTE:
        `sphinx_autodoc_typehints` cannot handle forward reference to `AttributeType`,
        so reference docs are written manually for this type

    Args:
        name: Name of sub-attribute
        type: See https://docs.tamr.com/reference#attribute-types
        is_nullable: If this sub-attribute can be null
    """

    name: str
    # Quoted forward reference: `AttributeType` is only defined further down in
    # this module, and it in turn can contain `SubAttribute`s (via `Record`).
    type: "AttributeType"
    is_nullable: bool
"AttributeType" + + +@dataclass(frozen=True) +class Record: + """See https://docs.tamr.com/reference#attribute-types + + Args: + attributes + """ + + _tag: ClassVar[str] = "RECORD" + attributes: Tuple[SubAttribute, ...] + + +ComplexType = Union[Array, Map, Record] + + +AttributeType = Union[PrimitiveType, ComplexType] + +# complex type aliases +DEFAULT: AttributeType = Array(STRING) +GEOSPATIAL: AttributeType = Record( + attributes=( + SubAttribute(name="point", is_nullable=True, type=Array(DOUBLE)), + SubAttribute(name="multiPoint", is_nullable=True, type=Array(Array(DOUBLE))), + SubAttribute(name="lineString", is_nullable=True, type=Array(Array(DOUBLE))), + SubAttribute( + name="multiLineString", is_nullable=True, type=Array(Array(Array(DOUBLE))) + ), + SubAttribute( + name="polygon", is_nullable=True, type=Array(Array(Array(DOUBLE))) + ), + SubAttribute( + name="multiPolygon", + is_nullable=True, + type=Array(Array(Array(Array(DOUBLE)))), + ), + ) +) + +# attribute +########### + + +@dataclass(frozen=True) +class Attribute: + """A Tamr Attribute. + + See https://docs.tamr.com/reference#attribute-types + + Args: + url + name + type + is_nullable + description + """ + + url: URL + name: str + type: AttributeType + is_nullable: bool + description: Optional[str] = None diff --git a/tamr_client/_types/auth.py b/tamr_client/_types/auth.py new file mode 100644 index 00000000..4b2cf867 --- /dev/null +++ b/tamr_client/_types/auth.py @@ -0,0 +1,40 @@ +import base64 + +import requests + + +def _basic_auth_str(username, password): + auth = f"{username}:{password}" + encoded = base64.b64encode(auth.encode("latin1")) + return "BasicCreds " + requests.utils.to_native_string(encoded.strip()) + + +class UsernamePasswordAuth(requests.auth.HTTPBasicAuth): + """Provides username/password authentication for Tamr. + + Sets the `Authorization` HTTP header with Tamr's custom `BasicCreds` format. 
def __repr__(self):
    """Return a debug representation that never contains the password.

    The result has the shape ``ClassName(username='...', password=<redacted>)``.
    The class name is taken from the runtime type, so subclasses are reported
    under their own name.
    """
    # intentionally leave out password (potentially sensitive)
    # NOTE: the two fields need an explicit ", " between them; adjacent
    # f-string literals are concatenated with nothing in between.
    return (
        f"{type(self).__qualname__}("
        f"username={self.username!r}, "
        f"password=<redacted>)"
    )
# frozen=True: fields cannot be reassigned after construction.
@dataclass(frozen=True)
class Instance:
    """Connection parameters for a running Tamr instance

    Every field has a default, so `Instance()` is valid.

    Args:
        protocol: URL scheme, `"http"` by default
        host: Host name, `"localhost"` by default
        port: Port number, or `None` (the default) when not specified
    """

    protocol: str = "http"
    host: str = "localhost"
    # NOTE(review): how a `None` port is rendered is decided by the code that
    # builds the origin string, which is not part of this class — confirm there.
    port: Optional[int] = None
@dataclass(frozen=True)
class Restore:
    """A Tamr restore

    See https://docs.tamr.com/new/docs/configuration-backup-and-restore

    Frozen dataclass: fields cannot be reassigned after construction. None of
    the fields has a default, so all four must be supplied.

    Args:
        url
        backup_path
        state
        error_message
    """

    url: URL
    backup_path: str
    state: str
    error_message: str
@dataclass(frozen=True)
class URL:
    """Address of a resource on a Tamr instance

    ``str(url)`` renders as ``{origin}/{base_path}/{path}``, where ``origin``
    is computed by ``tamr_client.instance.origin`` from `instance`.

    Args:
        path: Resource path, placed after `base_path`
        instance: Tamr instance the URL belongs to; defaults to ``Instance()``
        base_path: Prefix placed between the origin and `path`;
            defaults to ``"api/versioned/v1"``
    """

    path: str
    instance: Instance = Instance()
    base_path: str = "api/versioned/v1"

    def __str__(self) -> str:
        # Imported at call time instead of at module import time.
        # NOTE(review): presumably this avoids a circular import between
        # `tamr_client` and `tamr_client._types` -- confirm.
        from tamr_client import instance

        origin = instance.origin(self.instance)
        return f"{origin}/{self.base_path}/{self.path}"
def by_resource_id(session: Session, dataset: Dataset, id: str) -> Attribute:
    """Fetch one attribute of a dataset from the Tamr server

    The attribute URL is the dataset URL with ``/attributes/{id}`` appended
    to its path.

    Args:
        session: Tamr session used to send the request
        dataset: Dataset containing this attribute
        id: Attribute ID

    Returns:
        The attribute found at that URL

    Raises:
        attribute.NotFound: If no attribute could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    dataset_url = dataset.url
    attribute_path = f"{dataset_url.path}/attributes/{id}"
    return _by_url(session, replace(dataset_url, path=attribute_path))
def create(
    session: Session,
    dataset: Dataset,
    *,
    name: str,
    is_nullable: bool,
    type: AttributeType = attribute_type.DEFAULT,
    description: Optional[str] = None,
) -> Attribute:
    """Create an attribute

    Posts a creation request to the Tamr server

    Args:
        session: Tamr session used to send the request
        dataset: Dataset that should contain the new attribute
        name: Name for the new attribute. Reserved names are rejected.
        is_nullable: Determines if the new attribute can contain NULL values
        type: Attribute type for the new attribute
        description: Description of the new attribute

    Returns:
        The newly created attribute

    Raises:
        attribute.ReservedName: If attribute name is reserved.
        attribute.AlreadyExists: If an attribute already exists at the specified URL.
            Corresponds to a 409 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    # The reserved-name check runs client-side, before any request is sent.
    # `_create` sends the same request without this check.
    if name in _RESERVED_NAMES:
        raise ReservedName(name)

    return _create(
        session,
        dataset,
        name=name,
        is_nullable=is_nullable,
        type=type,
        description=description,
    )
def delete(session: Session, attribute: Attribute):
    """Delete an existing attribute

    Sends a deletion request to the Tamr server

    Args:
        session: Tamr session used to send the request
        attribute: Existing attribute to delete

    Raises:
        attribute.NotFound: If no attribute could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    target = str(attribute.url)
    reply = session.delete(target)
    if reply.status_code == 404:
        raise NotFound(target)
    response.successful(reply)
def from_json(data: JsonDict) -> AttributeType:
    """Deserialize an attribute type from JSON data

    Args:
        data: JSON data from Tamr server

    Returns:
        The matching primitive type, or an `Array`, `Map` or `Record` built
        recursively from the nested JSON

    Raises:
        ValueError: If 'baseType' is missing or unrecognized, or if the field
            required by the base type ('innerType' or 'attributes') is missing
    """

    def invalid(message: str) -> ValueError:
        # Log the offending payload, then hand back the error for the caller to raise.
        logger.error(f"JSON data: {repr(data)}")
        return ValueError(message)

    tag = data.get("baseType")
    if tag is None:
        raise invalid("Missing required field 'baseType'.")

    # Primitive types are looked up by the `name` of each `PrimitiveType` member.
    primitive = next((p for p in PrimitiveType if tag == p.name), None)
    if primitive is not None:
        return primitive

    # Array and Map both wrap exactly one inner type.
    containers = (
        (Array, "Missing required field 'innerType' for Array type."),
        (Map, "Missing required field 'innerType' for Map type."),
    )
    for container, missing_inner in containers:
        if tag == container._tag:
            inner = data.get("innerType")
            if inner is None:
                raise invalid(missing_inner)
            return container(inner_type=from_json(inner))

    if tag == Record._tag:
        members = data.get("attributes")
        if members is None:
            raise invalid("Missing required field 'attributes' for Record type.")
        return Record(attributes=tuple([sub.from_json(member) for member in members]))

    raise invalid(f"Unrecognized 'baseType': {tag}")
def get_all(session: Session, instance: Instance) -> List[Backup]:
    """List every backup that has been initiated for a Tamr instance.

    Args:
        session: Tamr session
        instance: Tamr instance

    Returns:
        A list of Tamr backups, one per entry in the server response

    Raises:
        backup.NotFound: If no backup found at the specified URL
    """
    listing_url = URL(instance=instance, path="backups")
    reply = session.get(str(listing_url))
    if reply.status_code == 404:
        raise NotFound(str(listing_url))

    found = []
    for entry in response.successful(reply).json():
        # Each backup gets its own URL, derived from its "relativeId".
        entry_url = URL(instance=instance, path=f'backups/{entry["relativeId"]}')
        found.append(_from_json(entry_url, entry))
    return found
def cancel(session: Session, backup: Backup) -> Backup:
    """Ask the server to cancel a Tamr backup.

    Posts to the backup URL with ``:cancel`` appended.

    Args:
        session: Tamr session
        backup: A Tamr backup

    Returns:
        Canceled backup, rebuilt from the server response

    Raises:
        backup.NotFound: If no backup found at the specified URL
        backup.InvalidOperation: If attempting an invalid operation
    """
    endpoint = f"{backup.url}:cancel"
    reply = session.post(endpoint)

    status = reply.status_code
    if status == 404:
        raise NotFound(endpoint)
    if status == 400:
        reason = reply.json()["message"]
        raise InvalidOperation(endpoint, reason)

    payload = response.successful(reply).json()
    return _from_json(backup.url, payload)
+ + Args: + session: Tamr session + backup: Tamr backup to be polled + + Returns: + A Tamr backup + + Raises: + backup.NotFound: If no backup found at the specified URL + """ + url = backup.url + r = session.get(str(url)) + if r.status_code == 404: + raise NotFound(str(url)) + return _from_json(url, response.successful(r).json()) diff --git a/tamr_client/categorization/__init__.py b/tamr_client/categorization/__init__.py new file mode 100644 index 00000000..3b51e3ed --- /dev/null +++ b/tamr_client/categorization/__init__.py @@ -0,0 +1,13 @@ +""" +Tamr - Categorization +See https://docs.tamr.com/docs/overall-workflow-classification +""" +from tamr_client.categorization import project +from tamr_client.categorization._categorization import ( + _apply_feedback_async, + _update_results_async, + apply_feedback, + manual_labels, + update_results, + update_unified_dataset, +) diff --git a/tamr_client/categorization/_categorization.py b/tamr_client/categorization/_categorization.py new file mode 100644 index 00000000..d1523afc --- /dev/null +++ b/tamr_client/categorization/_categorization.py @@ -0,0 +1,81 @@ +""" +Tamr - Categorization +See https://docs.tamr.com/docs/overall-workflow-classification + +The terminology used here is consistent with Tamr UI terminology + +Asynchronous versions of each function can be found with the suffix `_async` and may be of +interest to power users +""" +from tamr_client import operation +from tamr_client._types import CategorizationProject, Dataset, Operation, Session +from tamr_client.dataset import _dataset, unified + + +def manual_labels(session: Session, project: CategorizationProject) -> Dataset: + """Get manual labels from a Categorization project. 
def _apply_feedback_async(
    session: Session, project: CategorizationProject
) -> Operation:
    """Request a refresh of the categorization model without waiting for it

    Posts to the project URL with ``/categorizations/model:refresh`` appended
    and builds the operation from the response via `operation._from_response`.
    `apply_feedback` is the variant that also waits for the operation.

    Args:
        session: Tamr session used to send the request
        project: Tamr Categorization project

    Returns:
        Operation built from the server response
    """
    r = session.post(str(project.url) + "/categorizations/model:refresh")
    return operation._from_response(project.url.instance, r)
def _from_json(url: URL, data: JsonDict) -> CategorizationProject:
    """Deserialize a Categorization project from server JSON

    Args:
        url: Project URL
        data: Project JSON data from Tamr server. "name" is required;
            "description" is optional and becomes `None` when absent.
    """
    name = data["name"]
    description = data.get("description")
    return CategorizationProject(url, name=name, description=description)
If None, will be set to project name + _'unified_dataset' + + Returns: + Project created in Tamr + + Raises: + project.AlreadyExists: If a project with these specifications already exists + requests.HTTPError: If any other HTTP error is encountered + """ + return project._create( + session=session, + instance=instance, + name=name, + project_type="CATEGORIZATION", + description=description, + external_id=external_id, + unified_dataset_name=unified_dataset_name, + ) diff --git a/tamr_client/dataset/__init__.py b/tamr_client/dataset/__init__.py new file mode 100644 index 00000000..4f5f529f --- /dev/null +++ b/tamr_client/dataset/__init__.py @@ -0,0 +1,14 @@ +from tamr_client.dataset import dataframe, record, unified +from tamr_client.dataset._dataset import ( + _materialize_async, + AlreadyExists, + Ambiguous, + attributes, + by_name, + by_resource_id, + create, + delete, + get_all, + materialize, + NotFound, +) diff --git a/tamr_client/dataset/_dataset.py b/tamr_client/dataset/_dataset.py new file mode 100644 index 00000000..9e8106be --- /dev/null +++ b/tamr_client/dataset/_dataset.py @@ -0,0 +1,266 @@ +""" +See https://docs.tamr.com/reference/dataset-models +""" +from copy import deepcopy +from dataclasses import replace +from typing import List, Optional, Tuple, Union + +from tamr_client import operation, response +from tamr_client._types import ( + Attribute, + Dataset, + Instance, + JsonDict, + Operation, + Session, + URL, +) +from tamr_client.attribute import _from_json as _attribute_from_json +from tamr_client.exception import TamrClientException + + +class NotFound(TamrClientException): + """Raised when referencing (e.g. updating or deleting) a dataset + that does not exist on the server. 
def by_name(session: Session, instance: Instance, name: str) -> Dataset:
    """Get dataset by name

    Fetches dataset from Tamr server

    Args:
        session: Tamr session used to send the request
        instance: Tamr instance containing this dataset
        name: Dataset name

    Returns:
        The single dataset whose name matches

    Raises:
        dataset.NotFound: If no dataset could be found with that name.
        dataset.Ambiguous: If multiple targets match dataset name.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    r = session.get(
        url=str(URL(instance=instance, path="datasets")),
        params={"filter": f"name=={name}"},
    )

    # Validate the response before reading the body, as the other fetchers in
    # this module do. Without this, an error payload would be counted as if it
    # were the list of matches.
    matches = response.successful(r).json()

    # Check that exactly one dataset is returned
    if len(matches) == 0:
        raise NotFound(str(r.url))
    if len(matches) > 1:
        raise Ambiguous(str(r.url))

    # Make Dataset from response
    match = matches[0]
    url = URL(instance=instance, path=match["relativeId"])
    return _from_json(url=url, data=match)
def attributes(session: Session, dataset: Dataset) -> Tuple[Attribute, ...]:
    """Fetch every attribute of a dataset

    Args:
        session: Tamr session used to send the request
        dataset: Dataset containing the desired attributes

    Returns:
        The attributes for the specified dataset, in the order the server
        returned them

    Raises:
        requests.HTTPError: If an HTTP error is encountered.
    """
    collection_url = replace(dataset.url, path=dataset.url.path + "/attributes")
    payload = response.successful(session.get(str(collection_url))).json()

    def entry_url(entry):
        # An attribute's URL is the collection URL plus the attribute name.
        return replace(collection_url, path=collection_url.path + f"/{entry['name']}")

    return tuple(_attribute_from_json(entry_url(entry), entry) for entry in payload)
def get_all(
    session: Session,
    instance: Instance,
    *,
    filter: Optional[Union[str, List[str]]] = None,
) -> Tuple[Dataset, ...]:
    """Fetch the datasets of an instance, optionally filtered

    Args:
        session: Tamr session used to send the request
        instance: Tamr instance from which to get datasets
        filter: Filter expression, e.g. "externalId==wobbly"
            Multiple expressions can be passed as a list

    Returns:
        The datasets retrieved from the instance

    Raises:
        requests.HTTPError: If an HTTP error is encountered.
    """
    url = URL(instance=instance, path="datasets")

    # Only send a "filter" query parameter when a filter was given.
    request_kwargs = {} if filter is None else {"params": {"filter": filter}}
    r = session.get(str(url), **request_kwargs)

    payload = response.successful(r).json()
    return tuple(
        _from_json(URL(instance=instance, path=entry["relativeId"]), entry)
        for entry in payload
    )
def create(
    session: Session,
    instance: Instance,
    df: "pd.DataFrame",
    *,
    name: str,
    primary_key_name: Optional[str] = None,
    description: Optional[str] = None,
    external_id: Optional[str] = None,
) -> Dataset:
    """Create a dataset in Tamr from the DataFrame `df` and create a record from each row

    All attributes other than the primary key are created as the default type array(string)

    The steps run in order: create the dataset, create one attribute per non-key column,
    upsert the rows. If a step after dataset creation fails, the new dataset is deleted
    again (best effort, see `_handle_creation_failure`) before the failure is reported.

    Args:
        session: Tamr session used to send the requests
        instance: Tamr instance
        df: The DataFrame containing records to be upserted
        name: Dataset name
        primary_key_name: The primary key of the dataset. Must be a column of `df`. By default the
            name of the index of `df`
        description: Dataset description
        external_id: External ID of the dataset

    Returns:
        Dataset created in Tamr, fetched back from the server after the upsert

    Raises:
        CreationFailure: If the dataset, one of its attributes, or its records could not be
            created. Client errors (e.g. `dataset.AlreadyExists`) and `requests.HTTPError`
            raised by those steps are reported as this exception.
        primary_key.NotFound: If `primary_key_name` is not a column in `df` or the index of `df`,
            or if no primary key was given and the index of `df` is unnamed
        primary_key.Ambiguous: If `primary_key_name` matches both a column in `df` and the index
            of `df`
    """
    # preconditions
    if primary_key_name is None:
        if df.index.name is not None:
            primary_key_name = df.index.name
        else:
            raise primary_key.NotFound(
                "No primary key was specified and DataFrame index is unnamed"
            )
    _check_primary_key(df, primary_key_name)

    # dataset creation
    try:
        ds = dataset.create(
            session,
            instance,
            name=name,
            key_attribute_names=(primary_key_name,),
            description=description,
            external_id=external_id,
        )
    except (TamrClientException, requests.HTTPError) as e:
        raise CreationFailure(f"Dataset was not created: {e}") from e

    # attribute creation
    for col in df.columns:
        if col == primary_key_name:
            # this attribute already exists as a key attribute
            continue
        try:
            attribute.create(session, ds, name=col, is_nullable=True)
        except (TamrClientException, requests.HTTPError) as e:
            _handle_creation_failure(session, ds, f"An attribute was not created: {e}")

    # record creation
    try:
        result = upsert(session, ds, df, primary_key_name=primary_key_name)
    except (TamrClientException, requests.HTTPError) as e:
        _handle_creation_failure(session, ds, f"Record could not be created: {e}")
    else:
        # Checked outside the `try`: the CreationFailure raised by the cleanup
        # helper must not be caught by the handler above, which would run the
        # cleanup a second time on an already-deleted dataset.
        if not result["allCommandsSucceeded"]:
            _handle_creation_failure(session, ds, "Some records had validation errors")

    # Get Dataset from server
    return dataset._dataset._by_url(session, ds.url)


def _handle_creation_failure(session: Session, stub_dataset: Dataset, error: str):
    """Attempt to make `dataframe.create` atomic by deleting the created dataset in the event of
    later failure.

    However, this does not guarantee atomicity: if the request to delete the dataset fails, it will
    not retry.

    This function always raises; it never returns normally.

    Args:
        session: Tamr session used to send the deletion request
        stub_dataset: The created dataset to delete
        error: The error that caused dataset creation to fail

    Raises:
        CreationFailure: Always. The message is `error`, prefixed with a note when the
            dataset could not be deleted either.
    """
    try:
        dataset.delete(session, stub_dataset)
    except (TamrClientException, requests.HTTPError) as e:
        # `dataset.delete` reports a 404 as `dataset.NotFound`, which is a client
        # exception rather than an HTTPError; catch both so the original `error`
        # is never lost.
        raise CreationFailure(
            f"Created dataset did not delete after an earlier error: {error}"
        ) from e
    raise CreationFailure(error)
def _update(session: Session, dataset: Dataset, updates: Iterable[Dict]) -> JsonDict:
    """POST a batch of record commands to the dataset's `:updateRecords` endpoint.

    Most callers should use :func:`~tamr_client.record.upsert` or
    :func:`~tamr_client.record.delete` rather than calling this directly.

    Args:
        session: Tamr session used to send the request
        dataset: Dataset containing records to be updated
        updates: Record commands, each formatted as described in Tamr's public
            documentation for dataset updates

    Returns:
        JSON response body from server

    Raises:
        requests.HTTPError: If an HTTP error is encountered
    """
    # Each command is JSON-encoded lazily, only as the request body is consumed.
    encoded_commands = (json.dumps(command).encode("utf-8") for command in updates)
    # `requests` accepts a generator for `data`, but the typeshed stubs for `requests`
    # expect a file-like object, hence the cast.
    reply = session.post(
        str(dataset.url) + ":updateRecords",
        headers={"Content-Encoding": "utf-8"},
        data=cast(IO, encoded_commands),
    )
    return response.successful(reply).json()


def upsert(
    session: Session,
    dataset: Dataset,
    records: Iterable[Dict],
    *,
    primary_key_name: Optional[str] = None,
) -> JsonDict:
    """Create or update the given records.

    Args:
        session: Tamr session used to send the request
        dataset: Dataset to receive record updates
        records: The records to create or update, as dictionaries
        primary_key_name: The primary key for these records, which must be a key in each
            record dictionary. Defaults to the first entry of `dataset.key_attribute_names`

    Returns:
        JSON response body from server

    Raises:
        requests.HTTPError: If an HTTP error is encountered
        primary_key.NotFound: If `primary_key_name` is not one of the dataset's key
            attribute names
        KeyError: If a record dictionary has no `primary_key_name` entry (raised while the
            commands are being generated for the request)
    """
    key = (
        dataset.key_attribute_names[0] if primary_key_name is None else primary_key_name
    )
    if key not in dataset.key_attribute_names:
        raise primary_key.NotFound(
            f"Primary key: {key} is not in dataset key attribute names: {dataset.key_attribute_names}"
        )
    commands = (_create_command(entry, primary_key_name=key) for entry in records)
    return _update(session, dataset, commands)


def delete(
    session: Session,
    dataset: Dataset,
    records: Iterable[Dict],
    *,
    primary_key_name: Optional[str] = None,
) -> JsonDict:
    """Delete the given records, matching on primary key value only.

    Other attribute values in the record dictionaries are not compared.

    Args:
        session: Tamr session used to send the request
        dataset: Dataset from which to delete records
        records: The records to delete, as dictionaries
        primary_key_name: The primary key for these records, which must be a key in each
            record dictionary. Defaults to the first entry of `dataset.key_attribute_names`

    Returns:
        JSON response body from server

    Raises:
        requests.HTTPError: If an HTTP error is encountered
        primary_key.NotFound: If `primary_key_name` is not one of the dataset's key
            attribute names
        KeyError: If a record dictionary has no `primary_key_name` entry (raised while the
            commands are being generated for the request)
    """
    key = (
        dataset.key_attribute_names[0] if primary_key_name is None else primary_key_name
    )
    if key not in dataset.key_attribute_names:
        raise primary_key.NotFound(
            f"Primary key: {key} is not in dataset key attribute names: {dataset.key_attribute_names}"
        )
    commands = (_delete_command(entry, primary_key_name=key) for entry in records)
    return _update(session, dataset, commands)


def _create_command(record: Dict, *, primary_key_name: str) -> Dict:
    """Build the CREATE command for one record.

    Args:
        record: The record to create, as a dictionary
        primary_key_name: The primary key for this record, which must be a key in the dictionary

    Returns:
        Dict with `action`, `recordId` (the record's primary key value) and `record`
    """
    record_id = record[primary_key_name]
    return {"action": "CREATE", "recordId": record_id, "record": record}


def _delete_command(record: Dict, *, primary_key_name: str) -> Dict:
    """Build the DELETE command for one record.

    Args:
        record: The record to delete, as a dictionary
        primary_key_name: The primary key for this record, which must be a key in the dictionary

    Returns:
        Dict with `action` and `recordId` (the record's primary key value)
    """
    record_id = record[primary_key_name]
    return {"action": "DELETE", "recordId": record_id}
class NotFound(TamrClientException):
    """Raised when referencing (e.g. updating or deleting) a unified dataset
    that does not exist on the server.
    """


def from_project(session: Session, project: Project) -> UnifiedDataset:
    """Fetch the unified dataset of a project from the Tamr server.

    Args:
        session: Tamr session used to send the request
        project: Tamr project of this Unified Dataset

    Raises:
        unified.NotFound: If no unified dataset could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    project_url = project.url
    # The unified dataset lives directly beneath its project's path.
    unified_url = URL(
        instance=project_url.instance, path=f"{project.url.path}/unifiedDataset"
    )
    return _by_url(session, unified_url)


def _by_url(session: Session, url: URL) -> UnifiedDataset:
    """Fetch a unified dataset from the Tamr server by URL.

    Args:
        session: Tamr session used to send the request
        url: Dataset URL

    Raises:
        unified.NotFound: If no dataset could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    target = str(url)
    reply = session.get(target)
    if reply.status_code == 404:
        raise NotFound(str(url))
    return _from_json(url, response.successful(reply).json())


def _from_json(url: URL, data: JsonDict) -> UnifiedDataset:
    """Make unified dataset from JSON data (deserialize).

    Args:
        url: Unified Dataset URL
        data: Unified Dataset JSON data from Tamr server
    """
    # Work on a private copy so the returned object shares nothing with `data`.
    snapshot = deepcopy(data)
    name = snapshot["name"]
    description = snapshot.get("description")
    key_attribute_names = tuple(snapshot["keyAttributeNames"])
    return UnifiedDataset(
        url,
        name=name,
        description=description,
        key_attribute_names=key_attribute_names,
    )


def apply_changes(session: Session, unified_dataset: UnifiedDataset) -> Operation:
    """Apply changes to the unified dataset and wait for the operation to complete.

    Args:
        session: Tamr session used to send the requests
        unified_dataset: The Unified Dataset which will be committed
    """
    return operation.wait(session, _apply_changes_async(session, unified_dataset))


def _apply_changes_async(
    session: Session, unified_dataset: UnifiedDataset
) -> Operation:
    """Start applying changes to the unified dataset without waiting for completion.

    Args:
        session: Tamr session used to send the request
        unified_dataset: The Unified Dataset which will be committed
    """
    dataset_url = unified_dataset.url
    reply = session.post(str(dataset_url) + ":refresh")
    return operation._from_response(unified_dataset.url.instance, reply)
def update(session: Session, project: GoldenRecordsProject) -> Operation:
    """Update the draft golden records and wait for the operation to complete.

    Args:
        session: Tamr session used to send the requests
        project: Tamr Golden Records project
    """
    return operation.wait(session, _update_async(session, project))


def publish(session: Session, project: GoldenRecordsProject) -> Operation:
    """Publish the golden records and wait for the operation to complete.

    Args:
        session: Tamr session used to send the requests
        project: Tamr Golden Records project
    """
    return operation.wait(session, _publish_async(session, project))


def _update_async(session: Session, project: GoldenRecordsProject) -> Operation:
    """POST to the project's `goldenRecords:refresh` endpoint without waiting."""
    endpoint = str(project.url) + "/goldenRecords:refresh"
    reply = session.post(endpoint)
    return operation._from_response(project.url.instance, reply)


def _publish_async(session: Session, project: GoldenRecordsProject) -> Operation:
    """POST to the project's `publishedGoldenRecords:refresh` endpoint without waiting."""
    endpoint = str(project.url) + "/publishedGoldenRecords:refresh"
    query = {"validate": "true", "version": "CURRENT"}
    reply = session.post(endpoint, params=query)
    return operation._from_response(project.url.instance, reply)
def origin(instance: Instance) -> str:
    """HTTP origin i.e. :code:`<protocol>://<host>[:<port>]`.

    The port suffix is omitted when `instance.port` is None.

    For additional information, see the MDN web docs entry on URL origin.
    """
    if instance.port is None:
        return f"{instance.protocol}://{instance.host}"
    return f"{instance.protocol}://{instance.host}:{instance.port}"


def version(session: Session, instance: Instance) -> str:
    """Return the Tamr version for an instance.

    Args:
        session: Tamr Session
        instance: Tamr instance

    Returns:
        The value of the `version` field of the server's version response

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
    """
    # Version endpoints are not themselves versioned by design, but they are stable so
    # they are ok to use here.
    r = session.get(f"{origin(instance)}/api/versioned/service/version")
    # Fail with the HTTP error itself rather than with a KeyError / JSON decoding error
    # from trying to read "version" out of an error payload.
    r.raise_for_status()
    return r.json()["version"]
def generate_pairs(session: Session, project: MasteringProject) -> Operation:
    """Generate pairs according to the binning model and wait for the operation
    to complete.

    Args:
        session: Tamr session used to send the requests
        project: Tamr Mastering project
    """
    return operation.wait(session, _generate_pairs_async(session, project))


def apply_feedback(session: Session, project: MasteringProject) -> Operation:
    """Train the pair-matching model according to verified labels and wait for the
    operation to complete.

    Args:
        session: Tamr session used to send the requests
        project: Tamr Mastering project
    """
    return operation.wait(session, _apply_feedback_async(session, project))


def update_pair_results(session: Session, project: MasteringProject) -> Operation:
    """Update record pair predictions according to the latest pair-matching model and
    wait for the operation to complete.

    Args:
        session: Tamr session used to send the requests
        project: Tamr Mastering project
    """
    return operation.wait(session, _update_pair_results_async(session, project))


def update_high_impact_pairs(session: Session, project: MasteringProject) -> Operation:
    """Produce new high-impact pairs according to the latest pair-matching model and
    wait for the operation to complete.

    Args:
        session: Tamr session used to send the requests
        project: Tamr Mastering project
    """
    return operation.wait(session, _update_high_impact_pairs_async(session, project))


def update_cluster_results(session: Session, project: MasteringProject) -> Operation:
    """Generate clusters based on the latest pair-matching model and wait for the
    operation to complete.

    Args:
        session: Tamr session used to send the requests
        project: Tamr Mastering project
    """
    return operation.wait(session, _update_cluster_results_async(session, project))


def publish_clusters(session: Session, project: MasteringProject) -> Operation:
    """Publish current record clusters and wait for the operation to complete.

    Args:
        session: Tamr session used to send the requests
        project: Tamr Mastering project
    """
    return operation.wait(session, _publish_clusters_async(session, project))


def _post_refresh(
    session: Session, project: MasteringProject, resource: str
) -> Operation:
    """POST to `<project url><resource>` and wrap the response as an Operation.

    Shared by the `_*_async` functions below, which differ only in `resource`.
    """
    reply = session.post(str(project.url) + resource)
    return operation._from_response(project.url.instance, reply)


def _estimate_pairs_async(session: Session, project: MasteringProject) -> Operation:
    """Start refreshing the estimated pair counts without waiting for completion."""
    return _post_refresh(session, project, "/estimatedPairCounts:refresh")


def _generate_pairs_async(session: Session, project: MasteringProject) -> Operation:
    """Start refreshing the record pairs without waiting for completion."""
    return _post_refresh(session, project, "/recordPairs:refresh")


def _apply_feedback_async(session: Session, project: MasteringProject) -> Operation:
    """Start refreshing the pair-prediction model without waiting for completion."""
    return _post_refresh(session, project, "/recordPairsWithPredictions/model:refresh")


def _update_pair_results_async(
    session: Session, project: MasteringProject
) -> Operation:
    """Start refreshing the record pairs with predictions without waiting for completion."""
    return _post_refresh(session, project, "/recordPairsWithPredictions:refresh")


def _update_high_impact_pairs_async(
    session: Session, project: MasteringProject
) -> Operation:
    """Start refreshing the high-impact pairs without waiting for completion."""
    return _post_refresh(session, project, "/highImpactPairs:refresh")


def _update_cluster_results_async(
    session: Session, project: MasteringProject
) -> Operation:
    """Start refreshing the record clusters without waiting for completion."""
    return _post_refresh(session, project, "/recordClusters:refresh")


def _publish_clusters_async(session: Session, project: MasteringProject) -> Operation:
    """Start refreshing the published clusters with data without waiting for completion."""
    return _post_refresh(session, project, "/publishedClustersWithData:refresh")
def _from_json(url: URL, data: JsonDict) -> MasteringProject:
    """Make mastering project from JSON data (deserialize).

    Args:
        url: Project URL
        data: Project JSON data from Tamr server; `name` is required, `description`
            is optional
    """
    name = data["name"]
    description = data.get("description")
    return MasteringProject(url, name=name, description=description)


def create(
    session: Session,
    instance: Instance,
    name: str,
    description: Optional[str] = None,
    external_id: Optional[str] = None,
    unified_dataset_name: Optional[str] = None,
) -> Project:
    """Create a Mastering project in Tamr.

    Delegates to `project._create` with the project type fixed to `"DEDUP"`.

    Args:
        session: Tamr session used to send the request
        instance: Tamr instance
        name: Project name
        description: Project description
        external_id: External ID of the project
        unified_dataset_name: Unified dataset name. If None, will be set to project name
            + _'unified_dataset'

    Returns:
        Project created in Tamr

    Raises:
        project.AlreadyExists: If a project with these specifications already exists.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    creation_args = dict(
        session=session,
        instance=instance,
        name=name,
        project_type="DEDUP",
        description=description,
        external_id=external_id,
        unified_dataset_name=unified_dataset_name,
    )
    return project._create(**creation_args)
class Failed(TamrClientException):
    """Raised when checking a failed operation."""


def check(session: Session, operation: Operation):
    """Wait for the operation to finish and raise if it was not successful.

    Args:
        session: Tamr session used to poll the operation
        operation: Operation to be checked.

    Raises:
        Failed: If the operation did not end in the `SUCCEEDED` state.
    """
    op = wait(session, operation)
    if not succeeded(op):
        raise Failed(
            f"Checked operation '{str(op.url)}', but it failed with status: {op.status}"
        )


def poll(session: Session, operation: Operation) -> Operation:
    """Poll this operation for server-side updates.

    Does not update the :class:`~tamr_client.operation.Operation` object.
    Instead, returns a new :class:`~tamr_client.operation.Operation`.

    Args:
        session: Tamr session used to send the request
        operation: Operation to be polled.
    """
    return _by_url(session, operation.url)


def wait(
    session: Session,
    operation: Operation,
    *,
    poll_interval_seconds: int = 3,
    timeout_seconds: Optional[int] = None,
) -> Operation:
    """Continuously poll for this operation's server-side state.

    Returns as soon as the operation has no status, or its state is one of
    `CANCELED`, `SUCCEEDED` or `FAILED`. The operation passed in is inspected first,
    so an already-finished operation is returned without any request being sent.

    Args:
        session: Tamr session used to poll the operation
        operation: Operation to be polled.
        poll_interval_seconds: Time interval (in seconds) between subsequent polls.
        timeout_seconds: Time (in seconds) to wait for operation to resolve.
            None (the default) waits indefinitely.

    Raises:
        TimeoutError: If operation takes longer than `timeout_seconds` to resolve.
    """
    started = now()
    while timeout_seconds is None or now() - started < timeout_seconds:
        if operation.status is None:
            return operation
        if operation.status["state"] in ("CANCELED", "SUCCEEDED", "FAILED"):
            return operation
        # Any other state (PENDING, RUNNING, or one this client does not know about) is
        # treated as still in progress: pause before asking again, so that an
        # unrecognized state cannot turn this loop into back-to-back requests.
        sleep(poll_interval_seconds)
        operation = poll(session, operation)
    raise TimeoutError(
        f"Waiting for operation took longer than {timeout_seconds} seconds."
    )


def succeeded(operation: Operation) -> bool:
    """Convenience method for checking if operation was successful.

    Returns:
        True only if the operation has a status whose state is `SUCCEEDED`.
    """
    return operation.status is not None and operation.status["state"] == "SUCCEEDED"


def by_resource_id(session: Session, instance: Instance, resource_id: str) -> Operation:
    """Get operation by ID

    Args:
        session: Tamr session used to send the request
        instance: Tamr instance on which the operation lives
        resource_id: The ID of the operation

    Raises:
        operation.NotFound: If no operation could be found for that ID.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    url = URL(instance=instance, path=f"operations/{resource_id}")
    r = session.get(str(url))
    if r.status_code == 404:
        raise NotFound(str(url))
    return _from_response(instance, r)


def _from_response(instance: Instance, response: requests.Response) -> Operation:
    """
    Handle idiosyncrasies in constructing Operations from Tamr responses.
    When a Tamr API call would start an operation, but all results that would be
    produced by that operation are already up-to-date, Tamr returns `HTTP 204 No Content`

    To make it easy for client code to handle these API responses without checking
    the response code, this method will either construct an Operation, or a
    dummy `NoOp` operation representing the 204 Success response.

    Args:
        instance: Tamr instance the operation belongs to
        response: HTTP Response from the request that started the operation.
            (Note: inside this function the name refers to this parameter, not to the
            `tamr_client.response` module.)

    Raises:
        requests.HTTPError: If the response carries a 4xx/5xx status.
    """
    if response.status_code == 204:
        # Operation was successful, but the response contains no content.
        # Create a dummy operation to represent this.
        _never = "0000-00-00T00:00:00.000Z"
        _description = """Tamr returned HTTP 204 for this operation, indicating that all
            results that would be produced by the operation are already up-to-date."""
        resource_json = {
            "id": "-1",
            "type": "NOOP",
            "description": _description,
            "status": {
                "state": "SUCCEEDED",
                "startTime": _never,
                "endTime": _never,
                "message": "",
            },
            "created": {"username": "", "time": _never, "version": "-1"},
            "lastModified": {"username": "", "time": _never, "version": "-1"},
            "relativeId": "operations/-1",
        }
    else:
        # Report a failed request as the HTTP error it is, instead of as a KeyError on
        # "id" (or a JSON decoding error) while reading an error payload below.
        response.raise_for_status()
        resource_json = response.json()
    _id = resource_json["id"]
    _url = URL(instance=instance, path=f"operations/{_id}")
    return _from_json(_url, resource_json)


def _by_url(session: Session, url: URL) -> Operation:
    """Get operation by URL

    Fetches operation from Tamr server

    Args:
        session: Tamr session used to send the request
        url: Operation URL

    Raises:
        operation.NotFound: If no operation could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    r = session.get(str(url))
    if r.status_code == 404:
        raise NotFound(str(url))
    data = response.successful(r).json()
    return _from_json(url, data)


def _from_json(url: URL, data: JsonDict) -> Operation:
    """Make operation from JSON data (deserialize)

    Args:
        url: Operation URL
        data: Operation JSON data from Tamr server; `type` is required, `status` and
            `description` are optional
    """
    # Copy first so the Operation's nested `status` dict is not shared with `data`.
    cp = deepcopy(data)
    return Operation(
        url, type=cp["type"], status=cp.get("status"), description=cp.get("description")
    )
class Ambiguous(TamrClientException):
    """Raised when referencing a project by name that matches multiple possible targets."""


class AlreadyExists(TamrClientException):
    """Raised when a project with these specifications already exists."""


def by_resource_id(session: Session, instance: Instance, id: str) -> Project:
    """Get project by resource ID.
    Fetches project from Tamr server.

    Args:
        session: Tamr session used to send the request
        instance: Tamr instance containing this project
        id: Project ID

    Raises:
        project.NotFound: If no project could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    url = URL(instance=instance, path=f"projects/{id}")
    return _by_url(session, url)


def by_name(session: Session, instance: Instance, name: str) -> Project:
    """Get project by name
    Fetches project from Tamr server.

    Args:
        session: Tamr session used to send the request
        instance: Tamr instance containing this project
        name: Project name

    Raises:
        project.NotFound: If no project could be found with that name.
        project.Ambiguous: If multiple targets match project name.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    r = session.get(
        url=str(URL(instance=instance, path="projects")),
        params={"filter": f"name=={name}"},
    )

    # Validate the response before interpreting its body: an error payload is not a
    # list of projects and must not be counted as "matches".
    matches = response.successful(r).json()

    # Check that exactly one project is returned
    if len(matches) == 0:
        raise NotFound(str(r.url))
    if len(matches) > 1:
        raise Ambiguous(str(r.url))

    # Make Project from response
    match = matches[0]
    url = URL(instance=instance, path=match["relativeId"])
    return _from_json(url=url, data=match)


def _by_url(session: Session, url: URL) -> Project:
    """Get project by URL.
    Fetches project from Tamr server.

    Args:
        session: Tamr session used to send the request
        url: Project URL

    Raises:
        project.NotFound: If no project could be found at the specified URL.
            Corresponds to a 404 HTTP error.
        requests.HTTPError: If any other HTTP error is encountered.
    """
    r = session.get(str(url))
    if r.status_code == 404:
        raise NotFound(str(url))
    data = response.successful(r).json()
    return _from_json(url, data)


def _from_json(url: URL, data: JsonDict) -> Project:
    """Make project from JSON data (deserialize)

    Dispatches on `data["type"]` to the matching project module's `_from_json`; any
    other type yields an `UnknownProject`.

    Args:
        url: Project URL
        data: Project JSON data from Tamr server
    """
    proj_type = data["type"]
    if proj_type == "DEDUP":
        return mastering_project._from_json(url, data)
    elif proj_type == "CATEGORIZATION":
        return categorization_project._from_json(url, data)
    elif proj_type == "SCHEMA_MAPPING_RECOMMENDATIONS":
        return schema_mapping_project._from_json(url, data)
    elif proj_type == "GOLDEN_RECORDS":
        return golden_records_project._from_json(url, data)
    else:
        return UnknownProject(
            url, name=data["name"], description=data.get("description")
        )
+ """ + if not unified_dataset_name: + unified_dataset_name = name + "_unified_dataset" + data = { + "name": name, + "type": project_type, + "unifiedDatasetName": unified_dataset_name, + "description": description, + "externalId": external_id, + } + + project_url = URL(instance=instance, path="projects") + r = session.post(url=str(project_url), json=data) + + if r.status_code == 409: + raise AlreadyExists(r.json()["message"]) + + data = response.successful(r).json() + project_path = data["relativeId"] + project_url = URL(instance=instance, path=str(project_path)) + + return _by_url(session=session, url=project_url) + + +def get_all( + session: Session, + instance: Instance, + *, + filter: Optional[Union[str, List[str]]] = None, +) -> Tuple[Project, ...]: + """Get all projects from an instance + + Args: + instance: Tamr instance from which to get projects + filter: Filter expression, e.g. "externalId==wobbly" + Multiple expressions can be passed as a list + + Returns: + The projects retrieved from the instance + + Raises: + requests.HTTPError: If an HTTP error is encountered. 
+ """ + url = URL(instance=instance, path="projects") + + if filter is not None: + r = session.get(str(url), params={"filter": filter}) + else: + r = session.get(str(url)) + + projects_json = response.successful(r).json() + + projects = [] + for project_json in projects_json: + project_url = URL(instance=instance, path=project_json["relativeId"]) + project = _from_json(project_url, project_json) + projects.append(project) + return tuple(projects) diff --git a/tamr_unify_client/models/__init__.py b/tamr_client/py.typed similarity index 100% rename from tamr_unify_client/models/__init__.py rename to tamr_client/py.typed diff --git a/tamr_client/response.py b/tamr_client/response.py new file mode 100644 index 00000000..dad88aad --- /dev/null +++ b/tamr_client/response.py @@ -0,0 +1,57 @@ +import json +import logging +from typing import Iterator + +import requests + +from tamr_client._types import JsonDict + +logger = logging.getLogger(__name__) + + +def successful(response: requests.Response) -> requests.Response: + """Ensure response does not contain an HTTP error. + + Delegates to :func:`requests.Response.raise_for_status` + + Returns: + The response being checked. + + Raises: + requests.exceptions.HTTPError: If an HTTP error is encountered. + """ + try: + response.raise_for_status() + except requests.HTTPError as e: + r = e.response + logger.error( + f"Encountered HTTP error code {r.status_code}. Response body: {r.text}" + ) + raise e + return response + + +def ndjson(response: requests.Response, **kwargs) -> Iterator[JsonDict]: + """Stream newline-delimited JSON from the response body + + Analog to :func:`requests.Response.json` but for ``.ndjson``-formatted body. + + **Recommended**: For memory efficiency, use ``stream=True`` when sending the request corresponding to this response. + + Args: + response: Response whose body should be streamed as newline-delimited JSON. + **kwargs: Keyword arguments passed to underlying :func:`requests.Response.iter_lines` call. 
+ + Returns + Each line of the response body, parsed as JSON + + Example: + >>> import tamr_client as tc + >>> s = tc.session.from_auth(...) + >>> r = s.get(..., stream=True) + >>> for data in tc.response.ndjson(r): + ... assert data['my key'] == 'my_value' + + """ + for line in response.iter_lines(**kwargs): + yield json.loads(line) diff --git a/tamr_client/restore.py b/tamr_client/restore.py new file mode 100644 index 00000000..9eaebb29 --- /dev/null +++ b/tamr_client/restore.py @@ -0,0 +1,96 @@ +from tamr_client import response, Restore +from tamr_client._types import Instance, JsonDict, Session, URL +from tamr_client.exception import TamrClientException + + +class InvalidOperation(TamrClientException): + """Raised when attempting an invalid operation. + """ + + pass + + +class NotFound(TamrClientException): + """Raised when referencing a restore that does not exist on the server. + """ + + pass + + +def _from_json(url: URL, data: JsonDict) -> Restore: + """Make restore from JSON data (deserialize). + + Args: + url: Restore url + data: Restore JSON data from Tamr server + """ + return Restore( + url=url, + backup_path=data["backupPath"], + state=data["state"], + error_message=data["errorMessage"], + ) + + +def get(session: Session, instance: Instance) -> Restore: + """Get information on the latest Tamr restore, if any. + + Args: + session: Tamr session + instance: Tamr instance + + Returns: + Latest Tamr restore + + Raises: + restore.NotFound: If no backup found at the specified URL + """ + url = URL(instance=instance, path="instance/restore") + r = session.get(str(url)) + if r.status_code == 404: + raise NotFound(str(url)) + return _from_json(url, response.successful(r).json()) + + +def initiate(session: Session, instance: Instance, backup_path: str) -> Restore: + """Initiate a Tamr restore. 
+ + Args: + session: Tamr session + instance: Tamr instance + backup_path: Path to the backup + + Returns: + Initiated restore + + Raises: + restore.InvalidOperation: If attempting an invalid operation + """ + url = URL(instance=instance, path="instance/restore") + r = session.post(str(url), data=backup_path) + if r.status_code == 400: + raise InvalidOperation(str(url), r.json()["message"]) + return _from_json(url, response.successful(r).json()) + + +def cancel(session: Session, restore: Restore) -> Restore: + """Cancel a Tamr restore. + + Args: + session: Tamr session + restore: A Tamr restore + + Returns: + Canceled restore + + Raises: + restore.NotFound: If no backup file found at the specified path + restore.InvalidOperation: If attempting an invalid operation + """ + cancel_url = f"{restore.url}:cancel" + r = session.post(cancel_url) + if r.status_code == 404: + raise NotFound(cancel_url) + if r.status_code == 400: + raise InvalidOperation(cancel_url, r.json()["message"]) + return _from_json(restore.url, response.successful(r).json()) diff --git a/tamr_client/schema_mapping/__init__.py b/tamr_client/schema_mapping/__init__.py new file mode 100644 index 00000000..eb6fd44e --- /dev/null +++ b/tamr_client/schema_mapping/__init__.py @@ -0,0 +1,6 @@ +""" +Tamr - Schema Mapping +See https://docs.tamr.com/new/docs/overall-workflow-schema +""" +from tamr_client.schema_mapping import project +from tamr_client.schema_mapping._schema_mapping import update_unified_dataset diff --git a/tamr_client/schema_mapping/_schema_mapping.py b/tamr_client/schema_mapping/_schema_mapping.py new file mode 100644 index 00000000..66349e4b --- /dev/null +++ b/tamr_client/schema_mapping/_schema_mapping.py @@ -0,0 +1,25 @@ +""" +Tamr - Schema Mapping +See https://docs.tamr.com/new/docs/overall-workflow-schema + +The terminology used here is consistent with Tamr UI terminology + +Asynchronous versions of each function can be found with the suffix `_async` and may be of +interest to power 
users +""" +from tamr_client import operation +from tamr_client._types import Operation, SchemaMappingProject, Session +from tamr_client.dataset import unified + + +def update_unified_dataset( + session: Session, project: SchemaMappingProject +) -> Operation: + """Apply changes to the unified dataset and wait for the operation to complete + + Args: + project: Tamr Schema Mapping project + """ + unified_dataset = unified.from_project(session, project) + op = unified._apply_changes_async(session, unified_dataset) + return operation.wait(session, op) diff --git a/tamr_client/schema_mapping/project.py b/tamr_client/schema_mapping/project.py new file mode 100644 index 00000000..d02b58c0 --- /dev/null +++ b/tamr_client/schema_mapping/project.py @@ -0,0 +1,58 @@ +from typing import Optional + +from tamr_client import project +from tamr_client._types import ( + Instance, + JsonDict, + Project, + SchemaMappingProject, + Session, + URL, +) + + +def _from_json(url: URL, data: JsonDict) -> SchemaMappingProject: + """Make schema mapping project from JSON data (deserialize) + + Args: + url: Project URL + data: Project JSON data from Tamr server + """ + return SchemaMappingProject( + url, name=data["name"], description=data.get("description") + ) + + +def create( + session: Session, + instance: Instance, + name: str, + description: Optional[str] = None, + external_id: Optional[str] = None, + unified_dataset_name: Optional[str] = None, +) -> Project: + """Create a Schema Mapping project in Tamr. + + Args: + instance: Tamr instance + name: Project name + description: Project description + external_id: External ID of the project + unified_dataset_name: Unified dataset name. If None, will be set to project name + _'unified_dataset' + + Returns: + Project created in Tamr + + Raises: + project.AlreadyExists: If a project with these specifications already exists. + requests.HTTPError: If any other HTTP error is encountered. 
+ """ + return project._create( + session=session, + instance=instance, + name=name, + project_type="SCHEMA_MAPPING_RECOMMENDATIONS", + description=description, + external_id=external_id, + unified_dataset_name=unified_dataset_name, + ) diff --git a/tamr_client/session.py b/tamr_client/session.py new file mode 100644 index 00000000..c1676685 --- /dev/null +++ b/tamr_client/session.py @@ -0,0 +1,14 @@ +import requests + +from tamr_client._types import Session + + +def from_auth(auth: requests.auth.HTTPBasicAuth) -> Session: + """Create a new authenticated session + + Args: + auth: Authentication + """ + s = requests.Session() + s.auth = auth + return s diff --git a/tamr_client/transformations.py b/tamr_client/transformations.py new file mode 100644 index 00000000..42e6896b --- /dev/null +++ b/tamr_client/transformations.py @@ -0,0 +1,121 @@ +import requests + +from tamr_client import dataset, response +from tamr_client._types import ( + InputTransformation, + Instance, + JsonDict, + Project, + Session, + Transformations, +) + + +def _input_transformation_from_json( + session: Session, instance: Instance, data: JsonDict +) -> InputTransformation: + """Make input transformation from JSON data (deserialize) + + Args: + instance: Tamr instance containing this transformation + data: Input scoped transformation JSON data from Tamr server + """ + dataset_resource_ids = [d["datasetId"].split("/")[-1] for d in data["datasets"]] + datasets = [ + dataset.by_resource_id(session, instance, d_id) for d_id in dataset_resource_ids + ] + return InputTransformation(transformation=data["transformation"], datasets=datasets) + + +def _from_json(session: Session, instance: Instance, data: JsonDict) -> Transformations: + """Make transformations from JSON data (deserialize) + + Args: + instance: Tamr instance containing this transformation + data: Transformation JSON data from Tamr server + """ + return Transformations( + unified_scope=data["unified"], + input_scope=[ + 
_input_transformation_from_json(session, instance, tx) + for tx in data["parameterized"] + ], + ) + + +def _input_transformation_to_json(tx: InputTransformation) -> JsonDict: + """Convert input transformations to JSON data (serialize) + + Args: + tx: Input transformation to convert + """ + # datasetId omitted, only one of "datasetId" or "relativeDatasetId" is required + dataset_json = [ + {"name": d.name, "relativeDatasetId": d.url.path} for d in tx.datasets + ] + + return {"datasets": dataset_json, "transformation": tx.transformation} + + +def _to_json(tx: Transformations) -> JsonDict: + """Convert transformations to JSON data (serialize) + + Args: + tx: Transformations to convert + """ + return { + "parameterized": [_input_transformation_to_json(t) for t in tx.input_scope], + "unified": tx.unified_scope, + } + + +def get_all(session: Session, project: Project) -> Transformations: + """Get the transformations of a Project + + Args: + project: Project containing transformations + + Raises: + requests.HTTPError: If any HTTP error is encountered. + + Example: + >>> import tamr_client as tc + >>> session = tc.session.from_auth('username', 'password') + >>> instance = tc.instance.Instance(host="localhost", port=9100) + >>> project1 = tc.project.by_resource_id(session, instance, id='1') + >>> print(tc.transformations.get_all(session, project1)) + """ + r = session.get(f"{project.url}/transformations") + response.successful(r) + return _from_json(session, project.url.instance, r.json()) + + +def replace_all( + session: Session, project: Project, tx: Transformations +) -> requests.Response: + """Replaces the transformations of a Project + + Args: + project: Project to place transformations within + tx: Transformations to put into project + + Raises: + requests.HTTPError: If any HTTP error is encountered. 
+ + Example: + >>> import tamr_client as tc + >>> session = tc.session.from_auth('username', 'password') + >>> instance = tc.instance.Instance(host="localhost", port=9100) + >>> project1 = tc.project.by_resource_id(session, instance, id='1') + >>> dataset3 = tc.dataset.by_resource_id(session, instance, id='3') + >>> new_input_tx = tc.InputTransformation("SELECT *, upper(name) as name;", [dataset3]) + >>> all_tx = tc.Transformations( + ... input_scope=[new_input_tx], + ... unified_scope=["SELECT *, 1 as one;"] + ... ) + >>> tc.transformations.replace_all(session, project1, all_tx) + """ + body = _to_json(tx) + r = session.put(f"{project.url}/transformations", json=body) + + return response.successful(r) diff --git a/tamr_unify_client/__init__.py b/tamr_unify_client/__init__.py index b556f056..7fd0379c 100644 --- a/tamr_unify_client/__init__.py +++ b/tamr_unify_client/__init__.py @@ -1 +1,5 @@ +# flake8: noqa + +import logging + from tamr_unify_client.client import Client diff --git a/tamr_unify_client/_ignore_nan_encoder.py b/tamr_unify_client/_ignore_nan_encoder.py new file mode 100644 index 00000000..660dba19 --- /dev/null +++ b/tamr_unify_client/_ignore_nan_encoder.py @@ -0,0 +1,67 @@ +"""Adaptation of the Python standard library JSONEncoder to encode `NaN` as 'null' +Compare to https://github.com/python/cpython/blob/3.9/Lib/json/encoder.py +The only functional difference is in the definition of `floatstr` where 'NaN', 'Infinity', and '-Infinity' are encoded as 'null' +""" +from json import JSONEncoder +from json.encoder import ( + _make_iterencode, + py_encode_basestring, + py_encode_basestring_ascii, +) + +try: + from _json import encode_basestring_ascii as c_encode_basestring_ascii +except ImportError: + c_encode_basestring_ascii = None +try: + from _json import encode_basestring as c_encode_basestring +except ImportError: + c_encode_basestring = None +try: + from _json import make_encoder as c_make_encoder +except ImportError: + c_make_encoder = None + 
+INFINITY = float("inf") +encode_basestring = c_encode_basestring or py_encode_basestring +encode_basestring_ascii = c_encode_basestring_ascii or py_encode_basestring_ascii + + +class IgnoreNanEncoder(JSONEncoder): + def iterencode(self, o, _one_shot=False): + """Encode the given object and yield each string + representation as available. + For example:: + for chunk in JSONEncoder().iterencode(bigobject): + mysocket.write(chunk) + """ + if self.check_circular: + markers = {} + else: + markers = None + if self.ensure_ascii: + _encoder = encode_basestring_ascii + else: + _encoder = encode_basestring + + def floatstr( + o, _repr=float.__repr__, _inf=INFINITY, _neginf=-INFINITY, + ): + if o != o or o == _inf or o == _neginf: + return "null" + else: + return _repr(o) + + _iterencode = _make_iterencode( + markers, + self.default, + _encoder, + self.indent, + floatstr, + self.key_separator, + self.item_separator, + self.sort_keys, + self.skipkeys, + _one_shot, + ) + return _iterencode(o, 0) diff --git a/tamr_unify_client/models/attribute/collection.py b/tamr_unify_client/attribute/collection.py similarity index 61% rename from tamr_unify_client/models/attribute/collection.py rename to tamr_unify_client/attribute/collection.py index e4d90c93..4b68e0c0 100644 --- a/tamr_unify_client/models/attribute/collection.py +++ b/tamr_unify_client/attribute/collection.py @@ -1,28 +1,19 @@ -from tamr_unify_client.models.attribute.resource import Attribute -from tamr_unify_client.models.base_collection import BaseCollection +from tamr_unify_client.attribute.resource import Attribute +from tamr_unify_client.base_collection import BaseCollection class AttributeCollection(BaseCollection): - """Collection of :class:`~tamr_unify_client.models.attribute.resource.Attribute` s. + """Collection of :class:`~tamr_unify_client.attribute.resource.Attribute` s. :param client: Client for API call delegation. 
:type client: :class:`~tamr_unify_client.Client` - :param data: JSON data representing this resource - :type data: dict :param api_path: API path used to access this collection. E.g. ``"datasets/1/attributes"``. :type api_path: str """ - def __init__(self, client, data, api_path): + def __init__(self, client, api_path): super().__init__(client, api_path) - self._data = data - - @classmethod - def from_json(cls, client, data, api_path): - # BaseCollection doesn't really implement from_json / from_data - # but we pretend it does. - return AttributeCollection(client, data, api_path) def by_resource_id(self, resource_id): """Retrieve an attribute by resource ID. @@ -30,9 +21,9 @@ def by_resource_id(self, resource_id): :param resource_id: The resource ID. E.g. ``"AttributeName"`` :type resource_id: str :returns: The specified attribute. - :rtype: :class:`~tamr_unify_client.models.attribute.resource.Attribute` + :rtype: :class:`~tamr_unify_client.attribute.resource.Attribute` """ - return self.by_name(resource_id) + return super().by_resource_id(self.api_path, resource_id) def by_relative_id(self, relative_id): """Retrieve an attribute by relative ID. @@ -40,10 +31,9 @@ def by_relative_id(self, relative_id): :param relative_id: The resource ID. E.g. ``"datasets/1/attributes/AttributeName"`` :type relative_id: str :returns: The specified attribute. - :rtype: :class:`~tamr_unify_client.models.attribute.resource.Attribute` + :rtype: :class:`~tamr_unify_client.attribute.resource.Attribute` """ - resource_id = relative_id.split("/")[-1] - return self.by_resource_id(resource_id) + return super().by_relative_id(Attribute, relative_id) def by_external_id(self, external_id): """Retrieve an attribute by external ID. @@ -54,7 +44,7 @@ def by_external_id(self, external_id): :param external_id: The external ID. :type external_id: str :returns: The specified attribute, if found. 
- :rtype: :class:`~tamr_unify_client.models.attribute.resource.Attribute` + :rtype: :class:`~tamr_unify_client.attribute.resource.Attribute` :raises KeyError: If no attribute with the specified external_id is found :raises LookupError: If multiple attributes with the specified external_id are found """ @@ -65,7 +55,7 @@ def stream(self): over this collection. :returns: Stream of attributes. - :rtype: Python generator yielding :class:`~tamr_unify_client.models.attribute.resource.Attribute` + :rtype: Python generator yielding :class:`~tamr_unify_client.attribute.resource.Attribute` Usage: >>> for attribute in collection.stream(): # explicit @@ -73,7 +63,8 @@ def stream(self): >>> for attribute in collection: # implicit >>> do_stuff(attribute) """ - for resource_json in self._data: + data = self.client.get(self.api_path).successful().json() + for resource_json in data: alias = self.api_path + "/" + resource_json["name"] yield Attribute.from_json(self.client, resource_json, alias) @@ -83,12 +74,21 @@ def by_name(self, attribute_name): :param attribute_name: Name of the desired attribute. :type attribute_name: str :return: Attribute with matching name in this collection. - :rtype: :class:`~tamr_unify_client.models.attribute.resource.Attribute` - :raises KeyError: If no attribute with specified name was found. + :rtype: :class:`~tamr_unify_client.attribute.resource.Attribute` + """ + return super().by_resource_id(self.api_path, attribute_name) + + def create(self, creation_spec): + """ + Create an Attribute in this collection + + :param creation_spec: Attribute creation specification should be formatted as specified in the `Public Docs for adding an Attribute `_. 
+ :type creation_spec: dict[str, str] + :returns: The created Attribute + :rtype: :class:`~tamr_unify_client.attribute.resource.Attribute` """ - for attribute in self: - if attribute.name == attribute_name: - return attribute - raise KeyError(f"No attribute found with name: {attribute_name}") + data = self.client.post(self.api_path, json=creation_spec).successful().json() + alias = self.api_path + "/" + creation_spec["name"] + return Attribute.from_json(self.client, data, alias) # super.__repr__ is sufficient diff --git a/tamr_unify_client/attribute/resource.py b/tamr_unify_client/attribute/resource.py new file mode 100644 index 00000000..89a1ea76 --- /dev/null +++ b/tamr_unify_client/attribute/resource.py @@ -0,0 +1,179 @@ +from copy import deepcopy + +from tamr_unify_client.attribute.type import AttributeType +from tamr_unify_client.base_resource import BaseResource + + +class Attribute(BaseResource): + """ + A Tamr Attribute. + + See https://docs.tamr.com/reference#attribute-types + """ + + @classmethod + def from_json(cls, client, data, api_path): + return super().from_data(client, data, api_path) + + @property + def relative_id(self): + """:type: str""" + # api_path is alias when it exists, and relative_id when it does not. + # this distinction is useful for things like refreshing a unified dataset, + # where using the relative_id would hit + # /datasets/{id}:refresh + # rather than + # /projects/{id}/unifiedDataset:refresh. + # Since attributes don't currently have that kind of aliasing, + # using api_path is always correct. + # If attributes ever get aliased, this will need to be updated. 
+ # This is confusing; there's an RFC for suggestions to improve this + # #64 https://github.com/Datatamer/unify-client-python/issues/64 + # "Conflation between 'api_path', 'relative_id' / 'relativeId', and + # BaseResource ctor 'alias'" + return self.api_path + + @property + def name(self): + """:type: str""" + return self._data.get("name") + + @property + def description(self): + """:type: str""" + return self._data.get("description") + + @property + def type(self): + """:type: :class:`~tamr_unify_client.attribute.type.AttributeType`""" + type_json = self._data.get("type") + return AttributeType(type_json) + + @property + def is_nullable(self): + """:type: bool""" + return self._data.get("isNullable") + + def spec(self): + """Returns a spec representation of this attribute. + + :return: The attribute spec. + :rtype: :class:`~tamr_unify_client.attribute.resource.AttributeSpec` + """ + return AttributeSpec.of(self) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"relative_id={self.relative_id!r}, " + f"name={self.name!r})" + ) + + +class AttributeSpec: + """A representation of the server view of an attribute""" + + def __init__(self, client, data, api_path): + self._data = data + self.client = client + self.api_path = api_path + + @staticmethod + def of(resource): + """Creates an attribute spec from an attribute. + + :param resource: The existing attribute. + :type resource: :class:`~tamr_unify_client.attribute.resource.Attribute` + :return: The corresponding attribute spec. + :rtype: :class:`~tamr_unify_client.attribute.resource.AttributeSpec` + """ + return AttributeSpec( + resource.client, deepcopy(resource._data), resource.api_path + ) + + @staticmethod + def new(): + """Creates a blank spec that could be used to construct a new attribute. + + :return: The empty spec. 
+ :rtype: :class:`~tamr_unify_client.attribute.resource.AttributeSpec` + """ + return AttributeSpec(None, {}, None) + + def from_data(self, data): + """Creates a spec with the same client and API path as this one, but new data. + + :param data: The data for the new spec. + :type data: dict + :return: The new spec. + :rtype: :class:`~tamr_unify_client.attribute.resource.AttributeSpec` + """ + return AttributeSpec(self.client, data, self.api_path) + + def to_dict(self): + """Returns a version of this spec that conforms to the API representation. + + :returns: The spec's dict. + :rtype: dict + """ + return deepcopy(self._data) + + def with_name(self, new_name): + """Creates a new spec with the same properties, updating name. + + :param new_name: The new name. + :type new_name: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.attribute.resource.AttributeSpec` + """ + return self.from_data({**self._data, "name": new_name}) + + def with_description(self, new_description): + """Creates a new spec with the same properties, updating description. + + :param new_description: The new description. + :type new_description: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.attribute.resource.AttributeSpec` + """ + return self.from_data({**self._data, "description": new_description}) + + def with_type(self, new_type): + """Creates a new spec with the same properties, updating type. + + :param new_type: The spec of the new type. + :type new_type: :class:`~tamr_unify_client.attribute.type.AttributeTypeSpec` + :return: The new spec. + :rtype: :class:`~tamr_unify_client.attribute.resource.AttributeSpec` + """ + type_spec = new_type.to_dict() + return self.from_data({**self._data, "type": type_spec}) + + def with_is_nullable(self, new_is_nullable): + """Creates a new spec with the same properties, updating is nullable. + + :param new_is_nullable: The new is nullable. + :type new_is_nullable: bool + :return: The new spec. 
+ :rtype: :class:`~tamr_unify_client.attribute.resource.AttributeSpec` + """ + return self.from_data({**self._data, "isNullable": new_is_nullable}) + + def put(self): + """Commits the changes and updates the attribute in Tamr. + + :return: The updated attribute. + :rtype: :class:`~tamr_unify_client.attribute.resource.Attribute` + """ + updated_attribute = ( + self.client.put(self.api_path, json=self._data).successful().json() + ) + return Attribute.from_json(self.client, updated_attribute, self.api_path) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"dict={self._data!r})" + ) diff --git a/tamr_unify_client/attribute/subattribute.py b/tamr_unify_client/attribute/subattribute.py new file mode 100644 index 00000000..2377c89f --- /dev/null +++ b/tamr_unify_client/attribute/subattribute.py @@ -0,0 +1,45 @@ +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +from tamr_unify_client.attribute.type import AttributeType + +SubAttributeJson = Dict[str, Any] + + +@dataclass(frozen=True) +class SubAttribute: + """An attribute which is itself a property of another attribute. + + See https://docs.tamr.com/reference#attribute-types + + Args: + name: Name of sub-attribute + description: Description of sub-attribute + type: See https://docs.tamr.com/reference#attribute-types + is_nullable: If this sub-attribute can be null + """ + + name: str + type: AttributeType + is_nullable: bool + _json: SubAttributeJson = field(repr=False) + description: Optional[str] = None + + @staticmethod + def from_json(data: SubAttributeJson) -> "SubAttribute": + """Create a SubAttribute from JSON data. + + Args: + data: JSON data received from Tamr server. 
+ """ + _json = deepcopy(data) + + dc = deepcopy(data) + dc["is_nullable"] = dc.pop("isNullable") + + type_json = dc.pop("type") + # TODO implement AttributeType.from_json and use that instead + type = AttributeType(type_json) + + return SubAttribute(**dc, type=type, _json=_json) diff --git a/tamr_unify_client/attribute/type.py b/tamr_unify_client/attribute/type.py new file mode 100644 index 00000000..2bf37fe8 --- /dev/null +++ b/tamr_unify_client/attribute/type.py @@ -0,0 +1,123 @@ +from copy import deepcopy + + +class AttributeType: + """ + The type of an :class:`~tamr_unify_client.attribute.resource.Attribute` or :class:`~tamr_unify_client.attribute.subattribute.SubAttribute`. + + See https://docs.tamr.com/reference#attribute-types + + :param data: JSON data representing this type + :type data: :py:class:`dict` + """ + + def __init__(self, data): + self._data = data + + @property + def base_type(self): + """:type: str""" + return self._data.get("baseType") + + @property + def inner_type(self): + """:type: :class:`~tamr_unify_client.attribute.type.AttributeType`""" + if "innerType" in self._data: + return AttributeType(self._data.get("innerType")) + else: + return None + + @property + def attributes(self): + """:type: list[:class:`~tamr_unify_client.attribute.subattribute.SubAttribute`]""" + from tamr_unify_client.attribute.subattribute import SubAttribute + + collection_json = self._data.get("attributes") + return [SubAttribute.from_json(attr) for attr in collection_json] + + def spec(self): + """Returns a spec representation of this attribute type. + + :return: The attribute type spec. + :rtype: :class:`~tamr_unify_client.attribute.type.AttributeTypeSpec` + """ + return AttributeTypeSpec.of(self) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." 
+ f"{self.__class__.__qualname__}(" + f"base_type={self.base_type!r})" + ) + + +class AttributeTypeSpec: + def __init__(self, data): + self._data = data + + @staticmethod + def of(resource): + """Creates an attribute type spec from an attribute type. + + :param resource: The existing attribute type. + :type resource: :class:`~tamr_unify_client.attribute.type.AttributeType` + :return: The corresponding attribute type spec. + :rtype: :class:`~tamr_unify_client.attribute.type.AttributeTypeSpec` + """ + return AttributeTypeSpec(deepcopy(resource._data)) + + @staticmethod + def new(): + """Creates a blank spec that could be used to construct a new attribute type. + + :return: The empty spec. + :rtype: :class:`~tamr_unify_client.attribute.type.AttributeTypeSpec` + """ + return AttributeTypeSpec({}) + + def to_dict(self): + """Returns a version of this spec that conforms to the API representation. + + :returns: The spec's dict. + :rtype: dict + """ + return deepcopy(self._data) + + def with_base_type(self, new_base_type): + """Creates a new spec with the same properties, updating the base type. + + :param new_base_type: The new base type. + :type new_base_type: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.attribute.type.AttributeTypeSpec` + """ + return AttributeTypeSpec({**self._data, "baseType": new_base_type}) + + def with_inner_type(self, new_inner_type): + """Creates a new spec with the same properties, updating the inner type. + + :param new_inner_type: The spec of the new inner type. + :type new_inner_type: :class:`~tamr_unify_client.attribute.type.AttributeTypeSpec` + :return: The new spec. + :rtype: :class:`~tamr_unify_client.attribute.type.AttributeTypeSpec` + """ + inner_spec = new_inner_type.to_dict() + return AttributeTypeSpec({**self._data, "innerType": inner_spec}) + + def with_attributes(self, new_attributes): + """Creates a new spec with the same properties, updating attributes. 
+ + :param new_attributes: The specs of the new attributes. + :type new_attributes: list[:class:`~tamr_unify_client.attribute.resource.AttributeSpec`] + :return: The new spec. + :rtype: :class:`~tamr_unify_client.attribute.type.AttributeTypeSpec` + """ + attr_specs = [attr.to_dict() for attr in new_attributes] + return AttributeTypeSpec({**self._data, "attributes": attr_specs}) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"dict={self._data!r})" + ) diff --git a/tamr_unify_client/auth/__init__.py b/tamr_unify_client/auth/__init__.py index 55bc40ac..36790060 100644 --- a/tamr_unify_client/auth/__init__.py +++ b/tamr_unify_client/auth/__init__.py @@ -1,2 +1,4 @@ +# flake8: noqa + from tamr_unify_client.auth.token import TokenAuth from tamr_unify_client.auth.username_password import UsernamePasswordAuth diff --git a/tamr_unify_client/auth/username_password.py b/tamr_unify_client/auth/username_password.py index 1de774f8..2d70bc8e 100644 --- a/tamr_unify_client/auth/username_password.py +++ b/tamr_unify_client/auth/username_password.py @@ -13,8 +13,8 @@ def _basic_auth_str(username, password): class UsernamePasswordAuth(HTTPBasicAuth): - """Provides username/password authentication for Unify. - Specifically, sets the `Authorization` HTTP header with Unify's custom `BasicCreds` format. + """Provides username/password authentication for Tamr. + Specifically, sets the `Authorization` HTTP header with Tamr's custom `BasicCreds` format. 
:param str username: :param str password: diff --git a/tamr_unify_client/models/base_collection.py b/tamr_unify_client/base_collection.py similarity index 88% rename from tamr_unify_client/models/base_collection.py rename to tamr_unify_client/base_collection.py index 94585ff0..fd45e02f 100644 --- a/tamr_unify_client/models/base_collection.py +++ b/tamr_unify_client/base_collection.py @@ -27,7 +27,7 @@ def by_resource_id(self, canonical_path, resource_id): :param resource_id: The resource ID. E.g. "1" :type resource_id: str :returns: The specified item. - :rtype: The ``resource_class`` for this collection. See :func:`~tamr_unify_client.models.base_collection.BaseCollection.by_relative_id`. + :rtype: The ``resource_class`` for this collection. See :func:`~tamr_unify_client.base_collection.BaseCollection.by_relative_id`. """ relative_id = canonical_path + "/" + resource_id return self.by_relative_id(relative_id) @@ -102,6 +102,18 @@ def by_external_id(self, resource_class, external_id): return items[0] + def delete_by_resource_id(self, resource_id): + """Deletes a resource from this collection by resource ID. + + :param resource_id: The resource ID of the resource that will be deleted. + :type resource_id: str + :return: HTTP response from the server. + :rtype: :class:`requests.Response` + """ + path = f"{self.api_path}/{resource_id}" + response = self.client.delete(path).successful() + return response + def __repr__(self): return ( f"{self.__class__.__module__}." 
diff --git a/tamr_unify_client/models/machine_learning_model.py b/tamr_unify_client/base_model.py similarity index 64% rename from tamr_unify_client/models/machine_learning_model.py rename to tamr_unify_client/base_model.py index 4570d71a..78b8d70a 100644 --- a/tamr_unify_client/models/machine_learning_model.py +++ b/tamr_unify_client/base_model.py @@ -1,9 +1,9 @@ -from tamr_unify_client.models.base_resource import BaseResource -from tamr_unify_client.models.operation import Operation +from tamr_unify_client.base_resource import BaseResource +from tamr_unify_client.operation import Operation class MachineLearningModel(BaseResource): - """A Unify Machine Learning model.""" + """A Tamr Machine Learning model.""" @classmethod def from_json(cls, client, resource_json, api_path=None): @@ -12,8 +12,10 @@ def from_json(cls, client, resource_json, api_path=None): def train(self, **options): """Learn from verified labels. - :param ``**options``: Options passed to underlying :class:`~tamr_unify_client.models.operation.Operation` . - See :func:`~tamr_unify_client.models.operation.Operation.apply_options` . + :param ``**options``: Options passed to underlying :class:`~tamr_unify_client.operation.Operation` . + See :func:`~tamr_unify_client.operation.Operation.apply_options` . + :returns: The resultant operation. + :rtype: :class:`~tamr_unify_client.operation.Operation` """ op_json = self.client.post(self.api_path + ":refresh").successful().json() op = Operation.from_json(self.client, op_json) @@ -22,8 +24,10 @@ def train(self, **options): def predict(self, **options): """Suggest labels for unverified records. - :param ``**options``: Options passed to underlying :class:`~tamr_unify_client.models.operation.Operation` . - See :func:`~tamr_unify_client.models.operation.Operation.apply_options` . + :param ``**options``: Options passed to underlying :class:`~tamr_unify_client.operation.Operation` . + See :func:`~tamr_unify_client.operation.Operation.apply_options` . 
+ :returns: The resultant operation. + :rtype: :class:`~tamr_unify_client.operation.Operation` """ dependent_dataset = "/".join(self.api_path.split("/")[:-1]) op_json = self.client.post(dependent_dataset + ":refresh").successful().json() diff --git a/tamr_unify_client/models/base_resource.py b/tamr_unify_client/base_resource.py similarity index 76% rename from tamr_unify_client/models/base_resource.py rename to tamr_unify_client/base_resource.py index 7e106f02..382d97f3 100644 --- a/tamr_unify_client/models/base_resource.py +++ b/tamr_unify_client/base_resource.py @@ -32,3 +32,12 @@ def resource_id(self): if rid is None: return None return rid.split("/")[-1] + + def delete(self): + """Deletes this resource. Some resources do not support deletion, and will raise a 405 error if this is called. + + :return: HTTP response from the server + :rtype: :class:`requests.Response` + """ + response = self.client.delete(self.api_path).successful() + return response diff --git a/tamr_unify_client/categorization/category/collection.py b/tamr_unify_client/categorization/category/collection.py new file mode 100644 index 00000000..93f3fd86 --- /dev/null +++ b/tamr_unify_client/categorization/category/collection.py @@ -0,0 +1,103 @@ +import json + +from tamr_unify_client.base_collection import BaseCollection +from tamr_unify_client.categorization.category.resource import Category + + +class CategoryCollection(BaseCollection): + """Collection of :class:`~tamr_unify_client.categorization.category.resource.Category` s. + + :param client: Client for API call delegation. + :type client: :class:`~tamr_unify_client.Client` + :param api_path: API path used to access this collection. + E.g. ``"projects/1/taxonomy/categories"``. + :type api_path: str + """ + + def __init__(self, client, api_path): + super().__init__(client, api_path) + + def by_resource_id(self, resource_id): + """Retrieve a category by resource ID. + + :param resource_id: The resource ID. E.g. 
``"1"`` + :type resource_id: str + :returns: The specified category. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.Category` + """ + return super().by_resource_id(self.api_path, resource_id) + + def by_relative_id(self, relative_id): + """Retrieve a category by relative ID. + + :param relative_id: The relative ID. E.g. ``"projects/1/categories/1"`` + :type relative_id: str + :returns: The specified category. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.Category` + """ + return super().by_relative_id(Category, relative_id) + + def by_external_id(self, external_id): + """Retrieve a category by external ID. + + Since categories do not have external IDs, this method is not supported and will + raise a :class:`NotImplementedError` . + + :param external_id: The external ID. + :type external_id: str + :returns: The specified category, if found. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.Category` + :raises KeyError: If no category with the specified external_id is found + :raises LookupError: If multiple categories with the specified external_id are found + """ + raise NotImplementedError("Categories do not have external_id") + + def stream(self): + """Stream categories in this collection. Implicitly called when iterating + over this collection. + + :returns: Stream of categories. + :rtype: Python generator yielding :class:`~tamr_unify_client.categorization.category.resource.Category` + + Usage: + >>> for category in collection.stream(): # explicit + >>> do_stuff(category) + >>> for category in collection: # implicit + >>> do_stuff(category) + """ + return super().stream(Category) + + def create(self, creation_spec): + """ Creates a new category. + + :param creation_spec: Category creation specification, formatted as specified in the + `Public Docs for Creating a Category `_. + :type creation_spec: dict + :return: The newly created category. 
+ :rtype: :class:`~tamr_unify_client.categorization.category.resource.Category` + """ + resource_json = ( + self.client.post(self.api_path, json=creation_spec).successful().json() + ) + return Category.from_json(self.client, resource_json) + + def bulk_create(self, creation_specs): + """Creates new categories in bulk. + + :param creation_specs: A collection of creation specifications, as detailed for create. + :type creation_specs: iterable[dict] + :returns: JSON response from the server + :rtype: :py:class:`dict` + """ + body = "\n".join([json.dumps(s) for s in creation_specs]).encode("utf-8") + return ( + self.client.post( + self.api_path + ":bulk", + headers={"Content-Encoding": "utf-8"}, + data=body, + ) + .successful() + .json() + ) + + # super.__repr__ is sufficient diff --git a/tamr_unify_client/categorization/category/resource.py b/tamr_unify_client/categorization/category/resource.py new file mode 100644 index 00000000..90978383 --- /dev/null +++ b/tamr_unify_client/categorization/category/resource.py @@ -0,0 +1,144 @@ +from copy import deepcopy + +from tamr_unify_client.base_resource import BaseResource + + +class Category(BaseResource): + """A category of a taxonomy""" + + @classmethod + def from_json(cls, client, data, api_path=None): + return super().from_data(client, data, api_path) + + @property + def name(self): + """:type: str""" + return self._data.get("name") + + @property + def description(self): + """:type: str""" + return self._data.get("description") + + @property + def path(self): + """:type: list[str]""" + return self._data.get("path")[:] + + def parent(self): + """Gets the parent Category of this one, or None if it is a tier 1 category + + :returns: The parent Category or None + :rtype: :class:`~tamr_unify_client.categorization.category.resource.Category` + """ + parent = self._data.get("parent") + if parent: + alias = self.api_path.rsplit("/", 1)[0] + "/" + parent.split("/")[-1] + resource_json = self.client.get(alias).successful().json() 
+ return Category.from_json(self.client, resource_json, alias) + else: + return None + + def spec(self): + """Returns this category's spec. + + :return: The spec for the category. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.CategorySpec` + """ + return CategorySpec.of(self) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"relative_id={self.relative_id!r}, " + f"name={self.name!r}," + f"path={'/'.join(self.path)!r}," + f"description={self.description!r})" + ) + + +class CategorySpec: + """A representation of the server view of a category.""" + + def __init__(self, client, data, api_path): + self.client = client + self._data = data + self.api_path = api_path + + @staticmethod + def of(resource): + """Creates a category spec from a category. + + :param resource: The existing category. + :type resource: :class:`~tamr_unify_client.categorization.category.resource.Category` + :return: The corresponding category spec. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.CategorySpec` + """ + return CategorySpec( + resource.client, deepcopy(resource._data), resource.api_path + ) + + @staticmethod + def new(): + """Creates a blank spec that could be used to construct a new category. + + :return: The empty spec. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.CategorySpec` + """ + return CategorySpec(None, {}, None) + + def from_data(self, data): + """Creates a spec with the same client and API path as this one, but new data. + + :param data: The data for the new spec. + :type data: dict + :return: The new spec. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.CategorySpec` + """ + return CategorySpec(self.client, data, self.api_path) + + def to_dict(self): + """Returns a version of this spec that conforms to the API representation. + + :returns: The spec's dict. 
+ :rtype: dict + """ + return deepcopy(self._data) + + def with_name(self, new_name): + """Creates a new spec with the same properties, updating name. + + :param new_name: The new name. + :type new_name: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.CategorySpec` + """ + return self.from_data({**self._data, "name": new_name}) + + def with_description(self, new_description): + """Creates a new spec with the same properties, updating description. + + :param new_description: The new description. + :type new_description: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.CategorySpec` + """ + return self.from_data({**self._data, "description": new_description}) + + def with_path(self, new_path): + """Creates a new spec with the same properties, updating path. + + :param new_path: The new path. + :type new_path: list[str] + :return: The new spec. + :rtype: :class:`~tamr_unify_client.categorization.category.resource.CategorySpec` + """ + return self.from_data({**self._data, "path": new_path}) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"dict={self._data})" + ) diff --git a/tamr_unify_client/categorization/project.py b/tamr_unify_client/categorization/project.py new file mode 100644 index 00000000..580eeafe --- /dev/null +++ b/tamr_unify_client/categorization/project.py @@ -0,0 +1,45 @@ +from tamr_unify_client.base_model import MachineLearningModel +from tamr_unify_client.categorization.taxonomy import Taxonomy +from tamr_unify_client.project.resource import Project + + +class CategorizationProject(Project): + """A Categorization project in Tamr.""" + + def model(self): + """Machine learning model for this Categorization project. + Learns from verified labels and predicts categorization labels for unlabeled records. + + :returns: The machine learning model for categorization. 
+ :rtype: :class:`~tamr_unify_client.base_model.MachineLearningModel` + """ + alias = self.api_path + "/categorizations/model" + return MachineLearningModel(self.client, None, alias) + + def create_taxonomy(self, creation_spec): + """Creates a :class:`~tamr_unify_client.categorization.taxonomy.Taxonomy` for this project. + + A taxonomy cannot already be associated with this project. + + :param creation_spec: The creation specification for the taxonomy, which can include name. + :type creation_spec: dict + :returns: The new Taxonomy + :rtype: :class:`~tamr_unify_client.categorization.taxonomy.Taxonomy` + """ + alias = self.api_path + "/taxonomy" + resource_json = self.client.post(alias, json=creation_spec).successful().json() + return Taxonomy.from_json(self.client, resource_json, alias) + + def taxonomy(self): + """Retrieves the :class:`~tamr_unify_client.categorization.taxonomy.Taxonomy` associated with this project. + If a taxonomy is not already associated with this project, + call :func:`~tamr_unify_client.categorization.project.CategorizationProject.create_taxonomy` first. 
+ + :returns: The project's Taxonomy + :rtype: :class:`~tamr_unify_client.categorization.taxonomy.Taxonomy` + """ + alias = self.api_path + "/taxonomy" + resource_json = self.client.get(alias).successful().json() + return Taxonomy.from_json(self.client, resource_json, alias) + + # super.__repr__ is sufficient diff --git a/tamr_unify_client/categorization/taxonomy.py b/tamr_unify_client/categorization/taxonomy.py new file mode 100644 index 00000000..563cbf95 --- /dev/null +++ b/tamr_unify_client/categorization/taxonomy.py @@ -0,0 +1,32 @@ +from tamr_unify_client.base_resource import BaseResource +from tamr_unify_client.categorization.category.collection import CategoryCollection + + +class Taxonomy(BaseResource): + """A project's taxonomy""" + + @classmethod + def from_json(cls, client, data, api_path): + return super().from_data(client, data, api_path) + + @property + def name(self): + """:type: str""" + return self._data.get("name") + + def categories(self): + """Retrieves the categories of this taxonomy. + + :returns: A collection of the taxonomy categories. + :rtype: :class:`~tamr_unify_client.categorization.category.collection.CategoryCollection` + """ + alias = self.api_path + "/categories" + return CategoryCollection(self.client, alias) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." 
+ f"{self.__class__.__qualname__}(" + f"relative_id={self.relative_id!r}, " + f"name={self.name!r})" + ) diff --git a/tamr_unify_client/client.py b/tamr_unify_client/client.py index bb23b42a..5f674ddd 100644 --- a/tamr_unify_client/client.py +++ b/tamr_unify_client/client.py @@ -1,62 +1,52 @@ +import logging +from typing import Optional from urllib.parse import urljoin import requests -from requests import Response +import requests.auth +import requests.exceptions -from tamr_unify_client.models.dataset.collection import DatasetCollection -from tamr_unify_client.models.project.collection import ProjectCollection +from tamr_unify_client.dataset.collection import DatasetCollection +from tamr_unify_client.project.collection import ProjectCollection +import tamr_unify_client.response as response -# monkey-patch Response.successful +logger = logging.getLogger(__name__) +response._monkey_patch() -def successful(self): - """Checks that this response did not encounter an HTTP error (i.e. status code indicates success: 2xx, 3xx). - :raises :class:`requests.exceptions.HTTPError`: If an HTTP error is encountered. - :return: The calling response (i.e. ``self``). - :rtype: :class:`requests.Response` - """ - self.raise_for_status() - return self +class Client: + """Python Client for Tamr API. + Each client is specific to a specific origin (protocol, host, port). -Response.successful = successful + Args: + auth: Tamr-compatible Authentication provider. + **Recommended**: use one of the classes described in :ref:`authentication` + host: Host address of remote Tamr instance (e.g. ``'10.0.10.0'``) + protocol: Either ``'http'`` or ``'https'`` + port: Tamr instance main port + base_path: Base API path. Requests made by this client will be relative to this path. + session: Session to use for API calls. If none is provided, will use a new :class:`requests.Session`. -class Client: - """Python Client for Unify API. Each client is specific to a specific origin - (protocol, host, port). 
- - :param auth: Unify-compatible Authentication provider. - **Recommended**: use one of the classes described in :ref:`authentication` - :type auth: :class:`requests.auth.AuthBase` - :param host: Host address of remote Unify instance (e.g. `10.0.10.0`). Default: `'localhost'` - :type host: str - :param protocol: Either `'http'` or `'https'`. Default: `'http'` - :type protocol: str - :param port: Unify instance main port. Default: `9100` - :type port: int - :param base_path: Base API path. Requests made by this client will be relative to this path. Default: `'api/versioned/v1/'` - :type base_path: str - :param session: Session to use for API calls. Default: A new default `requests.Session()`. - :type session: requests.Session - - Usage: - >>> import tamr_unify_client as api + Example: + >>> from tamr_unify_client import Client >>> from tamr_unify_client.auth import UsernamePasswordAuth >>> auth = UsernamePasswordAuth('my username', 'my password') - >>> local = api.Client(auth) # on http://localhost:9100 - >>> remote = api.Client(auth, protocol='https', host='10.0.10.0') # on https://10.0.10.0:9100 + >>> tamr_local = Client(auth) # on http://localhost:9100 + >>> tamr_remote = Client(auth, protocol='https', host='10.0.10.0') # on https://10.0.10.0:9100 + >>> tamr_remote = Client(auth, protocol='https', host='10.0.10.0', port=None) # on https://10.0.10.0 """ def __init__( self, - auth, - host="localhost", - protocol="http", - port=9100, - base_path="api/versioned/v1/", - session=None, + auth: requests.auth.AuthBase, + host: str = "localhost", + protocol: str = "http", + port: Optional[int] = 9100, + base_path: str = "/api/versioned/v1/", + session: Optional[requests.Session] = None, ): self.auth = auth self.host = host @@ -64,47 +54,47 @@ def __init__( self.port = port self.base_path = base_path self.session = session or requests.Session() + self.session.auth = auth self._projects = ProjectCollection(self) self._datasets = DatasetCollection(self) - # logging - 
self.logger = None - # https://docs.python.org/3/howto/logging-cookbook.html#implementing-structured-logging + if not self.base_path.startswith("/"): + self.base_path = "/" + self.base_path - def default_log_entry(method, url, response): - return f"{method} {url} : {response.status_code}" - - self.log_entry = None + if not self.base_path.endswith("/"): + self.base_path = self.base_path + "/" @property - def origin(self): - """HTTP origin i.e. ``<protocol>://<host>[:<port>]``. - For additional information, see `MDN web docs `_ . + def origin(self) -> str: + """HTTP origin i.e. :code:`<protocol>://<host>[:<port>]`. - :type: str - """ - return f"{self.protocol}://{self.host}:{self.port}" - - def request(self, method, endpoint, **kwargs): - """Sends an authenticated request to the server. The URL for the request - will be ``"<origin>/<base_path>/<endpoint>"``. - - :param method: The HTTP method for the request to be sent. - :type method: str - :param endpoint: API endpoint to call (relative to the Base API path for this client). - :type endpoint: str - :return: HTTP response - :rtype: :class:`requests.Response` + For additional information, see `MDN web docs `_ . """ - url = urljoin(self.origin + "/" + self.base_path, endpoint) - response = self.session.request(method, url, auth=self.auth, **kwargs) + if self.port is None: + return f"{self.protocol}://{self.host}" + else: + return f"{self.protocol}://{self.host}:{self.port}" + + def request(self, method: str, endpoint: str, **kwargs) -> requests.Response: + """Sends a request to Tamr. + + The URL for the request will be ``<origin>/<base_path>/<endpoint>``. + The request is authenticated via :attr:`Client.auth`. - # logging - if self.logger: - log_message = self.log_entry(method, url, response) - self.logger.info(log_message) + Args: + method: The HTTP method to use (e.g. `'GET'` or `'POST'`) + endpoint: API endpoint to call (relative to the Base API path for this client). 
+ Returns: + HTTP response from the Tamr server + """ + url = urljoin(self.origin + self.base_path, endpoint) + response = self.session.request(method, url, **kwargs) + + logger.info( + f"{response.request.method} {response.url} : {response.status_code}" + ) return response def get(self, endpoint, **kwargs): @@ -128,59 +118,23 @@ def delete(self, endpoint, **kwargs): return self.request("DELETE", endpoint, **kwargs) @property - def projects(self): - """Collection of all projects on this Unify instance. + def projects(self) -> ProjectCollection: + """Collection of all projects on this Tamr instance. - :return: Collection of all projects. - :rtype: :class:`~tamr_unify_client.models.ProjectCollection` + Returns: + Collection of all projects. """ return self._projects @property - def datasets(self): - """Collection of all datasets on this Unify instance. + def datasets(self) -> DatasetCollection: + """Collection of all datasets on this Tamr instance. - :return: Collection of all datasets. - :rtype: :class:`~tamr_unify_client.models.DatasetCollection` + Returns: + Collection of all datasets. """ return self._datasets - def create_project(self, project_creation_spec): - """ - Create a Project in Unify - - :param project_creation_spec: Project creation specification should be formatted as specified in the `Public Docs for Creating a Project `_. - :type project_creation_spec: dict[str, str] - :returns: The created Project - :rtype: :class:`~tamr_unify_client.models.project.resource.Project` - """ - from tamr_unify_client.models.project.resource import Project - - data = ( - self.post(self.projects.api_path, json=project_creation_spec) - .successful() - .json() - ) - return Project.from_json(self, data) - - def create_dataset(self, dataset_creation_spec): - """ - Create a Dataset in Unify - - :param dataset_creation_spec: Project creation specification should be formatted as specified in the `Public Docs for Creating a Dataset `_. 
- :type dataset_creation_spec: dict[str, str] - :returns: The created Dataset - :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - """ - from tamr_unify_client.models.dataset.resource import Dataset - - data = ( - self.post(self.datasets.api_path, json=dataset_creation_spec) - .successful() - .json() - ) - return Dataset.from_json(self, data) - def __repr__(self): # Show only the type `auth` to mitigate any security concerns. return ( diff --git a/tamr_unify_client/dataset/collection.py b/tamr_unify_client/dataset/collection.py new file mode 100644 index 00000000..fe07deb6 --- /dev/null +++ b/tamr_unify_client/dataset/collection.py @@ -0,0 +1,201 @@ +import warnings + +from requests.exceptions import HTTPError + +from tamr_unify_client.base_collection import BaseCollection +from tamr_unify_client.dataset.resource import Dataset + + +class DatasetCollection(BaseCollection): + """Collection of :class:`~tamr_unify_client.dataset.resource.Dataset` s. + + :param client: Client for API call delegation. + :type client: :class:`~tamr_unify_client.Client` + :param api_path: API path used to access this collection. + E.g. ``"projects/1/inputDatasets"``. + Default: ``"datasets"``. + :type api_path: str + """ + + def __init__(self, client, api_path="datasets"): + super().__init__(client, api_path) + + def by_resource_id(self, resource_id): + """Retrieve a dataset by resource ID. + + :param resource_id: The resource ID. E.g. ``"1"`` + :type resource_id: str + :returns: The specified dataset. + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + return super().by_resource_id("datasets", resource_id) + + def by_relative_id(self, relative_id): + """Retrieve a dataset by relative ID. + + :param relative_id: The relative ID. E.g. ``"datasets/1"`` + :type relative_id: str + :returns: The specified dataset. 
+ :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + return super().by_relative_id(Dataset, relative_id) + + def by_external_id(self, external_id): + """Retrieve a dataset by external ID. + + :param external_id: The external ID. + :type external_id: str + :returns: The specified dataset, if found. + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + :raises KeyError: If no dataset with the specified external_id is found + :raises LookupError: If multiple datasets with the specified external_id are found + """ + return super().by_external_id(Dataset, external_id) + + def stream(self): + """Stream datasets in this collection. Implicitly called when iterating + over this collection. + + :returns: Stream of datasets. + :rtype: Python generator yielding :class:`~tamr_unify_client.dataset.resource.Dataset` + + Usage: + >>> for dataset in collection.stream(): # explicit + >>> do_stuff(dataset) + >>> for dataset in collection: # implicit + >>> do_stuff(dataset) + """ + return super().stream(Dataset) + + def by_name(self, dataset_name): + """Lookup a specific dataset in this collection by exact-match on name. + + :param dataset_name: Name of the desired dataset. + :type dataset_name: str + :return: Dataset with matching name in this collection. + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + :raises KeyError: If no dataset with specified name was found. + """ + for dataset in self: + if dataset.name == dataset_name: + return dataset + raise KeyError(f"No dataset found with name: {dataset_name}") + + def delete_by_resource_id(self, resource_id, cascade=False): + """Deletes a dataset from this collection by resource_id. Optionally deletes all derived datasets as well. + + :param resource_id: The resource id of the dataset in this collection to delete. + :type resource_id: str + :param cascade: Whether to delete all datasets derived from the deleted one. Optional, default is `False`. 
+ Do not use this option unless you are certain you need it as it can have unintended consequences. + :type cascade: bool + :return: HTTP response from the server. + :rtype: :class:`requests.Response` + """ + params = {"cascade": cascade} + path = f"{self.api_path}/{resource_id}" + response = self.client.delete(path, params=params).successful() + return response + + def create(self, creation_spec): + """ + Create a Dataset in Tamr + + :param creation_spec: Dataset creation specification should be formatted as specified in the `Public Docs for Creating a Dataset `_. + :type creation_spec: dict[str, str] + :returns: The created Dataset + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + data = self.client.post(self.api_path, json=creation_spec).successful().json() + return Dataset.from_json(self.client, data) + + def create_from_dataframe( + self, df, primary_key_name, dataset_name, ignore_nan=None + ): + """Creates a dataset in this collection with the given name, creates an attribute for each column in the `df` + (with `primary_key_name` as the key attribute), and upserts a record for each row of `df`. + + Each attribute has the default type `ARRAY[STRING]`, besides the key attribute, which will have type `STRING`. + + This function attempts to ensure atomicity, but it is not guaranteed. If an error occurs while creating + attributes or records, an attempt will be made to delete the dataset that was created. However, if this + request errors, it will not try again. + + :param df: The data to create the dataset with. + :type df: :class:`pandas.DataFrame` + :param primary_key_name: The name of the primary key of the dataset. Must be a column of `df`. + :type primary_key_name: str + :param dataset_name: What to name the dataset in Tamr. There cannot already be a dataset with this name. + :type dataset_name: str + :param ignore_nan: Legacy parameter that does nothing + :type ignore_nan: bool + :returns: The newly created dataset. 
+ :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + :raises KeyError: If `primary_key_name` is not a column in `df`. + :raises CreationError: If a step in creating the dataset fails. + """ + if ignore_nan is not None: + warnings.warn( + "'ignore_nan' is deprecated. DataFrame `NaN`s are always ignored in upsert", + DeprecationWarning, + ) + if primary_key_name not in df.columns: + raise KeyError(f"{primary_key_name} is not an attribute of the data") + + creation_spec = {"name": dataset_name, "keyAttributeNames": [primary_key_name]} + try: + dataset = self.create(creation_spec) + except HTTPError: + raise CreationError("Dataset was not created") + # after this point, if a request fails, try to undo the change by deleting this dataset + + attributes = dataset.attributes + for col in df.columns: + if col == primary_key_name: + # this attribute already exists, so don't create it again + continue + + attr_spec = { + "name": col, + "type": {"baseType": "ARRAY", "innerType": {"baseType": "STRING"}}, + } + try: + attributes.create(attr_spec) + except HTTPError: + self._handle_creation_failure(dataset, "An attribute was not created") + + try: + response = dataset.upsert_from_dataframe( + df, primary_key_name=primary_key_name + ) + except HTTPError: + self._handle_creation_failure(dataset, "Records could not be created") + + if not response["allCommandsSucceeded"]: + self._handle_creation_failure(dataset, "Some records had validation errors") + + return dataset + + def _handle_creation_failure(self, dataset, error): + """Attempts to make create_from_dataframe atomic by deleting the created dataset in the event of later failure. + However, this does not guarantee atomicity: if the request to delete the dataset fails, it will not retry. + + :param dataset: The created dataset to delete. + :type dataset: :class:`~tamr_unify_client.dataset.resource.Dataset` + :param error: The error that caused the function to fail. 
class CreationError(Exception):
    """Raised when :func:`~tamr_unify_client.dataset.collection.DatasetCollection.create_from_dataframe` cannot complete."""

    def __init__(self, error_message):
        # Hand the message to Exception so str(exc) and exc.args carry it.
        super(CreationError, self).__init__(error_message)
@property
def name(self):
    """:type: str"""
    # Missing key yields None (dict.get default).
    resource_data = self._data
    return resource_data.get("name")
@property
def tags(self):
    """Tags stored for this dataset.

    A copy is returned, so mutating the result does not touch the resource's
    own data. When the ``tags`` entry is absent or null, an empty list is
    returned instead of failing on the slice.

    :type: list[str]
    """
    stored = self._data.get("tags")
    if stored is None:
        return []
    return stored[:]
def upsert_from_dataframe(
    self,
    df: "pd.DataFrame",
    *,
    primary_key_name: str,
    ignore_nan: Optional[bool] = None,
) -> dict:
    """Upserts a record for each row of `df` with attributes for each column in `df`.

    Args:
        df: The data to upsert records from.
        primary_key_name: The name of the primary key of the dataset. Must be a column of `df`.
        ignore_nan: Legacy parameter that does nothing. Deprecated.

    Returns:
        JSON response body from the server.

    Raises:
        KeyError: If `primary_key_name` is not a column in `df`.

    """
    if ignore_nan is not None:
        warnings.warn(
            "'ignore_nan' is deprecated. DataFrame `NaN`s are always ignored in upsert",
            DeprecationWarning,
            # Point the warning at the caller so default filters can show it.
            stacklevel=2,
        )
    if primary_key_name not in df.columns:
        raise KeyError(f"{primary_key_name} is not an attribute of the data")

    # serialize records via to_json to handle `np.nan` values
    serialized_records = ((pk, row.to_json()) for pk, row in df.iterrows())
    # The row's own columns are unpacked last, so the value found in the
    # `primary_key_name` column overrides the index label `pk`.
    records = (
        {primary_key_name: pk, **json.loads(row)} for pk, row in serialized_records
    )
    return self.upsert_records(records, primary_key_name)
def delete_records_by_id(self, record_ids):
    """Deletes the specified records.

    :param record_ids: The IDs of the records to delete.
    :type record_ids: iterable
    :return: JSON response body from the server.
    :rtype: dict
    """

    def _as_deletion(record_id):
        # One DELETE command per record ID.
        return {"action": "DELETE", "recordId": record_id}

    # map() stays lazy, so the IDs are consumed only as the updates are sent.
    return self._update_records(map(_as_deletion, record_ids))
def records(self):
    """Stream this dataset's records as Python dictionaries.

    The ``/records`` response is read as a stream and each non-empty line is
    parsed as one JSON document. Empty lines (such as keep-alive newlines)
    carry no record and are skipped instead of being handed to the JSON parser.

    :return: Stream of records.
    :rtype: Python generator yielding :py:class:`dict`
    """
    with self.client.get(self.api_path + "/records", stream=True) as response:
        for line in response.iter_lines():
            if not line:
                # filter out keep-alive new lines
                continue
            yield json.loads(line)
def from_geo_features(self, features, geo_attr=None):
    """Upsert this dataset from a geospatial FeatureCollection or iterable of Features.

    `features` can be:

    - An object that implements ``__geo_interface__`` as a FeatureCollection
      (see https://gist.github.com/sgillies/2217756)
    - An iterable of features, where each element is a feature dictionary or an object
      that implements the ``__geo_interface__`` as a Feature
    - A map where the "features" key contains an iterable of features

    See: geopandas.GeoDataFrame.from_features()

    If geo_attr is provided, then the named Tamr attribute will be used for the geometry.
    If geo_attr is not provided, then the first attribute on the dataset with geometry type
    will be used for the geometry.

    :param features: geospatial features
    :param geo_attr: (optional) name of the Tamr attribute to use for the feature's geometry
    :type geo_attr: str
    :returns: JSON response body from server.
    :rtype: :py:class:`dict`
    """
    # Unwrap an object exposing the geo interface into its mapping form.
    if hasattr(features, "__geo_interface__"):
        features = features.__geo_interface__
    # A FeatureCollection mapping carries the actual features under "features".
    is_collection = (
        hasattr(features, "get") and features.get("type") == "FeatureCollection"
    )
    if is_collection:
        features = features["features"]

    key_names = self.key_attribute_names
    # Single-attribute keys use "recordId"; anything else uses "compositeRecordId".
    id_field = "recordId" if len(key_names) == 1 else "compositeRecordId"

    geometry_attr = self._geo_attr if geo_attr is None else geo_attr

    updates = self._features_to_updates(features, id_field, key_names, geometry_attr)
    return self._update_records(updates)
def itergeofeatures(self, geo_attr=None):
    """Returns an iterator that yields feature dictionaries that comply with __geo_interface__

    See https://gist.github.com/sgillies/2217756

    :param geo_attr: (optional) name of the Tamr attribute to use for the feature's geometry
    :type geo_attr: str
    :return: stream of features
    :rtype: Python generator yielding :py:class:`dict[str, object]`
    """
    key_names = self.key_attribute_names

    # A single key attribute gives a scalar id; several give a list of values.
    if len(key_names) == 1:

        def extract_key(rec):
            return rec[key_names[0]]

    else:

        def extract_key(rec):
            return [rec[name] for name in key_names]

    geometry_attr = self._geo_attr if geo_attr is None else geo_attr

    for rec in self.records():
        yield self._record_to_feature(rec, extract_key, key_names, geometry_attr)
@staticmethod
def _feature_to_record(feature, key_attrs, geo_attr):
    """Convert a Python Geo Interface Feature to a Tamr record

    feature can be a dict representing a Geospatial Feature, or a Feature object
    that implements the __geo_interface__ property.

    :param feature: Python Geo Interface Feature
    :param key_attrs: Sequence of attributes that comprise the primary key for the record
    :param geo_attr: The singular attribute on the record to use for the geometry
    :return: dict
    :raises ValueError: If the feature has no ``id`` or its ``id`` is ``None``.
    """

    if hasattr(feature, "__geo_interface__"):
        feature = feature.__geo_interface__

    record = {}

    # Feature properties become top-level record attributes.
    props = feature.get("properties")
    if props:
        for prop in props:
            record[prop] = props[prop]

    geometry = feature.get("geometry")
    if geometry:
        geo_type = geometry["type"]
        # Convert e.g. "MultiLineString" -> "multiLineString"
        geo_type = geo_type[0].lower() + geo_type[1:]
        record[geo_attr] = {geo_type: geometry["coordinates"]}

    bbox = feature.get("bbox")
    if bbox:
        record["bbox"] = bbox
    if "id" not in feature or feature["id"] is None:
        raise ValueError("id must have a non-null value")
    if key_attrs[1:]:
        # Composite key: the feature id is a sequence aligned with key_attrs.
        key_values = feature["id"]

        for i, attr in enumerate(key_attrs):
            record[attr] = key_values[i]
    else:
        record[key_attrs[0]] = feature["id"]
    return record


@staticmethod
def _features_to_updates(features, id_attr, key_attrs, geo_attr):
    """Yield one CREATE update command per feature.

    Each element of `features` may be a feature dictionary or an object that
    exposes ``__geo_interface__``; such objects are resolved to their mapping
    form before the id is read, so both kinds of element are accepted.

    :param features: Iterable of features
    :param id_attr: Key under which the feature id is placed in each update
    :param key_attrs: Sequence of attributes that comprise the primary key for the record
    :param geo_attr: The attribute on the record to use for the geometry
    :return: Generator of update dictionaries
    """
    for feature in features:
        # Objects exposing the geo interface are not subscriptable themselves;
        # resolve them first so feature["id"] works for both element kinds.
        if hasattr(feature, "__geo_interface__"):
            feature = feature.__geo_interface__
        yield {
            "action": "CREATE",
            id_attr: feature["id"],
            "record": Dataset._feature_to_record(feature, key_attrs, geo_attr),
        }
def with_name(self, new_name):
    """Creates a new spec with the same properties, updating name.

    :param new_name: The new name.
    :type new_name: str
    :return: A new spec.
    :rtype: :class:`~tamr_unify_client.dataset.resource.DatasetSpec`
    """
    # Work on a shallow copy so this spec's own data is left untouched.
    updated = dict(self._data)
    updated["name"] = new_name
    return self.from_data(updated)
class DatasetURI:
    """
    Identifier of a dataset.

    :param client: Queried dataset's client.
    :type client: :class:`~tamr_unify_client.client.Client`
    :param uri: Queried dataset's dataset ID.
    :type uri: :py:class:`str`
    """

    def __init__(self, client, uri):
        self.client = client
        self._uri = uri

    @property
    def resource_id(self):
        """:type: str"""
        # The resource ID is the last "/"-separated segment of the URI.
        return self._uri.split("/")[-1]

    @property
    def relative_id(self):
        """:type: str"""
        return "datasets/" + self.resource_id

    @property
    def uri(self):
        """:type: str"""
        return self._uri

    def dataset(self):
        """Fetch the dataset that this identifier points to.

        :return: A Tamr dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        return self.client.datasets.by_resource_id(self.resource_id)

    def __repr__(self):
        # Use !r so the URI is quoted properly inside the parentheses.
        return (
            f"{self.__class__.__module__}."
            f"{self.__class__.__qualname__}("
            f"{self.uri!r})"
        )
def dataset(self):
    """Retrieves the :class:`~tamr_unify_client.dataset.resource.Dataset` this use represents.

    :return: The dataset being used.
    :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
    """
    # Only the final "/"-separated segment of the dataset ID is the resource ID.
    *_, resource_id = self.dataset_id.split("/")
    return self.client.datasets.by_resource_id(resource_id)
def records(self):
    """Stream this object's records as Python dictionaries.

    The ``/records`` response is read as a stream and each non-empty line is
    parsed as one JSON document. Empty lines (such as keep-alive newlines)
    carry no record and are skipped instead of being handed to the JSON parser.

    :return: Stream of records.
    :rtype: Python generator yielding :py:class:`dict`
    """
    with self.client.get(self.api_path + "/records", stream=True) as response:
        for line in response.iter_lines():
            if not line:
                # filter out keep-alive new lines
                continue
            yield json.loads(line)
def refresh(self, **options):
    """Updates the estimated pair counts if needed.

    The pair count estimates are updated on the server; you will need to call
    :func:`~tamr_unify_client.mastering.project.MasteringProject.estimate_pairs`
    to retrieve the updated estimate.

    :param ``**options``: Options passed to underlying :class:`~tamr_unify_client.operation.Operation` .
        See :func:`~tamr_unify_client.operation.Operation.apply_options` .
    :returns: The refresh operation.
    :rtype: :class:`~tamr_unify_client.operation.Operation`
    """
    refresh_path = self.api_path + ":refresh"
    # successful() is called before the response is turned into an Operation.
    checked_response = self.client.post(refresh_path).successful()
    return Operation.from_response(self.client, checked_response).apply_options(
        **options
    )
def pairs(self):
    """Record pairs generated by Tamr's binning model.
    Pairs are displayed on the "Pairs" page in the Tamr UI.

    Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
    this dataset to regenerate pairs according to the latest binning model.

    :returns: The record pairs represented as a dataset.
    :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
    """
    # No resource JSON is fetched here: the dataset is built with data=None
    # and only carries the record-pairs path.
    return Dataset(self.client, None, self.api_path + "/recordPairs")
+ :rtype: :class:`~tamr_unify_client.base_model.MachineLearningModel` + """ + alias = self.api_path + "/recordPairsWithPredictions/model" + return MachineLearningModel(self.client, None, alias) + + def high_impact_pairs(self): + """High-impact pairs as a dataset. Tamr labels pairs as "high-impact" if + labeling these pairs would help it learn most quickly (i.e. "Active learning"). + + High-impact pairs are displayed with a ⚡ lightning bolt icon on the + "Pairs" page in the Tamr UI. + + Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from + this dataset to produce new high-impact pairs according to the latest + pair-matching model. + + :returns: The high-impact pairs represented as a dataset. + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + alias = self.api_path + "/highImpactPairs" + return Dataset(self.client, None, alias) + + def record_clusters(self): + """Record Clusters as a dataset. Tamr clusters labeled pairs using pairs + model. These clusters populate the cluster review page and get transient + cluster ids, rather than published cluster ids (i.e., "Permanent Ids") + + Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from + this dataset to generate clusters based on the latest pair-matching model. + + :returns: The record clusters represented as a dataset. + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + alias = self.api_path + "/recordClusters" + return Dataset(self.client, None, alias) + + def published_clusters(self): + """Published record clusters generated by Tamr's pair-matching model. + + :returns: The published clusters represented as a dataset. + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + + unified_dataset = self.unified_dataset() + + # Replace this workaround with a direct API call once API + # is fixed. APIs that need to work are: fetching the dataset and + # being able to call refresh on resulting dataset.
Until then, we grab + # the dataset by constructing its name from the corresponding Unified Dataset's name + name = unified_dataset.name + "_dedup_published_clusters" + canonical = self.client.datasets.by_name(name) + resource_json = canonical._data + alias = self.api_path + "/publishedClusters" + return Dataset.from_json(self.client, resource_json, alias) + + def published_clusters_configuration(self): + """Retrieves published clusters configuration for this project. + + :returns: The published clusters configuration + :rtype: :class:`~tamr_unify_client.mastering.published_cluster.configuration.PublishedClustersConfiguration` + """ + alias = self.api_path + "/publishedClustersConfiguration" + resource_json = self.client.get(alias).successful().json() + return PublishedClustersConfiguration.from_json( + self.client, resource_json, alias + ) + + def published_cluster_ids(self): + """Retrieves published cluster IDs for this project. + + :returns: The published cluster ID dataset. + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + # Replace this workaround with a direct API call once API + # is fixed. APIs that need to work are: fetching the dataset and + # being able to call refresh on resulting dataset. Until then, we grab + # the dataset by constructing its name from the corresponding Unified Dataset's name + unified_dataset = self.unified_dataset() + name = unified_dataset.name + "_dedup_all_persistent_ids" + dataset = self.client.datasets.by_name(name) + + path = self.api_path + "/allPublishedClusterIds" + return Dataset.from_json(self.client, dataset._data, path) + + def published_cluster_stats(self): + """Retrieves published cluster stats for this project. + + :returns: The published cluster stats dataset. + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + # Replace this workaround with a direct API call once API + # is fixed. 
APIs that need to work are: fetching the dataset and + # being able to call refresh on resulting dataset. Until then, we grab + # the dataset by constructing its name from the corresponding Unified Dataset's name + unified_dataset = self.unified_dataset() + name = unified_dataset.name + "_dedup_published_cluster_stats" + dataset = self.client.datasets.by_name(name) + + path = self.api_path + "/publishedClusterStats" + return Dataset.from_json(self.client, dataset._data, path) + + def published_cluster_versions(self, cluster_ids): + """Retrieves version information for the specified published clusters. + See https://docs.tamr.com/reference#retrieve-published-clusters-given-cluster-ids. + + :param cluster_ids: The persistent IDs of the clusters to get version information for. + :type cluster_ids: iterable[str] + :return: A stream of the published clusters. + :rtype: Python generator yielding :class:`~tamr_unify_client.mastering.published_cluster.resource.PublishedCluster` + """ + path = self.api_path + "/publishedClusterVersions" + return self._cluster_versions(PublishedCluster, cluster_ids, path) + + def record_published_cluster_versions(self, record_ids): + """Retrieves version information for the published clusters of the given records. + See https://docs.tamr.com/reference#retrieve-published-clusters-given-record-ids. + + :param record_ids: The Tamr IDs of the records to get cluster version information for. + :type record_ids: iterable[str] + :return: A stream of the relevant published clusters. + :rtype: Python generator yielding :class:`~tamr_unify_client.mastering.published_cluster.record.RecordPublishedCluster` + """ + path = self.api_path + "/recordPublishedClusterVersions" + return self._cluster_versions(RecordPublishedCluster, record_ids, path) + + def _cluster_versions(self, cluster_class, ids, endpoint): + """Retrieves version information for published clusters. + + :param cluster_class: The class to create instances of. 
+ :param ids: The IDs of the clusters or records to get version information for. + :type ids: iterable[str] + :param endpoint: The endpoint to call for versions. + :type endpoint: str + :return: A stream of the published clusters. + """ + string_ids = "\n".join(json.dumps(i) for i in ids) + + with self.client.post(endpoint, data=string_ids, stream=True) as response: + for line in response.iter_lines(): + yield cluster_class(json.loads(line)) + + def estimate_pairs(self): + """Returns pair estimate information for a mastering project + + :return: Pairs Estimate information. + :rtype: :class:`~tamr_unify_client.mastering.estimated_pair_counts.EstimatedPairCounts` + """ + alias = self.api_path + "/estimatedPairCounts" + estimate_json = self.client.get(alias).successful().json() + info = EstimatedPairCounts.from_json(self.client, estimate_json, api_path=alias) + return info + + def record_clusters_with_data(self): + """Project's unified dataset with associated clusters. + + :returns: The record clusters with data represented as a dataset + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + unified_dataset = self.unified_dataset() + + # Replace this workaround with a direct API call once API + # is fixed. APIs that need to work are: fetching the dataset and + # being able to call refresh on resulting dataset. Until then, we grab + # the dataset by constructing its name from the corresponding Unified Dataset's name + name = unified_dataset.name + "_dedup_clusters_with_data" + dataset = self.client.datasets.by_name(name) + dataset.api_path = self.api_path + "/recordClustersWithData" + return dataset + + def published_clusters_with_data(self): + """Project's unified dataset with associated clusters. 
+ + :returns: The published clusters with data represented as a dataset + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + + unified_dataset = self.unified_dataset() + name = unified_dataset.name + "_dedup_published_clusters_with_data" + dataset = self.client.datasets.by_name(name) + dataset.api_path = self.api_path + "/publishedClustersWithData" + return dataset + + def binning_model(self): + """ + Binning model for this project. + + :return: Binning model for this project. + :rtype: :class:`~tamr_unify_client.mastering.binning_model.BinningModel` + """ + alias = self.api_path + "/binningModel" + + # Cannot get this resource and so we hard code + resource_json = {"relativeId": alias} + return BinningModel.from_json(self.client, resource_json, alias) + + # super.__repr__ is sufficient diff --git a/tamr_unify_client/mastering/published_cluster/configuration.py b/tamr_unify_client/mastering/published_cluster/configuration.py new file mode 100644 index 00000000..bed402c7 --- /dev/null +++ b/tamr_unify_client/mastering/published_cluster/configuration.py @@ -0,0 +1,126 @@ +from copy import deepcopy + +from tamr_unify_client.base_resource import BaseResource + + +class PublishedClustersConfiguration(BaseResource): + """ + The configuration of published clusters in a project. + + See https://docs.tamr.com/reference#the-published-clusters-configuration-object + """ + + @classmethod + def from_json(cls, client, data, api_path): + return super().from_data(client, data, api_path) + + @property + def relative_id(self): + """:type: str""" + # api_path is alias when it exists, and relative_id when it does not. + # this distinction is useful for things like refreshing a unified dataset, + # where using the relative_id would hit + # /datasets/{id}:refresh + # rather than + # /projects/{id}/unifiedDataset:refresh. + # Since cluster configurations don't currently have that kind of aliasing, + # using api_path is always correct. 
+ # If configurations ever get aliased, this will need to be updated. + # This is confusing; there's an RFC for suggestions to improve this + # #64 https://github.com/Datatamer/unify-client-python/issues/64 + # "Conflation between 'api_path', 'relative_id' / 'relativeId', and + # BaseResource ctor 'alias'" + return self.api_path + + @property + def versions_time_to_live(self): + """:type: str""" + return self._data.get("versionsTimeToLive") + + def spec(self): + """Returns a spec representation of this published cluster configuration. + + :return: The published cluster configuration spec. + :rtype: :class:`~tamr_unify_client.mastering.published_cluster.configuration.PublishedClustersConfigurationSpec` + """ + return PublishedClustersConfigurationSpec.of(self) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"relative_id={self.relative_id!r}, " + f"versions_time_to_live={self.versions_time_to_live!r})" + ) + + +class PublishedClustersConfigurationSpec: + """A representation of the server view of published clusters configuration.""" + + def __init__(self, client, data, api_path): + self.client = client + self._data = data + self.api_path = api_path + + @staticmethod + def of(resource): + """Creates a published cluster configuration spec from published cluster configuration. + + :param resource: The existing published cluster configuration. + :type resource: :class:`~tamr_unify_client.mastering.published_cluster.configuration.PublishedClustersConfiguration` + :return: The corresponding published cluster configuration spec. + :rtype: :class:`~tamr_unify_client.mastering.published_cluster.configuration.PublishedClustersConfigurationSpec` + """ + return PublishedClustersConfigurationSpec( + resource.client, deepcopy(resource._data), resource.api_path + ) + + def from_data(self, data): + """Creates a spec with new data. + + :param data: The data for the new spec. + :type data: dict + :return: The new spec.
+ :rtype: :class:`~tamr_unify_client.mastering.published_cluster.configuration.PublishedClustersConfigurationSpec` + """ + return PublishedClustersConfigurationSpec(self.client, data, self.api_path) + + def to_dict(self): + """Returns a version of this spec that conforms to the API representation. + + :returns: The spec's dict. + :rtype: dict + """ + return deepcopy(self._data) + + def with_versions_time_to_live(self, new_versions_time_to_live): + """Creates a new spec with the same properties, updating versions time to live. + + :param new_versions_time_to_live: The new versions time to live. + :type new_versions_time_to_live: str + :return: A new spec. + :rtype: :class:`~tamr_unify_client.mastering.published_cluster.configuration.PublishedClustersConfigurationSpec` + """ + return self.from_data( + {**self._data, "versionsTimeToLive": new_versions_time_to_live} + ) + + def put(self): + """Commits these changes by updating the configuration in Tamr. + + :return: The updated configuration. + :rtype: :class:`~tamr_unify_client.mastering.published_cluster.configuration.PublishedClustersConfiguration` + """ + updated_json = ( + self.client.put(self.api_path, json=self._data).successful().json() + ) + return PublishedClustersConfiguration.from_json( + self.client, updated_json, self.api_path + ) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"dict={self._data})" + ) diff --git a/tamr_unify_client/mastering/published_cluster/metric.py b/tamr_unify_client/mastering/published_cluster/metric.py new file mode 100644 index 00000000..e396d23c --- /dev/null +++ b/tamr_unify_client/mastering/published_cluster/metric.py @@ -0,0 +1,28 @@ +class Metric: + """ A metric for a published cluster. + + This is not a `BaseResource` because it does not have its own API endpoint. + + :param data: The JSON entity representing this cluster. 
+ """ + + def __init__(self, data): + self._data = data + + @property + def name(self): + """:type: str""" + return self._data.get("metricName") + + @property + def value(self): + """:type: str""" + return self._data.get("metricValue") + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"name={self.name!r}, " + f"value={self.value!r})" + ) diff --git a/tamr_unify_client/mastering/published_cluster/record.py b/tamr_unify_client/mastering/published_cluster/record.py new file mode 100644 index 00000000..f627c26b --- /dev/null +++ b/tamr_unify_client/mastering/published_cluster/record.py @@ -0,0 +1,52 @@ +from tamr_unify_client.mastering.published_cluster.record_version import ( + RecordPublishedClusterVersion, +) + + +class RecordPublishedCluster: + """A representation of a published cluster of a record in a mastering project with version information. + See https://docs.tamr.com/reference#retrieve-published-clusters-given-record-ids. + + This is not a `BaseResource` because it does not have its own API endpoint. + + :param data: The JSON entity representing this + :class:`~tamr_unify_client.mastering.published_cluster.record.RecordPublishedCluster`. + """ + + def __init__(self, data): + self._data = data + + @property + def entity_id(self): + """:type: str""" + return self._data.get("entityId") + + @property + def source_id(self): + """:type: str""" + return self._data.get("sourceId") + + @property + def origin_entity_id(self): + """:type: str""" + return self._data.get("originEntityId") + + @property + def origin_source_id(self): + """:type: str""" + return self._data.get("originSourceId") + + @property + def versions(self): + """:type: list[:class:`~tamr_unify_client.mastering.published_cluster.record_version.RecordPublishedClusterVersion`]""" + return [RecordPublishedClusterVersion(v) for v in self._data.get("versions")] + + def __repr__(self): + return ( + f"{self.__class__.__module__}." 
+ f"{self.__class__.__qualname__}(" + f"entity_id={self.entity_id!r}, " + f"source_id={self.source_id!r}, " + f"origin_entity_id={self.origin_entity_id!r}, " + f"origin_source_id={self.origin_source_id!r})" + ) diff --git a/tamr_unify_client/mastering/published_cluster/record_version.py b/tamr_unify_client/mastering/published_cluster/record_version.py new file mode 100644 index 00000000..0694a2f2 --- /dev/null +++ b/tamr_unify_client/mastering/published_cluster/record_version.py @@ -0,0 +1,34 @@ +class RecordPublishedClusterVersion: + """A version of a published cluster in a mastering project. + + This is not a `BaseResource` because it does not have its own API endpoint. + + :param data: The JSON entity representing this version. + """ + + def __init__(self, data): + self._data = data + + @property + def version(self): + """:type: str""" + return self._data.get("version") + + @property + def timestamp(self): + """:type: str""" + return self._data.get("timestamp") + + @property + def cluster_id(self): + """:type: str""" + return self._data.get("clusterId") + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"version={self.version!r}, " + f"timestamp={self.timestamp!r}, " + f"name={self.cluster_id!r})" + ) diff --git a/tamr_unify_client/mastering/published_cluster/resource.py b/tamr_unify_client/mastering/published_cluster/resource.py new file mode 100644 index 00000000..9219ab62 --- /dev/null +++ b/tamr_unify_client/mastering/published_cluster/resource.py @@ -0,0 +1,33 @@ +from tamr_unify_client.mastering.published_cluster.version import ( + PublishedClusterVersion, +) + + +class PublishedCluster: + """A representation of a published cluster in a mastering project with version information. + See https://docs.tamr.com/reference#retrieve-published-clusters-given-cluster-ids. + + This is not a `BaseResource` because it does not have its own API endpoint. 
+ + :param data: The JSON entity representing this :class:`~tamr_unify_client.mastering.published_cluster.resource.PublishedCluster`. + """ + + def __init__(self, data): + self._data = data + + @property + def id(self): + """:type: str""" + return self._data.get("id") + + @property + def versions(self): + """:type: list[:class:`~tamr_unify_client.mastering.published_cluster.version.PublishedClusterVersion`]""" + return [PublishedClusterVersion(v) for v in self._data.get("versions")] + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"id={self.id!r})" + ) diff --git a/tamr_unify_client/mastering/published_cluster/version.py b/tamr_unify_client/mastering/published_cluster/version.py new file mode 100644 index 00000000..7e037ec5 --- /dev/null +++ b/tamr_unify_client/mastering/published_cluster/version.py @@ -0,0 +1,47 @@ +from tamr_unify_client.mastering.published_cluster.metric import Metric + + +class PublishedClusterVersion: + """A version of a published cluster in a mastering project. + + This is not a `BaseResource` because it does not have its own API endpoint. + + :param data: The JSON entity representing this version. + """ + + def __init__(self, data): + self._data = data + + @property + def version(self): + """:type: str""" + return self._data.get("version") + + @property + def timestamp(self): + """:type: str""" + return self._data.get("timestamp") + + @property + def name(self): + """:type: str""" + return self._data.get("name") + + @property + def metrics(self): + """:type: list[:class:`~tamr_unify_client.mastering.published_cluster.metric.Metric`]""" + return [Metric(m) for m in self._data.get("metrics")] + + @property + def record_ids(self): + """:type: list[dict[str, str]]""" + return self._data.get("recordIds") + + def __repr__(self): + return ( + f"{self.__class__.__module__}." 
+ f"{self.__class__.__qualname__}(" + f"version={self.version!r}, " + f"timestamp={self.timestamp!r}, " + f"name={self.name!r})" + ) diff --git a/tamr_unify_client/models/attribute/resource.py b/tamr_unify_client/models/attribute/resource.py deleted file mode 100644 index 753fab77..00000000 --- a/tamr_unify_client/models/attribute/resource.py +++ /dev/null @@ -1,62 +0,0 @@ -from tamr_unify_client.models.attribute.type import AttributeType -from tamr_unify_client.models.base_resource import BaseResource - - -class Attribute(BaseResource): - """ - A Unify Attribute. - - See https://docs.tamr.com/reference#attribute-types - """ - - @classmethod - def from_json(cls, client, data, api_path): - return super().from_data(client, data, api_path) - - @property - def relative_id(self): - """:type: str""" - # api_path is alias when it exists, and relative_id when it does not. - # this distinction is useful for things like refreshing a unified dataset, - # where using the relative_id would hit - # /datasets/{id}:refresh - # rather than - # /projects/{id}/unifiedDataset:refresh. - # Since attributes don't currently have that kind of aliasing, - # using api_path is always correct. - # If attributes ever get aliased, this will need to be updated. 
- # This is confusing; there's an RFC for suggestions to improve this - # #64 https://github.com/Datatamer/unify-client-python/issues/64 - # "Conflation between 'api_path', 'relative_id' / 'relativeId', and - # BaseResource ctor 'alias'" - return self.api_path - - @property - def name(self): - """:type: str""" - return self._data.get("name") - - @property - def description(self): - """:type: str""" - return self._data.get("description") - - @property - def type(self): - """:type: :class:`~tamr_unify_client.models.attribute.type.AttributeType`""" - alias = self.api_path + "/type" - type_json = self._data.get("type") - return AttributeType.from_data(self.client, type_json, alias) - - @property - def is_nullable(self): - """:type: bool""" - return self._data.get("isNullable") - - def __repr__(self): - return ( - f"{self.__class__.__module__}." - f"{self.__class__.__qualname__}(" - f"relative_id={self.relative_id!r}, " - f"name={self.name!r})" - ) diff --git a/tamr_unify_client/models/attribute/type.py b/tamr_unify_client/models/attribute/type.py deleted file mode 100644 index 87bc7b5f..00000000 --- a/tamr_unify_client/models/attribute/type.py +++ /dev/null @@ -1,45 +0,0 @@ -from tamr_unify_client.models.base_resource import BaseResource - - -class AttributeType(BaseResource): - @classmethod - def from_json(cls, client, data, api_path): - return super().from_data(client, data, api_path) - - @property - def relative_id(self): - return self.api_path - - @property - def base_type(self): - """:type: str""" - return self._data.get("baseType") - - @property - def inner_type(self): - """:type: :class:`~tamr_unify_client.models.attribute.type.AttributeType`""" - if "innerType" in self._data: - alias = self.api_path + "/type" - return AttributeType.from_data( - self.client, self._data.get("innerType"), alias - ) - else: - return None - - @property - def attributes(self): - """:type: :class:`~tamr_unify_client.models.attribute.collection.AttributeCollection`""" - alias = 
self.api_path + "/attributes" - collection_json = self._data.get("attributes") - # Import locally to avoid circular dependency - from tamr_unify_client.models.attribute.collection import AttributeCollection - - return AttributeCollection.from_json(self.client, collection_json, alias) - - def __repr__(self): - return ( - f"{self.__class__.__module__}." - f"{self.__class__.__qualname__}(" - f"relative_id={self.relative_id!r}, " - f"base_type={self.base_type!r})" - ) diff --git a/tamr_unify_client/models/dataset/collection.py b/tamr_unify_client/models/dataset/collection.py deleted file mode 100644 index e3b4add5..00000000 --- a/tamr_unify_client/models/dataset/collection.py +++ /dev/null @@ -1,80 +0,0 @@ -from tamr_unify_client.models.base_collection import BaseCollection -from tamr_unify_client.models.dataset.resource import Dataset - - -class DatasetCollection(BaseCollection): - """Collection of :class:`~tamr_unify_client.models.dataset.resource.Dataset` s. - - :param client: Client for API call delegation. - :type client: :class:`~tamr_unify_client.Client` - :param api_path: API path used to access this collection. - E.g. ``"projects/1/inputDatasets"``. - Default: ``"datasets"``. - :type api_path: str - """ - - def __init__(self, client, api_path="datasets"): - super().__init__(client, api_path) - - def by_resource_id(self, resource_id): - """Retrieve a dataset by resource ID. - - :param resource_id: The resource ID. E.g. ``"1"`` - :type resource_id: str - :returns: The specified dataset. - :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - """ - return super().by_resource_id("datasets", resource_id) - - def by_relative_id(self, relative_id): - """Retrieve a dataset by relative ID. - - :param relative_id: The resource ID. E.g. ``"datasets/1"`` - :type relative_id: str - :returns: The specified dataset. 
- :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - """ - return super().by_relative_id(Dataset, relative_id) - - def by_external_id(self, external_id): - """Retrieve a dataset by external ID. - - :param external_id: The external ID. - :type external_id: str - :returns: The specified dataset, if found. - :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - :raises KeyError: If no dataset with the specified external_id is found - :raises LookupError: If multiple datasets with the specified external_id are found - """ - return super().by_external_id(Dataset, external_id) - - def stream(self): - """Stream datasets in this collection. Implicitly called when iterating - over this collection. - - :returns: Stream of datasets. - :rtype: Python generator yielding :class:`~tamr_unify_client.models.dataset.resource.Dataset` - - Usage: - >>> for dataset in collection.stream(): # explicit - >>> do_stuff(dataset) - >>> for dataset in collection: # implicit - >>> do_stuff(dataset) - """ - return super().stream(Dataset) - - def by_name(self, dataset_name): - """Lookup a specific dataset in this collection by exact-match on name. - - :param dataset_name: Name of the desired dataset. - :type dataset_name: str - :return: Dataset with matching name in this collection. - :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - :raises KeyError: If no dataset with specified name was found. 
- """ - for dataset in self: - if dataset.name == dataset_name: - return dataset - raise KeyError(f"No dataset found with name: {dataset_name}") - - # super.__repr__ is sufficient diff --git a/tamr_unify_client/models/dataset/resource.py b/tamr_unify_client/models/dataset/resource.py deleted file mode 100644 index 8a385bdf..00000000 --- a/tamr_unify_client/models/dataset/resource.py +++ /dev/null @@ -1,338 +0,0 @@ -import json - -from tamr_unify_client.models.attribute.collection import AttributeCollection -from tamr_unify_client.models.base_resource import BaseResource -from tamr_unify_client.models.dataset_profile import DatasetProfile -from tamr_unify_client.models.dataset_status import DatasetStatus -from tamr_unify_client.models.operation import Operation - - -class Dataset(BaseResource): - """A Unify dataset.""" - - @classmethod - def from_json(cls, client, resource_json, api_path=None): - return super().from_data(client, resource_json, api_path) - - @property - def name(self): - """:type: str""" - return self._data.get("name") - - @property - def external_id(self): - """:type: str""" - return self._data.get("externalId") - - @property - def description(self): - """:type: str""" - return self._data.get("description") - - @property - def version(self): - """:type: str""" - return self._data.get("version") - - @property - def tags(self): - """:type: list[str]""" - return self._data.get("tags") - - @property - def key_attribute_names(self): - """:type: list[str]""" - return self._data.get("keyAttributeNames") - - @property - def attributes(self): - """Attributes of this dataset. - - :return: Attributes of this dataset. 
- :rtype: :class:`~tamr_unify_client.models.attribute.collection.AttributeCollection` - """ - alias = self.api_path + "/attributes" - resource_json = self.client.get(alias).successful().json() - return AttributeCollection.from_json(self.client, resource_json, alias) - - def create_attribute(self, attribute_creation_spec): - """Create an Attribute in Unify - - :param attribute_creation_spec: the name and type (and optional description) of the attribute to create, formatted as described in the `Public Docs for Adding an Attribute `_. - :type attribute_creation_spec: dict[str, object] - :return: the created Attribute - """ - from tamr_unify_client.models.attribute.resource import Attribute - - data = ( - self.client.post(self.attributes.api_path, json=attribute_creation_spec) - .successful() - .json() - ) - alias = self.attributes.api_path + "/" + attribute_creation_spec["name"] - return Attribute(self.client, data, alias) - - def update_records(self, records): - """Send a batch of record creations/updates/deletions to this dataset. - - :param records: Each record should be formatted as specified in the `Public Docs for Dataset updates `_. - :type records: iterable[dict] - :returns: JSON response body from server. - :rtype: :py:class:`dict` - """ - - def _stringify_updates(updates): - for update in updates: - yield json.dumps(update).encode("utf-8") - - return ( - self.client.post( - self.api_path + ":updateRecords", - headers={"Content-Encoding": "utf-8"}, - data=_stringify_updates(records), - ) - .successful() - .json() - ) - - def refresh(self, **options): - """Brings dataset up-to-date if needed, taking whatever actions are required. - :param ``**options``: Options passed to underlying :class:`~tamr_unify_client.models.operation.Operation` . - See :func:`~tamr_unify_client.models.operation.Operation.apply_options` . 
- """ - op_json = self.client.post(self.api_path + ":refresh").successful().json() - op = Operation.from_json(self.client, op_json) - return op.apply_options(**options) - - def profile(self, **options): - """Returns up to date profile information for a dataset, re-profiling if not up to date. - - :param ``**options``: Options passed to underlying :class:`~tamr_unify_client.models.operation.Operation` . - :return: Dataset Profile information. - :rtype: :class:`~tamr_unify_client.models.dataset_status.DatasetProfile` - """ - - profile_json = self.client.get(self.api_path + "/profile").successful().json() - info = DatasetProfile.from_json( - self.client, profile_json, api_path=self.api_path + "/profile" - ) - if info.is_up_to_date: - return info - else: - op_json = ( - self.client.post(self.api_path + "/profile:refresh").successful().json() - ) - op = Operation.from_json(self.client, op_json) - op.apply_options(**options) - return self.profile() - - def records(self): - """Stream this dataset's records as Python dictionaries. - - :return: Stream of records. - :rtype: Python generator yielding :py:class:`dict` - """ - with self.client.get(self.api_path + "/records", stream=True) as response: - for line in response.iter_lines(): - yield json.loads(line) - - def status(self) -> DatasetStatus: - """Retrieve this dataset's streamability status. - - :return: Dataset streamability status. - :rtype: :class:`~tamr_unify_client.models.dataset_status.DatasetStatus` - """ - status_json = self.client.get(self.api_path + "/status").successful().json() - return DatasetStatus.from_json( - self.client, status_json, api_path=self.api_path + "/status" - ) - - def from_geo_features(self, features): - """Upsert this dataset from a geospatial FeatureCollection or iterable of Features. 
- - `features` can be: - - - An object that implements ``__geo_interface__`` as a FeatureCollection - (see https://gist.github.com/sgillies/2217756) - - An iterable of features, where each element is a feature dictionary or an object - that implements the ``__geo_interface__`` as a Feature - - A map where the "features" key contains an iterable of features - - See: geopandas.GeoDataFrame.from_features() - - :param features: geospatial features - """ - if hasattr(features, "__geo_interface__"): - features = features.__geo_interface__ - if hasattr(features, "get") and features.get("type") == "FeatureCollection": - features = features["features"] - - key_attrs = self.key_attribute_names - if len(key_attrs) == 1: - record_id = "recordId" - else: - record_id = "compositeRecordId" - - self.update_records( - self._features_to_updates(features, record_id, key_attrs, self._geo_attr) - ) - - @property - def __geo_interface__(self): - """Retrieve a representation of this dataset that conforms to the Python Geo Interface. 
- - Note that this materializes all features; for a streaming interface to features, - see :method:`~tamr_unify_client.models.dataset.Dataset.__geo_features__()` - - See https://gist.github.com/sgillies/2217756 - - :return: dict[str, object] - """ - return { - "type": "FeatureCollection", - "features": [feature for feature in self.itergeofeatures()], - } - - def itergeofeatures(self): - """Returns an iterator that yields feature dictionaries that comply with __geo_interface__ - - See https://gist.github.com/sgillies/2217756 - - :return: stream of features - :rtype: Python generator yielding :py:class:`dict[str, object]` - """ - key_attrs = self.key_attribute_names - if len(key_attrs) == 1: - - def key_value(rec): - return rec[key_attrs[0]] - - else: - - def key_value(rec): - return [rec[attr] for attr in key_attrs] - - for record in self.records(): - yield self._record_to_feature(record, key_value, key_attrs, self._geo_attr) - - @property - def _geo_attr(self): - """The name of the attribute that contains geometry - - :return: the name of the attribute that contains geometry - :rtype: str - """ - # Duck-typing: find all the attributes that look like geometry - geo_attrs = [ - attr.name - for attr in self.attributes - if "RECORD" == attr.type.base_type - and self._geo_attr_names().intersection( - {sub_attr.name for sub_attr in attr.type.attributes} - ) - ] - # We select the first such attribute as the geometry - if geo_attrs: - geo_attr = geo_attrs[0] - else: - geo_attr = None - return geo_attr - - @staticmethod - def _record_to_feature(record, key_value, key_attrs, geo_attr): - """Convert a Unify record to a Python Geo Interface Feature - - :param record: Unify record - :param key_value: Function to extract the value of the primary key from the record - :param key_attrs: Set of attributes that comprise the primary key for the record - :param geo_attr: The singular attribute to use as the geometry - :return: map from str to object - """ - feature = {"type": 
"Feature", "id": key_value(record)} - reserved = {"bbox", geo_attr}.union(key_attrs) - if geo_attr and geo_attr in record: - src_geo = record[geo_attr] - for unify_attr in Dataset._geo_attr_names(): - if unify_attr in src_geo and src_geo[unify_attr]: - feature["geometry"] = { - # Convert e.g. multiLineString -> MultiLineString - "type": unify_attr[0].upper() + unify_attr[1:], - "coordinates": src_geo[unify_attr], - } - break - if "bbox" in record: - feature["bbox"] = record["bbox"] - non_reserved = set(record.keys()).difference(reserved) - if non_reserved: - feature["properties"] = {attr: record[attr] for attr in non_reserved} - return feature - - @staticmethod - def _feature_to_record(feature, key_attrs, geo_attr): - """Convert a Python Geo Interface Feature to a Unify record - - feature can be a dict representing a Geospatial Feature, or a Feature object - that implements the __geo_interface__ property. - - :param feature: Python Geo Interface Feature - :param key_attrs: Sequence of attributes that comprise the primary key for the record - :param geo_attr: The singluar attribute on the record to use for the geometry - :return: dict - """ - if hasattr(feature, "__geo_interface__"): - feature = feature.__geo_interface__ - - record = {} - - props = feature.get("properties") - if props: - for prop in props: - record[prop] = props[prop] - - geometry = feature.get("geometry") - if geometry: - geo_type = geometry["type"] - # Convert e.g. 
"MultiLineString" -> "multiLineString" - geo_type = geo_type[0].lower() + geo_type[1:] - record[geo_attr] = {geo_type: geometry["coordinates"]} - - bbox = feature.get("bbox") - if bbox: - record["bbox"] = bbox - - if key_attrs[1:]: - key_values = feature["id"] - for i, attr in enumerate(key_attrs): - record[attr] = key_values[i] - else: - record[key_attrs[0]] = feature["id"] - return record - - @staticmethod - def _features_to_updates(features, id_attr, key_attrs, geo_attr): - for feature in features: - yield { - "action": "CREATE", - id_attr: feature["id"], - "record": Dataset._feature_to_record(feature, key_attrs, geo_attr), - } - - def __repr__(self): - return ( - f"{self.__class__.__module__}." - f"{self.__class__.__qualname__}(" - f"relative_id={self.relative_id!r}, " - f"name={self.name!r}, " - f"version={self.version!r})" - ) - - @staticmethod - def _geo_attr_names(): - return { - "point", - "multiPoint", - "lineString", - "multiLineString", - "polygon", - "multiPolygon", - } diff --git a/tamr_unify_client/models/project/__init__.py b/tamr_unify_client/models/project/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tamr_unify_client/models/project/categorization.py b/tamr_unify_client/models/project/categorization.py deleted file mode 100644 index 469bddd6..00000000 --- a/tamr_unify_client/models/project/categorization.py +++ /dev/null @@ -1,18 +0,0 @@ -from tamr_unify_client.models.machine_learning_model import MachineLearningModel -from tamr_unify_client.models.project.resource import Project - - -class CategorizationProject(Project): - """A Categorization project in Unify.""" - - def model(self): - """Machine learning model for this Categorization project. - Learns from verified labels and predicts categorization labels for unlabeled records. - - :returns: The machine learning model for categorization. 
- :rtype: :class:`~tamr_unify_client.models.machine_learning_model.MachineLearningModel` - """ - alias = self.api_path + "/categorizations/model" - return MachineLearningModel(self.client, None, alias) - - # super.__repr__ is sufficient diff --git a/tamr_unify_client/models/project/mastering.py b/tamr_unify_client/models/project/mastering.py deleted file mode 100644 index ee5c9a68..00000000 --- a/tamr_unify_client/models/project/mastering.py +++ /dev/null @@ -1,91 +0,0 @@ -from tamr_unify_client.models.dataset.resource import Dataset -from tamr_unify_client.models.machine_learning_model import MachineLearningModel -from tamr_unify_client.models.project.estimated_pair_counts import EstimatedPairCounts -from tamr_unify_client.models.project.resource import Project - - -class MasteringProject(Project): - """A Mastering project in Unify.""" - - def pairs(self): - """Record pairs generated by Unify's binning model. - Pairs are displayed on the "Pairs" page in the Unify UI. - - Call :func:`~tamr_unify_client.models.dataset.resource.Dataset.refresh` from - this dataset to regenerate pairs according to the latest binning model. - - :returns: The record pairs represented as a dataset. - :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - """ - alias = self.api_path + "/recordPairs" - return Dataset(self.client, None, alias) - - def pair_matching_model(self): - """Machine learning model for pair-matching for this Mastering project. - Learns from verified labels and predicts categorization labels for unlabeled pairs. - - Calling :func:`~tamr_unify_client.models.machine_learning_model.MachineLearningModel.predict` - from this dataset will produce new (unpublished) clusters. These clusters - are displayed on the "Clusters" page in the Unify UI. - - :returns: The machine learning model for pair-matching. 
- :rtype: :class:`~tamr_unify_client.models.machine_learning_model.MachineLearningModel` - """ - alias = self.api_path + "/recordPairsWithPredictions/model" - return MachineLearningModel(self.client, None, alias) - - def high_impact_pairs(self): - """High-impact pairs as a dataset. Unify labels pairs as "high-impact" if - labeling these pairs would help it learn most quickly (i.e. "Active learning"). - - High-impact pairs are displayed with a ⚡ lightning bolt icon on the - "Pairs" page in the Unify UI. - - Call :func:`~tamr_unify_client.models.dataset.resource.Dataset.refresh` from - this dataset to produce new high-impact pairs according to the latest - pair-matching model. - - :returns: The high-impact pairs represented as a dataset. - :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - """ - alias = self.api_path + "/highImpactPairs" - return Dataset(self.client, None, alias) - - def record_clusters(self): - """Record Clusters as a dataset. Unify clusters labeled pairs using pairs - model. These clusters populate the cluster review page and get transient - cluster ids, rather than published cluster ids (i.e., "Permanent Ids") - - Call :func:`~tamr_unify_client.models.dataset.resource.Dataset.refresh` from - this dataset to generate clusters based on to the latest pair-matching model. - - :returns: The record clusters represented as a dataset. - :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - """ - alias = self.api_path + "/recordClusters" - return Dataset(self.client, None, alias) - - def published_clusters(self): - """Published record clusters generated by Unify's pair-matching model. - - Call :func:`~tamr_unify_client.models.dataset.resource.Dataset.refresh` from - this dataset to republish clusters according to the latest clustering. - - :returns: The published clusters represented as a dataset. 
- :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - """ - alias = self.api_path + "/publishedClusters" - return Dataset(self.client, None, alias) - - def estimate_pairs(self): - """Returns pair estimate information for a mastering project - - :return: Pairs Estimate information. - :rtype: :class:`~tamr_unify_client.models.project.estimated_pair_counts` - """ - alias = self.api_path + "/estimatedPairCounts" - estimate_json = self.client.get(alias).successful().json() - info = EstimatedPairCounts.from_json(self.client, estimate_json, api_path=alias) - return info - - # super.__repr__ is sufficient diff --git a/tamr_unify_client/models/project/resource.py b/tamr_unify_client/models/project/resource.py deleted file mode 100644 index d8152858..00000000 --- a/tamr_unify_client/models/project/resource.py +++ /dev/null @@ -1,107 +0,0 @@ -from tamr_unify_client.models.base_resource import BaseResource -from tamr_unify_client.models.dataset.resource import Dataset - - -class Project(BaseResource): - """A Unify project.""" - - @classmethod - def from_json(cls, client, resource_json, api_path=None): - return super().from_data(client, resource_json, api_path) - - @property - def name(self): - """:type: str""" - return self._data.get("name") - - @property - def external_id(self): - """:type: str""" - return self._data.get("externalId") - - @property - def description(self): - """:type: str""" - return self._data.get("description") - - @property - def type(self): - """One of: - ``"SCHEMA_MAPPING"`` - ``"SCHEMA_MAPPING_RECOMMENDATIONS"`` - ``"CATEGORIZATION"`` - ``"DEDUP"`` - - :type: str - """ - return self._data.get("type") - - def unified_dataset(self): - """Unified dataset for this project. - - :return: Unified dataset for this project. 
- :rtype: :class:`~tamr_unify_client.models.dataset.resource.Dataset` - """ - alias = self.api_path + "/unifiedDataset" - resource_json = self.client.get(alias).successful().json() - return Dataset.from_json(self.client, resource_json, alias) - - def as_categorization(self): - """Convert this project to a :class:`~tamr_unify_client.models.project.categorization.CategorizationProject` - - :return: This project. - :rtype: :class:`~tamr_unify_client.models.project.categorization.CategorizationProject` - :raises TypeError: If the :attr:`~tamr_unify_client.models.project.resource.Project.type` of this project is not ``"CATEGORIZATION"`` - """ - from tamr_unify_client.models.project.categorization import ( - CategorizationProject, - ) - - if self.type != "CATEGORIZATION": - raise TypeError( - f"Cannot convert project to categorization project. Project type: {self.type}" - ) - return CategorizationProject(self.client, self._data, self.api_path) - - def as_mastering(self): - """Convert this project to a :class:`~tamr_unify_client.models.project.mastering.MasteringProject` - - :return: This project. - :rtype: :class:`~tamr_unify_client.models.project.mastering.MasteringProject` - :raises TypeError: If the :attr:`~tamr_unify_client.models.project.resource.Project.type` of this project is not ``"DEDUP"`` - """ - from tamr_unify_client.models.project.mastering import MasteringProject - - if self.type != "DEDUP": - raise TypeError( - f"Cannot convert project to mastering project. Project type: {self.type}" - ) - return MasteringProject(self.client, self._data, self.api_path) - - def add_source_dataset(self, dataset): - """ - Associate a dataset with a project in Unify. - - By default, datasets are not associated with any projects. 
- They need to be added as input to a project before they can be used - as part of that project - - :param project: Unify Project - :param dataset: Unify Dataset - :return: HTTP response from the server - :rtype: :class:`requests.Response` - """ - dataset_id = dataset.relative_id.split("/")[-1] - response = self.client.post( - self.api_path + "/inputDatasets" + f"?id={dataset_id}" - ).successful() - return response - - def __repr__(self): - return ( - f"{self.__class__.__module__}." - f"{self.__class__.__qualname__}(" - f"relative_id={self.relative_id!r}, " - f"name={self.name!r}, " - f"type={self.type!r})" - ) diff --git a/tamr_unify_client/models/operation.py b/tamr_unify_client/operation.py similarity index 56% rename from tamr_unify_client/models/operation.py rename to tamr_unify_client/operation.py index ec4be801..9f730b14 100644 --- a/tamr_unify_client/models/operation.py +++ b/tamr_unify_client/operation.py @@ -1,11 +1,11 @@ from time import sleep, time as now -from tamr_unify_client.models.base_resource import BaseResource +from tamr_unify_client.base_resource import BaseResource class Operation(BaseResource): - """A long-running operation performed by Unify. - Operations appear on the "Jobs" page of the Unify UI. + """A long-running operation performed by Tamr. + Operations appear on the "Jobs" page of the Tamr UI. By design, client-side operations represent server-side operations *at a particular point in time* (namely, when the operation was fetched from the @@ -18,11 +18,68 @@ class Operation(BaseResource): def from_json(cls, client, resource_json, api_path=None): return super().from_data(client, resource_json, api_path) + @classmethod + def from_resource_id(cls, client, resource_id): + """Get an operation by resource ID. + + :param client: Delegate underlying API calls to this client. 
+ :type client: :class:`~tamr_unify_client.Client` + :param resource_id: The ID of the operation + :type resource_id: str + :returns: The specified operation + :rtype: :class:`~tamr_unify_client.operation.Operation` + """ + url = f"operations/{resource_id}" + response = client.get(url).successful() + return Operation.from_response(client, response) + + @classmethod + def from_response(cls, client, response): + """ + Handle idiosyncrasies in constructing Operations from Tamr responses. + When a Tamr API call would start an operation, but all results that would be + produced by that operation are already up-to-date, Tamr returns `HTTP 204 No Content` + + To make it easy for client code to handle these API responses without checking + the response code, this method will either construct an Operation, or a + dummy `NoOp` operation representing the 204 Success response. + + :param client: Delegate underlying API calls to this client. + :type client: :class:`~tamr_unify_client.Client` + :param response: HTTP Response from the request that started the operation. + :type response: :class:`requests.Response` + :return: Operation + :rtype: :class:`~tamr_unify_client.operation.Operation` + """ + if response.status_code == 204: + # Operation was successful, but the response contains no content. + # Create a dummy operation to represent this. 
+ _never = "0000-00-00T00:00:00.000Z" + _description = """Tamr returned HTTP 204 for this operation, indicating that all + results that would be produced by the operation are already up-to-date.""" + resource_json = { + "id": "-1", + "type": "NOOP", + "description": _description, + "status": { + "state": "SUCCEEDED", + "startTime": _never, + "endTime": _never, + "message": "", + }, + "created": {"username": "", "time": _never, "version": "-1"}, + "lastModified": {"username": "", "time": _never, "version": "-1"}, + "relativeId": "operations/-1", + } + else: + resource_json = response.json() + return Operation.from_json(client, resource_json) + def apply_options(self, asynchronous=False, **options): """Applies operation options to this operation. **NOTE**: This function **should not** be called directly. Rather, options should be - passed in through a higher-level function e.g. :func:`~tamr_unify_client.models.dataset.resource.Dataset.refresh` . + passed in through a higher-level function e.g. :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` . Synchronous mode: Automatically waits for operation to resolve before returning the @@ -31,15 +88,15 @@ def apply_options(self, asynchronous=False, **options): asynchronous mode: Immediately return the ``'PENDING'`` operation. It is up to the user to coordinate this operation with their code via - :func:`~tamr_unify_client.models.operation.Operation.wait` and/or - :func:`~tamr_unify_client.models.operation.Operation.poll` . + :func:`~tamr_unify_client.operation.Operation.wait` and/or + :func:`~tamr_unify_client.operation.Operation.poll` . :param asynchronous: Whether or not to run in asynchronous mode. Default: ``False``. :type asynchronous: bool :param ``**options``: When running in synchronous mode, these options are - passed to the underlying :func:`~tamr_unify_client.models.operation.Operation.wait` call. + passed to the underlying :func:`~tamr_unify_client.operation.Operation.wait` call. 
:return: Operation with options applied. - :rtype: :class:`~tamr_unify_client.models.operation.Operation` + :rtype: :class:`~tamr_unify_client.operation.Operation` """ if asynchronous: return self @@ -84,11 +141,11 @@ def state(self): def poll(self): """Poll this operation for server-side updates. - Does not update the calling :class:`~tamr_unify_client.models.Operation` object. - Instead, returns a new :class:`~tamr_unify_client.models.Operation`. + Does not update the calling :class:`~tamr_unify_client.operation.Operation` object. + Instead, returns a new :class:`~tamr_unify_client.operation.Operation`. :return: Updated representation of this operation. - :rtype: :class:`~tamr_unify_client.models.Operation` + :rtype: :class:`~tamr_unify_client.operation.Operation` """ op_json = self.client.get(self.api_path).successful().json() return Operation.from_json(self.client, op_json) @@ -100,7 +157,7 @@ def wait(self, poll_interval_seconds=3, timeout_seconds=None): :param int timeout_seconds: Time (in seconds) to wait for operation to resolve. :raises TimeoutError: If operation takes longer than `timeout_seconds` to resolve. :return: Resolved operation. - :rtype: :class:`~tamr_unify_client.models.Operation` + :rtype: :class:`~tamr_unify_client.operation.Operation` """ started = now() op = self diff --git a/tamr_unify_client/project/attribute_configuration/collection.py b/tamr_unify_client/project/attribute_configuration/collection.py new file mode 100644 index 00000000..7e28163f --- /dev/null +++ b/tamr_unify_client/project/attribute_configuration/collection.py @@ -0,0 +1,79 @@ +from tamr_unify_client.base_collection import BaseCollection +from tamr_unify_client.project.attribute_configuration.resource import ( + AttributeConfiguration, +) + + +class AttributeConfigurationCollection(BaseCollection): + """Collection of :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfiguration` + + :param client: Client for API call delegation. 
+ :type client: :class:`~tamr_unify_client.Client` + :param api_path: API path used to access this collection. + E.g. ``"projects/1/attributeConfigurations"`` + :type api_path: str + """ + + def by_resource_id(self, resource_id): + """Retrieve an attribute configuration by resource ID. + + :param resource_id: The resource ID. + :type resource_id: str + :returns: The specified attribute configuration. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfiguration` + """ + return super().by_resource_id(self.api_path, resource_id) + + def by_relative_id(self, relative_id): + """Retrieve an attribute configuration by relative ID. + + :param relative_id: The relative ID. + :type relative_id: str + :returns: The specified attribute configuration. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfiguration` + """ + return super().by_relative_id(AttributeConfiguration, relative_id) + + def by_external_id(self, external_id): + """Retrieve an attribute configuration by external ID. + + Since attributes do not have external IDs, this method is not supported and will + raise a :class:`NotImplementedError` . + + :param external_id: The external ID. + :type external_id: str + :returns: The specified attribute, if found. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfiguration` + :raises KeyError: If no attribute with the specified external_id is found + :raises LookupError: If multiple attributes with the specified external_id are found + :raises NotImplementedError: AttributeConfiguration does not support external_id + """ + raise NotImplementedError("AttributeConfiguration does not support external_id") + + def stream(self): + """Stream attribute configurations in this collection. Implicitly called when iterating + over this collection. + + :returns: Stream of attribute configurations. 
+ :rtype: Python generator yielding :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfiguration` + + Usage: + >>> for attributeConfiguration in collection.stream(): # explicit + >>> do_stuff(attributeConfiguration) + >>> for attributeConfiguration in collection: # implicit + >>> do_stuff(attributeConfiguration) + """ + + return super().stream(AttributeConfiguration) + + def create(self, creation_spec): + """Create an Attribute configuration in this collection + + :param creation_spec: Attribute configuration creation specification should be formatted as specified in the + `Public Docs for adding an AttributeConfiguration `_. + :type creation_spec: dict[str, str] + :returns: The created Attribute configuration + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfiguration` + """ + data = self.client.post(self.api_path, json=creation_spec).successful().json() + return AttributeConfiguration.from_json(self.client, data) diff --git a/tamr_unify_client/project/attribute_configuration/resource.py b/tamr_unify_client/project/attribute_configuration/resource.py new file mode 100644 index 00000000..5db5f430 --- /dev/null +++ b/tamr_unify_client/project/attribute_configuration/resource.py @@ -0,0 +1,213 @@ +from copy import deepcopy + +from tamr_unify_client.base_resource import BaseResource + + +class AttributeConfiguration(BaseResource): + """The configurations of Tamr Attributes. 
+ + See https://docs.tamr.com/reference#the-attribute-configuration-object + """ + + @classmethod + def from_json( + cls, client, resource_json, api_path=None + ) -> "AttributeConfiguration": + return super().from_data(client, resource_json, api_path) + + @property + def relative_id(self): + """:type: str""" + return self._data.get("relativeId") + + @property + def id(self): + """:type: str""" + return self._data.get("id") + + @property + def relative_attribute_id(self): + """:type: str""" + return self._data.get("relativeAttributeId") + + @property + def attribute_role(self): + """:type: str""" + return self._data.get("attributeRole") + + @property + def similarity_function(self): + """:type: str""" + return self._data.get("similarityFunction") + + @property + def enabled_for_ml(self): + """:type: bool""" + return self._data.get("enabledForMl") + + @property + def tokenizer(self): + """:type: str""" + return self._data.get("tokenizer") + + @property + def numeric_field_resolution(self): + """:type: list """ + return self._data.get("numericFieldResolution") + + @property + def attribute_name(self): + """:type: str""" + return self._data.get("attributeName") + + def spec(self): + """Returns this attribute configuration's spec. + + :return: The spec of this attribute configuration. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return AttributeConfigurationSpec.of(self) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." 
+ f"{self.__class__.__qualname__}(" + f"relative_id={self.relative_id!r}, " + f"id={self.id!r}, " + f"relative_attribute_id={self.relative_attribute_id!r}, " + f"attribute_role={self.attribute_role!r}, " + f"similarity_function={self.similarity_function!r}, " + f"enabled_for_ml={self.enabled_for_ml!r}, " + f"tokenizer={self.tokenizer!r}, " + f"numeric_field_resolution={self.numeric_field_resolution!r}, " + f"attribute_name={self.attribute_name!r})" + ) + + +class AttributeConfigurationSpec: + """A representation of the server view of an attribute configuration.""" + + def __init__(self, client, data, api_path): + self.client = client + self._data = data + self.api_path = api_path + + @staticmethod + def of(resource): + """Creates an attribute configuration spec from an attribute configuration. + + :param resource: The existing attribute configuration. + :type resource: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfiguration` + :return: The corresponding attribute configuration spec. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return AttributeConfigurationSpec( + resource.client, deepcopy(resource._data), resource.api_path + ) + + @staticmethod + def new(): + """Creates a blank spec that could be used to construct a new attribute configuration. + + :return: The empty spec. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return AttributeConfigurationSpec(None, {}, None) + + def from_data(self, data): + """Creates a spec with the same client and API path as this one, but new data. + + :param data: The data for the new spec. + :type data: dict + :return: The new spec. 
+ :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return AttributeConfigurationSpec(self.client, data, self.api_path) + + def to_dict(self): + """Returns a version of this spec that conforms to the API representation. + + :returns: The spec's dict. + :rtype: dict + """ + return deepcopy(self._data) + + def with_attribute_role(self, new_attribute_role): + """Creates a new spec with the same properties, updating attribute role. + + :param new_attribute_role: The new attribute role. + :type new_attribute_role: str + :return: A new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return self.from_data({**self._data, "attributeRole": new_attribute_role}) + + def with_similarity_function(self, new_similarity_function): + """Creates a new spec with the same properties, updating similarity function. + + :param new_similarity_function: The new similarity function. + :type new_similarity_function: str + :return: A new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return self.from_data( + {**self._data, "similarityFunction": new_similarity_function} + ) + + def with_enabled_for_ml(self, new_enabled_for_ml): + """Creates a new spec with the same properties, updating enabled for ML. + + :param new_enabled_for_ml: Whether the builder is enabled for ML. + :type new_enabled_for_ml: bool + :return: A new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return self.from_data({**self._data, "enabledForMl": new_enabled_for_ml}) + + def with_tokenizer(self, new_tokenizer): + """Creates a new spec with the same properties, updating tokenizer. + + :param new_tokenizer: The new tokenizer. + :type new_tokenizer: str + :return: A new spec. 
+ :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return self.from_data({**self._data, "tokenizer": new_tokenizer}) + + def with_numeric_field_resolution(self, new_numeric_field_resolution): + """Creates a new spec with the same properties, updating numeric field resolution. + + :param new_numeric_field_resolution: The new numeric field resolution. + :type new_numeric_field_resolution: list + :return: A new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return self.from_data( + {**self._data, "numericFieldResolution": new_numeric_field_resolution} + ) + + def with_attribute_name(self, new_attribute_name): + """Creates a new spec with the same properties, updating attribute name. + + :param new_attribute_name: The new attribute name. + :type new_attribute_name: str + :return: A new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfigurationSpec` + """ + return self.from_data({**self._data, "attributeName": new_attribute_name}) + + def put(self): + """Updates the attribute configuration on the server. + + :return: The modified attribute configuration. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.resource.AttributeConfiguration` + """ + new_data = self.client.put(self.api_path, json=self._data).successful().json() + return AttributeConfiguration.from_json(self.client, new_data, self.api_path) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." 
+ f"{self.__class__.__qualname__}(" + f"dict={self._data})" + ) diff --git a/tamr_unify_client/project/attribute_mapping/collection.py b/tamr_unify_client/project/attribute_mapping/collection.py new file mode 100644 index 00000000..1aea4c1c --- /dev/null +++ b/tamr_unify_client/project/attribute_mapping/collection.py @@ -0,0 +1,76 @@ +from tamr_unify_client.project.attribute_mapping.resource import AttributeMapping + + +class AttributeMappingCollection: + """Collection of :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMapping` + + :param client: Client for API call delegation. + :type client: :class:`~tamr_unify_client.Client` + :param api_path: API path used to access this collection. + :type api_path: str + """ + + def __init__(self, client, api_path): + self.client = client + self.api_path = api_path + + def stream(self): + """Stream attribute mappings in this collection. Implicitly called when iterating + over this collection. + + :returns: Stream of attribute mappings. + :rtype: Python generator yielding :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMapping` + """ + all_maps = self.client.get(self.api_path).successful().json() + for mapping in all_maps: + yield AttributeMapping(self.client, mapping) + + def by_resource_id(self, resource_id): + """Retrieve an item in this collection by resource ID. + + :param resource_id: The resource ID. + :type resource_id: str + :returns: The specified attribute mapping. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMapping` + """ + maps = self.stream() + for mapping in maps: + split_id = mapping.resource_id + if resource_id == split_id: + return mapping + raise LookupError("cannot locate mapping from resource ID") + + def by_relative_id(self, relative_id): + """Retrieve an item in this collection by relative ID. + + :param relative_id: The relative ID. + :type relative_id: str + :returns: The specified attribute mapping. 
+ :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMapping` + """ + resource_id = relative_id.split("attributeMappings/")[1] + return self.by_resource_id(resource_id) + + def create(self, creation_spec): + """Create an Attribute mapping in this collection + + :param creation_spec: Attribute mapping creation specification should be formatted as specified in the + `Public Docs for adding an AttributeMapping `_. + :type creation_spec: dict[str, str] + :returns: The created Attribute mapping + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMapping` + """ + data = self.client.post(self.api_path, json=creation_spec).successful().json() + return AttributeMapping(self.client, data) + + def delete_by_resource_id(self, resource_id): + """Delete an attribute mapping using its Resource ID. + + :param resource_id: the resource ID of the mapping to be deleted. + :type resource_id: str + :returns: HTTP response from the server + :rtype: :class:`requests.Response` + """ + path = self.api_path + "/" + resource_id + response = self.client.delete(path).successful() + return response diff --git a/tamr_unify_client/project/attribute_mapping/resource.py b/tamr_unify_client/project/attribute_mapping/resource.py new file mode 100644 index 00000000..d7c5b12d --- /dev/null +++ b/tamr_unify_client/project/attribute_mapping/resource.py @@ -0,0 +1,247 @@ +from copy import deepcopy + + +class AttributeMapping: + """see https://docs.tamr.com/reference#retrieve-projects-mappings + AttributeMapping and AttributeMappingCollection do not inherit from BaseResource and BaseCollection. 
+ BC and BR require a specific URL for each individual attribute mapping + (ex: /projects/1/attributeMappings/1), but these types of URLs do not exist for attribute mappings + """ + + def __init__(self, client, data): + self._data = data + self.client = client + # AttributeMapping cannot be aliased, and Project cannot be aliased, + # so AttributeMapping only ever has one address, which is both + # its relative_id and its api_path. + self.api_path = self.relative_id + + @property + def id(self): + """:type: str""" + return self._data["id"] + + @property + def relative_id(self): + """:type: str""" + return self._data["relativeId"] + + @property + def input_attribute_id(self): + """:type: str""" + return self._data["inputAttributeId"] + + @property + def relative_input_attribute_id(self): + """:type: str""" + return self._data["relativeInputAttributeId"] + + @property + def input_dataset_name(self): + """:type: str""" + return self._data["inputDatasetName"] + + @property + def input_attribute_name(self): + """:type: str""" + return self._data["inputAttributeName"] + + @property + def unified_attribute_id(self): + """:type: str""" + return self._data["unifiedAttributeId"] + + @property + def relative_unified_attribute_id(self): + """:type: str""" + return self._data["relativeUnifiedAttributeId"] + + @property + def unified_dataset_name(self): + """:type: str""" + return self._data["unifiedDatasetName"] + + @property + def unified_attribute_name(self): + """:type: str""" + return self._data["unifiedAttributeName"] + + @property + def resource_id(self): + """:type: str""" + spliced = self.relative_id.split("attributeMappings/")[1] + return spliced + + def spec(self): + """Returns a spec representation of this attribute mapping. + + :return: The attribute mapping spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec.of(self) + + def delete(self): + """Delete this attribute mapping. 
+ + :return: HTTP response from the server + :rtype: :class:`requests.Response` + """ + response = self.client.delete(self.api_path).successful() + return response + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"id={self.id!r}, " + f"relative_id={self.relative_id!r}, " + f"input_attribute_id={self.input_attribute_id!r}, " + f"relative_input_attribute_id={self.relative_input_attribute_id!r}, " + f"input_dataset_name={self.input_dataset_name!r}, " + f"input_attribute_name={self.input_attribute_name!r}, " + f"unified_attribute_id={self.unified_attribute_id!r}, " + f"relative_unified_attribute_id={self.relative_unified_attribute_id!r}, " + f"unified_dataset_name={self.unified_dataset_name!r}, " + f"unified_attribute_name={self.unified_attribute_name!r})" + ) + + +class AttributeMappingSpec: + """A representation of the server view of an attribute mapping""" + + def __init__(self, data): + self._data = data + + @staticmethod + def of(resource): + """Creates an attribute mapping spec from a attribute mapping. + + :param resource: The existing attribute mapping. + :type resource: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMapping` + :return: The corresponding attribute mapping spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec(deepcopy(resource._data)) + + @staticmethod + def new(): + """Creates a blank spec that could be used to construct a new attribute mapping. + + :return: The empty spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec({}) + + def to_dict(self): + """Returns a version of this spec that conforms to the API representation. + + :returns: The spec's dict. 
+ :rtype: dict + """ + return deepcopy(self._data) + + def with_input_attribute_id(self, new_input_attribute_id): + """Creates a new spec with the same properties, updating the input attribute id. + + :param new_input_attribute_id: The new input attribute id. + :type new_input_attribute_id: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec( + {**self._data, "inputAttributeId": new_input_attribute_id} + ) + + def with_relative_input_attribute_id(self, new_relative_input_attribute_id): + """Creates a new spec with the same properties, updating the relative input attribute id. + + :param new_relative_input_attribute_id: The new relative input attribute Id. + :type new_relative_input_attribute_id: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec( + {**self._data, "relativeInputAttributeId": new_relative_input_attribute_id} + ) + + def with_input_dataset_name(self, new_input_dataset_name): + """Creates a new spec with the same properties, updating the input dataset name. + + :param new_input_dataset_name: The new input dataset name. + :type new_input_dataset_name: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec( + {**self._data, "inputDatasetName": new_input_dataset_name} + ) + + def with_input_attribute_name(self, new_input_attribute_name): + """Creates a new spec with the same properties, updating the input attribute name. + + :param new_input_attribute_name: The new input attribute name. + :type new_input_attribute_name: str + :return: The new spec. 
+ :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec( + {**self._data, "inputAttributeName": new_input_attribute_name} + ) + + def with_unified_attribute_id(self, new_unified_attribute_id): + """Creates a new spec with the same properties, updating the unified attribute id. + + :param new_unified_attribute_id: The new unified attribute id. + :type new_unified_attribute_id: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec( + {**self._data, "unifiedAttributeId": new_unified_attribute_id} + ) + + def with_relative_unified_attribute_id(self, new_relative_unified_attribute_id): + """Creates a new spec with the same properties, updating the relative unified attribute id. + + :param new_relative_unified_attribute_id: The new relative unified attribute id. + :type new_relative_unified_attribute_id: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec( + { + **self._data, + "relativeUnifiedAttributeId": new_relative_unified_attribute_id, + } + ) + + def with_unified_dataset_name(self, new_unified_dataset_name): + """Creates a new spec with the same properties, updating the unified dataset name. + + :param new_unified_dataset_name: The new unified dataset name. + :type new_unified_dataset_name: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec( + {**self._data, "unifiedDatasetName": new_unified_dataset_name} + ) + + def with_unified_attribute_name(self, new_unified_attribute_name): + """Creates a new spec with the same properties, updating the unified attribute name. + + :param new_unified_attribute_name: The new unified attribute name. 
+ :type new_unified_attribute_name: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.attribute_mapping.resource.AttributeMappingSpec` + """ + return AttributeMappingSpec( + {**self._data, "unifiedAttributeName": new_unified_attribute_name} + ) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"dict={self._data})" + ) diff --git a/tamr_unify_client/models/project/collection.py b/tamr_unify_client/project/collection.py similarity index 56% rename from tamr_unify_client/models/project/collection.py rename to tamr_unify_client/project/collection.py index 2b49a235..359b7649 100644 --- a/tamr_unify_client/models/project/collection.py +++ b/tamr_unify_client/project/collection.py @@ -1,9 +1,9 @@ -from tamr_unify_client.models.base_collection import BaseCollection -from tamr_unify_client.models.project.resource import Project +from tamr_unify_client.base_collection import BaseCollection +from tamr_unify_client.project.resource import Project class ProjectCollection(BaseCollection): - """Collection of :class:`~tamr_unify_client.models.project.resource.Project` s. + """Collection of :class:`~tamr_unify_client.project.resource.Project` s. :param client: Client for API call delegation. :type client: :class:`~tamr_unify_client.Client` @@ -21,7 +21,7 @@ def by_resource_id(self, resource_id): :param resource_id: The resource ID. E.g. ``"1"`` :type resource_id: str :returns: The specified project. - :rtype: :class:`~tamr_unify_client.models.project.resource.Project` + :rtype: :class:`~tamr_unify_client.project.resource.Project` """ return super().by_resource_id("projects", resource_id) @@ -31,7 +31,7 @@ def by_relative_id(self, relative_id): :param relative_id: The resource ID. E.g. ``"projects/1"`` :type relative_id: str :returns: The specified project. 
- :rtype: :class:`~tamr_unify_client.models.project.resource.Project` + :rtype: :class:`~tamr_unify_client.project.resource.Project` """ return super().by_relative_id(Project, relative_id) @@ -41,7 +41,7 @@ def by_external_id(self, external_id): :param external_id: The external ID. :type external_id: str :returns: The specified project, if found. - :rtype: :class:`~tamr_unify_client.models.project.resource.Project` + :rtype: :class:`~tamr_unify_client.project.resource.Project` :raises KeyError: If no project with the specified external_id is found :raises LookupError: If multiple projects with the specified external_id are found """ @@ -52,7 +52,7 @@ def stream(self): over this collection. :returns: Stream of projects. - :rtype: Python generator yielding :class:`~tamr_unify_client.models.project.resource.Project` + :rtype: Python generator yielding :class:`~tamr_unify_client.project.resource.Project` Usage: >>> for project in collection.stream(): # explicit @@ -62,4 +62,31 @@ def stream(self): """ return super().stream(Project) + def by_name(self, project_name: str) -> Project: + """Get project by name + + Fetches a specific project in this collection by exact-match on name. + + Args: + project_name: Name of the desired project. + Raises: + KeyError: If no project with specified name was found. + """ + for project in self: + if project.name == project_name: + return project + raise KeyError(f"No project found with name: {project_name}") + + def create(self, creation_spec): + """ + Create a Project in Tamr + + :param creation_spec: Project creation specification should be formatted as specified in the `Public Docs for Creating a Project `_. 
+ :type creation_spec: dict[str, str] + :returns: The created Project + :rtype: :class:`~tamr_unify_client.project.resource.Project` + """ + data = self.client.post(self.api_path, json=creation_spec).successful().json() + return Project.from_json(self.client, data) + # super.__repr__ is sufficient diff --git a/tamr_unify_client/project/resource.py b/tamr_unify_client/project/resource.py new file mode 100644 index 00000000..0e1f83be --- /dev/null +++ b/tamr_unify_client/project/resource.py @@ -0,0 +1,290 @@ +from copy import deepcopy + +from tamr_unify_client.base_resource import BaseResource +from tamr_unify_client.dataset.collection import DatasetCollection +from tamr_unify_client.dataset.resource import Dataset +from tamr_unify_client.project.attribute_configuration.collection import ( + AttributeConfigurationCollection, +) +from tamr_unify_client.project.attribute_mapping.collection import ( + AttributeMappingCollection, +) + + +class Project(BaseResource): + """A Tamr project.""" + + @classmethod + def from_json(cls, client, resource_json, api_path=None): + return super().from_data(client, resource_json, api_path) + + @property + def name(self): + """:type: str""" + return self._data.get("name") + + @property + def external_id(self): + """:type: str""" + return self._data.get("externalId") + + @property + def description(self): + """:type: str""" + return self._data.get("description") + + @property + def type(self): + """A Tamr project type, listed in https://docs.tamr.com/reference#create-a-project. + + :type: str + """ + return self._data.get("type") + + @property + def attributes(self): + """Attributes of this project. + + :return: Attributes of this project. 
+ :rtype: :class:`~tamr_unify_client.attribute.collection.AttributeCollection` + """ + from tamr_unify_client.attribute.collection import AttributeCollection + + alias = self.api_path + "/attributes" + return AttributeCollection(self.client, alias) + + def unified_dataset(self): + """Unified dataset for this project. + + :return: Unified dataset for this project. + :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset` + """ + alias = self.api_path + "/unifiedDataset" + resource_json = self.client.get(alias).successful().json() + return Dataset.from_json(self.client, resource_json, alias) + + def as_categorization(self): + """Convert this project to a :class:`~tamr_unify_client.categorization.project.CategorizationProject` + + :return: This project. + :rtype: :class:`~tamr_unify_client.categorization.project.CategorizationProject` + :raises TypeError: If the :attr:`~tamr_unify_client.project.resource.Project.type` of this project is not ``"CATEGORIZATION"`` + """ + from tamr_unify_client.categorization.project import CategorizationProject + + if self.type != "CATEGORIZATION": + raise TypeError( + f"Cannot convert project to categorization project. Project type: {self.type}" + ) + return CategorizationProject(self.client, self._data, self.api_path) + + def as_mastering(self): + """Convert this project to a :class:`~tamr_unify_client.mastering.project.MasteringProject` + + :return: This project. + :rtype: :class:`~tamr_unify_client.mastering.project.MasteringProject` + :raises TypeError: If the :attr:`~tamr_unify_client.project.resource.Project.type` of this project is not ``"DEDUP"`` + """ + from tamr_unify_client.mastering.project import MasteringProject + + if self.type != "DEDUP": + raise TypeError( + f"Cannot convert project to mastering project. Project type: {self.type}" + ) + return MasteringProject(self.client, self._data, self.api_path) + + def add_input_dataset(self, dataset): + """ + Associate a dataset with a project in Tamr. 
+ + By default, datasets are not associated with any projects. + They need to be added as input to a project before they can be used + as part of that project + + :param dataset: The dataset to associate with the project. + :type dataset: :class:`~tamr_unify_client.dataset.resource.Dataset` + :return: HTTP response from the server + :rtype: :class:`requests.Response` + """ + params = {"id": dataset.relative_id} + response = self.client.post( + self.api_path + "/inputDatasets", params=params + ).successful() + return response + + def remove_input_dataset(self, dataset): + """Remove a dataset from a project. + + :param dataset: The dataset to be removed from this project. + :type dataset: :class:`~tamr_unify_client.dataset.resource.Dataset` + :return: HTTP response from the server + :rtype: :class:`requests.Response` + """ + params = {"id": dataset.relative_id} + response = self.client.delete( + self.api_path + "/inputDatasets", params=params + ).successful() + return response + + def input_datasets(self): + """Retrieve a collection of this project's input datasets. + + :return: The project's input datasets. + :rtype: :class:`~tamr_unify_client.dataset.collection.DatasetCollection` + """ + alias = self.api_path + "/inputDatasets" + return DatasetCollection(self.client, alias) + + def attribute_configurations(self): + """Project's attribute's configurations. + + :returns: The configurations of the attributes of a project. + :rtype: :class:`~tamr_unify_client.project.attribute_configuration.collection.AttributeConfigurationCollection` + """ + alias = self.api_path + "/attributeConfigurations" + info = AttributeConfigurationCollection(self.client, api_path=alias) + return info + + def attribute_mappings(self): + """Project's attribute's mappings. + + :returns: The attribute mappings of a project. 
+ :rtype: :class:`~tamr_unify_client.project.attribute_mapping.collection.AttributeMappingCollection` + """ + alias = self.api_path + "/attributeMappings" + info = AttributeMappingCollection(self.client, alias) + return info + + def spec(self): + """Returns this project's spec. + + :return: The spec for the project. + :rtype: :class:`~tamr_unify_client.project.resource.ProjectSpec` + """ + return ProjectSpec.of(self) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"relative_id={self.relative_id!r}, " + f"name={self.name!r}, " + f"type={self.type!r})" + ) + + +class ProjectSpec: + """A representation of the server view of a project.""" + + def __init__(self, client, data, api_path): + self.client = client + self._data = data + self.api_path = api_path + + @staticmethod + def of(resource): + """Creates a project spec from a project. + + :param resource: The existing project. + :type resource: :class:`~tamr_unify_client.project.resource.Project` + :return: The corresponding project spec. + :rtype: :class:`~tamr_unify_client.project.resource.ProjectSpec` + """ + return ProjectSpec(resource.client, deepcopy(resource._data), resource.api_path) + + @staticmethod + def new(): + """Creates a blank spec that could be used to construct a new project. + + :return: The empty spec. + :rtype: :class:`~tamr_unify_client.project.resource.ProjectSpec` + """ + return ProjectSpec(None, {}, None) + + def from_data(self, data): + """Creates a spec with the same client and API path as this one, but new data. + + :param data: The data for the new spec. + :type data: dict + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.resource.ProjectSpec` + """ + return ProjectSpec(self.client, data, self.api_path) + + def to_dict(self): + """Returns a version of this spec that conforms to the API representation. + + :returns: The spec's dict. 
+ :rtype: dict + """ + return deepcopy(self._data) + + def with_name(self, new_name): + """Creates a new spec with the same properties, updating name. + + :param new_name: The new name. + :type new_name: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.resource.ProjectSpec` + """ + return self.from_data({**self._data, "name": new_name}) + + def with_description(self, new_description): + """Creates a new spec with the same properties, updating description. + + :param new_description: The new description. + :type new_description: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.resource.ProjectSpec` + """ + return self.from_data({**self._data, "description": new_description}) + + def with_type(self, new_type): + """Creates a new spec with the same properties, updating type. + + :param new_type: The new type. + :type new_type: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.resource.ProjectSpec` + """ + return self.from_data({**self._data, "type": new_type}) + + def with_external_id(self, new_external_id): + """Creates a new spec with the same properties, updating external ID. + + :param new_external_id: The new external ID. + :type new_external_id: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.resource.ProjectSpec` + """ + return self.from_data({**self._data, "externalId": new_external_id}) + + def with_unified_dataset_name(self, new_unified_dataset_name): + """Creates a new spec with the same properties, updating unified dataset name. + + :param new_unified_dataset_name: The new unified dataset name. + :type new_unified_dataset_name: str + :return: The new spec. + :rtype: :class:`~tamr_unify_client.project.resource.ProjectSpec` + """ + return self.from_data( + {**self._data, "unifiedDatasetName": new_unified_dataset_name} + ) + + def put(self): + """Commits these changes by updating the project in Tamr. + + :return: The updated project. 
+ :rtype: :class:`~tamr_unify_client.project.resource.Project` + """ + updated_json = ( + self.client.put(self.api_path, json=self._data).successful().json() + ) + return Project.from_json(self.client, updated_json, self.api_path) + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"dict={self._data})" + ) diff --git a/tamr_unify_client/project/step.py b/tamr_unify_client/project/step.py new file mode 100644 index 00000000..3f9b8f84 --- /dev/null +++ b/tamr_unify_client/project/step.py @@ -0,0 +1,65 @@ +class ProjectStep: + """A step of a Tamr project. This is not a `BaseResource` because it has no API path + and cannot be directly retrieved or modified. + + See https://docs.tamr.com/reference#retrieve-downstream-dataset-usage + + :param client: Delegate underlying API calls to this client. + :type client: :class:`~tamr_unify_client.Client` + :param data: The JSON body containing project step information. + :type data: :py:class:`dict` + """ + + def __init__(self, client, data): + self.client = client + self._data = data + + @property + def project_step_id(self): + """:type: str""" + return self._data.get("projectStepId") + + @property + def project_step_name(self): + """:type: str""" + return self._data.get("projectStepName") + + @property + def project_name(self): + """:type: str""" + return self._data.get("projectName") + + @property + def type(self): + """A Tamr project type, listed in https://docs.tamr.com/reference#create-a-project. + + :type: str""" + return self._data.get("type") + + def project(self): + """Retrieves the :class:`~tamr_unify_client.project.resource.Project` this step is associated with. + + :returns: This step's project. + :rtype: :class:`~tamr_unify_client.project.resource.Project` + :raises KeyError: If no project with the specified name is found. + :raises LookupError: If multiple projects with the specified name are found. 
+ """ + name = self.project_name + projects = [p for p in self.client.projects if p.name == name] + + if len(projects) == 0: + raise KeyError(f'No project found with name "{name}"') + elif len(projects) > 1: + raise LookupError(f'Multiple projects found with name "{name}"') + + return projects[0] + + def __repr__(self): + return ( + f"{self.__class__.__module__}." + f"{self.__class__.__qualname__}(" + f"project_step_id={self.project_step_id!r}, " + f"project_step_name={self.project_step_name!r}, " + f"project_name={self.project_name!r}, " + f"type={self.type!r})" + ) diff --git a/tamr_unify_client/response.py b/tamr_unify_client/response.py new file mode 100644 index 00000000..ac9bc349 --- /dev/null +++ b/tamr_unify_client/response.py @@ -0,0 +1,31 @@ +import logging + +import requests + +logger = logging.getLogger(__name__) + + +def successful(response: requests.Response) -> requests.Response: + """Ensure response does not contain an HTTP error. + + Delegates to :func:`requests.Response.raise_for_status` + + Returns: + The response being checked. + + Raises: + requests.exceptions.HTTPError: If an HTTP error is encountered. + """ + try: + response.raise_for_status() + except requests.HTTPError as e: + r = e.response + logger.error( + f"Encountered HTTP error code {r.status_code}. 
Response body: {r.text}" + ) + raise e + return response + + +def _monkey_patch(): + requests.Response.successful = successful diff --git a/tests/mock_api/test_continuous_mastering.py b/tests/mock_api/test_continuous_mastering.py index 04ad7753..4d1512cf 100644 --- a/tests/mock_api/test_continuous_mastering.py +++ b/tests/mock_api/test_continuous_mastering.py @@ -4,7 +4,8 @@ from tamr_unify_client import Client from tamr_unify_client.auth import UsernamePasswordAuth -from .utils import mock_api +from tests.mock_api.utils import mock_api + basedir = os.path.dirname(__file__) response_log_path = os.path.join( @@ -42,7 +43,7 @@ def test_continuous_mastering(): assert op.succeeded() estimate_url = ( - f"http://localhost:9100/api/versioned/v1/projects/1/estimatedPairCounts" + "http://localhost:9100/api/versioned/v1/projects/1/estimatedPairCounts" ) estimate_json = { "isUpToDate": "true", diff --git a/tests/response_logs/continuous_mastering.ndjson b/tests/response_logs/continuous_mastering.ndjson index ccd14a66..70d87e06 100644 --- a/tests/response_logs/continuous_mastering.ndjson +++ b/tests/response_logs/continuous_mastering.ndjson @@ -80,7 +80,9 @@ {"method": "POST", "url": "http://10.10.0.92:9100/api/versioned/v1/projects/1/recordClusters:refresh", "status": 200, "json": {"id": "497", "type": "SPARK", "description": "Clustering", "status": {"state": "PENDING", "startTime": "", "endTime": "", "message": "Job has not yet been submitted to the executor"}, "created": {"username": "epeck", "time": "2019-03-07T20:16:55.789Z", "version": "23830"}, "lastModified": {"username": "epeck", "time": "2019-03-07T20:16:55.789Z", "version": "23830"}, "relativeId": "operations/497"}} {"method": "GET", "url": "http://10.10.0.92:9100/api/versioned/v1/operations/497", "status": 200, "json": {"id": "497", "type": "SPARK", "description": "Clustering", "status": {"state": "RUNNING", "startTime": "", "endTime": "", "message": "Job has not yet been submitted to the executor"}, "created": 
{"username": "epeck", "time": "2019-03-07T20:16:55.789Z", "version": "23830"}, "lastModified": {"username": "epeck", "time": "2019-03-07T20:16:55.789Z", "version": "23830"}, "relativeId": "operations/497"}} {"method": "GET", "url": "http://10.10.0.92:9100/api/versioned/v1/operations/497", "status": 200, "json": {"id": "497", "type": "SPARK", "description": "Clustering", "status": {"state": "SUCCEEDED", "startTime": "", "endTime": "", "message": "Job has not yet been submitted to the executor"}, "created": {"username": "epeck", "time": "2019-03-07T20:16:55.789Z", "version": "23830"}, "lastModified": {"username": "epeck", "time": "2019-03-07T20:16:55.789Z", "version": "23830"}, "relativeId": "operations/497"}} -{"method": "POST", "url": "http://10.10.0.92:9100/api/versioned/v1/projects/1/publishedClusters:refresh", "status": 202, "json": {"id": "21", "type": "SPARK", "description": "Publish clusters", "status": {"state": "PENDING", "message": "Job has not yet been submitted to the executor"}, "created": {"username": "admin", "time": "2018-12-14T19:42:46.755Z", "version": "603"}, "lastModified": {"username": "admin", "time": "2018-12-14T19:42:46.755Z", "version": "603"}, "relativeId": "operations/21"}} +{"method": "GET", "url": "http://10.10.0.92:9100/api/versioned/v1/projects/1/unifiedDataset", "status": 200, "json": {"id": "unify://unified-data/v1/datasets/8", "name": "Project_1_unified_dataset", "description":"", "created":{"username":"admin","time":"2019-06-05T16:28:11.639Z","version":"83"}, "lastModified":{"username":"admin","time":"2019-06-10T15:06:24.856Z","version":"5983"}, "relativeId": "datasets/8"}} +{"method": "GET", "url": "http://10.10.0.92:9100/api/versioned/v1/datasets", "status": 200, "json": [{"id": "unify://unified-data/v1/datasets/32", "name": "Project_1_unified_dataset_dedup_published_clusters", "description": "All the mappings of records to clusters.", "unifiedDatasetName": "Project_1_unified_dataset", "created": {"username": "admin", "time": 
"2019-06-05T18:35:32.407Z", "version": "553"}, "lastModified": {"username": "admin", "time": "2019-06-11T14:00:38.576Z", "version": "6792"}, "relativeId": "datasets/32"}]} +{"method": "POST", "url": "http://10.10.0.92:9100/api/versioned/v1/projects/1/publishedClusters:refresh", "status": 202, "json": {"id": "21", "type": "SPARK", "description": "Publish clusters", "status": {"state": "PENDING", "startTime": "", "endTime": "", "message": "Job has not yet been submitted to Spark"}, "created": {"username": "admin", "time": "2019-06-24T15:58:48.734Z", "version": "2407"}, "lastModified": {"username": "admin", "time": "2019-06-24T15:58:48.734Z", "version": "2407"}, "relativeId": "operations/21"}} {"method": "GET", "url": "http://10.10.0.92:9100/api/versioned/v1/operations/21", "status": 200, "json": {"id": "21", "type": "SPARK", "description": "Publish clusters", "status": {"state": "PENDING", "message": "Job has not yet been submitted to the executor"}, "created": {"username": "admin", "time": "2018-12-14T19:42:46.755Z", "version": "603"}, "lastModified": {"username": "admin", "time": "2018-12-14T19:42:46.755Z", "version": "603"}, "relativeId": "operations/21"}} {"method": "GET", "url": "http://10.10.0.92:9100/api/versioned/v1/operations/21", "status": 200, "json": {"id": "21", "type": "SPARK", "description": "Publish clusters", "status": {"state": "PENDING", "message": "Job has not yet been submitted to the executor"}, "created": {"username": "admin", "time": "2018-12-14T19:42:46.755Z", "version": "603"}, "lastModified": {"username": "admin", "time": "2018-12-14T19:42:46.755Z", "version": "603"}, "relativeId": "operations/21"}} {"method": "GET", "url": "http://10.10.0.92:9100/api/versioned/v1/operations/21", "status": 200, "json": {"id": "21", "type": "SPARK", "description": "Publish clusters", "status": {"state": "PENDING", "message": "Job has not yet been submitted to the executor"}, "created": {"username": "admin", "time": "2018-12-14T19:42:46.755Z", "version": 
"603"}, "lastModified": {"username": "admin", "time": "2018-12-14T19:42:46.755Z", "version": "603"}, "relativeId": "operations/21"}} diff --git a/tests/tamr_client/attribute/test_attribute.py b/tests/tamr_client/attribute/test_attribute.py new file mode 100644 index 00000000..810d9521 --- /dev/null +++ b/tests/tamr_client/attribute/test_attribute.py @@ -0,0 +1,147 @@ +import pytest + +import tamr_client as tc +from tests.tamr_client import fake, utils + + +def test_from_json(): + attrs_json = utils.load_json("attributes.json") + dataset_id = 1 + for attr_json in attrs_json: + attr_id = attr_json["name"] + url = tc.URL(path=f"datasets/{dataset_id}/attributes/{attr_id}") + attr = tc.attribute._from_json(url, attr_json) + assert attr.name == attr_json["name"] + assert attr.description == attr_json["description"] + assert attr.is_nullable == attr_json["isNullable"] + + +def test_json(): + """original -> to_json -> from_json -> original""" + attrs_json = utils.load_json("attributes.json") + dataset_id = 1 + for attr_json in attrs_json: + attr_id = attr_json["name"] + url = tc.URL(f"datasets/{dataset_id}/attributes/{attr_id}") + attr = tc.attribute._from_json(url, attr_json) + assert attr == tc.attribute._from_json(url, tc.attribute.to_json(attr)) + + +@fake.json +def test_create(): + s = fake.session() + dataset = fake.dataset() + + attrs = tuple( + [ + tc.SubAttribute( + name=str(i), + is_nullable=True, + type=tc.attribute.type.Array(tc.attribute.type.STRING), + ) + for i in range(4) + ] + ) + + attr = tc.attribute.create( + s, + dataset, + name="attr", + is_nullable=False, + type=tc.attribute.type.Record(attributes=attrs), + description="an attribute", + ) + + assert attr.name == "attr" + assert not attr.is_nullable + assert isinstance(attr.type, tc.attribute.type.Record) + assert attr.type.attributes == attrs + assert attr.description == "an attribute" + + +@fake.json +def test_update(): + s = fake.session() + attr = fake.attribute() + + updated_attr = 
tc.attribute.update( + s, attr, description="Synthetic row number updated" + ) + + assert updated_attr.description == "Synthetic row number updated" + + +@fake.json +def test_delete(): + s = fake.session() + attr = fake.attribute() + + tc.attribute.delete(s, attr) + + +@fake.json +def test_by_resource_id(): + s = fake.session() + dataset = fake.dataset() + + attrs = tuple( + [ + tc.SubAttribute( + name=str(i), + is_nullable=True, + type=tc.attribute.type.Array(tc.attribute.type.STRING), + ) + for i in range(4) + ] + ) + + attr = tc.attribute.by_resource_id(s, dataset, "attr") + + assert attr.name == "attr" + assert not attr.is_nullable + assert isinstance(attr.type, tc.attribute.type.Record) + assert attr.type.attributes == attrs + + +@fake.json +def test_by_resource_id_attribute_not_found(): + s = fake.session() + dataset = fake.dataset() + + with pytest.raises(tc.attribute.NotFound): + tc.attribute.by_resource_id(s, dataset, "attr") + + +def test_create_reserved_attribute_name(): + s = fake.session() + dataset = fake.dataset() + + with pytest.raises(tc.attribute.ReservedName): + tc.attribute.create(s, dataset, name="clusterId", is_nullable=False) + + +@fake.json +def test_create_attribute_exists(): + s = fake.session() + dataset = fake.dataset() + + with pytest.raises(tc.attribute.AlreadyExists): + tc.attribute.create(s, dataset, name="attr", is_nullable=False) + + +@fake.json +def test_update_attribute_not_found(): + s = fake.session() + attr = fake.attribute() + + with pytest.raises(tc.attribute.NotFound): + tc.attribute.update(s, attr) + + +@fake.json +def test_delete_attribute_not_found(): + s = fake.session() + attr = fake.attribute() + + with pytest.raises(tc.attribute.NotFound): + tc.attribute.delete(s, attr) diff --git a/tests/tamr_client/attribute/test_type.py b/tests/tamr_client/attribute/test_type.py new file mode 100644 index 00000000..49e6b967 --- /dev/null +++ b/tests/tamr_client/attribute/test_type.py @@ -0,0 +1,76 @@ +import pytest + +import 
tamr_client as tc +from tests.tamr_client import utils + + +def test_from_json(): + geom_json = utils.load_json("attributes.json")[1] + geom_type = tc.attribute.type.from_json(geom_json["type"]) + assert isinstance(geom_type, tc.attribute.type.Record) + + for i, subattr in enumerate(geom_type.attributes): + assert isinstance(subattr, tc.SubAttribute) + if i == 0: + assert subattr.name == "point" + assert subattr.type == tc.attribute.type.Array(tc.attribute.type.DOUBLE) + assert subattr.is_nullable + elif i == 1: + assert subattr.name == "lineString" + assert subattr.type == tc.attribute.type.Array( + tc.attribute.type.Array(tc.attribute.type.DOUBLE) + ) + assert subattr.is_nullable + elif i == 2: + assert subattr.name == "polygon" + assert subattr.type == tc.attribute.type.Array( + tc.attribute.type.Array( + tc.attribute.type.Array(tc.attribute.type.DOUBLE) + ) + ) + assert subattr.is_nullable + + +def test_from_json_missing_base_type(): + type_json: tc._types.JsonDict = {"attributes": []} + + with pytest.raises(ValueError): + tc.attribute.type.from_json(type_json) + + +def test_from_json_unrecognized_base_type(): + type_json: tc._types.JsonDict = {"baseType": "NOT_A_TYPE", "attributes": []} + + with pytest.raises(ValueError): + tc.attribute.type.from_json(type_json) + + +def test_from_json_array_missing_inner_type(): + type_json: tc._types.JsonDict = {"baseType": "ARRAY"} + + with pytest.raises(ValueError): + tc.attribute.type.from_json(type_json) + + +def test_from_json_map_missing_inner_type(): + type_json: tc._types.JsonDict = {"baseType": "MAP"} + + with pytest.raises(ValueError): + tc.attribute.type.from_json(type_json) + + +def test_from_json_record_missing_attributes(): + type_json: tc._types.JsonDict = {"baseType": "RECORD"} + + with pytest.raises(ValueError): + tc.attribute.type.from_json(type_json) + + +def test_json(): + attrs_json = utils.load_json("attributes.json") + for attr_json in attrs_json: + attr_type_json = attr_json["type"] + attr_type = 
tc.attribute.type.from_json(attr_type_json) + assert attr_type == tc.attribute.type.from_json( + tc.attribute.type.to_json(attr_type) + ) diff --git a/tamr_unify_client/models/attribute/__init__.py b/tests/tamr_client/categorization/__init__.py similarity index 100% rename from tamr_unify_client/models/attribute/__init__.py rename to tests/tamr_client/categorization/__init__.py diff --git a/tests/tamr_client/categorization/test_categorization.py b/tests/tamr_client/categorization/test_categorization.py new file mode 100644 index 00000000..649393e5 --- /dev/null +++ b/tests/tamr_client/categorization/test_categorization.py @@ -0,0 +1,42 @@ +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_manual_labels(): + s = fake.session() + project = fake.categorization_project() + + tc.categorization.manual_labels(session=s, project=project) + + +@fake.json +def test_apply_feedback_async(): + s = fake.session() + project = fake.categorization_project() + + op = tc.categorization._apply_feedback_async(s, project) + assert op.type == "SPARK" + assert op.description == "Materialize views to Elastic" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } + + +@fake.json +def test_update_results_async(): + s = fake.session() + project = fake.categorization_project() + + op = tc.categorization._update_results_async(s, project) + assert op.type == "SPARK" + assert op.description == "Materialize views to Elastic" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } diff --git a/tests/tamr_client/categorization/test_categorization_project.py b/tests/tamr_client/categorization/test_categorization_project.py new file mode 100644 index 00000000..f60935c1 --- /dev/null +++ b/tests/tamr_client/categorization/test_categorization_project.py @@ -0,0 +1,18 @@ +import tamr_client as tc 
+from tests.tamr_client import fake + + +@fake.json +def test_create(): + s = fake.session() + instance = fake.instance() + + project = tc.categorization.project.create( + s, + instance, + name="New Categorization Project", + description="A Categorization Project", + ) + assert isinstance(project, tc.CategorizationProject) + assert project.name == "New Categorization Project" + assert project.description == "A Categorization Project" diff --git a/tests/tamr_client/data/attributes.json b/tests/tamr_client/data/attributes.json new file mode 100644 index 00000000..5b42540b --- /dev/null +++ b/tests/tamr_client/data/attributes.json @@ -0,0 +1,119 @@ +[ + { + "name": "RowNum", + "description": "Synthetic row number", + "type": { + "baseType": "STRING", + "attributes": [] + }, + "isNullable": false + }, + { + "name": "geom", + "description": "", + "type": { + "baseType": "RECORD", + "attributes": [ + { + "name": "point", + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "DOUBLE", + "attributes": [] + }, + "attributes": [] + }, + "isNullable": true + }, + { + "name": "lineString", + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "ARRAY", + "innerType": { + "baseType": "DOUBLE", + "attributes": [] + }, + "attributes": [] + }, + "attributes": [] + }, + "isNullable": true + }, + { + "name": "polygon", + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "ARRAY", + "innerType": { + "baseType": "ARRAY", + "innerType": { + "baseType": "DOUBLE", + "attributes": [] + }, + "attributes": [] + }, + "attributes": [] + }, + "attributes": [] + }, + "isNullable": true + } + ] + }, + "isNullable": false + }, + { + "name": "attr", + "description": "", + "isNullable": false, + "type": { + "baseType": "RECORD", + "attributes": [ + { + "name": "0", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "1", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": 
{ + "baseType": "STRING" + } + } + }, + { + "name": "2", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "3", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + } + ] + } + } +] diff --git a/tests/tamr_client/data/dataset.json b/tests/tamr_client/data/dataset.json new file mode 100644 index 00000000..b9934ef0 --- /dev/null +++ b/tests/tamr_client/data/dataset.json @@ -0,0 +1,23 @@ +{ + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "dataset 1 name", + "description": "dataset 1 description", + "version": "dataset 1 version", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] +} \ No newline at end of file diff --git a/tests/tamr_client/data/operation_failed.json b/tests/tamr_client/data/operation_failed.json new file mode 100644 index 00000000..17c196f7 --- /dev/null +++ b/tests/tamr_client/data/operation_failed.json @@ -0,0 +1,22 @@ +{ + "id": "1", + "type": "SPARK", + "description": "operation 1 description", + "status": { + "state": "FAILED", + "startTime": "", + "endTime": "", + "message": "" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" +} \ No newline at end of file diff --git a/tests/tamr_client/data/operation_pending.json b/tests/tamr_client/data/operation_pending.json new file mode 100644 index 00000000..c2169d4e --- /dev/null +++ 
b/tests/tamr_client/data/operation_pending.json @@ -0,0 +1,22 @@ +{ + "id": "1", + "type": "SPARK", + "description": "operation 1 description", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" +} \ No newline at end of file diff --git a/tests/tamr_client/data/operation_succeeded.json b/tests/tamr_client/data/operation_succeeded.json new file mode 100644 index 00000000..010d7ef0 --- /dev/null +++ b/tests/tamr_client/data/operation_succeeded.json @@ -0,0 +1,22 @@ +{ + "id": "1", + "type": "SPARK", + "description": "operation 1 description", + "status": { + "state": "SUCCEEDED", + "startTime": "", + "endTime": "", + "message": "" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" +} \ No newline at end of file diff --git a/tests/tamr_client/dataset/test_dataframe.py b/tests/tamr_client/dataset/test_dataframe.py new file mode 100644 index 00000000..df49c13d --- /dev/null +++ b/tests/tamr_client/dataset/test_dataframe.py @@ -0,0 +1,173 @@ +import pandas as pd +import pytest + +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_upsert(): + s = fake.session() + dataset = fake.dataset() + + df = pd.DataFrame(_records_json) + + response = tc.dataframe.upsert(s, dataset, df, primary_key_name="primary_key") + assert response == _response_json + + +def test_upsert_primary_key_not_found(): + s = fake.session() + dataset = fake.dataset() + + df = 
pd.DataFrame(_records_json) + + with pytest.raises(tc.primary_key.NotFound): + tc.dataframe.upsert(s, dataset, df, primary_key_name="wrong_primary_key") + + +@fake.json +def test_upsert_infer_primary_key(): + s = fake.session() + dataset = fake.dataset() + + df = pd.DataFrame(_records_json) + + response = tc.dataframe.upsert(s, dataset, df) + assert response == _response_json + + +@fake.json +def test_upsert_index_as_primary_key(): + s = fake.session() + dataset = fake.dataset() + + df = pd.DataFrame( + _records_json_2, + index=[record["primary_key"] for record in _records_with_keys_json_2], + ) + df.index.name = "primary_key" + + response = tc.dataframe.upsert(s, dataset, df, primary_key_name="primary_key") + assert response == _response_json + + +def test_upsert_index_column_name_collision(): + s = fake.session() + dataset = fake.dataset() + + df = pd.DataFrame(_records_json_2) + df.index.name = "primary_key" + + # create column in `df` with same name as index and matching "primary_key" + df.insert(0, df.index.name, df.index) + + with pytest.raises(tc.primary_key.Ambiguous): + tc.dataframe.upsert(s, dataset, df, primary_key_name="primary_key") + + +@fake.json +def test_create(): + s = fake.session() + instance = fake.instance() + + df = pd.DataFrame(_records_with_keys_json_2) + + dataset = tc.dataframe.create( + s, instance, df, name="df_dataset", primary_key_name="primary_key" + ) + assert dataset.name == "df_dataset" + assert dataset.key_attribute_names == ("primary_key",) + + +@fake.json +def test_create_infer_primary_key_from_index(): + s = fake.session() + instance = fake.instance() + + df = pd.DataFrame( + _records_json_2, + index=[record["primary_key"] for record in _records_with_keys_json_2], + ) + df.index.name = "primary_key" + + dataset = tc.dataframe.create(s, instance, df, name="df_dataset") + assert dataset.name == "df_dataset" + assert dataset.key_attribute_names == ("primary_key",) + + +def test_create_no_primary_key(): + s = fake.session() + 
instance = fake.instance() + + df = pd.DataFrame(_records_with_keys_json_2) + + with pytest.raises(tc.primary_key.NotFound): + tc.dataframe.create(s, instance, df, name="df_dataset") + + +def test_create_primary_key_not_found(): + s = fake.session() + instance = fake.instance() + + df = pd.DataFrame(_records_with_keys_json_2) + + with pytest.raises(tc.primary_key.NotFound): + tc.dataframe.create( + s, instance, df, name="df_dataset", primary_key_name="wrong_primary_key" + ) + + +@fake.json +def test_create_handle_attribute_failure(): + s = fake.session() + instance = fake.instance() + + df = pd.DataFrame(_records_with_keys_json_2) + + with pytest.raises(tc.dataframe.CreationFailure): + tc.dataframe.create( + s, instance, df, name="df_dataset", primary_key_name="primary_key" + ) + + +@fake.json +def test_create_deletion_failure(): + s = fake.session() + instance = fake.instance() + + df = pd.DataFrame(_records_with_keys_json_2) + + with pytest.raises(tc.dataframe.CreationFailure): + tc.dataframe.create( + s, instance, df, name="df_dataset", primary_key_name="primary_key" + ) + + +@fake.json +def test_create_handle_record_failure(): + s = fake.session() + instance = fake.instance() + + df = pd.DataFrame(_records_with_keys_json_2) + + with pytest.raises(tc.dataframe.CreationFailure): + tc.dataframe.create( + s, instance, df, name="df_dataset", primary_key_name="primary_key" + ) + + +_records_json = [{"primary_key": 1}, {"primary_key": 2}] + +_records_json_2 = [{"attribute": 1}, {"attribute": 2}] + +_records_with_keys_json_2 = [ + {"primary_key": 1, "attribute": 1}, + {"primary_key": 2, "attribute": 2}, +] + +_response_json = { + "numCommandsProcessed": 2, + "allCommandsSucceeded": True, + "validationErrors": [], +} diff --git a/tests/tamr_client/dataset/test_dataset.py b/tests/tamr_client/dataset/test_dataset.py new file mode 100644 index 00000000..02936d45 --- /dev/null +++ b/tests/tamr_client/dataset/test_dataset.py @@ -0,0 +1,196 @@ +import pytest + +import 
tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_by_resource_id(): + s = fake.session() + instance = fake.instance() + + dataset = tc.dataset.by_resource_id(s, instance, "1") + assert dataset.name == "dataset 1 name" + assert dataset.description == "dataset 1 description" + assert dataset.key_attribute_names == ("tamr_id",) + + +@fake.json +def test_by_resource_id_dataset_not_found(): + s = fake.session() + instance = fake.instance() + + with pytest.raises(tc.dataset.NotFound): + tc.dataset.by_resource_id(s, instance, "1") + + +@fake.json +def test_by_name(): + s = fake.session() + instance = fake.instance() + + dataset = tc.dataset.by_name(s, instance, "dataset 1 name") + assert dataset.name == "dataset 1 name" + assert dataset.description == "dataset 1 description" + assert dataset.key_attribute_names == ("tamr_id",) + + +@fake.json +def test_by_name_dataset_not_found(): + s = fake.session() + instance = fake.instance() + + with pytest.raises(tc.dataset.NotFound): + tc.dataset.by_name(s, instance, "missing dataset") + + +@fake.json +def test_by_name_dataset_ambiguous(): + s = fake.session() + instance = fake.instance() + + with pytest.raises(tc.dataset.Ambiguous): + tc.dataset.by_name(s, instance, "ambiguous dataset") + + +@fake.json +def test_attributes(): + s = fake.session() + dataset = fake.dataset() + + attrs = tc.dataset.attributes(s, dataset) + + row_num = attrs[0] + assert row_num.name == "RowNum" + assert row_num.type == tc.attribute.type.STRING + + geom = attrs[1] + assert geom.name == "geom" + assert isinstance(geom.type, tc.attribute.type.Record) + + +@fake.json +def test_materialize_async(): + s = fake.session() + dataset = fake.dataset() + + op = tc.dataset._materialize_async(s, dataset) + + assert op.type == "SPARK" + assert op.description == "Materialize views to Elastic" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } + + 
+@fake.json +def test_delete(): + s = fake.session() + dataset = fake.dataset() + + tc.dataset.delete(s, dataset) + + +@fake.json +def test_delete_cascading(): + s = fake.session() + dataset = fake.dataset() + + tc.dataset.delete(s, dataset, cascade=True) + + +@fake.json +def test_delete_dataset_not_found(): + s = fake.session() + dataset = fake.dataset() + + with pytest.raises(tc.dataset.NotFound): + tc.dataset.delete(s, dataset) + + +@fake.json +def test_get_all(): + s = fake.session() + instance = fake.instance() + + all_datasets = tc.dataset.get_all(s, instance) + assert len(all_datasets) == 2 + + dataset_1 = all_datasets[0] + assert dataset_1.name == "dataset 1 name" + assert dataset_1.description == "dataset 1 description" + assert dataset_1.key_attribute_names == ("tamr_id",) + + dataset_2 = all_datasets[1] + assert dataset_2.name == "dataset 2 name" + assert dataset_2.description == "dataset 2 description" + assert dataset_2.key_attribute_names == ("tamr_id",) + + +@fake.json +def test_get_all_filter(): + s = fake.session() + instance = fake.instance() + + all_datasets = tc.dataset.get_all( + s, instance, filter="description==dataset 2 description" + ) + assert len(all_datasets) == 1 + + dataset = all_datasets[0] + assert dataset.name == "dataset 2 name" + assert dataset.description == "dataset 2 description" + assert dataset.key_attribute_names == ("tamr_id",) + + +@fake.json +def test_get_all_filter_list(): + s = fake.session() + instance = fake.instance() + + all_datasets = tc.dataset.get_all( + s, + instance, + filter=["description==dataset 2 description", "version==dataset 2 version"], + ) + assert len(all_datasets) == 1 + + dataset = all_datasets[0] + assert dataset.name == "dataset 2 name" + assert dataset.description == "dataset 2 description" + assert dataset.key_attribute_names == ("tamr_id",) + + +@fake.json +def test_create(): + s = fake.session() + instance = fake.instance() + + dataset = tc.dataset.create( + s, + instance, + name="new 
dataset", + key_attribute_names=("primary_key",), + description="a new dataset", + ) + assert dataset.name == "new dataset" + assert dataset.description == "a new dataset" + assert dataset.key_attribute_names == ("primary_key",) + + +@fake.json +def test_create_dataset_already_exists(): + s = fake.session() + instance = fake.instance() + + with pytest.raises(tc.dataset.AlreadyExists): + tc.dataset.create( + s, + instance, + name="new dataset", + key_attribute_names=("primary_key",), + description="a new dataset", + ) diff --git a/tests/tamr_client/dataset/test_record.py b/tests/tamr_client/dataset/test_record.py new file mode 100644 index 00000000..55c5c9d7 --- /dev/null +++ b/tests/tamr_client/dataset/test_record.py @@ -0,0 +1,104 @@ +import pytest + +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_update(): + s = fake.session() + dataset = fake.dataset() + + updates = [ + tc.record._create_command(record, primary_key_name="primary_key") + for record in _records_json + ] + + response = tc.record._update(s, dataset, updates) + assert response == _response_json + + +@fake.json +def test_upsert(): + s = fake.session() + dataset = fake.dataset() + + response = tc.record.upsert( + s, dataset, _records_json, primary_key_name="primary_key" + ) + assert response == _response_json + + +def test_upsert_primary_key_not_found(): + s = fake.session() + dataset = fake.dataset() + + with pytest.raises(tc.primary_key.NotFound): + tc.record.upsert( + s, dataset, _records_json, primary_key_name="wrong_primary_key" + ) + + +@fake.json +def test_upsert_infer_primary_key(): + s = fake.session() + dataset = fake.dataset() + + response = tc.record.upsert(s, dataset, _records_json) + assert response == _response_json + + +@fake.json +def test_delete(): + s = fake.session() + dataset = fake.dataset() + + response = tc.record.delete( + s, dataset, _records_json, primary_key_name="primary_key" + ) + assert response == _response_json + + +def 
test_delete_primary_key_not_found(): + s = fake.session() + dataset = fake.dataset() + + with pytest.raises(tc.primary_key.NotFound): + tc.record.delete( + s, dataset, _records_json, primary_key_name="wrong_primary_key" + ) + + +@fake.json +def test_delete_infer_primary_key(): + s = fake.session() + dataset = fake.dataset() + + response = tc.record.delete(s, dataset, _records_json) + assert response == _response_json + + +@fake.json +def test_stream(): + s = fake.session() + dataset = fake.dataset() + + records = tc.record.stream(s, dataset) + assert list(records) == _records_json + + +@fake.json +def test_delete_all(): + s = fake.session() + dataset = fake.dataset() + + tc.record.delete_all(s, dataset) + + +_records_json = [{"primary_key": 1}, {"primary_key": 2}] + +_response_json = { + "numCommandsProcessed": 2, + "allCommandsSucceeded": True, + "validationErrors": [], +} diff --git a/tests/tamr_client/dataset/test_unified.py b/tests/tamr_client/dataset/test_unified.py new file mode 100644 index 00000000..618125d2 --- /dev/null +++ b/tests/tamr_client/dataset/test_unified.py @@ -0,0 +1,40 @@ +import pytest + +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_from_project(): + s = fake.session() + project = fake.mastering_project() + + unified_dataset = tc.dataset.unified.from_project(s, project) + assert unified_dataset.name == "dataset 1 name" + assert unified_dataset.description == "dataset 1 description" + assert unified_dataset.key_attribute_names == ("tamr_id",) + + +@fake.json +def test_from_project_dataset_not_found(): + s = fake.session() + project = fake.mastering_project() + + with pytest.raises(tc.dataset.unified.NotFound): + tc.dataset.unified.from_project(s, project) + + +@fake.json +def test_apply_changes_async(): + s = fake.session() + unified_dataset = fake.unified_dataset() + + op = tc.dataset.unified._apply_changes_async(s, unified_dataset) + assert op.type == "SPARK" + assert op.description == "operation 1 
description" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } diff --git a/tests/tamr_client/fake.py b/tests/tamr_client/fake.py new file mode 100644 index 00000000..ed2133be --- /dev/null +++ b/tests/tamr_client/fake.py @@ -0,0 +1,200 @@ +""" +Utilities for faking Tamr resources and server responses for testing. + +For more, see "How to write tests" in the Contributor guide. +""" + +from functools import partial, wraps +from inspect import getfile +from json import dumps, load, loads +from pathlib import Path +from typing import Dict, Tuple + +import responses + +import tamr_client as tc +from tamr_client._types import JsonDict + + +tests_tc_dir = (Path(__file__) / "..").resolve() +fake_json_dir = tests_tc_dir / "fake_json" + + +class WrongRequestBody(Exception): + """Raised when the body of a request does not match the value expected during + testing + """ + + pass + + +def _check_request_body(request, expected_body: JsonDict): + """Checks that the body of a caught request matches the expected content + + The body is decoded and loaded as a JSON object so the comparison is not sensitive to the + order of dictionary contents. The comparison is sensitive to the order of a newline-delimited + JSON request body. 
+ + Args: + request: The caught request + expected_body: The expected request body as a dictionary (for JSON contents) or a list of + dictionaries (for newline-delimited JSON contents) + """ + if isinstance(expected_body, list): + actual_body = [loads(x.decode("utf-8")) for x in request.body] + if actual_body != expected_body: + raise WrongRequestBody(actual_body) + elif expected_body is not None: + actual_body = loads(request.body.decode("utf-8")) + if actual_body != expected_body: + raise WrongRequestBody(actual_body) + + +def _callback( + request, expected_body: JsonDict, status: int, response_json: str +) -> Tuple[int, Dict, str]: + """Adds a callback to intercept an API request, check the validity of the request, and emit a + response + + Args: + expected_body: The expected request body as a dictionary (for JSON contents) or a list of + dictionaries (for newline-delimited JSON contents) + status: The status of the response to be emitted + response_json: The JSON body of the response to be emitted + + Returns: + Response status, headers, and JSON body. 
This conforms to the callback interface of + `~responses.RequestsMock.add_callback` + """ + _check_request_body(request, expected_body) + return status, {}, response_json + + +def add_response(rsps, fake: JsonDict): + """Adds a mock response to intercept API requests and respond with fake JSON data + + Args: + fake: The JSON dictionary containing the fake data defining what requests to intercept and + what responses to emit + """ + req = fake["request"] + resp = fake["response"] + + url = req.get("url") + if url is None: + path = req.get("path") + url = "http://localhost/api/versioned/v1/" + path + + # Get response body from either ndjson or json + if resp.get("ndjson") is not None: + resp["body"] = "\n".join((dumps(line) for line in resp["ndjson"])) + elif resp.get("json") is not None: + resp["body"] = dumps(resp["json"]) + + # Get expected request body from ndjson + if req.get("ndjson") is not None: + req["body"] = req["ndjson"] + elif req.get("json") is not None: + req["body"] = req["json"] + + rsps.add_callback( + method=req["method"], + url=url, + callback=partial( + _callback, + expected_body=req.get("body"), + status=resp["status"], + response_json=resp.get("body"), + ), + ) + + +def json(test_fn): + """Intercept API requests and respond with fake JSON data. + + Will look in fake_json directory for data corresponding to the decorated test. + Data format is a JSON list of request/response pairs in order of execution. 
+ """ + test_file = Path(getfile(test_fn)) + + fakes_mod_path = fake_json_dir / test_file.relative_to(tests_tc_dir).with_suffix("") + fakes_test_path = (fakes_mod_path / test_fn.__name__).with_suffix(".json") + with open(fakes_test_path) as f: + fakes = load(f) + + @wraps(test_fn) + def wrapper(*args, **kwargs): + with responses.RequestsMock() as rsps: + for fake in fakes: + add_response(rsps, fake) + test_fn(*args, **kwargs) + + return wrapper + + +def session() -> tc.Session: + auth = tc.UsernamePasswordAuth("username", "password") + s = tc.session.from_auth(auth) + return s + + +def instance() -> tc.Instance: + return tc.Instance() + + +def dataset() -> tc.Dataset: + url = tc.URL(path="datasets/1") + dataset = tc.Dataset(url, name="dataset.csv", key_attribute_names=("primary_key",)) + return dataset + + +def unified_dataset() -> tc.UnifiedDataset: + url = tc.URL(path="projects/1/unifiedDataset") + unified_dataset = tc.dataset.unified.UnifiedDataset( + url, name="dataset.csv", key_attribute_names=("primary_key",) + ) + return unified_dataset + + +def mastering_project() -> tc.MasteringProject: + url = tc.URL(path="projects/1") + mastering_project = tc.MasteringProject( + url, name="Project 1", description="A Mastering Project" + ) + return mastering_project + + +def categorization_project() -> tc.CategorizationProject: + url = tc.URL(path="projects/2") + categorization_project = tc.CategorizationProject( + url, name="Project 2", description="A Categorization Project" + ) + return categorization_project + + +def golden_records_project() -> tc.GoldenRecordsProject: + url = tc.URL(path="projects/3") + golden_records_project = tc.GoldenRecordsProject( + url, name="Project 3", description="A Golden Records Project" + ) + return golden_records_project + + +def transforms() -> tc.Transformations: + return tc.Transformations( + input_scope=[ + tc.InputTransformation("SELECT *, 1 as one;"), + tc.InputTransformation("SELECT *, 2 as two;", datasets=[dataset()]), + ], + 
unified_scope=["//Comment\nSELECT *;"], + ) + + +def attribute() -> tc.Attribute: + return tc.Attribute( + url=tc.URL(path="datasets/1/attributes/RowNum"), + name="RowNum", + type=tc.attribute.type.DEFAULT, + description="Synthetic row number", + is_nullable=False, + ) diff --git a/tests/tamr_client/fake_json/attribute/test_attribute/test_by_resource_id.json b/tests/tamr_client/fake_json/attribute/test_attribute/test_by_resource_id.json new file mode 100644 index 00000000..2f843e34 --- /dev/null +++ b/tests/tamr_client/fake_json/attribute/test_attribute/test_by_resource_id.json @@ -0,0 +1,60 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets/1/attributes/attr" + }, + "response": { + "status": 200, + "json": { + "name": "attr", + "isNullable": false, + "type": { + "baseType": "RECORD", + "attributes": [ + { + "name": "0", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "1", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "2", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "3", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + } + ] + } + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/attribute/test_attribute/test_by_resource_id_attribute_not_found.json b/tests/tamr_client/fake_json/attribute/test_attribute/test_by_resource_id_attribute_not_found.json new file mode 100644 index 00000000..3ce31cb9 --- /dev/null +++ b/tests/tamr_client/fake_json/attribute/test_attribute/test_by_resource_id_attribute_not_found.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets/1/attributes/attr" + }, + "response": { + "status": 404 + } + } +] \ No newline at end of file diff --git 
a/tests/tamr_client/fake_json/attribute/test_attribute/test_create.json b/tests/tamr_client/fake_json/attribute/test_attribute/test_create.json new file mode 100644 index 00000000..4b7f5e25 --- /dev/null +++ b/tests/tamr_client/fake_json/attribute/test_attribute/test_create.json @@ -0,0 +1,111 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1/attributes", + "json": { + "name": "attr", + "isNullable": false, + "type": { + "baseType": "RECORD", + "attributes": [ + { + "name": "0", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "1", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "2", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "3", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + } + ] + }, + "description": "an attribute" + } + }, + "response": { + "status": 201, + "json": { + "name": "attr", + "isNullable": false, + "type": { + "baseType": "RECORD", + "attributes": [ + { + "name": "0", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "1", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "2", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + }, + { + "name": "3", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + } + ] + }, + "description": "an attribute" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/attribute/test_attribute/test_create_attribute_exists.json b/tests/tamr_client/fake_json/attribute/test_attribute/test_create_attribute_exists.json new file mode 100644 
index 00000000..38d9b1a9 --- /dev/null +++ b/tests/tamr_client/fake_json/attribute/test_attribute/test_create_attribute_exists.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1/attributes" + }, + "response": { + "status": 409 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/attribute/test_attribute/test_delete.json b/tests/tamr_client/fake_json/attribute/test_attribute/test_delete.json new file mode 100644 index 00000000..25ea6fa8 --- /dev/null +++ b/tests/tamr_client/fake_json/attribute/test_attribute/test_delete.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "DELETE", + "path": "datasets/1/attributes/RowNum" + }, + "response": { + "status": 204 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/attribute/test_attribute/test_delete_attribute_not_found.json b/tests/tamr_client/fake_json/attribute/test_attribute/test_delete_attribute_not_found.json new file mode 100644 index 00000000..fb0be2ec --- /dev/null +++ b/tests/tamr_client/fake_json/attribute/test_attribute/test_delete_attribute_not_found.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "DELETE", + "path": "datasets/1/attributes/RowNum" + }, + "response": { + "status": 404 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/attribute/test_attribute/test_update.json b/tests/tamr_client/fake_json/attribute/test_attribute/test_update.json new file mode 100644 index 00000000..0c1de2f2 --- /dev/null +++ b/tests/tamr_client/fake_json/attribute/test_attribute/test_update.json @@ -0,0 +1,20 @@ +[ + { + "request": { + "method": "PUT", + "path": "datasets/1/attributes/RowNum" + }, + "response": { + "status": 200, + "json": { + "name": "RowNum", + "description": "Synthetic row number updated", + "type": { + "baseType": "STRING", + "attributes": [] + }, + "isNullable": false + } + } + } +] \ No newline at end of file diff --git 
a/tests/tamr_client/fake_json/attribute/test_attribute/test_update_attribute_not_found.json b/tests/tamr_client/fake_json/attribute/test_attribute/test_update_attribute_not_found.json new file mode 100644 index 00000000..0b505578 --- /dev/null +++ b/tests/tamr_client/fake_json/attribute/test_attribute/test_update_attribute_not_found.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "PUT", + "path": "datasets/1/attributes/RowNum" + }, + "response": { + "status": 404 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/categorization/test_categorization/test_apply_feedback_async.json b/tests/tamr_client/fake_json/categorization/test_categorization/test_apply_feedback_async.json new file mode 100644 index 00000000..b05cd6a1 --- /dev/null +++ b/tests/tamr_client/fake_json/categorization/test_categorization/test_apply_feedback_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/2/categorizations/model:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "Materialize views to Elastic", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/categorization/test_categorization/test_manual_labels.json b/tests/tamr_client/fake_json/categorization/test_categorization/test_manual_labels.json new file mode 100644 index 00000000..e95e0565 --- /dev/null +++ b/tests/tamr_client/fake_json/categorization/test_categorization/test_manual_labels.json @@ -0,0 +1,70 @@ +[ + { + "request": { + "method": "GET", + "path": 
"projects/2/unifiedDataset" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/161", + "name": "Party_Categorization_Unified_Dataset", + "description": "", + "version": "3607", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "afsana.afzal", + "time": "2020-05-21T15:18:38.575Z", + "version": "18336" + }, + "lastModified": { + "username": "workflow.bot", + "time": "2020-06-18T15:18:30.833Z", + "version": "149940" + }, + "relativeId": "datasets/161", + "upstreamDatasetIds": [ + "unify://unified-data/v1/datasets/106" + ], + "externalId": "Party_Categorization_Unified_Dataset" + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets?filter=name==Party_Categorization_Unified_Dataset_manual_categorizations" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/datasets/167", + "name": "Party_Categorization_Unified_Dataset_manual_categorizations", + "description": "Manual categorizations", + "version": "2992", + "keyAttributeNames": [ + "recordId" + ], + "tags": [], + "created": { + "username": "afsana.afzal", + "time": "2020-06-01T20:49:46.549Z", + "version": "57920" + }, + "lastModified": { + "username": "workflow.bot", + "time": "2020-06-18T15:32:44.631Z", + "version": "150069" + }, + "relativeId": "datasets/167", + "upstreamDatasetIds": [], + "externalId": "Party_Categorization_Unified_Dataset_manual_categorizations" + } + ] + } + } +] diff --git a/tests/tamr_client/fake_json/categorization/test_categorization/test_update_results_async.json b/tests/tamr_client/fake_json/categorization/test_categorization/test_update_results_async.json new file mode 100644 index 00000000..6b3a26b0 --- /dev/null +++ b/tests/tamr_client/fake_json/categorization/test_categorization/test_update_results_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/2/categorizations:refresh" + }, + "response": { + "status": 200, + "json": { + 
"id": "1", + "type": "SPARK", + "description": "Materialize views to Elastic", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/categorization/test_categorization_project/test_create.json b/tests/tamr_client/fake_json/categorization/test_categorization_project/test_create.json new file mode 100644 index 00000000..249426ae --- /dev/null +++ b/tests/tamr_client/fake_json/categorization/test_categorization_project/test_create.json @@ -0,0 +1,65 @@ +[ + { + "request": { + "method": "POST", + "path": "projects", + "json": { + "name": "New Categorization Project", + "type": "CATEGORIZATION", + "unifiedDatasetName": "New Categorization Project_unified_dataset", + "description": "A Categorization Project", + "externalId": null + } + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/projects/2", + "name": "New Categorization Project", + "description": "A Categorization Project", + "type": "CATEGORIZATION", + "unifiedDatasetName": "New Categorization Project_unified_dataset", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "modified version" + }, + "relativeId": "projects/2", + "externalId": "b129f3b1-82f5-4e30-90a3-e562ca977992" + } + } + }, + { + "request": { + "method": "GET", + "path": "projects/2" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/projects/2", + "name": "New Categorization Project", + 
"description": "A Categorization Project", + "type": "CATEGORIZATION", + "unifiedDatasetName": "New Categorization Project_unified_dataset", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "modified version" + }, + "relativeId": "projects/2", + "externalId": "b129f3b1-82f5-4e30-90a3-e562ca977992" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataframe/test_create.json b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create.json new file mode 100644 index 00000000..1b40fa7a --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create.json @@ -0,0 +1,168 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets", + "json": { + "name": "df_dataset", + "keyAttributeNames": [ + "primary_key" + ], + "description": null, + "externalId": null + } + }, + "response": { + "status": 201, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": 
"dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "POST", + "path": "datasets/1/attributes", + "json": { + "name": "attribute", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + } + }, + "response": { + "status": 201, + "json": { + "name": "attribute", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + }, + "description": null + } + } + }, + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "CREATE", + "recordId": 1, + "record": { + "primary_key": 1, + "attribute": 1 + } + }, + { + "action": "CREATE", + "recordId": 2, + "record": { + "primary_key": 2, + "attribute": 2 + } + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_deletion_failure.json b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_deletion_failure.json new file mode 100644 index 
00000000..0f5cd930 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_deletion_failure.json @@ -0,0 +1,103 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets", + "json": { + "name": "df_dataset", + "keyAttributeNames": [ + "primary_key" + ], + "description": null, + "externalId": null + } + }, + "response": { + "status": 201, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "POST", + "path": "datasets/1/attributes", + "json": { + "name": "attribute", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + } + }, + "response": { + "status": 500, + "json": {} + } + }, + { + "request": { + "method": "DELETE", + "path": "datasets/1?cascade=false" + }, + "response": { + "status": 500 + } + } +] \ No newline at end of file diff 
--git a/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_handle_attribute_failure.json b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_handle_attribute_failure.json new file mode 100644 index 00000000..845bd592 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_handle_attribute_failure.json @@ -0,0 +1,103 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets", + "json": { + "name": "df_dataset", + "keyAttributeNames": [ + "primary_key" + ], + "description": null, + "externalId": null + } + }, + "response": { + "status": 201, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "POST", + "path": "datasets/1/attributes", + "json": { + "name": "attribute", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" 
+ } + } + } + }, + "response": { + "status": 500, + "json": {} + } + }, + { + "request": { + "method": "DELETE", + "path": "datasets/1?cascade=false" + }, + "response": { + "status": 204 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_handle_record_failure.json b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_handle_record_failure.json new file mode 100644 index 00000000..d745a5d7 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_handle_record_failure.json @@ -0,0 +1,141 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets", + "json": { + "name": "df_dataset", + "keyAttributeNames": [ + "primary_key" + ], + "description": null, + "externalId": null + } + }, + "response": { + "status": 201, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": 
[] + } + } + }, + { + "request": { + "method": "POST", + "path": "datasets/1/attributes", + "json": { + "name": "attribute", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + } + }, + "response": { + "status": 201, + "json": { + "name": "attribute", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + }, + "description": null + } + } + }, + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "CREATE", + "recordId": 1, + "record": { + "primary_key": 1, + "attribute": 1 + } + }, + { + "action": "CREATE", + "recordId": 2, + "record": { + "primary_key": 2, + "attribute": 2 + } + } + ] + }, + "response": { + "status": 500, + "json": {} + } + }, + { + "request": { + "method": "DELETE", + "path": "datasets/1?cascade=false" + }, + "response": { + "status": 204 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_infer_primary_key_from_index.json b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_infer_primary_key_from_index.json new file mode 100644 index 00000000..1b40fa7a --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataframe/test_create_infer_primary_key_from_index.json @@ -0,0 +1,168 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets", + "json": { + "name": "df_dataset", + "keyAttributeNames": [ + "primary_key" + ], + "description": null, + "externalId": null + } + }, + "response": { + "status": 201, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": 
"2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "POST", + "path": "datasets/1/attributes", + "json": { + "name": "attribute", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + } + } + }, + "response": { + "status": 201, + "json": { + "name": "attribute", + "isNullable": true, + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "STRING" + } + }, + "description": null + } + } + }, + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "CREATE", + "recordId": 1, + "record": { + "primary_key": 1, + "attribute": 1 + } + }, + { + "action": "CREATE", + "recordId": 2, + "record": { + "primary_key": 2, + "attribute": 2 + } + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "df_dataset", + "description": null, + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + 
"created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataframe/test_upsert.json b/tests/tamr_client/fake_json/dataset/test_dataframe/test_upsert.json new file mode 100644 index 00000000..de3de037 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataframe/test_upsert.json @@ -0,0 +1,32 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "CREATE", + "recordId": 1, + "record": { + "primary_key": 1 + } + }, + { + "action": "CREATE", + "recordId": 2, + "record": { + "primary_key": 2 + } + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataframe/test_upsert_index_as_primary_key.json b/tests/tamr_client/fake_json/dataset/test_dataframe/test_upsert_index_as_primary_key.json new file mode 100644 index 00000000..573f8f53 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataframe/test_upsert_index_as_primary_key.json @@ -0,0 +1,34 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "CREATE", + "recordId": 1, + "record": { + "primary_key": 1, + "attribute": 1 + } + }, + { + "action": "CREATE", + "recordId": 2, + "record": { + "primary_key": 2, + "attribute": 2 + } + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + } +] \ No newline at end of file diff --git 
a/tests/tamr_client/fake_json/dataset/test_dataframe/test_upsert_infer_primary_key.json b/tests/tamr_client/fake_json/dataset/test_dataframe/test_upsert_infer_primary_key.json new file mode 100644 index 00000000..de3de037 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataframe/test_upsert_infer_primary_key.json @@ -0,0 +1,32 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "CREATE", + "recordId": 1, + "record": { + "primary_key": 1 + } + }, + { + "action": "CREATE", + "recordId": 2, + "record": { + "primary_key": 2 + } + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_attributes.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_attributes.json new file mode 100644 index 00000000..49ebc078 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_attributes.json @@ -0,0 +1,80 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets/1/attributes" + }, + "response": { + "status": 200, + "json": [ + { + "name": "RowNum", + "description": "Synthetic row number", + "type": { + "baseType": "STRING", + "attributes": [] + }, + "isNullable": false + }, + { + "name": "geom", + "description": "", + "type": { + "baseType": "RECORD", + "attributes": [ + { + "name": "point", + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "DOUBLE", + "attributes": [] + }, + "attributes": [] + }, + "isNullable": true + }, + { + "name": "lineString", + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "ARRAY", + "innerType": { + "baseType": "DOUBLE", + "attributes": [] + }, + "attributes": [] + }, + "attributes": [] + }, + "isNullable": true + }, + { + "name": "polygon", + "type": { + "baseType": "ARRAY", + "innerType": { + "baseType": "ARRAY", + 
"innerType": { + "baseType": "ARRAY", + "innerType": { + "baseType": "DOUBLE", + "attributes": [] + }, + "attributes": [] + }, + "attributes": [] + }, + "attributes": [] + }, + "isNullable": true + } + ] + }, + "isNullable": false + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_by_name.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_name.json new file mode 100644 index 00000000..f3908e62 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_name.json @@ -0,0 +1,36 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets?filter=name==dataset 1 name" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "dataset 1 name", + "description": "dataset 1 description", + "version": "dataset 1 version", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_by_name_dataset_ambiguous.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_name_dataset_ambiguous.json new file mode 100644 index 00000000..a23428eb --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_name_dataset_ambiguous.json @@ -0,0 +1,59 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets?filter=name==ambiguous dataset" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "ambiguous dataset", + "description": "description", + "version": "version", + "keyAttributeNames": [ + 
"tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + }, + { + "id": "unify://unified-data/v1/datasets/2", + "externalId": "number 2", + "name": "ambiguous dataset", + "description": "description", + "version": "version", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 2 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 2 modified version" + }, + "relativeId": "datasets/2", + "upstreamDatasetIds": [] + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_by_name_dataset_not_found.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_name_dataset_not_found.json new file mode 100644 index 00000000..588865c7 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_name_dataset_not_found.json @@ -0,0 +1,12 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets?filter=name==missing dataset" + }, + "response": { + "status": 200, + "json": [] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_by_resource_id.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_resource_id.json new file mode 100644 index 00000000..dad693f2 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_resource_id.json @@ -0,0 +1,34 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "dataset 1 name", + 
"description": "dataset 1 description", + "version": "dataset 1 version", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_by_resource_id_dataset_not_found.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_resource_id_dataset_not_found.json new file mode 100644 index 00000000..fdc4349f --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_by_resource_id_dataset_not_found.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 404 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_create.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_create.json new file mode 100644 index 00000000..b79ee4cb --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_create.json @@ -0,0 +1,74 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets", + "json": { + "name": "new dataset", + "keyAttributeNames": [ + "primary_key" + ], + "description": "a new dataset", + "externalId": null + } + }, + "response": { + "status": 201, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "new dataset", + "description": "a new dataset", + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 
modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "new dataset", + "description": "a new dataset", + "version": "dataset version", + "keyAttributeNames": [ + "primary_key" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_create_dataset_already_exists.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_create_dataset_already_exists.json new file mode 100644 index 00000000..c44b9c6a --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_create_dataset_already_exists.json @@ -0,0 +1,22 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets", + "json": { + "name": "new dataset", + "keyAttributeNames": [ + "primary_key" + ], + "description": "a new dataset", + "externalId": null + } + }, + "response": { + "status": 400, + "json": { + "message": "Dataset \"new dataset\" already exists" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_delete.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_delete.json new file mode 100644 index 00000000..d6de19ba --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_delete.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "DELETE", + "path": "datasets/1?cascade=false" + }, + "response": { + "status": 204 + } + } +] \ No newline at end of file diff --git 
a/tests/tamr_client/fake_json/dataset/test_dataset/test_delete_cascading.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_delete_cascading.json new file mode 100644 index 00000000..f01636e9 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_delete_cascading.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "DELETE", + "path": "datasets/1?cascade=true" + }, + "response": { + "status": 204 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_delete_dataset_not_found.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_delete_dataset_not_found.json new file mode 100644 index 00000000..9a566cfe --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_delete_dataset_not_found.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "DELETE", + "path": "datasets/1?cascade=false" + }, + "response": { + "status": 404 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_get_all.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_get_all.json new file mode 100644 index 00000000..fa555908 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_get_all.json @@ -0,0 +1,59 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "dataset 1 name", + "description": "dataset 1 description", + "version": "dataset 1 version", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + }, + { + "id": "unify://unified-data/v1/datasets/2", + 
"externalId": "number 2", + "name": "dataset 2 name", + "description": "dataset 2 description", + "version": "dataset 2 version", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 2 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 2 modified version" + }, + "relativeId": "datasets/2", + "upstreamDatasetIds": [] + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_get_all_filter.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_get_all_filter.json new file mode 100644 index 00000000..b99f4a4b --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_get_all_filter.json @@ -0,0 +1,36 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets?filter=description==dataset%202%20description" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/datasets/2", + "externalId": "number 2", + "name": "dataset 2 name", + "description": "dataset 2 description", + "version": "dataset 2 version", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 2 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 2 modified version" + }, + "relativeId": "datasets/2", + "upstreamDatasetIds": [] + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_get_all_filter_list.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_get_all_filter_list.json new file mode 100644 index 00000000..c935936b --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_get_all_filter_list.json @@ -0,0 +1,36 @@ +[ + { + "request": { + "method": "GET", + "path": 
"datasets?filter=description==dataset%202%20description&?filter=version==dataset%202%20version" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/datasets/2", + "externalId": "number 2", + "name": "dataset 2 name", + "description": "dataset 2 description", + "version": "dataset 2 version", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 2 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 2 modified version" + }, + "relativeId": "datasets/2", + "upstreamDatasetIds": [] + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_dataset/test_materialize_async.json b/tests/tamr_client/fake_json/dataset/test_dataset/test_materialize_async.json new file mode 100644 index 00000000..d28ed3dd --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_dataset/test_materialize_async.json @@ -0,0 +1,34 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "Materialize views to Elastic", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + + } + } + } +] diff --git a/tests/tamr_client/fake_json/dataset/test_record/test_delete.json b/tests/tamr_client/fake_json/dataset/test_record/test_delete.json new file mode 100644 index 00000000..6c180542 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_record/test_delete.json @@ -0,0 +1,26 @@ 
+[ + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "DELETE", + "recordId": 1 + }, + { + "action": "DELETE", + "recordId": 2 + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_record/test_delete_all.json b/tests/tamr_client/fake_json/dataset/test_record/test_delete_all.json new file mode 100644 index 00000000..17824491 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_record/test_delete_all.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "DELETE", + "path": "datasets/1/records" + }, + "response": { + "status": 204 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_record/test_delete_infer_primary_key.json b/tests/tamr_client/fake_json/dataset/test_record/test_delete_infer_primary_key.json new file mode 100644 index 00000000..6c180542 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_record/test_delete_infer_primary_key.json @@ -0,0 +1,26 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "DELETE", + "recordId": 1 + }, + { + "action": "DELETE", + "recordId": 2 + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_record/test_stream.json b/tests/tamr_client/fake_json/dataset/test_record/test_stream.json new file mode 100644 index 00000000..b03b718e --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_record/test_stream.json @@ -0,0 +1,15 @@ +[ + { + "request": { + "method": "GET", + "path": "datasets/1/records" + }, + "response": { + "status": 200, + "ndjson": [ + {"primary_key": 1}, + {"primary_key": 
2} + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_record/test_update.json b/tests/tamr_client/fake_json/dataset/test_record/test_update.json new file mode 100644 index 00000000..de3de037 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_record/test_update.json @@ -0,0 +1,32 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "CREATE", + "recordId": 1, + "record": { + "primary_key": 1 + } + }, + { + "action": "CREATE", + "recordId": 2, + "record": { + "primary_key": 2 + } + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_record/test_upsert.json b/tests/tamr_client/fake_json/dataset/test_record/test_upsert.json new file mode 100644 index 00000000..de3de037 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_record/test_upsert.json @@ -0,0 +1,32 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "CREATE", + "recordId": 1, + "record": { + "primary_key": 1 + } + }, + { + "action": "CREATE", + "recordId": 2, + "record": { + "primary_key": 2 + } + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_record/test_upsert_infer_primary_key.json b/tests/tamr_client/fake_json/dataset/test_record/test_upsert_infer_primary_key.json new file mode 100644 index 00000000..de3de037 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_record/test_upsert_infer_primary_key.json @@ -0,0 +1,32 @@ +[ + { + "request": { + "method": "POST", + "path": "datasets/1:updateRecords", + "ndjson": [ + { + "action": "CREATE", + 
"recordId": 1, + "record": { + "primary_key": 1 + } + }, + { + "action": "CREATE", + "recordId": 2, + "record": { + "primary_key": 2 + } + } + ] + }, + "response": { + "status": 204, + "json": { + "numCommandsProcessed": 2, + "allCommandsSucceeded": true, + "validationErrors": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_unified/test_apply_changes_async.json b/tests/tamr_client/fake_json/dataset/test_unified/test_apply_changes_async.json new file mode 100644 index 00000000..0552bed0 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_unified/test_apply_changes_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/1/unifiedDataset:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "operation 1 description", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_unified/test_from_project.json b/tests/tamr_client/fake_json/dataset/test_unified/test_from_project.json new file mode 100644 index 00000000..4ff5b270 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_unified/test_from_project.json @@ -0,0 +1,34 @@ +[ + { + "request": { + "method": "GET", + "path": "projects/1/unifiedDataset" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "dataset 1 name", + "description": "dataset 1 description", + "version": "dataset 1 version", + "keyAttributeNames": [ + "tamr_id" + ], + 
"tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/dataset/test_unified/test_from_project_dataset_not_found.json b/tests/tamr_client/fake_json/dataset/test_unified/test_from_project_dataset_not_found.json new file mode 100644 index 00000000..53328d69 --- /dev/null +++ b/tests/tamr_client/fake_json/dataset/test_unified/test_from_project_dataset_not_found.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "GET", + "path": "projects/1/unifiedDataset" + }, + "response": { + "status": 404 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/golden_records/test_golden_records/test_publish_async.json b/tests/tamr_client/fake_json/golden_records/test_golden_records/test_publish_async.json new file mode 100644 index 00000000..e660817e --- /dev/null +++ b/tests/tamr_client/fake_json/golden_records/test_golden_records/test_publish_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/3/publishedGoldenRecords:refresh?validate=true&version=CURRENT" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "Updating published datasets for GoldenRecords module", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git 
a/tests/tamr_client/fake_json/golden_records/test_golden_records/test_update_async.json b/tests/tamr_client/fake_json/golden_records/test_golden_records/test_update_async.json new file mode 100644 index 00000000..c6cb0b4c --- /dev/null +++ b/tests/tamr_client/fake_json/golden_records/test_golden_records/test_update_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/3/goldenRecords:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "Updating Golden Records", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/mastering/test_mastering/test_apply_feedback_async.json b/tests/tamr_client/fake_json/mastering/test_mastering/test_apply_feedback_async.json new file mode 100644 index 00000000..1fe30561 --- /dev/null +++ b/tests/tamr_client/fake_json/mastering/test_mastering/test_apply_feedback_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/1/recordPairsWithPredictions/model:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "Materialize views to Elastic", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + 
"relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/mastering/test_mastering/test_estimate_pairs_async.json b/tests/tamr_client/fake_json/mastering/test_mastering/test_estimate_pairs_async.json new file mode 100644 index 00000000..fca4e1bf --- /dev/null +++ b/tests/tamr_client/fake_json/mastering/test_mastering/test_estimate_pairs_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/1/estimatedPairCounts:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "operation 1 description", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/mastering/test_mastering/test_generate_pairs_async.json b/tests/tamr_client/fake_json/mastering/test_mastering/test_generate_pairs_async.json new file mode 100644 index 00000000..4c0590a2 --- /dev/null +++ b/tests/tamr_client/fake_json/mastering/test_mastering/test_generate_pairs_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/1/recordPairs:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "Materialize views to Elastic", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": 
"2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/mastering/test_mastering/test_publish_clusters_async.json b/tests/tamr_client/fake_json/mastering/test_mastering/test_publish_clusters_async.json new file mode 100644 index 00000000..f937de85 --- /dev/null +++ b/tests/tamr_client/fake_json/mastering/test_mastering/test_publish_clusters_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/1/publishedClustersWithData:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "operation 1 description", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/mastering/test_mastering/test_update_cluster_results_async.json b/tests/tamr_client/fake_json/mastering/test_mastering/test_update_cluster_results_async.json new file mode 100644 index 00000000..2e2af14a --- /dev/null +++ b/tests/tamr_client/fake_json/mastering/test_mastering/test_update_cluster_results_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/1/recordClusters:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "Materialize views to Elastic", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + 
"version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/mastering/test_mastering/test_update_high_impact_pairs_async.json b/tests/tamr_client/fake_json/mastering/test_mastering/test_update_high_impact_pairs_async.json new file mode 100644 index 00000000..68a8fa8a --- /dev/null +++ b/tests/tamr_client/fake_json/mastering/test_mastering/test_update_high_impact_pairs_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/1/highImpactPairs:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "Materialize views to Elastic", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/mastering/test_mastering/test_update_pair_results_async.json b/tests/tamr_client/fake_json/mastering/test_mastering/test_update_pair_results_async.json new file mode 100644 index 00000000..89f6bcca --- /dev/null +++ b/tests/tamr_client/fake_json/mastering/test_mastering/test_update_pair_results_async.json @@ -0,0 +1,33 @@ +[ + { + "request": { + "method": "POST", + "path": "projects/1/recordPairsWithPredictions:refresh" + }, + "response": { + "status": 200, + "json": { + "id": "1", + "type": "SPARK", + "description": "Materialize views to Elastic", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has 
not yet been submitted to Spark" + }, + "created": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2020-06-12T18:21:42.288Z", + "version": "operation 1 modified version" + }, + "relativeId": "operations/1" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/mastering/test_mastering_project/test_create.json b/tests/tamr_client/fake_json/mastering/test_mastering_project/test_create.json new file mode 100644 index 00000000..c47166ec --- /dev/null +++ b/tests/tamr_client/fake_json/mastering/test_mastering_project/test_create.json @@ -0,0 +1,65 @@ +[ + { + "request": { + "method": "POST", + "path": "projects", + "json": { + "name": "New Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "New Mastering Project_unified_dataset", + "description": "A Mastering Project", + "externalId": null + } + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/projects/1", + "name": "New Mastering Project", + "description": "A Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "New Mastering Project_unified_dataset", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "modified version" + }, + "relativeId": "projects/1", + "externalId": "b129f3b1-82f5-4e30-90a3-e562ca977992" + } + } + }, + { + "request": { + "method": "GET", + "path": "projects/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/projects/1", + "name": "New Mastering Project", + "description": "A Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "New Mastering Project_unified_dataset", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "created version" + }, + "lastModified": { + 
"username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "modified version" + }, + "relativeId": "projects/1", + "externalId": "b129f3b1-82f5-4e30-90a3-e562ca977992" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/schema_mapping/test_schema_mapping_project/test_create.json b/tests/tamr_client/fake_json/schema_mapping/test_schema_mapping_project/test_create.json new file mode 100644 index 00000000..e431e9c9 --- /dev/null +++ b/tests/tamr_client/fake_json/schema_mapping/test_schema_mapping_project/test_create.json @@ -0,0 +1,65 @@ +[ + { + "request": { + "method": "POST", + "path": "projects", + "json": { + "name": "New Schema Mapping Project", + "type": "SCHEMA_MAPPING_RECOMMENDATIONS", + "unifiedDatasetName": "New Schema Mapping Project_unified_dataset", + "description": "A Schema Mapping Project", + "externalId": null + } + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/projects/3", + "name": "New Schema Mapping Project", + "description": "A Schema Mapping Project", + "type": "SCHEMA_MAPPING_RECOMMENDATIONS", + "unifiedDatasetName": "New Schema Mapping Project_unified_dataset", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "modified version" + }, + "relativeId": "projects/3", + "externalId": "b129f3b1-82f5-4e30-90a3-e562ca977992" + } + } + }, + { + "request": { + "method": "GET", + "path": "projects/3" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/projects/3", + "name": "New Schema Mapping Project", + "description": "A Schema Mapping Project", + "type": "SCHEMA_MAPPING_RECOMMENDATIONS", + "unifiedDatasetName": "New Schema Mapping Project_unified_dataset", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "created version" + }, + "lastModified": { + 
"username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "modified version" + }, + "relativeId": "projects/3", + "externalId": "b129f3b1-82f5-4e30-90a3-e562ca977992" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_backup/test_by_resource_id.json b/tests/tamr_client/fake_json/test_backup/test_by_resource_id.json new file mode 100644 index 00000000..dba9574f --- /dev/null +++ b/tests/tamr_client/fake_json/test_backup/test_by_resource_id.json @@ -0,0 +1,22 @@ +[ + { + "request": { + "method": "GET", + "path": "backups/2020-08-17_21-32-10-961" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/backups/2020-08-17_21-32-10-961", + "relativeId": "2020-08-17_21-32-10-961", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_21-32-10-961", + "state": "RUNNING", + "stage": "", + "errorMessage": "", + "created": "2020-08-17_21-32-10-961", + "lastModified": "2020-08-17_21-51-57-600" + } + } + } +] diff --git a/tests/tamr_client/fake_json/test_backup/test_cancel.json b/tests/tamr_client/fake_json/test_backup/test_cancel.json new file mode 100644 index 00000000..3698ee71 --- /dev/null +++ b/tests/tamr_client/fake_json/test_backup/test_cancel.json @@ -0,0 +1,22 @@ +[ + { + "request": { + "method": "POST", + "path": "backups/2020-08-17_21-32-10-961:cancel" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/backups/2020-08-17_21-32-10-961", + "relativeId": "2020-08-17_21-32-10-961", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_21-32-10-961", + "state": "CANCELED", + "stage": "", + "errorMessage": "", + "created": "2020-08-17_21-32-10-961", + "lastModified": "2020-08-17_21-51-57-600" + } + } + } +] diff --git a/tests/tamr_client/fake_json/test_backup/test_get_all.json b/tests/tamr_client/fake_json/test_backup/test_get_all.json new file mode 100644 index 00000000..945a0f26 --- /dev/null +++ 
b/tests/tamr_client/fake_json/test_backup/test_get_all.json @@ -0,0 +1,46 @@ +[ + { + "request": { + "method": "GET", + "path": "backups" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/backups/2020-08-17_21-32-10-961", + "relativeId": "2020-08-17_21-32-10-961", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_21-32-10-961", + "state": "CANCELED", + "stage": "", + "errorMessage": "", + "created": "2020-08-17_21-32-10-961", + "lastModified": "2020-08-17_21-51-57-600" + }, + { + "id": "unify://unified-data/v1/backups/2020-08-17_21-58-01-205", + "relativeId": "2020-08-17_21-58-01-205", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_21-58-01-205", + "state": "RUNNING", + "stage": "", + "errorMessage": "", + "created": "2020-08-17_21-58-01-205", + "lastModified": "2020-08-17_21-58-01-351" + }, + { + "id": "unify://unified-data/v1/backups/2020-08-17_21-58-45-062", + "relativeId": "2020-08-17_21-58-45-062", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_21-58-45-062", + "state": "FAILED", + "stage": "", + "errorMessage": "A system operation is already in progress", + "created": "2020-08-17_21-58-45-062", + "lastModified": "2020-08-17_21-58-45-249" + } + ] + } + } +] diff --git a/tests/tamr_client/fake_json/test_backup/test_initiate.json b/tests/tamr_client/fake_json/test_backup/test_initiate.json new file mode 100644 index 00000000..98157eee --- /dev/null +++ b/tests/tamr_client/fake_json/test_backup/test_initiate.json @@ -0,0 +1,22 @@ +[ + { + "request": { + "method": "POST", + "path": "backups" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/backups/2020-08-17_21-32-10-961", + "relativeId": "2020-08-17_21-32-10-961", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_21-32-10-961", + "state": "PENDING", + "stage": "", + "errorMessage": "", + "created": "2020-08-17_21-32-10-961", + "lastModified": 
"2020-08-17_21-32-10-961" + } + } + } +] diff --git a/tests/tamr_client/fake_json/test_backup/test_poll.json b/tests/tamr_client/fake_json/test_backup/test_poll.json new file mode 100644 index 00000000..dba9574f --- /dev/null +++ b/tests/tamr_client/fake_json/test_backup/test_poll.json @@ -0,0 +1,22 @@ +[ + { + "request": { + "method": "GET", + "path": "backups/2020-08-17_21-32-10-961" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/backups/2020-08-17_21-32-10-961", + "relativeId": "2020-08-17_21-32-10-961", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_21-32-10-961", + "state": "RUNNING", + "stage": "", + "errorMessage": "", + "created": "2020-08-17_21-32-10-961", + "lastModified": "2020-08-17_21-51-57-600" + } + } + } +] diff --git a/tests/tamr_client/fake_json/test_instance/test_version.json b/tests/tamr_client/fake_json/test_instance/test_version.json new file mode 100644 index 00000000..2bedc1fb --- /dev/null +++ b/tests/tamr_client/fake_json/test_instance/test_version.json @@ -0,0 +1,19 @@ +[ + { + "request": { + "method": "GET", + "url": "http://localhost/api/versioned/service/version" + }, + "response": { + "status": 200, + "json": { + "version": "2020.012.0", + "gitDescribe": "Element/release/1.0.3-26513-gab2085fb5d", + "gitCommitId": "ab2085fb5ddd626199f0e86c8a93561129629fad", + "gitCommitShort": "ab2085fb5d", + "gitCommitTime": "2020-06-18 03:28:36 PM UTC", + "buildTime": "2020-06-18 05:25:55 PM UTC" + } + } + } +] diff --git a/tests/tamr_client/fake_json/test_project/test_by_name.json b/tests/tamr_client/fake_json/test_project/test_by_name.json new file mode 100644 index 00000000..e665ad93 --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_by_name.json @@ -0,0 +1,32 @@ +[ + { + "request": { + "method": "GET", + "path": "projects?filter=name==proj" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/projects/1", + "name": "proj", + 
"description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "proj_unified_dataset", + "created": { + "username": "admin", + "time": "2020-04-03T14:14:18.752Z", + "version": "18" + }, + "lastModified": { + "username": "admin", + "time": "2020-04-03T14:14:20.115Z", + "version": "19" + }, + "relativeId": "projects/1", + "externalId": "58bdbe72-3c08-427d-97bd-45b16d92c79c" + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_project/test_by_name_project_ambiguous.json b/tests/tamr_client/fake_json/test_project/test_by_name_project_ambiguous.json new file mode 100644 index 00000000..c40d9b1b --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_by_name_project_ambiguous.json @@ -0,0 +1,51 @@ +[ + { + "request": { + "method": "GET", + "path": "projects?filter=name==ambiguous proj" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/projects/1", + "name": "ambiguous proj", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "proj_unified_dataset", + "created": { + "username": "admin", + "time": "2020-04-03T14:14:18.752Z", + "version": "18" + }, + "lastModified": { + "username": "admin", + "time": "2020-04-03T14:14:20.115Z", + "version": "19" + }, + "relativeId": "projects/1", + "externalId": "58bdbe72-3c08-427d-97bd-45b16d92c79c" + }, + { + "id": "unify://unified-data/v1/projects/2", + "name": "ambiguous proj", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "proj_unified_dataset", + "created": { + "username": "admin", + "time": "2020-04-03T14:14:18.752Z", + "version": "18" + }, + "lastModified": { + "username": "admin", + "time": "2020-04-03T14:14:20.115Z", + "version": "19" + }, + "relativeId": "projects/2", + "externalId": "39abcd72-3c08-427d-9d7b-d92c45b1679c" + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_project/test_by_name_project_not_found.json 
b/tests/tamr_client/fake_json/test_project/test_by_name_project_not_found.json new file mode 100644 index 00000000..d4dca382 --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_by_name_project_not_found.json @@ -0,0 +1,12 @@ +[ + { + "request": { + "method": "GET", + "path": "projects?filter=name==missing proj" + }, + "response": { + "status": 200, + "json": [] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_project/test_by_resource_id_categorization.json b/tests/tamr_client/fake_json/test_project/test_by_resource_id_categorization.json new file mode 100644 index 00000000..51c11afa --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_by_resource_id_categorization.json @@ -0,0 +1,30 @@ +[ + { + "request": { + "method": "GET", + "path": "projects/2" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/projects/2", + "name": "Party Categorization", + "description": "Categorizes organization at the Party/Domestic level", + "type": "CATEGORIZATION", + "unifiedDatasetName": "party_categorization_unified_dataset", + "created": { + "username": "admin", + "time": "2020-08-04T14:54:11.767Z", + "version": "20" + }, + "lastModified": { + "username": "admin", + "time": "2020-08-04T14:54:11.767Z", + "version": "21" + }, + "relativeId": "projects/2", + "externalId": "98f9e4ee-1a35-4242-917d-1163363d5411" + } + } + } +] diff --git a/tests/tamr_client/fake_json/test_project/test_by_resource_id_mastering.json b/tests/tamr_client/fake_json/test_project/test_by_resource_id_mastering.json new file mode 100644 index 00000000..21e6dbfc --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_by_resource_id_mastering.json @@ -0,0 +1,30 @@ +[ + { + "request": { + "method": "GET", + "path": "projects/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/projects/1", + "name": "proj", + "description": "Mastering Project", + "type": "DEDUP", + 
"unifiedDatasetName": "proj_unified_dataset", + "created": { + "username": "admin", + "time": "2020-04-03T14:14:18.752Z", + "version": "18" + }, + "lastModified": { + "username": "admin", + "time": "2020-04-03T14:14:20.115Z", + "version": "19" + }, + "relativeId": "projects/1", + "externalId": "58bdbe72-3c08-427d-97bd-45b16d92c79c" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_project/test_by_resource_id_not_found.json b/tests/tamr_client/fake_json/test_project/test_by_resource_id_not_found.json new file mode 100644 index 00000000..26ee176c --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_by_resource_id_not_found.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "GET", + "path": "projects/1" + }, + "response": { + "status": 404 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_project/test_create_project_already_exists.json b/tests/tamr_client/fake_json/test_project/test_create_project_already_exists.json new file mode 100644 index 00000000..149ebb6a --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_create_project_already_exists.json @@ -0,0 +1,21 @@ +[ + { + "request": { + "method": "POST", + "path": "projects", + "json": { + "name": "New Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "New Mastering Project_unified_dataset", + "description": "A Mastering Project", + "externalId": null + } + }, + "response": { + "status": 409, + "json": { + "message": "Can't create project with the requested name 'New Mastering Project' because a project with that name already exists" + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_project/test_get_all.json b/tests/tamr_client/fake_json/test_project/test_get_all.json new file mode 100644 index 00000000..fa088939 --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_get_all.json @@ -0,0 +1,51 @@ +[ + { + "request": { + "method": "GET", + "path": 
"projects" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/projects/1", + "name": "project 1", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "project_1_unified_dataset", + "created": { + "username": "admin", + "time": "2020-04-03T14:14:18.752Z", + "version": "18" + }, + "lastModified": { + "username": "admin", + "time": "2020-04-03T14:14:20.115Z", + "version": "19" + }, + "relativeId": "projects/1", + "externalId": "58bdbe72-3c08-427d-97bd-45b16d92c79c" + }, + { + "id": "unify://unified-data/v1/projects/2", + "name": "project 2", + "description": "Categorization Project", + "type": "CATEGORIZATION", + "unifiedDatasetName": "project_2_unified_dataset", + "created": { + "username": "admin", + "time": "2020-08-04T14:54:11.767Z", + "version": "20" + }, + "lastModified": { + "username": "admin", + "time": "2020-08-04T14:54:11.767Z", + "version": "21" + }, + "relativeId": "projects/2", + "externalId": "98f9e4ee-1a35-4242-917d-1163363d5411" + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_project/test_get_all_filter.json b/tests/tamr_client/fake_json/test_project/test_get_all_filter.json new file mode 100644 index 00000000..78fb7d2c --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_get_all_filter.json @@ -0,0 +1,32 @@ +[ + { + "request": { + "method": "GET", + "path": "projects?filter=description==Categorization%20Project" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/projects/2", + "name": "project 2", + "description": "Categorization Project", + "type": "CATEGORIZATION", + "unifiedDatasetName": "project_2_unified_dataset", + "created": { + "username": "admin", + "time": "2020-08-04T14:54:11.767Z", + "version": "20" + }, + "lastModified": { + "username": "admin", + "time": "2020-08-04T14:54:11.767Z", + "version": "21" + }, + "relativeId": "projects/2", + "externalId": 
"98f9e4ee-1a35-4242-917d-1163363d5411" + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_project/test_get_all_filter_list.json b/tests/tamr_client/fake_json/test_project/test_get_all_filter_list.json new file mode 100644 index 00000000..aa83afa8 --- /dev/null +++ b/tests/tamr_client/fake_json/test_project/test_get_all_filter_list.json @@ -0,0 +1,32 @@ +[ + { + "request": { + "method": "GET", + "path": "projects?filter=description==Categorization%20Project&?filter=name==project%202" + }, + "response": { + "status": 200, + "json": [ + { + "id": "unify://unified-data/v1/projects/2", + "name": "project 2", + "description": "Categorization Project", + "type": "CATEGORIZATION", + "unifiedDatasetName": "project_2_unified_dataset", + "created": { + "username": "admin", + "time": "2020-08-04T14:54:11.767Z", + "version": "20" + }, + "lastModified": { + "username": "admin", + "time": "2020-08-04T14:54:11.767Z", + "version": "21" + }, + "relativeId": "projects/2", + "externalId": "98f9e4ee-1a35-4242-917d-1163363d5411" + } + ] + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_restore/test_cancel.json b/tests/tamr_client/fake_json/test_restore/test_cancel.json new file mode 100644 index 00000000..1cfc37a2 --- /dev/null +++ b/tests/tamr_client/fake_json/test_restore/test_cancel.json @@ -0,0 +1,22 @@ +[ + { + "request": { + "method": "POST", + "path": "instance/restore:cancel" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/restore/restore-2020-08-19_20-01-20-233", + "relativeId": "restore-2020-08-19_20-01-20-233", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_22-07-11-100", + "state": "CANCELED", + "stage": "", + "errorMessage": "", + "created": "2020-08-19_20-01-20-233", + "lastModified": "2020-08-19_20-02-19-351" + } + } + } +] diff --git a/tests/tamr_client/fake_json/test_restore/test_get.json 
b/tests/tamr_client/fake_json/test_restore/test_get.json new file mode 100644 index 00000000..1207c83c --- /dev/null +++ b/tests/tamr_client/fake_json/test_restore/test_get.json @@ -0,0 +1,22 @@ +[ + { + "request": { + "method": "GET", + "path": "instance/restore" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/restore/restore-2020-08-19_19-57-57-366", + "relativeId": "restore-2020-08-19_19-57-57-366", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_22-07-11-100", + "state": "RUNNING", + "stage": "", + "errorMessage": "", + "created": "2020-08-19_19-57-57-366", + "lastModified": "2020-08-19_19-57-57-508" + } + } + } +] diff --git a/tests/tamr_client/fake_json/test_restore/test_initiate.json b/tests/tamr_client/fake_json/test_restore/test_initiate.json new file mode 100644 index 00000000..6699a3ac --- /dev/null +++ b/tests/tamr_client/fake_json/test_restore/test_initiate.json @@ -0,0 +1,22 @@ +[ + { + "request": { + "method": "POST", + "path": "instance/restore" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/restore/restore-2020-08-19_20-01-20-233", + "relativeId": "restore-2020-08-19_20-01-20-233", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_22-07-11-100", + "state": "PENDING", + "stage": "", + "errorMessage": "", + "created": "2020-08-19_20-01-20-233", + "lastModified": "2020-08-19_20-01-20-233" + } + } + } +] diff --git a/tests/tamr_client/fake_json/test_transformations/test_get_all.json b/tests/tamr_client/fake_json/test_transformations/test_get_all.json new file mode 100644 index 00000000..3b5f6a0a --- /dev/null +++ b/tests/tamr_client/fake_json/test_transformations/test_get_all.json @@ -0,0 +1,64 @@ +[ + { + "request": { + "method": "GET", + "path": "projects/1/transformations" + }, + "response": { + "status": 200, + "json": { + "parameterized": [ + { + "datasets": [], + "transformation": "SELECT *, 1 as one;" + }, + { + "datasets": [ + 
{ + "name": "dataset 1 name", + "datasetId": "unify://unified-data/v1/datasets/1", + "relativeDatasetId": "datasets/1" + } + ], + "transformation": "SELECT *, 2 as two;" + } + ], + "unified": [ + "//Comment\nSELECT *;" + ] + } + } + }, + { + "request": { + "method": "GET", + "path": "datasets/1" + }, + "response": { + "status": 200, + "json": { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "number 1", + "name": "dataset 1 name", + "description": "dataset 1 description", + "version": "dataset 1 version", + "keyAttributeNames": [ + "tamr_id" + ], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version" + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version" + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_transformations/test_replace_all.json b/tests/tamr_client/fake_json/test_transformations/test_replace_all.json new file mode 100644 index 00000000..2d2dc3d5 --- /dev/null +++ b/tests/tamr_client/fake_json/test_transformations/test_replace_all.json @@ -0,0 +1,23 @@ +[ + { + "request": { + "method": "PUT", + "path": "projects/1/transformations" + }, + "response": { + "status": 200, + "json": { + "parameterized": [ + { + "datasets": [], + "transformation": "SELECT *, 1 as one;" + } + ], + "unified": [ + "//Comment\nSELECT *;", + "//extra TX" + ] + } + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/fake_json/test_transformations/test_replace_all_errors.json b/tests/tamr_client/fake_json/test_transformations/test_replace_all_errors.json new file mode 100644 index 00000000..f692d9fd --- /dev/null +++ b/tests/tamr_client/fake_json/test_transformations/test_replace_all_errors.json @@ -0,0 +1,11 @@ +[ + { + "request": { + "method": "PUT", + "path": "projects/1/transformations" + }, + "response": 
{ + "status": 400 + } + } +] \ No newline at end of file diff --git a/tests/tamr_client/golden_records/test_golden_records.py b/tests/tamr_client/golden_records/test_golden_records.py new file mode 100644 index 00000000..c28e6c8e --- /dev/null +++ b/tests/tamr_client/golden_records/test_golden_records.py @@ -0,0 +1,34 @@ +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_update_async(): + s = fake.session() + project = fake.golden_records_project() + + op = tc.golden_records._update_async(s, project) + assert op.type == "SPARK" + assert op.description == "Updating Golden Records" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } + + +@fake.json +def test_publish_async(): + s = fake.session() + project = fake.golden_records_project() + + op = tc.golden_records._publish_async(s, project) + assert op.type == "SPARK" + assert op.description == "Updating published datasets for GoldenRecords module" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } diff --git a/tests/tamr_client/mastering/test_mastering.py b/tests/tamr_client/mastering/test_mastering.py new file mode 100644 index 00000000..e887f99b --- /dev/null +++ b/tests/tamr_client/mastering/test_mastering.py @@ -0,0 +1,114 @@ +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_estimate_pairs_async(): + s = fake.session() + project = fake.mastering_project() + + op = tc.mastering._estimate_pairs_async(s, project) + assert op.type == "SPARK" + assert op.description == "operation 1 description" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } + + +@fake.json +def test_generate_pairs_async(): + s = fake.session() + project = fake.mastering_project() + + op = 
tc.mastering._generate_pairs_async(s, project) + assert op.type == "SPARK" + assert op.description == "Materialize views to Elastic" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } + + +@fake.json +def test_apply_feedback_async(): + s = fake.session() + project = fake.mastering_project() + + op = tc.mastering._apply_feedback_async(s, project) + assert op.type == "SPARK" + assert op.description == "Materialize views to Elastic" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } + + +@fake.json +def test_update_pair_results_async(): + s = fake.session() + project = fake.mastering_project() + + op = tc.mastering._update_pair_results_async(s, project) + assert op.type == "SPARK" + assert op.description == "Materialize views to Elastic" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } + + +@fake.json +def test_update_high_impact_pairs_async(): + s = fake.session() + project = fake.mastering_project() + + op = tc.mastering._update_high_impact_pairs_async(s, project) + assert op.type == "SPARK" + assert op.description == "Materialize views to Elastic" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } + + +@fake.json +def test_update_cluster_results_async(): + s = fake.session() + project = fake.mastering_project() + + op = tc.mastering._update_cluster_results_async(s, project) + assert op.type == "SPARK" + assert op.description == "Materialize views to Elastic" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } + + +@fake.json +def test_publish_clusters_async(): + s = fake.session() + project = fake.mastering_project() 
+ + op = tc.mastering._publish_clusters_async(s, project) + assert op.type == "SPARK" + assert op.description == "operation 1 description" + assert op.status == { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + } diff --git a/tests/tamr_client/mastering/test_mastering_project.py b/tests/tamr_client/mastering/test_mastering_project.py new file mode 100644 index 00000000..d798996b --- /dev/null +++ b/tests/tamr_client/mastering/test_mastering_project.py @@ -0,0 +1,15 @@ +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_create(): + s = fake.session() + instance = fake.instance() + + project = tc.mastering.project.create( + s, instance, name="New Mastering Project", description="A Mastering Project", + ) + assert isinstance(project, tc.MasteringProject) + assert project.name == "New Mastering Project" + assert project.description == "A Mastering Project" diff --git a/tamr_unify_client/models/dataset/__init__.py b/tests/tamr_client/schema_mapping/__init__.py similarity index 100% rename from tamr_unify_client/models/dataset/__init__.py rename to tests/tamr_client/schema_mapping/__init__.py diff --git a/tests/tamr_client/schema_mapping/test_schema_mapping_project.py b/tests/tamr_client/schema_mapping/test_schema_mapping_project.py new file mode 100644 index 00000000..cf139a00 --- /dev/null +++ b/tests/tamr_client/schema_mapping/test_schema_mapping_project.py @@ -0,0 +1,18 @@ +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_create(): + s = fake.session() + instance = fake.instance() + + project = tc.schema_mapping.project.create( + s, + instance, + name="New Schema Mapping Project", + description="A Schema Mapping Project", + ) + assert isinstance(project, tc.SchemaMappingProject) + assert project.name == "New Schema Mapping Project" + assert project.description == "A Schema Mapping Project" diff --git 
a/tests/tamr_client/test_auth.py b/tests/tamr_client/test_auth.py new file mode 100644 index 00000000..b9dbdb1a --- /dev/null +++ b/tests/tamr_client/test_auth.py @@ -0,0 +1,10 @@ +import tamr_client as tc + + +def test_auth_hidden_password(): + username = "username" + password = "secure_password" + auth = tc.UsernamePasswordAuth(username, password) + + assert password not in repr(auth) + assert password not in str(auth) diff --git a/tests/tamr_client/test_backup.py b/tests/tamr_client/test_backup.py new file mode 100644 index 00000000..20528950 --- /dev/null +++ b/tests/tamr_client/test_backup.py @@ -0,0 +1,69 @@ +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_get_all(): + s = fake.session() + instance = fake.instance() + + tc.backup.get_all(session=s, instance=instance) + + +@fake.json +def test_by_resource_id(): + s = fake.session() + instance = fake.instance() + resource_id = "2020-08-17_21-32-10-961" + + tc.backup.by_resource_id(session=s, instance=instance, resource_id=resource_id) + + +@fake.json +def test_initiate(): + s = fake.session() + instance = fake.instance() + + tc.backup.initiate(session=s, instance=instance) + + +@fake.json +def test_cancel(): + s = fake.session() + data = { + "id": "unify://unified-data/v1/backups/2020-08-17_21-32-10-961", + "relativeId": "2020-08-17_21-32-10-961", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_21-32-10-961", + "state": "RUNNING", + "stage": "", + "errorMessage": "", + "created": "2020-08-17_21-32-10-961", + "lastModified": "2020-08-17_21-51-57-600", + } + backup = tc.backup._from_json( + url=tc.URL(path="backups/2020-08-17_21-32-10-961"), data=data + ) + + tc.backup.cancel(session=s, backup=backup) + + +@fake.json +def test_poll(): + s = fake.session() + data = { + "id": "unify://unified-data/v1/backups/2020-08-17_21-32-10-961", + "relativeId": "2020-08-17_21-32-10-961", + "user": "admin", + "backupPath": 
"/home/ubuntu/tamr/backups/2020-08-17_21-32-10-961", + "state": "RUNNING", + "stage": "", + "errorMessage": "", + "created": "2020-08-17_21-32-10-961", + "lastModified": "2020-08-17_21-51-57-600", + } + backup = tc.backup._from_json( + url=tc.URL(path="backups/2020-08-17_21-32-10-961"), data=data + ) + + tc.backup.poll(session=s, backup=backup) diff --git a/tests/tamr_client/test_instance.py b/tests/tamr_client/test_instance.py new file mode 100644 index 00000000..3547f301 --- /dev/null +++ b/tests/tamr_client/test_instance.py @@ -0,0 +1,30 @@ +import tamr_client as tc +from tests.tamr_client import fake + + +def test_instance_default(): + instance = tc.Instance() + assert tc.instance.origin(instance) == "http://localhost" + + +def test_client_set_protocol(): + instance = tc.Instance(protocol="https") + assert tc.instance.origin(instance) == "https://localhost" + + +def test_client_set_host(): + instance = tc.Instance(host="123.123.123.123") + assert tc.instance.origin(instance) == "http://123.123.123.123" + + +def test_client_set_port(): + instance = tc.Instance(port=9100) + assert tc.instance.origin(instance) == "http://localhost:9100" + + +@fake.json +def test_version(): + s = fake.session() + instance = fake.instance() + version = tc.instance.version(s, instance) + assert version == "2020.012.0" diff --git a/tests/tamr_client/test_operation.py b/tests/tamr_client/test_operation.py new file mode 100644 index 00000000..8ea7b271 --- /dev/null +++ b/tests/tamr_client/test_operation.py @@ -0,0 +1,134 @@ +import pytest +import responses + +import tamr_client as tc +from tests.tamr_client import fake, utils + + +def test_operation_from_json(): + url = tc.URL(path="operations/1") + operation_json = utils.load_json("operation_succeeded.json") + op = tc.operation._from_json(url, operation_json) + assert op.url == url + assert op.type == operation_json["type"] + assert op.description == operation_json["description"] + assert op.status == operation_json["status"] + assert 
tc.operation.succeeded(op) + + +@responses.activate +def test_operation_by_url(): + s = fake.session() + url = tc.URL(path="operations/1") + + operation_json = utils.load_json("operation_succeeded.json") + responses.add(responses.GET, str(url), json=operation_json) + + op = tc.operation._by_url(s, url) + assert op.url == url + assert op.type == operation_json["type"] + assert op.description == operation_json["description"] + assert op.status == operation_json["status"] + assert tc.operation.succeeded(op) + + +@responses.activate +def test_operation_from_response(): + s = fake.session() + instance = fake.instance() + url = tc.URL(path="operations/1") + + operation_json = utils.load_json("operation_succeeded.json") + responses.add(responses.GET, str(url), json=operation_json) + + r = s.get(str(url)) + op = tc.operation._from_response(instance, r) + assert op.url == url + assert op.type == operation_json["type"] + assert op.description == operation_json["description"] + assert op.status == operation_json["status"] + assert tc.operation.succeeded(op) + + +@responses.activate +def test_operation_from_response_noop(): + s = fake.session() + instance = fake.instance() + url = tc.URL(path="operations/2") + responses.add(responses.GET, str(url), status=204) + + url_dummy = tc.URL(path="operations/-1") + responses.add(responses.GET, str(url_dummy), status=404) + + r = s.get(str(url)) + op2 = tc.operation._from_response(instance, r) + + assert op2.url is not None + assert op2.type == "NOOP" + assert op2.description is not None + assert op2.status is not None + assert op2.status["state"] == "SUCCEEDED" + assert tc.operation.succeeded(op2) + + op2w = tc.operation.wait(s, op2) + assert tc.operation.succeeded(op2w) + + with pytest.raises(tc.operation.NotFound): + tc.operation.poll(s, op2w) + + +@responses.activate +def test_by_resource_id(): + s = fake.session() + instance = fake.instance() + url = tc.URL(path="operations/1") + + operation_json = 
utils.load_json("operation_succeeded.json") + responses.add(responses.GET, str(url), json=operation_json) + + resource_id = "1" + op = tc.operation.by_resource_id(s, instance, resource_id) + assert op.url == url + assert op.type == operation_json["type"] + assert op.description == operation_json["description"] + assert op.status == operation_json["status"] + assert tc.operation.succeeded(op) + + +@responses.activate +def test_operation_poll(): + s = fake.session() + url = tc.URL(path="operations/1") + + pending_operation_json = utils.load_json("operation_pending.json") + op1 = tc.operation._from_json(url, pending_operation_json) + + succeeded_operation_json = utils.load_json("operation_succeeded.json") + responses.add(responses.GET, str(url), json=succeeded_operation_json) + op2 = tc.operation.poll(s, op1) + + assert op2.url == op1.url + assert not tc.operation.succeeded(op1) + assert tc.operation.succeeded(op2) + + +def test_operation_check_success(): + s = fake.session() + url = tc.URL(path="operations/1") + op_json = utils.load_json("operation_succeeded.json") + op = tc.operation._from_json(url, op_json) + + tc.operation.check(s, op) + + +def test_operation_failed_success(): + s = fake.session() + url = tc.URL(path="operations/1") + op_json = utils.load_json("operation_failed.json") + op = tc.operation._from_json(url, op_json) + + with pytest.raises(tc.operation.Failed) as exc_info: + tc.operation.check(s, op) + err_msg = str(exc_info.value) + assert str(url) in err_msg + assert op.status is not None and str(op.status["state"]) in err_msg diff --git a/tests/tamr_client/test_project.py b/tests/tamr_client/test_project.py new file mode 100644 index 00000000..444bd6e5 --- /dev/null +++ b/tests/tamr_client/test_project.py @@ -0,0 +1,147 @@ +import pytest + +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_by_resource_id_mastering(): + s = fake.session() + instance = fake.instance() + + project = tc.project.by_resource_id(s, 
instance, "1") + assert isinstance(project, tc.MasteringProject) + assert project.name == "proj" + assert project.description == "Mastering Project" + + +@fake.json +def test_by_resource_id_categorization(): + s = fake.session() + instance = fake.instance() + + project = tc.project.by_resource_id(s, instance, "2") + assert isinstance(project, tc.CategorizationProject) + assert project.name == "Party Categorization" + assert project.description == "Categorizes organization at the Party/Domestic level" + + +@fake.json +def test_by_resource_id_not_found(): + s = fake.session() + instance = fake.instance() + + with pytest.raises(tc.project.NotFound): + tc.project.by_resource_id(s, instance, "1") + + +@fake.json +def test_by_name(): + s = fake.session() + instance = fake.instance() + + project = tc.project.by_name(s, instance, "proj") + assert project.name == "proj" + assert project.description == "Mastering Project" + + +@fake.json +def test_by_name_project_not_found(): + s = fake.session() + instance = fake.instance() + + with pytest.raises(tc.project.NotFound): + tc.project.by_name(s, instance, "missing project") + + +@fake.json +def test_by_name_project_ambiguous(): + s = fake.session() + instance = fake.instance() + + with pytest.raises(tc.project.Ambiguous): + tc.project.by_name(s, instance, "ambiguous project") + + +@fake.json +def test_get_all(): + s = fake.session() + instance = fake.instance() + + all_projects = tc.project.get_all(s, instance) + assert len(all_projects) == 2 + + project_1 = all_projects[0] + assert isinstance(project_1, tc.MasteringProject) + assert project_1.name == "project 1" + assert project_1.description == "Mastering Project" + + project_2 = all_projects[1] + assert isinstance(project_2, tc.CategorizationProject) + assert project_2.name == "project 2" + assert project_2.description == "Categorization Project" + + +@fake.json +def test_get_all_filter(): + s = fake.session() + instance = fake.instance() + + all_projects = 
tc.project.get_all( + s, instance, filter="description==Categorization Project" + ) + assert len(all_projects) == 1 + + project = all_projects[0] + assert isinstance(project, tc.CategorizationProject) + assert project.name == "project 2" + assert project.description == "Categorization Project" + + +@fake.json +def test_get_all_filter_list(): + s = fake.session() + instance = fake.instance() + + all_projects = tc.project.get_all( + s, instance, filter=["description==Categorization Project", "name==project 2"] + ) + assert len(all_projects) == 1 + + project = all_projects[0] + assert isinstance(project, tc.CategorizationProject) + assert project.name == "project 2" + assert project.description == "Categorization Project" + + +@fake.json +def test_create_project_already_exists(): + s = fake.session() + instance = fake.instance() + + with pytest.raises(tc.project.AlreadyExists): + tc.project._create( + s, + instance, + name="New Mastering Project", + project_type="DEDUP", + description="A Mastering Project", + ) + + +def test_from_json_unrecognized_project_type(): + instance = fake.instance() + url = tc.URL("project/1", instance) + data: tc._types.JsonDict = { + "id": "unify://unified-data/v1/projects/1", + "name": "project 1", + "description": "A project of unknown type", + "type": "UNKNOWN", + "unifiedDatasetName": "", + "relativeId": "projects/1", + "externalId": "58bdbe72-3c08-427d-97bd-45b16d92c79c", + } + project = tc.project._from_json(url, data) + assert isinstance(project, tc.UnknownProject) + assert project.name == "project 1" + assert project.description == "A project of unknown type" diff --git a/tests/tamr_client/test_response.py b/tests/tamr_client/test_response.py new file mode 100644 index 00000000..e9d9d6bd --- /dev/null +++ b/tests/tamr_client/test_response.py @@ -0,0 +1,24 @@ +import json + +import responses + +import tamr_client as tc +from tests.tamr_client import fake + + +@responses.activate +def test_ndjson(): + s = fake.session() + + records = 
[{"a": 1}, {"b": 2}, {"c": 3}] + url = tc.URL(path="datasets/1/records") + responses.add( + responses.GET, str(url), body="\n".join(json.dumps(x) for x in records) + ) + + r = s.get(str(url)) + + ndjson = list(tc.response.ndjson(r)) + assert len(ndjson) == 3 + for record in ndjson: + assert record in records diff --git a/tests/tamr_client/test_restore.py b/tests/tamr_client/test_restore.py new file mode 100644 index 00000000..24564b46 --- /dev/null +++ b/tests/tamr_client/test_restore.py @@ -0,0 +1,38 @@ +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_get(): + s = fake.session() + instance = fake.instance() + + tc.restore.get(session=s, instance=instance) + + +@fake.json +def test_initiate(): + s = fake.session() + instance = fake.instance() + backup_path = "2020-08-19_20-01-20-233" + + tc.restore.initiate(session=s, instance=instance, backup_path=backup_path) + + +@fake.json +def test_cancel(): + s = fake.session() + data = { + "id": "unify://unified-data/v1/restore/restore-2020-08-19_20-01-20-233", + "relativeId": "restore-2020-08-19_20-01-20-233", + "user": "admin", + "backupPath": "/home/ubuntu/tamr/backups/2020-08-17_22-07-11-100", + "state": "RUNNING", + "stage": "", + "errorMessage": "", + "created": "2020-08-19_20-01-20-233", + "lastModified": "2020-08-19_20-02-19-351", + } + restore = tc.restore._from_json(url=tc.URL(path="instance/restore"), data=data) + + tc.restore.cancel(session=s, restore=restore) diff --git a/tests/tamr_client/test_transformations.py b/tests/tamr_client/test_transformations.py new file mode 100644 index 00000000..3757d8c8 --- /dev/null +++ b/tests/tamr_client/test_transformations.py @@ -0,0 +1,45 @@ +import pytest +from requests import HTTPError + +import tamr_client as tc +from tests.tamr_client import fake + + +@fake.json +def test_get_all(): + s = fake.session() + project = fake.mastering_project() + + transforms = tc.transformations.get_all(s, project) + + assert len(transforms.input_scope) 
== 2 + assert len(transforms.unified_scope) == 1 + + assert len(transforms.input_scope[0].datasets) == 0 + assert transforms.input_scope[0].transformation == "SELECT *, 1 as one;" + assert len(transforms.input_scope[1].datasets) == 1 + assert transforms.input_scope[1].datasets[0].name == "dataset 1 name" + assert transforms.input_scope[1].transformation == "SELECT *, 2 as two;" + + assert transforms.unified_scope[0] == "//Comment\nSELECT *;" + + +@fake.json +def test_replace_all(): + s = fake.session() + project = fake.mastering_project() + transforms = fake.transforms() + + transforms.unified_scope.append("//extra TX") + transforms.input_scope.pop(1) + tc.transformations.replace_all(s, project, transforms) + + +@fake.json +def test_replace_all_errors(): + s = fake.session() + project = fake.mastering_project() + transforms = fake.transforms() + + with pytest.raises(HTTPError): + tc.transformations.replace_all(s, project, transforms) diff --git a/tests/tamr_client/utils.py b/tests/tamr_client/utils.py new file mode 100644 index 00000000..47578b3e --- /dev/null +++ b/tests/tamr_client/utils.py @@ -0,0 +1,24 @@ +import json +from pathlib import Path +from typing import Union + + +data_dir = Path(__file__).parent / "data" + + +def load_json(path: Union[str, Path]): + with open(data_dir / path) as f: + return json.load(f) + + +def capture_payload(request, snoop, status, response_json): + """Capture request body within `snoop` so we can inspect that the request body is constructed correctly (e.g. for streaming requests). 
+ + See https://github.com/getsentry/responses#dynamic-responses + """ + snoop["payload"] = [x.decode("utf-8") for x in request.body] + return status, {}, json.dumps(response_json) + + +def stringify(updates): + return [json.dumps(u) for u in updates] diff --git a/tests/unit/test_attribute.py b/tests/unit/test_attribute.py index ad88b44a..2a297cec 100644 --- a/tests/unit/test_attribute.py +++ b/tests/unit/test_attribute.py @@ -1,20 +1,26 @@ +from functools import partial +import json from unittest import TestCase +from requests import HTTPError import responses from tamr_unify_client import Client +from tamr_unify_client.attribute.collection import AttributeCollection +from tamr_unify_client.attribute.resource import Attribute, AttributeSpec +from tamr_unify_client.attribute.type import AttributeTypeSpec from tamr_unify_client.auth import UsernamePasswordAuth -from tamr_unify_client.models.attribute.resource import Attribute +from tamr_unify_client.dataset.resource import Dataset class TestAttribute(TestCase): def setUp(self): auth = UsernamePasswordAuth("username", "password") - self.unify = Client(auth) + self.tamr = Client(auth) def test_resource(self): alias = "datasets/1/attributes/RowNum" - row_num = Attribute(self.unify, self._attributes_json[0], alias) + row_num = Attribute(self.tamr, self._attributes_json[0], alias) expected = alias self.assertEqual(expected, row_num.relative_id) @@ -30,13 +36,13 @@ def test_resource(self): def test_resource_from_json(self): alias = "datasets/1/attributes/RowNum" - expected = Attribute(self.unify, self._attributes_json[0], alias) - actual = Attribute.from_json(self.unify, self._attributes_json[0], alias) + expected = Attribute(self.tamr, self._attributes_json[0], alias) + actual = Attribute.from_json(self.tamr, self._attributes_json[0], alias) self.assertEqual(repr(expected), repr(actual)) def test_simple_type(self): alias = "datasets/1/attributes/RowNum" - row_num = Attribute(self.unify, self._attributes_json[0], alias) + 
row_num = Attribute(self.tamr, self._attributes_json[0], alias) row_num_type = row_num.type expected = self._attributes_json[0]["type"]["baseType"] self.assertEqual(expected, row_num_type.base_type) @@ -45,14 +51,13 @@ def test_simple_type(self): def test_complex_type(self): alias = "datasets/1/attributes/geom" - geom = Attribute(self.unify, self._attributes_json[1], alias) + geom = Attribute(self.tamr, self._attributes_json[1], alias) self.assertEqual("RECORD", geom.type.base_type) self.assertIsNone(geom.type.inner_type) self.assertEqual(3, len(list(geom.type.attributes))) - point = list(geom.type.attributes)[0] + point = geom.type.attributes[0] self.assertEqual("point", point.name) - self.assertEqual(alias + "/type/attributes/point", point.relative_id) self.assertTrue(point.is_nullable) self.assertEqual("ARRAY", point.type.base_type) self.assertEqual("DOUBLE", point.type.inner_type.base_type) @@ -60,11 +65,11 @@ def test_complex_type(self): @responses.activate def test_dataset_attributes(self): - dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/1" - attributes_url = f"http://localhost:9100/api/versioned/v1/datasets/1/attributes" + dataset_url = "http://localhost:9100/api/versioned/v1/datasets/1" + attributes_url = "http://localhost:9100/api/versioned/v1/datasets/1/attributes" responses.add(responses.GET, dataset_url, json=self._dataset_json) responses.add(responses.GET, attributes_url, json=self._attributes_json) - dataset = self.unify.datasets.by_resource_id("1") + dataset = self.tamr.datasets.by_resource_id("1") self.assertSequenceEqual( self._dataset_json["keyAttributeNames"], dataset.key_attribute_names ) @@ -73,6 +78,117 @@ def test_dataset_attributes(self): alias = "datasets/1/attributes/RowNum" self.assertEqual(alias, attributes[0].relative_id) + @responses.activate + def test_delete_attribute(self): + url = "http://localhost:9100/api/versioned/v1/datasets/1/attributes/RowNum" + responses.add(responses.GET, url, 
@responses.activate
def test_update_attribute(self):
    """Updating an attribute through its spec PUTs the new JSON.

    The spec derived from the attribute is given a new description and
    ``put()``; the test then checks the returned attribute, the body that
    was sent, and that neither the original attribute nor the
    intermediate spec was modified.
    """

    def update_callback(request, snoop):
        # Record the raw PUT body so it can be inspected after the call.
        snoop["payload"] = request.body
        return 200, {}, json.dumps(self._updated_attribute_json)

    # Use the same "datasets/..." path as the other tests in this class
    # (the original used the singular "dataset/...").
    relative_id = "datasets/1/attributes/RowNum"
    attribute_url = f"http://localhost:9100/api/versioned/v1/{relative_id}"
    snoop_dict = {}
    responses.add_callback(
        responses.PUT, attribute_url, partial(update_callback, snoop=snoop_dict)
    )
    attribute = Attribute(self.tamr, self._attributes_json[0], relative_id)

    temp_spec = attribute.spec()
    new_attribute = temp_spec.with_description(
        self._updated_attribute_json["description"]
    ).put()

    # The returned attribute reflects the mocked server response.
    self.assertEqual(new_attribute.name, self._updated_attribute_json["name"])
    self.assertEqual(
        new_attribute.description, self._updated_attribute_json["description"]
    )

    # The request body carried the full updated attribute JSON.
    self.assertEqual(
        json.loads(snoop_dict["payload"]), self._updated_attribute_json
    )

    # The original attribute is left untouched.
    self.assertEqual(attribute.name, self._attributes_json[0]["name"])
    self.assertEqual(attribute.description, self._attributes_json[0]["description"])

    # checking that intermediate didn't change
    self.assertEqual(
        temp_spec.to_dict()["description"], self._attributes_json[0]["description"]
    )
@responses.activate
def test_delete_by_resource_id(client):
    """Deleting an attribute via its dataset's collection yields a 204."""
    dataset_endpoint = f"{url_prefix}datasets/7"
    attribute_endpoint = f"{dataset_endpoint}/attributes/family_role"

    # Mock the dataset lookup and the attribute deletion.
    responses.add(responses.GET, dataset_endpoint, json=datasets_collection[0])
    responses.add(responses.DELETE, attribute_endpoint, status=204)

    dataset = client.datasets.by_resource_id("7")
    deletion = dataset.attributes.delete_by_resource_id("family_role")
    assert deletion.status_code == 204
@responses.activate
def test_delete(self):
    """A fetched attribute configuration can be deleted; refetching then fails."""
    config_url = f"{self._base}/{self._alias}/{self._attribute_id}"

    # Registration order matters: first GET succeeds, the GET after the
    # DELETE answers 404.
    responses.add(responses.GET, config_url, json=self._ac_json)
    responses.add(responses.DELETE, config_url, status=204)
    responses.add(responses.GET, config_url, status=404)

    configs = AttributeConfigurationCollection(self.tamr, self._alias)
    fetched = configs.by_resource_id(self._attribute_id)
    self.assertEqual(fetched._data, self._ac_json)

    deletion = fetched.delete()
    self.assertEqual(deletion.status_code, 204)

    with self.assertRaises(HTTPError):
        configs.by_resource_id(self._attribute_id)
config_url, partial(create_callback, snoop=snoop_dict) + ) + configs = AttributeConfigurationCollection(self.tamr, self._alias) + config = configs.by_resource_id(self._attribute_id) + + temp_spec = config.spec().with_attribute_role("SUM_ATTRIBUTE") + new_config = ( + temp_spec.with_enabled_for_ml(False) + .with_similarity_function("ABSOLUTE_DIFF") + .with_tokenizer("BIGRAM") + .put() + ) + + self.assertEqual(new_config._data, self._updated_ac_json) + self.assertEqual(json.loads(snoop_dict["payload"]), self._updated_ac_json) + self.assertEqual(config._data, self._ac_json) + + # checking that intermediate didn't change + self.assertTrue(temp_spec.to_dict()["enabledForMl"]) + + _base = "http://localhost:9100/api/versioned/v1" + _alias = "projects/1/attributeConfigurations" + _attribute_id = "26" + + _ac_json = { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/26", + "relativeId": "projects/1/attributeConfigurations/26", + "relativeAttributeId": "datasets/8/attributes/surname", + "attributeRole": "CLUSTER_NAME_ATTRIBUTE", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "surname", + } + + _updated_ac_json = { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/26", + "relativeId": "projects/1/attributeConfigurations/26", + "relativeAttributeId": "datasets/8/attributes/surname", + "attributeRole": "SUM_ATTRIBUTE", + "similarityFunction": "ABSOLUTE_DIFF", + "enabledForMl": False, + "tokenizer": "BIGRAM", + "numericFieldResolution": [], + "attributeName": "surname", + } diff --git a/tests/unit/test_attribute_configuration_collection.py b/tests/unit/test_attribute_configuration_collection.py new file mode 100644 index 00000000..1e81c0d9 --- /dev/null +++ b/tests/unit/test_attribute_configuration_collection.py @@ -0,0 +1,393 @@ +from functools import partial +import json +from unittest import TestCase + +import responses + +from tamr_unify_client import 
@responses.activate
def test_by_relative_id(self):
    """Looking up a configuration by relative id returns that resource."""
    relative_id = "projects/1/attributeConfigurations/1"
    ac_url = "http://localhost:9100/api/versioned/v1/projects/1/attributeConfigurations/1"
    first_config = self.acc_json[0]

    collection = AttributeConfigurationCollection(
        self.tamr, "projects/1/attributeConfigurations/"
    )
    responses.add(responses.GET, ac_url, json=first_config)

    found = collection.by_relative_id(relative_id)
    self.assertEqual(first_config["relativeId"], found.relative_id)
@responses.activate
def test_stream(self):
    """Streaming the collection yields one resource per mocked JSON entry."""
    collection_url = (
        "http://localhost:9100/api/versioned/v1/projects/1/attributeConfigurations/"
    )
    collection = AttributeConfigurationCollection(
        self.tamr, "projects/1/attributeConfigurations/"
    )
    responses.add(responses.GET, collection_url, json=self.acc_json)

    # Gather the raw data behind every streamed resource.
    streamed = [config._data for config in collection.stream()]
    self.assertEqual(self.acc_json, streamed)
).attribute_configurations() + response = attr_config_collection.delete_by_resource_id("20") + self.assertEqual(response.status_code, 204) + + _base = "http://localhost:9100/api/versioned/v1/" + mastering_project = _base + "projects/1" + + create_json = { + "similarityFunction": "ABSOLUTE_DIFF", + "enabledForMl": False, + "attributeName": "Tester", + } + + created_json = { + **create_json, + "attributeRole": "", + "tokenizer": "", + "numericFieldResolution": [], + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/35", + "relativeId": "projects/1/attributeConfigurations/35", + "relativeAttributeId": "datasets/79/attributes/Tester", + } + + project_json = { + "id": "unify://unified-data/v1/projects/1", + "externalId": "project 1 external ID", + "name": "project 1 name", + "description": "project 1 description", + "type": "DEDUP", + "unifiedDatasetName": "project 1 unified dataset", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "project 1 created version", + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "project 1 modified version", + }, + "relativeId": "projects/1", + } + + acc_json = [ + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/1", + "relativeId": "projects/1/attributeConfigurations/1", + "relativeAttributeId": "datasets/8/attributes/suburb", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "suburb", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/2", + "relativeId": "projects/1/attributeConfigurations/2", + "relativeAttributeId": "datasets/8/attributes/sex", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "sex", + }, + { + "id": 
"unify://unified-data/v1/projects/1/attributeConfigurations/3", + "relativeId": "projects/1/attributeConfigurations/3", + "relativeAttributeId": "datasets/8/attributes/address_2", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "address_2", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/4", + "relativeId": "projects/1/attributeConfigurations/4", + "relativeAttributeId": "datasets/8/attributes/age", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "age", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/5", + "relativeId": "projects/1/attributeConfigurations/5", + "relativeAttributeId": "datasets/8/attributes/culture", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "culture", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/6", + "relativeId": "projects/1/attributeConfigurations/6", + "relativeAttributeId": "datasets/8/attributes/street_number", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "street_number", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/7", + "relativeId": "projects/1/attributeConfigurations/7", + "relativeAttributeId": "datasets/8/attributes/postcode", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "postcode", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/8", + "relativeId": "projects/1/attributeConfigurations/8", + "relativeAttributeId": 
"datasets/8/attributes/phone_number", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "phone_number", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/9", + "relativeId": "projects/1/attributeConfigurations/9", + "relativeAttributeId": "datasets/8/attributes/soc_sec_id", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "soc_sec_id", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/10", + "relativeId": "projects/1/attributeConfigurations/10", + "relativeAttributeId": "datasets/8/attributes/rec2_id", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "rec2_id", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/11", + "relativeId": "projects/1/attributeConfigurations/11", + "relativeAttributeId": "datasets/8/attributes/date_of_birth", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "date_of_birth", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/12", + "relativeId": "projects/1/attributeConfigurations/12", + "relativeAttributeId": "datasets/8/attributes/title", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "title", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/13", + "relativeId": "projects/1/attributeConfigurations/13", + "relativeAttributeId": "datasets/8/attributes/address_1", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + 
"numericFieldResolution": [], + "attributeName": "address_1", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/14", + "relativeId": "projects/1/attributeConfigurations/14", + "relativeAttributeId": "datasets/8/attributes/rec_id", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "rec_id", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/15", + "relativeId": "projects/1/attributeConfigurations/15", + "relativeAttributeId": "datasets/8/attributes/state", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "state", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/16", + "relativeId": "projects/1/attributeConfigurations/16", + "relativeAttributeId": "datasets/8/attributes/family_role", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "family_role", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/17", + "relativeId": "projects/1/attributeConfigurations/17", + "relativeAttributeId": "datasets/8/attributes/blocking_number", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "blocking_number", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/18", + "relativeId": "projects/1/attributeConfigurations/18", + "relativeAttributeId": "datasets/8/attributes/surname", + "attributeRole": "CLUSTER_NAME_ATTRIBUTE", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "surname", + }, + { + "id": 
"unify://unified-data/v1/projects/1/attributeConfigurations/19", + "relativeId": "projects/1/attributeConfigurations/19", + "relativeAttributeId": "datasets/8/attributes/given_name", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": True, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "given_name", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/20", + "relativeId": "projects/1/attributeConfigurations/20", + "relativeAttributeId": "datasets/8/attributes/Address1", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": False, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "Address1", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeConfigurations/21", + "relativeId": "projects/1/attributeConfigurations/21", + "relativeAttributeId": "datasets/8/attributes/Address2", + "attributeRole": "", + "similarityFunction": "COSINE", + "enabledForMl": False, + "tokenizer": "DEFAULT", + "numericFieldResolution": [], + "attributeName": "Address2", + }, + ] diff --git a/tests/unit/test_attribute_mapping.py b/tests/unit/test_attribute_mapping.py new file mode 100644 index 00000000..85a264ed --- /dev/null +++ b/tests/unit/test_attribute_mapping.py @@ -0,0 +1,71 @@ +import pytest +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.project.attribute_mapping.resource import AttributeMapping + + +@pytest.fixture +def client(): + + return Client(UsernamePasswordAuth("username", "password")) + + +def test_resource(client): + test = AttributeMapping(client, mappings_json) + + expected = mappings_json["relativeId"] + assert expected == test.relative_id + + expected = mappings_json["id"] + assert expected == test.id + + expected = mappings_json["inputAttributeId"] + assert expected == test.input_attribute_id + + expected = 
@responses.activate
def test_delete(client):
    """Deleting an attribute mapping issues a DELETE and returns its 204."""
    mapping_url = (
        "http://localhost:9100/api/versioned/v1/projects/4/attributeMappings/19629-12"
    )
    responses.add(responses.DELETE, mapping_url, status=204)

    mapping = AttributeMapping(client, mappings_json)
    assert mapping.delete().status_code == 204
@responses.activate
def test_create(client):
    """Creating a mapping POSTs the supplied JSON unchanged."""
    endpoint = "http://localhost:9100/api/versioned/v1/projects/4/attributeMappings"
    captured = {}

    def record_post(request, snoop):
        # Keep the raw request body so it can be checked after the call.
        snoop["payload"] = request.body
        return 200, {}, json.dumps(mappings_json[0])

    responses.add(responses.GET, endpoint, json=mappings_json)
    responses.add_callback(
        responses.POST, endpoint, partial(record_post, snoop=captured)
    )

    collection = AttributeMappingCollection(client, "projects/4/attributeMappings")
    created = collection.create(create_json)

    assert created.input_dataset_name == create_json["inputDatasetName"]
    assert json.loads(captured["payload"]) == create_json
@responses.activate
def test_create_from_spec(client):
    """A mapping built with ``AttributeMappingSpec`` is POSTed as plain JSON.

    The creation is exercised twice: once from the spec's dict and once
    from the raw ``create_json``; both must send the same payload.
    """

    def record_post(request, snoop):
        # Store the decoded body so it can be compared as a dict.
        snoop["payload"] = json.loads(request.body)
        return 200, {}, json.dumps(mappings_json[0])

    endpoint = "http://localhost:9100/api/versioned/v1/projects/4/attributeMappings"
    captured = {}
    responses.add(responses.GET, endpoint, json=mappings_json)
    responses.add_callback(
        responses.POST, endpoint, partial(record_post, snoop=captured)
    )

    collection = AttributeMappingCollection(client, "projects/4/attributeMappings")

    # Build the spec one field at a time; each with_* call returns a spec.
    spec = AttributeMappingSpec.new()
    spec = spec.with_relative_input_attribute_id(
        create_json["relativeInputAttributeId"]
    )
    spec = spec.with_input_dataset_name(create_json["inputDatasetName"])
    spec = spec.with_input_attribute_name(create_json["inputAttributeName"])
    spec = spec.with_relative_unified_attribute_id(
        create_json["relativeUnifiedAttributeId"]
    )
    spec = spec.with_unified_dataset_name(create_json["unifiedDatasetName"])
    spec = spec.with_unified_attribute_name(create_json["unifiedAttributeName"])

    collection.create(spec.to_dict())
    assert captured["payload"] == create_json

    # Second pass: create directly from the raw JSON on a fresh collection.
    collection = AttributeMappingCollection(client, "projects/4/attributeMappings")
    created = collection.create(create_json)
    assert created.input_dataset_name == create_json["inputDatasetName"]
    assert captured["payload"] == create_json
"unify://unified-data/v1/projects/1/attributeMappings/19594-14", + "relativeId": "projects/1/attributeMappings/19594-14", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/suburb", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/suburb", +} + +mappings_json = [ + { + "id": "unify://unified-data/v1/projects/4/attributeMappings/19629-12", + "relativeId": "projects/4/attributeMappings/19629-12", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/surname", + "relativeInputAttributeId": "datasets/6/attributes/surname", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "surname", + "unifiedAttributeId": "unify://unified-data/v1/datasets/79/attributes/surname", + "relativeUnifiedAttributeId": "datasets/79/attributes/surname", + "unifiedDatasetName": "Charlotte_unified_dataset", + "unifiedAttributeName": "surname", + }, + { + "id": "unify://unified-data/v1/projects/4/attributeMappings/19629-17", + "relativeId": "projects/4/attributeMappings/19629-17", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/address_1", + "relativeInputAttributeId": "datasets/6/attributes/address_1", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "address_1", + "unifiedAttributeId": "unify://unified-data/v1/datasets/79/attributes/surname", + "relativeUnifiedAttributeId": "datasets/79/attributes/surname", + "unifiedDatasetName": "Charlotte_unified_dataset", + "unifiedAttributeName": "surname", + }, + { + "id": "unify://unified-data/v1/projects/4/attributeMappings/19630-16", + "relativeId": "projects/4/attributeMappings/19630-16", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/street_number", + "relativeInputAttributeId": "datasets/6/attributes/street_number", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "street_number", + "unifiedAttributeId": "unify://unified-data/v1/datasets/79/attributes/street_number", + 
"relativeUnifiedAttributeId": "datasets/79/attributes/street_number", + "unifiedDatasetName": "Charlotte_unified_dataset", + "unifiedAttributeName": "street_number", + }, + { + "id": "unify://unified-data/v1/projects/4/attributeMappings/19631-17", + "relativeId": "projects/4/attributeMappings/19631-17", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/address_1", + "relativeInputAttributeId": "datasets/6/attributes/address_1", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "address_1", + "unifiedAttributeId": "unify://unified-data/v1/datasets/79/attributes/address_1", + "relativeUnifiedAttributeId": "datasets/79/attributes/address_1", + "unifiedDatasetName": "Charlotte_unified_dataset", + "unifiedAttributeName": "address_1", + }, + { + "id": "unify://unified-data/v1/projects/4/attributeMappings/19632-9", + "relativeId": "projects/4/attributeMappings/19632-9", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/date_of_birth", + "relativeInputAttributeId": "datasets/6/attributes/date_of_birth", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "date_of_birth", + "unifiedAttributeId": "unify://unified-data/v1/datasets/79/attributes/Birthday", + "relativeUnifiedAttributeId": "datasets/79/attributes/Birthday", + "unifiedDatasetName": "Charlotte_unified_dataset", + "unifiedAttributeName": "Birthday", + }, +] diff --git a/tests/unit/test_base_path.py b/tests/unit/test_base_path.py new file mode 100644 index 00000000..b7355308 --- /dev/null +++ b/tests/unit/test_base_path.py @@ -0,0 +1,69 @@ +import responses + + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +auth = UsernamePasswordAuth("username", "password") + +""" +This is a test file for testing imperfect base paths, and various other tests too. +The first five tests demonstrate that the client can handle badly written base paths and produce correct final urls. 
+Each test runs a request to the server, and produces the correct final url. +""" + + +@responses.activate +def test_base_path_no_trailing_slash(): + bad_base_path = "/api/versioned/v1" + tamr = Client(auth, base_path=bad_base_path) + full_url = "http://localhost:9100/api/versioned/v1/datasets/1" + responses.add(responses.GET, full_url, status=200) + tamr.get("datasets/1") + + +@responses.activate +def test_base_path_no_leading_slash(): + bad_base_path = "api/versioned/v1/" + tamr = Client(auth, base_path=bad_base_path) + full_url = "http://localhost:9100/api/versioned/v1/datasets/1" + responses.add(responses.GET, full_url, status=200) + tamr.get("datasets/1") + + +@responses.activate +def test_base_path_no_slash(): + bad_base_path = "api/versioned/v1" + tamr = Client(auth, base_path=bad_base_path) + full_url = "http://localhost:9100/api/versioned/v1/datasets/1" + responses.add(responses.GET, full_url, status=200) + tamr.get("datasets/1") + + +@responses.activate +def test_base_path_default_slash(): + standard_base_path = "/api/versioned/v1/" + tamr = Client(auth, base_path=standard_base_path) + full_url = "http://localhost:9100/api/versioned/v1/datasets/1" + responses.add(responses.GET, full_url, status=200) + tamr.get("datasets/1") + + +@responses.activate +def test_base_path_no_base_path(): + tamr = Client(auth) + full_url = "http://localhost:9100/api/versioned/v1/datasets/2" + responses.add(responses.GET, full_url, status=400) + tamr.get("datasets/2") + + +@responses.activate +def test_request_absolute_endpoint(): + endpoint = "/api/service/health" + full_url = f"http://localhost:9100{endpoint}" + responses.add(responses.GET, full_url, json={}) + client = Client(UsernamePasswordAuth("username", "password")) + # If client does not properly handle absolute paths, client.get() will + # raise a ConnectionRefused exception. 
+ client.get(endpoint) diff --git a/tests/unit/test_binning_model.py b/tests/unit/test_binning_model.py new file mode 100644 index 00000000..7331ddf3 --- /dev/null +++ b/tests/unit/test_binning_model.py @@ -0,0 +1,162 @@ +from functools import partial +import json + +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +project_config = { + "name": "Project 1", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "Project 1 - Unified Dataset", + "externalId": "Project1", + "resourceId": "1", +} +project_url = "http://localhost:9100/api/versioned/v1/projects/1" + + +@responses.activate +def test_binning_model_records(): + + records_body = [ + { + "id": ["d8b7351d-24ce-49aa-8655-5b5809ab6bb8"], + "isActive": ["true"], + "clauseId": ["2e6c5f1b-ed49-40ab-8cbb-350aded25070"], + "similarityFunction": ["COSINE"], + "tokenizer": ["DEFAULT"], + "fieldName": ["surname"], + "threshold": ["0.75"], + } + ] + + records_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/binningModel/records" + ) + + responses.add(responses.GET, project_url, json=project_config) + + responses.add( + responses.GET, + records_url, + body="\n".join(json.dumps(body) for body in records_body), + ) + + tamr = Client(UsernamePasswordAuth("username", "password")) + + project = tamr.projects.by_resource_id("1").as_mastering() + binning_model = project.binning_model() + + binning_model_records = list(binning_model.records()) + assert binning_model_records == records_body + + +@responses.activate +def test_binning_model_update_records(): + + records_body = [ + { + "id": ["d8b7351d-24ce-49aa-8655-5b5809ab6bb8"], + "isActive": ["true"], + "clauseId": ["2e6c5f1b-ed49-40ab-8cbb-350aded25070"], + "similarityFunction": ["COSINE"], + "tokenizer": ["DEFAULT"], + "fieldName": ["surname"], + "threshold": ["0.75"], + }, + { + "id": ["d8b7351d-24ce-49aa-8655-5b5809ab6bc9"], + "isActive": ["true"], + "clauseId": 
["2e6c5f1b-ed49-40ab-8cbb-350aded25070"], + "similarityFunction": ["COSINE"], + "tokenizer": ["DEFAULT"], + "fieldName": ["surname"], + "threshold": ["0.75"], + }, + { + "id": ["d8b7351d-24ce-49aa-8655-5b5809ab6bd8"], + "isActive": ["true"], + "clauseId": ["2e6c5f1b-ed49-40ab-8cbb-350aded25070"], + "similarityFunction": ["COSINE"], + "tokenizer": ["DEFAULT"], + "fieldName": ["surname"], + "threshold": ["0.75"], + }, + ] + + expected_updates = [ + { + "action": "CREATE", + "recordId": "d8b7351d-24ce-49aa-8655-5b5809ab6bb8", + "record": { + "id": ["d8b7351d-24ce-49aa-8655-5b5809ab6bb8"], + "isActive": ["true"], + "clauseId": ["2e6c5f1b-ed49-40ab-8cbb-350aded25070"], + "similarityFunction": ["COSINE"], + "tokenizer": ["DEFAULT"], + "fieldName": ["surname"], + "threshold": ["0.75"], + }, + }, + { + "action": "CREATE", + "recordId": "d8b7351d-24ce-49aa-8655-5b5809ab6bc9", + "record": { + "id": ["d8b7351d-24ce-49aa-8655-5b5809ab6bc9"], + "isActive": ["true"], + "clauseId": ["2e6c5f1b-ed49-40ab-8cbb-350aded25070"], + "similarityFunction": ["COSINE"], + "tokenizer": ["DEFAULT"], + "fieldName": ["surname"], + "threshold": ["0.75"], + }, + }, + { + "action": "CREATE", + "recordId": "d8b7351d-24ce-49aa-8655-5b5809ab6bd8", + "record": { + "id": ["d8b7351d-24ce-49aa-8655-5b5809ab6bd8"], + "isActive": ["true"], + "clauseId": ["2e6c5f1b-ed49-40ab-8cbb-350aded25070"], + "similarityFunction": ["COSINE"], + "tokenizer": ["DEFAULT"], + "fieldName": ["surname"], + "threshold": ["0.75"], + }, + }, + ] + + snoop_dict = {} + + def update_callback(request, snoop): + snoop["payload"] = request.body + return 200, {}, "{}" + + update_records_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/binningModel/records" + ) + + responses.add(responses.GET, project_url, json=project_config) + + responses.add_callback( + responses.POST, + update_records_url, + callback=partial(update_callback, snoop=snoop_dict), + ) + + tamr = Client(UsernamePasswordAuth("username", "password")) + + project 
= tamr.projects.by_resource_id("1").as_mastering() + binning_model = project.binning_model() + + updates = [ + {"action": "CREATE", "recordId": record["id"][0], "record": record} + for record in records_body + ] + + binning_model.update_records(updates) + actual = [json.loads(item) for item in snoop_dict["payload"]] + assert expected_updates == actual diff --git a/tests/unit/test_categorization.py b/tests/unit/test_categorization.py new file mode 100644 index 00000000..0ab5de68 --- /dev/null +++ b/tests/unit/test_categorization.py @@ -0,0 +1,63 @@ +from unittest import TestCase + +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +class TestCategorization(TestCase): + def setUp(self): + auth = UsernamePasswordAuth("username", "password") + self.tamr = Client(auth) + + @responses.activate + def test_taxonomy(self): + project_url = "http://localhost:9100/api/versioned/v1/projects/1" + taxonomy_url = "http://localhost:9100/api/versioned/v1/projects/1/taxonomy" + responses.add(responses.GET, project_url, json=self._project_json) + responses.add(responses.POST, taxonomy_url, json=self._taxonomy_json) + + project = self.tamr.projects.by_resource_id("1").as_categorization() + creation_spec = {"name": "Test Taxonomy"} + u = project.create_taxonomy(creation_spec) + + responses.add(responses.GET, taxonomy_url, json=self._taxonomy_json) + t = project.taxonomy() + self.assertEqual(print(u), print(t)) + + _project_json = { + "id": "unify://unified-data/v1/projects/1", + "name": "Test Project", + "description": "Categorization Project", + "type": "CATEGORIZATION", + "unifiedDatasetName": "", + "created": { + "username": "admin", + "time": "2019-07-12T13:08:17.440Z", + "version": "401", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:08:17.534Z", + "version": "402", + }, + "relativeId": "projects/1", + "externalId": "904bf89e-74ba-45c5-8b4a-5ff913728f66", + } + + _taxonomy_json = { + 
"id": "unify://unified-data/v1/projects/1/taxonomy", + "name": "Test Taxonomy", + "created": { + "username": "admin", + "time": "2019-07-12T13:09:14.981Z", + "version": "405", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:09:14.981Z", + "version": "405", + }, + "relativeId": "projects/1/taxonomy", + } diff --git a/tests/unit/test_category.py b/tests/unit/test_category.py new file mode 100644 index 00000000..5c77452c --- /dev/null +++ b/tests/unit/test_category.py @@ -0,0 +1,89 @@ +from unittest import TestCase + +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.categorization.category.resource import Category + + +class TestCategory(TestCase): + def setUp(self): + auth = UsernamePasswordAuth("username", "password") + self.tamr = Client(auth) + + def test_resource(self): + alias = "projects/1/taxonomy/categories/1" + row_num = Category(self.tamr, self._categories_json[0], alias) + + expected = alias + self.assertEqual(expected, row_num.relative_id) + + expected = self._categories_json[0]["name"] + self.assertEqual(expected, row_num.name) + + expected = self._categories_json[0]["description"] + self.assertEqual(expected, row_num.description) + + def test_resource_from_json(self): + alias = "projects/1/taxonomy/categories/1" + expected = Category(self.tamr, self._categories_json[0], alias) + actual = Category.from_json(self.tamr, self._categories_json[0], alias) + self.assertEqual(repr(expected), repr(actual)) + + @responses.activate + def test_path(self): + t2 = Category( + self.tamr, self._categories_json[1], "projects/1/taxonomy/categories/2" + ) + + parent_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/taxonomy/categories/1" + ) + responses.add(responses.GET, parent_url, json=self._categories_json[0]) + t1 = t2.parent() + + self.assertEqual(self._categories_json[0]["relativeId"], t1.relative_id) + self.assertIsNone(t1.parent()) + + 
self.assertEqual(t1.path, [t1.name]) + self.assertEqual(t2.path, [t1.name, t2.name]) + + _categories_json = [ + { + "id": "unify://unified-data/v1/projects/1/taxonomy/categories/1", + "name": "t1", + "description": "", + "parent": "", + "path": ["t1"], + "created": { + "username": "admin", + "time": "2019-07-12T13:10:52.988Z", + "version": "414", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:10:52.988Z", + "version": "414", + }, + "relativeId": "projects/1/taxonomy/categories/1", + }, + { + "id": "unify://unified-data/v1/projects/1/taxonomy/categories/2", + "name": "t2", + "description": "", + "parent": "unify://unified-data/v1/projects/1/taxonomy/categories/1", + "path": ["t1", "t2"], + "created": { + "username": "admin", + "time": "2019-07-12T13:51:20.600Z", + "version": "419", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:51:20.600Z", + "version": "419", + }, + "relativeId": "projects/1/taxonomy/categories/2", + }, + ] diff --git a/tests/unit/test_category_collection.py b/tests/unit/test_category_collection.py new file mode 100644 index 00000000..d69dc489 --- /dev/null +++ b/tests/unit/test_category_collection.py @@ -0,0 +1,59 @@ +import pytest +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +@pytest.fixture +def client(): + auth = UsernamePasswordAuth("username", "password") + tamr = Client(auth) + return tamr + + +@responses.activate +def test_delete_by_resource_id(client): + taxonomy_url = categorization_project + "/taxonomy" + category_url = taxonomy_url + "/categories/3" + + responses.add( + responses.GET, categorization_project, json=categorization_project_config + ) + + responses.add(responses.GET, taxonomy_url, json=taxonomy) + responses.add(responses.DELETE, category_url, status=204) + + category_collection = client.projects.by_resource_id("2").as_categorization() + response = 
category_collection.taxonomy().categories().delete_by_resource_id("3") + assert response.status_code == 204 + + +url_prefix = "http://localhost:9100/api/versioned/v1/" +categorization_project = url_prefix + "projects/2" + +categorization_project_config = { + "id": "unify://unified-data/v1/projects/2", + "name": "cat", + "description": "Categorization Project", + "type": "CATEGORIZATION", + "unifiedDatasetName": "", + "relativeId": "projects/2", + "externalId": "904bf89e-74ba-45c5-8b4a-5ff913728f66", +} + +taxonomy = { + "id": "unify://unified-data/v1/projects/2/taxonomy", + "name": "tax", + "created": { + "username": "admin", + "time": "2019-07-12T13:09:14.981Z", + "version": "405", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:09:14.981Z", + "version": "405", + }, + "relativeId": "projects/2/taxonomy", +} diff --git a/tests/unit/test_client_origin.py b/tests/unit/test_client_origin.py new file mode 100644 index 00000000..7e0c8667 --- /dev/null +++ b/tests/unit/test_client_origin.py @@ -0,0 +1,37 @@ +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +def test_client_default(): + auth = UsernamePasswordAuth("username", "password") + client = Client(auth) + + assert client.origin == "http://localhost:9100" + + +def test_client_set_protocol(): + auth = UsernamePasswordAuth("username", "password") + client = Client(auth, protocol="https") + + assert client.origin == "https://localhost:9100" + + +def test_client_set_host(): + auth = UsernamePasswordAuth("username", "password") + client = Client(auth, host="123.123.123.123") + + assert client.origin == "http://123.123.123.123:9100" + + +def test_client_set_port(): + auth = UsernamePasswordAuth("username", "password") + client = Client(auth, port=80) + + assert client.origin == "http://localhost:80" + + +def test_client_set_port_none(): + auth = UsernamePasswordAuth("username", "password") + client = Client(auth, port=None) + + assert client.origin 
== "http://localhost" diff --git a/tests/unit/test_create_dataset.py b/tests/unit/test_create_dataset.py index 9d243b68..ef0a267f 100644 --- a/tests/unit/test_create_dataset.py +++ b/tests/unit/test_create_dataset.py @@ -1,33 +1,219 @@ +from functools import partial import json +from pandas import DataFrame +import pytest import responses from tamr_unify_client import Client from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.dataset.collection import CreationError +from tamr_unify_client.dataset.resource import DatasetSpec auth = UsernamePasswordAuth("username", "password") -unify = Client(auth) +tamr = Client(auth) @responses.activate def test_create_dataset(): - dataset_creation_spec = { - "id": "unify://unified-data/v1/datasets/1", - "name": "dataset", - "keyAttributeNames": ["F1"], - "description": "So much data in here!", - "externalId": "Dataset created with pubapi", - } - - datasets_url = f"http://localhost:9100/api/versioned/v1/datasets" - dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/1" - - responses.add(responses.POST, datasets_url, json=dataset_creation_spec, status=204) - responses.add(responses.GET, dataset_url, json=dataset_creation_spec) - - u = unify.create_dataset(dataset_creation_spec) - p = unify.datasets.by_resource_id("1") + def create_callback(request, snoop): + snoop["payload"] = json.loads(request.body) + return 201, {}, json.dumps(_dataset_json) + + dataset_url = _datasets_url + "/1" + snoop_dict = {} + responses.add_callback( + responses.POST, _datasets_url, partial(create_callback, snoop=snoop_dict) + ) + responses.add(responses.GET, dataset_url, json=_dataset_json) + + u = tamr.datasets.create(_creation_spec) + p = tamr.datasets.by_resource_id("1") + assert u.name == p.name assert u.key_attribute_names == p.key_attribute_names assert u.description == p.description assert u.external_id == p.external_id + + +@responses.activate +def test_create_from_dataframe(): + def create_callback(request, 
snoop): + snoop["creation"] = json.loads(request.body) + return 201, {}, json.dumps(_dataset_json) + + def attribute_callback(request, snoop): + snoop["attribute"] = json.loads(request.body) + return 201, {}, json.dumps(_attribute_json) + + def record_callback(request, snoop): + snoop["records"] = [json.loads(r) for r in request.body] + return 200, {}, json.dumps(_records_response_json) + + snoop_dict = {} + responses.add_callback( + responses.POST, _datasets_url, partial(create_callback, snoop=snoop_dict) + ) + responses.add_callback( + responses.POST, _attribute_url, partial(attribute_callback, snoop=snoop_dict) + ) + # only one additional attribute should be created, as the pk is handled at dataset creation + responses.add(responses.POST, _attribute_url, status=500) + responses.add_callback( + responses.POST, _records_url, partial(record_callback, snoop=snoop_dict) + ) + + dataset = tamr.datasets.create_from_dataframe(_dataframe, "attribute1", "Dataset") + assert dataset.name == _dataset_json["name"] + + creation_spec = snoop_dict["creation"] + assert creation_spec["name"] == _dataset_json["name"] + assert creation_spec["keyAttributeNames"], ["attribute1"] + + attribute_spec = snoop_dict["attribute"] + assert attribute_spec["name"] == _attribute_json["name"] + assert attribute_spec["type"] == _attribute_json["type"] + + records_spec = snoop_dict["records"] + assert len(records_spec) == len(_records_json) + for command, record in zip(records_spec, _records_json): + assert command["action"] == "CREATE" + assert command["record"] == record + + +def test_key_not_in_dataframe(): + with pytest.raises(KeyError): + tamr.datasets.create_from_dataframe(_dataframe, "bad key", "Dataset") + + +@responses.activate +def test_creation_initial_failure(): + responses.add(responses.POST, _datasets_url, status=500) + responses.add(responses.DELETE, _datasets_url + "/1", status=204) + + with pytest.raises(CreationError): + tamr.datasets.create_from_dataframe(_dataframe, 
"attribute1", "Dataset") + + +@responses.activate +def test_attribute_creation_failure(): + responses.add(responses.POST, _datasets_url, json=_dataset_json) + responses.add(responses.POST, _attribute_url, status=500) + responses.add(responses.DELETE, _dataset_url, status=204) + + with pytest.raises(CreationError): + tamr.datasets.create_from_dataframe(_dataframe, "attribute1", "Dataset") + + +@responses.activate +def test_record_failure(): + responses.add(responses.POST, _datasets_url, json=_dataset_json) + responses.add(responses.POST, _attribute_url, json=_attribute_json) + responses.add(responses.POST, _records_url, status=500) + responses.add(responses.DELETE, _dataset_url, status=204) + + with pytest.raises(CreationError): + tamr.datasets.create_from_dataframe(_dataframe, "attribute1", "Dataset") + + +@responses.activate +def test_record_validation_failure(): + responses.add(responses.POST, _datasets_url, json=_dataset_json) + responses.add(responses.POST, _attribute_url, json=_attribute_json) + responses.add(responses.POST, _records_url, json=_records_failure_json) + responses.add(responses.DELETE, _dataset_url, status=204) + + with pytest.raises(CreationError): + tamr.datasets.create_from_dataframe(_dataframe, "attribute1", "Dataset") + + +@responses.activate +def test_dataset_deletion_failure(): + responses.add(responses.POST, _datasets_url, json=_dataset_json) + responses.add(responses.POST, _attribute_url, json=_attribute_json) + responses.add(responses.POST, _records_url, json=_records_failure_json) + responses.add(responses.DELETE, _dataset_url, status=500) + + with pytest.raises(CreationError): + tamr.datasets.create_from_dataframe(_dataframe, "attribute1", "Dataset") + + +@responses.activate +def test_create_from_spec(): + def create_callback(request, snoop): + snoop["payload"] = json.loads(request.body) + return 201, {}, json.dumps(_dataset_json) + + snoop_dict = {} + responses.add_callback( + responses.POST, _datasets_url, partial(create_callback, 
snoop=snoop_dict) + ) + + spec = ( + DatasetSpec.new() + .with_name(_creation_spec["name"]) + .with_key_attribute_names(_creation_spec["keyAttributeNames"]) + .with_description(_creation_spec["description"]) + .with_external_id(_creation_spec["externalId"]) + ) + d = tamr.datasets.create(spec.to_dict()) + + assert snoop_dict["payload"] == _creation_spec + assert d.relative_id == _dataset_json["relativeId"] + + +_creation_spec = { + "name": "Dataset", + "keyAttributeNames": ["F1"], + "description": "So much data in here!", + "externalId": "Dataset created with pubapi", +} + +_datasets_url = "http://localhost:9100/api/versioned/v1/datasets" +_dataset_url = _datasets_url + "/1" +_attribute_url = _dataset_url + "/attributes" +_records_url = _dataset_url + ":updateRecords" + +_records_json = [ + {"attribute1": 1, "attribute2": "hi"}, + {"attribute1": 2, "attribute2": "record"}, +] +_dataframe = DataFrame(_records_json, columns=["attribute1", "attribute2"]) + +_records_response_json = { + "numCommandsProcessed": 2, + "allCommandsSucceeded": True, + "validationErrors": [], +} + +_records_failure_json = { + "numCommandsProcessed": 2, + "allCommandsSucceeded": False, + "validationErrors": [], +} + +_dataset_json = { + **_creation_spec, + "id": "unify://unified-data/v1/datasets/1", + "version": "1", + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "1", + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "1", + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [], +} + +_attribute_json = { + "name": "attribute2", + "description": "", + "type": {"baseType": "ARRAY", "innerType": {"baseType": "STRING"}}, + "isNullable": False, +} diff --git a/tests/unit/test_create_project.py b/tests/unit/test_create_project.py index 85bd2e05..ab703140 100644 --- a/tests/unit/test_create_project.py +++ b/tests/unit/test_create_project.py @@ -1,31 +1,82 @@ +from functools import partial 
import json import responses from tamr_unify_client import Client from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.project.resource import ProjectSpec auth = UsernamePasswordAuth("username", "password") -unify = Client(auth) +tamr = Client(auth) + +creation_spec = { + "name": "Project 1", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "Project 1 - Unified Dataset", + "externalId": "Project1", +} + +project_json = { + **creation_spec, + "id": "unify://unified-data/v1/projects/1", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "project 1 created version", + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "project 1 modified version", + }, + "relativeId": "projects/1", +} + +projects_url = "http://localhost:9100/api/versioned/v1/projects" +project_url = f"{projects_url}/1" @responses.activate def test_create_project(): - project_creation_spec = { - "name": "Project 1", - "description": "Mastering Project", - "type": "DEDUP", - "unifiedDatasetName": "Project 1 - Unified Dataset", - "externalId": "Project1", - "resourceId": "1", - } - - projects_url = f"http://localhost:9100/api/versioned/v1/projects" - project_url = f"http://localhost:9100/api/versioned/v1/projects/1" - - responses.add(responses.POST, projects_url, json=project_creation_spec, status=204) - responses.add(responses.GET, project_url, json=project_creation_spec) - - u = unify.create_project(project_creation_spec) - p = unify.projects.by_resource_id("1") - assert print(p) == print(u) + def create_callback(request, snoop): + snoop["payload"] = json.loads(request.body) + return 204, {}, json.dumps(project_json) + + snoop_dict = {} + responses.add_callback( + responses.POST, projects_url, partial(create_callback, snoop=snoop_dict) + ) + responses.add(responses.GET, project_url, json=project_json) + + u = tamr.projects.create(creation_spec) + p = 
tamr.projects.by_resource_id("1") + + assert snoop_dict["payload"] == creation_spec + assert p.__repr__() == u.__repr__() + + +@responses.activate +def test_create_from_spec(): + def create_callback(request, snoop): + snoop["payload"] = json.loads(request.body) + return 204, {}, json.dumps(project_json) + + snoop_dict = {} + responses.add_callback( + responses.POST, projects_url, partial(create_callback, snoop=snoop_dict) + ) + + spec = ( + ProjectSpec.new() + .with_name(creation_spec["name"]) + .with_description(creation_spec["description"]) + .with_type(creation_spec["type"]) + .with_unified_dataset_name(creation_spec["unifiedDatasetName"]) + .with_external_id(creation_spec["externalId"]) + ) + p = tamr.projects.create(spec.to_dict()) + + assert snoop_dict["payload"] == creation_spec + assert p.relative_id == project_json["relativeId"] diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py new file mode 100644 index 00000000..234b356e --- /dev/null +++ b/tests/unit/test_dataset.py @@ -0,0 +1,122 @@ +from functools import partial +import json + +import pytest +from requests import HTTPError +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +@pytest.fixture +def client(): + auth = UsernamePasswordAuth("username", "password") + tamr = Client(auth) + return tamr + + +@responses.activate +def test_delete(client): + responses.add(responses.GET, _url, json=_dataset_json) + responses.add(responses.DELETE, _url, status=204) + responses.add(responses.GET, _url, status=404) + + dataset = client.datasets.by_resource_id("1") + assert dataset._data == _dataset_json + + response = dataset.delete() + assert response.status_code == 204 + with pytest.raises(HTTPError): + client.datasets.by_resource_id("1") + + +@responses.activate +def test_cascading_delete(client): + responses.add(responses.GET, _url, json=_dataset_json) + responses.add(responses.DELETE, _url + "?cascade=True", status=204) + 
responses.add(responses.GET, _url, status=404) + + dataset = client.datasets.by_resource_id("1") + assert dataset._data == _dataset_json + + response = dataset.delete(cascade=True) + assert response.status_code == 204 + with pytest.raises(HTTPError): + client.datasets.by_resource_id("1") + + +@responses.activate +def test_update(client): + def create_callback(request, snoop): + snoop["payload"] = request.body + return 200, {}, json.dumps(_updated_dataset_json) + + snoop_dict = {} + responses.add(responses.GET, _url, json=_dataset_json) + responses.add_callback( + responses.PUT, _url, partial(create_callback, snoop=snoop_dict) + ) + + dataset = client.datasets.by_resource_id("1") + + temp_spec = dataset.spec().with_description(_updated_dataset_json["description"]) + new_dataset = ( + temp_spec.with_external_id(_updated_dataset_json["externalId"]) + .with_tags(_updated_dataset_json["tags"]) + .put() + ) + + assert new_dataset._data == _updated_dataset_json + assert json.loads(snoop_dict["payload"]) == _updated_dataset_json + assert dataset._data == _dataset_json + + # checking that intermediate didn't change + assert temp_spec.to_dict()["externalId"] == _dataset_json["externalId"] + + +_url = "http://localhost:9100/api/versioned/v1/datasets/1" + +_dataset_json = { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "1", + "name": "dataset 1 name", + "description": "dataset 1 description", + "version": "dataset 1 version", + "keyAttributeNames": ["tamr_id"], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version", + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version", + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [], +} + +_updated_dataset_json = { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "dataset1", + "name": "dataset 1 name", + "description": "updated description", + 
"version": "dataset 1 version", + "keyAttributeNames": ["tamr_id"], + "tags": ["new", "tags"], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version", + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version", + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [], +} diff --git a/tests/unit/test_dataset_attributes.py b/tests/unit/test_dataset_attributes.py index b7922788..ffe8d297 100644 --- a/tests/unit/test_dataset_attributes.py +++ b/tests/unit/test_dataset_attributes.py @@ -1,13 +1,10 @@ -import json - import responses from tamr_unify_client import Client from tamr_unify_client.auth import UsernamePasswordAuth -from tamr_unify_client.models.attribute.resource import Attribute auth = UsernamePasswordAuth("username", "password") -unify = Client(auth) +tamr = Client(auth) @responses.activate @@ -19,7 +16,7 @@ def test_dataset_attributes(): "isNullable": "false", } - dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/1" + dataset_url = "http://localhost:9100/api/versioned/v1/datasets/1" responses.add(responses.GET, dataset_url, json={}) responses.add( @@ -29,11 +26,13 @@ def test_dataset_attributes(): status=204, ) responses.add( - responses.GET, dataset_url + "/attributes", json=[attribute_creation_spec] + responses.GET, + dataset_url + "/attributes/myAttribute", + json=attribute_creation_spec, ) - dataset = unify.datasets.by_resource_id("1") - create = dataset.create_attribute(attribute_creation_spec) + dataset = tamr.datasets.by_resource_id("1") + create = dataset.attributes.create(attribute_creation_spec) created = dataset.attributes.by_name("myAttribute") assert (create.relative_id) == (created.relative_id) diff --git a/tests/unit/test_dataset_by_external_id.py b/tests/unit/test_dataset_by_external_id.py index 42efdc4e..c5350e41 100644 --- a/tests/unit/test_dataset_by_external_id.py +++ 
b/tests/unit/test_dataset_by_external_id.py @@ -1,5 +1,3 @@ -import json - import pytest import responses @@ -38,15 +36,15 @@ def test_dataset_by_external_id__raises_when_not_found(): responses.add(responses.GET, datasets_url, json=[]) auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) + tamr = Client(auth) with pytest.raises(KeyError): - unify.datasets.by_external_id(dataset_external_id) + tamr.datasets.by_external_id(dataset_external_id) @responses.activate def test_dataset_by_external_id_succeeds(): responses.add(responses.GET, datasets_url, json=dataset_json) auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) - actual_dataset = unify.datasets.by_external_id(dataset_external_id) + tamr = Client(auth) + actual_dataset = tamr.datasets.by_external_id(dataset_external_id) assert actual_dataset._data == dataset_json[0] diff --git a/tests/unit/test_dataset_collection.py b/tests/unit/test_dataset_collection.py new file mode 100644 index 00000000..945faaf9 --- /dev/null +++ b/tests/unit/test_dataset_collection.py @@ -0,0 +1,35 @@ +import pytest +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +@pytest.fixture +def client(): + auth = UsernamePasswordAuth("username", "password") + tamr = Client(auth) + return tamr + + +@responses.activate +def test_delete_by_resource_id(client): + ds_url = url_prefix + "datasets/115" + + responses.add(responses.DELETE, ds_url, status=204) + + response = client.datasets.delete_by_resource_id("115") + assert response.status_code == 204 + + +@responses.activate +def test_delete_by_resource_id_cascade(client): + ds_url = url_prefix + "datasets/115?cascade=True" + + responses.add(responses.DELETE, ds_url, status=204) + + response = client.datasets.delete_by_resource_id("115", cascade=True) + assert response.status_code == 204 + + +url_prefix = "http://localhost:9100/api/versioned/v1/" diff --git a/tests/unit/test_dataset_geo.py 
b/tests/unit/test_dataset_geo.py index f74a895d..e399fc24 100644 --- a/tests/unit/test_dataset_geo.py +++ b/tests/unit/test_dataset_geo.py @@ -3,17 +3,18 @@ import json from unittest import TestCase +import pytest import responses from tamr_unify_client import Client from tamr_unify_client.auth import UsernamePasswordAuth -from tamr_unify_client.models.dataset.resource import Dataset +from tamr_unify_client.dataset.resource import Dataset class TestDatasetGeo(TestCase): def setUp(self): auth = UsernamePasswordAuth("username", "password") - self.unify = Client(auth) + self.tamr = Client(auth) def test_record_to_feature(self): empty_record = {"id": "1"} @@ -143,7 +144,7 @@ def key_value_single(rec): actual = Dataset._record_to_feature( record_with_null_geo, key_value_single, ["id"], "geom" ) - expected = {"type": "Feature", "id": "1"} + expected = {"geometry": None, "type": "Feature", "id": "1"} self.assertEqual(expected, actual) record_with_bbox = {"id": "1", "bbox": [[0, 0], [1, 1]]} @@ -230,7 +231,7 @@ def key_value_composite(rec): @responses.activate def test_geo_features(self): - dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/1" + dataset_url = "http://localhost:9100/api/versioned/v1/datasets/1" responses.add(responses.GET, dataset_url, json=self._dataset_json) attributes_url = f"{dataset_url}/attributes" @@ -242,7 +243,7 @@ def test_geo_features(self): records_url, body="\n".join([json.dumps(rec) for rec in self._records_json]), ) - dataset = self.unify.datasets.by_resource_id("1") + dataset = self.tamr.datasets.by_resource_id("1") features = [feature for feature in dataset.itergeofeatures()] self.assertEqual(6, len(features)) self.assertSetEqual( @@ -257,9 +258,36 @@ def test_geo_features(self): {feature["id"] for feature in features}, ) + @responses.activate + def test_geo_features_geo_attr(self): + dataset_url = "http://localhost:9100/api/versioned/v1/datasets/1" + responses.add(responses.GET, dataset_url, json=self._dataset_json) + + # 
Create a dataset with multiple geometry attributes + multi_geo_attrs = deepcopy(self._attributes_json) + geo2_attr = deepcopy(multi_geo_attrs[-1]) + geo2_attr["name"] = "geom2" + multi_geo_attrs.append(geo2_attr) + attributes_url = f"{dataset_url}/attributes" + responses.add(responses.GET, attributes_url, json=multi_geo_attrs) + + # Create a record with multiple geometry attributes + record = {"id": "point", "geom": {"point": [1, 1]}, "geom2": {"point": [2, 2]}} + records_url = f"{dataset_url}/records" + responses.add(responses.GET, records_url, body=json.dumps(record)) + dataset = self.tamr.datasets.by_resource_id("1") + + # Default is to get the first attribute with geometry type + feature = next(dataset.itergeofeatures()) + self.assertEqual(feature["geometry"]["coordinates"], record["geom"]["point"]) + + # We can override which geometry attribute is used for geometry + feature = next(dataset.itergeofeatures(geo_attr="geom2")) + self.assertEqual(feature["geometry"]["coordinates"], record["geom2"]["point"]) + @responses.activate def test_geo_interface(self): - dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/1" + dataset_url = "http://localhost:9100/api/versioned/v1/datasets/1" responses.add(responses.GET, dataset_url, json=self._dataset_json) attributes_url = f"{dataset_url}/attributes" @@ -271,7 +299,7 @@ def test_geo_interface(self): records_url, body="\n".join([json.dumps(rec) for rec in self._records_json]), ) - dataset = self.unify.datasets.by_resource_id("1") + dataset = self.tamr.datasets.by_resource_id("1") fc = dataset.__geo_interface__ self.assertEqual("FeatureCollection", fc["type"]) self.assertSetEqual( @@ -428,6 +456,25 @@ def test_feature_to_record(self): expected = {"pk1": "1", "pk2": "2", "geo": {"point": [0, 0]}} self.assertEqual(expected, actual) + feature = {"type": "Feature", "id": "1", "geometry": None} + Dataset._feature_to_record(feature, ["pk"], "geo") + # feature_to_record is required to not raise an exception + + feature = 
{ + "type": "Feature", + "id": None, + "geometry": {"type": "Point", "coordinates": [0, 0]}, + } + with pytest.raises(ValueError): + Dataset._feature_to_record(feature, ["pk"], "geo") + + feature = { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [0, 0]}, + } + with pytest.raises(ValueError): + Dataset._feature_to_record(feature, ["pk"], "geo") + class NotAFeature: @property def __geo_interface__(self): @@ -448,7 +495,7 @@ def update_callback(request, snoop): snoop["payload"] = request.body return 200, {}, "{}" - dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/1" + dataset_url = "http://localhost:9100/api/versioned/v1/datasets/1" responses.add(responses.GET, dataset_url, json=self._dataset_json) attributes_url = f"{dataset_url}/attributes" @@ -460,7 +507,7 @@ def update_callback(request, snoop): responses.POST, records_url, callback=partial(update_callback, snoop=snoop) ) - dataset = self.unify.datasets.by_resource_id("1") + dataset = self.tamr.datasets.by_resource_id("1") features = [ {"id": "1", "geometry": {"type": "Point", "coordinates": [0, 0]}}, {"id": "2", "geometry": {"type": "Point", "coordinates": [1, 1]}}, @@ -493,6 +540,57 @@ def __geo_interface__(self): actual = [json.loads(item) for item in snoop["payload"]] self.assertEqual(expected, actual) + @responses.activate + def test_from_geo_features_geo_attr(self): + def update_callback(request, snoop): + snoop["payload"] = request.body + return 200, {}, "{}" + + dataset_url = "http://localhost:9100/api/versioned/v1/datasets/1" + responses.add(responses.GET, dataset_url, json=self._dataset_json) + + # Create a dataset with multiple geometry attributes + multi_geo_attrs = deepcopy(self._attributes_json) + geo2_attr = deepcopy(multi_geo_attrs[-1]) + geo2_attr["name"] = "geom2" + multi_geo_attrs.append(geo2_attr) + attributes_url = f"{dataset_url}/attributes" + responses.add(responses.GET, attributes_url, json=multi_geo_attrs) + + records_url = f"{dataset_url}:updateRecords" 
+ snoop = {} + responses.add_callback( + responses.POST, records_url, callback=partial(update_callback, snoop=snoop) + ) + + dataset = self.tamr.datasets.by_resource_id("1") + features = [{"id": "1", "geometry": {"type": "Point", "coordinates": [0, 0]}}] + + # by default, the first attribute with geometry type is used for geometry + dataset.from_geo_features(features) + expected = [ + { + "action": "CREATE", + "recordId": "1", + "record": {"geom": {"point": [0, 0]}, "id": "1"}, + } + ] + actual = [json.loads(item) for item in snoop["payload"]] + self.assertEqual(expected, actual) + + # We can override which geometry attribute is used for geometry + snoop["payload"] = None + dataset.from_geo_features(features, geo_attr="geom2") + expected = [ + { + "action": "CREATE", + "recordId": "1", + "record": {"geom2": {"point": [0, 0]}, "id": "1"}, + } + ] + actual = [json.loads(item) for item in snoop["payload"]] + self.assertEqual(expected, actual) + @responses.activate def test_from_geo_features_composite_key(self): def update_callback(request, snoop): @@ -501,7 +599,7 @@ def update_callback(request, snoop): composite_key_dataset_json = deepcopy(self._dataset_json) composite_key_dataset_json["keyAttributeNames"] = ["id1", "id2"] - dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/1" + dataset_url = "http://localhost:9100/api/versioned/v1/datasets/1" responses.add(responses.GET, dataset_url, json=composite_key_dataset_json) composite_key_attributes_json = deepcopy(self._attributes_json) @@ -519,7 +617,7 @@ def update_callback(request, snoop): responses.POST, records_url, callback=partial(update_callback, snoop=snoop) ) - dataset = self.unify.datasets.by_resource_id("1") + dataset = self.tamr.datasets.by_resource_id("1") features = [ {"id": ["1", "a"], "geometry": {"type": "Point", "coordinates": [0, 0]}}, {"id": ["2", "b"], "geometry": {"type": "Point", "coordinates": [1, 1]}}, diff --git a/tests/unit/test_dataset_profile.py 
b/tests/unit/test_dataset_profile.py index 9b5b9b4f..e662d205 100644 --- a/tests/unit/test_dataset_profile.py +++ b/tests/unit/test_dataset_profile.py @@ -1,36 +1,123 @@ -import json +from unittest import TestCase import responses from tamr_unify_client import Client from tamr_unify_client.auth import UsernamePasswordAuth -profile_json1 = { - "datasetName": "ds3", - "relativeDatasetId": "v1/datasets/3", - "isUpToDate": "false", - "profiledDataVersion": "3", - "profiledAt": { - "username": "system", - "time": "2019-06-05T14:23:25.860Z", - "version": "46", - }, - "simpleMetrics": [{"metricName": "rowCount", "metricValue": "1999"}], -} - - -@responses.activate -def test_dataset_profile(): - dataset_id = "3" - dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/{dataset_id}" - profile_url = f"{dataset_url}/profile" - profile_refresh_url = f"{profile_url}:refresh" - responses.add(responses.GET, dataset_url, json={}) - responses.add(responses.GET, profile_url, json=profile_json1) - responses.add(responses.POST, profile_refresh_url, json=[], status=204) - auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) - - dataset = unify.datasets.by_resource_id(dataset_id) - profile = dataset.profile() - assert profile._data == profile_json1 + +class TestDatasetProfile(TestCase): + @responses.activate + def test_dataset_profile(self): + auth = UsernamePasswordAuth("username", "password") + client = Client(auth) + + dataset_id = "3" + dataset_url = f"{client.protocol}://{client.host}:{client.port}/api/versioned/v1/datasets/{dataset_id}" + profile_url = f"{dataset_url}/profile" + responses.add(responses.GET, dataset_url, json={}) + responses.add(responses.GET, profile_url, json=self.profile_stale) + + dataset = client.datasets.by_resource_id(dataset_id) + profile = dataset.profile() + self.assertEqual(self.profile_stale["datasetName"], profile.dataset_name) + self.assertEqual( + self.profile_stale["relativeDatasetId"], profile.relative_dataset_id + ) 
+ self.assertEqual(self.profile_stale["isUpToDate"], profile.is_up_to_date) + self.assertEqual( + self.profile_stale["profiledDataVersion"], profile.profiled_data_version + ) + self.assertEqual(self.profile_stale["profiledAt"], profile.profiled_at) + self.assertEqual(self.profile_stale["simpleMetrics"], profile.simple_metrics) + self.assertEqual( + self.profile_stale["attributeProfiles"], profile.attribute_profiles + ) + + @responses.activate + def test_profile_refresh(self): + auth = UsernamePasswordAuth("username", "password") + client = Client(auth) + + dataset_id = "3" + dataset_url = f"{client.protocol}://{client.host}:{client.port}/api/versioned/v1/datasets/{dataset_id}" + profile_url = f"{dataset_url}/profile" + profile_refresh_url = f"{profile_url}:refresh" + responses.add(responses.GET, dataset_url, json={}) + responses.add(responses.GET, profile_url, json=self.profile_stale) + responses.add( + responses.POST, profile_refresh_url, json=self.operation_succeeded + ) + + dataset = client.datasets.by_resource_id(dataset_id) + profile = dataset.profile() + op = profile.refresh(poll_interval_seconds=0) + self.assertTrue(op.succeeded()) + + @responses.activate + def test_profile_create(self): + auth = UsernamePasswordAuth("username", "password") + client = Client(auth) + + dataset_id = "3" + dataset_url = f"{client.protocol}://{client.host}:{client.port}/api/versioned/v1/datasets/{dataset_id}" + profile_url = f"{dataset_url}/profile" + profile_refresh_url = f"{profile_url}:refresh" + responses.add(responses.GET, dataset_url, json={}) + # We need to ensure that, when creating the profile, + # nothing ever tries to access the (non-existent) profile. 
+ responses.add(responses.GET, profile_url, status=404) + responses.add( + responses.POST, profile_refresh_url, json=self.operation_succeeded + ) + + dataset = client.datasets.by_resource_id(dataset_id) + op = dataset.create_profile() + self.assertTrue(op.succeeded()) + + profile_stale = { + "datasetName": "ds3", + "relativeDatasetId": "v1/datasets/3", + "isUpToDate": False, + "profiledDataVersion": "3", + "profiledAt": { + "username": "system", + "time": "2019-06-05T14:23:25.860Z", + "version": "46", + }, + "simpleMetrics": [{"metricName": "rowCount", "metricValue": "1999"}], + "attributeProfiles": [ + { + "attributeName": "attribute1", + "simpleMetrics": [ + {"metricName": "distinctValueCount", "metricValue": "1999"} + ], + "mostFrequentValues": [ + {"value": "value1", "frequency": "1999", "percentFrequency": 1.0} + ], + } + ], + } + + operation_succeeded = { + "id": "1", + "type": "SPARK", + "description": "Synthetic Operation", + "status": { + "state": "SUCCEEDED", + "startTime": "2018-12-14T19:34:00.273Z", + "endTime": "2018-12-14T19:34:14.573Z", + "message": "", + }, + "created": { + "username": "admin", + "time": "2018-12-14T19:33:50.538Z", + "version": "390", + }, + "lastModified": { + "username": "system", + "time": "2018-12-14T19:34:15.200Z", + "version": "399", + }, + "relativeId": "operations/1", + } diff --git a/tests/unit/test_dataset_records.py b/tests/unit/test_dataset_records.py index 78536854..df385f83 100644 --- a/tests/unit/test_dataset_records.py +++ b/tests/unit/test_dataset_records.py @@ -1,21 +1,228 @@ +from functools import partial +import json +from unittest import TestCase + +from pandas import DataFrame import responses from tamr_unify_client import Client from tamr_unify_client.auth import UsernamePasswordAuth -@responses.activate -def test_dataset_records(): - dataset_id = "1" - dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/{dataset_id}" - records_url = f"{dataset_url}/records" - responses.add(responses.GET, 
dataset_url, json={}) - responses.add( - responses.GET, records_url, body='{"attribute1": 1}\n{"attribute1": 2}' - ) - auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) +class TestDatasetRecords(TestCase): + def setUp(self): + auth = UsernamePasswordAuth("username", "password") + self.tamr = Client(auth) + + @responses.activate + def test_get(self): + records_url = f"{self._dataset_url}/records" + responses.add(responses.GET, self._dataset_url, json={}) + responses.add( + responses.GET, + records_url, + body="\n".join([json.dumps(x) for x in self._records_json]), + ) + + dataset = self.tamr.datasets.by_resource_id(self._dataset_id) + records = list(dataset.records()) + self.assertListEqual(records, self._records_json) + + @responses.activate + def test_update(self): + def create_callback(request, snoop): + snoop["payload"] = list(request.body) + return 200, {}, json.dumps(self._response_json) + + responses.add(responses.GET, self._dataset_url, json={}) + dataset = self.tamr.datasets.by_resource_id(self._dataset_id) + + records_url = f"{self._dataset_url}:updateRecords" + updates = TestDatasetRecords.records_to_updates(self._records_json) + snoop = {} + responses.add_callback( + responses.POST, records_url, partial(create_callback, snoop=snoop) + ) + + response = dataset._update_records(updates) + self.assertEqual(response, self._response_json) + self.assertEqual(snoop["payload"], TestDatasetRecords.stringify(updates, False)) + + @responses.activate + def test_nan_update(self): + def create_callback(request, snoop, status): + snoop["payload"] = list(request.body) + return status, {}, json.dumps(self._response_json) + + responses.add(responses.GET, self._dataset_url, json={}) + dataset = self.tamr.datasets.by_resource_id(self._dataset_id) + + records_url = f"{self._dataset_url}:updateRecords" + updates = TestDatasetRecords.records_to_updates(self._nan_records_json) + snoop = {} + + responses.add_callback( + responses.POST, + records_url, + 
partial(create_callback, snoop=snoop, status=200), + ) + + # First call raises a ValueError and makes no request because NaN is not valid JSON + self.assertRaises(ValueError, lambda: dataset._update_records(updates)) + + # Second call has payload with NaN replaced by null and makes a successful request + response = dataset._update_records(updates, ignore_nan=True) + self.assertEqual(response, self._response_json) + self.assertEqual(snoop["payload"], TestDatasetRecords.stringify(updates, True)) + + @responses.activate + def test_upsert(self): + def create_callback(request, snoop): + snoop["payload"] = list(request.body) + return 200, {}, json.dumps(self._response_json) + + responses.add(responses.GET, self._dataset_url, json={}) + dataset = self.tamr.datasets.by_resource_id(self._dataset_id) + + records_url = f"{self._dataset_url}:updateRecords" + updates = TestDatasetRecords.records_to_updates(self._records_json) + snoop = {} + responses.add_callback( + responses.POST, records_url, partial(create_callback, snoop=snoop) + ) + + response = dataset.upsert_records(self._records_json, "attribute1") + self.assertEqual(response, self._response_json) + self.assertEqual(snoop["payload"], TestDatasetRecords.stringify(updates, False)) + + @responses.activate + def test_upsert_from_dataframe(self): + def create_callback(request, snoop): + snoop["payload"] = list(request.body) + return 200, {}, json.dumps(self._response_json) + + responses.add(responses.GET, self._dataset_url, json={}) + dataset = self.tamr.datasets.by_resource_id(self._dataset_id) + + records_url = f"{self._dataset_url}:updateRecords" + updates = TestDatasetRecords.records_to_updates(self._records_json) + snoop = {} + responses.add_callback( + responses.POST, records_url, partial(create_callback, snoop=snoop) + ) - dataset = unify.datasets.by_resource_id(dataset_id) - records = list(dataset.records()) - assert records == [{"attribute1": 1}, {"attribute1": 2}] + response = dataset.upsert_from_dataframe( + 
self._dataframe, primary_key_name="attribute1" + ) + self.assertEqual(response, self._response_json) + self.assertEqual(snoop["payload"], TestDatasetRecords.stringify(updates, False)) + + @responses.activate + def test_upsert_from_dataframe_nan(self): + def create_callback(request, snoop): + snoop["payload"] = list(request.body) + return 200, {}, json.dumps(self._response_json) + + responses.add(responses.GET, self._dataset_url, json={}) + dataset = self.tamr.datasets.by_resource_id(self._dataset_id) + + records_url = f"{self._dataset_url}:updateRecords" + updates = TestDatasetRecords.records_to_updates(self._null_records_json) + snoop = {} + responses.add_callback( + responses.POST, records_url, partial(create_callback, snoop=snoop) + ) + + response = dataset.upsert_from_dataframe( + self._dataframe_nan, primary_key_name="pk" + ) + self.assertEqual(response, self._response_json) + self.assertEqual(snoop["payload"], TestDatasetRecords.stringify(updates, True)) + + @responses.activate + def test_delete(self): + def create_callback(request, snoop): + snoop["payload"] = list(request.body) + return 200, {}, json.dumps(self._response_json) + + responses.add(responses.GET, self._dataset_url, json={}) + dataset = self.tamr.datasets.by_resource_id(self._dataset_id) + + records_url = f"{self._dataset_url}:updateRecords" + deletes = TestDatasetRecords.records_to_deletes(self._records_json) + snoop = {} + responses.add_callback( + responses.POST, records_url, partial(create_callback, snoop=snoop) + ) + + response = dataset.delete_records(self._records_json, "attribute1") + self.assertEqual(response, self._response_json) + self.assertEqual(snoop["payload"], TestDatasetRecords.stringify(deletes, False)) + + @responses.activate + def test_delete_ids(self): + def create_callback(request, snoop): + snoop["payload"] = list(request.body) + return 200, {}, json.dumps(self._response_json) + + responses.add(responses.GET, self._dataset_url, json={}) + dataset = 
self.tamr.datasets.by_resource_id(self._dataset_id) + + records_url = f"{self._dataset_url}:updateRecords" + deletes = TestDatasetRecords.records_to_deletes(self._records_json) + snoop = {} + responses.add_callback( + responses.POST, records_url, partial(create_callback, snoop=snoop) + ) + + ids = [r["attribute1"] for r in self._records_json] + response = dataset.delete_records_by_id(ids) + self.assertEqual(response, self._response_json) + self.assertEqual(snoop["payload"], TestDatasetRecords.stringify(deletes, False)) + + @responses.activate + def test_delete_all(self): + responses.add(responses.GET, self._dataset_url, json={}) + dataset = self.tamr.datasets.by_resource_id(self._dataset_id) + + responses.add(responses.DELETE, self._dataset_url + "/records", status=204) + response = dataset.delete_all_records() + self.assertEqual(response.status_code, 204) + + @staticmethod + def records_to_deletes(records): + return [ + {"action": "DELETE", "recordId": i} + for i, record in enumerate(records, start=1) + ] + + @staticmethod + def records_to_updates(records): + return [ + {"action": "CREATE", "recordId": i, "record": record} + for i, record in enumerate(records, start=1) + ] + + @staticmethod + def stringify(updates, ignore_nan): + nan_fill = "null" if ignore_nan else "NaN" + return [json.dumps(u).replace("NaN", nan_fill).encode("utf-8") for u in updates] + + _dataset_id = "1" + _dataset_url = f"http://localhost:9100/api/versioned/v1/datasets/{_dataset_id}" + + _records_json = [{"attribute1": 1}, {"attribute1": 2}] + _dataframe = DataFrame(_records_json, columns=["attribute1"], dtype=object) + _nan_records_json = [ + {"pk": 1, "attribute1": float("nan")}, + {"pk": 2, "attribute1": float("nan")}, + ] + _dataframe_nan = DataFrame( + _nan_records_json, columns=["pk", "attribute1"], dtype=object + ) + _null_records_json = [{"pk": 1, "attribute1": None}, {"pk": 2, "attribute1": None}] + _response_json = { + "numCommandsProcessed": 2, + "allCommandsSucceeded": True, + 
"validationErrors": [], + } diff --git a/tests/unit/test_dataset_status.py b/tests/unit/test_dataset_status.py index e31a04c4..93b6632c 100644 --- a/tests/unit/test_dataset_status.py +++ b/tests/unit/test_dataset_status.py @@ -1,5 +1,3 @@ -import json - import responses from tamr_unify_client import Client @@ -20,8 +18,8 @@ def test_dataset_status(): responses.add(responses.GET, dataset_url, json={}) responses.add(responses.GET, status_url, json=status_json) auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) + tamr = Client(auth) - dataset = unify.datasets.by_resource_id(dataset_id) + dataset = tamr.datasets.by_resource_id(dataset_id) status = dataset.status() assert status._data == status_json diff --git a/tests/unit/test_dataset_usage.py b/tests/unit/test_dataset_usage.py new file mode 100644 index 00000000..aa088532 --- /dev/null +++ b/tests/unit/test_dataset_usage.py @@ -0,0 +1,174 @@ +from unittest import TestCase + +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.dataset.resource import Dataset +from tamr_unify_client.dataset.usage import DatasetUsage +from tamr_unify_client.dataset.use import DatasetUse +from tamr_unify_client.project.step import ProjectStep + + +class TestUsage(TestCase): + def setUp(self): + auth = UsernamePasswordAuth("username", "password") + self.tamr = Client(auth) + + @responses.activate + def test_get_usage(self): + responses.add( + responses.GET, f"{self._base_url}/datasets/1/usage", json=self._usage_json + ) + u = Dataset(self.tamr, self._dataset_json).usage() + self.assertEqual(u._data, self._usage_json) + + def test_usage(self): + alias = "datasets/1/usage" + u = DatasetUsage(self.tamr, self._usage_json, alias) + self.assertEqual(u.usage._data, self._usage_json["usage"]) + self.assertEqual(u.relative_id, alias) + + udeps = u.dependencies + deps = [DatasetUse(self.tamr, dep) for dep in self._usage_json["dependencies"]] 
+ for i in range(len(deps)): + self.assertEqual(deps[i].dataset_id, udeps[i].dataset_id) + + @responses.activate + def test_use(self): + usage_json = self._usage_json["usage"] + u = DatasetUse(self.tamr, usage_json) + + responses.add( + responses.GET, f"{self._base_url}/datasets/1", json=self._dataset_json + ) + + self.assertEqual(u.dataset_id, usage_json["datasetId"]) + self.assertEqual(u.dataset_name, usage_json["datasetName"]) + + self.assertEqual(u.output_from_project_steps, []) + inputs = u.input_to_project_steps + step = ProjectStep(self.tamr, usage_json["inputToProjectSteps"][0]) + self.assertEqual(len(inputs), 1) + self.assertEqual(repr(inputs[0]), repr(step)) + + dataset = u.dataset() + self.assertEqual(dataset.relative_id, "datasets/1") + + @responses.activate + def test_project_step(self): + step_json = self._usage_json["usage"]["inputToProjectSteps"][0] + step = ProjectStep(self.tamr, step_json) + + self.assertEqual(step.project_step_id, step_json["projectStepId"]) + self.assertEqual(step.project_step_name, step_json["projectStepName"]) + self.assertEqual(step.project_name, step_json["projectName"]) + self.assertEqual(step.type, step_json["type"]) + + responses.add( + responses.GET, f"{self._base_url}/projects", json=self._projects_json + ) + project = step.project() + self.assertEqual(project.relative_id, self._projects_json[0]["relativeId"]) + + _base_url = "http://localhost:9100/api/versioned/v1" + + _dataset_json = { + "id": "unify://unified-data/v1/datasets/1", + "name": "myData.csv", + "description": "", + "version": "321", + "keyAttributeNames": ["pk"], + "tags": [], + "created": { + "username": "admin", + "time": "2019-07-08T20:15:06.818Z", + "version": "4", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-18T17:58:38.453Z", + "version": "6125", + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [], + "externalId": "myData.csv", + } + + _projects_json = [ + { + "id": "unify://unified-data/v1/projects/1", + "name": "My 
Project", + "description": "Categorization Project", + "type": "CATEGORIZATION", + "unifiedDatasetName": "", + "created": { + "username": "admin", + "time": "2019-07-12T13:08:17.440Z", + "version": "401", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:08:17.534Z", + "version": "402", + }, + "relativeId": "projects/1", + "externalId": "904bf89e-74ba-45c5-8b4a-5ff913728f66", + } + ] + + _usage_json = { + "usage": { + "datasetId": "unify://unified-data/v1/datasets/1", + "datasetName": "myData.csv", + "inputToProjectSteps": [ + { + "projectStepId": "unify://unified-data/v1/projectSteps/1", + "projectStepName": "My Project-SCHEMA_MAPPING", + "projectName": "My Project", + "type": "SCHEMA_MAPPING", + } + ], + "outputFromProjectSteps": [], + }, + "dependencies": [ + { + "datasetId": "unify://unified-data/v1/datasets/2", + "datasetName": "myData.csv_sample", + "inputToProjectSteps": [], + "outputFromProjectSteps": [], + }, + { + "datasetId": "unify://unified-data/v1/datasets/3", + "datasetName": "My Project - Unified Dataset", + "inputToProjectSteps": [ + { + "projectStepId": "unify://unified-data/v1/projectSteps/2", + "projectStepName": "My Project-SCHEMA_MAPPING_RECOMMENDATIONS", + "projectName": "My Project", + "type": "SCHEMA_MAPPING_RECOMMENDATIONS", + }, + { + "projectStepId": "unify://unified-data/v1/projectSteps/3", + "projectStepName": "My Project-CATEGORIZATION", + "projectName": "My Project", + "type": "CATEGORIZATION", + }, + ], + "outputFromProjectSteps": [ + { + "projectStepId": "unify://unified-data/v1/projectSteps/1", + "projectStepName": "My Project-SCHEMA_MAPPING", + "projectName": "My Project", + "type": "SCHEMA_MAPPING", + }, + { + "projectStepId": "unify://unified-data/v1/projectSteps/2", + "projectStepName": "My Project-SCHEMA_MAPPING_RECOMMENDATIONS", + "projectName": "MY Project", + "type": "SCHEMA_MAPPING_RECOMMENDATIONS", + }, + ], + }, + ], + } diff --git a/tests/unit/test_http_error.py b/tests/unit/test_http_error.py 
index b1a20246..93ccdde2 100644 --- a/tests/unit/test_http_error.py +++ b/tests/unit/test_http_error.py @@ -10,10 +10,10 @@ def test_http_error(): """Ensure that the client surfaces HTTP errors as exceptions. """ - endpoint = f"http://localhost:9100/api/versioned/v1/projects/1" + endpoint = "http://localhost:9100/api/versioned/v1/projects/1" responses.add(responses.GET, endpoint, status=401) auth = UsernamePasswordAuth("nonexistent-username", "invalid-password") - unify = Client(auth) + tamr = Client(auth) with raises(HTTPError) as e: - unify.projects.by_resource_id("1") + tamr.projects.by_resource_id("1") assert f"401 Client Error: Unauthorized for url: {endpoint}" in str(e) diff --git a/tests/unit/test_input_datasets_collection.py b/tests/unit/test_input_datasets_collection.py new file mode 100644 index 00000000..87b1492e --- /dev/null +++ b/tests/unit/test_input_datasets_collection.py @@ -0,0 +1,64 @@ +import pytest +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +@pytest.fixture +def client(): + auth = UsernamePasswordAuth("username", "password") + tamr = Client(auth) + return tamr + + +@responses.activate +def test_delete_by_resource_id(client): + input_collection = url_prefix + "projects/1/inputDatasets" + input_ds = input_collection + "/6" + + responses.add(responses.GET, mastering_project, json=mastering_project_config) + + responses.add(responses.GET, input_collection, json=input_ds_json) + responses.add(responses.DELETE, input_ds, status=204) + + input_ds_collection = client.projects.by_resource_id("1").input_datasets() + response = input_ds_collection.delete_by_resource_id("6") + assert response.status_code == 204 + + +url_prefix = "http://localhost:9100/api/versioned/v1/" +mastering_project = url_prefix + "projects/1" + +mastering_project_config = { + "name": "Project 1", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "Project 1 - Unified Dataset", + 
"externalId": "Project1", + "resourceId": "1", +} + +input_ds_json = [ + { + "id": "unify://unified-data/v1/datasets/6", + "name": "febrl_sample_2k.csv", + "description": "charlotte SM dataset", + "version": "5", + "keyAttributeNames": ["rec_id"], + "tags": [], + "created": { + "username": "admin", + "time": "2019-06-05T16:16:31.964Z", + "version": "35", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-19T17:44:42.369Z", + "version": "22919", + }, + "relativeId": "datasets/6", + "upstreamDatasetIds": [], + "externalId": "febrl_sample_2k.csv", + } +] diff --git a/tests/unit/test_operation.py b/tests/unit/test_operation.py new file mode 100644 index 00000000..f984fa95 --- /dev/null +++ b/tests/unit/test_operation.py @@ -0,0 +1,105 @@ +from urllib.parse import urljoin + +import pytest +from requests import HTTPError +import responses + + +from tamr_unify_client.operation import Operation + + +@pytest.fixture +def client(): + from tamr_unify_client import Client + from tamr_unify_client.auth import UsernamePasswordAuth + + auth = UsernamePasswordAuth("username", "password") + tamr = Client(auth) + return tamr + + +def full_url(client, endpoint): + return urljoin(client.origin + client.base_path, endpoint) + + +op_1_json = { + "id": "1", + "type": "SPARK", + "description": "Profiling [dataset] attributes.", + "status": { + "state": "SUCCEEDED", + "startTime": "2019-08-28T18:51:06.856Z", + "endTime": "2019-08-28T18:53:08.204Z", + "message": "", + }, + "created": { + "username": "admin", + "time": "2019-08-28T18:50:35.582Z", + "version": "17", + }, + "lastModified": { + "username": "system", + "time": "2019-08-28T18:53:08.950Z", + "version": "40", + }, + "relativeId": "operations/1", +} + + +def test_operation_from_json(client): + alias = "operations/123" + op1 = Operation.from_json(client, op_1_json, alias) + assert op1.api_path == alias + assert op1.relative_id == op_1_json["relativeId"] + assert op1.resource_id == "1" + assert op1.type == 
op_1_json["type"] + assert op1.description == op_1_json["description"] + assert op1.status == op_1_json["status"] + assert op1.state == "SUCCEEDED" + assert op1.succeeded + + +@responses.activate +def test_operation_from_resource_id(client): + responses.add(responses.GET, full_url(client, "operations/1"), json=op_1_json) + + op1 = Operation.from_resource_id(client, "1") + + assert op1.resource_id == "1" + assert op1.succeeded + + +@responses.activate +def test_operation_from_response(client): + responses.add(responses.GET, full_url(client, "operations/1"), json=op_1_json) + + op1 = Operation.from_response(client, client.get("operations/1").successful()) + + assert op1.resource_id == "1" + assert op1.succeeded + + +@responses.activate +def test_operation_from_response_noop(client): + responses.add(responses.GET, full_url(client, "operations/2"), status=204) + responses.add(responses.GET, full_url(client, "operations/-1"), status=404) + + op2 = Operation.from_response(client, client.get("operations/2").successful()) + + assert op2.api_path is not None + assert op2.relative_id is not None + assert op2.resource_id is not None + assert op2.type == "NOOP" + assert op2.description is not None + assert op2.status is not None + assert op2.state == "SUCCEEDED" + assert op2.succeeded + + op2a = op2.apply_options(asynchronous=True) + assert op2a.succeeded + + op2w = op2a.wait() + assert op2w.succeeded + + with pytest.raises(HTTPError): + op2w.poll() diff --git a/tests/unit/test_pair_counts.py b/tests/unit/test_pair_counts.py new file mode 100644 index 00000000..7dc7c652 --- /dev/null +++ b/tests/unit/test_pair_counts.py @@ -0,0 +1,114 @@ +from unittest import TestCase + +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.mastering.estimated_pair_counts import EstimatedPairCounts +from tamr_unify_client.mastering.project import MasteringProject +from tamr_unify_client.operation import 
Operation + + +class TestPairCounts(TestCase): + def setUp(self): + auth = UsernamePasswordAuth("username", "password") + self.tamr = Client(auth) + + @responses.activate + def test_get(self): + p = MasteringProject(self.tamr, self._project_json) + responses.add( + responses.GET, + f"{self._url_base}/{self._api_path}", + json=self._estimate_json, + ) + generated = p.estimate_pairs() + + created = EstimatedPairCounts.from_json( + self.tamr, self._estimate_json, self._api_path + ) + self.assertEqual(repr(generated), repr(created)) + + def test_properties(self): + estimate = EstimatedPairCounts.from_json( + self.tamr, self._estimate_json, self._api_path + ) + self.assertFalse(estimate.is_up_to_date) + self.assertEqual(estimate.total_estimate, self._estimate_json["totalEstimate"]) + self.assertEqual( + estimate.clause_estimates, self._estimate_json["clauseEstimates"] + ) + + @responses.activate + def test_refresh(self): + responses.add( + responses.POST, + f"{self._url_base}/{self._api_path}:refresh", + json=self._refresh_json, + ) + updated = self._refresh_json.copy() + updated["status"]["state"] = "SUCCEEDED" + responses.add(responses.GET, f"{self._url_base}/operations/24", json=updated) + + estimate = EstimatedPairCounts.from_json( + self.tamr, self._estimate_json, self._api_path + ) + generated = estimate.refresh(poll_interval_seconds=0) + + created = Operation.from_json(self.tamr, updated) + self.assertEqual(repr(generated), repr(created)) + + _url_base = "http://localhost:9100/api/versioned/v1" + _api_path = "projects/1/estimatedPairCounts" + + _project_json = { + "id": "unify://unified-data/v1/projects/1", + "name": "mastering", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "mastering_unified_dataset", + "created": { + "username": "admin", + "time": "2019-07-08T20:14:46.904Z", + "version": "20", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-08T20:18:13.629Z", + "version": "89", + }, + "relativeId": 
"projects/1", + "externalId": "1c2a10e5-e602-47ac-ade8-f9c23e49dfd2", + } + + _estimate_json = { + "isUpToDate": False, + "totalEstimate": {"candidatePairCount": "150", "generatedPairCount": "75"}, + "clauseEstimates": { + "clause1": {"candidatePairCount": "50", "generatedPairCount": "25"}, + "clause2": {"candidatePairCount": "100", "generatedPairCount": "50"}, + }, + } + + _refresh_json = { + "id": "24", + "type": "SPARK", + "description": "Generate Pair Estimates", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + }, + "created": { + "username": "admin", + "time": "2019-07-18T15:40:26.974Z", + "version": "1052", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-18T15:40:26.974Z", + "version": "1052", + }, + "relativeId": "operations/24", + } diff --git a/tests/unit/test_project.py b/tests/unit/test_project.py new file mode 100644 index 00000000..4d8148c0 --- /dev/null +++ b/tests/unit/test_project.py @@ -0,0 +1,517 @@ +from functools import partial +import json +from unittest import TestCase + +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.project.resource import Project + + +class TestProject(TestCase): + def setUp(self): + auth = UsernamePasswordAuth("username", "password") + self.tamr = Client(auth) + + @responses.activate + def test_project_add_input_dataset(self): + responses.add(responses.GET, self.datasets_url, json=self.dataset_json) + responses.add(responses.GET, self.projects_url, json=self.project_json) + responses.add( + responses.POST, + self.input_datasets_url, + json=self.post_input_datasets_json, + status=204, + ) + responses.add( + responses.GET, self.input_datasets_url, json=self.get_input_datasets_json + ) + + dataset = self.tamr.datasets.by_external_id(self.dataset_external_id) + project = self.tamr.projects.by_external_id(self.project_external_id) + 
project.add_input_dataset(dataset) + alias = project.api_path + "/inputDatasets" + input_datasets = project.client.get(alias).successful().json() + self.assertEqual(self.dataset_json, input_datasets) + + @responses.activate + def test_project_remove_input_dataset(self): + dataset_id = self.dataset_json[0]["relativeId"] + + responses.add(responses.GET, self.input_datasets_url, json=self.dataset_json) + responses.add( + responses.DELETE, f"{self.input_datasets_url}?id={dataset_id}", status=204 + ) + responses.add(responses.GET, self.input_datasets_url, json=[]) + + project = Project(self.tamr, self.project_json[0]) + dataset = next(project.input_datasets().stream()) + + response = project.remove_input_dataset(dataset) + self.assertEqual(response.status_code, 204) + + input_datasets = project.input_datasets() + self.assertEqual(list(input_datasets), []) + + @responses.activate + def test_project_by_external_id__raises_when_not_found(self): + responses.add(responses.GET, self.projects_url, json=[]) + with self.assertRaises(KeyError): + self.tamr.projects.by_external_id(self.project_external_id) + + @responses.activate + def test_project_by_external_id_succeeds(self): + responses.add(responses.GET, self.projects_url, json=self.project_json) + actual_project = self.tamr.projects.by_external_id(self.project_external_id) + self.assertEqual(self.project_json[0], actual_project._data) + + @responses.activate + def test_project_by_name__raises_when_not_found(self): + responses.add(responses.GET, self.project_list_url, json=[]) + auth = UsernamePasswordAuth("username", "password") + tamr = Client(auth) + with self.assertRaises(KeyError): + tamr.projects.by_name(self.project_name) + + @responses.activate + def test_project_by_name(self): + responses.add(responses.GET, self.project_list_url, json=self.project_json) + auth = UsernamePasswordAuth("username", "password") + tamr = Client(auth) + actual_project = tamr.projects.by_name(self.project_name) + assert actual_project._data 
== self.project_json[0] + + @responses.activate + def test_project_attributes_get(self): + responses.add(responses.GET, self.projects_url, json=self.project_json) + responses.add( + responses.GET, + self.project_attributes_url, + json=self.project_attributes_json, + ) + project = self.tamr.projects.by_external_id(self.project_external_id) + attributes = list(project.attributes) + self.assertEqual(len(self.project_attributes_json), len(attributes)) + + responses.add( + responses.GET, + self.project_attributes_url + "/id", + json=self.project_attributes_json[0], + ) + id_attribute = project.attributes.by_name("id") + self.assertEqual(self.project_attributes_json[0]["name"], id_attribute.name) + + @responses.activate + def test_project_attributes_post(self): + responses.add(responses.GET, self.projects_url, json=self.project_json) + responses.add( + responses.GET, + self.project_attributes_url, + json=self.project_attributes_json, + ) + responses.add( + responses.POST, + self.project_attributes_url, + json=self.project_attributes_json[0], + status=204, + ) + project = self.tamr.projects.by_external_id(self.project_external_id) + # project.attributes.create MUST make a POST request to self.project_attributes_url + # If it posts to some other URL, responses will raise an exception; + # If it does not post to any URL, responses will also raise an exception. 
+ project.attributes.create(self.project_attributes_json[0]) + + def test_project_get_input_datasets(self): + p = Project(self.tamr, self.project_json[0]) + datasets = p.input_datasets() + self.assertEqual(datasets.api_path, "projects/1/inputDatasets") + + @responses.activate + def test_return_attribute_collection(self): + responses.add(responses.GET, self.projects_url, json=self.project_json) + project = self.tamr.projects.by_external_id(self.project_external_id) + attribute_configs = project.attribute_configurations() + self.assertEqual( + attribute_configs.api_path, "projects/1/attributeConfigurations" + ) + + @responses.activate + def test_return_attribute_mapping(self): + responses.add(responses.GET, self.projects_url, json=self.project_json) + map_url = "http://localhost:9100/api/versioned/v1/projects/1/attributeMappings" + responses.add(responses.GET, map_url, json=self.mappings_json) + project = self.tamr.projects.by_external_id(self.project_external_id) + attribute_mappings = project.attribute_mappings() + self.assertEqual( + attribute_mappings.by_resource_id("19689-14").unified_dataset_name, + self.mappings_json[0]["unifiedDatasetName"], + ) + + @responses.activate + def test_update_project(self): + def create_callback(request, snoop): + snoop["payload"] = request.body + return 200, {}, json.dumps(self._updated_project_json) + + project_url = "http://localhost:9100/api/versioned/v1/projects/1" + snoop_dict = {} + responses.add_callback( + responses.PUT, project_url, partial(create_callback, snoop=snoop_dict) + ) + project = Project(self.tamr, self.project_json[0]) + + temp_spec = project.spec().with_name(self._updated_project_json["name"]) + new_project = ( + temp_spec.with_description(self._updated_project_json["description"]) + .with_external_id(self._updated_project_json["externalId"]) + .put() + ) + self.assertEqual(new_project.name, self._updated_project_json["name"]) + self.assertEqual( + new_project.description, 
self._updated_project_json["description"] + ) + self.assertEqual( + new_project.external_id, self._updated_project_json["externalId"] + ) + + self.assertEqual(json.loads(snoop_dict["payload"]), self._updated_project_json) + + self.assertEqual(project.name, self.project_json[0]["name"]) + self.assertEqual(project.description, self.project_json[0]["description"]) + self.assertEqual(project.external_id, self.project_json[0]["externalId"]) + + # test that intermediate didn't change + self.assertEqual( + temp_spec.to_dict()["description"], self.project_json[0]["description"] + ) + + dataset_external_id = "1" + datasets_url = f"http://localhost:9100/api/versioned/v1/datasets?filter=externalId=={dataset_external_id}" + dataset_json = [ + { + "id": "unify://unified-data/v1/datasets/1", + "externalId": "1", + "name": "dataset 1 name", + "description": "dataset 1 description", + "version": "dataset 1 version", + "keyAttributeNames": ["tamr_id"], + "tags": [], + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "dataset 1 created version", + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "dataset 1 modified version", + }, + "relativeId": "datasets/1", + "upstreamDatasetIds": [], + } + ] + project_json = [ + { + "id": "unify://unified-data/v1/projects/1", + "externalId": "project 1 external ID", + "name": "project 1 name", + "description": "project 1 description", + "type": "DEDUP", + "unifiedDatasetName": "project 1 unified dataset", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "project 1 created version", + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "project 1 modified version", + }, + "relativeId": "projects/1", + } + ] + project_name = "project 1 name" + project_external_id = "project 1 external ID" + projects_url = 
f"http://localhost:9100/api/versioned/v1/projects?filter=externalId=={project_external_id}" + project_list_url = "http://localhost:9100/api/versioned/v1/projects" + post_input_datasets_json = [] + input_datasets_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/inputDatasets" + ) + get_input_datasets_json = dataset_json + + project_attributes_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/attributes" + ) + project_attributes_json = [ + { + "name": "id", + "description": "identifier", + "type": {"baseType": "STRING"}, + "isNullable": False, + }, + { + "name": "name", + "description": "full name", + "type": {"baseType": "ARRAY", "innerType": {"baseType": "STRING"}}, + "isNullable": True, + }, + { + "name": "description", + "description": "human readable description", + "type": {"baseType": "ARRAY", "innerType": {"baseType": "STRING"}}, + "isNullable": True, + }, + ] + + mappings_json = [ + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19689-14", + "relativeId": "projects/1/attributeMappings/19689-14", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/suburb", + "relativeInputAttributeId": "datasets/6/attributes/suburb", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "suburb", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/suburb", + "relativeUnifiedAttributeId": "datasets/8/attributes/suburb", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "suburb", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19690-7", + "relativeId": "projects/1/attributeMappings/19690-7", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/sex", + "relativeInputAttributeId": "datasets/6/attributes/sex", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "sex", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/sex", + "relativeUnifiedAttributeId": "datasets/8/attributes/sex", + 
"unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "sex", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19691-18", + "relativeId": "projects/1/attributeMappings/19691-18", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/address_2", + "relativeInputAttributeId": "datasets/6/attributes/address_2", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "address_2", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/address_2", + "relativeUnifiedAttributeId": "datasets/8/attributes/address_2", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "address_2", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19692-8", + "relativeId": "projects/1/attributeMappings/19692-8", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/age", + "relativeInputAttributeId": "datasets/6/attributes/age", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "age", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/age", + "relativeUnifiedAttributeId": "datasets/8/attributes/age", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "age", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19693-6", + "relativeId": "projects/1/attributeMappings/19693-6", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/culture", + "relativeInputAttributeId": "datasets/6/attributes/culture", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "culture", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/culture", + "relativeUnifiedAttributeId": "datasets/8/attributes/culture", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "culture", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19694-16", + "relativeId": "projects/1/attributeMappings/19694-16", + 
"inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/street_number", + "relativeInputAttributeId": "datasets/6/attributes/street_number", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "street_number", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/street_number", + "relativeUnifiedAttributeId": "datasets/8/attributes/street_number", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "street_number", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19695-15", + "relativeId": "projects/1/attributeMappings/19695-15", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/postcode", + "relativeInputAttributeId": "datasets/6/attributes/postcode", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "postcode", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/postcode", + "relativeUnifiedAttributeId": "datasets/8/attributes/postcode", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "postcode", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19696-19", + "relativeId": "projects/1/attributeMappings/19696-19", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/phone_number", + "relativeInputAttributeId": "datasets/6/attributes/phone_number", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "phone_number", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/phone_number", + "relativeUnifiedAttributeId": "datasets/8/attributes/phone_number", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "phone_number", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19697-20", + "relativeId": "projects/1/attributeMappings/19697-20", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/soc_sec_id", + "relativeInputAttributeId": 
"datasets/6/attributes/soc_sec_id", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "soc_sec_id", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/soc_sec_id", + "relativeUnifiedAttributeId": "datasets/8/attributes/soc_sec_id", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "soc_sec_id", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19698-5", + "relativeId": "projects/1/attributeMappings/19698-5", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/rec2_id", + "relativeInputAttributeId": "datasets/6/attributes/rec2_id", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "rec2_id", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/rec2_id", + "relativeUnifiedAttributeId": "datasets/8/attributes/rec2_id", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "rec2_id", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19699-9", + "relativeId": "projects/1/attributeMappings/19699-9", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/date_of_birth", + "relativeInputAttributeId": "datasets/6/attributes/date_of_birth", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "date_of_birth", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/date_of_birth", + "relativeUnifiedAttributeId": "datasets/8/attributes/date_of_birth", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "date_of_birth", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19700-10", + "relativeId": "projects/1/attributeMappings/19700-10", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/title", + "relativeInputAttributeId": "datasets/6/attributes/title", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "title", + "unifiedAttributeId": 
"unify://unified-data/v1/datasets/8/attributes/title", + "relativeUnifiedAttributeId": "datasets/8/attributes/title", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "title", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19701-17", + "relativeId": "projects/1/attributeMappings/19701-17", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/address_1", + "relativeInputAttributeId": "datasets/6/attributes/address_1", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "address_1", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/address_1", + "relativeUnifiedAttributeId": "datasets/8/attributes/address_1", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "address_1", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19702-4", + "relativeId": "projects/1/attributeMappings/19702-4", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/rec_id", + "relativeInputAttributeId": "datasets/6/attributes/rec_id", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "rec_id", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/rec_id", + "relativeUnifiedAttributeId": "datasets/8/attributes/rec_id", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "rec_id", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19703-13", + "relativeId": "projects/1/attributeMappings/19703-13", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/state", + "relativeInputAttributeId": "datasets/6/attributes/state", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "state", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/state", + "relativeUnifiedAttributeId": "datasets/8/attributes/state", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "state", + }, + { + 
"id": "unify://unified-data/v1/projects/1/attributeMappings/19704-22", + "relativeId": "projects/1/attributeMappings/19704-22", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/family_role", + "relativeInputAttributeId": "datasets/6/attributes/family_role", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "family_role", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/family_role", + "relativeUnifiedAttributeId": "datasets/8/attributes/family_role", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "family_role", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19705-21", + "relativeId": "projects/1/attributeMappings/19705-21", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/blocking_number", + "relativeInputAttributeId": "datasets/6/attributes/blocking_number", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "blocking_number", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/blocking_number", + "relativeUnifiedAttributeId": "datasets/8/attributes/blocking_number", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "blocking_number", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19706-12", + "relativeId": "projects/1/attributeMappings/19706-12", + "inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/surname", + "relativeInputAttributeId": "datasets/6/attributes/surname", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "surname", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/surname", + "relativeUnifiedAttributeId": "datasets/8/attributes/surname", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "surname", + }, + { + "id": "unify://unified-data/v1/projects/1/attributeMappings/19707-11", + "relativeId": "projects/1/attributeMappings/19707-11", + 
"inputAttributeId": "unify://unified-data/v1/datasets/6/attributes/given_name", + "relativeInputAttributeId": "datasets/6/attributes/given_name", + "inputDatasetName": "febrl_sample_2k.csv", + "inputAttributeName": "given_name", + "unifiedAttributeId": "unify://unified-data/v1/datasets/8/attributes/given_name", + "relativeUnifiedAttributeId": "datasets/8/attributes/given_name", + "unifiedDatasetName": "Project_1_unified_dataset", + "unifiedAttributeName": "given_name", + }, + ] + _updated_project_json = { + "id": "unify://unified-data/v1/projects/1", + "externalId": "new external ID", + "name": "Renamed!", + "description": "project 1 description is more descriptive", + "type": "DEDUP", + "unifiedDatasetName": "project 1 unified dataset", + "created": { + "username": "admin", + "time": "2018-09-10T16:06:20.636Z", + "version": "project 1 created version", + }, + "lastModified": { + "username": "admin", + "time": "2018-09-10T16:06:20.851Z", + "version": "project 1 modified version", + }, + "relativeId": "projects/1", + } diff --git a/tests/unit/test_project_add_dataset.py b/tests/unit/test_project_add_dataset.py deleted file mode 100644 index 444e1426..00000000 --- a/tests/unit/test_project_add_dataset.py +++ /dev/null @@ -1,81 +0,0 @@ -from functools import partial - -import pytest -import responses - -from tamr_unify_client import Client -from tamr_unify_client.auth import UsernamePasswordAuth - -dataset_json = [ - { - "id": "unify://unified-data/v1/datasets/1", - "externalId": "1", - "name": "dataset 1 name", - "description": "dataset 1 description", - "version": "dataset 1 version", - "keyAttributeNames": ["tamr_id"], - "tags": [], - "created": { - "username": "admin", - "time": "2018-09-10T16:06:20.636Z", - "version": "dataset 1 created version", - }, - "lastModified": { - "username": "admin", - "time": "2018-09-10T16:06:20.851Z", - "version": "dataset 1 modified version", - }, - "relativeId": "datasets/1", - "upstreamDatasetIds": [], - } -] - 
-dataset_external_id = "1" -datasets_url = f"http://localhost:9100/api/versioned/v1/datasets?filter=externalId=={dataset_external_id}" - -project_json = [ - { - "id": "unify://unified-data/v1/projects/1", - "externalId": "1", - "name": "project 1 name", - "description": "project 1 description", - "type": "DEDUP", - "unifiedDatasetName": "project 1 unified dataset", - "created": { - "username": "admin", - "time": "2018-09-10T16:06:20.636Z", - "version": "project 1 created version", - }, - "lastModified": { - "username": "admin", - "time": "2018-09-10T16:06:20.851Z", - "version": "project 1 modified version", - }, - "relativeId": "projects/1", - } -] - -project_external_id = "1" -projects_url = f"http://localhost:9100/api/versioned/v1/projects?filter=externalId=={project_external_id}" - -post_input_datasets_json = [] -input_datasets_url = f"http://localhost:9100/api/versioned/v1/projects/1/inputDatasets" -get_input_datasets_json = dataset_json - - -@responses.activate -def test_project_add_source_dataset(): - responses.add(responses.GET, datasets_url, json=dataset_json) - responses.add(responses.GET, projects_url, json=project_json) - responses.add( - responses.POST, input_datasets_url, json=post_input_datasets_json, status=204 - ) - responses.add(responses.GET, input_datasets_url, json=get_input_datasets_json) - auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) - dataset = unify.datasets.by_external_id(dataset_external_id) - project = unify.projects.by_external_id(project_external_id) - project.add_source_dataset(dataset) - alias = project.api_path + "/inputDatasets" - input_datasets = project.client.get(alias).successful().json() - assert input_datasets == dataset_json diff --git a/tests/unit/test_project_by_external_id.py b/tests/unit/test_project_by_external_id.py deleted file mode 100644 index 571c960c..00000000 --- a/tests/unit/test_project_by_external_id.py +++ /dev/null @@ -1,50 +0,0 @@ -import json - -import pytest -import responses 
- -from tamr_unify_client import Client -from tamr_unify_client.auth import UsernamePasswordAuth - -project_json = [ - { - "id": "unify://unified-data/v1/projects/1", - "externalId": "project 1 external ID", - "name": "project 1 name", - "description": "project 1 description", - "type": "DEDUP", - "unifiedDatasetName": "project 1 unified dataset", - "created": { - "username": "admin", - "time": "2018-09-10T16:06:20.636Z", - "version": "project 1 created version", - }, - "lastModified": { - "username": "admin", - "time": "2018-09-10T16:06:20.851Z", - "version": "project 1 modified version", - }, - "relativeId": "projects/1", - } -] - -project_external_id = "project 1 external ID" -projects_url = f"http://localhost:9100/api/versioned/v1/projects?filter=externalId=={project_external_id}" - - -@responses.activate -def test_project_by_external_id__raises_when_not_found(): - responses.add(responses.GET, projects_url, json=[]) - auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) - with pytest.raises(KeyError): - unify.projects.by_external_id(project_external_id) - - -@responses.activate -def test_project_by_external_id_succeeds(): - responses.add(responses.GET, projects_url, json=project_json) - auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) - actual_project = unify.projects.by_external_id(project_external_id) - assert actual_project._data == project_json[0] diff --git a/tests/unit/test_published_cluster_version.py b/tests/unit/test_published_cluster_version.py new file mode 100644 index 00000000..80fc1623 --- /dev/null +++ b/tests/unit/test_published_cluster_version.py @@ -0,0 +1,268 @@ +from functools import partial +import json +from unittest import TestCase + +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.mastering.published_cluster.metric import Metric +from tamr_unify_client.mastering.published_cluster.record import 
RecordPublishedCluster +from tamr_unify_client.mastering.published_cluster.record_version import ( + RecordPublishedClusterVersion, +) +from tamr_unify_client.mastering.published_cluster.resource import PublishedCluster +from tamr_unify_client.mastering.published_cluster.version import ( + PublishedClusterVersion, +) +from tamr_unify_client.project.resource import Project + + +class PublishedClusterTest(TestCase): + def setUp(self): + auth = UsernamePasswordAuth("username", "password") + self.tamr = Client(auth) + + def test_metric(self): + metric_json = {"metricName": "recordCount", "metricValue": "1"} + m = Metric(metric_json) + self.assertEqual(m.name, metric_json["metricName"]) + self.assertEqual(m.value, metric_json["metricValue"]) + + def test_cluster_version(self): + version_json = self._versions_json[0]["versions"][0] + version = PublishedClusterVersion(version_json) + + self.assertEqual(version.version, version_json["version"]) + self.assertEqual(version.timestamp, version_json["timestamp"]) + self.assertEqual(version.name, version_json["name"]) + self.assertEqual(version.record_ids, version_json["recordIds"]) + + metrics = [Metric(m) for m in version_json["metrics"]] + for actual, expected in zip(version.metrics, metrics): + self.assertEqual(actual.__repr__(), expected.__repr__()) + + def test_record_cluster_version(self): + version_json = self._record_versions_json[0]["versions"][0] + version = RecordPublishedClusterVersion(version_json) + + self.assertEqual(version.version, version_json["version"]) + self.assertEqual(version.timestamp, version_json["timestamp"]) + self.assertEqual(version.cluster_id, version_json["clusterId"]) + + def test_record_cluster(self): + record_json = self._record_versions_json[0] + record = RecordPublishedCluster(record_json) + versions = record.versions + expected_versions = [ + RecordPublishedClusterVersion(v) for v in record_json["versions"] + ] + + self.assertEqual(record.entity_id, record_json["entityId"]) + 
self.assertEqual(record.origin_entity_id, record_json["originEntityId"]) + self.assertEqual(record.origin_source_id, record_json["originSourceId"]) + self.assertEqual(record.source_id, record_json["sourceId"]) + self.assertEqual(len(versions), len(expected_versions)) + for actual, expected in zip(versions, expected_versions): + self.assertEqual(actual.__repr__(), expected.__repr__()) + + def test_cluster(self): + cluster_json = self._versions_json[0] + cluster = PublishedCluster(cluster_json) + versions = cluster.versions + expected_versions = [ + PublishedClusterVersion(v) for v in cluster_json["versions"] + ] + + self.assertEqual(cluster.id, cluster_json["id"]) + self.assertEqual(len(versions), len(expected_versions)) + for actual, expected in zip(versions, expected_versions): + self.assertEqual(actual.__repr__(), expected.__repr__()) + + @responses.activate + def test_get_versions(self): + def create_callback(request, snoop): + snoop["payload"] = request.body + return 200, {}, "\n".join(json.dumps(c) for c in self._versions_json) + + p = Project.from_json(self.tamr, self._project_json).as_mastering() + post_url = f"http://localhost:9100/api/versioned/v1/{p.api_path}/publishedClusterVersions" + snoop = {} + responses.add_callback( + responses.POST, post_url, partial(create_callback, snoop=snoop) + ) + + clusters = list(p.published_cluster_versions(self._cluster_ids)) + expected_clusters = [PublishedCluster(c) for c in self._versions_json] + + self.assertEqual( + snoop["payload"], "\n".join([json.dumps(i) for i in self._cluster_ids]) + ) + self.assertEqual(len(clusters), len(expected_clusters)) + for actual, expected in zip(clusters, expected_clusters): + self.assertEqual(actual.__repr__(), expected.__repr__()) + self.assertEqual(len(actual.versions), len(expected.versions)) + + @responses.activate + def test_get_record_versions(self): + def create_callback(request, snoop): + snoop["payload"] = request.body + return 200, {}, "\n".join(json.dumps(c) for c in 
self._record_versions_json) + + p = Project.from_json(self.tamr, self._project_json).as_mastering() + base_url = "http://localhost:9100/api/versioned/v1" + post_url = f"{base_url}/{p.api_path}/recordPublishedClusterVersions" + snoop = {} + responses.add_callback( + responses.POST, post_url, partial(create_callback, snoop=snoop) + ) + + clusters = list(p.record_published_cluster_versions(self._record_ids)) + expected_clusters = [ + RecordPublishedCluster(c) for c in self._record_versions_json + ] + + self.assertEqual( + snoop["payload"], "\n".join([json.dumps(i) for i in self._record_ids]) + ) + self.assertEqual(len(clusters), len(expected_clusters)) + for actual, expected in zip(clusters, expected_clusters): + self.assertEqual(actual.__repr__(), expected.__repr__()) + self.assertEqual(len(actual.versions), len(expected.versions)) + + _project_json = { + "id": "unify://unified-data/v1/projects/1", + "name": "Test Project", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "", + "created": { + "username": "admin", + "time": "2019-07-12T13:08:17.440Z", + "version": "401", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:08:17.534Z", + "version": "402", + }, + "relativeId": "projects/1", + "externalId": "904bf89e-74ba-45c5-8b4a-5ff913728f66", + } + + _cluster_ids = [ + "055908e7-2144-3f46-ba21-4c2e58816228", + "ca68d64b-755e-32b7-a785-5f9b1f51e420", + ] + + _versions_json = [ + { + "id": "055908e7-2144-3f46-ba21-4c2e58816228", + "versions": [ + { + "version": 324, + "timestamp": "2019-07-17T15:48:40.171Z", + "name": "cluster 1", + "metrics": [ + {"metricName": "recordCount", "metricValue": "2"}, + {"metricName": "totalSpend", "metricValue": "0.0"}, + {"metricName": "verifiedRecordCount", "metricValue": "0"}, + { + "metricName": "averageLinkage", + "metricValue": "0.7626373626373626", + }, + ], + "recordIds": [ + { + "entityId": "6084737977926081128", + "originSourceId": "dataset_name", + "originEntityId": "82049", 
+ }, + { + "entityId": "-3832930559140320929", + "originSourceId": "dataset_name", + "originEntityId": "80455", + }, + ], + }, + { + "version": 323, + "timestamp": "2019-07-15T15:48:40.171Z", + "name": "cluster 1", + "metrics": [ + {"metricName": "recordCount", "metricValue": "1"}, + {"metricName": "totalSpend", "metricValue": "0.0"}, + {"metricName": "verifiedRecordCount", "metricValue": "0"}, + { + "metricName": "averageLinkage", + "metricValue": "0.7626373626373626", + }, + ], + "recordIds": [ + { + "entityId": "6084737977926081128", + "originSourceId": "dataset_name", + "originEntityId": "82049", + } + ], + }, + ], + }, + { + "id": "ca68d64b-755e-32b7-a785-5f9b1f51e420", + "versions": [ + { + "version": 324, + "timestamp": "2019-07-17T15:48:40.171Z", + "name": "cluster 2", + "metrics": [ + {"metricName": "recordCount", "metricValue": "1"}, + {"metricName": "totalSpend", "metricValue": "0.0"}, + {"metricName": "verifiedRecordCount", "metricValue": "0"}, + { + "metricName": "averageLinkage", + "metricValue": "0.7582417582417584", + }, + ], + "recordIds": [ + { + "entityId": "-4650342988873587155", + "originSourceId": "dataset_name", + "originEntityId": "63730", + } + ], + } + ], + }, + ] + + _record_ids = ["6084737977926081128", "-4650342988873587155"] + + _record_versions_json = [ + { + "entityId": "-4650342988873587155", + "sourceId": "mastering_unified_dataset", + "originEntityId": "63730", + "originSourceId": "Acme_online.csv", + "versions": [ + { + "version": 323, + "timestamp": "2019-07-17T15:48:40.170Z", + "clusterId": "ca68d64b-755e-32b7-a785-5f9b1f51e420", + } + ], + }, + { + "entityId": "6084737977926081128", + "sourceId": "mastering_unified_dataset", + "originEntityId": "82049", + "originSourceId": "Acme_online.csv", + "versions": [ + { + "version": 323, + "timestamp": "2019-07-17T15:48:40.170Z", + "clusterId": "055908e7-2144-3f46-ba21-4c2e58816228", + } + ], + }, + ] diff --git a/tests/unit/test_published_clusters.py 
b/tests/unit/test_published_clusters.py new file mode 100644 index 00000000..1ca2303c --- /dev/null +++ b/tests/unit/test_published_clusters.py @@ -0,0 +1,231 @@ +from functools import partial +import json +from unittest import TestCase + +from requests import HTTPError +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.mastering.published_cluster.configuration import ( + PublishedClustersConfiguration, +) +from tamr_unify_client.project.resource import Project + + +class PublishedClusterTest(TestCase): + def setUp(self): + auth = UsernamePasswordAuth("username", "password") + self.tamr = Client(auth) + + @responses.activate + def test_published_clusters(self): + project_id = "1" + project_url = f"{self._base_url}/projects/{project_id}" + unified_dataset_url = f"{self._base_url}/projects/{project_id}/unifiedDataset" + datasets_url = f"{self._base_url}/datasets" + refresh_url = ( + f"{self._base_url}/projects/{project_id}/publishedClusters:refresh" + ) + operations_url = f"{self._base_url}/operations/93" + + responses.add(responses.GET, project_url, json=self._project_config_json) + responses.add( + responses.GET, unified_dataset_url, json=self._unified_dataset_json + ) + responses.add(responses.GET, datasets_url, json=self._datasets_json) + responses.add(responses.POST, refresh_url, json=self._refresh_json) + responses.add(responses.GET, operations_url, json=self._operations_json) + project = self.tamr.projects.by_resource_id(project_id) + actual_published_clusters_dataset = project.as_mastering().published_clusters() + actual_published_clusters_dataset.refresh(poll_interval_seconds=0) + self.assertEqual( + actual_published_clusters_dataset.name, + self._published_clusters_json["name"], + ) + + @responses.activate + def test_published_clusters_configuration(self): + path = "projects/1/publishedClustersConfiguration" + config_url = f"{self._base_url}/{path}" + 
responses.add(responses.GET, config_url, json=self._config_json) + + p = Project(self.tamr, self._project_config_json).as_mastering() + config = p.published_clusters_configuration() + created = PublishedClustersConfiguration.from_json( + self.tamr, self._config_json, path + ) + + self.assertEqual(repr(config), repr(created)) + self.assertEqual( + config.versions_time_to_live, self._config_json["versionsTimeToLive"] + ) + + @responses.activate + def test_delete_published_clusters_configuration(self): + path = "projects/1/publishedClustersConfiguration" + config_url = f"{self._base_url}/{path}" + responses.add(responses.GET, config_url, json=self._config_json) + responses.add(responses.DELETE, config_url, status=405) + + p = Project(self.tamr, self._project_config_json).as_mastering() + config = p.published_clusters_configuration() + self.assertRaises(HTTPError, config.delete) + + @responses.activate + def test_update_published_clusters_configuration(self): + def create_callback(request, snoop): + snoop["payload"] = request.body + return 200, {}, json.dumps({"versionsTimeToLive": self.update_info}) + + path = "projects/1/publishedClustersConfiguration" + url = f"http://localhost:9100/api/versioned/v1/{path}" + snoop_dict = {} + responses.add_callback( + responses.PUT, url, partial(create_callback, snoop=snoop_dict) + ) + + clusters = PublishedClustersConfiguration(self.tamr, self._config_json, path) + new_cluster = clusters.spec().with_versions_time_to_live(self.update_info).put() + self.assertEqual(new_cluster._data, {"versionsTimeToLive": "PT100H"}) + + @responses.activate + def test_refresh_ids(self): + unified_dataset_url = f"{self._base_url}/projects/1/unifiedDataset" + datasets_url = f"{self._base_url}/datasets" + refresh_url = f"{self._base_url}/projects/1/allPublishedClusterIds:refresh" + + responses.add( + responses.GET, unified_dataset_url, json=self._unified_dataset_json + ) + responses.add(responses.GET, datasets_url, json=self._datasets_json) + 
responses.add(responses.POST, refresh_url, json=self._operations_json) + + p = Project(self.tamr, self._project_config_json).as_mastering() + d = p.published_cluster_ids() + + op = d.refresh(poll_interval_seconds=0) + self.assertEqual(op.resource_id, self._operations_json["id"]) + self.assertTrue(op.succeeded()) + + @responses.activate + def test_refresh_stats(self): + unified_dataset_url = f"{self._base_url}/projects/1/unifiedDataset" + datasets_url = f"{self._base_url}/datasets" + refresh_url = f"{self._base_url}/projects/1/publishedClusterStats:refresh" + + responses.add( + responses.GET, unified_dataset_url, json=self._unified_dataset_json + ) + responses.add(responses.GET, datasets_url, json=self._datasets_json) + responses.add(responses.POST, refresh_url, json=self._operations_json) + + p = Project(self.tamr, self._project_config_json).as_mastering() + d = p.published_cluster_stats() + + op = d.refresh(poll_interval_seconds=0) + self.assertEqual(op.resource_id, self._operations_json["id"]) + self.assertTrue(op.succeeded()) + + _base_url = "http://localhost:9100/api/versioned/v1" + + _project_config_json = { + "id": "unify://unified-data/v1/projects/1", + "name": "Project_1", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "Project_1_unified_dataset", + "relativeId": "projects/1", + "externalId": "32b99cab-e01b-41e7-a29d-509165242c6f", + } + + _unified_dataset_json = { + "id": "unify://unified-data/v1/datasets/8", + "name": "Project_1_unified_dataset", + "version": "10", + "relativeId": "datasets/8", + "externalId": "Project_1_unified_dataset", + } + + _published_clusters_json = { + "id": "unify://unified-data/v1/datasets/32", + "name": "Project_1_unified_dataset_dedup_published_clusters", + "description": "All the mappings of records to clusters.", + "version": "253", + "relativeId": "datasets/32", + "externalId": "Project_1_unified_dataset_dedup_published_clusters", + } + + _published_stats_json = { + "id": 
"unify://unified-data/v1/datasets/33", + "name": "Project_1_unified_dataset_dedup_published_cluster_stats", + "description": "Published cluster stats", + "version": "253", + "relativeId": "datasets/33", + "externalId": "Project_1_unified_dataset_dedup_published_cluster_stats", + } + + _published_ids_json = { + "id": "unify://unified-data/v1/datasets/34", + "name": "Project_1_unified_dataset_dedup_all_persistent_ids", + "description": "All previously and currently published cluster IDs", + "version": "253", + "relativeId": "datasets/34", + "externalId": "Project_1_unified_dataset_dedup_all_persistent_ids", + } + + _datasets_json = [ + _unified_dataset_json, + _published_clusters_json, + _published_stats_json, + _published_ids_json, + ] + + _refresh_json = { + "id": "93", + "type": "SPARK", + "description": "Publish clusters", + "status": { + "state": "PENDING", + "startTime": "", + "endTime": "", + "message": "Job has not yet been submitted to Spark", + }, + "created": { + "username": "admin", + "time": "2019-06-24T15:58:48.734Z", + "version": "2407", + }, + "lastModified": { + "username": "admin", + "time": "2019-06-24T15:58:48.734Z", + "version": "2407", + }, + "relativeId": "operations/93", + } + + _operations_json = { + "id": "93", + "type": "SPARK", + "description": "Publish clusters", + "status": { + "state": "SUCCEEDED", + "startTime": "2019-06-24T15:58:56.595Z", + "endTime": "2019-06-24T15:59:17.084Z", + }, + "created": { + "username": "admin", + "time": "2019-06-24T15:58:48.734Z", + "version": "2407", + }, + "lastModified": { + "username": "system", + "time": "2019-06-24T15:59:18.350Z", + "version": "2423", + }, + "relativeId": "operations/93", + } + + _config_json = {"versionsTimeToLive": "P4D"} + + update_info = "PT100H" diff --git a/tests/unit/test_published_clusters_with_data.py b/tests/unit/test_published_clusters_with_data.py new file mode 100644 index 00000000..a40d927f --- /dev/null +++ b/tests/unit/test_published_clusters_with_data.py @@ -0,0 +1,79 
@@ +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +@responses.activate +def test_published_clusters_with_data(): + project_config = { + "name": "Project 1", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "Project_1_unified_dataset", + "externalId": "Project1", + "resourceId": "1", + } + + unified_dataset_json = { + "id": "unify://unified-data/v1/datasets/8", + "name": "Project_1_unified_dataset", + "version": "10", + "relativeId": "datasets/8", + "externalId": "Project_1_unified_dataset", + } + + pcwd_json = { + "externalId": "1", + "id": "unify://unified-data/v1/datasets/36", + "name": "Project_1_unified_dataset_dedup_published_clusters_with_data", + "relativeId": "datasets/36", + "version": "251", + } + + refresh_json = { + "id": "93", + "type": "SPARK", + "description": "Publish clusters", + "status": { + "state": "SUCCEEDED", + "startTime": "2019-06-24T15:58:56.595Z", + "endTime": "2019-06-24T15:59:17.084Z", + }, + "created": { + "username": "admin", + "time": "2019-06-24T15:58:48.734Z", + "version": "2407", + }, + "lastModified": { + "username": "system", + "time": "2019-06-24T15:59:18.350Z", + "version": "2423", + }, + "relativeId": "operations/93", + } + + datasets_json = [pcwd_json] + + tamr = Client(UsernamePasswordAuth("username", "password")) + + project_id = "1" + + project_url = f"http://localhost:9100/api/versioned/v1/projects/{project_id}" + unified_dataset_url = ( + f"http://localhost:9100/api/versioned/v1/projects/{project_id}/unifiedDataset" + ) + datasets_url = "http://localhost:9100/api/versioned/v1/datasets" + refresh_url = project_url + "/publishedClustersWithData:refresh" + + responses.add(responses.GET, project_url, json=project_config) + responses.add(responses.GET, unified_dataset_url, json=unified_dataset_json) + responses.add(responses.GET, datasets_url, json=datasets_json) + responses.add(responses.POST, refresh_url, 
json=refresh_json) + + project = tamr.projects.by_resource_id(project_id) + actual_pcwd_dataset = project.as_mastering().published_clusters_with_data() + assert actual_pcwd_dataset.name == pcwd_json["name"] + + op = actual_pcwd_dataset.refresh(poll_interval_seconds=0) + assert op.succeeded() diff --git a/tests/unit/test_record_clusters_with_data.py b/tests/unit/test_record_clusters_with_data.py new file mode 100644 index 00000000..d7824d13 --- /dev/null +++ b/tests/unit/test_record_clusters_with_data.py @@ -0,0 +1,80 @@ +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +@responses.activate +def test_record_clusters_with_data(): + + project_config = { + "name": "Project 1", + "description": "Mastering Project", + "type": "DEDUP", + "unifiedDatasetName": "Project 1 - Unified Dataset", + "externalId": "Project1", + "resourceId": "1", + } + + unified_dataset_json = { + "id": "unify://unified-data/v1/datasets/8", + "name": "Project_1_unified_dataset", + "version": "10", + "relativeId": "datasets/8", + "externalId": "Project_1_unified_dataset", + } + + rcwd_json = { + "externalId": "1", + "id": "unify://unified-data/v1/datasets/36", + "name": "Project_1_unified_dataset_dedup_clusters_with_data", + "relativeId": "datasets/36", + "version": "251", + } + + refresh_json = { + "id": "93", + "type": "SPARK", + "description": "Clustering", + "status": { + "state": "SUCCEEDED", + "startTime": "2019-06-24T15:58:56.595Z", + "endTime": "2019-06-24T15:59:17.084Z", + }, + "created": { + "username": "admin", + "time": "2019-06-24T15:58:48.734Z", + "version": "2407", + }, + "lastModified": { + "username": "system", + "time": "2019-06-24T15:59:18.350Z", + "version": "2423", + }, + "relativeId": "operations/93", + } + + datasets_json = [rcwd_json] + + tamr = Client(UsernamePasswordAuth("username", "password")) + + project_id = "1" + + project_url = f"http://localhost:9100/api/versioned/v1/projects/{project_id}" + 
unified_dataset_url = ( + f"http://localhost:9100/api/versioned/v1/projects/{project_id}/unifiedDataset" + ) + datasets_url = "http://localhost:9100/api/versioned/v1/datasets" + refresh_url = project_url + "/recordClustersWithData:refresh" + + responses.add(responses.GET, project_url, json=project_config) + responses.add(responses.GET, unified_dataset_url, json=unified_dataset_json) + responses.add(responses.GET, datasets_url, json=datasets_json) + responses.add(responses.POST, refresh_url, json=refresh_json) + + project = tamr.projects.by_resource_id(project_id) + actual_rcwd_dataset = project.as_mastering().record_clusters_with_data() + assert actual_rcwd_dataset.name == rcwd_json["name"] + + op = actual_rcwd_dataset.refresh(poll_interval_seconds=0) + assert op.succeeded() diff --git a/tests/unit/test_strings.py b/tests/unit/test_strings.py index d2951572..304ff84f 100644 --- a/tests/unit/test_strings.py +++ b/tests/unit/test_strings.py @@ -1,14 +1,14 @@ from tamr_unify_client import Client from tamr_unify_client.auth import TokenAuth, UsernamePasswordAuth -from tamr_unify_client.models.dataset_status import DatasetStatus +from tamr_unify_client.dataset.status import DatasetStatus def test_client_repr(): auth = UsernamePasswordAuth("username", "password") - unify = Client(auth) + tamr = Client(auth) full_clz_name = "tamr_unify_client.client.Client" - rstr = f"{unify!r}" + rstr = f"{tamr!r}" assert rstr.startswith(f"{full_clz_name}(") assert "http" in rstr @@ -16,8 +16,8 @@ def test_client_repr(): assert rstr.endswith(")") # further testing when Client has optional arguments - unify = Client(auth, protocol="http", port=1234, base_path="foo/bar") - rstr = f"{unify!r}" + tamr = Client(auth, protocol="http", port=1234, base_path="foo/bar") + rstr = f"{tamr!r}" assert "'http'" in rstr assert "1234" in rstr @@ -55,7 +55,7 @@ def test_dataset_status_repr(): "isStreamable": True, } status = DatasetStatus.from_json(client, data) - full_clz_name = 
"tamr_unify_client.models.dataset_status.DatasetStatus" + full_clz_name = "tamr_unify_client.dataset.status.DatasetStatus" rstr = f"{status!r}" @@ -68,7 +68,7 @@ def test_dataset_status_repr(): def test_dataset_collection_repr(): client = Client(UsernamePasswordAuth("username", "password")) - full_clz_name = "tamr_unify_client.models.dataset.collection.DatasetCollection" + full_clz_name = "tamr_unify_client.dataset.collection.DatasetCollection" rstr = f"{client.datasets!r}" diff --git a/tests/unit/test_taxonomy.py b/tests/unit/test_taxonomy.py new file mode 100644 index 00000000..ffa97e68 --- /dev/null +++ b/tests/unit/test_taxonomy.py @@ -0,0 +1,224 @@ +from functools import partial +import json +from unittest import TestCase + +from requests import HTTPError +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth +from tamr_unify_client.categorization.category.collection import CategoryCollection +from tamr_unify_client.categorization.category.resource import Category, CategorySpec +from tamr_unify_client.categorization.taxonomy import Taxonomy +from tamr_unify_client.project.resource import Project + + +class TestTaxonomy(TestCase): + def setUp(self): + auth = UsernamePasswordAuth("username", "password") + self.tamr = Client(auth) + + @responses.activate + def test_categories(self): + cat_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/taxonomy/categories" + ) + responses.add(responses.GET, cat_url, json=self._categories_json) + + t = Taxonomy(self.tamr, self._taxonomy_json) + c = list(t.categories()) + + cats = [ + Category(self.tamr, self._categories_json[0]), + Category(self.tamr, self._categories_json[1]), + ] + self.assertEqual(repr(c), repr(cats)) + + @responses.activate + def test_by_id(self): + cat_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/taxonomy/categories/1" + ) + responses.add(responses.GET, cat_url, json=self._categories_json[0]) + + c = 
CategoryCollection(self.tamr, "projects/1/taxonomy/categories") + r = c.by_relative_id("projects/1/taxonomy/categories/1") + self.assertEqual(r._data, self._categories_json[0]) + r = c.by_resource_id("1") + self.assertEqual(r._data, self._categories_json[0]) + self.assertRaises(NotImplementedError, c.by_external_id, "1") + + @responses.activate + def test_create(self): + post_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/taxonomy/categories" + ) + responses.add(responses.POST, post_url, json=self._categories_json[0]) + + alias = "projects/1/taxonomy/categories" + coll = CategoryCollection(self.tamr, alias) + + creation_spec = { + "name": self._categories_json[0]["name"], + "path": self._categories_json[0]["path"], + } + c = coll.create(creation_spec) + self.assertEqual(alias + "/1", c.relative_id) + + @responses.activate + def test_create_from_spec(self): + def create_callback(request, snoop): + snoop["payload"] = json.loads(request.body) + return 201, {}, json.dumps(self._categories_json[0]) + + post_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/taxonomy/categories" + ) + snoop_dict = {} + responses.add_callback( + responses.POST, post_url, partial(create_callback, snoop=snoop_dict) + ) + + alias = "projects/1/taxonomy/categories" + coll = CategoryCollection(self.tamr, alias) + + json_spec = { + "name": self._categories_json[0]["name"], + "path": self._categories_json[0]["path"], + } + spec = ( + CategorySpec.new() + .with_name(self._categories_json[0]["name"]) + .with_path(self._categories_json[0]["path"]) + ) + coll.create(spec.to_dict()) + + self.assertEqual(snoop_dict["payload"], json_spec) + + @responses.activate + def test_bulk_create(self): + def create_callback(request, snoop): + snoop["payload"] = request.body + return 200, {}, json.dumps(self._bulk_json) + + post_url = ( + "http://localhost:9100/api/versioned/v1/projects/1/taxonomy/categories:bulk" + ) + snoop_dict = {} + responses.add_callback( + responses.POST, post_url, 
partial(create_callback, snoop=snoop_dict) + ) + + alias = "projects/1/taxonomy/categories" + coll = CategoryCollection(self.tamr, alias) + + creation_specs = [ + { + "name": self._categories_json[0]["name"], + "path": self._categories_json[0]["path"], + }, + { + "name": self._categories_json[1]["name"], + "path": self._categories_json[1]["path"], + }, + ] + j = coll.bulk_create(creation_specs) + self.assertEqual(j, self._bulk_json) + + sent = [] + for line in snoop_dict["payload"].split(b"\n"): + sent.append(json.loads(line)) + self.assertEqual(sent, creation_specs) + + @responses.activate + def test_delete(self): + url = "http://localhost:9100/api/versioned/v1/projects/1/taxonomy" + responses.add(responses.GET, url, json=self._taxonomy_json) + responses.add(responses.DELETE, url, status=204) + responses.add(responses.GET, url, status=404) + + project = Project( + self.tamr, {"type": "CATEGORIZATION"}, "projects/1" + ).as_categorization() + taxonomy = project.taxonomy() + self.assertEqual(taxonomy._data, self._taxonomy_json) + + response = taxonomy.delete() + self.assertEqual(response.status_code, 204) + self.assertRaises(HTTPError, project.taxonomy) + + @responses.activate + def test_delete_category(self): + url = "http://localhost:9100/api/versioned/v1/projects/1/taxonomy/categories/1" + responses.add(responses.GET, url, json=self._categories_json[0]) + responses.add(responses.DELETE, url, status=204) + responses.add(responses.GET, url, status=404) + + categories = CategoryCollection(self.tamr, "projects/1/taxonomy/categories") + category = categories.by_resource_id("1") + self.assertEqual(category._data, self._categories_json[0]) + + response = category.delete() + self.assertEqual(response.status_code, 204) + self.assertRaises(HTTPError, lambda: categories.by_resource_id("1")) + + _taxonomy_json = { + "id": "unify://unified-data/v1/projects/1/taxonomy", + "name": "Test Taxonomy", + "created": { + "username": "admin", + "time": "2019-07-12T13:09:14.981Z", + 
"version": "405", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:09:14.981Z", + "version": "405", + }, + "relativeId": "projects/1/taxonomy", + } + + _categories_json = [ + { + "id": "unify://unified-data/v1/projects/1/taxonomy/categories/1", + "name": "t1", + "description": "", + "parent": "", + "path": ["t1"], + "created": { + "username": "admin", + "time": "2019-07-12T13:10:52.988Z", + "version": "414", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:10:52.988Z", + "version": "414", + }, + "relativeId": "projects/1/taxonomy/categories/1", + }, + { + "id": "unify://unified-data/v1/projects/1/taxonomy/categories/2", + "name": "t2", + "description": "", + "parent": "unify://unified-data/v1/projects/1/taxonomy/categories/1", + "path": ["t1", "t2"], + "created": { + "username": "admin", + "time": "2019-07-12T13:51:20.600Z", + "version": "419", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-12T13:51:20.600Z", + "version": "419", + }, + "relativeId": "projects/1/taxonomy/categories/2", + }, + ] + + _bulk_json = { + "numCommandsProcessed": 2, + "allCommandsSucceeded": True, + "validationErrors": [], + } diff --git a/tests/unit/test_upstream_dataset.py b/tests/unit/test_upstream_dataset.py new file mode 100644 index 00000000..a5dac3d6 --- /dev/null +++ b/tests/unit/test_upstream_dataset.py @@ -0,0 +1,74 @@ +import responses + +from tamr_unify_client import Client +from tamr_unify_client.auth import UsernamePasswordAuth + + +@responses.activate +def test_upstream_dataset(): + + dataset_json = { + "id": "unify://unified-data/v1/datasets/12", + "name": "Project_1_unified_dataset_dedup_features", + "description": "Features for all the rows and values in the source dataset. 
Used in Dedup Workflow.", + "version": "543", + "keyAttributeNames": ["entityId"], + "tags": [], + "created": { + "username": "admin", + "time": "2019-06-05T18:31:59.327Z", + "version": "212", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-18T14:19:28.133Z", + "version": "22225", + }, + "relativeId": "datasets/12", + "upstreamDatasetIds": ["unify://unified-data/v1/datasets/8"], + "externalId": "Project_1_unified_dataset_dedup_features", + } + + upstream_json = ["unify://unified-data/v1/datasets/8"] + + upstream_ds_json = { + "id": "unify://unified-data/v1/datasets/8", + "name": "Project_1_unified_dataset", + "description": "", + "version": "529", + "keyAttributeNames": ["tamr_id"], + "tags": [], + "created": { + "username": "admin", + "time": "2019-06-05T16:28:11.639Z", + "version": "83", + }, + "lastModified": { + "username": "admin", + "time": "2019-07-22T20:31:23.968Z", + "version": "23146", + }, + "relativeId": "datasets/8", + "upstreamDatasetIds": ["unify://unified-data/v1/datasets/6"], + "externalId": "Project_1_unified_dataset", + "resourceId": "8", + } + + tamr = Client(UsernamePasswordAuth("username", "password")) + + url_prefix = "http://localhost:9100/api/versioned/v1/" + dataset_url = url_prefix + "datasets/12" + upstream_url = url_prefix + "datasets/12/upstreamDatasets" + upstream_ds_url = url_prefix + "datasets/8" + + responses.add(responses.GET, dataset_url, json=dataset_json) + responses.add(responses.GET, upstream_url, json=upstream_json) + responses.add(responses.GET, upstream_ds_url, json=upstream_ds_json) + + project_ds = tamr.datasets.by_relative_id("datasets/12") + actual_upstream_ds = project_ds.upstream_datasets() + uri_dataset = actual_upstream_ds[0].dataset() + + assert actual_upstream_ds[0].relative_id == upstream_ds_json["relativeId"] + assert actual_upstream_ds[0].resource_id == upstream_ds_json["resourceId"] + assert uri_dataset.name == upstream_ds_json["name"]