diff --git a/python/pyxet/pyxet/cli.py b/python/pyxet/pyxet/cli.py index a1d832e..a45f4d5 100644 --- a/python/pyxet/pyxet/cli.py +++ b/python/pyxet/pyxet/cli.py @@ -115,6 +115,7 @@ def version(): """ print(__version__) + @staticmethod @cli.command() def cp(source: Annotated[typing.List[str], typer.Argument(help="Source file or folder to copy")], target: Annotated[str, typer.Argument(help="Target location of the file or folder")], diff --git a/python/pyxet/scripts/build_standalone_cli.sh b/python/pyxet/scripts/build_standalone_cli.sh index 088fbc6..1fd7e8c 100755 --- a/python/pyxet/scripts/build_standalone_cli.sh +++ b/python/pyxet/scripts/build_standalone_cli.sh @@ -4,31 +4,37 @@ # Will build wheel in release mode, then build standalone executable using the xet packaged with the CLI if [[ ! -e pyproject.toml ]] ; then - echo "Run this script in the pyxet directory using ./scripts/$0" + >&2 echo "Run this script in the pyxet directory using ./scripts/$0" exit 1 fi -source ./scripts/build_wheel.sh +>&2 wheel_location=$(./scripts/build_wheel.sh) -pip install target/wheels/pyxet-*.whl +>&2 pip install $wheel_location +>&2 pip install -r ./scripts/cli_requirements.txt OS=$(uname -s) +xet_cli_path="./scripts/xet_standalone_entry.py" +>&2 echo "Path to xet entry script = '${xet_cli_path}'" + # Build binary if [[ "$OS" == "Darwin" ]]; then - xet_cli_path="$(which xet)" - echo "Path to xet = '${xet_cli_path}'" - pyinstaller --onefile "$xet_cli_path" --target-arch universal2 + if [[ ${_PYXET_BUILD_MODE} == "debug" ]] ; then + target_flag= + else + target_flag="--target-arch=universal2" + fi + + >&2 pyinstaller --onefile "$xet_cli_path" --name xet $target_flag + cli_path="dist/xet" elif [[ "$OS" == "Linux" ]] ; then - xet_cli_path="$(which xet)" - echo "Path to xet = '${xet_cli_path}'" - pyinstaller --onefile "$xet_cli_path" + >&2 pyinstaller --onefile "$xet_cli_path" --name xet + cli_path="dist/xet" else - # Windows is weird. 
Have to go directly to the cli path - - # Find the cli file, which isn't always where you want it to be. - xet_cli_path="./.venv_build/Lib/site-packages/pyxet/cli.py" - echo "Path to xet = '${xet_cli_path}'" - pyinstaller --onefile "$xet_cli_path" - mv dist/cli.exe dist/xet.exe + >&2 pyinstaller --onefile "$xet_cli_path" --name xet + cli_path="dist/xet.exe" fi + +>&2 echo "Standalone installer is located at ${cli_path}." +echo ${cli_path} diff --git a/python/pyxet/scripts/build_wheel.sh b/python/pyxet/scripts/build_wheel.sh old mode 100644 new mode 100755 index 7a4f951..53e7abc --- a/python/pyxet/scripts/build_wheel.sh +++ b/python/pyxet/scripts/build_wheel.sh @@ -4,7 +4,7 @@ # Will build wheel in release mode. if [[ ! -e pyproject.toml ]] ; then - echo "Run this script in the pyxet directory using ./scripts/$0" + >&2 echo "Run this script in the pyxet directory using ./scripts/$0" exit 1 fi @@ -13,22 +13,39 @@ OS=$(uname -s) export MACOSX_DEPLOYMENT_TARGET=10.9 unset CONDA_PREFIX +# Adds in the install instructions. +>&2 source ./scripts/setup_env.sh + # Use a new build environment that links against the system python on OSX # and always creates a new environment. -rm -rf .venv_build -source ./scripts/setup_env.sh -create_venv .venv_build release -source $(venv_activate_script .venv_build) - -# Clear out any old wheels -mkdir -p target/old_wheels/ -mv target/wheels/* target/old_wheels/ || echo "" -if [[ "$OS" == "Darwin" ]]; then - maturin build --profile=cli-release --target=universal2-apple-darwin +# If we're already in a virtual env, then don't worry about this. 
+if [[ -z $_PYXET_BUILD_VIRTUAL_ENV ]] ; then + >&2 rm -rf .venv_build + >&2 create_venv .venv_build release + >&2 source $(venv_activate_script .venv_build) else - maturin build --profile=cli-release + >&2 source $(venv_activate_script ${_PYXET_BUILD_VIRTUAL_ENV}) fi -echo "Wheel is located at target/wheels/pyxet-*.whl" +# Clear out any old wheels +>&2 mkdir -p target/old_wheels/ +>&2 mv target/wheels/* target/old_wheels/ || echo "" + +# Mode +if [[ $_PYXET_BUILD_MODE == "debug" ]] ; then + flags= +else + flags="--profile=cli-release" + + if [[ "$OS" == "Darwin" ]]; then + flags="$flags --target=universal2-apple-darwin" + fi +fi + +>&2 maturin build $flags + +wheel=$(ls ./target/wheels/pyxet-*.whl | head -n 1) +>&2 echo "Wheel is located at $wheel" +echo $wheel diff --git a/python/pyxet/scripts/cli_requirements.txt b/python/pyxet/scripts/cli_requirements.txt new file mode 100644 index 0000000..96c0840 --- /dev/null +++ b/python/pyxet/scripts/cli_requirements.txt @@ -0,0 +1,3 @@ +# All of the requirements for installation that the fsspec implementation +# could be interested in. +s3fs \ No newline at end of file diff --git a/python/pyxet/scripts/run_tests.sh b/python/pyxet/scripts/run_tests.sh index 685640c..f2c7455 100755 --- a/python/pyxet/scripts/run_tests.sh +++ b/python/pyxet/scripts/run_tests.sh @@ -9,27 +9,33 @@ if [[ ! -e pyproject.toml ]] ; then fi source ./scripts/setup_env.sh -create_venv venv dev +create_venv venv dev # The dev part here installs the additional dev requirements source $(venv_activate_script venv) +export _PYXET_BUILD_MODE=debug +export _PYXET_BUILD_VIRTUAL_ENV=venv + +# Build the wheel. +wheel=$(./scripts/build_wheel.sh) + +# Build the standalone cli and wheel +cli=$(./scripts/build_standalone_cli.sh) + +# Install the wheel +pip install "$wheel" + if [[ -z "$VIRTUAL_ENV" ]] ; then echo "Failed to activate virtual env." 
exit 1 fi +# Make sure windows executable can run anywhere +work_dir=./.testing_tmp +rm -rf $work_dir || echo "" +mkdir -p $work_dir -# Clear out any old wheels -mkdir -p target/old_wheels/ -mv target/wheels/* target/old_wheels/ || echo "" - -echo "$(which pip)" - -# Install -maturin build -pip install target/wheels/pyxet-*.whl - -# TODO: This runs the tests in parallel using pytest-xdist -# Error: tests in cli can't be run simultaneously actually, as there are conflicts. -#pytest -n 12 --verbose tests/ -pytest --verbose tests/ +cp "$cli" "$work_dir" +cd "$work_dir" +export XET_STANDALONE_CLI="./$(basename ${cli})" +pytest -n 12 --verbose "../tests/" diff --git a/python/pyxet/scripts/setup_env.sh b/python/pyxet/scripts/setup_env.sh index 21671b2..a4292b9 100755 --- a/python/pyxet/scripts/setup_env.sh +++ b/python/pyxet/scripts/setup_env.sh @@ -36,14 +36,17 @@ create_venv() { [[ -e "./$venv_name" ]] || exit 1 source $(venv_activate_script $venv_name) - - >&2 pip install --upgrade pip - if [[ $build_mode == "release" ]] ; then - # For building the wheel / standalone xet, use minimal installation - # environment; otherwise may pull in non-universal2 compatible package. - >&2 pip install -r scripts/build_requirements.txt - else - >&2 pip install -r scripts/dev_requirements.txt - fi + fi + + # Make sure it's up to par. + >&2 pip install --upgrade pip + if [[ $build_mode == "release" ]] ; then + # For building the wheel / standalone xet, use minimal installation + # environment; otherwise may pull in non-universal2 compatible package. + >&2 pip install --upgrade -r scripts/build_requirements.txt + else + # Install both. 
+ >&2 pip install --upgrade -r scripts/build_requirements.txt + >&2 pip install --upgrade -r scripts/dev_requirements.txt fi } \ No newline at end of file diff --git a/python/pyxet/scripts/xet_standalone_entry.py b/python/pyxet/scripts/xet_standalone_entry.py new file mode 100755 index 0000000..1de735f --- /dev/null +++ b/python/pyxet/scripts/xet_standalone_entry.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python + +# Pull in everything from pyxet, without the relative imports. +from pyxet.cli import cli + +# Package in s3 dependencies, otherwise these +# will just give other repo errors. +import s3fs + +if __name__ == "__main__": + cli() \ No newline at end of file diff --git a/python/pyxet/tests/arrow_test.py b/python/pyxet/tests/arrow_test.py index 29e728a..440eaf4 100644 --- a/python/pyxet/tests/arrow_test.py +++ b/python/pyxet/tests/arrow_test.py @@ -5,23 +5,18 @@ import pyxet from utils import skip_if_no, CONSTANTS - -@pytest.mark.skip("Not sure if pyxet will implement read_arrow - TODO") -def test_read_arrow(): - dataset = pyxet.read_arrow(CONSTANTS.TITANIC_PARQUET) - assert dataset.to_table().num_rows == 891 - +# This just ends up being one test, as this repo is currently mdbv1, and the mdb +# v1 clone process is not safe between multiple processes. 
@skip_if_no("pyarrow") -def test_pyarrow_dataset(): +def test_pyarrow(): import pyarrow.dataset as ds fs = pyxet.XetFS(repo_url=CONSTANTS.TITANIC_MAIN) dataset = ds.dataset(CONSTANTS.TITANIC_PARQUET, filesystem=fs) assert dataset.to_table().num_rows == 891 -@skip_if_no("pyarrow") -def test_pyarrow_parquet(): +# def test_pyarrow_parquet(): from pyarrow.parquet import ParquetFile import pyarrow as pa @@ -31,9 +26,7 @@ def test_pyarrow_parquet(): df = pa.Table.from_batches([first_ten_rows]).to_pandas() assert df.shape == (10, 12) - -@skip_if_no("pyarrow") -def test_pyarrow_stream(): +# def test_pyarrow_stream(): import pandas as pd from pyarrow.fs import PyFileSystem, FSSpecHandler @@ -44,13 +37,6 @@ def test_pyarrow_stream(): assert df.shape == (891, 12) -@pytest.mark.skip("cp not implemented") -def test_pyarrow_stream_cp(): - with pytest.raises(NotImplementedError): # TODO - pa_fs.copy_file(CONSTANTS.TITANIC_CSV, - 'https://xethub.com/xdssio/titanic.git/main/titanic2.csv') - - @pytest.mark.skip("Not sure if we need this - TODO") def test_pyarrow_fsspec(): from pyarrow import fs diff --git a/python/pyxet/tests/cli_cp_new_url_test.py b/python/pyxet/tests/cli_cp_new_url_test.py index a5b5c32..a2dfd9f 100644 --- a/python/pyxet/tests/cli_cp_new_url_test.py +++ b/python/pyxet/tests/cli_cp_new_url_test.py @@ -7,14 +7,11 @@ from pyxet.file_operations import perform_copy, build_cp_action_list - def delete_branch(repo, branch, *args): try: pyxet.BranchCLI.delete(repo, branch, *args) except Exception as e: print(f"WARNING: Exception trying to delete branch {branch} on {repo}: {e}") - - def test_single_file_upload(): user, host = utils.test_account_login() diff --git a/python/pyxet/tests/cli_standalone_cp_tests.py b/python/pyxet/tests/cli_standalone_cp_tests.py new file mode 100644 index 0000000..f80db57 --- /dev/null +++ b/python/pyxet/tests/cli_standalone_cp_tests.py @@ -0,0 +1,309 @@ +import os +import pytest +import pyxet +import utils +import shutil +import tempfile 
+import subprocess
+import sys
+
+from pyxet.file_operations import perform_copy, build_cp_action_list
+
+# Set these once.
+user, host = utils.test_account_login()
+repo = utils.test_repo()
+
+# TODO: right syntax for this.
+xet_cli_path=os.environ.get("XET_STANDALONE_CLI", None)
+
+if xet_cli_path is None:
+    print("Warning: XET_STANDALONE_CLI not set, skipping all tests.")
+
+def xet_url(branch = None, path = None):
+    """Build the xet:// URL of the test repo, optionally extended by branch and path."""
+    ret = f"xet://{host}:{user}/{repo}"
+    if branch is not None:
+        ret += f"/{branch}"
+    if path is not None:
+        assert branch is not None
+        ret += f"/{path}"
+    return ret
+
+def run_xet(*args, cwd = None):
+    """Run the standalone xet binary with the given string arguments.
+
+    Raises subprocess.CalledProcessError if the command exits non-zero.
+    """
+    arg_text = ' '.join(f'"{arg}"' if ' ' in arg else arg for arg in args)
+    print(f"Running command: xet {arg_text}")
+    subprocess.check_output([xet_cli_path] + list(args), cwd = cwd)
+
+def xet_perform_copy(src, dest, message, is_recursive, cwd = None):
+    """Copy src to dest with `xet cp`, adding --recursive when is_recursive is true."""
+    # `message` is accepted to match the call sites but is not forwarded to the CLI.
+    run_xet("cp", *(["--recursive"] if is_recursive else []), src, dest, cwd=cwd)
+
+def delete_branch(branch):
+    """Best-effort branch delete via the standalone CLI; failures are only printed."""
+    try:
+        run_xet("branch", "delete", xet_url(branch))
+    except Exception as e:
+        print(f"WARNING: Exception trying to delete branch {branch} on {repo}: {e}")
+
+
+@pytest.mark.skipif(xet_cli_path is None, reason="XET_STANDALONE_CLI not set")
+def test_single_file_upload():
+    """Upload one file to a branch root (with/without trailing slash) and to an explicit name."""
+    b1 = utils.new_random_branch_from(f"xet://{host}:{user}/{repo}", "main")
+
+    try:
+        # generate a random file in a temp dir
+        dir = tempfile.mkdtemp()
+        local_file = f"{dir}/data"
+        utils.random_binary_file(local_file, 1024)
+
+        # test variations of path
+        source_list = [
+            f"{dir}/data",
+        ]
+
+        dest_list = [
+            # (dest in cp command, expected path of remote file)
+            (f"xet://{host}:{user}/{repo}/{b1}", [f"xet://{host}:{user}/{repo}/{b1}/data"]),
+            (f"xet://{host}:{user}/{repo}/{b1}/",[f"xet://{host}:{user}/{repo}/{b1}/data"]),
+            (f"xet://{host}:{user}/{repo}/{b1}/zz", [f"xet://{host}:{user}/{repo}/{b1}/zz"]),
+        ]
+
+        recursive_list = [False, True]
+
+        try:
+            for src in source_list:
+                for dest in dest_list:
+                    for r in recursive_list:
+                        xet_perform_copy(src, dest[0], "add data", r)
+                        utils.assert_remote_files_exist(f"xet://{host}:{user}/{repo}/{b1}/*", dest[1])
+                        run_xet("rm", *dest[1])
+        finally:
+            shutil.rmtree(dir)
+    finally:
+        delete_branch(b1)
+
+@pytest.mark.skipif(xet_cli_path is None, reason="XET_STANDALONE_CLI not set")
+def test_multiple_files_upload():
+    """Upload two files through a glob source, with and without the recursive flag."""
+    b1 = utils.new_random_branch_from(f"xet://{host}:{user}/{repo}", "main")
+
+    try:
+        # generate random files in a temp dir
+        dir = tempfile.mkdtemp()
+        local_files = [f"{dir}/data0", f"{dir}/data1"]
+        utils.random_binary_files(local_files, [1024, 1024])
+
+        # test variations of path
+        source_list = [
+            f"{dir}/*",
+        ]
+
+        dest_list = [
+            f"xet://{host}:{user}/{repo}/{b1}",
+            f"xet://{host}:{user}/{repo}/{b1}/",
+        ]
+
+        recursive_list = [False, True]
+
+        expected_files = [f"xet://{host}:{user}/{repo}/{b1}/data0", f"xet://{host}:{user}/{repo}/{b1}/data1"]
+
+        try:
+            for src in source_list:
+                for dest in dest_list:
+                    for r in recursive_list:
+                        xet_perform_copy(src, dest, "add data", r)
+                        utils.assert_remote_files_exist(f"xet://{host}:{user}/{repo}/{b1}/*", expected_files)
+                        run_xet("rm", *expected_files)
+        finally:
+            shutil.rmtree(dir)
+    finally:
+        delete_branch(b1)
+
+@pytest.mark.skipif(xet_cli_path is None, reason="XET_STANDALONE_CLI not set")
+def test_glob_nonrecursive_upload():
+    """A non-recursive glob copy uploads the top-level files but not the subdirectory."""
+    b1 = utils.new_random_branch_from(f"xet://{host}:{user}/{repo}", "main")
+
+    try:
+        # generate a mix of random files and directories in a temp dir
+        dir = tempfile.mkdtemp()
+
+        local_files = [f"{dir}/data0", f"{dir}/data1"]
+        utils.random_binary_files(local_files, [1024, 1024])
+
+        sub_dir = "subdir"
+        os.mkdir(f"{dir}/{sub_dir}")
+        utils.random_binary_file(f"{dir}/{sub_dir}/data", 1024)
+
+        # test variations of path
+        source_list = [
+            f"{dir}/*",
+        ]
+
+        dest_list = [
+            f"xet://{host}:{user}/{repo}/{b1}",
+            f"xet://{host}:{user}/{repo}/{b1}/",
+        ]
+
+        recursive_list = [False]
+
+        expected_files = [f"xet://{host}:{user}/{repo}/{b1}/data0", f"xet://{host}:{user}/{repo}/{b1}/data1"]
+
+        try:
+            for src in source_list:
+                for dest in dest_list:
+                    for r in recursive_list:
+                        print(f"xet cp {src} {dest} {r}")
+                        xet_perform_copy(src, dest, "add data", r)
+                        utils.assert_remote_files_exist(f"xet://{host}:{user}/{repo}/{b1}/*", expected_files)
+                        utils.assert_remote_files_not_exist(f"xet://{host}:{user}/{repo}/{b1}/*", [f"xet://{host}:{user}/{repo}/{b1}/{sub_dir}"])
+                        run_xet("rm", *expected_files)
+        finally:
+            shutil.rmtree(dir)
+    finally:
+        delete_branch(b1)
+
+@pytest.mark.skipif(xet_cli_path is None, reason="XET_STANDALONE_CLI not set")
+def test_glob_recursive_upload():
+    """A recursive glob copy uploads the top-level files and the subdirectory contents."""
+    b1 = utils.new_random_branch_from(f"xet://{host}:{user}/{repo}", "main")
+
+    try:
+        # generate a mix of random files and directories in a temp dir
+        dir = tempfile.mkdtemp()
+
+        local_files = [f"{dir}/data0", f"{dir}/data1"]
+        utils.random_binary_files(local_files, [1024, 1024])
+
+        sub_dir = "subdir"
+        os.mkdir(f"{dir}/{sub_dir}")
+        utils.random_binary_file(f"{dir}/{sub_dir}/data", 1024)
+
+        # test variations of path
+        source_list = [
+            f"{dir}/*",
+        ]
+
+        dest_list = [
+            f"xet://{host}:{user}/{repo}/{b1}",
+            f"xet://{host}:{user}/{repo}/{b1}/",
+        ]
+
+        recursive_list = [True]
+
+        expected_files_level1 = [f"xet://{host}:{user}/{repo}/{b1}/data0", f"xet://{host}:{user}/{repo}/{b1}/data1", f"xet://{host}:{user}/{repo}/{b1}/{sub_dir}"]
+        expected_files_level2 = [f"xet://{host}:{user}/{repo}/{b1}/{sub_dir}/data"]
+
+        try:
+            for src in source_list:
+                for dest in dest_list:
+                    for r in recursive_list:
+                        print(f"xet cp {src} {dest} {r}")
+                        xet_perform_copy(src, dest, "add data", r)
+                        utils.assert_remote_files_exist(f"xet://{host}:{user}/{repo}/{b1}/*", expected_files_level1)
+                        utils.assert_remote_files_exist(f"xet://{host}:{user}/{repo}/{b1}/{sub_dir}/*", expected_files_level2)
+                        run_xet("rm", *expected_files_level1)
+        finally:
+            shutil.rmtree(dir)
+    finally:
+        delete_branch(b1)
+
+@pytest.mark.skipif(xet_cli_path is None, reason="XET_STANDALONE_CLI not set")
+def test_directory_nonrecursive_upload():
+    """Non-recursive copy uploads a plain file; directory sources must not raise."""
+    b1 = utils.new_random_branch_from(f"xet://{host}:{user}/{repo}", "main")
+
+    try:
+        # generate a random file in a temp dir
+        dir = tempfile.mkdtemp()
+        sub_dir = "subdir"
+        os.mkdir(f"{dir}/{sub_dir}")
+        local_file = f"{dir}/{sub_dir}/data"
+        utils.random_binary_file(local_file, 1024)
+
+        # test variations of path
+        source_list = [
+            (f"{dir}/{sub_dir}/data", True),
+            (f"{dir}/{sub_dir}", False),
+            (f"{dir}/{sub_dir}/", False)
+        ]
+
+        dest_list = [
+            f"xet://{host}:{user}/{repo}/{b1}",
+            f"xet://{host}:{user}/{repo}/{b1}/",
+        ]
+
+        recursive_list = [False]
+
+        try:
+            for src, should_succeed in source_list:
+                for dest in dest_list:
+                    for r in recursive_list:
+                        print(f"xet cp {src} {dest} {r}")
+
+                        if should_succeed:
+                            xet_perform_copy(src, dest, "add data", r)
+
+                            utils.assert_remote_files_exist(f"xet://{host}:{user}/{repo}/{b1}/*", [f"xet://{host}:{user}/{repo}/{b1}/data"])
+
+                        else:
+
+                            # ignores instead of raising error
+                            xet_perform_copy(src, dest, "add data", r)
+
+        finally:
+            shutil.rmtree(dir)
+    finally:
+        delete_branch(b1)
+
+@pytest.mark.skipif(xet_cli_path is None, reason="XET_STANDALONE_CLI not set")
+def test_directory_recursive_upload():
+    """Recursive copy of files plus a subdirectory, then remove the files at both levels."""
+    b1 = utils.new_random_branch_from(f"xet://{host}:{user}/{repo}", "main")
+    try:
+        # generate a mix of random files and directories in a temp dir
+        dir = tempfile.mkdtemp()
+
+        local_files = [f"{dir}/data0", f"{dir}/data1"]
+        utils.random_binary_files(local_files, [1024, 1024])
+
+        sub_dir = "subdir"
+        os.mkdir(f"{dir}/{sub_dir}")
+        utils.random_binary_file(f"{dir}/{sub_dir}/data", 1024)
+
+        # test variations of path
+        source_list = [
+            f"{dir}/*",
+        ]
+
+        dest_list = [
+            f"xet://{host}:{user}/{repo}/{b1}",
+            f"xet://{host}:{user}/{repo}/{b1}/",
+        ]
+
+        recursive_list = [True]
+
+        expected_files_level1 = [f"xet://{host}:{user}/{repo}/{b1}/data0", f"xet://{host}:{user}/{repo}/{b1}/data1", f"xet://{host}:{user}/{repo}/{b1}/{sub_dir}"]
+        expected_files_level2 = [f"xet://{host}:{user}/{repo}/{b1}/{sub_dir}/data"]
+
+        try:
+            for src in source_list:
+                for dest in dest_list:
+                    for r in recursive_list:
+                        print(f"xet cp {src} {dest} {r}")
+                        xet_perform_copy(src, dest, "add data", r)
+                        utils.assert_remote_files_exist(f"xet://{host}:{user}/{repo}/{b1}/*", expected_files_level1)
+                        utils.assert_remote_files_exist(f"xet://{host}:{user}/{repo}/{b1}/{sub_dir}/*", expected_files_level2)
+                        run_xet("rm", *expected_files_level1)
+                        run_xet("rm", *expected_files_level2)
+        finally:
+            shutil.rmtree(dir)
+
+    finally:
+        delete_branch(b1)
\ No newline at end of file