Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rust/lance/src/dataset/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ impl DatasetBuilder {

/// Sets `version` for the builder using a tag
pub fn with_tag(mut self, tag: &str) -> Self {
self.version = Some(Ref::from(tag));
self.version = Some(Ref::Tag(tag.to_string()));
self
}

Expand Down
99 changes: 98 additions & 1 deletion rust/lance/src/dataset/refs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use std::io::ErrorKind;
pub const MAIN_BRANCH: &str = "main";

/// Lance Ref
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Ref {
// Version number points of the current branch
VersionNumber(u64),
Expand All @@ -45,10 +45,51 @@ impl From<u64> for Ref {

impl From<&str> for Ref {
fn from(reference: &str) -> Self {
let reference = reference.trim();
if let Some((branch, version)) = reference.split_once(':') {
if branch.is_empty() {
return Tag(reference.to_string());
}

let branch = if branch == MAIN_BRANCH {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: standardize_branch was designed as the entry point for this scenario

None
} else if check_valid_branch(branch).is_ok() {
Some(branch.to_string())
} else {
return Tag(reference.to_string());
};

let version = match version {
"" | "latest" => None,
v => match v.parse::<u64>() {
Ok(v) => Some(v),
Err(_) => return Tag(reference.to_string()),
},
};

return Version(branch, version);
}

if reference == MAIN_BRANCH {
return Version(None, None);
}

if reference.chars().all(|c| c.is_ascii_digit()) {
if let Ok(v) = reference.parse::<u64>() {
return VersionNumber(v);
}
}

Tag(reference.to_string())
}
}

impl From<String> for Ref {
fn from(reference: String) -> Self {
Self::from(reference.as_str())
}
}

impl From<(&str, u64)> for Ref {
fn from(reference: (&str, u64)) -> Self {
Version(standardize_branch(reference.0), Some(reference.1))
Expand Down Expand Up @@ -221,6 +262,13 @@ impl Tags<'_> {
message: format!("tag {} already exists", tag),
});
}

let branch_file = branch_contents_path(&root_location.path, tag);
if self.object_store().exists(&branch_file).await? {
return Err(Error::RefConflict {
message: format!("tag {} conflicts with existing branch", tag),
});
}
let tag_contents = self.build_tag_content_by_ref(reference).await?;

self.object_store()
Expand Down Expand Up @@ -257,6 +305,13 @@ impl Tags<'_> {
message: format!("tag {} does not exist", tag),
});
}

let branch_file = branch_contents_path(&root_location.path, tag);
if self.object_store().exists(&branch_file).await? {
return Err(Error::RefConflict {
message: format!("tag {} conflicts with existing branch", tag),
});
}
let tag_contents = self.build_tag_content_by_ref(reference).await?;

self.object_store()
Expand Down Expand Up @@ -393,6 +448,13 @@ impl Branches<'_> {

let source_branch = source_branch.and_then(standardize_branch);
let root_location = self.refs.root()?;

let tag_file = tag_path(&root_location.path, branch_name);
if self.object_store().exists(&tag_file).await? {
return Err(Error::RefConflict {
message: format!("branch {} conflicts with existing tag", branch_name),
});
}
let branch_file = branch_contents_path(&root_location.path, branch_name);
if self.object_store().exists(&branch_file).await? {
return Err(Error::RefConflict {
Expand Down Expand Up @@ -793,6 +855,41 @@ mod tests {

use rstest::rstest;

#[rstest]
fn test_parse_ref_ok(
#[values(
("0", Ref::VersionNumber(0)),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add a ref/ prefix before the branch or tag, so that we can differentiate a version 100 vs a branch/tag "100"?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another thought I proposed in Spark is that ref/ could indicate the delimiter is /. That means if we do ref/branch/tag then we know to use / to split branch and tag.

If a branch has a / (which is common), we an use a different delimiter, e.g. ref:feature/abc:100.

Basically the first 3 characters equaling ref indicates its not a version number, and the forth character is used as the delimiter.

What do we think? @Xuanwo @wjones127 @hamersaw

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds great to me! Just a few clarifying questions:

  • Do we need to prepend all tags with branches when using ref then? ex. with ref/foo is "foo" a branch or a tag (ie. tag "foo" on branch "main")?
  • How do we handle ambiguity if a branch / version / tag is specified in both the table ID and in the VERSION AS OF clause? Just error out?

I've been trying to convince myself that it would be more simple (parsing + API) to only support branches in table IDs and versions / tags in VERSION AS OF clauses (within SELECT and CREATE TAG statements), but that's probably a bad idea if we parse things into our existing Ref type (which contains branch, version, and tag already).

("42", Ref::VersionNumber(42)),
("main", Ref::Version(None, None)),
("main:", Ref::Version(None, None)),
("main:latest", Ref::Version(None, None)),
("main:10", Ref::Version(None, Some(10))),
("feature/a:", Ref::Version(Some("feature/a".to_string()), None)),
("feature/a:latest", Ref::Version(Some("feature/a".to_string()), None)),
("feature/a:10", Ref::Version(Some("feature/a".to_string()), Some(10))),
("tag1", Ref::Tag("tag1".to_string()))
)]
(input, expected): (&str, Ref),
) {
assert_eq!(Ref::from(input), expected);
}

#[rstest]
fn test_ref_from_str_invalid_syntax_falls_back_to_tag(
#[values(
"",
":",
":10",
"main:bad",
"feature/a",
"/start-with-slash",
"feature//double-slash"
)]
input: &str,
) {
assert_eq!(Ref::from(input), Ref::Tag(input.trim().to_string()));
}

#[rstest]
fn test_ok_ref(
#[values(
Expand Down
53 changes: 53 additions & 0 deletions rust/lance/src/dataset/tests/dataset_versioning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,59 @@ async fn test_tag(
assert_eq!(dataset.manifest.version, 1);
}

#[tokio::test]
async fn test_tag_branch_name_conflict() {
let test_uri = TempStrDir::default();

let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
"i",
DataType::UInt32,
false,
)]));

let data = RecordBatch::try_new(
schema.clone(),
vec![Arc::new(UInt32Array::from_iter_values(0..10))],
)
.unwrap();
let reader = RecordBatchIterator::new(vec![data].into_iter().map(Ok), schema);
let mut dataset = Dataset::write(reader, &test_uri, None).await.unwrap();

dataset
.create_branch("conflict", dataset.version().version, None)
.await
.unwrap();

let err = dataset
.tags()
.create("conflict", dataset.version().version)
.await
.err()
.unwrap()
.to_string();
assert_eq!(
err,
"Ref conflict error: tag conflict conflicts with existing branch"
);

dataset
.tags()
.create("tag_conflict", dataset.version().version)
.await
.unwrap();

let err = dataset
.create_branch("tag_conflict", dataset.version().version, None)
.await
.err()
.unwrap()
.to_string();
assert_eq!(
err,
"Ref conflict error: branch tag_conflict conflicts with existing tag"
);
}

#[rstest]
#[tokio::test]
async fn test_fragment_id_zero_not_reused() {
Expand Down