From 67e07d612f0911f14dcf680b15f287aaad97a3ff Mon Sep 17 00:00:00 2001 From: Tim Vermeulen Date: Thu, 4 Sep 2025 14:50:53 +0200 Subject: [PATCH 1/4] refactor: split off the `parse_inner_type` function from `parse_inner_types` --- types/src/data_types.rs | 68 +++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/types/src/data_types.rs b/types/src/data_types.rs index c88521b4..b5162263 100644 --- a/types/src/data_types.rs +++ b/types/src/data_types.rs @@ -663,15 +663,39 @@ fn parse_variant(input: &str) -> Result { /// let input1 = "Tuple(Enum8('f\'()' = 1))"; // the result is `f\'()` /// let input2 = "Tuple(Enum8('(' = 1))"; // the result is `(` /// ``` -fn parse_inner_types(input: &str) -> Result, TypesError> { +fn parse_inner_types(mut input: &str) -> Result, TypesError> { let mut inner_types: Vec = Vec::new(); + while !input.is_empty() { + inner_types.push(parse_inner_type(&mut input)?); + + if !input.starts_with(',') { + break; + } + + input = &input[1..]; + + if input.starts_with(' ') { + input = &input[1..]; + } + } + + if !input.is_empty() { + return Err(TypesError::TypeParsingError(format!( + "Failed to parse all inner types, remaining input: {input}" + ))); + } + + Ok(inner_types) +} + +/// Parses a single type from the input string and removes it from the input. +fn parse_inner_type(input: &mut &str) -> Result { let input_bytes = input.as_bytes(); let mut open_parens = 0; let mut quote_open = false; let mut char_escaped = false; - let mut last_element_index = 0; let mut i = 0; while i < input_bytes.len() { @@ -687,42 +711,20 @@ fn parse_inner_types(input: &str) -> Result, TypesError> { } else if input_bytes[i] == b')' { open_parens -= 1; } else if input_bytes[i] == b',' && open_parens == 0 { - let data_type_str = String::from_utf8(input_bytes[last_element_index..i].to_vec()) - .map_err(|_| { - TypesError::TypeParsingError(format!( - "Invalid UTF-8 sequence in input for the inner data type: {}", - &input[last_element_index..] - )) - })?; - let data_type = DataTypeNode::new(&data_type_str)?; - inner_types.push(data_type); - // Skip ', ' (comma and space) - if i + 2 <= input_bytes.len() && input_bytes[i + 1] == b' ' { - i += 2; - } else { - i += 1; - } - last_element_index = i; - continue; // Skip the normal increment at the end of the loop + break; } } i += 1; } - // Push the remaining part of the type if it seems to be valid (at least all parentheses are closed) - if open_parens == 0 && last_element_index < input_bytes.len() { - let data_type_str = - String::from_utf8(input_bytes[last_element_index..].to_vec()).map_err(|_| { - TypesError::TypeParsingError(format!( - "Invalid UTF-8 sequence in input for the inner data type: {}", - &input[last_element_index..] - )) - })?; - let data_type = DataTypeNode::new(&data_type_str)?; - inner_types.push(data_type); - } - - Ok(inner_types) + let data_type_str = String::from_utf8(input_bytes[..i].to_vec()).map_err(|_| { + TypesError::TypeParsingError(format!( + "Invalid UTF-8 sequence in input for the inner data type: {}", + &input[..i] + )) + })?; + *input = &input[i..]; + DataTypeNode::new(&data_type_str) } #[inline] From 34d1b52ef42472d358ac31e2ffd357a74a985e2a Mon Sep 17 00:00:00 2001 From: Tim Vermeulen Date: Thu, 4 Sep 2025 14:50:53 +0200 Subject: [PATCH 2/4] fix: simplify `parse_inner_type` and make it more strict It now ensures that the brackets and quotes are balanced, and no longer consumes a trailing closing bracket if there's no matching opening bracket. --- types/src/data_types.rs | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/types/src/data_types.rs b/types/src/data_types.rs index b5162263..5c0558bd 100644 --- a/types/src/data_types.rs +++ b/types/src/data_types.rs @@ -699,20 +699,17 @@ fn parse_inner_type(input: &mut &str) -> Result { let mut i = 0; while i < input_bytes.len() { - if char_escaped { - char_escaped = false; - } else if input_bytes[i] == b'\\' { - char_escaped = true; - } else if input_bytes[i] == b'\'' { - quote_open = !quote_open; // unescaped quote - } else if !quote_open { - if input_bytes[i] == b'(' { - open_parens += 1; - } else if input_bytes[i] == b')' { - open_parens -= 1; - } else if input_bytes[i] == b',' && open_parens == 0 { - break; - } + match input_bytes[i] { + _ if char_escaped => char_escaped = false, + b'\\' if quote_open => char_escaped = true, + b'\'' => quote_open = !quote_open, // unescaped quote + byte if !quote_open => match byte { + b'(' => open_parens += 1, + b',' | b')' if open_parens == 0 => break, + b')' => open_parens -= 1, + _ => {} + }, + _ => {} } i += 1; } @@ -723,6 +720,14 @@ fn parse_inner_type(input: &mut &str) -> Result { &input[..i] )) })?; + + if open_parens != 0 || quote_open || char_escaped { + return Err(TypesError::TypeParsingError(format!( + "Invalid inner data type: {}", + &input[..i] + ))); + } + *input = &input[i..]; DataTypeNode::new(&data_type_str) } From e261ae00afff2ca92d8d0611431c6e0936e7d1c4 Mon Sep 17 00:00:00 2001 From: Tim Vermeulen Date: Thu, 4 Sep 2025 14:50:53 +0200 Subject: [PATCH 3/4] feat: add `DataTypeNode::Nested` --- types/src/data_types.rs | 167 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/types/src/data_types.rs b/types/src/data_types.rs index 5c0558bd..2ee1e631 100644 --- a/types/src/data_types.rs +++ b/types/src/data_types.rs @@ -101,6 +101,8 @@ pub enum DataTypeNode { MultiLineString, Polygon, MultiPolygon, + + Nested(Vec), } impl DataTypeNode { @@ -157,6 +159,8 @@ impl DataTypeNode { str if str.starts_with("Tuple") => parse_tuple(str), str if str.starts_with("Variant") => parse_variant(str), + str if str.starts_with("Nested") => Ok(Self::Nested(parse_nested(str)?)), + // ... str => Err(TypesError::TypeParsingError(format!( "Unknown data type: {str}" @@ -276,6 +280,16 @@ impl Display for DataTypeNode { MultiLineString => write!(f, "MultiLineString"), Polygon => write!(f, "Polygon"), MultiPolygon => write!(f, "MultiPolygon"), + Nested(columns) => { + write!(f, "Nested(")?; + for (i, column) in columns.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{} {}", column.name, column.data_type)?; + } + write!(f, ")") + } } } } @@ -826,6 +840,92 @@ fn parse_enum_values_map(input: &str) -> Result, TypesError .collect::>()) } +fn parse_nested(mut input: &str) -> Result, TypesError> { + /// Removes the prefix `prefix` from `input`. + fn parse_str(input: &mut &str, prefix: &str) -> Result<(), TypesError> { + if input.starts_with(prefix) { + *input = &input[prefix.len()..]; + Ok(()) + } else { + Err(TypesError::TypeParsingError(format!( + "Expected {prefix:?}, got {input:?}" + ))) + } + } + + /// Removes and returns the prefix of `input` up to the first character that does not match the + /// predicate. + fn parse_while<'a>(input: &mut &'a str, predicate: impl Fn(char) -> bool) -> &'a str { + let index = input + .char_indices() + .find(|(_, c)| !predicate(*c)) + .map(|(i, _)| i) + .unwrap_or(input.len()); + let (prefix, rest) = input.split_at(index); + *input = rest; + prefix + } + + /// Removes and returns a valid identifier from the start of `input`. + fn parse_identifier<'a>(input: &mut &'a str) -> Result<&'a str, TypesError> { + let original_input = *input; + + if input.starts_with('`') { + parse_str(input, "`")?; + let mut is_escaping = false; + + for (index, char) in input.char_indices() { + match char { + _ if is_escaping => is_escaping = false, + '\\' => is_escaping = true, + '`' => { + let name = &input[..index]; + *input = &input[index + 1..]; + return Ok(name); + } + _ => {} + } + } + + Err(TypesError::TypeParsingError(format!( + "Unclosed backtick in name: {original_input}" + ))) + } else { + Ok(parse_while(input, |c| { + c.is_ascii_alphanumeric() || c == '_' + })) + } + } + + let original_input = input; + parse_str(&mut input, "Nested(")?; + + let mut columns = Vec::new(); + while !input.starts_with(')') { + let name = parse_identifier(&mut input)?; + parse_str(&mut input, " ")?; + let data_type = parse_inner_type(&mut input)?; + + columns.push(Column { + name: name.to_string(), + data_type, + }); + + if input.starts_with(',') { + parse_str(&mut input, ", ")?; + } + } + + if columns.is_empty() { + return Err(TypesError::TypeParsingError(format!( + "Expected at least one column in Nested from input {original_input}" + ))); + } + + parse_str(&mut input, ")")?; + Ok(columns) +} + #[cfg(test)] mod tests { use super::*; @@ -1476,6 +1576,65 @@ mod tests { ); } + #[test] + fn test_data_type_new_nested() { + assert_eq!( + DataTypeNode::new("Nested(foo UInt8)").unwrap(), + DataTypeNode::Nested(vec![Column::new("foo".to_string(), DataTypeNode::UInt8)]) + ); + assert_eq!( + DataTypeNode::new("Nested(foo UInt8, bar String)").unwrap(), + DataTypeNode::Nested(vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("bar".to_string(), DataTypeNode::String), + ]) + ); + assert_eq!( + DataTypeNode::new("Nested(foo UInt8, `bar` String)").unwrap(), + DataTypeNode::Nested(vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("bar".to_string(), DataTypeNode::String), + ]) + ); + assert_eq!( + DataTypeNode::new("Nested(foo UInt8, `b a r` String)").unwrap(), + DataTypeNode::Nested(vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("b a r".to_string(), DataTypeNode::String), + ]) + ); + assert_eq!( + DataTypeNode::new( + "Nested(foo Enum8('f\\'(' = 1), `b a r` Nested(bar Tuple(Enum8('f\\'()' = 1))))" + ) + .unwrap(), + DataTypeNode::Nested(vec![ + Column::new( + "foo".to_string(), + DataTypeNode::Enum(EnumType::Enum8, HashMap::from([(1, "f\\'(".to_string())]),) + ), + Column::new( + "b a r".to_string(), + DataTypeNode::Nested(vec![Column::new( + "bar".to_string(), + DataTypeNode::Tuple(vec![DataTypeNode::Enum( + EnumType::Enum8, + HashMap::from([(1, "f\\'()".to_string())]), + )]), + )]) + ), + ]) + ); + + assert!(DataTypeNode::new("Nested").is_err()); + assert!(DataTypeNode::new("Nested(").is_err()); + assert!(DataTypeNode::new("Nested()").is_err()); + assert!(DataTypeNode::new("Nested(,)").is_err()); + assert!(DataTypeNode::new("Nested(String)").is_err()); + assert!(DataTypeNode::new("Nested(Int32, String)").is_err()); + assert!(DataTypeNode::new("Nested(foo Int32, String)").is_err()); + } + #[test] fn test_data_type_to_string_simple() { // Simple types @@ -1584,6 +1743,14 @@ mod tests { DataTypeNode::Variant(vec![DataTypeNode::UInt8, DataTypeNode::Bool]).to_string(), "Variant(UInt8, Bool)" ); + assert_eq!( + DataTypeNode::Nested(vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("bar".to_string(), DataTypeNode::String), + ]) + .to_string(), + "Nested(foo UInt8, bar String)" + ); } #[test] From 506e52dd778c0ab0d3d6a21b533d9fa9b135aefa Mon Sep 17 00:00:00 2001 From: Tim Vermeulen Date: Thu, 4 Sep 2025 14:50:53 +0200 Subject: [PATCH 4/4] feat: add validation support for the `Nested(...)` type --- src/rowbinary/validation.rs | 4 ++ tests/it/nested.rs | 107 ++++++++++++++++++++++++++++++++ types/src/data_types.rs | 120 ++++++++++++++++++++++++------------ 3 files changed, 191 insertions(+), 40 deletions(-) diff --git a/src/rowbinary/validation.rs b/src/rowbinary/validation.rs index 76f54493..410708f1 100644 --- a/src/rowbinary/validation.rs +++ b/src/rowbinary/validation.rs @@ -548,6 +548,10 @@ fn validate_impl<'serde, 'caller, R: Row>( root, kind: InnerDataTypeValidatorKind::Array(&DataTypeNode::LineString), }), + DataTypeNode::Nested { as_tuple, .. } => Some(InnerDataTypeValidator { + root, + kind: InnerDataTypeValidatorKind::Array(as_tuple), + }), _ => root.panic_on_schema_mismatch(data_type, serde_type, is_inner), }, SerdeType::Tuple(len) => match data_type { diff --git a/tests/it/nested.rs b/tests/it/nested.rs index 7a78eb58..fb8c591e 100644 --- a/tests/it/nested.rs +++ b/tests/it/nested.rs @@ -50,3 +50,110 @@ async fn smoke() { assert_eq!(row, original_row); } + +#[tokio::test] +async fn no_flatten() { + let client = prepare_database!().with_option("flatten_nested", "0"); + + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Row)] + struct MyRow { + no: i32, + items: Vec<(String, u32)>, + } + + // `flatten_nested = 0` prevents flattening of nested columns, causing them to be stored as a + // single array of tuples instead of as separate arrays + + client + .query( + " + CREATE TABLE test( + no Int32, + items Nested( + name String, + count UInt32 + ) + ) + ENGINE = MergeTree ORDER BY no + ", + ) + .execute() + .await + .unwrap(); + + let original_row = MyRow { + no: 42, + items: vec![("foo".into(), 1), ("bar".into(), 5)], + }; + + let mut insert = client.insert::("test").await.unwrap(); + insert.write(&original_row).await.unwrap(); + insert.end().await.unwrap(); + + let row = client + .query("SELECT ?fields FROM test") + .fetch_one::() + .await + .unwrap(); + + assert_eq!(row, original_row); +} + +#[tokio::test] +async fn doubly_flattened() { + let client = prepare_database!(); + + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Row)] + struct MyRow { + no: i32, + #[serde(rename = "items.names")] + items_names: Vec>, + #[serde(rename = "items.count")] + items_count: Vec, + } + + // Only the first level is flattened and any more deeply nested columns are stored as an array + // of tuples, so the table ends up with columns + // - `no Int32` + // - `items.names Array(Nested(first String, last String))` + // (i.e. `Array(Array(Tuple(first String, last String)))`) + // - `items.count Array(UInt32)` + + client + .query( + " + CREATE TABLE test( + no Int32, + items Nested( + names Nested(first String, last String), + count UInt32 + ) + ) + ENGINE = MergeTree ORDER BY no + ", + ) + .execute() + .await + .unwrap(); + + let original_row = MyRow { + no: 42, + items_names: vec![ + vec![("foo".into(), "foo".into())], + vec![("bar".into(), "bar".into())], + ], + items_count: vec![1, 5], + }; + + let mut insert = client.insert::("test").await.unwrap(); + insert.write(&original_row).await.unwrap(); + insert.end().await.unwrap(); + + let row = client + .query("SELECT ?fields FROM test") + .fetch_one::() + .await + .unwrap(); + + assert_eq!(row, original_row); +} diff --git a/types/src/data_types.rs b/types/src/data_types.rs index 2ee1e631..e3386998 100644 --- a/types/src/data_types.rs +++ b/types/src/data_types.rs @@ -102,7 +102,12 @@ pub enum DataTypeNode { Polygon, MultiPolygon, - Nested(Vec), + Nested { + columns: Vec, + // This stores the types in `columns` as a tuple node as a hack to be able to validate + // data for this column as an array of tuples + as_tuple: Box, + }, } impl DataTypeNode { @@ -159,7 +164,7 @@ impl DataTypeNode { str if str.starts_with("Tuple") => parse_tuple(str), str if str.starts_with("Variant") => parse_variant(str), - str if str.starts_with("Nested") => Ok(Self::Nested(parse_nested(str)?)), + str if str.starts_with("Nested") => parse_nested(str), // ... str => Err(TypesError::TypeParsingError(format!( @@ -280,7 +285,7 @@ impl Display for DataTypeNode { MultiLineString => write!(f, "MultiLineString"), Polygon => write!(f, "Polygon"), MultiPolygon => write!(f, "MultiPolygon"), - Nested(columns) => { + Nested { columns, .. } => { write!(f, "Nested(")?; for (i, column) in columns.iter().enumerate() { if i > 0 { @@ -840,7 +845,7 @@ fn parse_enum_values_map(input: &str) -> Result, TypesError .collect::>()) } -fn parse_nested(mut input: &str) -> Result, TypesError> { +fn parse_nested(mut input: &str) -> Result { /// Removes the prefix `prefix` from `input`. fn parse_str(input: &mut &str, prefix: &str) -> Result<(), TypesError> { if input.starts_with(prefix) { @@ -901,6 +906,8 @@ fn parse_nested(mut input: &str) -> Result, TypesError> { parse_str(&mut input, "Nested(")?; let mut columns = Vec::new(); + let mut types = Vec::new(); + while !input.starts_with(')') { let name = parse_identifier(&mut input)?; parse_str(&mut input, " ")?; @@ -908,8 +915,9 @@ fn parse_nested(mut input: &str) -> Result, TypesError> { columns.push(Column { name: name.to_string(), - data_type, + data_type: data_type.clone(), }); + types.push(data_type); if input.starts_with(',') { parse_str(&mut input, ", ")?; @@ -923,7 +931,10 @@ fn parse_nested(mut input: &str) -> Result, TypesError> { } parse_str(&mut input, ")")?; - Ok(columns) + Ok(DataTypeNode::Nested { + columns, + as_tuple: Box::new(DataTypeNode::Tuple(types)), + }) } #[cfg(test)] @@ -1580,50 +1591,73 @@ mod tests { fn test_data_type_new_nested() { assert_eq!( DataTypeNode::new("Nested(foo UInt8)").unwrap(), - DataTypeNode::Nested(vec![Column::new("foo".to_string(), DataTypeNode::UInt8)]) + DataTypeNode::Nested { + columns: vec![Column::new("foo".to_string(), DataTypeNode::UInt8)], + as_tuple: Box::new(DataTypeNode::Tuple(vec![DataTypeNode::UInt8])), + } ); assert_eq!( DataTypeNode::new("Nested(foo UInt8, bar String)").unwrap(), - DataTypeNode::Nested(vec![ - Column::new("foo".to_string(), DataTypeNode::UInt8), - Column::new("bar".to_string(), DataTypeNode::String), - ]) + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("bar".to_string(), DataTypeNode::String), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![ + DataTypeNode::UInt8, + DataTypeNode::String, + ])), + } ); assert_eq!( DataTypeNode::new("Nested(foo UInt8, `bar` String)").unwrap(), - DataTypeNode::Nested(vec![ - Column::new("foo".to_string(), DataTypeNode::UInt8), - Column::new("bar".to_string(), DataTypeNode::String), - ]) + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("bar".to_string(), DataTypeNode::String), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![ + DataTypeNode::UInt8, + DataTypeNode::String, + ])), + } ); assert_eq!( DataTypeNode::new("Nested(foo UInt8, `b a r` String)").unwrap(), - DataTypeNode::Nested(vec![ - Column::new("foo".to_string(), DataTypeNode::UInt8), - Column::new("b a r".to_string(), DataTypeNode::String), - ]) + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("b a r".to_string(), DataTypeNode::String), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![ + DataTypeNode::UInt8, + DataTypeNode::String, + ])), + } ); + + let foo = DataTypeNode::Enum(EnumType::Enum8, HashMap::from([(1, "f\\'(".to_string())])); + let baz = DataTypeNode::Tuple(vec![DataTypeNode::Enum( + EnumType::Enum8, + HashMap::from([(1, "f\\'()".to_string())]), + )]); + let bar = DataTypeNode::Nested { + columns: vec![Column::new("baz".to_string(), baz.clone())], + as_tuple: Box::new(DataTypeNode::Tuple(vec![baz])), + }; + assert_eq!( DataTypeNode::new( - "Nested(foo Enum8('f\\'(' = 1), `b a r` Nested(bar Tuple(Enum8('f\\'()' = 1))))" + "Nested(foo Enum8('f\\'(' = 1), `b a r` Nested(baz Tuple(Enum8('f\\'()' = 1))))" ) .unwrap(), - DataTypeNode::Nested(vec![ - Column::new( - "foo".to_string(), - DataTypeNode::Enum(EnumType::Enum8, HashMap::from([(1, "f\\'(".to_string())]),) - ), - Column::new( - "b a r".to_string(), - DataTypeNode::Nested(vec![Column::new( - "bar".to_string(), - DataTypeNode::Tuple(vec![DataTypeNode::Enum( - EnumType::Enum8, - HashMap::from([(1, "f\\'()".to_string())]), - )]), - )]) - ), - ]) + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), foo.clone()), + Column::new("b a r".to_string(), bar.clone()), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![foo, bar])), + } ); assert!(DataTypeNode::new("Nested").is_err()); @@ -1744,10 +1778,16 @@ mod tests { "Variant(UInt8, Bool)" ); assert_eq!( - DataTypeNode::Nested(vec![ - Column::new("foo".to_string(), DataTypeNode::UInt8), - Column::new("bar".to_string(), DataTypeNode::String), - ]) + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("bar".to_string(), DataTypeNode::String), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![ + DataTypeNode::UInt8, + DataTypeNode::String + ])), + } .to_string(), "Nested(foo UInt8, bar String)" );