diff --git a/src/rowbinary/validation.rs b/src/rowbinary/validation.rs index 76f54493..410708f1 100644 --- a/src/rowbinary/validation.rs +++ b/src/rowbinary/validation.rs @@ -548,6 +548,10 @@ fn validate_impl<'serde, 'caller, R: Row>( root, kind: InnerDataTypeValidatorKind::Array(&DataTypeNode::LineString), }), + DataTypeNode::Nested { as_tuple, .. } => Some(InnerDataTypeValidator { + root, + kind: InnerDataTypeValidatorKind::Array(as_tuple), + }), _ => root.panic_on_schema_mismatch(data_type, serde_type, is_inner), }, SerdeType::Tuple(len) => match data_type { diff --git a/tests/it/nested.rs b/tests/it/nested.rs index 7a78eb58..fb8c591e 100644 --- a/tests/it/nested.rs +++ b/tests/it/nested.rs @@ -50,3 +50,110 @@ async fn smoke() { assert_eq!(row, original_row); } + +#[tokio::test] +async fn no_flatten() { + let client = prepare_database!().with_option("flatten_nested", "0"); + + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Row)] + struct MyRow { + no: i32, + items: Vec<(String, u32)>, + } + + // `flatten_nested = 0` prevents flattening of nested columns, causing them to be stored as a + // single array of tuples instead of as separate arrays + + client + .query( + " + CREATE TABLE test( + no Int32, + items Nested( + name String, + count UInt32 + ) + ) + ENGINE = MergeTree ORDER BY no + ", + ) + .execute() + .await + .unwrap(); + + let original_row = MyRow { + no: 42, + items: vec![("foo".into(), 1), ("bar".into(), 5)], + }; + + let mut insert = client.insert::("test").await.unwrap(); + insert.write(&original_row).await.unwrap(); + insert.end().await.unwrap(); + + let row = client + .query("SELECT ?fields FROM test") + .fetch_one::() + .await + .unwrap(); + + assert_eq!(row, original_row); +} + +#[tokio::test] +async fn doubly_flattened() { + let client = prepare_database!(); + + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Row)] + struct MyRow { + no: i32, + #[serde(rename = "items.names")] + items_names: Vec>, + #[serde(rename = "items.count")] + items_count: Vec, + } + + // Only the first level is flattened and any more deeply nested columns are stored as an array + // of tuples, so the table ends up with columns + // - `no Int32` + // - `items.names Array(Nested(first String, last String))` + // (i.e. `Array(Array(Tuple(first String, last String)))`) + // - `items.count Array(UInt32)` + + client + .query( + " + CREATE TABLE test( + no Int32, + items Nested( + names Nested(first String, last String), + count UInt32 + ) + ) + ENGINE = MergeTree ORDER BY no + ", + ) + .execute() + .await + .unwrap(); + + let original_row = MyRow { + no: 42, + items_names: vec![ + vec![("foo".into(), "foo".into())], + vec![("bar".into(), "bar".into())], + ], + items_count: vec![1, 5], + }; + + let mut insert = client.insert::("test").await.unwrap(); + insert.write(&original_row).await.unwrap(); + insert.end().await.unwrap(); + + let row = client + .query("SELECT ?fields FROM test") + .fetch_one::() + .await + .unwrap(); + + assert_eq!(row, original_row); +} diff --git a/types/src/data_types.rs b/types/src/data_types.rs index c88521b4..e3386998 100644 --- a/types/src/data_types.rs +++ b/types/src/data_types.rs @@ -101,6 +101,13 @@ pub enum DataTypeNode { MultiLineString, Polygon, MultiPolygon, + + Nested { + columns: Vec, + // This stores the types in `columns` as a tuple node as a hack to be able to validate + // data for this column as an array of tuples + as_tuple: Box, + }, } impl DataTypeNode { @@ -157,6 +164,8 @@ impl DataTypeNode { str if str.starts_with("Tuple") => parse_tuple(str), str if str.starts_with("Variant") => parse_variant(str), + str if str.starts_with("Nested") => parse_nested(str), + // ... str => Err(TypesError::TypeParsingError(format!( "Unknown data type: {str}" @@ -276,6 +285,16 @@ impl Display for DataTypeNode { MultiLineString => write!(f, "MultiLineString"), Polygon => write!(f, "Polygon"), MultiPolygon => write!(f, "MultiPolygon"), + Nested { columns, .. } => { + write!(f, "Nested(")?; + for (i, column) in columns.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{} {}", column.name, column.data_type)?; + } + write!(f, ")") + } } } } @@ -663,66 +682,73 @@ fn parse_variant(input: &str) -> Result { /// let input1 = "Tuple(Enum8('f\'()' = 1))"; // the result is `f\'()` /// let input2 = "Tuple(Enum8('(' = 1))"; // the result is `(` /// ``` -fn parse_inner_types(input: &str) -> Result, TypesError> { +fn parse_inner_types(mut input: &str) -> Result, TypesError> { let mut inner_types: Vec = Vec::new(); + while !input.is_empty() { + inner_types.push(parse_inner_type(&mut input)?); + + if !input.starts_with(',') { + break; + } + + input = &input[1..]; + + if input.starts_with(' ') { + input = &input[1..]; + } + } + + if !input.is_empty() { + return Err(TypesError::TypeParsingError(format!( + "Failed to parse all inner types, remaining input: {input}" + ))); + } + + Ok(inner_types) +} + +/// Parses a single type from the input string and removes it from the input. +fn parse_inner_type(input: &mut &str) -> Result { let input_bytes = input.as_bytes(); let mut open_parens = 0; let mut quote_open = false; let mut char_escaped = false; - let mut last_element_index = 0; let mut i = 0; while i < input_bytes.len() { - if char_escaped { - char_escaped = false; - } else if input_bytes[i] == b'\\' { - char_escaped = true; - } else if input_bytes[i] == b'\'' { - quote_open = !quote_open; // unescaped quote - } else if !quote_open { - if input_bytes[i] == b'(' { - open_parens += 1; - } else if input_bytes[i] == b')' { - open_parens -= 1; - } else if input_bytes[i] == b',' && open_parens == 0 { - let data_type_str = String::from_utf8(input_bytes[last_element_index..i].to_vec()) - .map_err(|_| { - TypesError::TypeParsingError(format!( - "Invalid UTF-8 sequence in input for the inner data type: {}", - &input[last_element_index..] - )) - })?; - let data_type = DataTypeNode::new(&data_type_str)?; - inner_types.push(data_type); - // Skip ', ' (comma and space) - if i + 2 <= input_bytes.len() && input_bytes[i + 1] == b' ' { - i += 2; - } else { - i += 1; - } - last_element_index = i; - continue; // Skip the normal increment at the end of the loop - } + match input_bytes[i] { + _ if char_escaped => char_escaped = false, + b'\\' if quote_open => char_escaped = true, + b'\'' => quote_open = !quote_open, // unescaped quote + byte if !quote_open => match byte { + b'(' => open_parens += 1, + b',' | b')' if open_parens == 0 => break, + b')' => open_parens -= 1, + _ => {} + }, + _ => {} } i += 1; } - // Push the remaining part of the type if it seems to be valid (at least all parentheses are closed) - if open_parens == 0 && last_element_index < input_bytes.len() { - let data_type_str = - String::from_utf8(input_bytes[last_element_index..].to_vec()).map_err(|_| { - TypesError::TypeParsingError(format!( - "Invalid UTF-8 sequence in input for the inner data type: {}", - &input[last_element_index..] - )) - })?; - let data_type = DataTypeNode::new(&data_type_str)?; - inner_types.push(data_type); + let data_type_str = String::from_utf8(input_bytes[..i].to_vec()).map_err(|_| { + TypesError::TypeParsingError(format!( + "Invalid UTF-8 sequence in input for the inner data type: {}", + &input[..i] + )) + })?; + + if open_parens != 0 || quote_open || char_escaped { + return Err(TypesError::TypeParsingError(format!( + "Invalid inner data type: {}", + &input[..i] + ))); } - Ok(inner_types) + *input = &input[i..]; + DataTypeNode::new(&data_type_str) } #[inline] @@ -819,6 +845,98 @@ fn parse_enum_values_map(input: &str) -> Result, TypesError .collect::>()) } +fn parse_nested(mut input: &str) -> Result { + /// Removes the prefix `prefix` from `input`. + fn parse_str(input: &mut &str, prefix: &str) -> Result<(), TypesError> { + if input.starts_with(prefix) { + *input = &input[prefix.len()..]; + Ok(()) + } else { + Err(TypesError::TypeParsingError(format!( + "Expected {prefix:?}, got {input:?}" + ))) + } + } + + /// Removes and returns the prefix of `input` up to the first character that does not match the + /// predicate. + fn parse_while<'a>(input: &mut &'a str, predicate: impl Fn(char) -> bool) -> &'a str { + let index = input + .char_indices() + .find(|(_, c)| !predicate(*c)) + .map(|(i, _)| i) + .unwrap_or(input.len()); + let (prefix, rest) = input.split_at(index); + *input = rest; + prefix + } + + /// Removes and returns a valid identifier from the start of `input`. + fn parse_identifier<'a>(input: &mut &'a str) -> Result<&'a str, TypesError> { + let original_input = *input; + + if input.starts_with('`') { + parse_str(input, "`")?; + let mut is_escaping = false; + + for (index, char) in input.char_indices() { + match char { + _ if is_escaping => is_escaping = false, + '\\' => is_escaping = true, + '`' => { + let name = &input[..index]; + *input = &input[index + 1..]; + return Ok(name); + } + _ => {} + } + } + + Err(TypesError::TypeParsingError(format!( + "Unclosed backtick in name: {original_input}" + ))) + } else { + Ok(parse_while(input, |c| { + c.is_ascii_alphanumeric() || c == '_' + })) + } + } + + let original_input = input; + parse_str(&mut input, "Nested(")?; + + let mut columns = Vec::new(); + let mut types = Vec::new(); + + while !input.starts_with(')') { + let name = parse_identifier(&mut input)?; + parse_str(&mut input, " ")?; + let data_type = parse_inner_type(&mut input)?; + + columns.push(Column { + name: name.to_string(), + data_type: data_type.clone(), + }); + types.push(data_type); + + if input.starts_with(',') { + parse_str(&mut input, ", ")?; + } + } + + if columns.is_empty() { + return Err(TypesError::TypeParsingError(format!( + "Expected at least one column in Nested from input {original_input}" + ))); + } + + parse_str(&mut input, ")")?; + Ok(DataTypeNode::Nested { + columns, + as_tuple: Box::new(DataTypeNode::Tuple(types)), + }) +} + #[cfg(test)] mod tests { use super::*; @@ -1469,6 +1587,88 @@ mod tests { ); } + #[test] + fn test_data_type_new_nested() { + assert_eq!( + DataTypeNode::new("Nested(foo UInt8)").unwrap(), + DataTypeNode::Nested { + columns: vec![Column::new("foo".to_string(), DataTypeNode::UInt8)], + as_tuple: Box::new(DataTypeNode::Tuple(vec![DataTypeNode::UInt8])), + } + ); + assert_eq!( + DataTypeNode::new("Nested(foo UInt8, bar String)").unwrap(), + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("bar".to_string(), DataTypeNode::String), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![ + DataTypeNode::UInt8, + DataTypeNode::String, + ])), + } + ); + assert_eq!( + DataTypeNode::new("Nested(foo UInt8, `bar` String)").unwrap(), + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("bar".to_string(), DataTypeNode::String), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![ + DataTypeNode::UInt8, + DataTypeNode::String, + ])), + } + ); + assert_eq!( + DataTypeNode::new("Nested(foo UInt8, `b a r` String)").unwrap(), + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("b a r".to_string(), DataTypeNode::String), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![ + DataTypeNode::UInt8, + DataTypeNode::String, + ])), + } + ); + + let foo = DataTypeNode::Enum(EnumType::Enum8, HashMap::from([(1, "f\\'(".to_string())])); + let baz = DataTypeNode::Tuple(vec![DataTypeNode::Enum( + EnumType::Enum8, + HashMap::from([(1, "f\\'()".to_string())]), + )]); + let bar = DataTypeNode::Nested { + columns: vec![Column::new("baz".to_string(), baz.clone())], + as_tuple: Box::new(DataTypeNode::Tuple(vec![baz])), + }; + + assert_eq!( + DataTypeNode::new( + "Nested(foo Enum8('f\\'(' = 1), `b a r` Nested(baz Tuple(Enum8('f\\'()' = 1))))" + ) + .unwrap(), + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), foo.clone()), + Column::new("b a r".to_string(), bar.clone()), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![foo, bar])), + } + ); + + assert!(DataTypeNode::new("Nested").is_err()); + assert!(DataTypeNode::new("Nested(").is_err()); + assert!(DataTypeNode::new("Nested()").is_err()); + assert!(DataTypeNode::new("Nested(,)").is_err()); + assert!(DataTypeNode::new("Nested(String)").is_err()); + assert!(DataTypeNode::new("Nested(Int32, String)").is_err()); + assert!(DataTypeNode::new("Nested(foo Int32, String)").is_err()); + } + #[test] fn test_data_type_to_string_simple() { // Simple types @@ -1577,6 +1777,20 @@ mod tests { DataTypeNode::Variant(vec![DataTypeNode::UInt8, DataTypeNode::Bool]).to_string(), "Variant(UInt8, Bool)" ); + assert_eq!( + DataTypeNode::Nested { + columns: vec![ + Column::new("foo".to_string(), DataTypeNode::UInt8), + Column::new("bar".to_string(), DataTypeNode::String), + ], + as_tuple: Box::new(DataTypeNode::Tuple(vec![ + DataTypeNode::UInt8, + DataTypeNode::String + ])), + } + .to_string(), + "Nested(foo UInt8, bar String)" + ); } #[test]