@@ -11,26 +11,62 @@ use crate::tokenizer::CharRef;
1111
1212include ! ( concat!( env!( "OUT_DIR" ) , "/named_entities_graph.rs" ) ) ;
1313
/// A single node in the DAFSA.
///
/// For memory efficiency reasons, this is packed in 32 bits. The memory
/// representation is as follows, from the most significant bit down:
/// * 8 bits (31..=24): code point
/// * 8 bits (23..=16): hash value
/// * 1 bit (15): set when the node is terminal
/// * 1 bit (14): set when the node is the last child in its run of siblings
/// * 2 bits (13..=12): never set by `Node::new` and ignored by the accessors
/// * 12 bits (11..=0): index of the first child (`0` marks the child
///   iterator as already done)
#[derive(Clone, Copy, Debug)]
pub(crate) struct Node(u32);
2221
2322impl Node {
/// Flag bit (bit 15) that is set when the node is terminal.
const IS_TERMINAL: u32 = 1 << 15;
/// Flag bit (bit 14) that is set on the last node of a run of siblings;
/// the child iterator stops after yielding a node that carries it.
const IS_LAST_CHILD: u32 = 1 << 14;
25+
26+ pub ( crate ) const fn new (
27+ code_point : u8 ,
28+ hash_value : u8 ,
29+ is_terminal : bool ,
30+ is_last_child : bool ,
31+ first_child_index : u16 ,
32+ ) -> Self {
33+ let mut value = 0 ;
34+ value |= ( code_point as u32 ) << 24 ;
35+ value |= ( hash_value as u32 ) << 16 ;
36+
37+ if is_terminal {
38+ value |= Self :: IS_TERMINAL ;
39+ }
40+
41+ if is_last_child {
42+ value |= Self :: IS_LAST_CHILD ;
43+ }
44+
45+ assert ! ( first_child_index <= 0xFFF ) ;
46+
47+ value |= first_child_index as u32 ;
48+
49+ Self ( value)
50+ }
51+
2452 pub ( crate ) const fn code_point ( & self ) -> u8 {
25- self . code_point
53+ ( self . 0 >> 24 ) as u8
2654 }
2755
28- pub ( crate ) const fn num_nodes ( & self ) -> usize {
29- self . num_nodes as usize
56+ pub ( crate ) const fn hash_value ( & self ) -> usize {
57+ ( ( self . 0 >> 16 ) & 0xFF ) as usize
3058 }
3159
3260 pub ( crate ) const fn is_terminal ( & self ) -> bool {
33- self . is_terminal
61+ ( self . 0 & Self :: IS_TERMINAL ) != 0
62+ }
63+
64+ const fn is_last_child ( & self ) -> bool {
65+ ( self . 0 & Self :: IS_LAST_CHILD ) != 0
66+ }
67+
68+ const fn first_child_index ( & self ) -> u16 {
69+ ( self . 0 & 0xFFF ) as u16
3470 }
3571
3672 pub ( crate ) fn children ( & self ) -> impl Iterator < Item = & ' static Node > {
@@ -49,51 +85,22 @@ impl Node {
4985 let node = & DAFSA_NODES [ self . index ] ;
5086 self . index += 1 ;
5187
52- if node. is_last_child {
88+ if node. is_last_child ( ) {
5389 self . done = true ;
5490 }
5591
5692 Some ( node)
5793 }
5894 }
5995
96+ let first_child_index = self . first_child_index ( ) ;
6097 ChildIterator {
61- index : self . first_child_index ,
62- done : self . first_child_index == 0 ,
98+ index : first_child_index as usize ,
99+ done : first_child_index == 0 ,
63100 }
64101 }
65102}
66103
67- // fn compute_unique_index(input: &str) -> Option<usize> {
68- // debug_assert!(input.is_ascii());
69-
70- // let mut index = 0;
71- // let mut current = &DAFSA_NODES[0];
72- // for code_point in input.as_bytes() {
73- // let mut next_node = None;
74- // for child in current.children() {
75- // if child.code_point == *code_point {
76- // next_node = Some(child);
77- // break;
78- // } else {
79- // index += child.num_nodes as usize;
80- // }
81- // }
82-
83- // current = next_node?;
84-
85- // if current.is_terminal {
86- // index += 1;
87- // }
88- // }
89-
90- // if current.is_terminal {
91- // Some(index)
92- // } else {
93- // None
94- // }
95- // }
96-
97104pub ( crate ) fn resolve_unique_hash_value ( value : usize ) -> CharRef {
98105 let ( first, second) = REFERENCES [ value] ;
99106
0 commit comments