@@ -17,12 +17,25 @@ use crate::html::format::join_with_double_colon;
1717use crate :: html:: markdown:: short_markdown_summary;
1818use crate :: html:: render:: { self , IndexItem , IndexItemFunctionType , RenderType , RenderTypeId } ;
1919
/// The serialized search index, with item descriptions split out into shards.
///
/// The searchable data lives in `index`; the (much larger) description text is
/// kept separately in `desc` so that it can be stored in pieces.
pub(crate) struct SerializedSearchIndex {
    /// JSON-encoded list of names and other per-item information.
    pub(crate) index: String,
    /// Description shards, in order.
    ///
    /// Each entry is `(count, text)`: `text` holds `count` descriptions joined
    /// with `'\n'`, for example `(4, "foo\nbar\nbaz\nquux")`. A shard is closed
    /// once its text has grown to roughly 1MiB.
    pub(crate) desc: Vec<(usize, String)>,
}
30+
/// Size threshold, in bytes (1MiB), at which a description shard is treated as
/// full: once the accumulated shard text reaches this length, the next
/// description starts a new shard.
const DESC_INDEX_SHARD_LEN: usize = 1 << 20;
32+
2033/// Builds the search index from the collected metadata
2134pub ( crate ) fn build_index < ' tcx > (
2235 krate : & clean:: Crate ,
2336 cache : & mut Cache ,
2437 tcx : TyCtxt < ' tcx > ,
25- ) -> String {
38+ ) -> SerializedSearchIndex {
2639 let mut itemid_to_pathid = FxHashMap :: default ( ) ;
2740 let mut primitives = FxHashMap :: default ( ) ;
2841 let mut associated_types = FxHashMap :: default ( ) ;
@@ -318,7 +331,6 @@ pub(crate) fn build_index<'tcx>(
318331 . collect :: < Vec < _ > > ( ) ;
319332
320333 struct CrateData < ' a > {
321- doc : String ,
322334 items : Vec < & ' a IndexItem > ,
323335 paths : Vec < ( ItemType , Vec < Symbol > ) > ,
324336 // The String is alias name and the vec is the list of the elements with this alias.
@@ -327,6 +339,9 @@ pub(crate) fn build_index<'tcx>(
327339 aliases : & ' a BTreeMap < String , Vec < usize > > ,
328340 // Used when a type has more than one impl with an associated item with the same name.
329341 associated_item_disambiguators : & ' a Vec < ( usize , String ) > ,
342+ // A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
343+ // for information on the format.
344+ descindex : String ,
330345 }
331346
332347 struct Paths {
@@ -408,7 +423,6 @@ pub(crate) fn build_index<'tcx>(
408423 let mut names = Vec :: with_capacity ( self . items . len ( ) ) ;
409424 let mut types = String :: with_capacity ( self . items . len ( ) ) ;
410425 let mut full_paths = Vec :: with_capacity ( self . items . len ( ) ) ;
411- let mut descriptions = Vec :: with_capacity ( self . items . len ( ) ) ;
412426 let mut parents = Vec :: with_capacity ( self . items . len ( ) ) ;
413427 let mut functions = String :: with_capacity ( self . items . len ( ) ) ;
414428 let mut deprecated = Vec :: with_capacity ( self . items . len ( ) ) ;
@@ -431,7 +445,6 @@ pub(crate) fn build_index<'tcx>(
431445 parents. push ( item. parent_idx . map ( |x| x + 1 ) . unwrap_or ( 0 ) ) ;
432446
433447 names. push ( item. name . as_str ( ) ) ;
434- descriptions. push ( & item. desc ) ;
435448
436449 if !item. path . is_empty ( ) {
437450 full_paths. push ( ( index, & item. path ) ) ;
@@ -454,14 +467,12 @@ pub(crate) fn build_index<'tcx>(
454467 let has_aliases = !self . aliases . is_empty ( ) ;
455468 let mut crate_data =
456469 serializer. serialize_struct ( "CrateData" , if has_aliases { 9 } else { 8 } ) ?;
457- crate_data. serialize_field ( "doc" , & self . doc ) ?;
458470 crate_data. serialize_field ( "t" , & types) ?;
459471 crate_data. serialize_field ( "n" , & names) ?;
460- // Serialize as an array of item indices and full paths
461472 crate_data. serialize_field ( "q" , & full_paths) ?;
462- crate_data. serialize_field ( "d" , & descriptions) ?;
463473 crate_data. serialize_field ( "i" , & parents) ?;
464474 crate_data. serialize_field ( "f" , & functions) ?;
475+ crate_data. serialize_field ( "D" , & self . descindex ) ?;
465476 crate_data. serialize_field ( "c" , & deprecated) ?;
466477 crate_data. serialize_field ( "p" , & paths) ?;
467478 crate_data. serialize_field ( "b" , & self . associated_item_disambiguators ) ?;
@@ -472,24 +483,92 @@ pub(crate) fn build_index<'tcx>(
472483 }
473484 }
474485
475- // Collect the index into a string
476- format ! (
486+ let desc = {
487+ let mut result = Vec :: new ( ) ;
488+ let mut set = String :: new ( ) ;
489+ let mut len: usize = 0 ;
490+ for desc in std:: iter:: once ( & crate_doc) . chain ( crate_items. iter ( ) . map ( |item| & item. desc ) ) {
491+ if set. len ( ) >= DESC_INDEX_SHARD_LEN {
492+ result. push ( ( len, std:: mem:: replace ( & mut set, String :: new ( ) ) ) ) ;
493+ len = 0 ;
494+ } else if len != 0 {
495+ set. push ( '\n' ) ;
496+ }
497+ set. push_str ( & desc) ;
498+ len += 1 ;
499+ }
500+ result. push ( ( len, std:: mem:: replace ( & mut set, String :: new ( ) ) ) ) ;
501+ result
502+ } ;
503+
504+ let descindex = {
505+ let mut descindex = String :: with_capacity ( desc. len ( ) * 4 ) ;
506+ for & ( len, _) in desc. iter ( ) {
507+ write_vlqhex_to_string ( len. try_into ( ) . unwrap ( ) , & mut descindex) ;
508+ }
509+ descindex
510+ } ;
511+
512+ assert_eq ! ( crate_items. len( ) + 1 , desc. iter( ) . map( |( len, _) | * len) . sum:: <usize >( ) ) ;
513+
514+ // The index, which is actually used to search, is JSON
515+ // It uses `JSON.parse(..)` to actually load, since JSON
516+ // parses faster than the full JavaScript syntax.
517+ let index = format ! (
477518 r#"["{}",{}]"# ,
478519 krate. name( tcx) ,
479520 serde_json:: to_string( & CrateData {
480- doc: crate_doc,
481521 items: crate_items,
482522 paths: crate_paths,
483523 aliases: & aliases,
484524 associated_item_disambiguators: & associated_item_disambiguators,
525+ descindex,
485526 } )
486527 . expect( "failed serde conversion" )
487528 // All these `replace` calls are because we have to go through JS string for JSON content.
488529 . replace( '\\' , r"\\" )
489530 . replace( '\'' , r"\'" )
490531 // We need to escape double quotes for the JSON.
491532 . replace( "\\ \" " , "\\ \\ \" " )
492- )
533+ ) ;
534+ SerializedSearchIndex { index, desc }
535+ }
536+
/// Appends `n` to `string` as a self-terminating, zig-zag encoded hex number.
///
/// The number is first zig-zag encoded: the magnitude is shifted left by one
/// bit and the sign is stored in the lowest bit (`1` for negative). The result
/// is then written most-significant hexit first, without leading zeroes:
///
/// * every hexit except the last is written as `'@' + hexit` (`@`, `A`..`O`);
/// * the last hexit is written as `` '`' + hexit `` (`` ` ``, `a`..`o`), which
///   is what terminates the number.
///
/// Consequently `0` is `` ` ``, `1` is `b`, `-1` is `c` (`a` would be the
/// imaginary `-0`), `8` is `` A` `` and `-8` is `Aa`.
///
/// <https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html>
/// describes the encoding in more detail.
///
/// # Panics
///
/// Panics if `n` is `i32::MIN`, whose magnitude cannot be negated as an `i32`.
pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
    let is_negative = n < 0;
    let magnitude: u32 =
        if is_negative { (-n).try_into().unwrap() } else { n.try_into().unwrap() };
    // Zig-zag: the sign bit becomes the least significant bit.
    let zigzag: u32 = (magnitude << 1) | u32::from(is_negative);

    // Number of hexits needed once leading zero hexits are dropped. At least
    // one hexit is always emitted so that zero still yields a terminator.
    let significant_bits = 32 - zigzag.leading_zeros();
    let hexit_count = std::cmp::max((significant_bits + 3) / 4, 1);

    for position in (0..hexit_count).rev() {
        let hexit = (zigzag >> (position * 4)) & 0xF;
        // Only the final (least significant) hexit uses the lowercase range.
        let base = if position == 0 { '`' } else { '@' };
        string.push(char::from_u32(base as u32 + hexit).unwrap());
    }
}
494573
495574pub ( crate ) fn get_function_type_for_search < ' tcx > (
0 commit comments