11use std:: collections:: hash_map:: Entry ;
22use std:: collections:: { BTreeMap , VecDeque } ;
33
4+ use base64:: prelude:: * ;
45use rustc_data_structures:: fx:: { FxHashMap , FxIndexMap } ;
56use rustc_middle:: ty:: TyCtxt ;
67use rustc_span:: def_id:: DefId ;
@@ -21,14 +22,14 @@ use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, Re
2122///
2223/// The `index` is a JSON-encoded list of names and other information.
2324///
24- /// The desc has newlined descriptions, split up by size into 1MiB shards.
25+ /// The desc has newlined descriptions, split up by size into 128KiB shards.
2526/// For example, `(4, "foo\nbar\nbaz\nquux")`.
2627pub ( crate ) struct SerializedSearchIndex {
2728 pub ( crate ) index : String ,
2829 pub ( crate ) desc : Vec < ( usize , String ) > ,
2930}
3031
31- const DESC_INDEX_SHARD_LEN : usize = 1024 * 1024 ;
32+ const DESC_INDEX_SHARD_LEN : usize = 128 * 1024 ;
3233
3334/// Builds the search index from the collected metadata
3435pub ( crate ) fn build_index < ' tcx > (
@@ -342,6 +343,8 @@ pub(crate) fn build_index<'tcx>(
342343 // A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
343344 // for information on the format.
344345 descindex : String ,
346+ // A list of items with no description. This is eventually turned into a bitmap.
347+ emptydesc : Vec < u32 > ,
345348 }
346349
347350 struct Paths {
@@ -456,7 +459,8 @@ pub(crate) fn build_index<'tcx>(
456459 }
457460
458461 if item. deprecation . is_some ( ) {
459- deprecated. push ( index) ;
462+ // bitmasks always use 1-indexing for items, with 0 as the crate itself
463+ deprecated. push ( u32:: try_from ( index + 1 ) . unwrap ( ) ) ;
460464 }
461465 }
462466
@@ -473,21 +477,37 @@ pub(crate) fn build_index<'tcx>(
473477 crate_data. serialize_field ( "i" , & parents) ?;
474478 crate_data. serialize_field ( "f" , & functions) ?;
475479 crate_data. serialize_field ( "D" , & self . descindex ) ?;
476- crate_data. serialize_field ( "c" , & deprecated) ?;
477480 crate_data. serialize_field ( "p" , & paths) ?;
478481 crate_data. serialize_field ( "b" , & self . associated_item_disambiguators ) ?;
482+ let mut buf = Vec :: new ( ) ;
483+ let mut strbuf = String :: new ( ) ;
484+ write_bitmap_to_bytes ( & deprecated, & mut buf) . unwrap ( ) ;
485+ BASE64_STANDARD . encode_string ( & buf, & mut strbuf) ;
486+ crate_data. serialize_field ( "c" , & strbuf) ?;
487+ strbuf. clear ( ) ;
488+ buf. clear ( ) ;
489+ write_bitmap_to_bytes ( & self . emptydesc , & mut buf) . unwrap ( ) ;
490+ BASE64_STANDARD . encode_string ( & buf, & mut strbuf) ;
491+ crate_data. serialize_field ( "e" , & strbuf) ?;
479492 if has_aliases {
480493 crate_data. serialize_field ( "a" , & self . aliases ) ?;
481494 }
482495 crate_data. end ( )
483496 }
484497 }
485498
486- let desc = {
499+ let ( emptydesc, desc) = {
500+ let mut emptydesc = Vec :: new ( ) ;
487501 let mut result = Vec :: new ( ) ;
488502 let mut set = String :: new ( ) ;
489503 let mut len: usize = 0 ;
504+ let mut itemindex: u32 = 0 ;
490505 for desc in std:: iter:: once ( & crate_doc) . chain ( crate_items. iter ( ) . map ( |item| & item. desc ) ) {
506+ if desc == "" {
507+ emptydesc. push ( itemindex) ;
508+ itemindex += 1 ;
509+ continue ;
510+ }
491511 if set. len ( ) >= DESC_INDEX_SHARD_LEN {
492512 result. push ( ( len, std:: mem:: replace ( & mut set, String :: new ( ) ) ) ) ;
493513 len = 0 ;
@@ -496,9 +516,10 @@ pub(crate) fn build_index<'tcx>(
496516 }
497517 set. push_str ( & desc) ;
498518 len += 1 ;
519+ itemindex += 1 ;
499520 }
500521 result. push ( ( len, std:: mem:: replace ( & mut set, String :: new ( ) ) ) ) ;
501- result
522+ ( emptydesc , result)
502523 } ;
503524
504525 let descindex = {
@@ -509,7 +530,10 @@ pub(crate) fn build_index<'tcx>(
509530 descindex
510531 } ;
511532
512- assert_eq ! ( crate_items. len( ) + 1 , desc. iter( ) . map( |( len, _) | * len) . sum:: <usize >( ) ) ;
533+ assert_eq ! (
534+ crate_items. len( ) + 1 ,
535+ desc. iter( ) . map( |( len, _) | * len) . sum:: <usize >( ) + emptydesc. len( )
536+ ) ;
513537
514538 // The index, which is actually used to search, is JSON
515539 // It uses `JSON.parse(..)` to actually load, since JSON
@@ -523,6 +547,7 @@ pub(crate) fn build_index<'tcx>(
523547 aliases: & aliases,
524548 associated_item_disambiguators: & associated_item_disambiguators,
525549 descindex,
550+ emptydesc,
526551 } )
527552 . expect( "failed serde conversion" )
528553 // All these `replace` calls are because we have to go through JS string for JSON content.
@@ -571,6 +596,200 @@ pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
571596 }
572597}
573598
599+ // checked against roaring-rs in
600+ // https://gitlab.com/notriddle/roaring-test
601+ pub fn write_bitmap_to_bytes ( domain : & [ u32 ] , mut out : impl std:: io:: Write ) -> std:: io:: Result < ( ) > {
602+ // https://arxiv.org/pdf/1603.06549.pdf
603+ let mut keys = Vec :: < u16 > :: new ( ) ;
604+ let mut containers = Vec :: < Container > :: new ( ) ;
605+ enum Container {
606+ /// number of ones, bits
607+ Bits ( Box < [ u64 ; 1024 ] > ) ,
608+ /// list of entries
609+ Array ( Vec < u16 > ) ,
610+ /// list of (start, len-1)
611+ Run ( Vec < ( u16 , u16 ) > ) ,
612+ }
613+ impl Container {
614+ fn popcount ( & self ) -> u32 {
615+ match self {
616+ Container :: Bits ( bits) => bits. iter ( ) . copied ( ) . map ( |x| x. count_ones ( ) ) . sum ( ) ,
617+ Container :: Array ( array) => {
618+ array. len ( ) . try_into ( ) . expect ( "array can't be bigger than 2**32" )
619+ }
620+ Container :: Run ( runs) => {
621+ runs. iter ( ) . copied ( ) . map ( |( _, lenm1) | u32:: from ( lenm1) + 1 ) . sum ( )
622+ }
623+ }
624+ }
625+ fn push ( & mut self , value : u16 ) {
626+ match self {
627+ Container :: Bits ( bits) => bits[ value as usize >> 6 ] |= 1 << ( value & 0x3F ) ,
628+ Container :: Array ( array) => {
629+ array. push ( value) ;
630+ if array. len ( ) >= 4096 {
631+ let array = std:: mem:: replace ( array, Vec :: new ( ) ) ;
632+ * self = Container :: Bits ( Box :: new ( [ 0 ; 1024 ] ) ) ;
633+ for value in array {
634+ self . push ( value) ;
635+ }
636+ }
637+ }
638+ Container :: Run ( runs) => {
639+ if let Some ( r) = runs. last_mut ( )
640+ && r. 0 + r. 1 + 1 == value
641+ {
642+ r. 1 += 1 ;
643+ } else {
644+ runs. push ( ( value, 0 ) ) ;
645+ }
646+ }
647+ }
648+ }
649+ fn try_make_run ( & mut self ) -> bool {
650+ match self {
651+ Container :: Bits ( bits) => {
652+ let mut r: u64 = 0 ;
653+ for ( i, chunk) in bits. iter ( ) . copied ( ) . enumerate ( ) {
654+ let next_chunk =
655+ i. checked_add ( 1 ) . and_then ( |i| bits. get ( i) ) . copied ( ) . unwrap_or ( 0 ) ;
656+ r += !chunk & u64:: from ( ( chunk << 1 ) . count_ones ( ) ) ;
657+ r += !next_chunk & u64:: from ( ( chunk >> 63 ) . count_ones ( ) ) ;
658+ }
659+ if ( 2 + 4 * r) < 8192 {
660+ let bits = std:: mem:: replace ( bits, Box :: new ( [ 0 ; 1024 ] ) ) ;
661+ * self = Container :: Run ( Vec :: new ( ) ) ;
662+ for ( i, bits) in bits. iter ( ) . copied ( ) . enumerate ( ) {
663+ if bits == 0 {
664+ continue ;
665+ }
666+ for j in 0 ..64 {
667+ let value = ( u16:: try_from ( i) . unwrap ( ) << 6 ) | j;
668+ if bits & ( 1 << j) != 0 {
669+ self . push ( value) ;
670+ }
671+ }
672+ }
673+ true
674+ } else {
675+ false
676+ }
677+ }
678+ Container :: Array ( array) if array. len ( ) <= 5 => false ,
679+ Container :: Array ( array) => {
680+ let mut r = 0 ;
681+ let mut prev = None ;
682+ for value in array. iter ( ) . copied ( ) {
683+ if value. checked_sub ( 1 ) != prev {
684+ r += 1 ;
685+ }
686+ prev = Some ( value) ;
687+ }
688+ if 2 + 4 * r < 2 * array. len ( ) + 2 {
689+ let array = std:: mem:: replace ( array, Vec :: new ( ) ) ;
690+ * self = Container :: Run ( Vec :: new ( ) ) ;
691+ for value in array {
692+ self . push ( value) ;
693+ }
694+ true
695+ } else {
696+ false
697+ }
698+ }
699+ Container :: Run ( _) => true ,
700+ }
701+ }
702+ }
703+ let mut key: u16 ;
704+ let mut domain_iter = domain. into_iter ( ) . copied ( ) . peekable ( ) ;
705+ let mut has_run = false ;
706+ while let Some ( entry) = domain_iter. next ( ) {
707+ key = ( entry >> 16 ) . try_into ( ) . expect ( "shifted off the top 16 bits, so it should fit" ) ;
708+ let value: u16 = ( entry & 0x00_00_FF_FF ) . try_into ( ) . expect ( "AND 16 bits, so it should fit" ) ;
709+ let mut container = Container :: Array ( vec ! [ value] ) ;
710+ while let Some ( entry) = domain_iter. peek ( ) . copied ( ) {
711+ let entry_key: u16 =
712+ ( entry >> 16 ) . try_into ( ) . expect ( "shifted off the top 16 bits, so it should fit" ) ;
713+ if entry_key != key {
714+ break ;
715+ }
716+ domain_iter. next ( ) . expect ( "peeking just succeeded" ) ;
717+ container
718+ . push ( ( entry & 0x00_00_FF_FF ) . try_into ( ) . expect ( "AND 16 bits, so it should fit" ) ) ;
719+ }
720+ keys. push ( key) ;
721+ has_run = container. try_make_run ( ) || has_run;
722+ containers. push ( container) ;
723+ }
724+ // https://github.com/RoaringBitmap/RoaringFormatSpec
725+ use byteorder:: { WriteBytesExt , LE } ;
726+ const SERIAL_COOKIE_NO_RUNCONTAINER : u32 = 12346 ;
727+ const SERIAL_COOKIE : u32 = 12347 ;
728+ const NO_OFFSET_THRESHOLD : u32 = 4 ;
729+ let size: u32 = containers. len ( ) . try_into ( ) . unwrap ( ) ;
730+ let start_offset = if has_run {
731+ out. write_u32 :: < LE > ( SERIAL_COOKIE | ( ( size - 1 ) << 16 ) ) ?;
732+ for set in containers. chunks ( 8 ) {
733+ let mut b = 0 ;
734+ for ( i, container) in set. iter ( ) . enumerate ( ) {
735+ if matches ! ( container, & Container :: Run ( ..) ) {
736+ b |= 1 << i;
737+ }
738+ }
739+ out. write_u8 ( b) ?;
740+ }
741+ if size < NO_OFFSET_THRESHOLD {
742+ 4 + 4 * size + ( ( size + 7 ) / 8 )
743+ } else {
744+ 4 + 8 * size + ( ( size + 7 ) / 8 )
745+ }
746+ } else {
747+ out. write_u32 :: < LE > ( SERIAL_COOKIE_NO_RUNCONTAINER ) ?;
748+ out. write_u32 :: < LE > ( containers. len ( ) . try_into ( ) . unwrap ( ) ) ?;
749+ 4 + 4 + 4 * size + 4 * size
750+ } ;
751+ for ( & key, container) in keys. iter ( ) . zip ( & containers) {
752+ // descriptive header
753+ let key: u32 = key. into ( ) ;
754+ let count: u32 = container. popcount ( ) - 1 ;
755+ out. write_u32 :: < LE > ( ( count << 16 ) | key) ?;
756+ }
757+ if !has_run || size >= NO_OFFSET_THRESHOLD {
758+ // offset header
759+ let mut starting_offset = start_offset;
760+ for container in & containers {
761+ out. write_u32 :: < LE > ( starting_offset) ?;
762+ starting_offset += match container {
763+ Container :: Bits ( _) => 8192u32 ,
764+ Container :: Array ( array) => u32:: try_from ( array. len ( ) ) . unwrap ( ) * 2 ,
765+ Container :: Run ( runs) => 2 + u32:: try_from ( runs. len ( ) ) . unwrap ( ) * 4 ,
766+ } ;
767+ }
768+ }
769+ for container in & containers {
770+ match container {
771+ Container :: Bits ( bits) => {
772+ for chunk in bits. iter ( ) {
773+ out. write_u64 :: < LE > ( * chunk) ?;
774+ }
775+ }
776+ Container :: Array ( array) => {
777+ for value in array. iter ( ) {
778+ out. write_u16 :: < LE > ( * value) ?;
779+ }
780+ }
781+ Container :: Run ( runs) => {
782+ out. write_u16 :: < LE > ( ( runs. len ( ) ) . try_into ( ) . unwrap ( ) ) ?;
783+ for ( start, lenm1) in runs. iter ( ) . copied ( ) {
784+ out. write_u16 :: < LE > ( start) ?;
785+ out. write_u16 :: < LE > ( lenm1) ?;
786+ }
787+ }
788+ }
789+ }
790+ Ok ( ( ) )
791+ }
792+
574793pub ( crate ) fn get_function_type_for_search < ' tcx > (
575794 item : & clean:: Item ,
576795 tcx : TyCtxt < ' tcx > ,
0 commit comments