@@ -6,6 +6,42 @@ static int bstrcmp(const void *l, const void *r)
66 return strcmp (l , (* (const char * * )r ));
77}
88
/**
 * Reports whether the byte at `s` begins a character that counts towards
 * the length limit of utf_strncmp().
 *
 * Bytes that do NOT count:
 *  - UTF-8 continuation bytes (0x80-0xbf),
 *  - the lead bytes 0xcc and 0xcd, which are treated as the start of an
 *    accent/combining mark,
 *  - the lead byte of a Hiragana sound mark (e3 82 99 .. e3 82 9f).
 *
 * @param s Pointer into a NUL-terminated string.
 * @return Non-zero if the byte starts a counted character, 0 otherwise.
 */
static int utf_starts_char(const unsigned char *s)
{
    const unsigned char lead = s[0];

    if (lead < 0x80) {
        return 1; /* plain ASCII (this includes the terminator) */
    }
    if (lead <= 0xbf) {
        return 0; /* continuation byte */
    }
    if (lead == 0xcc || lead == 0xcd) {
        return 0; /* accent / combining mark */
    }
    /* The && chain stops at the first mismatch, so a terminator at s[1]
     * prevents s[2] from being read. */
    if (lead == 0xe3 && s[1] == 0x82 && s[2] >= 0x99 && s[2] <= 0x9f) {
        return 0; /* Hiragana sound mark */
    }
    return 1;
}

/**
 * Does a strncmp on utf8 strings, where `n` is a number of characters
 * rather than bytes. Bytes are compared one by one, but continuation
 * bytes and accent/sound mark characters are compared without being
 * counted towards `n`. Characters are counted along `s1` only.
 *
 * This is not guaranteed to work for all utf8 strings but is supposed to
 * work for the bip39 word lists in libwally.
 *
 * @param s1 First NUL-terminated string; its characters are the ones counted.
 * @param s2 Second NUL-terminated string.
 * @param n  Maximum number of characters to compare. With n == 0 the
 *           strings always compare equal.
 * @return 0 if the first `n` characters match (or both strings are equal
 *         and shorter than `n` characters); otherwise the difference of
 *         the first mismatching bytes, taken as unsigned char.
 */
int utf_strncmp(const char *s1, const char *s2, size_t n)
{
    const unsigned char *a = (const unsigned char *)s1;
    const unsigned char *b = (const unsigned char *)s2;
    size_t pos = 0;
    size_t counted = 0;

    while (a[pos] != '\0') {
        if (utf_starts_char(a + pos)) {
            /* A new character begins after n characters already matched:
             * the prefix comparison is complete. Checking before the
             * increment avoids stepping the index backwards, which
             * underflowed when n was 0. */
            if (counted == n) {
                return 0;
            }
            ++counted;
        }
        /* A terminator in s2 is caught here as an ordinary mismatch, so
         * s2 is never read past its end. */
        if (a[pos] != b[pos]) {
            return (int)a[pos] - (int)b[pos];
        }
        ++pos;
    }

    /* s1 ended after exactly n matching characters: whatever follows in
     * s2 lies beyond the compared prefix. */
    if (counted == n) {
        return 0;
    }
    return (int)a[pos] - (int)b[pos];
}
40+
/*
 * bsearch() comparator: `l` is the search key (a C string) and `r` points
 * at an array element holding a C string pointer. The two are compared
 * with utf_strncmp() limited to 4 characters.
 */
static int bstr4cmp(const void *l, const void *r)
{
    const char *key = l;
    const char *const *element = r;

    return utf_strncmp(key, *element, 4);
}
44+
945/* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious */
1046static int get_bits (size_t n )
1147{
@@ -72,13 +108,14 @@ size_t wordlist_lookup_word(const struct words *w, const char *word)
72108 const size_t size = sizeof (const char * );
73109 const char * * found = NULL ;
74110
75- if (w -> sorted )
76- found = (const char * * )bsearch (word , w -> indices , w -> len , size , bstrcmp );
77- else {
111+ if (w -> sorted ) {
112+ found = (const char * * )bsearch (word , w -> indices , w -> len , size , bstr4cmp );
113+ } else {
78114 size_t i ;
79115 for (i = 0 ; i < w -> len && !found ; ++ i )
80- if (!strcmp (word , w -> indices [i ]))
116+ if (!utf_strncmp (word , w -> indices [i ], 4 )) {
81117 found = w -> indices + i ;
118+ }
82119 }
83120 return found ? found - w -> indices + 1u : 0u ;
84121}
0 commit comments