Skip to content

Commit 32daa49

Browse files
committed
Add awareness of uniqueness of words in word lists
This is needed to have good guarantees for the Mnemonic::guess_language method, which can therefore be renamed to Mnemonic::language_of.
1 parent 6179d29 commit 32daa49

File tree

2 files changed

+123
-27
lines changed

2 files changed

+123
-27
lines changed

src/language/mod.rs

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,30 @@ pub enum Language {
5454
}
5555

5656
impl Language {
57+
/// The list of supported languages.
58+
/// Language support is managed by compile features.
59+
pub fn all() -> &'static [Language] {
60+
&[
61+
Language::English,
62+
#[cfg(feature = "chinese-simplified")]
63+
Language::SimplifiedChinese,
64+
#[cfg(feature = "chinese-traditional")]
65+
Language::TraditionalChinese,
66+
#[cfg(feature = "czech")]
67+
Language::Czech,
68+
#[cfg(feature = "french")]
69+
Language::French,
70+
#[cfg(feature = "italian")]
71+
Language::Italian,
72+
#[cfg(feature = "japanese")]
73+
Language::Japanese,
74+
#[cfg(feature = "korean")]
75+
Language::Korean,
76+
#[cfg(feature = "spanish")]
77+
Language::Spanish,
78+
]
79+
}
80+
5781
/// The word list for this language.
5882
#[inline]
5983
pub(crate) fn word_list(self) -> &'static [&'static str; 2048] {
@@ -78,6 +102,31 @@ impl Language {
78102
}
79103
}
80104

105+
/// Returns true if all words in the list are guaranteed to
106+
/// only be in this list and not in any other.
107+
#[inline]
108+
pub(crate) fn unique_words(self) -> bool {
109+
match self {
110+
Language::English => false,
111+
#[cfg(feature = "chinese-simplified")]
112+
Language::SimplifiedChinese => false,
113+
#[cfg(feature = "chinese-traditional")]
114+
Language::TraditionalChinese => false,
115+
#[cfg(feature = "czech")]
116+
Language::Czech => true,
117+
#[cfg(feature = "french")]
118+
Language::French => false,
119+
#[cfg(feature = "italian")]
120+
Language::Italian => true,
121+
#[cfg(feature = "japanese")]
122+
Language::Japanese => true,
123+
#[cfg(feature = "korean")]
124+
Language::Korean => true,
125+
#[cfg(feature = "spanish")]
126+
Language::Spanish => true,
127+
}
128+
}
129+
81130
/// Get words from the word list that start with the given prefix.
82131
pub fn words_by_prefix(self, prefix: &str) -> &[&'static str] {
83132
// The words in the word list are ordered lexicographically. This means
@@ -170,4 +219,36 @@ mod tests {
170219
let res = lang.words_by_prefix("woof");
171220
assert!(res.is_empty());
172221
}
222+
223+
#[cfg(all(
224+
feature = "chinese-simplified", feature = "chinese-traditional", feature = "czech",
225+
feature = "french", feature = "italian", feature = "japanese", feature = "korean",
226+
feature = "spanish"
227+
))]
228+
#[test]
229+
fn words_overlaps() {
230+
use std::collections::HashMap;
231+
232+
// We keep a map of all words and the languages they occur in.
233+
// Afterwards, we make sure that no word maps to multiple languages
234+
// if either of those is guaranteed to have unique words.
235+
let mut words: HashMap<&str, Vec<Language>> = HashMap::new();
236+
for lang in Language::all().iter() {
237+
for word in lang.word_list().iter() {
238+
words.entry(word).or_insert(Vec::new()).push(*lang);
239+
}
240+
}
241+
242+
let mut ok = true;
243+
for (word, langs) in words.into_iter() {
244+
if langs.len() == 1 {
245+
continue;
246+
}
247+
if langs.iter().any(|l| l.unique_words()) {
248+
println!("Word {} is not unique: {:?}", word, langs);
249+
ok = false;
250+
}
251+
}
252+
assert!(ok);
253+
}
173254
}

src/lib.rs

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ pub enum Error {
5858
BadEntropyBitCount(usize),
5959
/// The mnemonic has an invalid checksum.
6060
InvalidChecksum,
61+
/// The word list can be interpreted as multiple languages.
62+
AmbiguousWordList(Vec<Language>),
6163
}
6264

6365
impl fmt::Display for Error {
@@ -74,6 +76,7 @@ impl fmt::Display for Error {
7476
"entropy was not between 128-256 bits or not a multiple of 32 bits: {} bits", c,
7577
),
7678
Error::InvalidChecksum => write!(f, "the mnemonic has an invalid checksum"),
79+
Error::AmbiguousWordList(ref langs) => write!(f, "ambiguous word list: {:?}", langs),
7780
}
7881
}
7982
}
@@ -216,47 +219,50 @@ impl Mnemonic {
216219
Ok(())
217220
}
218221

219-
/// Guess the language of the mnemonic based on the first word.
222+
/// Determine the language of the mnemonic based on the first word.
220223
///
221-
/// This works as official word lists are made as such that a word never
222-
/// appears in two different word lists.
223-
pub fn guess_language(s: &str) -> Result<Language, Error> {
224-
let languages = [
225-
Language::English,
226-
#[cfg(feature = "chinese-simplified")]
227-
Language::SimplifiedChinese,
228-
#[cfg(feature = "chinese-traditional")]
229-
Language::TraditionalChinese,
230-
#[cfg(feature = "czech")]
231-
Language::Czech,
232-
#[cfg(feature = "french")]
233-
Language::French,
234-
#[cfg(feature = "italian")]
235-
Language::Italian,
236-
#[cfg(feature = "japanese")]
237-
Language::Japanese,
238-
#[cfg(feature = "korean")]
239-
Language::Korean,
240-
#[cfg(feature = "spanish")]
241-
Language::Spanish,
242-
];
224+
/// Some word lists don't guarantee that their words don't occur in other
225+
/// word lists. In the extremely unlikely case that a word list can be
226+
/// interpreted in multiple languages, an [Error::AmbiguousWordList] is
227+
/// returned, containing the possible languages.
228+
pub fn language_of(s: &str) -> Result<Language, Error> {
229+
// First we try wordlists that have guaranteed unique words.
243230
let first_word = s.split_whitespace().next().unwrap();
244231
if first_word.len() == 0 {
245232
return Err(Error::BadWordCount(0));
246233
}
247-
for language in &languages {
234+
for language in Language::all().iter().filter(|l| l.unique_words()) {
248235
if language.find_word(first_word).is_some() {
249236
return Ok(*language);
250237
}
251238
}
252-
Err(Error::UnknownWord(first_word.to_owned()))
239+
240+
// If that didn't work, we start with all possible languages
241+
// (those without unique words), and eliminate until there is
242+
// just one left.
243+
let mut langs: Vec<_> =
244+
Language::all().iter().filter(|l| !l.unique_words()).cloned().collect();
245+
for word in s.split_whitespace() {
246+
langs.retain(|l| l.find_word(word).is_some());
247+
248+
// If there is just one language left, return it.
249+
if langs.len() == 1 {
250+
return Ok(langs[0]);
251+
}
252+
253+
// If all languages were eliminated, it's an invalid word.
254+
if langs.is_empty() {
255+
return Err(Error::UnknownWord(word.to_owned()))
256+
}
257+
}
258+
Err(Error::AmbiguousWordList(langs))
253259
}
254260

255261
/// Parse a mnemonic and detect the language from the enabled languages.
256262
pub fn parse<'a, S: Into<Cow<'a, str>>>(s: S) -> Result<Mnemonic, Error> {
257263
let mut cow = s.into();
258264
Mnemonic::normalize_utf8_cow(&mut cow);
259-
let language = Mnemonic::guess_language(cow.as_ref())?;
265+
let language = Mnemonic::language_of(cow.as_ref())?;
260266
Mnemonic::validate_in(language, cow.as_ref())?;
261267
Ok(Mnemonic(cow.into_owned()))
262268
}
@@ -309,7 +315,7 @@ impl Mnemonic {
309315
// We unwrap errors here because this method can only be called on
310316
// values that were already previously validated.
311317

312-
let language = Mnemonic::guess_language(self.as_str()).unwrap();
318+
let language = Mnemonic::language_of(self.as_str()).unwrap();
313319

314320
// Preallocate enough space for the longest possible word list
315321
let mut entropy = Vec::with_capacity(33);
@@ -361,6 +367,15 @@ mod tests {
361367

362368
use bitcoin_hashes::hex::FromHex;
363369

370+
#[cfg(feature = "rand")]
371+
#[test]
372+
fn test_language_of() {
373+
for lang in Language::all() {
374+
let m = Mnemonic::generate_in(*lang, 24).unwrap();
375+
assert_eq!(*lang, Mnemonic::language_of(m.as_str()).unwrap());
376+
}
377+
}
378+
364379
#[test]
365380
fn test_vectors_english() {
366381
// These vectors are tuples of

0 commit comments

Comments
 (0)