@@ -178,6 +178,7 @@ def default_problem_hparams():
178178 # 15: Parse tokens
179179 # 16: Icelandic characters
180180 # 17: Icelandic tokens
181+ # 18: Icelandic parse tokens
181182 # Add more above if needed.
182183 input_space_id = 0 ,
183184 target_space_id = 0 ,
@@ -550,20 +551,6 @@ def wmt_parsing_tokens(model_hparams, wrong_vocab_size):
550551 return p
551552
552553
def wmt_tabbed_parsing_characters(model_hparams):
  """Problem hyperparameters with byte-level inputs and targets.

  Both the input and the target side use a 256-symbol SYMBOL modality and
  each gets its own `text_encoder.ByteTextEncoder` instance.

  Args:
    model_hparams: accepted for signature compatibility; not read here.
  Returns:
    the object produced by `default_problem_hparams()` with modalities,
    vocabulary, loss multiplier and space ids overridden.
  """
  del model_hparams  # Unused.
  hparams = default_problem_hparams()

  num_symbols = 256
  symbol_modality = registry.Modalities.SYMBOL
  hparams.input_modality = {"inputs": (symbol_modality, num_symbols)}
  hparams.target_modality = (symbol_modality, num_symbols)

  # Separate encoder objects for each side, as opposed to one shared instance.
  inputs_encoder = text_encoder.ByteTextEncoder()
  targets_encoder = text_encoder.ByteTextEncoder()
  hparams.vocabulary = {
      "inputs": inputs_encoder,
      "targets": targets_encoder,
  }

  hparams.loss_multiplier = 2.0
  hparams.input_space_id = 2
  hparams.target_space_id = 14
  return hparams
565-
566-
567554def wsj_parsing_tokens (model_hparams , prefix ,
568555 wrong_source_vocab_size ,
569556 wrong_target_vocab_size ):
@@ -604,6 +591,37 @@ def wsj_parsing_tokens(model_hparams, prefix,
604591 return p
605592
606593
def ice_parsing_tokens(model_hparams, wrong_source_vocab_size):
  """Icelandic to parse tree translation benchmark.

  Loads subword vocabularies for the source and target side from
  `model_hparams.data_dir` and wires them into the problem hyperparameters.

  Args:
    model_hparams: a tf.contrib.training.HParams; only `data_dir` is read here.
    wrong_source_vocab_size: integer formatted into the source vocabulary
      file name, i.e. "ice_source.tokens.vocab.<wrong_source_vocab_size>".
      The input modality size is taken from the loaded encoder, not from
      this number.
  Returns:
    a tf.contrib.training.HParams
  """
  hparams = default_problem_hparams()

  # Both vocab files must be present within the data directory.
  data_dir = model_hparams.data_dir
  source_vocab_path = os.path.join(
      data_dir, "ice_source.tokens.vocab.%d" % wrong_source_vocab_size)
  target_vocab_path = os.path.join(data_dir, "ice_target.tokens.vocab.256")

  inputs_encoder = text_encoder.SubwordTextEncoder(source_vocab_path)
  targets_encoder = text_encoder.SubwordTextEncoder(target_vocab_path)

  symbol_modality = registry.Modalities.SYMBOL
  hparams.input_modality = {
      "inputs": (symbol_modality, inputs_encoder.vocab_size),
  }
  # NOTE(review): the target size is the literal 256, not
  # targets_encoder.vocab_size -- confirm the target vocab file really
  # yields exactly 256 entries.
  hparams.target_modality = (symbol_modality, 256)
  hparams.vocabulary = {
      "inputs": inputs_encoder,
      "targets": targets_encoder,
  }

  hparams.input_space_id = 17  # Icelandic tokens
  hparams.target_space_id = 18  # Icelandic parse tokens
  return hparams
623+
624+
607625def image_cifar10 (unused_model_hparams ):
608626 """CIFAR-10."""
609627 p = default_problem_hparams ()
@@ -723,7 +741,7 @@ def img2img_imagenet(unused_model_hparams):
723741 "lmptb_10k" : lmptb_10k ,
724742 "wmt_parsing_characters" : wmt_parsing_characters ,
725743 "ice_parsing_characters" : wmt_parsing_characters ,
726- "ice_parsing_tokens" : lambda p : wsj_parsing_tokens (p , "ice" , 2 ** 13 , 2 ** 8 ),
744+ "ice_parsing_tokens" : lambda p : ice_parsing_tokens (p , 2 ** 13 ),
727745 "wmt_parsing_tokens_8k" : lambda p : wmt_parsing_tokens (p , 2 ** 13 ),
728746 "wsj_parsing_tokens_16k" : lambda p : wsj_parsing_tokens (p , "wsj" , 2 ** 14 , 2 ** 9 ),
729747 "wsj_parsing_tokens_32k" : lambda p : wsj_parsing_tokens (p , "wsj" , 2 ** 15 , 2 ** 9 ),
0 commit comments